diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,104628 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9106271944801982, + "eval_steps": 200, + "global_step": 13000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.004824572924602e-05, + "grad_norm": 6.222772121429443, + "learning_rate": 9.99930017513135e-05, + "loss": 1.1076, + "num_input_tokens_seen": 16384, + "step": 1 + }, + { + "epoch": 0.00014009649145849205, + "grad_norm": 6.042057037353516, + "learning_rate": 9.998600350262697e-05, + "loss": 1.1086, + "num_input_tokens_seen": 32768, + "step": 2 + }, + { + "epoch": 0.00021014473718773804, + "grad_norm": 7.119229316711426, + "learning_rate": 9.997900525394046e-05, + "loss": 1.4047, + "num_input_tokens_seen": 49152, + "step": 3 + }, + { + "epoch": 0.0002801929829169841, + "grad_norm": 7.133191108703613, + "learning_rate": 9.997200700525395e-05, + "loss": 1.3921, + "num_input_tokens_seen": 65536, + "step": 4 + }, + { + "epoch": 0.0003502412286462301, + "grad_norm": 6.1078338623046875, + "learning_rate": 9.996500875656743e-05, + "loss": 1.3171, + "num_input_tokens_seen": 81920, + "step": 5 + }, + { + "epoch": 0.0004202894743754761, + "grad_norm": 6.466420650482178, + "learning_rate": 9.995801050788092e-05, + "loss": 1.0732, + "num_input_tokens_seen": 97344, + "step": 6 + }, + { + "epoch": 0.0004903377201047221, + "grad_norm": 5.578189849853516, + "learning_rate": 9.99510122591944e-05, + "loss": 0.9929, + "num_input_tokens_seen": 113728, + "step": 7 + }, + { + "epoch": 0.0005603859658339682, + "grad_norm": 7.197720527648926, + "learning_rate": 9.994401401050789e-05, + "loss": 1.2512, + "num_input_tokens_seen": 129528, + "step": 8 + }, + { + "epoch": 0.0006304342115632141, + "grad_norm": 6.618913650512695, + "learning_rate": 9.993701576182136e-05, + "loss": 1.3495, + "num_input_tokens_seen": 145704, + "step": 9 + }, + { + "epoch": 0.0007004824572924602, + "grad_norm": 6.955508232116699, + "learning_rate": 9.993001751313485e-05, + "loss": 1.1823, + "num_input_tokens_seen": 161664, + "step": 10 + }, + { + "epoch": 0.0007705307030217062, + "grad_norm": 6.6807074546813965, + "learning_rate": 9.992301926444835e-05, + "loss": 1.1693, + "num_input_tokens_seen": 177960, + "step": 11 + }, + { + "epoch": 0.0008405789487509522, + "grad_norm": 6.784447193145752, + "learning_rate": 9.991602101576183e-05, + "loss": 1.3744, + "num_input_tokens_seen": 194344, + "step": 12 + }, + { + "epoch": 0.0009106271944801982, + "grad_norm": 6.7418437004089355, + "learning_rate": 9.990902276707532e-05, + "loss": 1.22, + "num_input_tokens_seen": 210728, + "step": 13 + }, + { + "epoch": 0.0009806754402094443, + "grad_norm": 6.43395471572876, + "learning_rate": 9.990202451838879e-05, + "loss": 1.1772, + "num_input_tokens_seen": 227112, + "step": 14 + }, + { + "epoch": 0.0010507236859386903, + "grad_norm": 6.09422492980957, + "learning_rate": 9.989502626970228e-05, + "loss": 1.195, + "num_input_tokens_seen": 243496, + "step": 15 + }, + { + "epoch": 0.0011207719316679364, + "grad_norm": 6.238271236419678, + "learning_rate": 9.988802802101577e-05, + "loss": 1.2623, + "num_input_tokens_seen": 259744, + "step": 16 + }, + { + "epoch": 0.0011908201773971822, + "grad_norm": 6.56187629699707, + "learning_rate": 9.988102977232926e-05, + "loss": 1.2721, + "num_input_tokens_seen": 276128, + "step": 17 + }, + { + "epoch": 0.0012608684231264283, + "grad_norm": 6.818358898162842, + "learning_rate": 9.987403152364275e-05, + "loss": 1.2649, + "num_input_tokens_seen": 292512, + "step": 18 + }, + { + "epoch": 0.0013309166688556743, + "grad_norm": 5.950352191925049, + "learning_rate": 9.986703327495622e-05, + "loss": 1.0024, + "num_input_tokens_seen": 308632, + "step": 19 + }, + { + "epoch": 0.0014009649145849204, + "grad_norm": 6.387479305267334, + "learning_rate": 9.986003502626971e-05, + "loss": 1.2783, + "num_input_tokens_seen": 325016, + "step": 20 + }, + { + "epoch": 0.0014710131603141664, + "grad_norm": 6.187346458435059, + "learning_rate": 9.985303677758318e-05, + "loss": 1.1701, + "num_input_tokens_seen": 341384, + "step": 21 + }, + { + "epoch": 0.0015410614060434125, + "grad_norm": 5.371951103210449, + "learning_rate": 9.984603852889667e-05, + "loss": 1.0483, + "num_input_tokens_seen": 357768, + "step": 22 + }, + { + "epoch": 0.0016111096517726585, + "grad_norm": 6.2206807136535645, + "learning_rate": 9.983904028021016e-05, + "loss": 1.2516, + "num_input_tokens_seen": 374152, + "step": 23 + }, + { + "epoch": 0.0016811578975019044, + "grad_norm": 6.121264457702637, + "learning_rate": 9.983204203152365e-05, + "loss": 1.1506, + "num_input_tokens_seen": 390536, + "step": 24 + }, + { + "epoch": 0.0017512061432311504, + "grad_norm": 6.353756904602051, + "learning_rate": 9.982504378283714e-05, + "loss": 1.3118, + "num_input_tokens_seen": 406920, + "step": 25 + }, + { + "epoch": 0.0018212543889603965, + "grad_norm": 6.270686149597168, + "learning_rate": 9.981804553415061e-05, + "loss": 1.0883, + "num_input_tokens_seen": 422728, + "step": 26 + }, + { + "epoch": 0.0018913026346896425, + "grad_norm": 6.117632865905762, + "learning_rate": 9.98110472854641e-05, + "loss": 1.3346, + "num_input_tokens_seen": 439112, + "step": 27 + }, + { + "epoch": 0.0019613508804188886, + "grad_norm": 6.429015159606934, + "learning_rate": 9.980404903677759e-05, + "loss": 1.2494, + "num_input_tokens_seen": 455144, + "step": 28 + }, + { + "epoch": 0.0020313991261481346, + "grad_norm": 6.4467620849609375, + "learning_rate": 9.979705078809107e-05, + "loss": 1.3335, + "num_input_tokens_seen": 470360, + "step": 29 + }, + { + "epoch": 0.0021014473718773807, + "grad_norm": 6.57926082611084, + "learning_rate": 9.979005253940455e-05, + "loss": 1.2126, + "num_input_tokens_seen": 486120, + "step": 30 + }, + { + "epoch": 0.0021714956176066267, + "grad_norm": 5.650569915771484, + "learning_rate": 9.978305429071804e-05, + "loss": 1.1363, + "num_input_tokens_seen": 501896, + "step": 31 + }, + { + "epoch": 0.0022415438633358728, + "grad_norm": 6.380292892456055, + "learning_rate": 9.977605604203153e-05, + "loss": 1.2251, + "num_input_tokens_seen": 517752, + "step": 32 + }, + { + "epoch": 0.002311592109065119, + "grad_norm": 5.704173564910889, + "learning_rate": 9.976905779334502e-05, + "loss": 1.1685, + "num_input_tokens_seen": 534136, + "step": 33 + }, + { + "epoch": 0.0023816403547943644, + "grad_norm": 5.342978000640869, + "learning_rate": 9.97620595446585e-05, + "loss": 1.2012, + "num_input_tokens_seen": 550216, + "step": 34 + }, + { + "epoch": 0.0024516886005236105, + "grad_norm": 5.7014241218566895, + "learning_rate": 9.975506129597198e-05, + "loss": 1.2342, + "num_input_tokens_seen": 566600, + "step": 35 + }, + { + "epoch": 0.0025217368462528565, + "grad_norm": 6.26229190826416, + "learning_rate": 9.974806304728546e-05, + "loss": 1.2041, + "num_input_tokens_seen": 582984, + "step": 36 + }, + { + "epoch": 0.0025917850919821026, + "grad_norm": 6.583463191986084, + "learning_rate": 9.974106479859896e-05, + "loss": 1.3021, + "num_input_tokens_seen": 598968, + "step": 37 + }, + { + "epoch": 0.0026618333377113486, + "grad_norm": 5.58498477935791, + "learning_rate": 9.973406654991245e-05, + "loss": 1.1622, + "num_input_tokens_seen": 614840, + "step": 38 + }, + { + "epoch": 0.0027318815834405947, + "grad_norm": 5.906906604766846, + "learning_rate": 9.972706830122592e-05, + "loss": 1.1971, + "num_input_tokens_seen": 631224, + "step": 39 + }, + { + "epoch": 0.0028019298291698407, + "grad_norm": 5.962359428405762, + "learning_rate": 9.972007005253941e-05, + "loss": 1.1326, + "num_input_tokens_seen": 647000, + "step": 40 + }, + { + "epoch": 0.002871978074899087, + "grad_norm": 6.447500705718994, + "learning_rate": 9.971307180385289e-05, + "loss": 1.0905, + "num_input_tokens_seen": 662480, + "step": 41 + }, + { + "epoch": 0.002942026320628333, + "grad_norm": 5.7290520668029785, + "learning_rate": 9.970607355516638e-05, + "loss": 1.3585, + "num_input_tokens_seen": 678480, + "step": 42 + }, + { + "epoch": 0.003012074566357579, + "grad_norm": 6.063445568084717, + "learning_rate": 9.969907530647987e-05, + "loss": 1.2841, + "num_input_tokens_seen": 694256, + "step": 43 + }, + { + "epoch": 0.003082122812086825, + "grad_norm": 5.302809238433838, + "learning_rate": 9.969207705779335e-05, + "loss": 1.1168, + "num_input_tokens_seen": 710152, + "step": 44 + }, + { + "epoch": 0.003152171057816071, + "grad_norm": 5.634128093719482, + "learning_rate": 9.968507880910684e-05, + "loss": 1.0609, + "num_input_tokens_seen": 726184, + "step": 45 + }, + { + "epoch": 0.003222219303545317, + "grad_norm": 5.652642726898193, + "learning_rate": 9.967808056042032e-05, + "loss": 1.2228, + "num_input_tokens_seen": 742520, + "step": 46 + }, + { + "epoch": 0.0032922675492745627, + "grad_norm": 5.340751647949219, + "learning_rate": 9.96710823117338e-05, + "loss": 1.0595, + "num_input_tokens_seen": 758904, + "step": 47 + }, + { + "epoch": 0.0033623157950038087, + "grad_norm": 5.422239780426025, + "learning_rate": 9.966408406304728e-05, + "loss": 1.1161, + "num_input_tokens_seen": 775040, + "step": 48 + }, + { + "epoch": 0.0034323640407330548, + "grad_norm": 5.29241418838501, + "learning_rate": 9.965708581436077e-05, + "loss": 1.0255, + "num_input_tokens_seen": 790856, + "step": 49 + }, + { + "epoch": 0.003502412286462301, + "grad_norm": 5.146270275115967, + "learning_rate": 9.965008756567426e-05, + "loss": 0.9762, + "num_input_tokens_seen": 807064, + "step": 50 + }, + { + "epoch": 0.003572460532191547, + "grad_norm": 5.825758457183838, + "learning_rate": 9.964308931698775e-05, + "loss": 1.2108, + "num_input_tokens_seen": 823448, + "step": 51 + }, + { + "epoch": 0.003642508777920793, + "grad_norm": 6.179538726806641, + "learning_rate": 9.963609106830124e-05, + "loss": 1.322, + "num_input_tokens_seen": 838888, + "step": 52 + }, + { + "epoch": 0.003712557023650039, + "grad_norm": 6.464454174041748, + "learning_rate": 9.962909281961471e-05, + "loss": 1.5077, + "num_input_tokens_seen": 855272, + "step": 53 + }, + { + "epoch": 0.003782605269379285, + "grad_norm": 5.4227294921875, + "learning_rate": 9.96220945709282e-05, + "loss": 1.2679, + "num_input_tokens_seen": 871656, + "step": 54 + }, + { + "epoch": 0.003852653515108531, + "grad_norm": 5.949041366577148, + "learning_rate": 9.961509632224169e-05, + "loss": 1.3618, + "num_input_tokens_seen": 888040, + "step": 55 + }, + { + "epoch": 0.003922701760837777, + "grad_norm": 6.050904750823975, + "learning_rate": 9.960809807355516e-05, + "loss": 1.3155, + "num_input_tokens_seen": 904400, + "step": 56 + }, + { + "epoch": 0.003992750006567023, + "grad_norm": 6.048308849334717, + "learning_rate": 9.960109982486866e-05, + "loss": 1.3131, + "num_input_tokens_seen": 919952, + "step": 57 + }, + { + "epoch": 0.004062798252296269, + "grad_norm": 5.683863162994385, + "learning_rate": 9.959410157618214e-05, + "loss": 1.1692, + "num_input_tokens_seen": 936336, + "step": 58 + }, + { + "epoch": 0.004132846498025515, + "grad_norm": 5.449287414550781, + "learning_rate": 9.958710332749563e-05, + "loss": 1.0613, + "num_input_tokens_seen": 952152, + "step": 59 + }, + { + "epoch": 0.004202894743754761, + "grad_norm": 5.31496524810791, + "learning_rate": 9.958010507880912e-05, + "loss": 0.9605, + "num_input_tokens_seen": 967824, + "step": 60 + }, + { + "epoch": 0.004272942989484007, + "grad_norm": 5.57105016708374, + "learning_rate": 9.957310683012259e-05, + "loss": 1.1701, + "num_input_tokens_seen": 983864, + "step": 61 + }, + { + "epoch": 0.004342991235213253, + "grad_norm": 5.3456830978393555, + "learning_rate": 9.956610858143608e-05, + "loss": 1.0995, + "num_input_tokens_seen": 1000248, + "step": 62 + }, + { + "epoch": 0.004413039480942499, + "grad_norm": 5.453295707702637, + "learning_rate": 9.955911033274957e-05, + "loss": 1.2413, + "num_input_tokens_seen": 1016632, + "step": 63 + }, + { + "epoch": 0.0044830877266717455, + "grad_norm": 4.975449562072754, + "learning_rate": 9.955211208406306e-05, + "loss": 1.0961, + "num_input_tokens_seen": 1033016, + "step": 64 + }, + { + "epoch": 0.004553135972400991, + "grad_norm": 5.542137145996094, + "learning_rate": 9.954511383537655e-05, + "loss": 1.1171, + "num_input_tokens_seen": 1049400, + "step": 65 + }, + { + "epoch": 0.004623184218130238, + "grad_norm": 5.213950157165527, + "learning_rate": 9.953811558669002e-05, + "loss": 1.2228, + "num_input_tokens_seen": 1065784, + "step": 66 + }, + { + "epoch": 0.004693232463859483, + "grad_norm": 5.496099948883057, + "learning_rate": 9.953111733800351e-05, + "loss": 1.1529, + "num_input_tokens_seen": 1082168, + "step": 67 + }, + { + "epoch": 0.004763280709588729, + "grad_norm": 5.64145565032959, + "learning_rate": 9.952411908931698e-05, + "loss": 1.2301, + "num_input_tokens_seen": 1098024, + "step": 68 + }, + { + "epoch": 0.004833328955317975, + "grad_norm": 5.566709995269775, + "learning_rate": 9.951712084063047e-05, + "loss": 1.2679, + "num_input_tokens_seen": 1114408, + "step": 69 + }, + { + "epoch": 0.004903377201047221, + "grad_norm": 6.443673133850098, + "learning_rate": 9.951012259194396e-05, + "loss": 1.2313, + "num_input_tokens_seen": 1130792, + "step": 70 + }, + { + "epoch": 0.0049734254467764675, + "grad_norm": 5.882962226867676, + "learning_rate": 9.950312434325745e-05, + "loss": 1.4304, + "num_input_tokens_seen": 1147176, + "step": 71 + }, + { + "epoch": 0.005043473692505713, + "grad_norm": 6.0052666664123535, + "learning_rate": 9.949612609457094e-05, + "loss": 1.3027, + "num_input_tokens_seen": 1160968, + "step": 72 + }, + { + "epoch": 0.0051135219382349596, + "grad_norm": 5.260256767272949, + "learning_rate": 9.948912784588441e-05, + "loss": 1.1526, + "num_input_tokens_seen": 1177352, + "step": 73 + }, + { + "epoch": 0.005183570183964205, + "grad_norm": 5.641814708709717, + "learning_rate": 9.94821295971979e-05, + "loss": 1.0666, + "num_input_tokens_seen": 1193032, + "step": 74 + }, + { + "epoch": 0.005253618429693452, + "grad_norm": 5.121115207672119, + "learning_rate": 9.947513134851138e-05, + "loss": 1.2404, + "num_input_tokens_seen": 1208952, + "step": 75 + }, + { + "epoch": 0.005323666675422697, + "grad_norm": 5.63930082321167, + "learning_rate": 9.946813309982487e-05, + "loss": 1.5127, + "num_input_tokens_seen": 1225000, + "step": 76 + }, + { + "epoch": 0.005393714921151944, + "grad_norm": 4.880716800689697, + "learning_rate": 9.946113485113837e-05, + "loss": 1.1484, + "num_input_tokens_seen": 1241384, + "step": 77 + }, + { + "epoch": 0.005463763166881189, + "grad_norm": 5.59611177444458, + "learning_rate": 9.945413660245184e-05, + "loss": 1.1678, + "num_input_tokens_seen": 1257680, + "step": 78 + }, + { + "epoch": 0.005533811412610436, + "grad_norm": 5.052026271820068, + "learning_rate": 9.944713835376533e-05, + "loss": 1.2207, + "num_input_tokens_seen": 1274064, + "step": 79 + }, + { + "epoch": 0.0056038596583396815, + "grad_norm": 5.285096168518066, + "learning_rate": 9.944014010507881e-05, + "loss": 1.1457, + "num_input_tokens_seen": 1290448, + "step": 80 + }, + { + "epoch": 0.005673907904068927, + "grad_norm": 5.4286580085754395, + "learning_rate": 9.94331418563923e-05, + "loss": 1.3047, + "num_input_tokens_seen": 1306832, + "step": 81 + }, + { + "epoch": 0.005743956149798174, + "grad_norm": 5.937953472137451, + "learning_rate": 9.942614360770578e-05, + "loss": 1.4353, + "num_input_tokens_seen": 1323216, + "step": 82 + }, + { + "epoch": 0.005814004395527419, + "grad_norm": 5.129006385803223, + "learning_rate": 9.941914535901927e-05, + "loss": 1.1434, + "num_input_tokens_seen": 1339408, + "step": 83 + }, + { + "epoch": 0.005884052641256666, + "grad_norm": 5.179675102233887, + "learning_rate": 9.941214711033276e-05, + "loss": 1.2452, + "num_input_tokens_seen": 1355792, + "step": 84 + }, + { + "epoch": 0.005954100886985911, + "grad_norm": 4.912832736968994, + "learning_rate": 9.940514886164624e-05, + "loss": 1.1255, + "num_input_tokens_seen": 1372176, + "step": 85 + }, + { + "epoch": 0.006024149132715158, + "grad_norm": 5.190899848937988, + "learning_rate": 9.939815061295973e-05, + "loss": 1.2543, + "num_input_tokens_seen": 1388560, + "step": 86 + }, + { + "epoch": 0.006094197378444403, + "grad_norm": 5.1751275062561035, + "learning_rate": 9.939115236427321e-05, + "loss": 1.3145, + "num_input_tokens_seen": 1404944, + "step": 87 + }, + { + "epoch": 0.00616424562417365, + "grad_norm": 5.450705528259277, + "learning_rate": 9.938415411558669e-05, + "loss": 1.2844, + "num_input_tokens_seen": 1421328, + "step": 88 + }, + { + "epoch": 0.0062342938699028955, + "grad_norm": 5.593935012817383, + "learning_rate": 9.937715586690018e-05, + "loss": 1.3284, + "num_input_tokens_seen": 1437464, + "step": 89 + }, + { + "epoch": 0.006304342115632142, + "grad_norm": 5.156428813934326, + "learning_rate": 9.937015761821367e-05, + "loss": 1.1682, + "num_input_tokens_seen": 1452952, + "step": 90 + }, + { + "epoch": 0.006374390361361388, + "grad_norm": 4.673638820648193, + "learning_rate": 9.936315936952715e-05, + "loss": 1.004, + "num_input_tokens_seen": 1469336, + "step": 91 + }, + { + "epoch": 0.006444438607090634, + "grad_norm": 4.996700763702393, + "learning_rate": 9.935616112084064e-05, + "loss": 1.087, + "num_input_tokens_seen": 1485448, + "step": 92 + }, + { + "epoch": 0.00651448685281988, + "grad_norm": 4.817474365234375, + "learning_rate": 9.934916287215412e-05, + "loss": 1.151, + "num_input_tokens_seen": 1501472, + "step": 93 + }, + { + "epoch": 0.006584535098549125, + "grad_norm": 5.400479316711426, + "learning_rate": 9.934216462346761e-05, + "loss": 1.3144, + "num_input_tokens_seen": 1516424, + "step": 94 + }, + { + "epoch": 0.006654583344278372, + "grad_norm": 5.232216835021973, + "learning_rate": 9.933516637478108e-05, + "loss": 1.0019, + "num_input_tokens_seen": 1532792, + "step": 95 + }, + { + "epoch": 0.006724631590007617, + "grad_norm": 5.392521381378174, + "learning_rate": 9.932816812609457e-05, + "loss": 1.3195, + "num_input_tokens_seen": 1548600, + "step": 96 + }, + { + "epoch": 0.006794679835736864, + "grad_norm": 5.5280866622924805, + "learning_rate": 9.932116987740806e-05, + "loss": 1.283, + "num_input_tokens_seen": 1564088, + "step": 97 + }, + { + "epoch": 0.0068647280814661095, + "grad_norm": 4.963179588317871, + "learning_rate": 9.931417162872155e-05, + "loss": 1.2716, + "num_input_tokens_seen": 1580040, + "step": 98 + }, + { + "epoch": 0.006934776327195356, + "grad_norm": 4.920302391052246, + "learning_rate": 9.930717338003504e-05, + "loss": 1.088, + "num_input_tokens_seen": 1595880, + "step": 99 + }, + { + "epoch": 0.007004824572924602, + "grad_norm": 4.935486793518066, + "learning_rate": 9.930017513134851e-05, + "loss": 1.0122, + "num_input_tokens_seen": 1611864, + "step": 100 + }, + { + "epoch": 0.007074872818653848, + "grad_norm": 5.099087238311768, + "learning_rate": 9.9293176882662e-05, + "loss": 1.1605, + "num_input_tokens_seen": 1627472, + "step": 101 + }, + { + "epoch": 0.007144921064383094, + "grad_norm": 5.3764328956604, + "learning_rate": 9.928617863397548e-05, + "loss": 1.2225, + "num_input_tokens_seen": 1643856, + "step": 102 + }, + { + "epoch": 0.00721496931011234, + "grad_norm": 5.281564712524414, + "learning_rate": 9.927918038528898e-05, + "loss": 1.1483, + "num_input_tokens_seen": 1660240, + "step": 103 + }, + { + "epoch": 0.007285017555841586, + "grad_norm": 5.395167827606201, + "learning_rate": 9.927218213660247e-05, + "loss": 1.6014, + "num_input_tokens_seen": 1676624, + "step": 104 + }, + { + "epoch": 0.007355065801570832, + "grad_norm": 5.322319507598877, + "learning_rate": 9.926518388791594e-05, + "loss": 1.0933, + "num_input_tokens_seen": 1693008, + "step": 105 + }, + { + "epoch": 0.007425114047300078, + "grad_norm": 5.301229953765869, + "learning_rate": 9.925818563922943e-05, + "loss": 1.1998, + "num_input_tokens_seen": 1708424, + "step": 106 + }, + { + "epoch": 0.0074951622930293236, + "grad_norm": 4.958597183227539, + "learning_rate": 9.92511873905429e-05, + "loss": 1.3285, + "num_input_tokens_seen": 1724808, + "step": 107 + }, + { + "epoch": 0.00756521053875857, + "grad_norm": 4.3913960456848145, + "learning_rate": 9.924418914185639e-05, + "loss": 0.9017, + "num_input_tokens_seen": 1740752, + "step": 108 + }, + { + "epoch": 0.007635258784487816, + "grad_norm": 5.401021480560303, + "learning_rate": 9.923719089316988e-05, + "loss": 1.3646, + "num_input_tokens_seen": 1755176, + "step": 109 + }, + { + "epoch": 0.007705307030217062, + "grad_norm": 4.894444942474365, + "learning_rate": 9.923019264448337e-05, + "loss": 0.9955, + "num_input_tokens_seen": 1771560, + "step": 110 + }, + { + "epoch": 0.007775355275946308, + "grad_norm": 4.878688335418701, + "learning_rate": 9.922319439579686e-05, + "loss": 1.1766, + "num_input_tokens_seen": 1787944, + "step": 111 + }, + { + "epoch": 0.007845403521675554, + "grad_norm": 4.9379777908325195, + "learning_rate": 9.921619614711033e-05, + "loss": 1.1631, + "num_input_tokens_seen": 1803568, + "step": 112 + }, + { + "epoch": 0.0079154517674048, + "grad_norm": 5.101811408996582, + "learning_rate": 9.920919789842382e-05, + "loss": 1.2165, + "num_input_tokens_seen": 1819952, + "step": 113 + }, + { + "epoch": 0.007985500013134045, + "grad_norm": 5.32574987411499, + "learning_rate": 9.920219964973731e-05, + "loss": 1.3012, + "num_input_tokens_seen": 1835296, + "step": 114 + }, + { + "epoch": 0.008055548258863293, + "grad_norm": 5.2391180992126465, + "learning_rate": 9.919520140105079e-05, + "loss": 1.2451, + "num_input_tokens_seen": 1851224, + "step": 115 + }, + { + "epoch": 0.008125596504592538, + "grad_norm": 4.865017890930176, + "learning_rate": 9.918820315236427e-05, + "loss": 1.1683, + "num_input_tokens_seen": 1867608, + "step": 116 + }, + { + "epoch": 0.008195644750321784, + "grad_norm": 4.943136215209961, + "learning_rate": 9.918120490367776e-05, + "loss": 1.31, + "num_input_tokens_seen": 1883696, + "step": 117 + }, + { + "epoch": 0.00826569299605103, + "grad_norm": 4.769871711730957, + "learning_rate": 9.917420665499125e-05, + "loss": 1.1212, + "num_input_tokens_seen": 1900080, + "step": 118 + }, + { + "epoch": 0.008335741241780275, + "grad_norm": 4.785780429840088, + "learning_rate": 9.916720840630474e-05, + "loss": 1.2415, + "num_input_tokens_seen": 1916464, + "step": 119 + }, + { + "epoch": 0.008405789487509523, + "grad_norm": 4.802333831787109, + "learning_rate": 9.916021015761822e-05, + "loss": 1.0513, + "num_input_tokens_seen": 1932848, + "step": 120 + }, + { + "epoch": 0.008475837733238768, + "grad_norm": 5.22212553024292, + "learning_rate": 9.91532119089317e-05, + "loss": 1.2574, + "num_input_tokens_seen": 1949232, + "step": 121 + }, + { + "epoch": 0.008545885978968014, + "grad_norm": 5.104204177856445, + "learning_rate": 9.914621366024518e-05, + "loss": 1.0436, + "num_input_tokens_seen": 1964184, + "step": 122 + }, + { + "epoch": 0.00861593422469726, + "grad_norm": 5.11055326461792, + "learning_rate": 9.913921541155868e-05, + "loss": 1.1939, + "num_input_tokens_seen": 1980568, + "step": 123 + }, + { + "epoch": 0.008685982470426507, + "grad_norm": 4.784866809844971, + "learning_rate": 9.913221716287216e-05, + "loss": 1.2056, + "num_input_tokens_seen": 1996952, + "step": 124 + }, + { + "epoch": 0.008756030716155752, + "grad_norm": 4.763037204742432, + "learning_rate": 9.912521891418564e-05, + "loss": 1.1403, + "num_input_tokens_seen": 2013336, + "step": 125 + }, + { + "epoch": 0.008826078961884998, + "grad_norm": 4.813408851623535, + "learning_rate": 9.911822066549913e-05, + "loss": 1.1897, + "num_input_tokens_seen": 2029720, + "step": 126 + }, + { + "epoch": 0.008896127207614244, + "grad_norm": 4.79008674621582, + "learning_rate": 9.911122241681261e-05, + "loss": 1.2315, + "num_input_tokens_seen": 2046104, + "step": 127 + }, + { + "epoch": 0.008966175453343491, + "grad_norm": 4.843508720397949, + "learning_rate": 9.91042241681261e-05, + "loss": 1.0883, + "num_input_tokens_seen": 2061592, + "step": 128 + }, + { + "epoch": 0.009036223699072737, + "grad_norm": 4.917592525482178, + "learning_rate": 9.909722591943959e-05, + "loss": 1.2512, + "num_input_tokens_seen": 2077792, + "step": 129 + }, + { + "epoch": 0.009106271944801982, + "grad_norm": 4.9154133796691895, + "learning_rate": 9.909022767075307e-05, + "loss": 1.3284, + "num_input_tokens_seen": 2094176, + "step": 130 + }, + { + "epoch": 0.009176320190531228, + "grad_norm": 5.2125420570373535, + "learning_rate": 9.908322942206656e-05, + "loss": 1.3469, + "num_input_tokens_seen": 2110480, + "step": 131 + }, + { + "epoch": 0.009246368436260475, + "grad_norm": 4.715712547302246, + "learning_rate": 9.907623117338004e-05, + "loss": 1.0844, + "num_input_tokens_seen": 2126864, + "step": 132 + }, + { + "epoch": 0.009316416681989721, + "grad_norm": 4.805694580078125, + "learning_rate": 9.906923292469353e-05, + "loss": 1.069, + "num_input_tokens_seen": 2142848, + "step": 133 + }, + { + "epoch": 0.009386464927718966, + "grad_norm": 4.961355209350586, + "learning_rate": 9.9062234676007e-05, + "loss": 1.3387, + "num_input_tokens_seen": 2159232, + "step": 134 + }, + { + "epoch": 0.009456513173448212, + "grad_norm": 4.582219123840332, + "learning_rate": 9.905523642732049e-05, + "loss": 1.2013, + "num_input_tokens_seen": 2175616, + "step": 135 + }, + { + "epoch": 0.009526561419177458, + "grad_norm": 5.195998191833496, + "learning_rate": 9.904823817863398e-05, + "loss": 1.2552, + "num_input_tokens_seen": 2191872, + "step": 136 + }, + { + "epoch": 0.009596609664906705, + "grad_norm": 4.934189319610596, + "learning_rate": 9.904123992994747e-05, + "loss": 1.2961, + "num_input_tokens_seen": 2208208, + "step": 137 + }, + { + "epoch": 0.00966665791063595, + "grad_norm": 4.981037616729736, + "learning_rate": 9.903424168126096e-05, + "loss": 1.1546, + "num_input_tokens_seen": 2224592, + "step": 138 + }, + { + "epoch": 0.009736706156365196, + "grad_norm": 5.469496250152588, + "learning_rate": 9.902724343257443e-05, + "loss": 1.3833, + "num_input_tokens_seen": 2240976, + "step": 139 + }, + { + "epoch": 0.009806754402094442, + "grad_norm": 4.889583587646484, + "learning_rate": 9.902024518388792e-05, + "loss": 1.2095, + "num_input_tokens_seen": 2257360, + "step": 140 + }, + { + "epoch": 0.00987680264782369, + "grad_norm": 4.532052516937256, + "learning_rate": 9.901324693520141e-05, + "loss": 1.143, + "num_input_tokens_seen": 2272848, + "step": 141 + }, + { + "epoch": 0.009946850893552935, + "grad_norm": 5.278079032897949, + "learning_rate": 9.900624868651488e-05, + "loss": 1.2849, + "num_input_tokens_seen": 2289232, + "step": 142 + }, + { + "epoch": 0.01001689913928218, + "grad_norm": 4.549891948699951, + "learning_rate": 9.899925043782839e-05, + "loss": 1.0482, + "num_input_tokens_seen": 2305424, + "step": 143 + }, + { + "epoch": 0.010086947385011426, + "grad_norm": 4.7777180671691895, + "learning_rate": 9.899225218914186e-05, + "loss": 1.1926, + "num_input_tokens_seen": 2320968, + "step": 144 + }, + { + "epoch": 0.010156995630740673, + "grad_norm": 4.320313453674316, + "learning_rate": 9.898525394045535e-05, + "loss": 1.0468, + "num_input_tokens_seen": 2337352, + "step": 145 + }, + { + "epoch": 0.010227043876469919, + "grad_norm": 4.915202617645264, + "learning_rate": 9.897825569176882e-05, + "loss": 1.1326, + "num_input_tokens_seen": 2353064, + "step": 146 + }, + { + "epoch": 0.010297092122199165, + "grad_norm": 4.569783687591553, + "learning_rate": 9.897125744308231e-05, + "loss": 0.8586, + "num_input_tokens_seen": 2369128, + "step": 147 + }, + { + "epoch": 0.01036714036792841, + "grad_norm": 4.591664791107178, + "learning_rate": 9.89642591943958e-05, + "loss": 1.1369, + "num_input_tokens_seen": 2385512, + "step": 148 + }, + { + "epoch": 0.010437188613657656, + "grad_norm": 4.913016319274902, + "learning_rate": 9.895726094570929e-05, + "loss": 1.1564, + "num_input_tokens_seen": 2401208, + "step": 149 + }, + { + "epoch": 0.010507236859386903, + "grad_norm": 4.908018112182617, + "learning_rate": 9.895026269702278e-05, + "loss": 1.1247, + "num_input_tokens_seen": 2417592, + "step": 150 + }, + { + "epoch": 0.010577285105116149, + "grad_norm": 4.536910057067871, + "learning_rate": 9.894326444833625e-05, + "loss": 1.014, + "num_input_tokens_seen": 2433976, + "step": 151 + }, + { + "epoch": 0.010647333350845395, + "grad_norm": 4.899227142333984, + "learning_rate": 9.893626619964974e-05, + "loss": 1.0418, + "num_input_tokens_seen": 2448072, + "step": 152 + }, + { + "epoch": 0.01071738159657464, + "grad_norm": 4.600861072540283, + "learning_rate": 9.892926795096323e-05, + "loss": 1.0459, + "num_input_tokens_seen": 2464240, + "step": 153 + }, + { + "epoch": 0.010787429842303888, + "grad_norm": 4.707681179046631, + "learning_rate": 9.89222697022767e-05, + "loss": 1.0859, + "num_input_tokens_seen": 2480624, + "step": 154 + }, + { + "epoch": 0.010857478088033133, + "grad_norm": 4.748518466949463, + "learning_rate": 9.89152714535902e-05, + "loss": 1.0608, + "num_input_tokens_seen": 2497008, + "step": 155 + }, + { + "epoch": 0.010927526333762379, + "grad_norm": 4.794179439544678, + "learning_rate": 9.890827320490368e-05, + "loss": 1.2243, + "num_input_tokens_seen": 2513392, + "step": 156 + }, + { + "epoch": 0.010997574579491624, + "grad_norm": 4.593925476074219, + "learning_rate": 9.890127495621717e-05, + "loss": 1.1002, + "num_input_tokens_seen": 2529776, + "step": 157 + }, + { + "epoch": 0.011067622825220872, + "grad_norm": 4.318257808685303, + "learning_rate": 9.889427670753066e-05, + "loss": 0.9561, + "num_input_tokens_seen": 2546160, + "step": 158 + }, + { + "epoch": 0.011137671070950117, + "grad_norm": 4.631777286529541, + "learning_rate": 9.888727845884414e-05, + "loss": 1.1553, + "num_input_tokens_seen": 2562544, + "step": 159 + }, + { + "epoch": 0.011207719316679363, + "grad_norm": 4.896609783172607, + "learning_rate": 9.888028021015762e-05, + "loss": 1.1779, + "num_input_tokens_seen": 2578088, + "step": 160 + }, + { + "epoch": 0.011277767562408609, + "grad_norm": 4.3978681564331055, + "learning_rate": 9.88732819614711e-05, + "loss": 1.1778, + "num_input_tokens_seen": 2594416, + "step": 161 + }, + { + "epoch": 0.011347815808137854, + "grad_norm": 4.82927942276001, + "learning_rate": 9.886628371278459e-05, + "loss": 1.0339, + "num_input_tokens_seen": 2609776, + "step": 162 + }, + { + "epoch": 0.011417864053867102, + "grad_norm": 4.413319110870361, + "learning_rate": 9.885928546409809e-05, + "loss": 1.0992, + "num_input_tokens_seen": 2626160, + "step": 163 + }, + { + "epoch": 0.011487912299596347, + "grad_norm": 4.626354694366455, + "learning_rate": 9.885228721541156e-05, + "loss": 1.1948, + "num_input_tokens_seen": 2642464, + "step": 164 + }, + { + "epoch": 0.011557960545325593, + "grad_norm": 4.328434467315674, + "learning_rate": 9.884528896672505e-05, + "loss": 1.1493, + "num_input_tokens_seen": 2658528, + "step": 165 + }, + { + "epoch": 0.011628008791054838, + "grad_norm": 4.57839822769165, + "learning_rate": 9.883829071803853e-05, + "loss": 1.0775, + "num_input_tokens_seen": 2674912, + "step": 166 + }, + { + "epoch": 0.011698057036784086, + "grad_norm": 5.103973865509033, + "learning_rate": 9.883129246935202e-05, + "loss": 1.2458, + "num_input_tokens_seen": 2690792, + "step": 167 + }, + { + "epoch": 0.011768105282513331, + "grad_norm": 4.558016300201416, + "learning_rate": 9.88242942206655e-05, + "loss": 1.0122, + "num_input_tokens_seen": 2705616, + "step": 168 + }, + { + "epoch": 0.011838153528242577, + "grad_norm": 4.811260223388672, + "learning_rate": 9.8817295971979e-05, + "loss": 1.2989, + "num_input_tokens_seen": 2721704, + "step": 169 + }, + { + "epoch": 0.011908201773971823, + "grad_norm": 4.726966857910156, + "learning_rate": 9.881029772329248e-05, + "loss": 1.176, + "num_input_tokens_seen": 2738088, + "step": 170 + }, + { + "epoch": 0.01197825001970107, + "grad_norm": 4.874902725219727, + "learning_rate": 9.880329947460596e-05, + "loss": 1.2586, + "num_input_tokens_seen": 2754040, + "step": 171 + }, + { + "epoch": 0.012048298265430316, + "grad_norm": 4.379549980163574, + "learning_rate": 9.879630122591945e-05, + "loss": 1.1771, + "num_input_tokens_seen": 2770424, + "step": 172 + }, + { + "epoch": 0.012118346511159561, + "grad_norm": 4.455331802368164, + "learning_rate": 9.878930297723292e-05, + "loss": 1.0714, + "num_input_tokens_seen": 2786808, + "step": 173 + }, + { + "epoch": 0.012188394756888807, + "grad_norm": 4.42273473739624, + "learning_rate": 9.878230472854641e-05, + "loss": 1.1798, + "num_input_tokens_seen": 2803176, + "step": 174 + }, + { + "epoch": 0.012258443002618052, + "grad_norm": 4.4078874588012695, + "learning_rate": 9.87753064798599e-05, + "loss": 1.1672, + "num_input_tokens_seen": 2819448, + "step": 175 + }, + { + "epoch": 0.0123284912483473, + "grad_norm": 4.79048490524292, + "learning_rate": 9.876830823117339e-05, + "loss": 1.3331, + "num_input_tokens_seen": 2835832, + "step": 176 + }, + { + "epoch": 0.012398539494076545, + "grad_norm": 4.212133884429932, + "learning_rate": 9.876130998248688e-05, + "loss": 1.0007, + "num_input_tokens_seen": 2851776, + "step": 177 + }, + { + "epoch": 0.012468587739805791, + "grad_norm": 5.7587738037109375, + "learning_rate": 9.875431173380035e-05, + "loss": 1.4729, + "num_input_tokens_seen": 2867896, + "step": 178 + }, + { + "epoch": 0.012538635985535037, + "grad_norm": 4.3469462394714355, + "learning_rate": 9.874731348511384e-05, + "loss": 0.957, + "num_input_tokens_seen": 2884280, + "step": 179 + }, + { + "epoch": 0.012608684231264284, + "grad_norm": 4.584625244140625, + "learning_rate": 9.874031523642733e-05, + "loss": 1.0753, + "num_input_tokens_seen": 2899208, + "step": 180 + }, + { + "epoch": 0.01267873247699353, + "grad_norm": 4.544627666473389, + "learning_rate": 9.87333169877408e-05, + "loss": 1.1706, + "num_input_tokens_seen": 2915416, + "step": 181 + }, + { + "epoch": 0.012748780722722775, + "grad_norm": 4.8749237060546875, + "learning_rate": 9.872631873905429e-05, + "loss": 1.3382, + "num_input_tokens_seen": 2931360, + "step": 182 + }, + { + "epoch": 0.01281882896845202, + "grad_norm": 4.593903541564941, + "learning_rate": 9.871932049036778e-05, + "loss": 1.1588, + "num_input_tokens_seen": 2947744, + "step": 183 + }, + { + "epoch": 0.012888877214181268, + "grad_norm": 4.478219509124756, + "learning_rate": 9.871232224168127e-05, + "loss": 1.1013, + "num_input_tokens_seen": 2963664, + "step": 184 + }, + { + "epoch": 0.012958925459910514, + "grad_norm": 5.028106212615967, + "learning_rate": 9.870532399299476e-05, + "loss": 1.3223, + "num_input_tokens_seen": 2980048, + "step": 185 + }, + { + "epoch": 0.01302897370563976, + "grad_norm": 4.866946697235107, + "learning_rate": 9.869832574430823e-05, + "loss": 1.2376, + "num_input_tokens_seen": 2995992, + "step": 186 + }, + { + "epoch": 0.013099021951369005, + "grad_norm": 4.421341419219971, + "learning_rate": 9.869132749562172e-05, + "loss": 1.2252, + "num_input_tokens_seen": 3012000, + "step": 187 + }, + { + "epoch": 0.01316907019709825, + "grad_norm": 4.88083028793335, + "learning_rate": 9.86843292469352e-05, + "loss": 1.2951, + "num_input_tokens_seen": 3028384, + "step": 188 + }, + { + "epoch": 0.013239118442827498, + "grad_norm": 4.654318809509277, + "learning_rate": 9.86773309982487e-05, + "loss": 1.2839, + "num_input_tokens_seen": 3044768, + "step": 189 + }, + { + "epoch": 0.013309166688556744, + "grad_norm": 4.626763820648193, + "learning_rate": 9.867033274956219e-05, + "loss": 1.2389, + "num_input_tokens_seen": 3061152, + "step": 190 + }, + { + "epoch": 0.01337921493428599, + "grad_norm": 4.178484916687012, + "learning_rate": 9.866333450087566e-05, + "loss": 1.1186, + "num_input_tokens_seen": 3077056, + "step": 191 + }, + { + "epoch": 0.013449263180015235, + "grad_norm": 4.755034923553467, + "learning_rate": 9.865633625218915e-05, + "loss": 1.0594, + "num_input_tokens_seen": 3093400, + "step": 192 + }, + { + "epoch": 0.013519311425744482, + "grad_norm": 4.437506198883057, + "learning_rate": 9.864933800350263e-05, + "loss": 1.2078, + "num_input_tokens_seen": 3109784, + "step": 193 + }, + { + "epoch": 0.013589359671473728, + "grad_norm": 5.140488624572754, + "learning_rate": 9.864233975481611e-05, + "loss": 1.4312, + "num_input_tokens_seen": 3124976, + "step": 194 + }, + { + "epoch": 0.013659407917202973, + "grad_norm": 4.72155237197876, + "learning_rate": 9.86353415061296e-05, + "loss": 1.1752, + "num_input_tokens_seen": 3140632, + "step": 195 + }, + { + "epoch": 0.013729456162932219, + "grad_norm": 4.914645671844482, + "learning_rate": 9.862834325744309e-05, + "loss": 1.2464, + "num_input_tokens_seen": 3156616, + "step": 196 + }, + { + "epoch": 0.013799504408661466, + "grad_norm": 4.23387336730957, + "learning_rate": 9.862134500875658e-05, + "loss": 0.9722, + "num_input_tokens_seen": 3172840, + "step": 197 + }, + { + "epoch": 0.013869552654390712, + "grad_norm": 4.659370422363281, + "learning_rate": 9.861434676007005e-05, + "loss": 1.1981, + "num_input_tokens_seen": 3188584, + "step": 198 + }, + { + "epoch": 0.013939600900119958, + "grad_norm": 4.580902576446533, + "learning_rate": 9.860734851138354e-05, + "loss": 1.1913, + "num_input_tokens_seen": 3204432, + "step": 199 + }, + { + "epoch": 0.014009649145849203, + "grad_norm": 4.208237648010254, + "learning_rate": 9.860035026269702e-05, + "loss": 1.2056, + "num_input_tokens_seen": 3220816, + "step": 200 + }, + { + "epoch": 0.014009649145849203, + "eval_loss": 1.2226407527923584, + "eval_runtime": 0.3992, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 2.505, + "num_input_tokens_seen": 3220816, + "step": 200 + }, + { + "epoch": 0.014079697391578449, + "grad_norm": 4.526260852813721, + "learning_rate": 9.85933520140105e-05, + "loss": 1.0488, + "num_input_tokens_seen": 3237200, + "step": 201 + }, + { + "epoch": 0.014149745637307696, + "grad_norm": 4.46895170211792, + "learning_rate": 9.8586353765324e-05, + "loss": 1.1101, + "num_input_tokens_seen": 3253336, + "step": 202 + }, + { + "epoch": 0.014219793883036942, + "grad_norm": 4.367347717285156, + "learning_rate": 9.857935551663748e-05, + "loss": 1.0425, + "num_input_tokens_seen": 3269632, + "step": 203 + }, + { + "epoch": 0.014289842128766187, + "grad_norm": 4.860860347747803, + "learning_rate": 9.857235726795097e-05, + "loss": 1.4068, + "num_input_tokens_seen": 3285432, + "step": 204 + }, + { + "epoch": 0.014359890374495433, + "grad_norm": 4.336480617523193, + "learning_rate": 9.856535901926445e-05, + "loss": 1.2579, + "num_input_tokens_seen": 3301632, + "step": 205 + }, + { + "epoch": 0.01442993862022468, + "grad_norm": 4.587873458862305, + "learning_rate": 9.855836077057794e-05, + "loss": 1.1508, + "num_input_tokens_seen": 3318016, + "step": 206 + }, + { + "epoch": 0.014499986865953926, + "grad_norm": 4.719262599945068, + "learning_rate": 9.855136252189142e-05, + "loss": 1.0208, + "num_input_tokens_seen": 3333168, + "step": 207 + }, + { + "epoch": 0.014570035111683172, + "grad_norm": 4.419138431549072, + "learning_rate": 9.85443642732049e-05, + "loss": 1.2576, + "num_input_tokens_seen": 3349384, + "step": 208 + }, + { + "epoch": 0.014640083357412417, + "grad_norm": 4.3150835037231445, + "learning_rate": 9.85373660245184e-05, + "loss": 1.1786, + "num_input_tokens_seen": 3365768, + "step": 209 + }, + { + "epoch": 0.014710131603141665, + "grad_norm": 4.5917649269104, + "learning_rate": 9.853036777583188e-05, + "loss": 1.2821, + "num_input_tokens_seen": 3382152, + "step": 210 + }, + { + "epoch": 0.01478017984887091, + "grad_norm": 4.9094343185424805, + "learning_rate": 9.852336952714537e-05, + "loss": 1.2415, + "num_input_tokens_seen": 3397896, + "step": 211 + }, + { + "epoch": 0.014850228094600156, + "grad_norm": 4.394861698150635, + "learning_rate": 9.851637127845885e-05, + "loss": 1.1776, + "num_input_tokens_seen": 3414280, + "step": 212 + }, + { + "epoch": 0.014920276340329401, + "grad_norm": 4.196374416351318, + "learning_rate": 9.850937302977233e-05, + "loss": 1.065, + "num_input_tokens_seen": 3430584, + "step": 213 + }, + { + "epoch": 0.014990324586058647, + "grad_norm": 4.728682518005371, + "learning_rate": 9.850237478108582e-05, + "loss": 1.2686, + "num_input_tokens_seen": 3446968, + "step": 214 + }, + { + "epoch": 0.015060372831787894, + "grad_norm": 4.291411876678467, + "learning_rate": 9.84953765323993e-05, + "loss": 1.1877, + "num_input_tokens_seen": 3462568, + "step": 215 + }, + { + "epoch": 0.01513042107751714, + "grad_norm": 4.405060768127441, + "learning_rate": 9.84883782837128e-05, + "loss": 1.2873, + "num_input_tokens_seen": 3478952, + "step": 216 + }, + { + "epoch": 0.015200469323246386, + "grad_norm": 4.254365921020508, + "learning_rate": 9.848138003502628e-05, + "loss": 1.1062, + "num_input_tokens_seen": 3495304, + "step": 217 + }, + { + "epoch": 0.015270517568975631, + "grad_norm": 4.741672039031982, + "learning_rate": 9.847438178633976e-05, + "loss": 1.1983, + "num_input_tokens_seen": 3511688, + "step": 218 + }, + { + "epoch": 0.015340565814704879, + "grad_norm": 4.352742671966553, + "learning_rate": 9.846738353765325e-05, + "loss": 1.2028, + "num_input_tokens_seen": 3528072, + "step": 219 + }, + { + "epoch": 0.015410614060434124, + "grad_norm": 4.996603488922119, + "learning_rate": 9.846038528896672e-05, + "loss": 1.1561, + "num_input_tokens_seen": 3542904, + "step": 220 + }, + { + "epoch": 0.01548066230616337, + "grad_norm": 4.911815166473389, + "learning_rate": 9.845338704028021e-05, + "loss": 1.3375, + "num_input_tokens_seen": 3558352, + "step": 221 + }, + { + "epoch": 0.015550710551892616, + "grad_norm": 4.638419151306152, + "learning_rate": 9.84463887915937e-05, + "loss": 1.1963, + "num_input_tokens_seen": 3574736, + "step": 222 + }, + { + "epoch": 0.015620758797621863, + "grad_norm": 4.323521614074707, + "learning_rate": 9.843939054290719e-05, + "loss": 1.1224, + "num_input_tokens_seen": 3591120, + "step": 223 + }, + { + "epoch": 0.01569080704335111, + "grad_norm": 4.466544151306152, + "learning_rate": 9.843239229422068e-05, + "loss": 1.3988, + "num_input_tokens_seen": 3607392, + "step": 224 + }, + { + "epoch": 0.015760855289080354, + "grad_norm": 4.476973533630371, + "learning_rate": 9.842539404553415e-05, + "loss": 1.184, + "num_input_tokens_seen": 3623776, + "step": 225 + }, + { + "epoch": 0.0158309035348096, + "grad_norm": 4.648625373840332, + "learning_rate": 9.841839579684764e-05, + "loss": 1.1768, + "num_input_tokens_seen": 3640008, + "step": 226 + }, + { + "epoch": 0.015900951780538845, + "grad_norm": 4.364476203918457, + "learning_rate": 9.841139754816112e-05, + "loss": 1.0208, + "num_input_tokens_seen": 3656392, + "step": 227 + }, + { + "epoch": 0.01597100002626809, + "grad_norm": 4.3054633140563965, + "learning_rate": 9.84043992994746e-05, + "loss": 1.1215, + "num_input_tokens_seen": 3672392, + "step": 228 + }, + { + "epoch": 0.016041048271997337, + "grad_norm": 4.83436918258667, + "learning_rate": 9.83974010507881e-05, + "loss": 1.2284, + "num_input_tokens_seen": 3688776, + "step": 229 + }, + { + "epoch": 0.016111096517726586, + "grad_norm": 4.447519779205322, + "learning_rate": 9.839040280210158e-05, + "loss": 1.1765, + "num_input_tokens_seen": 3705080, + "step": 230 + }, + { + "epoch": 0.01618114476345583, + "grad_norm": 4.269217491149902, + "learning_rate": 9.838340455341507e-05, + "loss": 1.0466, + "num_input_tokens_seen": 3721464, + "step": 231 + }, + { + "epoch": 0.016251193009185077, + "grad_norm": 4.41223669052124, + "learning_rate": 9.837640630472854e-05, + "loss": 1.2098, + "num_input_tokens_seen": 3737184, + "step": 232 + }, + { + "epoch": 0.016321241254914323, + "grad_norm": 4.632737159729004, + "learning_rate": 9.836940805604203e-05, + "loss": 1.1562, + "num_input_tokens_seen": 3753192, + "step": 233 + }, + { + "epoch": 0.016391289500643568, + "grad_norm": 4.379425525665283, + "learning_rate": 9.836240980735552e-05, + "loss": 1.1219, + "num_input_tokens_seen": 3767976, + "step": 234 + }, + { + "epoch": 0.016461337746372814, + "grad_norm": 4.28551721572876, + "learning_rate": 9.835541155866901e-05, + "loss": 1.0259, + "num_input_tokens_seen": 3784008, + "step": 235 + }, + { + "epoch": 0.01653138599210206, + "grad_norm": 4.642453670501709, + "learning_rate": 9.83484133099825e-05, + "loss": 1.1684, + "num_input_tokens_seen": 3800000, + "step": 236 + }, + { + "epoch": 0.016601434237831305, + "grad_norm": 4.367178440093994, + "learning_rate": 9.834141506129597e-05, + "loss": 1.2877, + "num_input_tokens_seen": 3816384, + "step": 237 + }, + { + "epoch": 0.01667148248356055, + "grad_norm": 4.5724005699157715, + "learning_rate": 9.833441681260946e-05, + "loss": 1.1814, + "num_input_tokens_seen": 3830328, + "step": 238 + }, + { + "epoch": 0.0167415307292898, + "grad_norm": 4.318159580230713, + "learning_rate": 9.832741856392295e-05, + "loss": 1.1143, + "num_input_tokens_seen": 3846712, + "step": 239 + }, + { + "epoch": 0.016811578975019045, + "grad_norm": 4.408501625061035, + "learning_rate": 9.832042031523643e-05, + "loss": 1.1508, + "num_input_tokens_seen": 3861776, + "step": 240 + }, + { + "epoch": 0.01688162722074829, + "grad_norm": 4.20060920715332, + "learning_rate": 9.831342206654991e-05, + "loss": 1.209, + "num_input_tokens_seen": 3877736, + "step": 241 + }, + { + "epoch": 0.016951675466477537, + "grad_norm": 4.431649208068848, + "learning_rate": 9.83064238178634e-05, + "loss": 1.2458, + "num_input_tokens_seen": 3893320, + "step": 242 + }, + { + "epoch": 0.017021723712206782, + "grad_norm": 4.000490188598633, + "learning_rate": 9.829942556917689e-05, + "loss": 1.0274, + "num_input_tokens_seen": 3909704, + "step": 243 + }, + { + "epoch": 0.017091771957936028, + "grad_norm": 4.703495025634766, + "learning_rate": 9.829242732049038e-05, + "loss": 1.1711, + "num_input_tokens_seen": 3925808, + "step": 244 + }, + { + "epoch": 0.017161820203665273, + "grad_norm": 4.639338970184326, + "learning_rate": 9.828542907180386e-05, + "loss": 1.3046, + "num_input_tokens_seen": 3942192, + "step": 245 + }, + { + "epoch": 0.01723186844939452, + "grad_norm": 4.414276599884033, + "learning_rate": 9.827843082311734e-05, + "loss": 1.271, + "num_input_tokens_seen": 3958528, + "step": 246 + }, + { + "epoch": 0.017301916695123768, + "grad_norm": 4.404853820800781, + "learning_rate": 9.827143257443082e-05, + "loss": 1.0693, + "num_input_tokens_seen": 3974912, + "step": 247 + }, + { + "epoch": 0.017371964940853014, + "grad_norm": 4.519491195678711, + "learning_rate": 9.826443432574431e-05, + "loss": 1.2894, + "num_input_tokens_seen": 3991296, + "step": 248 + }, + { + "epoch": 0.01744201318658226, + "grad_norm": 4.261727809906006, + "learning_rate": 9.825743607705781e-05, + "loss": 1.2059, + "num_input_tokens_seen": 4006544, + "step": 249 + }, + { + "epoch": 0.017512061432311505, + "grad_norm": 4.102485656738281, + "learning_rate": 9.825043782837129e-05, + "loss": 0.9365, + "num_input_tokens_seen": 4022320, + "step": 250 + }, + { + "epoch": 0.01758210967804075, + "grad_norm": 4.804764270782471, + "learning_rate": 9.824343957968477e-05, + "loss": 1.3344, + "num_input_tokens_seen": 4037048, + "step": 251 + }, + { + "epoch": 0.017652157923769996, + "grad_norm": 4.130600452423096, + "learning_rate": 9.823644133099825e-05, + "loss": 1.2349, + "num_input_tokens_seen": 4053432, + "step": 252 + }, + { + "epoch": 0.017722206169499242, + "grad_norm": 4.234742641448975, + "learning_rate": 9.822944308231174e-05, + "loss": 1.1371, + "num_input_tokens_seen": 4069816, + "step": 253 + }, + { + "epoch": 0.017792254415228487, + "grad_norm": 4.754928112030029, + "learning_rate": 9.822244483362521e-05, + "loss": 1.5168, + "num_input_tokens_seen": 4085864, + "step": 254 + }, + { + "epoch": 0.017862302660957733, + "grad_norm": 4.542768478393555, + "learning_rate": 9.821544658493871e-05, + "loss": 1.1943, + "num_input_tokens_seen": 4102240, + "step": 255 + }, + { + "epoch": 0.017932350906686982, + "grad_norm": 4.411310195922852, + "learning_rate": 9.82084483362522e-05, + "loss": 1.2694, + "num_input_tokens_seen": 4118544, + "step": 256 + }, + { + "epoch": 0.018002399152416228, + "grad_norm": 4.205377101898193, + "learning_rate": 9.820145008756568e-05, + "loss": 1.1581, + "num_input_tokens_seen": 4134928, + "step": 257 + }, + { + "epoch": 0.018072447398145473, + "grad_norm": 4.451165199279785, + "learning_rate": 9.819445183887917e-05, + "loss": 1.089, + "num_input_tokens_seen": 4150848, + "step": 258 + }, + { + "epoch": 0.01814249564387472, + "grad_norm": 4.366336822509766, + "learning_rate": 9.818745359019264e-05, + "loss": 1.1767, + "num_input_tokens_seen": 4167184, + "step": 259 + }, + { + "epoch": 0.018212543889603965, + "grad_norm": 4.394649982452393, + "learning_rate": 9.818045534150613e-05, + "loss": 1.0741, + "num_input_tokens_seen": 4183376, + "step": 260 + }, + { + "epoch": 0.01828259213533321, + "grad_norm": 4.344518184661865, + "learning_rate": 9.817345709281962e-05, + "loss": 1.2282, + "num_input_tokens_seen": 4199760, + "step": 261 + }, + { + "epoch": 0.018352640381062456, + "grad_norm": 4.403041362762451, + "learning_rate": 9.816645884413311e-05, + "loss": 1.2317, + "num_input_tokens_seen": 4215816, + "step": 262 + }, + { + "epoch": 0.0184226886267917, + "grad_norm": 4.715320110321045, + "learning_rate": 9.81594605954466e-05, + "loss": 1.3074, + "num_input_tokens_seen": 4231504, + "step": 263 + }, + { + "epoch": 0.01849273687252095, + "grad_norm": 4.5754265785217285, + "learning_rate": 9.815246234676007e-05, + "loss": 1.253, + "num_input_tokens_seen": 4247888, + "step": 264 + }, + { + "epoch": 0.018562785118250196, + "grad_norm": 4.2346930503845215, + "learning_rate": 9.814546409807356e-05, + "loss": 1.1727, + "num_input_tokens_seen": 4264248, + "step": 265 + }, + { + "epoch": 0.018632833363979442, + "grad_norm": 4.186713218688965, + "learning_rate": 9.813846584938705e-05, + "loss": 1.2693, + "num_input_tokens_seen": 4280632, + "step": 266 + }, + { + "epoch": 0.018702881609708687, + "grad_norm": 4.6356706619262695, + "learning_rate": 9.813146760070052e-05, + "loss": 1.3755, + "num_input_tokens_seen": 4296648, + "step": 267 + }, + { + "epoch": 0.018772929855437933, + "grad_norm": 4.466466903686523, + "learning_rate": 9.812446935201401e-05, + "loss": 1.283, + "num_input_tokens_seen": 4311408, + "step": 268 + }, + { + "epoch": 0.01884297810116718, + "grad_norm": 4.3369140625, + "learning_rate": 9.81174711033275e-05, + "loss": 1.1555, + "num_input_tokens_seen": 4326736, + "step": 269 + }, + { + "epoch": 0.018913026346896424, + "grad_norm": 4.434782028198242, + "learning_rate": 9.811047285464099e-05, + "loss": 1.2859, + "num_input_tokens_seen": 4343120, + "step": 270 + }, + { + "epoch": 0.01898307459262567, + "grad_norm": 4.346708297729492, + "learning_rate": 9.810347460595448e-05, + "loss": 1.1421, + "num_input_tokens_seen": 4359504, + "step": 271 + }, + { + "epoch": 0.019053122838354915, + "grad_norm": 4.529878616333008, + "learning_rate": 9.809647635726795e-05, + "loss": 1.2654, + "num_input_tokens_seen": 4375888, + "step": 272 + }, + { + "epoch": 0.019123171084084165, + "grad_norm": 4.051745891571045, + "learning_rate": 9.808947810858144e-05, + "loss": 1.1469, + "num_input_tokens_seen": 4392224, + "step": 273 + }, + { + "epoch": 0.01919321932981341, + "grad_norm": 4.403522491455078, + "learning_rate": 9.808247985989492e-05, + "loss": 1.233, + "num_input_tokens_seen": 4408608, + "step": 274 + }, + { + "epoch": 0.019263267575542656, + "grad_norm": 4.166261196136475, + "learning_rate": 9.807548161120842e-05, + "loss": 1.1697, + "num_input_tokens_seen": 4424992, + "step": 275 + }, + { + "epoch": 0.0193333158212719, + "grad_norm": 4.29187536239624, + "learning_rate": 9.806848336252191e-05, + "loss": 1.0503, + "num_input_tokens_seen": 4441376, + "step": 276 + }, + { + "epoch": 0.019403364067001147, + "grad_norm": 4.4056172370910645, + "learning_rate": 9.806148511383538e-05, + "loss": 1.1965, + "num_input_tokens_seen": 4457760, + "step": 277 + }, + { + "epoch": 0.019473412312730393, + "grad_norm": 4.355875015258789, + "learning_rate": 9.805448686514887e-05, + "loss": 1.1024, + "num_input_tokens_seen": 4474144, + "step": 278 + }, + { + "epoch": 0.019543460558459638, + "grad_norm": 4.46420955657959, + "learning_rate": 9.804748861646235e-05, + "loss": 1.203, + "num_input_tokens_seen": 4488912, + "step": 279 + }, + { + "epoch": 0.019613508804188884, + "grad_norm": 4.48052453994751, + "learning_rate": 9.804049036777583e-05, + "loss": 1.2089, + "num_input_tokens_seen": 4505296, + "step": 280 + }, + { + "epoch": 0.01968355704991813, + "grad_norm": 4.458749294281006, + "learning_rate": 9.803349211908932e-05, + "loss": 1.1557, + "num_input_tokens_seen": 4520576, + "step": 281 + }, + { + "epoch": 0.01975360529564738, + "grad_norm": 4.551771640777588, + "learning_rate": 9.802649387040281e-05, + "loss": 1.1671, + "num_input_tokens_seen": 4536960, + "step": 282 + }, + { + "epoch": 0.019823653541376624, + "grad_norm": 4.038064956665039, + "learning_rate": 9.80194956217163e-05, + "loss": 1.1562, + "num_input_tokens_seen": 4553344, + "step": 283 + }, + { + "epoch": 0.01989370178710587, + "grad_norm": 4.647075653076172, + "learning_rate": 9.801249737302978e-05, + "loss": 1.3069, + "num_input_tokens_seen": 4568928, + "step": 284 + }, + { + "epoch": 0.019963750032835115, + "grad_norm": 4.258941650390625, + "learning_rate": 9.800549912434326e-05, + "loss": 1.0349, + "num_input_tokens_seen": 4585312, + "step": 285 + }, + { + "epoch": 0.02003379827856436, + "grad_norm": 4.348769664764404, + "learning_rate": 9.799850087565674e-05, + "loss": 1.1163, + "num_input_tokens_seen": 4601696, + "step": 286 + }, + { + "epoch": 0.020103846524293607, + "grad_norm": 4.105901718139648, + "learning_rate": 9.799150262697023e-05, + "loss": 1.0313, + "num_input_tokens_seen": 4617312, + "step": 287 + }, + { + "epoch": 0.020173894770022852, + "grad_norm": 4.079495429992676, + "learning_rate": 9.798450437828372e-05, + "loss": 1.0828, + "num_input_tokens_seen": 4633696, + "step": 288 + }, + { + "epoch": 0.020243943015752098, + "grad_norm": 4.03472375869751, + "learning_rate": 9.79775061295972e-05, + "loss": 0.9475, + "num_input_tokens_seen": 4650080, + "step": 289 + }, + { + "epoch": 0.020313991261481347, + "grad_norm": 4.077049732208252, + "learning_rate": 9.797050788091069e-05, + "loss": 1.1323, + "num_input_tokens_seen": 4666328, + "step": 290 + }, + { + "epoch": 0.020384039507210593, + "grad_norm": 4.086606025695801, + "learning_rate": 9.796350963222417e-05, + "loss": 1.1218, + "num_input_tokens_seen": 4682256, + "step": 291 + }, + { + "epoch": 0.020454087752939838, + "grad_norm": 4.296900749206543, + "learning_rate": 9.795651138353766e-05, + "loss": 1.2964, + "num_input_tokens_seen": 4698640, + "step": 292 + }, + { + "epoch": 0.020524135998669084, + "grad_norm": 4.040759086608887, + "learning_rate": 9.794951313485115e-05, + "loss": 1.1077, + "num_input_tokens_seen": 4714928, + "step": 293 + }, + { + "epoch": 0.02059418424439833, + "grad_norm": 3.8260273933410645, + "learning_rate": 9.794251488616462e-05, + "loss": 0.9667, + "num_input_tokens_seen": 4731312, + "step": 294 + }, + { + "epoch": 0.020664232490127575, + "grad_norm": 4.294517993927002, + "learning_rate": 9.793551663747811e-05, + "loss": 1.2704, + "num_input_tokens_seen": 4747544, + "step": 295 + }, + { + "epoch": 0.02073428073585682, + "grad_norm": 4.206037521362305, + "learning_rate": 9.79285183887916e-05, + "loss": 1.1593, + "num_input_tokens_seen": 4763928, + "step": 296 + }, + { + "epoch": 0.020804328981586066, + "grad_norm": 4.147867202758789, + "learning_rate": 9.792152014010509e-05, + "loss": 1.1256, + "num_input_tokens_seen": 4780312, + "step": 297 + }, + { + "epoch": 0.020874377227315312, + "grad_norm": 4.23718786239624, + "learning_rate": 9.791452189141857e-05, + "loss": 1.2353, + "num_input_tokens_seen": 4796384, + "step": 298 + }, + { + "epoch": 0.02094442547304456, + "grad_norm": 4.172685146331787, + "learning_rate": 9.790752364273205e-05, + "loss": 1.1868, + "num_input_tokens_seen": 4812768, + "step": 299 + }, + { + "epoch": 0.021014473718773807, + "grad_norm": 4.167289733886719, + "learning_rate": 9.790052539404554e-05, + "loss": 1.0606, + "num_input_tokens_seen": 4829152, + "step": 300 + }, + { + "epoch": 0.021084521964503052, + "grad_norm": 4.096963882446289, + "learning_rate": 9.789352714535903e-05, + "loss": 1.0557, + "num_input_tokens_seen": 4845384, + "step": 301 + }, + { + "epoch": 0.021154570210232298, + "grad_norm": 4.223779678344727, + "learning_rate": 9.788652889667252e-05, + "loss": 1.1485, + "num_input_tokens_seen": 4861768, + "step": 302 + }, + { + "epoch": 0.021224618455961543, + "grad_norm": 3.8243472576141357, + "learning_rate": 9.7879530647986e-05, + "loss": 1.004, + "num_input_tokens_seen": 4878152, + "step": 303 + }, + { + "epoch": 0.02129466670169079, + "grad_norm": 4.092590808868408, + "learning_rate": 9.787253239929948e-05, + "loss": 1.0211, + "num_input_tokens_seen": 4894536, + "step": 304 + }, + { + "epoch": 0.021364714947420035, + "grad_norm": 4.42412805557251, + "learning_rate": 9.786553415061297e-05, + "loss": 0.9915, + "num_input_tokens_seen": 4910320, + "step": 305 + }, + { + "epoch": 0.02143476319314928, + "grad_norm": 4.488316535949707, + "learning_rate": 9.785853590192644e-05, + "loss": 1.1782, + "num_input_tokens_seen": 4926704, + "step": 306 + }, + { + "epoch": 0.021504811438878526, + "grad_norm": 4.110256195068359, + "learning_rate": 9.785153765323993e-05, + "loss": 1.102, + "num_input_tokens_seen": 4943088, + "step": 307 + }, + { + "epoch": 0.021574859684607775, + "grad_norm": 4.246950149536133, + "learning_rate": 9.784453940455342e-05, + "loss": 1.067, + "num_input_tokens_seen": 4958736, + "step": 308 + }, + { + "epoch": 0.02164490793033702, + "grad_norm": 4.175214767456055, + "learning_rate": 9.783754115586691e-05, + "loss": 1.0638, + "num_input_tokens_seen": 4975120, + "step": 309 + }, + { + "epoch": 0.021714956176066266, + "grad_norm": 4.427795886993408, + "learning_rate": 9.78305429071804e-05, + "loss": 1.1347, + "num_input_tokens_seen": 4991504, + "step": 310 + }, + { + "epoch": 0.021785004421795512, + "grad_norm": 4.158191204071045, + "learning_rate": 9.782354465849387e-05, + "loss": 1.1662, + "num_input_tokens_seen": 5007152, + "step": 311 + }, + { + "epoch": 0.021855052667524758, + "grad_norm": 4.184347629547119, + "learning_rate": 9.781654640980736e-05, + "loss": 1.0791, + "num_input_tokens_seen": 5023536, + "step": 312 + }, + { + "epoch": 0.021925100913254003, + "grad_norm": 3.8506295680999756, + "learning_rate": 9.780954816112084e-05, + "loss": 1.0615, + "num_input_tokens_seen": 5039728, + "step": 313 + }, + { + "epoch": 0.02199514915898325, + "grad_norm": 4.310062408447266, + "learning_rate": 9.780254991243432e-05, + "loss": 1.1363, + "num_input_tokens_seen": 5056008, + "step": 314 + }, + { + "epoch": 0.022065197404712494, + "grad_norm": 4.215006351470947, + "learning_rate": 9.779555166374781e-05, + "loss": 1.1715, + "num_input_tokens_seen": 5072096, + "step": 315 + }, + { + "epoch": 0.022135245650441743, + "grad_norm": 4.219073295593262, + "learning_rate": 9.77885534150613e-05, + "loss": 1.219, + "num_input_tokens_seen": 5088432, + "step": 316 + }, + { + "epoch": 0.02220529389617099, + "grad_norm": 4.319522857666016, + "learning_rate": 9.778155516637479e-05, + "loss": 1.3085, + "num_input_tokens_seen": 5104240, + "step": 317 + }, + { + "epoch": 0.022275342141900235, + "grad_norm": 4.118961334228516, + "learning_rate": 9.777455691768827e-05, + "loss": 1.0926, + "num_input_tokens_seen": 5120624, + "step": 318 + }, + { + "epoch": 0.02234539038762948, + "grad_norm": 4.195051193237305, + "learning_rate": 9.776755866900175e-05, + "loss": 1.0894, + "num_input_tokens_seen": 5137008, + "step": 319 + }, + { + "epoch": 0.022415438633358726, + "grad_norm": 4.114197254180908, + "learning_rate": 9.776056042031524e-05, + "loss": 1.1897, + "num_input_tokens_seen": 5153272, + "step": 320 + }, + { + "epoch": 0.02248548687908797, + "grad_norm": 4.014908313751221, + "learning_rate": 9.775356217162872e-05, + "loss": 1.0932, + "num_input_tokens_seen": 5169472, + "step": 321 + }, + { + "epoch": 0.022555535124817217, + "grad_norm": 4.190642356872559, + "learning_rate": 9.774656392294222e-05, + "loss": 1.1413, + "num_input_tokens_seen": 5185856, + "step": 322 + }, + { + "epoch": 0.022625583370546463, + "grad_norm": 4.562993049621582, + "learning_rate": 9.77395656742557e-05, + "loss": 1.2865, + "num_input_tokens_seen": 5202240, + "step": 323 + }, + { + "epoch": 0.02269563161627571, + "grad_norm": 4.607022762298584, + "learning_rate": 9.773256742556918e-05, + "loss": 1.1465, + "num_input_tokens_seen": 5218168, + "step": 324 + }, + { + "epoch": 0.022765679862004957, + "grad_norm": 3.956439256668091, + "learning_rate": 9.772556917688267e-05, + "loss": 1.028, + "num_input_tokens_seen": 5234368, + "step": 325 + }, + { + "epoch": 0.022835728107734203, + "grad_norm": 4.20713472366333, + "learning_rate": 9.771857092819615e-05, + "loss": 1.2332, + "num_input_tokens_seen": 5249808, + "step": 326 + }, + { + "epoch": 0.02290577635346345, + "grad_norm": 4.4092864990234375, + "learning_rate": 9.771157267950964e-05, + "loss": 1.104, + "num_input_tokens_seen": 5266120, + "step": 327 + }, + { + "epoch": 0.022975824599192694, + "grad_norm": 4.529845237731934, + "learning_rate": 9.770457443082312e-05, + "loss": 1.3475, + "num_input_tokens_seen": 5282504, + "step": 328 + }, + { + "epoch": 0.02304587284492194, + "grad_norm": 4.221986293792725, + "learning_rate": 9.769757618213661e-05, + "loss": 1.4115, + "num_input_tokens_seen": 5298344, + "step": 329 + }, + { + "epoch": 0.023115921090651186, + "grad_norm": 4.29000186920166, + "learning_rate": 9.76905779334501e-05, + "loss": 1.2855, + "num_input_tokens_seen": 5314728, + "step": 330 + }, + { + "epoch": 0.02318596933638043, + "grad_norm": 4.426812648773193, + "learning_rate": 9.768357968476358e-05, + "loss": 1.514, + "num_input_tokens_seen": 5330816, + "step": 331 + }, + { + "epoch": 0.023256017582109677, + "grad_norm": 4.210752964019775, + "learning_rate": 9.767658143607706e-05, + "loss": 1.0854, + "num_input_tokens_seen": 5346552, + "step": 332 + }, + { + "epoch": 0.023326065827838922, + "grad_norm": 4.216427326202393, + "learning_rate": 9.766958318739054e-05, + "loss": 1.1573, + "num_input_tokens_seen": 5362936, + "step": 333 + }, + { + "epoch": 0.02339611407356817, + "grad_norm": 4.132325649261475, + "learning_rate": 9.766258493870403e-05, + "loss": 1.0942, + "num_input_tokens_seen": 5379320, + "step": 334 + }, + { + "epoch": 0.023466162319297417, + "grad_norm": 4.277027130126953, + "learning_rate": 9.765558669001752e-05, + "loss": 1.1227, + "num_input_tokens_seen": 5395704, + "step": 335 + }, + { + "epoch": 0.023536210565026663, + "grad_norm": 4.228096961975098, + "learning_rate": 9.7648588441331e-05, + "loss": 1.1094, + "num_input_tokens_seen": 5412088, + "step": 336 + }, + { + "epoch": 0.02360625881075591, + "grad_norm": 4.194522380828857, + "learning_rate": 9.76415901926445e-05, + "loss": 1.2066, + "num_input_tokens_seen": 5428472, + "step": 337 + }, + { + "epoch": 0.023676307056485154, + "grad_norm": 4.336326599121094, + "learning_rate": 9.763459194395797e-05, + "loss": 1.2251, + "num_input_tokens_seen": 5444856, + "step": 338 + }, + { + "epoch": 0.0237463553022144, + "grad_norm": 4.2723307609558105, + "learning_rate": 9.762759369527146e-05, + "loss": 1.0927, + "num_input_tokens_seen": 5460304, + "step": 339 + }, + { + "epoch": 0.023816403547943645, + "grad_norm": 4.190036773681641, + "learning_rate": 9.762059544658493e-05, + "loss": 1.2036, + "num_input_tokens_seen": 5476688, + "step": 340 + }, + { + "epoch": 0.02388645179367289, + "grad_norm": 4.477560043334961, + "learning_rate": 9.761359719789842e-05, + "loss": 1.362, + "num_input_tokens_seen": 5493072, + "step": 341 + }, + { + "epoch": 0.02395650003940214, + "grad_norm": 4.160232067108154, + "learning_rate": 9.760659894921192e-05, + "loss": 1.1602, + "num_input_tokens_seen": 5509456, + "step": 342 + }, + { + "epoch": 0.024026548285131386, + "grad_norm": 3.857335090637207, + "learning_rate": 9.75996007005254e-05, + "loss": 1.0963, + "num_input_tokens_seen": 5525840, + "step": 343 + }, + { + "epoch": 0.02409659653086063, + "grad_norm": 4.141246318817139, + "learning_rate": 9.759260245183889e-05, + "loss": 1.2009, + "num_input_tokens_seen": 5541888, + "step": 344 + }, + { + "epoch": 0.024166644776589877, + "grad_norm": 4.50364875793457, + "learning_rate": 9.758560420315236e-05, + "loss": 1.1483, + "num_input_tokens_seen": 5557848, + "step": 345 + }, + { + "epoch": 0.024236693022319122, + "grad_norm": 4.3343353271484375, + "learning_rate": 9.757860595446585e-05, + "loss": 1.3594, + "num_input_tokens_seen": 5573504, + "step": 346 + }, + { + "epoch": 0.024306741268048368, + "grad_norm": 4.050408363342285, + "learning_rate": 9.757160770577934e-05, + "loss": 1.0563, + "num_input_tokens_seen": 5589544, + "step": 347 + }, + { + "epoch": 0.024376789513777614, + "grad_norm": 4.051811695098877, + "learning_rate": 9.756460945709283e-05, + "loss": 1.0288, + "num_input_tokens_seen": 5605368, + "step": 348 + }, + { + "epoch": 0.02444683775950686, + "grad_norm": 4.365113258361816, + "learning_rate": 9.755761120840632e-05, + "loss": 1.3054, + "num_input_tokens_seen": 5621752, + "step": 349 + }, + { + "epoch": 0.024516886005236105, + "grad_norm": 4.0057501792907715, + "learning_rate": 9.755061295971979e-05, + "loss": 1.1302, + "num_input_tokens_seen": 5638136, + "step": 350 + }, + { + "epoch": 0.024586934250965354, + "grad_norm": 4.254896640777588, + "learning_rate": 9.754361471103328e-05, + "loss": 1.0495, + "num_input_tokens_seen": 5653168, + "step": 351 + }, + { + "epoch": 0.0246569824966946, + "grad_norm": 3.8119771480560303, + "learning_rate": 9.753661646234677e-05, + "loss": 1.0349, + "num_input_tokens_seen": 5669504, + "step": 352 + }, + { + "epoch": 0.024727030742423845, + "grad_norm": 4.5082621574401855, + "learning_rate": 9.752961821366024e-05, + "loss": 1.2537, + "num_input_tokens_seen": 5685168, + "step": 353 + }, + { + "epoch": 0.02479707898815309, + "grad_norm": 4.392731189727783, + "learning_rate": 9.752261996497373e-05, + "loss": 1.2534, + "num_input_tokens_seen": 5701240, + "step": 354 + }, + { + "epoch": 0.024867127233882336, + "grad_norm": 4.293395519256592, + "learning_rate": 9.751562171628722e-05, + "loss": 1.2774, + "num_input_tokens_seen": 5717624, + "step": 355 + }, + { + "epoch": 0.024937175479611582, + "grad_norm": 4.64813756942749, + "learning_rate": 9.750862346760071e-05, + "loss": 1.2795, + "num_input_tokens_seen": 5733104, + "step": 356 + }, + { + "epoch": 0.025007223725340828, + "grad_norm": 4.5166778564453125, + "learning_rate": 9.75016252189142e-05, + "loss": 1.1301, + "num_input_tokens_seen": 5749488, + "step": 357 + }, + { + "epoch": 0.025077271971070073, + "grad_norm": 3.894291400909424, + "learning_rate": 9.749462697022767e-05, + "loss": 0.901, + "num_input_tokens_seen": 5765872, + "step": 358 + }, + { + "epoch": 0.02514732021679932, + "grad_norm": 4.10056209564209, + "learning_rate": 9.748762872154116e-05, + "loss": 1.0529, + "num_input_tokens_seen": 5780856, + "step": 359 + }, + { + "epoch": 0.025217368462528568, + "grad_norm": 4.6277666091918945, + "learning_rate": 9.748063047285464e-05, + "loss": 1.3649, + "num_input_tokens_seen": 5796856, + "step": 360 + }, + { + "epoch": 0.025287416708257814, + "grad_norm": 4.029720306396484, + "learning_rate": 9.747363222416813e-05, + "loss": 0.8863, + "num_input_tokens_seen": 5812176, + "step": 361 + }, + { + "epoch": 0.02535746495398706, + "grad_norm": 3.7772202491760254, + "learning_rate": 9.746663397548161e-05, + "loss": 1.0448, + "num_input_tokens_seen": 5828064, + "step": 362 + }, + { + "epoch": 0.025427513199716305, + "grad_norm": 4.379861354827881, + "learning_rate": 9.74596357267951e-05, + "loss": 1.3274, + "num_input_tokens_seen": 5843680, + "step": 363 + }, + { + "epoch": 0.02549756144544555, + "grad_norm": 4.254587173461914, + "learning_rate": 9.745263747810859e-05, + "loss": 1.1502, + "num_input_tokens_seen": 5859024, + "step": 364 + }, + { + "epoch": 0.025567609691174796, + "grad_norm": 4.271276473999023, + "learning_rate": 9.744563922942207e-05, + "loss": 1.2785, + "num_input_tokens_seen": 5874320, + "step": 365 + }, + { + "epoch": 0.02563765793690404, + "grad_norm": 4.224324703216553, + "learning_rate": 9.743864098073555e-05, + "loss": 1.0926, + "num_input_tokens_seen": 5890704, + "step": 366 + }, + { + "epoch": 0.025707706182633287, + "grad_norm": 4.289444446563721, + "learning_rate": 9.743164273204903e-05, + "loss": 1.1913, + "num_input_tokens_seen": 5906016, + "step": 367 + }, + { + "epoch": 0.025777754428362536, + "grad_norm": 4.280707359313965, + "learning_rate": 9.742464448336253e-05, + "loss": 1.2238, + "num_input_tokens_seen": 5921784, + "step": 368 + }, + { + "epoch": 0.025847802674091782, + "grad_norm": 4.554803848266602, + "learning_rate": 9.741764623467602e-05, + "loss": 1.2491, + "num_input_tokens_seen": 5938072, + "step": 369 + }, + { + "epoch": 0.025917850919821028, + "grad_norm": 4.677784442901611, + "learning_rate": 9.74106479859895e-05, + "loss": 1.2387, + "num_input_tokens_seen": 5954456, + "step": 370 + }, + { + "epoch": 0.025987899165550273, + "grad_norm": 4.268225193023682, + "learning_rate": 9.740364973730298e-05, + "loss": 1.2983, + "num_input_tokens_seen": 5970664, + "step": 371 + }, + { + "epoch": 0.02605794741127952, + "grad_norm": 4.361818790435791, + "learning_rate": 9.739665148861646e-05, + "loss": 1.199, + "num_input_tokens_seen": 5987048, + "step": 372 + }, + { + "epoch": 0.026127995657008764, + "grad_norm": 3.9990735054016113, + "learning_rate": 9.738965323992995e-05, + "loss": 1.0777, + "num_input_tokens_seen": 6003432, + "step": 373 + }, + { + "epoch": 0.02619804390273801, + "grad_norm": 3.992142915725708, + "learning_rate": 9.738265499124344e-05, + "loss": 1.0443, + "num_input_tokens_seen": 6019816, + "step": 374 + }, + { + "epoch": 0.026268092148467256, + "grad_norm": 4.270167827606201, + "learning_rate": 9.737565674255693e-05, + "loss": 1.1764, + "num_input_tokens_seen": 6036200, + "step": 375 + }, + { + "epoch": 0.0263381403941965, + "grad_norm": 4.362086296081543, + "learning_rate": 9.736865849387041e-05, + "loss": 1.2735, + "num_input_tokens_seen": 6052120, + "step": 376 + }, + { + "epoch": 0.02640818863992575, + "grad_norm": 3.6900475025177, + "learning_rate": 9.736166024518389e-05, + "loss": 0.8729, + "num_input_tokens_seen": 6068264, + "step": 377 + }, + { + "epoch": 0.026478236885654996, + "grad_norm": 3.8281285762786865, + "learning_rate": 9.735466199649738e-05, + "loss": 1.1096, + "num_input_tokens_seen": 6084504, + "step": 378 + }, + { + "epoch": 0.02654828513138424, + "grad_norm": 3.9335553646087646, + "learning_rate": 9.734766374781087e-05, + "loss": 1.0763, + "num_input_tokens_seen": 6100592, + "step": 379 + }, + { + "epoch": 0.026618333377113487, + "grad_norm": 4.332645416259766, + "learning_rate": 9.734066549912434e-05, + "loss": 1.1751, + "num_input_tokens_seen": 6116976, + "step": 380 + }, + { + "epoch": 0.026688381622842733, + "grad_norm": 4.160863399505615, + "learning_rate": 9.733366725043783e-05, + "loss": 1.0778, + "num_input_tokens_seen": 6133360, + "step": 381 + }, + { + "epoch": 0.02675842986857198, + "grad_norm": 4.388178825378418, + "learning_rate": 9.732666900175132e-05, + "loss": 1.2214, + "num_input_tokens_seen": 6149744, + "step": 382 + }, + { + "epoch": 0.026828478114301224, + "grad_norm": 4.354910373687744, + "learning_rate": 9.73196707530648e-05, + "loss": 1.4115, + "num_input_tokens_seen": 6166048, + "step": 383 + }, + { + "epoch": 0.02689852636003047, + "grad_norm": 4.058071613311768, + "learning_rate": 9.73126725043783e-05, + "loss": 1.0934, + "num_input_tokens_seen": 6181840, + "step": 384 + }, + { + "epoch": 0.026968574605759715, + "grad_norm": 4.060855865478516, + "learning_rate": 9.730567425569177e-05, + "loss": 1.1395, + "num_input_tokens_seen": 6198224, + "step": 385 + }, + { + "epoch": 0.027038622851488964, + "grad_norm": 4.316681385040283, + "learning_rate": 9.729867600700526e-05, + "loss": 1.1052, + "num_input_tokens_seen": 6214608, + "step": 386 + }, + { + "epoch": 0.02710867109721821, + "grad_norm": 4.322516918182373, + "learning_rate": 9.729167775831873e-05, + "loss": 1.2512, + "num_input_tokens_seen": 6230992, + "step": 387 + }, + { + "epoch": 0.027178719342947456, + "grad_norm": 4.090857028961182, + "learning_rate": 9.728467950963224e-05, + "loss": 1.0772, + "num_input_tokens_seen": 6246760, + "step": 388 + }, + { + "epoch": 0.0272487675886767, + "grad_norm": 4.0143961906433105, + "learning_rate": 9.727768126094571e-05, + "loss": 1.0578, + "num_input_tokens_seen": 6261968, + "step": 389 + }, + { + "epoch": 0.027318815834405947, + "grad_norm": 4.911194324493408, + "learning_rate": 9.72706830122592e-05, + "loss": 1.3016, + "num_input_tokens_seen": 6276664, + "step": 390 + }, + { + "epoch": 0.027388864080135192, + "grad_norm": 4.057498931884766, + "learning_rate": 9.726368476357269e-05, + "loss": 1.026, + "num_input_tokens_seen": 6293048, + "step": 391 + }, + { + "epoch": 0.027458912325864438, + "grad_norm": 3.9827401638031006, + "learning_rate": 9.725668651488616e-05, + "loss": 1.136, + "num_input_tokens_seen": 6309432, + "step": 392 + }, + { + "epoch": 0.027528960571593684, + "grad_norm": 4.640822887420654, + "learning_rate": 9.724968826619965e-05, + "loss": 1.2823, + "num_input_tokens_seen": 6325568, + "step": 393 + }, + { + "epoch": 0.027599008817322933, + "grad_norm": 4.372538089752197, + "learning_rate": 9.724269001751314e-05, + "loss": 1.0354, + "num_input_tokens_seen": 6341952, + "step": 394 + }, + { + "epoch": 0.02766905706305218, + "grad_norm": 4.018289566040039, + "learning_rate": 9.723569176882663e-05, + "loss": 1.029, + "num_input_tokens_seen": 6358336, + "step": 395 + }, + { + "epoch": 0.027739105308781424, + "grad_norm": 4.440858364105225, + "learning_rate": 9.722869352014012e-05, + "loss": 1.2272, + "num_input_tokens_seen": 6374680, + "step": 396 + }, + { + "epoch": 0.02780915355451067, + "grad_norm": 4.246788024902344, + "learning_rate": 9.722169527145359e-05, + "loss": 1.0161, + "num_input_tokens_seen": 6390672, + "step": 397 + }, + { + "epoch": 0.027879201800239915, + "grad_norm": 4.27274751663208, + "learning_rate": 9.721469702276708e-05, + "loss": 1.293, + "num_input_tokens_seen": 6407056, + "step": 398 + }, + { + "epoch": 0.02794925004596916, + "grad_norm": 4.171760559082031, + "learning_rate": 9.720769877408056e-05, + "loss": 1.2766, + "num_input_tokens_seen": 6423440, + "step": 399 + }, + { + "epoch": 0.028019298291698407, + "grad_norm": 4.174622535705566, + "learning_rate": 9.720070052539405e-05, + "loss": 1.049, + "num_input_tokens_seen": 6439824, + "step": 400 + }, + { + "epoch": 0.028019298291698407, + "eval_loss": 1.1994441747665405, + "eval_runtime": 0.2131, + "eval_samples_per_second": 4.693, + "eval_steps_per_second": 4.693, + "num_input_tokens_seen": 6439824, + "step": 400 + }, + { + "epoch": 0.028089346537427652, + "grad_norm": 4.199150562286377, + "learning_rate": 9.719370227670753e-05, + "loss": 1.3432, + "num_input_tokens_seen": 6456208, + "step": 401 + }, + { + "epoch": 0.028159394783156898, + "grad_norm": 3.9011733531951904, + "learning_rate": 9.718670402802102e-05, + "loss": 1.0895, + "num_input_tokens_seen": 6472592, + "step": 402 + }, + { + "epoch": 0.028229443028886147, + "grad_norm": 4.142306327819824, + "learning_rate": 9.717970577933451e-05, + "loss": 0.9031, + "num_input_tokens_seen": 6488976, + "step": 403 + }, + { + "epoch": 0.028299491274615392, + "grad_norm": 3.9745633602142334, + "learning_rate": 9.717270753064799e-05, + "loss": 0.9951, + "num_input_tokens_seen": 6505360, + "step": 404 + }, + { + "epoch": 0.028369539520344638, + "grad_norm": 3.838865280151367, + "learning_rate": 9.716570928196147e-05, + "loss": 0.809, + "num_input_tokens_seen": 6521744, + "step": 405 + }, + { + "epoch": 0.028439587766073884, + "grad_norm": 4.48146390914917, + "learning_rate": 9.715871103327496e-05, + "loss": 1.4985, + "num_input_tokens_seen": 6538128, + "step": 406 + }, + { + "epoch": 0.02850963601180313, + "grad_norm": 4.393556594848633, + "learning_rate": 9.715171278458844e-05, + "loss": 1.2355, + "num_input_tokens_seen": 6554512, + "step": 407 + }, + { + "epoch": 0.028579684257532375, + "grad_norm": 3.970860004425049, + "learning_rate": 9.714471453590194e-05, + "loss": 1.1513, + "num_input_tokens_seen": 6570896, + "step": 408 + }, + { + "epoch": 0.02864973250326162, + "grad_norm": 4.166610240936279, + "learning_rate": 9.713771628721542e-05, + "loss": 1.108, + "num_input_tokens_seen": 6587216, + "step": 409 + }, + { + "epoch": 0.028719780748990866, + "grad_norm": 3.9887096881866455, + "learning_rate": 9.71307180385289e-05, + "loss": 1.1639, + "num_input_tokens_seen": 6603600, + "step": 410 + }, + { + "epoch": 0.028789828994720112, + "grad_norm": 4.195802211761475, + "learning_rate": 9.712371978984239e-05, + "loss": 1.1478, + "num_input_tokens_seen": 6619984, + "step": 411 + }, + { + "epoch": 0.02885987724044936, + "grad_norm": 4.011331081390381, + "learning_rate": 9.711672154115587e-05, + "loss": 0.9554, + "num_input_tokens_seen": 6635904, + "step": 412 + }, + { + "epoch": 0.028929925486178606, + "grad_norm": 4.4170026779174805, + "learning_rate": 9.710972329246936e-05, + "loss": 1.1452, + "num_input_tokens_seen": 6651944, + "step": 413 + }, + { + "epoch": 0.028999973731907852, + "grad_norm": 4.073450088500977, + "learning_rate": 9.710272504378284e-05, + "loss": 1.1187, + "num_input_tokens_seen": 6668096, + "step": 414 + }, + { + "epoch": 0.029070021977637098, + "grad_norm": 4.161722183227539, + "learning_rate": 9.709572679509633e-05, + "loss": 1.1603, + "num_input_tokens_seen": 6684480, + "step": 415 + }, + { + "epoch": 0.029140070223366343, + "grad_norm": 4.540097713470459, + "learning_rate": 9.708872854640981e-05, + "loss": 1.2143, + "num_input_tokens_seen": 6700536, + "step": 416 + }, + { + "epoch": 0.02921011846909559, + "grad_norm": 4.030871868133545, + "learning_rate": 9.70817302977233e-05, + "loss": 0.9791, + "num_input_tokens_seen": 6716920, + "step": 417 + }, + { + "epoch": 0.029280166714824835, + "grad_norm": 4.1743268966674805, + "learning_rate": 9.707473204903679e-05, + "loss": 0.9818, + "num_input_tokens_seen": 6733304, + "step": 418 + }, + { + "epoch": 0.02935021496055408, + "grad_norm": 4.227272987365723, + "learning_rate": 9.706773380035026e-05, + "loss": 1.0945, + "num_input_tokens_seen": 6749688, + "step": 419 + }, + { + "epoch": 0.02942026320628333, + "grad_norm": 4.406428813934326, + "learning_rate": 9.706073555166375e-05, + "loss": 1.0302, + "num_input_tokens_seen": 6766072, + "step": 420 + }, + { + "epoch": 0.029490311452012575, + "grad_norm": 4.17899227142334, + "learning_rate": 9.705373730297724e-05, + "loss": 1.1048, + "num_input_tokens_seen": 6782456, + "step": 421 + }, + { + "epoch": 0.02956035969774182, + "grad_norm": 4.034752368927002, + "learning_rate": 9.704673905429073e-05, + "loss": 1.2639, + "num_input_tokens_seen": 6798840, + "step": 422 + }, + { + "epoch": 0.029630407943471066, + "grad_norm": 4.795727729797363, + "learning_rate": 9.703974080560421e-05, + "loss": 1.2448, + "num_input_tokens_seen": 6814912, + "step": 423 + }, + { + "epoch": 0.029700456189200312, + "grad_norm": 4.509056568145752, + "learning_rate": 9.703274255691769e-05, + "loss": 1.2157, + "num_input_tokens_seen": 6830720, + "step": 424 + }, + { + "epoch": 0.029770504434929557, + "grad_norm": 4.064620494842529, + "learning_rate": 9.702574430823118e-05, + "loss": 1.2042, + "num_input_tokens_seen": 6847104, + "step": 425 + }, + { + "epoch": 0.029840552680658803, + "grad_norm": 3.9060182571411133, + "learning_rate": 9.701874605954465e-05, + "loss": 0.9116, + "num_input_tokens_seen": 6862952, + "step": 426 + }, + { + "epoch": 0.02991060092638805, + "grad_norm": 3.9900951385498047, + "learning_rate": 9.701174781085814e-05, + "loss": 1.1621, + "num_input_tokens_seen": 6879336, + "step": 427 + }, + { + "epoch": 0.029980649172117294, + "grad_norm": 4.371436595916748, + "learning_rate": 9.700474956217164e-05, + "loss": 1.2731, + "num_input_tokens_seen": 6895720, + "step": 428 + }, + { + "epoch": 0.030050697417846543, + "grad_norm": 3.9422085285186768, + "learning_rate": 9.699775131348512e-05, + "loss": 0.9636, + "num_input_tokens_seen": 6912104, + "step": 429 + }, + { + "epoch": 0.03012074566357579, + "grad_norm": 4.080913543701172, + "learning_rate": 9.699075306479861e-05, + "loss": 1.1507, + "num_input_tokens_seen": 6928488, + "step": 430 + }, + { + "epoch": 0.030190793909305035, + "grad_norm": 4.493942737579346, + "learning_rate": 9.698375481611208e-05, + "loss": 1.2274, + "num_input_tokens_seen": 6944664, + "step": 431 + }, + { + "epoch": 0.03026084215503428, + "grad_norm": 4.073723793029785, + "learning_rate": 9.697675656742557e-05, + "loss": 1.0498, + "num_input_tokens_seen": 6960344, + "step": 432 + }, + { + "epoch": 0.030330890400763526, + "grad_norm": 3.9672274589538574, + "learning_rate": 9.696975831873906e-05, + "loss": 1.007, + "num_input_tokens_seen": 6976720, + "step": 433 + }, + { + "epoch": 0.03040093864649277, + "grad_norm": 4.497872829437256, + "learning_rate": 9.696276007005255e-05, + "loss": 1.1339, + "num_input_tokens_seen": 6992552, + "step": 434 + }, + { + "epoch": 0.030470986892222017, + "grad_norm": 4.422168731689453, + "learning_rate": 9.695576182136604e-05, + "loss": 1.34, + "num_input_tokens_seen": 7008936, + "step": 435 + }, + { + "epoch": 0.030541035137951263, + "grad_norm": 4.3009138107299805, + "learning_rate": 9.694876357267951e-05, + "loss": 1.2479, + "num_input_tokens_seen": 7024512, + "step": 436 + }, + { + "epoch": 0.030611083383680508, + "grad_norm": 4.04030704498291, + "learning_rate": 9.6941765323993e-05, + "loss": 1.097, + "num_input_tokens_seen": 7040896, + "step": 437 + }, + { + "epoch": 0.030681131629409757, + "grad_norm": 3.877417802810669, + "learning_rate": 9.693476707530649e-05, + "loss": 1.1363, + "num_input_tokens_seen": 7057280, + "step": 438 + }, + { + "epoch": 0.030751179875139003, + "grad_norm": 3.8185505867004395, + "learning_rate": 9.692776882661996e-05, + "loss": 0.9067, + "num_input_tokens_seen": 7072544, + "step": 439 + }, + { + "epoch": 0.03082122812086825, + "grad_norm": 4.028950214385986, + "learning_rate": 9.692077057793345e-05, + "loss": 1.1195, + "num_input_tokens_seen": 7088928, + "step": 440 + }, + { + "epoch": 0.030891276366597494, + "grad_norm": 4.2786431312561035, + "learning_rate": 9.691377232924694e-05, + "loss": 1.1199, + "num_input_tokens_seen": 7105248, + "step": 441 + }, + { + "epoch": 0.03096132461232674, + "grad_norm": 4.193462371826172, + "learning_rate": 9.690677408056043e-05, + "loss": 1.1812, + "num_input_tokens_seen": 7121008, + "step": 442 + }, + { + "epoch": 0.031031372858055985, + "grad_norm": 3.93597412109375, + "learning_rate": 9.68997758318739e-05, + "loss": 1.0677, + "num_input_tokens_seen": 7136944, + "step": 443 + }, + { + "epoch": 0.03110142110378523, + "grad_norm": 4.3208537101745605, + "learning_rate": 9.68927775831874e-05, + "loss": 1.1358, + "num_input_tokens_seen": 7152928, + "step": 444 + }, + { + "epoch": 0.031171469349514477, + "grad_norm": 3.9743378162384033, + "learning_rate": 9.688577933450088e-05, + "loss": 1.094, + "num_input_tokens_seen": 7169312, + "step": 445 + }, + { + "epoch": 0.031241517595243726, + "grad_norm": 4.226114273071289, + "learning_rate": 9.687878108581436e-05, + "loss": 1.1752, + "num_input_tokens_seen": 7185696, + "step": 446 + }, + { + "epoch": 0.03131156584097297, + "grad_norm": 4.210222244262695, + "learning_rate": 9.687178283712785e-05, + "loss": 1.1262, + "num_input_tokens_seen": 7201784, + "step": 447 + }, + { + "epoch": 0.03138161408670222, + "grad_norm": 4.311635971069336, + "learning_rate": 9.686478458844133e-05, + "loss": 1.2491, + "num_input_tokens_seen": 7218168, + "step": 448 + }, + { + "epoch": 0.03145166233243146, + "grad_norm": 4.56603479385376, + "learning_rate": 9.685778633975482e-05, + "loss": 1.3512, + "num_input_tokens_seen": 7233360, + "step": 449 + }, + { + "epoch": 0.03152171057816071, + "grad_norm": 4.232856750488281, + "learning_rate": 9.685078809106831e-05, + "loss": 0.9387, + "num_input_tokens_seen": 7248280, + "step": 450 + }, + { + "epoch": 0.031591758823889954, + "grad_norm": 4.512947082519531, + "learning_rate": 9.684378984238179e-05, + "loss": 1.1988, + "num_input_tokens_seen": 7264664, + "step": 451 + }, + { + "epoch": 0.0316618070696192, + "grad_norm": 4.273897171020508, + "learning_rate": 9.683679159369528e-05, + "loss": 1.2523, + "num_input_tokens_seen": 7281048, + "step": 452 + }, + { + "epoch": 0.031731855315348445, + "grad_norm": 4.288438320159912, + "learning_rate": 9.682979334500875e-05, + "loss": 1.1692, + "num_input_tokens_seen": 7297424, + "step": 453 + }, + { + "epoch": 0.03180190356107769, + "grad_norm": 4.27367639541626, + "learning_rate": 9.682279509632225e-05, + "loss": 1.1868, + "num_input_tokens_seen": 7312792, + "step": 454 + }, + { + "epoch": 0.031871951806806936, + "grad_norm": 3.978926181793213, + "learning_rate": 9.681579684763574e-05, + "loss": 1.0382, + "num_input_tokens_seen": 7329176, + "step": 455 + }, + { + "epoch": 0.03194200005253618, + "grad_norm": 4.4399919509887695, + "learning_rate": 9.680879859894922e-05, + "loss": 1.2072, + "num_input_tokens_seen": 7345560, + "step": 456 + }, + { + "epoch": 0.03201204829826543, + "grad_norm": 3.9786529541015625, + "learning_rate": 9.68018003502627e-05, + "loss": 1.1704, + "num_input_tokens_seen": 7361944, + "step": 457 + }, + { + "epoch": 0.03208209654399467, + "grad_norm": 4.171195030212402, + "learning_rate": 9.679480210157618e-05, + "loss": 1.1307, + "num_input_tokens_seen": 7378328, + "step": 458 + }, + { + "epoch": 0.032152144789723926, + "grad_norm": 3.9415268898010254, + "learning_rate": 9.678780385288967e-05, + "loss": 0.9971, + "num_input_tokens_seen": 7394208, + "step": 459 + }, + { + "epoch": 0.03222219303545317, + "grad_norm": 4.066036224365234, + "learning_rate": 9.678080560420316e-05, + "loss": 1.1227, + "num_input_tokens_seen": 7410328, + "step": 460 + }, + { + "epoch": 0.03229224128118242, + "grad_norm": 4.22513484954834, + "learning_rate": 9.677380735551665e-05, + "loss": 1.0883, + "num_input_tokens_seen": 7426712, + "step": 461 + }, + { + "epoch": 0.03236228952691166, + "grad_norm": 4.310954570770264, + "learning_rate": 9.676680910683013e-05, + "loss": 1.1695, + "num_input_tokens_seen": 7442736, + "step": 462 + }, + { + "epoch": 0.03243233777264091, + "grad_norm": 4.2868828773498535, + "learning_rate": 9.675981085814361e-05, + "loss": 1.0594, + "num_input_tokens_seen": 7458560, + "step": 463 + }, + { + "epoch": 0.032502386018370154, + "grad_norm": 4.318186283111572, + "learning_rate": 9.67528126094571e-05, + "loss": 1.1791, + "num_input_tokens_seen": 7474944, + "step": 464 + }, + { + "epoch": 0.0325724342640994, + "grad_norm": 4.040421009063721, + "learning_rate": 9.674581436077059e-05, + "loss": 1.0649, + "num_input_tokens_seen": 7490344, + "step": 465 + }, + { + "epoch": 0.032642482509828645, + "grad_norm": 3.914815902709961, + "learning_rate": 9.673881611208406e-05, + "loss": 1.1381, + "num_input_tokens_seen": 7506728, + "step": 466 + }, + { + "epoch": 0.03271253075555789, + "grad_norm": 4.054527282714844, + "learning_rate": 9.673181786339755e-05, + "loss": 1.2264, + "num_input_tokens_seen": 7522912, + "step": 467 + }, + { + "epoch": 0.032782579001287136, + "grad_norm": 4.295147895812988, + "learning_rate": 9.672481961471104e-05, + "loss": 1.1369, + "num_input_tokens_seen": 7539040, + "step": 468 + }, + { + "epoch": 0.03285262724701638, + "grad_norm": 4.109183311462402, + "learning_rate": 9.671782136602453e-05, + "loss": 1.1676, + "num_input_tokens_seen": 7555424, + "step": 469 + }, + { + "epoch": 0.03292267549274563, + "grad_norm": 4.131369590759277, + "learning_rate": 9.6710823117338e-05, + "loss": 1.1188, + "num_input_tokens_seen": 7571808, + "step": 470 + }, + { + "epoch": 0.03299272373847487, + "grad_norm": 3.998414993286133, + "learning_rate": 9.670382486865149e-05, + "loss": 1.0201, + "num_input_tokens_seen": 7587528, + "step": 471 + }, + { + "epoch": 0.03306277198420412, + "grad_norm": 4.1235551834106445, + "learning_rate": 9.669682661996498e-05, + "loss": 1.1265, + "num_input_tokens_seen": 7603912, + "step": 472 + }, + { + "epoch": 0.033132820229933364, + "grad_norm": 4.800798416137695, + "learning_rate": 9.668982837127845e-05, + "loss": 1.3634, + "num_input_tokens_seen": 7617512, + "step": 473 + }, + { + "epoch": 0.03320286847566261, + "grad_norm": 4.068000316619873, + "learning_rate": 9.668283012259196e-05, + "loss": 1.1427, + "num_input_tokens_seen": 7633040, + "step": 474 + }, + { + "epoch": 0.033272916721391856, + "grad_norm": 4.0715484619140625, + "learning_rate": 9.667583187390543e-05, + "loss": 1.0633, + "num_input_tokens_seen": 7648416, + "step": 475 + }, + { + "epoch": 0.0333429649671211, + "grad_norm": 3.937807321548462, + "learning_rate": 9.666883362521892e-05, + "loss": 1.1393, + "num_input_tokens_seen": 7664624, + "step": 476 + }, + { + "epoch": 0.033413013212850354, + "grad_norm": 4.195656776428223, + "learning_rate": 9.666183537653241e-05, + "loss": 1.1801, + "num_input_tokens_seen": 7680480, + "step": 477 + }, + { + "epoch": 0.0334830614585796, + "grad_norm": 4.227575778961182, + "learning_rate": 9.665483712784588e-05, + "loss": 1.0453, + "num_input_tokens_seen": 7696632, + "step": 478 + }, + { + "epoch": 0.033553109704308845, + "grad_norm": 4.328822135925293, + "learning_rate": 9.664783887915937e-05, + "loss": 1.221, + "num_input_tokens_seen": 7713016, + "step": 479 + }, + { + "epoch": 0.03362315795003809, + "grad_norm": 4.086736679077148, + "learning_rate": 9.664084063047286e-05, + "loss": 1.2817, + "num_input_tokens_seen": 7729400, + "step": 480 + }, + { + "epoch": 0.033693206195767336, + "grad_norm": 4.555233955383301, + "learning_rate": 9.663384238178635e-05, + "loss": 1.483, + "num_input_tokens_seen": 7745784, + "step": 481 + }, + { + "epoch": 0.03376325444149658, + "grad_norm": 4.118983745574951, + "learning_rate": 9.662684413309984e-05, + "loss": 0.9139, + "num_input_tokens_seen": 7762168, + "step": 482 + }, + { + "epoch": 0.03383330268722583, + "grad_norm": 4.232059001922607, + "learning_rate": 9.661984588441331e-05, + "loss": 1.1269, + "num_input_tokens_seen": 7777920, + "step": 483 + }, + { + "epoch": 0.03390335093295507, + "grad_norm": 6.288865089416504, + "learning_rate": 9.66128476357268e-05, + "loss": 1.0642, + "num_input_tokens_seen": 7794304, + "step": 484 + }, + { + "epoch": 0.03397339917868432, + "grad_norm": 4.133046627044678, + "learning_rate": 9.660584938704028e-05, + "loss": 1.2067, + "num_input_tokens_seen": 7810200, + "step": 485 + }, + { + "epoch": 0.034043447424413564, + "grad_norm": 4.147965431213379, + "learning_rate": 9.659885113835377e-05, + "loss": 1.0367, + "num_input_tokens_seen": 7826384, + "step": 486 + }, + { + "epoch": 0.03411349567014281, + "grad_norm": 4.1191020011901855, + "learning_rate": 9.659185288966725e-05, + "loss": 1.0972, + "num_input_tokens_seen": 7841704, + "step": 487 + }, + { + "epoch": 0.034183543915872056, + "grad_norm": 4.518441677093506, + "learning_rate": 9.658485464098074e-05, + "loss": 1.263, + "num_input_tokens_seen": 7858088, + "step": 488 + }, + { + "epoch": 0.0342535921616013, + "grad_norm": 4.321181297302246, + "learning_rate": 9.657785639229423e-05, + "loss": 1.1378, + "num_input_tokens_seen": 7874472, + "step": 489 + }, + { + "epoch": 0.03432364040733055, + "grad_norm": 4.366185665130615, + "learning_rate": 9.65708581436077e-05, + "loss": 1.1636, + "num_input_tokens_seen": 7890856, + "step": 490 + }, + { + "epoch": 0.03439368865305979, + "grad_norm": 4.042731761932373, + "learning_rate": 9.65638598949212e-05, + "loss": 1.0601, + "num_input_tokens_seen": 7906776, + "step": 491 + }, + { + "epoch": 0.03446373689878904, + "grad_norm": 3.743668556213379, + "learning_rate": 9.655686164623468e-05, + "loss": 1.0441, + "num_input_tokens_seen": 7923160, + "step": 492 + }, + { + "epoch": 0.034533785144518284, + "grad_norm": 3.8547139167785645, + "learning_rate": 9.654986339754816e-05, + "loss": 1.0842, + "num_input_tokens_seen": 7939296, + "step": 493 + }, + { + "epoch": 0.034603833390247536, + "grad_norm": 4.238414287567139, + "learning_rate": 9.654286514886166e-05, + "loss": 1.2498, + "num_input_tokens_seen": 7955504, + "step": 494 + }, + { + "epoch": 0.03467388163597678, + "grad_norm": 4.134857177734375, + "learning_rate": 9.653586690017514e-05, + "loss": 1.1241, + "num_input_tokens_seen": 7971888, + "step": 495 + }, + { + "epoch": 0.03474392988170603, + "grad_norm": 4.2501983642578125, + "learning_rate": 9.652886865148862e-05, + "loss": 1.1829, + "num_input_tokens_seen": 7988272, + "step": 496 + }, + { + "epoch": 0.03481397812743527, + "grad_norm": 7.4397053718566895, + "learning_rate": 9.65218704028021e-05, + "loss": 0.9952, + "num_input_tokens_seen": 8003744, + "step": 497 + }, + { + "epoch": 0.03488402637316452, + "grad_norm": 4.2750959396362305, + "learning_rate": 9.651487215411559e-05, + "loss": 1.2387, + "num_input_tokens_seen": 8019184, + "step": 498 + }, + { + "epoch": 0.034954074618893764, + "grad_norm": 4.156162261962891, + "learning_rate": 9.650787390542908e-05, + "loss": 1.1201, + "num_input_tokens_seen": 8035176, + "step": 499 + }, + { + "epoch": 0.03502412286462301, + "grad_norm": 4.178225040435791, + "learning_rate": 9.650087565674257e-05, + "loss": 1.2026, + "num_input_tokens_seen": 8051560, + "step": 500 + }, + { + "epoch": 0.035094171110352256, + "grad_norm": 4.147096157073975, + "learning_rate": 9.649387740805605e-05, + "loss": 1.2465, + "num_input_tokens_seen": 8067944, + "step": 501 + }, + { + "epoch": 0.0351642193560815, + "grad_norm": 4.329249858856201, + "learning_rate": 9.648687915936953e-05, + "loss": 1.2742, + "num_input_tokens_seen": 8083824, + "step": 502 + }, + { + "epoch": 0.03523426760181075, + "grad_norm": 4.404232978820801, + "learning_rate": 9.647988091068302e-05, + "loss": 1.1511, + "num_input_tokens_seen": 8100208, + "step": 503 + }, + { + "epoch": 0.03530431584753999, + "grad_norm": 4.190586090087891, + "learning_rate": 9.64728826619965e-05, + "loss": 0.9884, + "num_input_tokens_seen": 8116048, + "step": 504 + }, + { + "epoch": 0.03537436409326924, + "grad_norm": 4.262845516204834, + "learning_rate": 9.646588441330998e-05, + "loss": 1.1321, + "num_input_tokens_seen": 8132432, + "step": 505 + }, + { + "epoch": 0.035444412338998484, + "grad_norm": 4.452746391296387, + "learning_rate": 9.645888616462347e-05, + "loss": 1.1667, + "num_input_tokens_seen": 8148816, + "step": 506 + }, + { + "epoch": 0.03551446058472773, + "grad_norm": 4.111443042755127, + "learning_rate": 9.645188791593696e-05, + "loss": 1.0049, + "num_input_tokens_seen": 8164856, + "step": 507 + }, + { + "epoch": 0.035584508830456975, + "grad_norm": 4.292227268218994, + "learning_rate": 9.644488966725045e-05, + "loss": 1.1535, + "num_input_tokens_seen": 8181240, + "step": 508 + }, + { + "epoch": 0.03565455707618622, + "grad_norm": 4.295238971710205, + "learning_rate": 9.643789141856394e-05, + "loss": 1.236, + "num_input_tokens_seen": 8197624, + "step": 509 + }, + { + "epoch": 0.035724605321915466, + "grad_norm": 3.930659294128418, + "learning_rate": 9.643089316987741e-05, + "loss": 0.9195, + "num_input_tokens_seen": 8213816, + "step": 510 + }, + { + "epoch": 0.03579465356764472, + "grad_norm": 4.092316150665283, + "learning_rate": 9.64238949211909e-05, + "loss": 1.0799, + "num_input_tokens_seen": 8229632, + "step": 511 + }, + { + "epoch": 0.035864701813373964, + "grad_norm": 4.2939252853393555, + "learning_rate": 9.641689667250437e-05, + "loss": 1.111, + "num_input_tokens_seen": 8245232, + "step": 512 + }, + { + "epoch": 0.03593475005910321, + "grad_norm": 4.191503524780273, + "learning_rate": 9.640989842381786e-05, + "loss": 0.9399, + "num_input_tokens_seen": 8260912, + "step": 513 + }, + { + "epoch": 0.036004798304832455, + "grad_norm": 4.141485214233398, + "learning_rate": 9.640290017513136e-05, + "loss": 1.1334, + "num_input_tokens_seen": 8276864, + "step": 514 + }, + { + "epoch": 0.0360748465505617, + "grad_norm": 3.890547752380371, + "learning_rate": 9.639590192644484e-05, + "loss": 1.0055, + "num_input_tokens_seen": 8292720, + "step": 515 + }, + { + "epoch": 0.03614489479629095, + "grad_norm": 4.405922889709473, + "learning_rate": 9.638890367775833e-05, + "loss": 1.2238, + "num_input_tokens_seen": 8309104, + "step": 516 + }, + { + "epoch": 0.03621494304202019, + "grad_norm": 4.207942485809326, + "learning_rate": 9.63819054290718e-05, + "loss": 1.0688, + "num_input_tokens_seen": 8325304, + "step": 517 + }, + { + "epoch": 0.03628499128774944, + "grad_norm": 4.174366474151611, + "learning_rate": 9.637490718038529e-05, + "loss": 1.2303, + "num_input_tokens_seen": 8341688, + "step": 518 + }, + { + "epoch": 0.036355039533478684, + "grad_norm": 3.9641714096069336, + "learning_rate": 9.636790893169878e-05, + "loss": 1.2244, + "num_input_tokens_seen": 8357760, + "step": 519 + }, + { + "epoch": 0.03642508777920793, + "grad_norm": 5.832678318023682, + "learning_rate": 9.636091068301227e-05, + "loss": 1.0645, + "num_input_tokens_seen": 8372712, + "step": 520 + }, + { + "epoch": 0.036495136024937175, + "grad_norm": 3.7905161380767822, + "learning_rate": 9.635391243432576e-05, + "loss": 1.0551, + "num_input_tokens_seen": 8389096, + "step": 521 + }, + { + "epoch": 0.03656518427066642, + "grad_norm": 3.6744072437286377, + "learning_rate": 9.634691418563923e-05, + "loss": 1.0687, + "num_input_tokens_seen": 8405216, + "step": 522 + }, + { + "epoch": 0.036635232516395666, + "grad_norm": 4.897486209869385, + "learning_rate": 9.633991593695272e-05, + "loss": 1.1968, + "num_input_tokens_seen": 8421600, + "step": 523 + }, + { + "epoch": 0.03670528076212491, + "grad_norm": 3.821457862854004, + "learning_rate": 9.63329176882662e-05, + "loss": 1.0473, + "num_input_tokens_seen": 8437984, + "step": 524 + }, + { + "epoch": 0.03677532900785416, + "grad_norm": 3.873832941055298, + "learning_rate": 9.632591943957969e-05, + "loss": 0.9656, + "num_input_tokens_seen": 8453760, + "step": 525 + }, + { + "epoch": 0.0368453772535834, + "grad_norm": 4.139901161193848, + "learning_rate": 9.631892119089317e-05, + "loss": 1.0881, + "num_input_tokens_seen": 8470144, + "step": 526 + }, + { + "epoch": 0.03691542549931265, + "grad_norm": 3.9512782096862793, + "learning_rate": 9.631192294220666e-05, + "loss": 1.1093, + "num_input_tokens_seen": 8486528, + "step": 527 + }, + { + "epoch": 0.0369854737450419, + "grad_norm": 3.8937103748321533, + "learning_rate": 9.630492469352015e-05, + "loss": 0.9722, + "num_input_tokens_seen": 8502912, + "step": 528 + }, + { + "epoch": 0.03705552199077115, + "grad_norm": 4.482640743255615, + "learning_rate": 9.629792644483363e-05, + "loss": 1.056, + "num_input_tokens_seen": 8519296, + "step": 529 + }, + { + "epoch": 0.03712557023650039, + "grad_norm": 4.127941131591797, + "learning_rate": 9.629092819614711e-05, + "loss": 1.0285, + "num_input_tokens_seen": 8535160, + "step": 530 + }, + { + "epoch": 0.03719561848222964, + "grad_norm": 3.973585844039917, + "learning_rate": 9.62839299474606e-05, + "loss": 1.0356, + "num_input_tokens_seen": 8551256, + "step": 531 + }, + { + "epoch": 0.037265666727958884, + "grad_norm": 4.22855281829834, + "learning_rate": 9.627693169877408e-05, + "loss": 1.134, + "num_input_tokens_seen": 8567640, + "step": 532 + }, + { + "epoch": 0.03733571497368813, + "grad_norm": 4.144021511077881, + "learning_rate": 9.626993345008757e-05, + "loss": 1.0963, + "num_input_tokens_seen": 8583504, + "step": 533 + }, + { + "epoch": 0.037405763219417375, + "grad_norm": 3.8666226863861084, + "learning_rate": 9.626293520140106e-05, + "loss": 0.912, + "num_input_tokens_seen": 8599888, + "step": 534 + }, + { + "epoch": 0.03747581146514662, + "grad_norm": 4.215412616729736, + "learning_rate": 9.625593695271454e-05, + "loss": 1.1055, + "num_input_tokens_seen": 8616256, + "step": 535 + }, + { + "epoch": 0.037545859710875866, + "grad_norm": 4.353022575378418, + "learning_rate": 9.624893870402803e-05, + "loss": 1.0379, + "num_input_tokens_seen": 8632640, + "step": 536 + }, + { + "epoch": 0.03761590795660511, + "grad_norm": 3.778947591781616, + "learning_rate": 9.624194045534151e-05, + "loss": 1.0547, + "num_input_tokens_seen": 8648624, + "step": 537 + }, + { + "epoch": 0.03768595620233436, + "grad_norm": 4.481568336486816, + "learning_rate": 9.6234942206655e-05, + "loss": 1.3407, + "num_input_tokens_seen": 8664200, + "step": 538 + }, + { + "epoch": 0.0377560044480636, + "grad_norm": 4.066302299499512, + "learning_rate": 9.622794395796847e-05, + "loss": 0.995, + "num_input_tokens_seen": 8680584, + "step": 539 + }, + { + "epoch": 0.03782605269379285, + "grad_norm": 4.262768268585205, + "learning_rate": 9.622094570928197e-05, + "loss": 1.3054, + "num_input_tokens_seen": 8696968, + "step": 540 + }, + { + "epoch": 0.037896100939522094, + "grad_norm": 3.777597665786743, + "learning_rate": 9.621394746059546e-05, + "loss": 0.9831, + "num_input_tokens_seen": 8713352, + "step": 541 + }, + { + "epoch": 0.03796614918525134, + "grad_norm": 3.9732742309570312, + "learning_rate": 9.620694921190894e-05, + "loss": 1.0699, + "num_input_tokens_seen": 8729048, + "step": 542 + }, + { + "epoch": 0.038036197430980585, + "grad_norm": 4.543329238891602, + "learning_rate": 9.619995096322243e-05, + "loss": 1.1546, + "num_input_tokens_seen": 8745432, + "step": 543 + }, + { + "epoch": 0.03810624567670983, + "grad_norm": 4.903865814208984, + "learning_rate": 9.61929527145359e-05, + "loss": 1.1548, + "num_input_tokens_seen": 8760296, + "step": 544 + }, + { + "epoch": 0.03817629392243908, + "grad_norm": 4.197691917419434, + "learning_rate": 9.618595446584939e-05, + "loss": 1.1616, + "num_input_tokens_seen": 8776680, + "step": 545 + }, + { + "epoch": 0.03824634216816833, + "grad_norm": 3.912689208984375, + "learning_rate": 9.617895621716288e-05, + "loss": 0.9926, + "num_input_tokens_seen": 8793064, + "step": 546 + }, + { + "epoch": 0.038316390413897575, + "grad_norm": 4.291840076446533, + "learning_rate": 9.617195796847637e-05, + "loss": 1.1943, + "num_input_tokens_seen": 8809448, + "step": 547 + }, + { + "epoch": 0.03838643865962682, + "grad_norm": 3.9053072929382324, + "learning_rate": 9.616495971978985e-05, + "loss": 1.2437, + "num_input_tokens_seen": 8825536, + "step": 548 + }, + { + "epoch": 0.038456486905356066, + "grad_norm": 4.860696315765381, + "learning_rate": 9.615796147110333e-05, + "loss": 1.3045, + "num_input_tokens_seen": 8841920, + "step": 549 + }, + { + "epoch": 0.03852653515108531, + "grad_norm": 3.9394373893737793, + "learning_rate": 9.615096322241682e-05, + "loss": 1.1367, + "num_input_tokens_seen": 8858304, + "step": 550 + }, + { + "epoch": 0.03859658339681456, + "grad_norm": 3.8160409927368164, + "learning_rate": 9.61439649737303e-05, + "loss": 1.0864, + "num_input_tokens_seen": 8874688, + "step": 551 + }, + { + "epoch": 0.0386666316425438, + "grad_norm": 4.3792805671691895, + "learning_rate": 9.613696672504378e-05, + "loss": 1.2516, + "num_input_tokens_seen": 8891072, + "step": 552 + }, + { + "epoch": 0.03873667988827305, + "grad_norm": 4.103452682495117, + "learning_rate": 9.612996847635727e-05, + "loss": 0.9737, + "num_input_tokens_seen": 8907456, + "step": 553 + }, + { + "epoch": 0.038806728134002294, + "grad_norm": 4.117603302001953, + "learning_rate": 9.612297022767076e-05, + "loss": 1.096, + "num_input_tokens_seen": 8923816, + "step": 554 + }, + { + "epoch": 0.03887677637973154, + "grad_norm": 4.272468566894531, + "learning_rate": 9.611597197898425e-05, + "loss": 1.161, + "num_input_tokens_seen": 8939344, + "step": 555 + }, + { + "epoch": 0.038946824625460785, + "grad_norm": 4.323635578155518, + "learning_rate": 9.610897373029772e-05, + "loss": 1.1922, + "num_input_tokens_seen": 8954920, + "step": 556 + }, + { + "epoch": 0.03901687287119003, + "grad_norm": 3.783510684967041, + "learning_rate": 9.610197548161121e-05, + "loss": 1.0658, + "num_input_tokens_seen": 8971304, + "step": 557 + }, + { + "epoch": 0.039086921116919277, + "grad_norm": 4.3757548332214355, + "learning_rate": 9.60949772329247e-05, + "loss": 1.3186, + "num_input_tokens_seen": 8987672, + "step": 558 + }, + { + "epoch": 0.03915696936264852, + "grad_norm": 4.048824787139893, + "learning_rate": 9.608797898423818e-05, + "loss": 1.1452, + "num_input_tokens_seen": 9003896, + "step": 559 + }, + { + "epoch": 0.03922701760837777, + "grad_norm": 4.06865930557251, + "learning_rate": 9.608098073555168e-05, + "loss": 0.9861, + "num_input_tokens_seen": 9020280, + "step": 560 + }, + { + "epoch": 0.03929706585410701, + "grad_norm": 3.966737747192383, + "learning_rate": 9.607398248686515e-05, + "loss": 1.0323, + "num_input_tokens_seen": 9036280, + "step": 561 + }, + { + "epoch": 0.03936711409983626, + "grad_norm": 4.466656684875488, + "learning_rate": 9.606698423817864e-05, + "loss": 1.2462, + "num_input_tokens_seen": 9052664, + "step": 562 + }, + { + "epoch": 0.03943716234556551, + "grad_norm": 4.312132358551025, + "learning_rate": 9.605998598949213e-05, + "loss": 1.2133, + "num_input_tokens_seen": 9068832, + "step": 563 + }, + { + "epoch": 0.03950721059129476, + "grad_norm": 3.9202895164489746, + "learning_rate": 9.60529877408056e-05, + "loss": 1.0723, + "num_input_tokens_seen": 9084680, + "step": 564 + }, + { + "epoch": 0.039577258837024, + "grad_norm": 5.139899730682373, + "learning_rate": 9.604598949211909e-05, + "loss": 1.1165, + "num_input_tokens_seen": 9099792, + "step": 565 + }, + { + "epoch": 0.03964730708275325, + "grad_norm": 4.398557186126709, + "learning_rate": 9.603899124343258e-05, + "loss": 1.1737, + "num_input_tokens_seen": 9116136, + "step": 566 + }, + { + "epoch": 0.039717355328482494, + "grad_norm": 4.350982666015625, + "learning_rate": 9.603199299474607e-05, + "loss": 1.2174, + "num_input_tokens_seen": 9132520, + "step": 567 + }, + { + "epoch": 0.03978740357421174, + "grad_norm": 3.787644386291504, + "learning_rate": 9.602499474605956e-05, + "loss": 0.9914, + "num_input_tokens_seen": 9148856, + "step": 568 + }, + { + "epoch": 0.039857451819940985, + "grad_norm": 4.630245685577393, + "learning_rate": 9.601799649737303e-05, + "loss": 1.4135, + "num_input_tokens_seen": 9164888, + "step": 569 + }, + { + "epoch": 0.03992750006567023, + "grad_norm": 4.063969135284424, + "learning_rate": 9.601099824868652e-05, + "loss": 1.1312, + "num_input_tokens_seen": 9181272, + "step": 570 + }, + { + "epoch": 0.039997548311399476, + "grad_norm": 4.2443413734436035, + "learning_rate": 9.6004e-05, + "loss": 1.1627, + "num_input_tokens_seen": 9197344, + "step": 571 + }, + { + "epoch": 0.04006759655712872, + "grad_norm": 4.396352767944336, + "learning_rate": 9.599700175131349e-05, + "loss": 1.1222, + "num_input_tokens_seen": 9212312, + "step": 572 + }, + { + "epoch": 0.04013764480285797, + "grad_norm": 4.364585876464844, + "learning_rate": 9.599000350262697e-05, + "loss": 1.0522, + "num_input_tokens_seen": 9228696, + "step": 573 + }, + { + "epoch": 0.04020769304858721, + "grad_norm": 3.9348409175872803, + "learning_rate": 9.598300525394046e-05, + "loss": 1.1375, + "num_input_tokens_seen": 9245080, + "step": 574 + }, + { + "epoch": 0.04027774129431646, + "grad_norm": 4.051416873931885, + "learning_rate": 9.597600700525395e-05, + "loss": 1.0265, + "num_input_tokens_seen": 9260752, + "step": 575 + }, + { + "epoch": 0.040347789540045705, + "grad_norm": 4.661770820617676, + "learning_rate": 9.596900875656743e-05, + "loss": 1.192, + "num_input_tokens_seen": 9276792, + "step": 576 + }, + { + "epoch": 0.04041783778577495, + "grad_norm": 4.378422260284424, + "learning_rate": 9.596201050788092e-05, + "loss": 1.0497, + "num_input_tokens_seen": 9292768, + "step": 577 + }, + { + "epoch": 0.040487886031504196, + "grad_norm": 4.4690399169921875, + "learning_rate": 9.595501225919439e-05, + "loss": 1.2398, + "num_input_tokens_seen": 9309152, + "step": 578 + }, + { + "epoch": 0.04055793427723344, + "grad_norm": 4.1711273193359375, + "learning_rate": 9.594801401050788e-05, + "loss": 1.097, + "num_input_tokens_seen": 9325536, + "step": 579 + }, + { + "epoch": 0.040627982522962694, + "grad_norm": 3.8115949630737305, + "learning_rate": 9.594101576182137e-05, + "loss": 1.0317, + "num_input_tokens_seen": 9341920, + "step": 580 + }, + { + "epoch": 0.04069803076869194, + "grad_norm": 4.072190284729004, + "learning_rate": 9.593401751313486e-05, + "loss": 1.0649, + "num_input_tokens_seen": 9357904, + "step": 581 + }, + { + "epoch": 0.040768079014421185, + "grad_norm": 3.895766258239746, + "learning_rate": 9.592701926444835e-05, + "loss": 1.1906, + "num_input_tokens_seen": 9373496, + "step": 582 + }, + { + "epoch": 0.04083812726015043, + "grad_norm": 4.026490688323975, + "learning_rate": 9.592002101576182e-05, + "loss": 0.9913, + "num_input_tokens_seen": 9389824, + "step": 583 + }, + { + "epoch": 0.040908175505879676, + "grad_norm": 3.612987518310547, + "learning_rate": 9.591302276707531e-05, + "loss": 0.9376, + "num_input_tokens_seen": 9406208, + "step": 584 + }, + { + "epoch": 0.04097822375160892, + "grad_norm": 4.4619646072387695, + "learning_rate": 9.59060245183888e-05, + "loss": 1.2198, + "num_input_tokens_seen": 9422592, + "step": 585 + }, + { + "epoch": 0.04104827199733817, + "grad_norm": 3.990372896194458, + "learning_rate": 9.589902626970229e-05, + "loss": 1.082, + "num_input_tokens_seen": 9438816, + "step": 586 + }, + { + "epoch": 0.04111832024306741, + "grad_norm": 3.7697947025299072, + "learning_rate": 9.589202802101577e-05, + "loss": 1.0173, + "num_input_tokens_seen": 9455200, + "step": 587 + }, + { + "epoch": 0.04118836848879666, + "grad_norm": 4.066056728363037, + "learning_rate": 9.588502977232925e-05, + "loss": 1.124, + "num_input_tokens_seen": 9471320, + "step": 588 + }, + { + "epoch": 0.041258416734525905, + "grad_norm": 3.913506507873535, + "learning_rate": 9.587803152364274e-05, + "loss": 1.0501, + "num_input_tokens_seen": 9487304, + "step": 589 + }, + { + "epoch": 0.04132846498025515, + "grad_norm": 3.9049429893493652, + "learning_rate": 9.587103327495623e-05, + "loss": 1.0563, + "num_input_tokens_seen": 9503688, + "step": 590 + }, + { + "epoch": 0.041398513225984396, + "grad_norm": 4.316978454589844, + "learning_rate": 9.58640350262697e-05, + "loss": 1.1333, + "num_input_tokens_seen": 9519488, + "step": 591 + }, + { + "epoch": 0.04146856147171364, + "grad_norm": 3.7818517684936523, + "learning_rate": 9.585703677758319e-05, + "loss": 1.0537, + "num_input_tokens_seen": 9535872, + "step": 592 + }, + { + "epoch": 0.04153860971744289, + "grad_norm": 3.8751401901245117, + "learning_rate": 9.585003852889668e-05, + "loss": 1.1745, + "num_input_tokens_seen": 9551928, + "step": 593 + }, + { + "epoch": 0.04160865796317213, + "grad_norm": 4.357265949249268, + "learning_rate": 9.584304028021017e-05, + "loss": 1.1154, + "num_input_tokens_seen": 9568312, + "step": 594 + }, + { + "epoch": 0.04167870620890138, + "grad_norm": 4.184159755706787, + "learning_rate": 9.583604203152366e-05, + "loss": 1.125, + "num_input_tokens_seen": 9583968, + "step": 595 + }, + { + "epoch": 0.041748754454630624, + "grad_norm": 3.9540369510650635, + "learning_rate": 9.582904378283713e-05, + "loss": 1.2032, + "num_input_tokens_seen": 9600152, + "step": 596 + }, + { + "epoch": 0.04181880270035987, + "grad_norm": 4.401122093200684, + "learning_rate": 9.582204553415062e-05, + "loss": 1.4808, + "num_input_tokens_seen": 9615632, + "step": 597 + }, + { + "epoch": 0.04188885094608912, + "grad_norm": 4.418131351470947, + "learning_rate": 9.58150472854641e-05, + "loss": 1.0077, + "num_input_tokens_seen": 9631712, + "step": 598 + }, + { + "epoch": 0.04195889919181837, + "grad_norm": 4.362226963043213, + "learning_rate": 9.580804903677758e-05, + "loss": 1.1614, + "num_input_tokens_seen": 9648096, + "step": 599 + }, + { + "epoch": 0.04202894743754761, + "grad_norm": 4.051177024841309, + "learning_rate": 9.580105078809107e-05, + "loss": 1.0718, + "num_input_tokens_seen": 9663792, + "step": 600 + }, + { + "epoch": 0.04202894743754761, + "eval_loss": 1.1809133291244507, + "eval_runtime": 0.2062, + "eval_samples_per_second": 4.849, + "eval_steps_per_second": 4.849, + "num_input_tokens_seen": 9663792, + "step": 600 + }, + { + "epoch": 0.04209899568327686, + "grad_norm": 4.478739261627197, + "learning_rate": 9.579405253940456e-05, + "loss": 1.1963, + "num_input_tokens_seen": 9680176, + "step": 601 + }, + { + "epoch": 0.042169043929006104, + "grad_norm": 4.05004358291626, + "learning_rate": 9.578705429071805e-05, + "loss": 1.1005, + "num_input_tokens_seen": 9696560, + "step": 602 + }, + { + "epoch": 0.04223909217473535, + "grad_norm": 4.092396259307861, + "learning_rate": 9.578005604203152e-05, + "loss": 1.1796, + "num_input_tokens_seen": 9712944, + "step": 603 + }, + { + "epoch": 0.042309140420464596, + "grad_norm": 4.428014278411865, + "learning_rate": 9.577305779334501e-05, + "loss": 0.9734, + "num_input_tokens_seen": 9729096, + "step": 604 + }, + { + "epoch": 0.04237918866619384, + "grad_norm": 4.202315807342529, + "learning_rate": 9.576605954465849e-05, + "loss": 1.0502, + "num_input_tokens_seen": 9745480, + "step": 605 + }, + { + "epoch": 0.04244923691192309, + "grad_norm": 3.7633514404296875, + "learning_rate": 9.575906129597198e-05, + "loss": 0.9218, + "num_input_tokens_seen": 9761272, + "step": 606 + }, + { + "epoch": 0.04251928515765233, + "grad_norm": 4.170671463012695, + "learning_rate": 9.575206304728548e-05, + "loss": 1.1196, + "num_input_tokens_seen": 9777656, + "step": 607 + }, + { + "epoch": 0.04258933340338158, + "grad_norm": 4.20021915435791, + "learning_rate": 9.574506479859895e-05, + "loss": 1.1146, + "num_input_tokens_seen": 9794032, + "step": 608 + }, + { + "epoch": 0.042659381649110824, + "grad_norm": 4.437755107879639, + "learning_rate": 9.573806654991244e-05, + "loss": 1.0911, + "num_input_tokens_seen": 9809936, + "step": 609 + }, + { + "epoch": 0.04272942989484007, + "grad_norm": 4.417452335357666, + "learning_rate": 9.573106830122592e-05, + "loss": 1.2079, + "num_input_tokens_seen": 9825232, + "step": 610 + }, + { + "epoch": 0.042799478140569315, + "grad_norm": 4.144030570983887, + "learning_rate": 9.57240700525394e-05, + "loss": 1.1229, + "num_input_tokens_seen": 9840648, + "step": 611 + }, + { + "epoch": 0.04286952638629856, + "grad_norm": 3.991605043411255, + "learning_rate": 9.57170718038529e-05, + "loss": 1.0762, + "num_input_tokens_seen": 9857032, + "step": 612 + }, + { + "epoch": 0.042939574632027806, + "grad_norm": 4.516556262969971, + "learning_rate": 9.571007355516638e-05, + "loss": 1.3056, + "num_input_tokens_seen": 9872328, + "step": 613 + }, + { + "epoch": 0.04300962287775705, + "grad_norm": 4.030200481414795, + "learning_rate": 9.570307530647987e-05, + "loss": 0.9493, + "num_input_tokens_seen": 9887832, + "step": 614 + }, + { + "epoch": 0.043079671123486304, + "grad_norm": 4.345893859863281, + "learning_rate": 9.569607705779335e-05, + "loss": 1.2707, + "num_input_tokens_seen": 9904216, + "step": 615 + }, + { + "epoch": 0.04314971936921555, + "grad_norm": 4.158145427703857, + "learning_rate": 9.568907880910684e-05, + "loss": 1.0377, + "num_input_tokens_seen": 9920072, + "step": 616 + }, + { + "epoch": 0.043219767614944796, + "grad_norm": 4.155702590942383, + "learning_rate": 9.568208056042032e-05, + "loss": 1.091, + "num_input_tokens_seen": 9936416, + "step": 617 + }, + { + "epoch": 0.04328981586067404, + "grad_norm": 3.76328444480896, + "learning_rate": 9.56750823117338e-05, + "loss": 1.1011, + "num_input_tokens_seen": 9952456, + "step": 618 + }, + { + "epoch": 0.04335986410640329, + "grad_norm": 4.252495765686035, + "learning_rate": 9.566808406304729e-05, + "loss": 1.0616, + "num_input_tokens_seen": 9968608, + "step": 619 + }, + { + "epoch": 0.04342991235213253, + "grad_norm": 9.254091262817383, + "learning_rate": 9.566108581436078e-05, + "loss": 1.0315, + "num_input_tokens_seen": 9983016, + "step": 620 + }, + { + "epoch": 0.04349996059786178, + "grad_norm": 4.028343200683594, + "learning_rate": 9.565408756567426e-05, + "loss": 1.0667, + "num_input_tokens_seen": 9999400, + "step": 621 + }, + { + "epoch": 0.043570008843591024, + "grad_norm": 4.051328659057617, + "learning_rate": 9.564708931698775e-05, + "loss": 1.1375, + "num_input_tokens_seen": 10015384, + "step": 622 + }, + { + "epoch": 0.04364005708932027, + "grad_norm": 4.495016098022461, + "learning_rate": 9.564009106830123e-05, + "loss": 1.0691, + "num_input_tokens_seen": 10031152, + "step": 623 + }, + { + "epoch": 0.043710105335049515, + "grad_norm": 4.876840114593506, + "learning_rate": 9.563309281961472e-05, + "loss": 1.17, + "num_input_tokens_seen": 10047536, + "step": 624 + }, + { + "epoch": 0.04378015358077876, + "grad_norm": 4.407329559326172, + "learning_rate": 9.562609457092819e-05, + "loss": 1.2381, + "num_input_tokens_seen": 10063920, + "step": 625 + }, + { + "epoch": 0.043850201826508006, + "grad_norm": 4.161394119262695, + "learning_rate": 9.561909632224168e-05, + "loss": 1.0903, + "num_input_tokens_seen": 10079024, + "step": 626 + }, + { + "epoch": 0.04392025007223725, + "grad_norm": 4.382974624633789, + "learning_rate": 9.561209807355518e-05, + "loss": 1.3156, + "num_input_tokens_seen": 10095408, + "step": 627 + }, + { + "epoch": 0.0439902983179665, + "grad_norm": 4.004157543182373, + "learning_rate": 9.560509982486866e-05, + "loss": 1.1333, + "num_input_tokens_seen": 10111792, + "step": 628 + }, + { + "epoch": 0.04406034656369574, + "grad_norm": 3.9019265174865723, + "learning_rate": 9.559810157618215e-05, + "loss": 1.0948, + "num_input_tokens_seen": 10128144, + "step": 629 + }, + { + "epoch": 0.04413039480942499, + "grad_norm": 4.410470485687256, + "learning_rate": 9.559110332749562e-05, + "loss": 1.3219, + "num_input_tokens_seen": 10144288, + "step": 630 + }, + { + "epoch": 0.044200443055154234, + "grad_norm": 4.233544826507568, + "learning_rate": 9.558410507880911e-05, + "loss": 0.999, + "num_input_tokens_seen": 10160296, + "step": 631 + }, + { + "epoch": 0.04427049130088349, + "grad_norm": 4.120091438293457, + "learning_rate": 9.557710683012258e-05, + "loss": 1.0166, + "num_input_tokens_seen": 10176680, + "step": 632 + }, + { + "epoch": 0.04434053954661273, + "grad_norm": 5.061972618103027, + "learning_rate": 9.557010858143609e-05, + "loss": 1.251, + "num_input_tokens_seen": 10192088, + "step": 633 + }, + { + "epoch": 0.04441058779234198, + "grad_norm": 4.3690948486328125, + "learning_rate": 9.556311033274958e-05, + "loss": 1.2113, + "num_input_tokens_seen": 10208472, + "step": 634 + }, + { + "epoch": 0.044480636038071224, + "grad_norm": 3.798710346221924, + "learning_rate": 9.555611208406305e-05, + "loss": 1.0286, + "num_input_tokens_seen": 10224856, + "step": 635 + }, + { + "epoch": 0.04455068428380047, + "grad_norm": 4.41818380355835, + "learning_rate": 9.554911383537654e-05, + "loss": 1.14, + "num_input_tokens_seen": 10241200, + "step": 636 + }, + { + "epoch": 0.044620732529529715, + "grad_norm": 4.256262302398682, + "learning_rate": 9.554211558669001e-05, + "loss": 1.3103, + "num_input_tokens_seen": 10257584, + "step": 637 + }, + { + "epoch": 0.04469078077525896, + "grad_norm": 4.176064968109131, + "learning_rate": 9.55351173380035e-05, + "loss": 1.1985, + "num_input_tokens_seen": 10273760, + "step": 638 + }, + { + "epoch": 0.044760829020988206, + "grad_norm": 3.9971530437469482, + "learning_rate": 9.552811908931699e-05, + "loss": 1.1579, + "num_input_tokens_seen": 10290144, + "step": 639 + }, + { + "epoch": 0.04483087726671745, + "grad_norm": 4.150514602661133, + "learning_rate": 9.552112084063048e-05, + "loss": 1.1144, + "num_input_tokens_seen": 10306528, + "step": 640 + }, + { + "epoch": 0.0449009255124467, + "grad_norm": 4.1868367195129395, + "learning_rate": 9.551412259194397e-05, + "loss": 1.0099, + "num_input_tokens_seen": 10322480, + "step": 641 + }, + { + "epoch": 0.04497097375817594, + "grad_norm": 4.409821510314941, + "learning_rate": 9.550712434325744e-05, + "loss": 1.2574, + "num_input_tokens_seen": 10338864, + "step": 642 + }, + { + "epoch": 0.04504102200390519, + "grad_norm": 4.500023365020752, + "learning_rate": 9.550012609457093e-05, + "loss": 1.35, + "num_input_tokens_seen": 10355072, + "step": 643 + }, + { + "epoch": 0.045111070249634434, + "grad_norm": 10.278129577636719, + "learning_rate": 9.549312784588442e-05, + "loss": 1.0618, + "num_input_tokens_seen": 10371456, + "step": 644 + }, + { + "epoch": 0.04518111849536368, + "grad_norm": 3.9800543785095215, + "learning_rate": 9.54861295971979e-05, + "loss": 1.0341, + "num_input_tokens_seen": 10387720, + "step": 645 + }, + { + "epoch": 0.045251166741092926, + "grad_norm": 3.855720281600952, + "learning_rate": 9.547913134851138e-05, + "loss": 1.1323, + "num_input_tokens_seen": 10403936, + "step": 646 + }, + { + "epoch": 0.04532121498682217, + "grad_norm": 4.719264984130859, + "learning_rate": 9.547213309982487e-05, + "loss": 1.1407, + "num_input_tokens_seen": 10420320, + "step": 647 + }, + { + "epoch": 0.04539126323255142, + "grad_norm": 4.6528167724609375, + "learning_rate": 9.546513485113836e-05, + "loss": 1.1014, + "num_input_tokens_seen": 10436704, + "step": 648 + }, + { + "epoch": 0.04546131147828066, + "grad_norm": 4.0597028732299805, + "learning_rate": 9.545813660245185e-05, + "loss": 1.116, + "num_input_tokens_seen": 10452592, + "step": 649 + }, + { + "epoch": 0.045531359724009915, + "grad_norm": 4.161896705627441, + "learning_rate": 9.545113835376533e-05, + "loss": 1.1373, + "num_input_tokens_seen": 10468976, + "step": 650 + }, + { + "epoch": 0.04560140796973916, + "grad_norm": 4.125041961669922, + "learning_rate": 9.544414010507881e-05, + "loss": 1.0947, + "num_input_tokens_seen": 10484584, + "step": 651 + }, + { + "epoch": 0.045671456215468406, + "grad_norm": 4.278462886810303, + "learning_rate": 9.543714185639229e-05, + "loss": 1.1369, + "num_input_tokens_seen": 10500504, + "step": 652 + }, + { + "epoch": 0.04574150446119765, + "grad_norm": 4.766538619995117, + "learning_rate": 9.543014360770579e-05, + "loss": 1.1876, + "num_input_tokens_seen": 10516472, + "step": 653 + }, + { + "epoch": 0.0458115527069269, + "grad_norm": 4.457921504974365, + "learning_rate": 9.542314535901928e-05, + "loss": 1.0788, + "num_input_tokens_seen": 10532272, + "step": 654 + }, + { + "epoch": 0.04588160095265614, + "grad_norm": 5.021823883056641, + "learning_rate": 9.541614711033275e-05, + "loss": 1.1152, + "num_input_tokens_seen": 10547696, + "step": 655 + }, + { + "epoch": 0.04595164919838539, + "grad_norm": 4.407228469848633, + "learning_rate": 9.540914886164624e-05, + "loss": 1.0863, + "num_input_tokens_seen": 10564080, + "step": 656 + }, + { + "epoch": 0.046021697444114634, + "grad_norm": 3.9986062049865723, + "learning_rate": 9.540215061295972e-05, + "loss": 1.1624, + "num_input_tokens_seen": 10580464, + "step": 657 + }, + { + "epoch": 0.04609174568984388, + "grad_norm": 7.9165191650390625, + "learning_rate": 9.539515236427321e-05, + "loss": 1.0809, + "num_input_tokens_seen": 10595336, + "step": 658 + }, + { + "epoch": 0.046161793935573125, + "grad_norm": 4.357856273651123, + "learning_rate": 9.53881541155867e-05, + "loss": 1.0324, + "num_input_tokens_seen": 10611720, + "step": 659 + }, + { + "epoch": 0.04623184218130237, + "grad_norm": 3.8115761280059814, + "learning_rate": 9.538115586690018e-05, + "loss": 1.1499, + "num_input_tokens_seen": 10628104, + "step": 660 + }, + { + "epoch": 0.04630189042703162, + "grad_norm": 3.879671096801758, + "learning_rate": 9.537415761821367e-05, + "loss": 1.0474, + "num_input_tokens_seen": 10644096, + "step": 661 + }, + { + "epoch": 0.04637193867276086, + "grad_norm": 4.324586391448975, + "learning_rate": 9.536715936952715e-05, + "loss": 1.1904, + "num_input_tokens_seen": 10659408, + "step": 662 + }, + { + "epoch": 0.04644198691849011, + "grad_norm": 4.020029067993164, + "learning_rate": 9.536016112084064e-05, + "loss": 1.0848, + "num_input_tokens_seen": 10675792, + "step": 663 + }, + { + "epoch": 0.046512035164219354, + "grad_norm": 4.563455581665039, + "learning_rate": 9.535316287215411e-05, + "loss": 1.1735, + "num_input_tokens_seen": 10691632, + "step": 664 + }, + { + "epoch": 0.0465820834099486, + "grad_norm": 4.444424629211426, + "learning_rate": 9.53461646234676e-05, + "loss": 1.258, + "num_input_tokens_seen": 10708016, + "step": 665 + }, + { + "epoch": 0.046652131655677845, + "grad_norm": 3.9864089488983154, + "learning_rate": 9.533916637478109e-05, + "loss": 1.1315, + "num_input_tokens_seen": 10724176, + "step": 666 + }, + { + "epoch": 0.0467221799014071, + "grad_norm": 4.860849857330322, + "learning_rate": 9.533216812609458e-05, + "loss": 1.2276, + "num_input_tokens_seen": 10740560, + "step": 667 + }, + { + "epoch": 0.04679222814713634, + "grad_norm": 3.9701120853424072, + "learning_rate": 9.532516987740807e-05, + "loss": 1.1406, + "num_input_tokens_seen": 10756864, + "step": 668 + }, + { + "epoch": 0.04686227639286559, + "grad_norm": 3.660257577896118, + "learning_rate": 9.531817162872154e-05, + "loss": 1.0182, + "num_input_tokens_seen": 10773248, + "step": 669 + }, + { + "epoch": 0.046932324638594834, + "grad_norm": 3.888510227203369, + "learning_rate": 9.531117338003503e-05, + "loss": 1.0223, + "num_input_tokens_seen": 10789632, + "step": 670 + }, + { + "epoch": 0.04700237288432408, + "grad_norm": 4.794105052947998, + "learning_rate": 9.530417513134852e-05, + "loss": 1.0565, + "num_input_tokens_seen": 10804496, + "step": 671 + }, + { + "epoch": 0.047072421130053325, + "grad_norm": 4.293116092681885, + "learning_rate": 9.529717688266199e-05, + "loss": 1.2509, + "num_input_tokens_seen": 10819976, + "step": 672 + }, + { + "epoch": 0.04714246937578257, + "grad_norm": 5.112069129943848, + "learning_rate": 9.52901786339755e-05, + "loss": 1.0964, + "num_input_tokens_seen": 10836360, + "step": 673 + }, + { + "epoch": 0.04721251762151182, + "grad_norm": 3.9091360569000244, + "learning_rate": 9.528318038528897e-05, + "loss": 1.0647, + "num_input_tokens_seen": 10852744, + "step": 674 + }, + { + "epoch": 0.04728256586724106, + "grad_norm": 4.032161235809326, + "learning_rate": 9.527618213660246e-05, + "loss": 1.2362, + "num_input_tokens_seen": 10868928, + "step": 675 + }, + { + "epoch": 0.04735261411297031, + "grad_norm": 3.931156635284424, + "learning_rate": 9.526918388791595e-05, + "loss": 1.0571, + "num_input_tokens_seen": 10884776, + "step": 676 + }, + { + "epoch": 0.047422662358699554, + "grad_norm": 3.9511048793792725, + "learning_rate": 9.526218563922942e-05, + "loss": 1.0249, + "num_input_tokens_seen": 10901160, + "step": 677 + }, + { + "epoch": 0.0474927106044288, + "grad_norm": 4.199029445648193, + "learning_rate": 9.525518739054291e-05, + "loss": 1.2813, + "num_input_tokens_seen": 10917544, + "step": 678 + }, + { + "epoch": 0.047562758850158045, + "grad_norm": 3.8590247631073, + "learning_rate": 9.52481891418564e-05, + "loss": 1.02, + "num_input_tokens_seen": 10933928, + "step": 679 + }, + { + "epoch": 0.04763280709588729, + "grad_norm": 5.530341625213623, + "learning_rate": 9.524119089316989e-05, + "loss": 1.2316, + "num_input_tokens_seen": 10949600, + "step": 680 + }, + { + "epoch": 0.047702855341616536, + "grad_norm": 4.17647123336792, + "learning_rate": 9.523419264448338e-05, + "loss": 1.2985, + "num_input_tokens_seen": 10965984, + "step": 681 + }, + { + "epoch": 0.04777290358734578, + "grad_norm": 4.250451564788818, + "learning_rate": 9.522719439579685e-05, + "loss": 1.1638, + "num_input_tokens_seen": 10982368, + "step": 682 + }, + { + "epoch": 0.04784295183307503, + "grad_norm": 4.132594108581543, + "learning_rate": 9.522019614711034e-05, + "loss": 0.9638, + "num_input_tokens_seen": 10998752, + "step": 683 + }, + { + "epoch": 0.04791300007880428, + "grad_norm": 5.863363265991211, + "learning_rate": 9.521319789842382e-05, + "loss": 1.0736, + "num_input_tokens_seen": 11014376, + "step": 684 + }, + { + "epoch": 0.047983048324533525, + "grad_norm": 3.740323543548584, + "learning_rate": 9.52061996497373e-05, + "loss": 0.9958, + "num_input_tokens_seen": 11030440, + "step": 685 + }, + { + "epoch": 0.04805309657026277, + "grad_norm": 4.927120685577393, + "learning_rate": 9.519920140105079e-05, + "loss": 1.156, + "num_input_tokens_seen": 11046824, + "step": 686 + }, + { + "epoch": 0.04812314481599202, + "grad_norm": 4.708818435668945, + "learning_rate": 9.519220315236428e-05, + "loss": 1.2139, + "num_input_tokens_seen": 11063208, + "step": 687 + }, + { + "epoch": 0.04819319306172126, + "grad_norm": 3.7547767162323, + "learning_rate": 9.518520490367777e-05, + "loss": 0.9557, + "num_input_tokens_seen": 11079592, + "step": 688 + }, + { + "epoch": 0.04826324130745051, + "grad_norm": 4.038534641265869, + "learning_rate": 9.517820665499124e-05, + "loss": 1.1124, + "num_input_tokens_seen": 11095976, + "step": 689 + }, + { + "epoch": 0.048333289553179754, + "grad_norm": 4.159554481506348, + "learning_rate": 9.517120840630473e-05, + "loss": 1.0043, + "num_input_tokens_seen": 11112360, + "step": 690 + }, + { + "epoch": 0.048403337798909, + "grad_norm": 7.104836463928223, + "learning_rate": 9.516421015761821e-05, + "loss": 0.9736, + "num_input_tokens_seen": 11127800, + "step": 691 + }, + { + "epoch": 0.048473386044638245, + "grad_norm": 4.073885917663574, + "learning_rate": 9.51572119089317e-05, + "loss": 1.1249, + "num_input_tokens_seen": 11144184, + "step": 692 + }, + { + "epoch": 0.04854343429036749, + "grad_norm": 3.7190351486206055, + "learning_rate": 9.51502136602452e-05, + "loss": 1.1035, + "num_input_tokens_seen": 11160568, + "step": 693 + }, + { + "epoch": 0.048613482536096736, + "grad_norm": 4.252142429351807, + "learning_rate": 9.514321541155867e-05, + "loss": 1.1588, + "num_input_tokens_seen": 11176952, + "step": 694 + }, + { + "epoch": 0.04868353078182598, + "grad_norm": 4.418105125427246, + "learning_rate": 9.513621716287216e-05, + "loss": 1.2496, + "num_input_tokens_seen": 11193336, + "step": 695 + }, + { + "epoch": 0.04875357902755523, + "grad_norm": 4.195918560028076, + "learning_rate": 9.512921891418564e-05, + "loss": 1.0193, + "num_input_tokens_seen": 11209720, + "step": 696 + }, + { + "epoch": 0.04882362727328447, + "grad_norm": 5.138080596923828, + "learning_rate": 9.512222066549913e-05, + "loss": 1.1861, + "num_input_tokens_seen": 11225888, + "step": 697 + }, + { + "epoch": 0.04889367551901372, + "grad_norm": 4.489223003387451, + "learning_rate": 9.511522241681261e-05, + "loss": 1.1497, + "num_input_tokens_seen": 11241744, + "step": 698 + }, + { + "epoch": 0.048963723764742964, + "grad_norm": 3.972590208053589, + "learning_rate": 9.51082241681261e-05, + "loss": 1.2765, + "num_input_tokens_seen": 11257768, + "step": 699 + }, + { + "epoch": 0.04903377201047221, + "grad_norm": 13.274886131286621, + "learning_rate": 9.510122591943959e-05, + "loss": 1.1124, + "num_input_tokens_seen": 11273216, + "step": 700 + }, + { + "epoch": 0.049103820256201455, + "grad_norm": 3.7899255752563477, + "learning_rate": 9.509422767075307e-05, + "loss": 1.0445, + "num_input_tokens_seen": 11289600, + "step": 701 + }, + { + "epoch": 0.04917386850193071, + "grad_norm": 4.226947784423828, + "learning_rate": 9.508722942206656e-05, + "loss": 1.4313, + "num_input_tokens_seen": 11305920, + "step": 702 + }, + { + "epoch": 0.049243916747659953, + "grad_norm": 4.098162651062012, + "learning_rate": 9.508023117338003e-05, + "loss": 0.952, + "num_input_tokens_seen": 11322304, + "step": 703 + }, + { + "epoch": 0.0493139649933892, + "grad_norm": 3.9205965995788574, + "learning_rate": 9.507323292469352e-05, + "loss": 1.1648, + "num_input_tokens_seen": 11338688, + "step": 704 + }, + { + "epoch": 0.049384013239118445, + "grad_norm": 4.06537389755249, + "learning_rate": 9.506623467600701e-05, + "loss": 1.1295, + "num_input_tokens_seen": 11353544, + "step": 705 + }, + { + "epoch": 0.04945406148484769, + "grad_norm": 4.309032440185547, + "learning_rate": 9.50592364273205e-05, + "loss": 1.1475, + "num_input_tokens_seen": 11369928, + "step": 706 + }, + { + "epoch": 0.049524109730576936, + "grad_norm": 4.320526599884033, + "learning_rate": 9.505223817863399e-05, + "loss": 1.0102, + "num_input_tokens_seen": 11386312, + "step": 707 + }, + { + "epoch": 0.04959415797630618, + "grad_norm": 5.025510787963867, + "learning_rate": 9.504523992994747e-05, + "loss": 1.1182, + "num_input_tokens_seen": 11402696, + "step": 708 + }, + { + "epoch": 0.04966420622203543, + "grad_norm": 3.9406464099884033, + "learning_rate": 9.503824168126095e-05, + "loss": 1.068, + "num_input_tokens_seen": 11419080, + "step": 709 + }, + { + "epoch": 0.04973425446776467, + "grad_norm": 3.9148502349853516, + "learning_rate": 9.503124343257444e-05, + "loss": 1.1062, + "num_input_tokens_seen": 11435464, + "step": 710 + }, + { + "epoch": 0.04980430271349392, + "grad_norm": 3.9386026859283447, + "learning_rate": 9.502424518388791e-05, + "loss": 0.9516, + "num_input_tokens_seen": 11451848, + "step": 711 + }, + { + "epoch": 0.049874350959223164, + "grad_norm": 3.9537665843963623, + "learning_rate": 9.50172469352014e-05, + "loss": 1.1372, + "num_input_tokens_seen": 11468216, + "step": 712 + }, + { + "epoch": 0.04994439920495241, + "grad_norm": 3.97929310798645, + "learning_rate": 9.501024868651489e-05, + "loss": 1.0705, + "num_input_tokens_seen": 11484192, + "step": 713 + }, + { + "epoch": 0.050014447450681655, + "grad_norm": 3.9326419830322266, + "learning_rate": 9.500325043782838e-05, + "loss": 1.0986, + "num_input_tokens_seen": 11500576, + "step": 714 + }, + { + "epoch": 0.0500844956964109, + "grad_norm": 3.769347667694092, + "learning_rate": 9.499625218914187e-05, + "loss": 0.9265, + "num_input_tokens_seen": 11516960, + "step": 715 + }, + { + "epoch": 0.050154543942140146, + "grad_norm": 4.264547348022461, + "learning_rate": 9.498925394045534e-05, + "loss": 1.3166, + "num_input_tokens_seen": 11532616, + "step": 716 + }, + { + "epoch": 0.05022459218786939, + "grad_norm": 4.885791778564453, + "learning_rate": 9.498225569176883e-05, + "loss": 1.0669, + "num_input_tokens_seen": 11548552, + "step": 717 + }, + { + "epoch": 0.05029464043359864, + "grad_norm": 5.4089741706848145, + "learning_rate": 9.49752574430823e-05, + "loss": 1.3986, + "num_input_tokens_seen": 11564936, + "step": 718 + }, + { + "epoch": 0.05036468867932789, + "grad_norm": 4.503393173217773, + "learning_rate": 9.496825919439581e-05, + "loss": 0.9947, + "num_input_tokens_seen": 11580720, + "step": 719 + }, + { + "epoch": 0.050434736925057136, + "grad_norm": 4.364518165588379, + "learning_rate": 9.49612609457093e-05, + "loss": 1.12, + "num_input_tokens_seen": 11597104, + "step": 720 + }, + { + "epoch": 0.05050478517078638, + "grad_norm": 4.229926109313965, + "learning_rate": 9.495426269702277e-05, + "loss": 1.098, + "num_input_tokens_seen": 11612120, + "step": 721 + }, + { + "epoch": 0.05057483341651563, + "grad_norm": 4.477171897888184, + "learning_rate": 9.494726444833626e-05, + "loss": 1.1565, + "num_input_tokens_seen": 11627000, + "step": 722 + }, + { + "epoch": 0.05064488166224487, + "grad_norm": 4.071736812591553, + "learning_rate": 9.494026619964973e-05, + "loss": 1.2951, + "num_input_tokens_seen": 11643256, + "step": 723 + }, + { + "epoch": 0.05071492990797412, + "grad_norm": 4.219758033752441, + "learning_rate": 9.493326795096322e-05, + "loss": 1.1408, + "num_input_tokens_seen": 11659424, + "step": 724 + }, + { + "epoch": 0.050784978153703364, + "grad_norm": 4.108195781707764, + "learning_rate": 9.492626970227671e-05, + "loss": 0.9847, + "num_input_tokens_seen": 11675808, + "step": 725 + }, + { + "epoch": 0.05085502639943261, + "grad_norm": 3.964359760284424, + "learning_rate": 9.49192714535902e-05, + "loss": 1.0935, + "num_input_tokens_seen": 11691760, + "step": 726 + }, + { + "epoch": 0.050925074645161855, + "grad_norm": 4.585779190063477, + "learning_rate": 9.491227320490369e-05, + "loss": 1.1561, + "num_input_tokens_seen": 11706600, + "step": 727 + }, + { + "epoch": 0.0509951228908911, + "grad_norm": 3.8540141582489014, + "learning_rate": 9.490527495621716e-05, + "loss": 1.0163, + "num_input_tokens_seen": 11722984, + "step": 728 + }, + { + "epoch": 0.051065171136620346, + "grad_norm": 4.138955593109131, + "learning_rate": 9.489827670753065e-05, + "loss": 1.2842, + "num_input_tokens_seen": 11738968, + "step": 729 + }, + { + "epoch": 0.05113521938234959, + "grad_norm": 4.138274192810059, + "learning_rate": 9.489127845884413e-05, + "loss": 1.1452, + "num_input_tokens_seen": 11754952, + "step": 730 + }, + { + "epoch": 0.05120526762807884, + "grad_norm": 4.374305248260498, + "learning_rate": 9.488428021015762e-05, + "loss": 1.3622, + "num_input_tokens_seen": 11770832, + "step": 731 + }, + { + "epoch": 0.05127531587380808, + "grad_norm": 4.242674350738525, + "learning_rate": 9.48772819614711e-05, + "loss": 1.1914, + "num_input_tokens_seen": 11786872, + "step": 732 + }, + { + "epoch": 0.05134536411953733, + "grad_norm": 4.173389911651611, + "learning_rate": 9.48702837127846e-05, + "loss": 1.1853, + "num_input_tokens_seen": 11803256, + "step": 733 + }, + { + "epoch": 0.051415412365266575, + "grad_norm": 4.014588356018066, + "learning_rate": 9.486328546409808e-05, + "loss": 1.0436, + "num_input_tokens_seen": 11819608, + "step": 734 + }, + { + "epoch": 0.05148546061099582, + "grad_norm": 4.759418964385986, + "learning_rate": 9.485628721541157e-05, + "loss": 1.1605, + "num_input_tokens_seen": 11834296, + "step": 735 + }, + { + "epoch": 0.05155550885672507, + "grad_norm": 4.258687973022461, + "learning_rate": 9.484928896672505e-05, + "loss": 1.2993, + "num_input_tokens_seen": 11849728, + "step": 736 + }, + { + "epoch": 0.05162555710245432, + "grad_norm": 4.690395832061768, + "learning_rate": 9.484229071803853e-05, + "loss": 1.0655, + "num_input_tokens_seen": 11866112, + "step": 737 + }, + { + "epoch": 0.051695605348183564, + "grad_norm": 4.373327255249023, + "learning_rate": 9.483529246935201e-05, + "loss": 1.1364, + "num_input_tokens_seen": 11881960, + "step": 738 + }, + { + "epoch": 0.05176565359391281, + "grad_norm": 4.008789539337158, + "learning_rate": 9.482829422066551e-05, + "loss": 1.1174, + "num_input_tokens_seen": 11897936, + "step": 739 + }, + { + "epoch": 0.051835701839642055, + "grad_norm": 4.391345977783203, + "learning_rate": 9.482129597197899e-05, + "loss": 1.2045, + "num_input_tokens_seen": 11914320, + "step": 740 + }, + { + "epoch": 0.0519057500853713, + "grad_norm": 4.119503021240234, + "learning_rate": 9.481429772329248e-05, + "loss": 0.927, + "num_input_tokens_seen": 11930440, + "step": 741 + }, + { + "epoch": 0.051975798331100546, + "grad_norm": 4.186014175415039, + "learning_rate": 9.480729947460596e-05, + "loss": 1.1583, + "num_input_tokens_seen": 11946720, + "step": 742 + }, + { + "epoch": 0.05204584657682979, + "grad_norm": 4.119131088256836, + "learning_rate": 9.480030122591944e-05, + "loss": 1.0792, + "num_input_tokens_seen": 11962360, + "step": 743 + }, + { + "epoch": 0.05211589482255904, + "grad_norm": 3.921030044555664, + "learning_rate": 9.479330297723293e-05, + "loss": 0.9966, + "num_input_tokens_seen": 11978744, + "step": 744 + }, + { + "epoch": 0.05218594306828828, + "grad_norm": 3.806251049041748, + "learning_rate": 9.478630472854642e-05, + "loss": 1.1207, + "num_input_tokens_seen": 11994912, + "step": 745 + }, + { + "epoch": 0.05225599131401753, + "grad_norm": 4.508687973022461, + "learning_rate": 9.47793064798599e-05, + "loss": 1.1038, + "num_input_tokens_seen": 12011296, + "step": 746 + }, + { + "epoch": 0.052326039559746775, + "grad_norm": 4.458346843719482, + "learning_rate": 9.47723082311734e-05, + "loss": 1.2878, + "num_input_tokens_seen": 12027408, + "step": 747 + }, + { + "epoch": 0.05239608780547602, + "grad_norm": 5.779678821563721, + "learning_rate": 9.476530998248687e-05, + "loss": 1.2722, + "num_input_tokens_seen": 12043792, + "step": 748 + }, + { + "epoch": 0.052466136051205266, + "grad_norm": 4.621145725250244, + "learning_rate": 9.475831173380036e-05, + "loss": 1.2636, + "num_input_tokens_seen": 12059856, + "step": 749 + }, + { + "epoch": 0.05253618429693451, + "grad_norm": 4.276626110076904, + "learning_rate": 9.475131348511383e-05, + "loss": 1.3378, + "num_input_tokens_seen": 12076240, + "step": 750 + }, + { + "epoch": 0.05260623254266376, + "grad_norm": 4.533468246459961, + "learning_rate": 9.474431523642732e-05, + "loss": 0.921, + "num_input_tokens_seen": 12092416, + "step": 751 + }, + { + "epoch": 0.052676280788393, + "grad_norm": 4.626596927642822, + "learning_rate": 9.473731698774081e-05, + "loss": 1.2807, + "num_input_tokens_seen": 12108664, + "step": 752 + }, + { + "epoch": 0.052746329034122255, + "grad_norm": 4.3372907638549805, + "learning_rate": 9.47303187390543e-05, + "loss": 1.2754, + "num_input_tokens_seen": 12125048, + "step": 753 + }, + { + "epoch": 0.0528163772798515, + "grad_norm": 3.6576266288757324, + "learning_rate": 9.472332049036779e-05, + "loss": 0.8487, + "num_input_tokens_seen": 12141296, + "step": 754 + }, + { + "epoch": 0.052886425525580746, + "grad_norm": 3.8973164558410645, + "learning_rate": 9.471632224168126e-05, + "loss": 1.1211, + "num_input_tokens_seen": 12157544, + "step": 755 + }, + { + "epoch": 0.05295647377130999, + "grad_norm": 3.9059019088745117, + "learning_rate": 9.470932399299475e-05, + "loss": 1.2484, + "num_input_tokens_seen": 12173928, + "step": 756 + }, + { + "epoch": 0.05302652201703924, + "grad_norm": 4.133029937744141, + "learning_rate": 9.470232574430822e-05, + "loss": 1.0762, + "num_input_tokens_seen": 12189864, + "step": 757 + }, + { + "epoch": 0.05309657026276848, + "grad_norm": 3.8380961418151855, + "learning_rate": 9.469532749562171e-05, + "loss": 0.9938, + "num_input_tokens_seen": 12206248, + "step": 758 + }, + { + "epoch": 0.05316661850849773, + "grad_norm": 4.753637790679932, + "learning_rate": 9.468832924693522e-05, + "loss": 1.1272, + "num_input_tokens_seen": 12222632, + "step": 759 + }, + { + "epoch": 0.053236666754226974, + "grad_norm": 4.704193592071533, + "learning_rate": 9.468133099824869e-05, + "loss": 1.2276, + "num_input_tokens_seen": 12239016, + "step": 760 + }, + { + "epoch": 0.05330671499995622, + "grad_norm": 3.870870351791382, + "learning_rate": 9.467433274956218e-05, + "loss": 0.916, + "num_input_tokens_seen": 12254784, + "step": 761 + }, + { + "epoch": 0.053376763245685466, + "grad_norm": 3.8597328662872314, + "learning_rate": 9.466733450087567e-05, + "loss": 0.9871, + "num_input_tokens_seen": 12271160, + "step": 762 + }, + { + "epoch": 0.05344681149141471, + "grad_norm": 3.7109553813934326, + "learning_rate": 9.466033625218914e-05, + "loss": 1.1248, + "num_input_tokens_seen": 12286944, + "step": 763 + }, + { + "epoch": 0.05351685973714396, + "grad_norm": 3.985595464706421, + "learning_rate": 9.465333800350263e-05, + "loss": 1.0524, + "num_input_tokens_seen": 12303312, + "step": 764 + }, + { + "epoch": 0.0535869079828732, + "grad_norm": 3.797247886657715, + "learning_rate": 9.464633975481612e-05, + "loss": 1.0799, + "num_input_tokens_seen": 12319696, + "step": 765 + }, + { + "epoch": 0.05365695622860245, + "grad_norm": 4.88303279876709, + "learning_rate": 9.463934150612961e-05, + "loss": 1.2865, + "num_input_tokens_seen": 12335448, + "step": 766 + }, + { + "epoch": 0.053727004474331694, + "grad_norm": 4.273831367492676, + "learning_rate": 9.463234325744308e-05, + "loss": 1.1724, + "num_input_tokens_seen": 12351720, + "step": 767 + }, + { + "epoch": 0.05379705272006094, + "grad_norm": 3.9505984783172607, + "learning_rate": 9.462534500875657e-05, + "loss": 1.1478, + "num_input_tokens_seen": 12368104, + "step": 768 + }, + { + "epoch": 0.053867100965790185, + "grad_norm": 4.20963191986084, + "learning_rate": 9.461834676007006e-05, + "loss": 1.1018, + "num_input_tokens_seen": 12384488, + "step": 769 + }, + { + "epoch": 0.05393714921151943, + "grad_norm": 4.106869220733643, + "learning_rate": 9.461134851138354e-05, + "loss": 1.1097, + "num_input_tokens_seen": 12400128, + "step": 770 + }, + { + "epoch": 0.05400719745724868, + "grad_norm": 4.28592586517334, + "learning_rate": 9.460435026269702e-05, + "loss": 1.036, + "num_input_tokens_seen": 12416512, + "step": 771 + }, + { + "epoch": 0.05407724570297793, + "grad_norm": 3.821927070617676, + "learning_rate": 9.459735201401051e-05, + "loss": 1.1215, + "num_input_tokens_seen": 12432896, + "step": 772 + }, + { + "epoch": 0.054147293948707174, + "grad_norm": 4.14424467086792, + "learning_rate": 9.4590353765324e-05, + "loss": 1.0092, + "num_input_tokens_seen": 12449208, + "step": 773 + }, + { + "epoch": 0.05421734219443642, + "grad_norm": 4.610694885253906, + "learning_rate": 9.458335551663749e-05, + "loss": 1.2265, + "num_input_tokens_seen": 12464128, + "step": 774 + }, + { + "epoch": 0.054287390440165666, + "grad_norm": 4.410182952880859, + "learning_rate": 9.457635726795097e-05, + "loss": 1.1904, + "num_input_tokens_seen": 12479728, + "step": 775 + }, + { + "epoch": 0.05435743868589491, + "grad_norm": 4.096780300140381, + "learning_rate": 9.456935901926445e-05, + "loss": 1.2317, + "num_input_tokens_seen": 12495720, + "step": 776 + }, + { + "epoch": 0.05442748693162416, + "grad_norm": 4.028350830078125, + "learning_rate": 9.456236077057793e-05, + "loss": 1.1825, + "num_input_tokens_seen": 12511480, + "step": 777 + }, + { + "epoch": 0.0544975351773534, + "grad_norm": 5.264276504516602, + "learning_rate": 9.455536252189142e-05, + "loss": 1.057, + "num_input_tokens_seen": 12527864, + "step": 778 + }, + { + "epoch": 0.05456758342308265, + "grad_norm": 4.371725082397461, + "learning_rate": 9.454836427320492e-05, + "loss": 1.1625, + "num_input_tokens_seen": 12544168, + "step": 779 + }, + { + "epoch": 0.054637631668811894, + "grad_norm": 4.692862510681152, + "learning_rate": 9.45413660245184e-05, + "loss": 1.2211, + "num_input_tokens_seen": 12560552, + "step": 780 + }, + { + "epoch": 0.05470767991454114, + "grad_norm": 3.7462823390960693, + "learning_rate": 9.453436777583188e-05, + "loss": 1.0815, + "num_input_tokens_seen": 12576936, + "step": 781 + }, + { + "epoch": 0.054777728160270385, + "grad_norm": 4.161571025848389, + "learning_rate": 9.452736952714536e-05, + "loss": 0.9788, + "num_input_tokens_seen": 12593040, + "step": 782 + }, + { + "epoch": 0.05484777640599963, + "grad_norm": 3.96793532371521, + "learning_rate": 9.452037127845885e-05, + "loss": 1.1396, + "num_input_tokens_seen": 12609424, + "step": 783 + }, + { + "epoch": 0.054917824651728876, + "grad_norm": 4.183755874633789, + "learning_rate": 9.451337302977232e-05, + "loss": 1.0868, + "num_input_tokens_seen": 12625312, + "step": 784 + }, + { + "epoch": 0.05498787289745812, + "grad_norm": 4.506673336029053, + "learning_rate": 9.450637478108582e-05, + "loss": 1.1112, + "num_input_tokens_seen": 12641696, + "step": 785 + }, + { + "epoch": 0.05505792114318737, + "grad_norm": 3.8601651191711426, + "learning_rate": 9.449937653239931e-05, + "loss": 1.2149, + "num_input_tokens_seen": 12658080, + "step": 786 + }, + { + "epoch": 0.05512796938891661, + "grad_norm": 5.190856456756592, + "learning_rate": 9.449237828371279e-05, + "loss": 1.2661, + "num_input_tokens_seen": 12673032, + "step": 787 + }, + { + "epoch": 0.055198017634645866, + "grad_norm": 4.323099136352539, + "learning_rate": 9.448538003502628e-05, + "loss": 1.139, + "num_input_tokens_seen": 12689064, + "step": 788 + }, + { + "epoch": 0.05526806588037511, + "grad_norm": 4.271193981170654, + "learning_rate": 9.447838178633976e-05, + "loss": 1.037, + "num_input_tokens_seen": 12705448, + "step": 789 + }, + { + "epoch": 0.05533811412610436, + "grad_norm": 3.793525218963623, + "learning_rate": 9.447138353765324e-05, + "loss": 1.0265, + "num_input_tokens_seen": 12721832, + "step": 790 + }, + { + "epoch": 0.0554081623718336, + "grad_norm": 3.747575283050537, + "learning_rate": 9.446438528896673e-05, + "loss": 0.9567, + "num_input_tokens_seen": 12738216, + "step": 791 + }, + { + "epoch": 0.05547821061756285, + "grad_norm": 4.222849369049072, + "learning_rate": 9.445738704028022e-05, + "loss": 1.1859, + "num_input_tokens_seen": 12754600, + "step": 792 + }, + { + "epoch": 0.055548258863292094, + "grad_norm": 9.102783203125, + "learning_rate": 9.44503887915937e-05, + "loss": 1.0361, + "num_input_tokens_seen": 12770568, + "step": 793 + }, + { + "epoch": 0.05561830710902134, + "grad_norm": 4.4447808265686035, + "learning_rate": 9.444339054290718e-05, + "loss": 1.2908, + "num_input_tokens_seen": 12785768, + "step": 794 + }, + { + "epoch": 0.055688355354750585, + "grad_norm": 4.038604259490967, + "learning_rate": 9.443639229422067e-05, + "loss": 0.9294, + "num_input_tokens_seen": 12801704, + "step": 795 + }, + { + "epoch": 0.05575840360047983, + "grad_norm": 4.492194652557373, + "learning_rate": 9.442939404553416e-05, + "loss": 1.0466, + "num_input_tokens_seen": 12818088, + "step": 796 + }, + { + "epoch": 0.055828451846209076, + "grad_norm": 3.978029489517212, + "learning_rate": 9.442239579684763e-05, + "loss": 1.1719, + "num_input_tokens_seen": 12834432, + "step": 797 + }, + { + "epoch": 0.05589850009193832, + "grad_norm": 4.014431476593018, + "learning_rate": 9.441539754816112e-05, + "loss": 1.1222, + "num_input_tokens_seen": 12850816, + "step": 798 + }, + { + "epoch": 0.05596854833766757, + "grad_norm": 4.0948638916015625, + "learning_rate": 9.440839929947461e-05, + "loss": 1.2013, + "num_input_tokens_seen": 12867200, + "step": 799 + }, + { + "epoch": 0.05603859658339681, + "grad_norm": 4.18120813369751, + "learning_rate": 9.44014010507881e-05, + "loss": 0.9403, + "num_input_tokens_seen": 12883072, + "step": 800 + }, + { + "epoch": 0.05603859658339681, + "eval_loss": 1.1718552112579346, + "eval_runtime": 0.2039, + "eval_samples_per_second": 4.905, + "eval_steps_per_second": 4.905, + "num_input_tokens_seen": 12883072, + "step": 800 + }, + { + "epoch": 0.05610864482912606, + "grad_norm": 4.425891399383545, + "learning_rate": 9.439440280210159e-05, + "loss": 1.0435, + "num_input_tokens_seen": 12899456, + "step": 801 + }, + { + "epoch": 0.056178693074855304, + "grad_norm": 4.319190979003906, + "learning_rate": 9.438740455341506e-05, + "loss": 1.2612, + "num_input_tokens_seen": 12915840, + "step": 802 + }, + { + "epoch": 0.05624874132058455, + "grad_norm": 4.28010892868042, + "learning_rate": 9.438040630472855e-05, + "loss": 1.0853, + "num_input_tokens_seen": 12932096, + "step": 803 + }, + { + "epoch": 0.056318789566313796, + "grad_norm": 3.9454870223999023, + "learning_rate": 9.437340805604203e-05, + "loss": 1.055, + "num_input_tokens_seen": 12948208, + "step": 804 + }, + { + "epoch": 0.05638883781204305, + "grad_norm": 4.009400367736816, + "learning_rate": 9.436640980735553e-05, + "loss": 1.0681, + "num_input_tokens_seen": 12964096, + "step": 805 + }, + { + "epoch": 0.056458886057772294, + "grad_norm": 3.7949161529541016, + "learning_rate": 9.435941155866902e-05, + "loss": 1.0787, + "num_input_tokens_seen": 12980480, + "step": 806 + }, + { + "epoch": 0.05652893430350154, + "grad_norm": 3.910456418991089, + "learning_rate": 9.435241330998249e-05, + "loss": 0.9212, + "num_input_tokens_seen": 12996864, + "step": 807 + }, + { + "epoch": 0.056598982549230785, + "grad_norm": 4.744706630706787, + "learning_rate": 9.434541506129598e-05, + "loss": 1.0582, + "num_input_tokens_seen": 13013248, + "step": 808 + }, + { + "epoch": 0.05666903079496003, + "grad_norm": 4.4282732009887695, + "learning_rate": 9.433841681260946e-05, + "loss": 1.1353, + "num_input_tokens_seen": 13029632, + "step": 809 + }, + { + "epoch": 0.056739079040689276, + "grad_norm": 3.8422467708587646, + "learning_rate": 9.433141856392294e-05, + "loss": 0.9881, + "num_input_tokens_seen": 13046016, + "step": 810 + }, + { + "epoch": 0.05680912728641852, + "grad_norm": 4.1764445304870605, + "learning_rate": 9.432442031523643e-05, + "loss": 1.183, + "num_input_tokens_seen": 13062400, + "step": 811 + }, + { + "epoch": 0.05687917553214777, + "grad_norm": 4.713895320892334, + "learning_rate": 9.431742206654992e-05, + "loss": 1.0752, + "num_input_tokens_seen": 13078584, + "step": 812 + }, + { + "epoch": 0.05694922377787701, + "grad_norm": 4.265610694885254, + "learning_rate": 9.431042381786341e-05, + "loss": 0.9469, + "num_input_tokens_seen": 13094968, + "step": 813 + }, + { + "epoch": 0.05701927202360626, + "grad_norm": 3.9274330139160156, + "learning_rate": 9.430342556917688e-05, + "loss": 1.1765, + "num_input_tokens_seen": 13111304, + "step": 814 + }, + { + "epoch": 0.057089320269335504, + "grad_norm": 4.44935941696167, + "learning_rate": 9.429642732049037e-05, + "loss": 1.1014, + "num_input_tokens_seen": 13127304, + "step": 815 + }, + { + "epoch": 0.05715936851506475, + "grad_norm": 5.019375801086426, + "learning_rate": 9.428942907180386e-05, + "loss": 1.0535, + "num_input_tokens_seen": 13143688, + "step": 816 + }, + { + "epoch": 0.057229416760793995, + "grad_norm": 4.743424892425537, + "learning_rate": 9.428243082311734e-05, + "loss": 1.3912, + "num_input_tokens_seen": 13160072, + "step": 817 + }, + { + "epoch": 0.05729946500652324, + "grad_norm": 3.921475887298584, + "learning_rate": 9.427543257443083e-05, + "loss": 1.1116, + "num_input_tokens_seen": 13176456, + "step": 818 + }, + { + "epoch": 0.05736951325225249, + "grad_norm": 4.106019020080566, + "learning_rate": 9.426843432574431e-05, + "loss": 0.9, + "num_input_tokens_seen": 13192840, + "step": 819 + }, + { + "epoch": 0.05743956149798173, + "grad_norm": 4.298704147338867, + "learning_rate": 9.42614360770578e-05, + "loss": 1.281, + "num_input_tokens_seen": 13209144, + "step": 820 + }, + { + "epoch": 0.05750960974371098, + "grad_norm": 4.29774284362793, + "learning_rate": 9.425443782837128e-05, + "loss": 1.2703, + "num_input_tokens_seen": 13224752, + "step": 821 + }, + { + "epoch": 0.057579657989440224, + "grad_norm": 4.6176838874816895, + "learning_rate": 9.424743957968477e-05, + "loss": 1.232, + "num_input_tokens_seen": 13240856, + "step": 822 + }, + { + "epoch": 0.057649706235169476, + "grad_norm": 4.450786590576172, + "learning_rate": 9.424044133099826e-05, + "loss": 1.1369, + "num_input_tokens_seen": 13256800, + "step": 823 + }, + { + "epoch": 0.05771975448089872, + "grad_norm": 3.8302414417266846, + "learning_rate": 9.423344308231173e-05, + "loss": 0.9985, + "num_input_tokens_seen": 13273032, + "step": 824 + }, + { + "epoch": 0.05778980272662797, + "grad_norm": 4.641941070556641, + "learning_rate": 9.422644483362523e-05, + "loss": 1.2238, + "num_input_tokens_seen": 13289104, + "step": 825 + }, + { + "epoch": 0.05785985097235721, + "grad_norm": 4.369805335998535, + "learning_rate": 9.421944658493871e-05, + "loss": 1.2047, + "num_input_tokens_seen": 13304752, + "step": 826 + }, + { + "epoch": 0.05792989921808646, + "grad_norm": 3.863507032394409, + "learning_rate": 9.42124483362522e-05, + "loss": 1.1098, + "num_input_tokens_seen": 13321088, + "step": 827 + }, + { + "epoch": 0.057999947463815704, + "grad_norm": 5.323369979858398, + "learning_rate": 9.420545008756568e-05, + "loss": 1.1722, + "num_input_tokens_seen": 13336912, + "step": 828 + }, + { + "epoch": 0.05806999570954495, + "grad_norm": 4.006597995758057, + "learning_rate": 9.419845183887916e-05, + "loss": 1.0382, + "num_input_tokens_seen": 13353280, + "step": 829 + }, + { + "epoch": 0.058140043955274195, + "grad_norm": 4.1039886474609375, + "learning_rate": 9.419145359019265e-05, + "loss": 1.2037, + "num_input_tokens_seen": 13369664, + "step": 830 + }, + { + "epoch": 0.05821009220100344, + "grad_norm": 3.903517007827759, + "learning_rate": 9.418445534150614e-05, + "loss": 1.2185, + "num_input_tokens_seen": 13386048, + "step": 831 + }, + { + "epoch": 0.05828014044673269, + "grad_norm": 4.434885025024414, + "learning_rate": 9.417745709281963e-05, + "loss": 1.2444, + "num_input_tokens_seen": 13402432, + "step": 832 + }, + { + "epoch": 0.05835018869246193, + "grad_norm": 4.6121296882629395, + "learning_rate": 9.417045884413311e-05, + "loss": 1.2831, + "num_input_tokens_seen": 13418816, + "step": 833 + }, + { + "epoch": 0.05842023693819118, + "grad_norm": 3.6966841220855713, + "learning_rate": 9.416346059544659e-05, + "loss": 1.0751, + "num_input_tokens_seen": 13435200, + "step": 834 + }, + { + "epoch": 0.058490285183920424, + "grad_norm": 4.292221546173096, + "learning_rate": 9.415646234676008e-05, + "loss": 1.2068, + "num_input_tokens_seen": 13451584, + "step": 835 + }, + { + "epoch": 0.05856033342964967, + "grad_norm": 4.053999900817871, + "learning_rate": 9.414946409807355e-05, + "loss": 1.1735, + "num_input_tokens_seen": 13467824, + "step": 836 + }, + { + "epoch": 0.058630381675378915, + "grad_norm": 4.4411234855651855, + "learning_rate": 9.414246584938704e-05, + "loss": 1.0647, + "num_input_tokens_seen": 13483200, + "step": 837 + }, + { + "epoch": 0.05870042992110816, + "grad_norm": 3.956787347793579, + "learning_rate": 9.413546760070053e-05, + "loss": 0.9813, + "num_input_tokens_seen": 13499584, + "step": 838 + }, + { + "epoch": 0.058770478166837406, + "grad_norm": 5.050291061401367, + "learning_rate": 9.412846935201402e-05, + "loss": 1.1193, + "num_input_tokens_seen": 13515448, + "step": 839 + }, + { + "epoch": 0.05884052641256666, + "grad_norm": 3.8736393451690674, + "learning_rate": 9.412147110332751e-05, + "loss": 1.0294, + "num_input_tokens_seen": 13531200, + "step": 840 + }, + { + "epoch": 0.058910574658295904, + "grad_norm": 6.07747745513916, + "learning_rate": 9.411447285464098e-05, + "loss": 0.9684, + "num_input_tokens_seen": 13547584, + "step": 841 + }, + { + "epoch": 0.05898062290402515, + "grad_norm": 4.606445789337158, + "learning_rate": 9.410747460595447e-05, + "loss": 1.2119, + "num_input_tokens_seen": 13563528, + "step": 842 + }, + { + "epoch": 0.059050671149754395, + "grad_norm": 4.3981709480285645, + "learning_rate": 9.410047635726796e-05, + "loss": 1.3313, + "num_input_tokens_seen": 13579912, + "step": 843 + }, + { + "epoch": 0.05912071939548364, + "grad_norm": 3.64546799659729, + "learning_rate": 9.409347810858143e-05, + "loss": 0.8892, + "num_input_tokens_seen": 13596296, + "step": 844 + }, + { + "epoch": 0.05919076764121289, + "grad_norm": 4.15845251083374, + "learning_rate": 9.408647985989494e-05, + "loss": 1.1464, + "num_input_tokens_seen": 13612680, + "step": 845 + }, + { + "epoch": 0.05926081588694213, + "grad_norm": 6.049203872680664, + "learning_rate": 9.407948161120841e-05, + "loss": 1.1907, + "num_input_tokens_seen": 13627832, + "step": 846 + }, + { + "epoch": 0.05933086413267138, + "grad_norm": 3.7192461490631104, + "learning_rate": 9.40724833625219e-05, + "loss": 1.165, + "num_input_tokens_seen": 13643824, + "step": 847 + }, + { + "epoch": 0.059400912378400623, + "grad_norm": 4.183239936828613, + "learning_rate": 9.406548511383537e-05, + "loss": 1.1697, + "num_input_tokens_seen": 13660208, + "step": 848 + }, + { + "epoch": 0.05947096062412987, + "grad_norm": 4.126212120056152, + "learning_rate": 9.405848686514886e-05, + "loss": 1.0532, + "num_input_tokens_seen": 13676592, + "step": 849 + }, + { + "epoch": 0.059541008869859115, + "grad_norm": 4.033525466918945, + "learning_rate": 9.405148861646235e-05, + "loss": 1.1497, + "num_input_tokens_seen": 13692600, + "step": 850 + }, + { + "epoch": 0.05961105711558836, + "grad_norm": 4.162797451019287, + "learning_rate": 9.404449036777584e-05, + "loss": 1.162, + "num_input_tokens_seen": 13708984, + "step": 851 + }, + { + "epoch": 0.059681105361317606, + "grad_norm": 4.057224750518799, + "learning_rate": 9.403749211908933e-05, + "loss": 1.2166, + "num_input_tokens_seen": 13724656, + "step": 852 + }, + { + "epoch": 0.05975115360704685, + "grad_norm": 4.201955318450928, + "learning_rate": 9.40304938704028e-05, + "loss": 1.2195, + "num_input_tokens_seen": 13741040, + "step": 853 + }, + { + "epoch": 0.0598212018527761, + "grad_norm": 3.8704352378845215, + "learning_rate": 9.402349562171629e-05, + "loss": 0.8946, + "num_input_tokens_seen": 13757424, + "step": 854 + }, + { + "epoch": 0.05989125009850534, + "grad_norm": 6.010958671569824, + "learning_rate": 9.401649737302978e-05, + "loss": 1.2095, + "num_input_tokens_seen": 13773808, + "step": 855 + }, + { + "epoch": 0.05996129834423459, + "grad_norm": 4.975742816925049, + "learning_rate": 9.400949912434326e-05, + "loss": 1.1064, + "num_input_tokens_seen": 13789704, + "step": 856 + }, + { + "epoch": 0.06003134658996384, + "grad_norm": 4.021739959716797, + "learning_rate": 9.400250087565675e-05, + "loss": 1.2036, + "num_input_tokens_seen": 13806088, + "step": 857 + }, + { + "epoch": 0.06010139483569309, + "grad_norm": 4.262394905090332, + "learning_rate": 9.399550262697023e-05, + "loss": 1.1053, + "num_input_tokens_seen": 13821928, + "step": 858 + }, + { + "epoch": 0.06017144308142233, + "grad_norm": 4.3033671379089355, + "learning_rate": 9.398850437828372e-05, + "loss": 1.0213, + "num_input_tokens_seen": 13838232, + "step": 859 + }, + { + "epoch": 0.06024149132715158, + "grad_norm": 4.066610336303711, + "learning_rate": 9.398150612959721e-05, + "loss": 1.0579, + "num_input_tokens_seen": 13853912, + "step": 860 + }, + { + "epoch": 0.06031153957288082, + "grad_norm": 4.308155059814453, + "learning_rate": 9.397450788091069e-05, + "loss": 1.3624, + "num_input_tokens_seen": 13870224, + "step": 861 + }, + { + "epoch": 0.06038158781861007, + "grad_norm": 4.307553291320801, + "learning_rate": 9.396750963222417e-05, + "loss": 1.0942, + "num_input_tokens_seen": 13886608, + "step": 862 + }, + { + "epoch": 0.060451636064339315, + "grad_norm": 3.8107142448425293, + "learning_rate": 9.396051138353765e-05, + "loss": 1.1285, + "num_input_tokens_seen": 13902992, + "step": 863 + }, + { + "epoch": 0.06052168431006856, + "grad_norm": 4.530765533447266, + "learning_rate": 9.395351313485114e-05, + "loss": 1.2028, + "num_input_tokens_seen": 13919376, + "step": 864 + }, + { + "epoch": 0.060591732555797806, + "grad_norm": 4.035069465637207, + "learning_rate": 9.394651488616463e-05, + "loss": 1.0291, + "num_input_tokens_seen": 13935664, + "step": 865 + }, + { + "epoch": 0.06066178080152705, + "grad_norm": 4.028316497802734, + "learning_rate": 9.393951663747812e-05, + "loss": 1.21, + "num_input_tokens_seen": 13951096, + "step": 866 + }, + { + "epoch": 0.0607318290472563, + "grad_norm": 4.039167881011963, + "learning_rate": 9.39325183887916e-05, + "loss": 0.929, + "num_input_tokens_seen": 13966272, + "step": 867 + }, + { + "epoch": 0.06080187729298554, + "grad_norm": 4.139703273773193, + "learning_rate": 9.392552014010508e-05, + "loss": 1.2575, + "num_input_tokens_seen": 13981848, + "step": 868 + }, + { + "epoch": 0.06087192553871479, + "grad_norm": 4.222180366516113, + "learning_rate": 9.391852189141857e-05, + "loss": 1.2067, + "num_input_tokens_seen": 13997920, + "step": 869 + }, + { + "epoch": 0.060941973784444034, + "grad_norm": 3.7993030548095703, + "learning_rate": 9.391152364273206e-05, + "loss": 1.0865, + "num_input_tokens_seen": 14014304, + "step": 870 + }, + { + "epoch": 0.06101202203017328, + "grad_norm": 4.811493396759033, + "learning_rate": 9.390452539404554e-05, + "loss": 1.1331, + "num_input_tokens_seen": 14030688, + "step": 871 + }, + { + "epoch": 0.061082070275902525, + "grad_norm": 13.88792610168457, + "learning_rate": 9.389752714535903e-05, + "loss": 1.1368, + "num_input_tokens_seen": 14045584, + "step": 872 + }, + { + "epoch": 0.06115211852163177, + "grad_norm": 3.7678709030151367, + "learning_rate": 9.389052889667251e-05, + "loss": 1.1012, + "num_input_tokens_seen": 14061968, + "step": 873 + }, + { + "epoch": 0.061222166767361016, + "grad_norm": 4.252075672149658, + "learning_rate": 9.3883530647986e-05, + "loss": 1.0472, + "num_input_tokens_seen": 14077584, + "step": 874 + }, + { + "epoch": 0.06129221501309027, + "grad_norm": 3.555629253387451, + "learning_rate": 9.387653239929947e-05, + "loss": 0.8653, + "num_input_tokens_seen": 14093704, + "step": 875 + }, + { + "epoch": 0.061362263258819515, + "grad_norm": 4.122331619262695, + "learning_rate": 9.386953415061296e-05, + "loss": 1.0395, + "num_input_tokens_seen": 14109624, + "step": 876 + }, + { + "epoch": 0.06143231150454876, + "grad_norm": 3.6772518157958984, + "learning_rate": 9.386253590192645e-05, + "loss": 0.8842, + "num_input_tokens_seen": 14126008, + "step": 877 + }, + { + "epoch": 0.061502359750278006, + "grad_norm": 3.791351079940796, + "learning_rate": 9.385553765323994e-05, + "loss": 1.1118, + "num_input_tokens_seen": 14142392, + "step": 878 + }, + { + "epoch": 0.06157240799600725, + "grad_norm": 3.781759738922119, + "learning_rate": 9.384853940455343e-05, + "loss": 1.0577, + "num_input_tokens_seen": 14158776, + "step": 879 + }, + { + "epoch": 0.0616424562417365, + "grad_norm": 4.2420830726623535, + "learning_rate": 9.38415411558669e-05, + "loss": 1.268, + "num_input_tokens_seen": 14173920, + "step": 880 + }, + { + "epoch": 0.06171250448746574, + "grad_norm": 4.000860214233398, + "learning_rate": 9.383454290718039e-05, + "loss": 1.1626, + "num_input_tokens_seen": 14190032, + "step": 881 + }, + { + "epoch": 0.06178255273319499, + "grad_norm": 3.760969877243042, + "learning_rate": 9.382754465849388e-05, + "loss": 0.9684, + "num_input_tokens_seen": 14206416, + "step": 882 + }, + { + "epoch": 0.061852600978924234, + "grad_norm": 4.81919002532959, + "learning_rate": 9.382054640980735e-05, + "loss": 1.1056, + "num_input_tokens_seen": 14222408, + "step": 883 + }, + { + "epoch": 0.06192264922465348, + "grad_norm": 4.951950550079346, + "learning_rate": 9.381354816112084e-05, + "loss": 1.0334, + "num_input_tokens_seen": 14238616, + "step": 884 + }, + { + "epoch": 0.061992697470382725, + "grad_norm": 4.15132999420166, + "learning_rate": 9.380654991243433e-05, + "loss": 1.3171, + "num_input_tokens_seen": 14254968, + "step": 885 + }, + { + "epoch": 0.06206274571611197, + "grad_norm": 5.100244998931885, + "learning_rate": 9.379955166374782e-05, + "loss": 1.1684, + "num_input_tokens_seen": 14271352, + "step": 886 + }, + { + "epoch": 0.062132793961841216, + "grad_norm": 5.999105453491211, + "learning_rate": 9.379255341506131e-05, + "loss": 0.9824, + "num_input_tokens_seen": 14287496, + "step": 887 + }, + { + "epoch": 0.06220284220757046, + "grad_norm": 3.8826348781585693, + "learning_rate": 9.378555516637478e-05, + "loss": 1.0829, + "num_input_tokens_seen": 14303880, + "step": 888 + }, + { + "epoch": 0.06227289045329971, + "grad_norm": 5.308819770812988, + "learning_rate": 9.377855691768827e-05, + "loss": 1.1377, + "num_input_tokens_seen": 14320264, + "step": 889 + }, + { + "epoch": 0.06234293869902895, + "grad_norm": 4.383331775665283, + "learning_rate": 9.377155866900175e-05, + "loss": 1.0147, + "num_input_tokens_seen": 14336232, + "step": 890 + }, + { + "epoch": 0.0624129869447582, + "grad_norm": 4.335045337677002, + "learning_rate": 9.376456042031524e-05, + "loss": 0.9807, + "num_input_tokens_seen": 14351704, + "step": 891 + }, + { + "epoch": 0.06248303519048745, + "grad_norm": 3.6901326179504395, + "learning_rate": 9.375756217162872e-05, + "loss": 1.0494, + "num_input_tokens_seen": 14368088, + "step": 892 + }, + { + "epoch": 0.0625530834362167, + "grad_norm": 3.912727117538452, + "learning_rate": 9.375056392294221e-05, + "loss": 1.1191, + "num_input_tokens_seen": 14383904, + "step": 893 + }, + { + "epoch": 0.06262313168194594, + "grad_norm": 3.5688252449035645, + "learning_rate": 9.37435656742557e-05, + "loss": 0.833, + "num_input_tokens_seen": 14399648, + "step": 894 + }, + { + "epoch": 0.06269317992767519, + "grad_norm": 4.6460137367248535, + "learning_rate": 9.373656742556918e-05, + "loss": 1.2523, + "num_input_tokens_seen": 14415640, + "step": 895 + }, + { + "epoch": 0.06276322817340443, + "grad_norm": 3.8113012313842773, + "learning_rate": 9.372956917688266e-05, + "loss": 1.1789, + "num_input_tokens_seen": 14432024, + "step": 896 + }, + { + "epoch": 0.06283327641913368, + "grad_norm": 3.8755953311920166, + "learning_rate": 9.372257092819615e-05, + "loss": 1.1506, + "num_input_tokens_seen": 14448152, + "step": 897 + }, + { + "epoch": 0.06290332466486293, + "grad_norm": 4.225901126861572, + "learning_rate": 9.371557267950964e-05, + "loss": 1.0754, + "num_input_tokens_seen": 14464536, + "step": 898 + }, + { + "epoch": 0.06297337291059217, + "grad_norm": 3.9437992572784424, + "learning_rate": 9.370857443082313e-05, + "loss": 1.049, + "num_input_tokens_seen": 14480072, + "step": 899 + }, + { + "epoch": 0.06304342115632142, + "grad_norm": 3.8961846828460693, + "learning_rate": 9.37015761821366e-05, + "loss": 1.1925, + "num_input_tokens_seen": 14496456, + "step": 900 + }, + { + "epoch": 0.06311346940205066, + "grad_norm": 4.844581604003906, + "learning_rate": 9.36945779334501e-05, + "loss": 1.0867, + "num_input_tokens_seen": 14512520, + "step": 901 + }, + { + "epoch": 0.06318351764777991, + "grad_norm": 4.89027214050293, + "learning_rate": 9.368757968476357e-05, + "loss": 1.0997, + "num_input_tokens_seen": 14528904, + "step": 902 + }, + { + "epoch": 0.06325356589350915, + "grad_norm": 4.303073883056641, + "learning_rate": 9.368058143607706e-05, + "loss": 1.0626, + "num_input_tokens_seen": 14545288, + "step": 903 + }, + { + "epoch": 0.0633236141392384, + "grad_norm": 5.145171165466309, + "learning_rate": 9.367358318739055e-05, + "loss": 1.3597, + "num_input_tokens_seen": 14561672, + "step": 904 + }, + { + "epoch": 0.06339366238496764, + "grad_norm": 5.7905964851379395, + "learning_rate": 9.366658493870403e-05, + "loss": 1.1075, + "num_input_tokens_seen": 14575896, + "step": 905 + }, + { + "epoch": 0.06346371063069689, + "grad_norm": 3.7394728660583496, + "learning_rate": 9.365958669001752e-05, + "loss": 0.9347, + "num_input_tokens_seen": 14592280, + "step": 906 + }, + { + "epoch": 0.06353375887642614, + "grad_norm": 3.916626453399658, + "learning_rate": 9.3652588441331e-05, + "loss": 1.0793, + "num_input_tokens_seen": 14608072, + "step": 907 + }, + { + "epoch": 0.06360380712215538, + "grad_norm": 5.088227272033691, + "learning_rate": 9.364559019264449e-05, + "loss": 1.158, + "num_input_tokens_seen": 14624360, + "step": 908 + }, + { + "epoch": 0.06367385536788463, + "grad_norm": 3.8519606590270996, + "learning_rate": 9.363859194395798e-05, + "loss": 1.1235, + "num_input_tokens_seen": 14640744, + "step": 909 + }, + { + "epoch": 0.06374390361361387, + "grad_norm": 4.450200080871582, + "learning_rate": 9.363159369527145e-05, + "loss": 1.0145, + "num_input_tokens_seen": 14657128, + "step": 910 + }, + { + "epoch": 0.06381395185934312, + "grad_norm": 4.188115119934082, + "learning_rate": 9.362459544658494e-05, + "loss": 1.1457, + "num_input_tokens_seen": 14673128, + "step": 911 + }, + { + "epoch": 0.06388400010507236, + "grad_norm": 4.67346715927124, + "learning_rate": 9.361759719789843e-05, + "loss": 1.2841, + "num_input_tokens_seen": 14689512, + "step": 912 + }, + { + "epoch": 0.06395404835080161, + "grad_norm": 3.737790822982788, + "learning_rate": 9.361059894921192e-05, + "loss": 1.0114, + "num_input_tokens_seen": 14705872, + "step": 913 + }, + { + "epoch": 0.06402409659653086, + "grad_norm": 4.2486653327941895, + "learning_rate": 9.36036007005254e-05, + "loss": 1.1526, + "num_input_tokens_seen": 14721816, + "step": 914 + }, + { + "epoch": 0.0640941448422601, + "grad_norm": 4.120566368103027, + "learning_rate": 9.359660245183888e-05, + "loss": 1.1045, + "num_input_tokens_seen": 14738200, + "step": 915 + }, + { + "epoch": 0.06416419308798935, + "grad_norm": 5.259902477264404, + "learning_rate": 9.358960420315237e-05, + "loss": 1.3544, + "num_input_tokens_seen": 14753920, + "step": 916 + }, + { + "epoch": 0.06423424133371859, + "grad_norm": 3.900827646255493, + "learning_rate": 9.358260595446584e-05, + "loss": 1.1079, + "num_input_tokens_seen": 14769640, + "step": 917 + }, + { + "epoch": 0.06430428957944785, + "grad_norm": 4.103065490722656, + "learning_rate": 9.357560770577935e-05, + "loss": 0.963, + "num_input_tokens_seen": 14786024, + "step": 918 + }, + { + "epoch": 0.0643743378251771, + "grad_norm": 3.9913623332977295, + "learning_rate": 9.356860945709282e-05, + "loss": 1.0959, + "num_input_tokens_seen": 14802408, + "step": 919 + }, + { + "epoch": 0.06444438607090634, + "grad_norm": 3.7369885444641113, + "learning_rate": 9.356161120840631e-05, + "loss": 1.131, + "num_input_tokens_seen": 14818792, + "step": 920 + }, + { + "epoch": 0.06451443431663559, + "grad_norm": 4.029351711273193, + "learning_rate": 9.35546129597198e-05, + "loss": 1.0378, + "num_input_tokens_seen": 14833792, + "step": 921 + }, + { + "epoch": 0.06458448256236483, + "grad_norm": 4.043665885925293, + "learning_rate": 9.354761471103327e-05, + "loss": 1.179, + "num_input_tokens_seen": 14850176, + "step": 922 + }, + { + "epoch": 0.06465453080809408, + "grad_norm": 3.7803280353546143, + "learning_rate": 9.354061646234676e-05, + "loss": 0.9886, + "num_input_tokens_seen": 14866096, + "step": 923 + }, + { + "epoch": 0.06472457905382333, + "grad_norm": 5.537375450134277, + "learning_rate": 9.353361821366025e-05, + "loss": 1.2519, + "num_input_tokens_seen": 14882480, + "step": 924 + }, + { + "epoch": 0.06479462729955257, + "grad_norm": 4.944652557373047, + "learning_rate": 9.352661996497374e-05, + "loss": 1.1963, + "num_input_tokens_seen": 14898864, + "step": 925 + }, + { + "epoch": 0.06486467554528182, + "grad_norm": 4.3231611251831055, + "learning_rate": 9.351962171628723e-05, + "loss": 1.1858, + "num_input_tokens_seen": 14913856, + "step": 926 + }, + { + "epoch": 0.06493472379101106, + "grad_norm": 4.386692523956299, + "learning_rate": 9.35126234676007e-05, + "loss": 1.0464, + "num_input_tokens_seen": 14929816, + "step": 927 + }, + { + "epoch": 0.06500477203674031, + "grad_norm": 4.607088088989258, + "learning_rate": 9.350562521891419e-05, + "loss": 1.2197, + "num_input_tokens_seen": 14946200, + "step": 928 + }, + { + "epoch": 0.06507482028246955, + "grad_norm": 4.7108001708984375, + "learning_rate": 9.349862697022767e-05, + "loss": 1.2335, + "num_input_tokens_seen": 14961816, + "step": 929 + }, + { + "epoch": 0.0651448685281988, + "grad_norm": 3.844571352005005, + "learning_rate": 9.349162872154115e-05, + "loss": 1.2745, + "num_input_tokens_seen": 14978200, + "step": 930 + }, + { + "epoch": 0.06521491677392804, + "grad_norm": 4.078561782836914, + "learning_rate": 9.348463047285464e-05, + "loss": 1.1737, + "num_input_tokens_seen": 14994440, + "step": 931 + }, + { + "epoch": 0.06528496501965729, + "grad_norm": 4.317986011505127, + "learning_rate": 9.347763222416813e-05, + "loss": 1.3046, + "num_input_tokens_seen": 15010824, + "step": 932 + }, + { + "epoch": 0.06535501326538654, + "grad_norm": 4.459141254425049, + "learning_rate": 9.347063397548162e-05, + "loss": 1.2893, + "num_input_tokens_seen": 15026608, + "step": 933 + }, + { + "epoch": 0.06542506151111578, + "grad_norm": 4.251399993896484, + "learning_rate": 9.34636357267951e-05, + "loss": 1.2346, + "num_input_tokens_seen": 15042328, + "step": 934 + }, + { + "epoch": 0.06549510975684503, + "grad_norm": 4.568341255187988, + "learning_rate": 9.345663747810858e-05, + "loss": 1.4343, + "num_input_tokens_seen": 15058712, + "step": 935 + }, + { + "epoch": 0.06556515800257427, + "grad_norm": 4.7616424560546875, + "learning_rate": 9.344963922942207e-05, + "loss": 1.0925, + "num_input_tokens_seen": 15075096, + "step": 936 + }, + { + "epoch": 0.06563520624830352, + "grad_norm": 3.8224191665649414, + "learning_rate": 9.344264098073555e-05, + "loss": 1.0958, + "num_input_tokens_seen": 15091480, + "step": 937 + }, + { + "epoch": 0.06570525449403276, + "grad_norm": 4.985624313354492, + "learning_rate": 9.343564273204905e-05, + "loss": 1.233, + "num_input_tokens_seen": 15107864, + "step": 938 + }, + { + "epoch": 0.06577530273976201, + "grad_norm": 4.3780975341796875, + "learning_rate": 9.342864448336252e-05, + "loss": 1.1819, + "num_input_tokens_seen": 15123656, + "step": 939 + }, + { + "epoch": 0.06584535098549125, + "grad_norm": 4.435183525085449, + "learning_rate": 9.342164623467601e-05, + "loss": 1.1107, + "num_input_tokens_seen": 15140040, + "step": 940 + }, + { + "epoch": 0.0659153992312205, + "grad_norm": 4.560804843902588, + "learning_rate": 9.34146479859895e-05, + "loss": 1.1274, + "num_input_tokens_seen": 15156424, + "step": 941 + }, + { + "epoch": 0.06598544747694975, + "grad_norm": 5.184841156005859, + "learning_rate": 9.340764973730298e-05, + "loss": 1.3124, + "num_input_tokens_seen": 15172504, + "step": 942 + }, + { + "epoch": 0.06605549572267899, + "grad_norm": 3.5243096351623535, + "learning_rate": 9.340065148861647e-05, + "loss": 0.8203, + "num_input_tokens_seen": 15188888, + "step": 943 + }, + { + "epoch": 0.06612554396840824, + "grad_norm": 4.041544437408447, + "learning_rate": 9.339365323992995e-05, + "loss": 1.0602, + "num_input_tokens_seen": 15204672, + "step": 944 + }, + { + "epoch": 0.06619559221413748, + "grad_norm": 3.720906972885132, + "learning_rate": 9.338665499124344e-05, + "loss": 1.0722, + "num_input_tokens_seen": 15220688, + "step": 945 + }, + { + "epoch": 0.06626564045986673, + "grad_norm": 3.9778380393981934, + "learning_rate": 9.337965674255692e-05, + "loss": 1.2653, + "num_input_tokens_seen": 15236856, + "step": 946 + }, + { + "epoch": 0.06633568870559597, + "grad_norm": 4.486488342285156, + "learning_rate": 9.33726584938704e-05, + "loss": 1.2408, + "num_input_tokens_seen": 15253240, + "step": 947 + }, + { + "epoch": 0.06640573695132522, + "grad_norm": 8.369994163513184, + "learning_rate": 9.33656602451839e-05, + "loss": 1.4841, + "num_input_tokens_seen": 15267728, + "step": 948 + }, + { + "epoch": 0.06647578519705447, + "grad_norm": 4.2056732177734375, + "learning_rate": 9.335866199649737e-05, + "loss": 1.4258, + "num_input_tokens_seen": 15284112, + "step": 949 + }, + { + "epoch": 0.06654583344278371, + "grad_norm": 4.396723747253418, + "learning_rate": 9.335166374781086e-05, + "loss": 1.1578, + "num_input_tokens_seen": 15300496, + "step": 950 + }, + { + "epoch": 0.06661588168851296, + "grad_norm": 3.7177491188049316, + "learning_rate": 9.334466549912435e-05, + "loss": 1.0664, + "num_input_tokens_seen": 15316608, + "step": 951 + }, + { + "epoch": 0.0666859299342422, + "grad_norm": 4.080933094024658, + "learning_rate": 9.333766725043784e-05, + "loss": 1.1282, + "num_input_tokens_seen": 15332976, + "step": 952 + }, + { + "epoch": 0.06675597817997146, + "grad_norm": 5.188856601715088, + "learning_rate": 9.333066900175132e-05, + "loss": 1.2079, + "num_input_tokens_seen": 15349080, + "step": 953 + }, + { + "epoch": 0.06682602642570071, + "grad_norm": 4.583539962768555, + "learning_rate": 9.33236707530648e-05, + "loss": 0.9047, + "num_input_tokens_seen": 15365256, + "step": 954 + }, + { + "epoch": 0.06689607467142995, + "grad_norm": 3.873830795288086, + "learning_rate": 9.331667250437829e-05, + "loss": 1.159, + "num_input_tokens_seen": 15381640, + "step": 955 + }, + { + "epoch": 0.0669661229171592, + "grad_norm": 3.9574460983276367, + "learning_rate": 9.330967425569176e-05, + "loss": 1.0696, + "num_input_tokens_seen": 15397800, + "step": 956 + }, + { + "epoch": 0.06703617116288844, + "grad_norm": 3.8933448791503906, + "learning_rate": 9.330267600700525e-05, + "loss": 0.9844, + "num_input_tokens_seen": 15414112, + "step": 957 + }, + { + "epoch": 0.06710621940861769, + "grad_norm": 4.748478412628174, + "learning_rate": 9.329567775831875e-05, + "loss": 1.1308, + "num_input_tokens_seen": 15430496, + "step": 958 + }, + { + "epoch": 0.06717626765434694, + "grad_norm": 6.755379676818848, + "learning_rate": 9.328867950963223e-05, + "loss": 1.206, + "num_input_tokens_seen": 15445072, + "step": 959 + }, + { + "epoch": 0.06724631590007618, + "grad_norm": 4.382065773010254, + "learning_rate": 9.328168126094572e-05, + "loss": 1.0753, + "num_input_tokens_seen": 15460336, + "step": 960 + }, + { + "epoch": 0.06731636414580543, + "grad_norm": 5.037116527557373, + "learning_rate": 9.327468301225919e-05, + "loss": 1.0562, + "num_input_tokens_seen": 15474752, + "step": 961 + }, + { + "epoch": 0.06738641239153467, + "grad_norm": 5.838945388793945, + "learning_rate": 9.326768476357268e-05, + "loss": 1.314, + "num_input_tokens_seen": 15491136, + "step": 962 + }, + { + "epoch": 0.06745646063726392, + "grad_norm": 3.690436840057373, + "learning_rate": 9.326068651488617e-05, + "loss": 0.996, + "num_input_tokens_seen": 15507520, + "step": 963 + }, + { + "epoch": 0.06752650888299316, + "grad_norm": 4.1123247146606445, + "learning_rate": 9.325368826619966e-05, + "loss": 1.2031, + "num_input_tokens_seen": 15523904, + "step": 964 + }, + { + "epoch": 0.06759655712872241, + "grad_norm": 4.120308876037598, + "learning_rate": 9.324669001751315e-05, + "loss": 0.9671, + "num_input_tokens_seen": 15540136, + "step": 965 + }, + { + "epoch": 0.06766660537445165, + "grad_norm": 3.9849514961242676, + "learning_rate": 9.323969176882662e-05, + "loss": 1.1669, + "num_input_tokens_seen": 15556312, + "step": 966 + }, + { + "epoch": 0.0677366536201809, + "grad_norm": 3.9164884090423584, + "learning_rate": 9.323269352014011e-05, + "loss": 1.0883, + "num_input_tokens_seen": 15571864, + "step": 967 + }, + { + "epoch": 0.06780670186591015, + "grad_norm": 4.282434940338135, + "learning_rate": 9.32256952714536e-05, + "loss": 1.241, + "num_input_tokens_seen": 15587800, + "step": 968 + }, + { + "epoch": 0.06787675011163939, + "grad_norm": 4.118724346160889, + "learning_rate": 9.321869702276707e-05, + "loss": 1.0905, + "num_input_tokens_seen": 15603128, + "step": 969 + }, + { + "epoch": 0.06794679835736864, + "grad_norm": 4.233770847320557, + "learning_rate": 9.321169877408056e-05, + "loss": 1.0618, + "num_input_tokens_seen": 15617864, + "step": 970 + }, + { + "epoch": 0.06801684660309788, + "grad_norm": 3.933587074279785, + "learning_rate": 9.320470052539405e-05, + "loss": 0.982, + "num_input_tokens_seen": 15634248, + "step": 971 + }, + { + "epoch": 0.06808689484882713, + "grad_norm": 4.641788482666016, + "learning_rate": 9.319770227670754e-05, + "loss": 0.9793, + "num_input_tokens_seen": 15650304, + "step": 972 + }, + { + "epoch": 0.06815694309455637, + "grad_norm": 4.138880729675293, + "learning_rate": 9.319070402802102e-05, + "loss": 1.1991, + "num_input_tokens_seen": 15666688, + "step": 973 + }, + { + "epoch": 0.06822699134028562, + "grad_norm": 4.823685169219971, + "learning_rate": 9.31837057793345e-05, + "loss": 0.9162, + "num_input_tokens_seen": 15682936, + "step": 974 + }, + { + "epoch": 0.06829703958601487, + "grad_norm": 4.432481288909912, + "learning_rate": 9.317670753064799e-05, + "loss": 0.9626, + "num_input_tokens_seen": 15699320, + "step": 975 + }, + { + "epoch": 0.06836708783174411, + "grad_norm": 4.115868091583252, + "learning_rate": 9.316970928196147e-05, + "loss": 1.105, + "num_input_tokens_seen": 15715296, + "step": 976 + }, + { + "epoch": 0.06843713607747336, + "grad_norm": 3.964905023574829, + "learning_rate": 9.316271103327496e-05, + "loss": 1.0064, + "num_input_tokens_seen": 15731680, + "step": 977 + }, + { + "epoch": 0.0685071843232026, + "grad_norm": 3.686522960662842, + "learning_rate": 9.315571278458846e-05, + "loss": 0.9924, + "num_input_tokens_seen": 15747808, + "step": 978 + }, + { + "epoch": 0.06857723256893185, + "grad_norm": 4.0614423751831055, + "learning_rate": 9.314871453590193e-05, + "loss": 1.0425, + "num_input_tokens_seen": 15764168, + "step": 979 + }, + { + "epoch": 0.0686472808146611, + "grad_norm": 3.756350517272949, + "learning_rate": 9.314171628721542e-05, + "loss": 1.0757, + "num_input_tokens_seen": 15780176, + "step": 980 + }, + { + "epoch": 0.06871732906039034, + "grad_norm": 4.30344820022583, + "learning_rate": 9.31347180385289e-05, + "loss": 0.9496, + "num_input_tokens_seen": 15795720, + "step": 981 + }, + { + "epoch": 0.06878737730611958, + "grad_norm": 4.055768013000488, + "learning_rate": 9.312771978984239e-05, + "loss": 1.0189, + "num_input_tokens_seen": 15811528, + "step": 982 + }, + { + "epoch": 0.06885742555184883, + "grad_norm": 3.8779115676879883, + "learning_rate": 9.312072154115586e-05, + "loss": 1.0516, + "num_input_tokens_seen": 15827392, + "step": 983 + }, + { + "epoch": 0.06892747379757808, + "grad_norm": 5.014206886291504, + "learning_rate": 9.311372329246936e-05, + "loss": 1.3421, + "num_input_tokens_seen": 15843776, + "step": 984 + }, + { + "epoch": 0.06899752204330732, + "grad_norm": 4.548489570617676, + "learning_rate": 9.310672504378285e-05, + "loss": 1.1652, + "num_input_tokens_seen": 15858880, + "step": 985 + }, + { + "epoch": 0.06906757028903657, + "grad_norm": 4.312918186187744, + "learning_rate": 9.309972679509633e-05, + "loss": 1.2728, + "num_input_tokens_seen": 15874840, + "step": 986 + }, + { + "epoch": 0.06913761853476583, + "grad_norm": 3.9783735275268555, + "learning_rate": 9.309272854640981e-05, + "loss": 0.9377, + "num_input_tokens_seen": 15890568, + "step": 987 + }, + { + "epoch": 0.06920766678049507, + "grad_norm": 4.155986309051514, + "learning_rate": 9.308573029772329e-05, + "loss": 1.0278, + "num_input_tokens_seen": 15906952, + "step": 988 + }, + { + "epoch": 0.06927771502622432, + "grad_norm": 3.633018732070923, + "learning_rate": 9.307873204903678e-05, + "loss": 1.1276, + "num_input_tokens_seen": 15923336, + "step": 989 + }, + { + "epoch": 0.06934776327195356, + "grad_norm": 3.9513449668884277, + "learning_rate": 9.307173380035027e-05, + "loss": 0.9076, + "num_input_tokens_seen": 15939720, + "step": 990 + }, + { + "epoch": 0.06941781151768281, + "grad_norm": 4.296191692352295, + "learning_rate": 9.306473555166376e-05, + "loss": 1.0375, + "num_input_tokens_seen": 15956104, + "step": 991 + }, + { + "epoch": 0.06948785976341205, + "grad_norm": 5.266847133636475, + "learning_rate": 9.305773730297724e-05, + "loss": 1.1645, + "num_input_tokens_seen": 15972488, + "step": 992 + }, + { + "epoch": 0.0695579080091413, + "grad_norm": 4.321287155151367, + "learning_rate": 9.305073905429072e-05, + "loss": 1.046, + "num_input_tokens_seen": 15988408, + "step": 993 + }, + { + "epoch": 0.06962795625487055, + "grad_norm": 4.1421613693237305, + "learning_rate": 9.304374080560421e-05, + "loss": 1.0639, + "num_input_tokens_seen": 16002904, + "step": 994 + }, + { + "epoch": 0.06969800450059979, + "grad_norm": 6.811270713806152, + "learning_rate": 9.30367425569177e-05, + "loss": 1.1012, + "num_input_tokens_seen": 16017424, + "step": 995 + }, + { + "epoch": 0.06976805274632904, + "grad_norm": 4.968684196472168, + "learning_rate": 9.302974430823117e-05, + "loss": 1.0935, + "num_input_tokens_seen": 16033808, + "step": 996 + }, + { + "epoch": 0.06983810099205828, + "grad_norm": 4.592737197875977, + "learning_rate": 9.302274605954466e-05, + "loss": 0.9698, + "num_input_tokens_seen": 16050192, + "step": 997 + }, + { + "epoch": 0.06990814923778753, + "grad_norm": 3.7984917163848877, + "learning_rate": 9.301574781085815e-05, + "loss": 1.0976, + "num_input_tokens_seen": 16066192, + "step": 998 + }, + { + "epoch": 0.06997819748351677, + "grad_norm": 4.594212055206299, + "learning_rate": 9.300874956217164e-05, + "loss": 1.3718, + "num_input_tokens_seen": 16082576, + "step": 999 + }, + { + "epoch": 0.07004824572924602, + "grad_norm": 5.062666893005371, + "learning_rate": 9.300175131348511e-05, + "loss": 1.3139, + "num_input_tokens_seen": 16098960, + "step": 1000 + }, + { + "epoch": 0.07004824572924602, + "eval_loss": 1.1650840044021606, + "eval_runtime": 0.192, + "eval_samples_per_second": 5.208, + "eval_steps_per_second": 5.208, + "num_input_tokens_seen": 16098960, + "step": 1000 + }, + { + "epoch": 0.07011829397497527, + "grad_norm": 4.100902557373047, + "learning_rate": 9.29947530647986e-05, + "loss": 1.2711, + "num_input_tokens_seen": 16115216, + "step": 1001 + }, + { + "epoch": 0.07018834222070451, + "grad_norm": 4.24728536605835, + "learning_rate": 9.298775481611209e-05, + "loss": 0.9946, + "num_input_tokens_seen": 16130080, + "step": 1002 + }, + { + "epoch": 0.07025839046643376, + "grad_norm": 3.4653356075286865, + "learning_rate": 9.298075656742556e-05, + "loss": 0.8736, + "num_input_tokens_seen": 16146400, + "step": 1003 + }, + { + "epoch": 0.070328438712163, + "grad_norm": 5.548775672912598, + "learning_rate": 9.297375831873907e-05, + "loss": 0.9841, + "num_input_tokens_seen": 16162784, + "step": 1004 + }, + { + "epoch": 0.07039848695789225, + "grad_norm": 4.11661958694458, + "learning_rate": 9.296676007005256e-05, + "loss": 0.9857, + "num_input_tokens_seen": 16179024, + "step": 1005 + }, + { + "epoch": 0.0704685352036215, + "grad_norm": 4.006300449371338, + "learning_rate": 9.295976182136603e-05, + "loss": 1.0587, + "num_input_tokens_seen": 16195408, + "step": 1006 + }, + { + "epoch": 0.07053858344935074, + "grad_norm": 4.418802261352539, + "learning_rate": 9.295276357267952e-05, + "loss": 1.3845, + "num_input_tokens_seen": 16211792, + "step": 1007 + }, + { + "epoch": 0.07060863169507998, + "grad_norm": 5.625720024108887, + "learning_rate": 9.2945765323993e-05, + "loss": 1.2198, + "num_input_tokens_seen": 16226584, + "step": 1008 + }, + { + "epoch": 0.07067867994080923, + "grad_norm": 4.209630489349365, + "learning_rate": 9.293876707530648e-05, + "loss": 0.9387, + "num_input_tokens_seen": 16242256, + "step": 1009 + }, + { + "epoch": 0.07074872818653848, + "grad_norm": 4.0324788093566895, + "learning_rate": 9.293176882661997e-05, + "loss": 1.0713, + "num_input_tokens_seen": 16258640, + "step": 1010 + }, + { + "epoch": 0.07081877643226772, + "grad_norm": 4.0557684898376465, + "learning_rate": 9.292477057793346e-05, + "loss": 1.2831, + "num_input_tokens_seen": 16275024, + "step": 1011 + }, + { + "epoch": 0.07088882467799697, + "grad_norm": 4.511384010314941, + "learning_rate": 9.291777232924695e-05, + "loss": 1.1949, + "num_input_tokens_seen": 16291112, + "step": 1012 + }, + { + "epoch": 0.07095887292372621, + "grad_norm": 3.8120172023773193, + "learning_rate": 9.291077408056042e-05, + "loss": 1.013, + "num_input_tokens_seen": 16307496, + "step": 1013 + }, + { + "epoch": 0.07102892116945546, + "grad_norm": 4.039558410644531, + "learning_rate": 9.290377583187391e-05, + "loss": 1.1575, + "num_input_tokens_seen": 16323880, + "step": 1014 + }, + { + "epoch": 0.0710989694151847, + "grad_norm": 3.9076366424560547, + "learning_rate": 9.289677758318739e-05, + "loss": 1.1776, + "num_input_tokens_seen": 16339624, + "step": 1015 + }, + { + "epoch": 0.07116901766091395, + "grad_norm": 3.8083527088165283, + "learning_rate": 9.288977933450088e-05, + "loss": 0.965, + "num_input_tokens_seen": 16356008, + "step": 1016 + }, + { + "epoch": 0.0712390659066432, + "grad_norm": 4.5387282371521, + "learning_rate": 9.288278108581436e-05, + "loss": 1.1113, + "num_input_tokens_seen": 16372392, + "step": 1017 + }, + { + "epoch": 0.07130911415237244, + "grad_norm": 3.9228522777557373, + "learning_rate": 9.287578283712785e-05, + "loss": 1.1609, + "num_input_tokens_seen": 16388776, + "step": 1018 + }, + { + "epoch": 0.07137916239810169, + "grad_norm": 4.170912742614746, + "learning_rate": 9.286878458844134e-05, + "loss": 1.1324, + "num_input_tokens_seen": 16405160, + "step": 1019 + }, + { + "epoch": 0.07144921064383093, + "grad_norm": 4.426759719848633, + "learning_rate": 9.286178633975482e-05, + "loss": 1.2825, + "num_input_tokens_seen": 16421544, + "step": 1020 + }, + { + "epoch": 0.07151925888956018, + "grad_norm": 3.8606133460998535, + "learning_rate": 9.28547880910683e-05, + "loss": 1.1734, + "num_input_tokens_seen": 16437736, + "step": 1021 + }, + { + "epoch": 0.07158930713528944, + "grad_norm": 4.040006637573242, + "learning_rate": 9.28477898423818e-05, + "loss": 1.0824, + "num_input_tokens_seen": 16453776, + "step": 1022 + }, + { + "epoch": 0.07165935538101868, + "grad_norm": 3.7698042392730713, + "learning_rate": 9.284079159369527e-05, + "loss": 1.0951, + "num_input_tokens_seen": 16470160, + "step": 1023 + }, + { + "epoch": 0.07172940362674793, + "grad_norm": 4.180328369140625, + "learning_rate": 9.283379334500877e-05, + "loss": 1.0087, + "num_input_tokens_seen": 16486280, + "step": 1024 + }, + { + "epoch": 0.07179945187247717, + "grad_norm": 6.02299690246582, + "learning_rate": 9.282679509632225e-05, + "loss": 0.9788, + "num_input_tokens_seen": 16501784, + "step": 1025 + }, + { + "epoch": 0.07186950011820642, + "grad_norm": 4.239454746246338, + "learning_rate": 9.281979684763573e-05, + "loss": 1.3031, + "num_input_tokens_seen": 16518096, + "step": 1026 + }, + { + "epoch": 0.07193954836393567, + "grad_norm": 3.446030616760254, + "learning_rate": 9.281279859894921e-05, + "loss": 0.9523, + "num_input_tokens_seen": 16534480, + "step": 1027 + }, + { + "epoch": 0.07200959660966491, + "grad_norm": 4.2813568115234375, + "learning_rate": 9.28058003502627e-05, + "loss": 1.1041, + "num_input_tokens_seen": 16550864, + "step": 1028 + }, + { + "epoch": 0.07207964485539416, + "grad_norm": 5.289443016052246, + "learning_rate": 9.279880210157619e-05, + "loss": 1.3036, + "num_input_tokens_seen": 16567248, + "step": 1029 + }, + { + "epoch": 0.0721496931011234, + "grad_norm": 3.680283308029175, + "learning_rate": 9.279180385288967e-05, + "loss": 1.1434, + "num_input_tokens_seen": 16583632, + "step": 1030 + }, + { + "epoch": 0.07221974134685265, + "grad_norm": 4.283925533294678, + "learning_rate": 9.278480560420316e-05, + "loss": 1.1569, + "num_input_tokens_seen": 16600016, + "step": 1031 + }, + { + "epoch": 0.0722897895925819, + "grad_norm": 4.913532733917236, + "learning_rate": 9.277780735551665e-05, + "loss": 1.218, + "num_input_tokens_seen": 16616400, + "step": 1032 + }, + { + "epoch": 0.07235983783831114, + "grad_norm": 4.344277381896973, + "learning_rate": 9.277080910683013e-05, + "loss": 1.1495, + "num_input_tokens_seen": 16632024, + "step": 1033 + }, + { + "epoch": 0.07242988608404038, + "grad_norm": 3.9231889247894287, + "learning_rate": 9.276381085814362e-05, + "loss": 1.0492, + "num_input_tokens_seen": 16648408, + "step": 1034 + }, + { + "epoch": 0.07249993432976963, + "grad_norm": 4.062288284301758, + "learning_rate": 9.275681260945709e-05, + "loss": 0.927, + "num_input_tokens_seen": 16664792, + "step": 1035 + }, + { + "epoch": 0.07256998257549888, + "grad_norm": 4.163131237030029, + "learning_rate": 9.274981436077058e-05, + "loss": 1.0782, + "num_input_tokens_seen": 16680216, + "step": 1036 + }, + { + "epoch": 0.07264003082122812, + "grad_norm": 5.220231056213379, + "learning_rate": 9.274281611208407e-05, + "loss": 1.125, + "num_input_tokens_seen": 16696160, + "step": 1037 + }, + { + "epoch": 0.07271007906695737, + "grad_norm": 3.63785457611084, + "learning_rate": 9.273581786339756e-05, + "loss": 1.0229, + "num_input_tokens_seen": 16712544, + "step": 1038 + }, + { + "epoch": 0.07278012731268661, + "grad_norm": 4.612295627593994, + "learning_rate": 9.272881961471105e-05, + "loss": 1.3076, + "num_input_tokens_seen": 16728928, + "step": 1039 + }, + { + "epoch": 0.07285017555841586, + "grad_norm": 5.278262615203857, + "learning_rate": 9.272182136602452e-05, + "loss": 1.2682, + "num_input_tokens_seen": 16744184, + "step": 1040 + }, + { + "epoch": 0.0729202238041451, + "grad_norm": 4.3274455070495605, + "learning_rate": 9.271482311733801e-05, + "loss": 1.3517, + "num_input_tokens_seen": 16760056, + "step": 1041 + }, + { + "epoch": 0.07299027204987435, + "grad_norm": 4.1077375411987305, + "learning_rate": 9.270782486865148e-05, + "loss": 1.175, + "num_input_tokens_seen": 16776280, + "step": 1042 + }, + { + "epoch": 0.0730603202956036, + "grad_norm": 3.954604148864746, + "learning_rate": 9.270082661996497e-05, + "loss": 1.189, + "num_input_tokens_seen": 16792456, + "step": 1043 + }, + { + "epoch": 0.07313036854133284, + "grad_norm": 4.111297607421875, + "learning_rate": 9.269382837127847e-05, + "loss": 1.0265, + "num_input_tokens_seen": 16808840, + "step": 1044 + }, + { + "epoch": 0.07320041678706209, + "grad_norm": 3.56953501701355, + "learning_rate": 9.268683012259195e-05, + "loss": 1.0114, + "num_input_tokens_seen": 16824720, + "step": 1045 + }, + { + "epoch": 0.07327046503279133, + "grad_norm": 4.962648868560791, + "learning_rate": 9.267983187390544e-05, + "loss": 1.1714, + "num_input_tokens_seen": 16841104, + "step": 1046 + }, + { + "epoch": 0.07334051327852058, + "grad_norm": 3.7930710315704346, + "learning_rate": 9.267283362521891e-05, + "loss": 1.0903, + "num_input_tokens_seen": 16857488, + "step": 1047 + }, + { + "epoch": 0.07341056152424982, + "grad_norm": 4.158027172088623, + "learning_rate": 9.26658353765324e-05, + "loss": 1.1823, + "num_input_tokens_seen": 16873856, + "step": 1048 + }, + { + "epoch": 0.07348060976997907, + "grad_norm": 4.1571197509765625, + "learning_rate": 9.265883712784589e-05, + "loss": 1.2572, + "num_input_tokens_seen": 16890240, + "step": 1049 + }, + { + "epoch": 0.07355065801570831, + "grad_norm": 4.330874443054199, + "learning_rate": 9.265183887915938e-05, + "loss": 1.194, + "num_input_tokens_seen": 16906624, + "step": 1050 + }, + { + "epoch": 0.07362070626143756, + "grad_norm": 6.105716705322266, + "learning_rate": 9.264484063047287e-05, + "loss": 1.0685, + "num_input_tokens_seen": 16922864, + "step": 1051 + }, + { + "epoch": 0.0736907545071668, + "grad_norm": 4.8344407081604, + "learning_rate": 9.263784238178634e-05, + "loss": 1.1992, + "num_input_tokens_seen": 16939200, + "step": 1052 + }, + { + "epoch": 0.07376080275289605, + "grad_norm": 3.553568124771118, + "learning_rate": 9.263084413309983e-05, + "loss": 0.7907, + "num_input_tokens_seen": 16955584, + "step": 1053 + }, + { + "epoch": 0.0738308509986253, + "grad_norm": 3.8178694248199463, + "learning_rate": 9.26238458844133e-05, + "loss": 1.2031, + "num_input_tokens_seen": 16971968, + "step": 1054 + }, + { + "epoch": 0.07390089924435454, + "grad_norm": 3.5509321689605713, + "learning_rate": 9.26168476357268e-05, + "loss": 1.1189, + "num_input_tokens_seen": 16988352, + "step": 1055 + }, + { + "epoch": 0.0739709474900838, + "grad_norm": 3.870811939239502, + "learning_rate": 9.260984938704028e-05, + "loss": 1.0205, + "num_input_tokens_seen": 17004736, + "step": 1056 + }, + { + "epoch": 0.07404099573581305, + "grad_norm": 11.86201286315918, + "learning_rate": 9.260285113835377e-05, + "loss": 1.037, + "num_input_tokens_seen": 17020544, + "step": 1057 + }, + { + "epoch": 0.0741110439815423, + "grad_norm": 5.2176127433776855, + "learning_rate": 9.259585288966726e-05, + "loss": 1.0797, + "num_input_tokens_seen": 17036472, + "step": 1058 + }, + { + "epoch": 0.07418109222727154, + "grad_norm": 3.72566819190979, + "learning_rate": 9.258885464098075e-05, + "loss": 0.9307, + "num_input_tokens_seen": 17052360, + "step": 1059 + }, + { + "epoch": 0.07425114047300078, + "grad_norm": 4.323361396789551, + "learning_rate": 9.258185639229422e-05, + "loss": 1.0783, + "num_input_tokens_seen": 17067672, + "step": 1060 + }, + { + "epoch": 0.07432118871873003, + "grad_norm": 4.01705265045166, + "learning_rate": 9.257485814360771e-05, + "loss": 1.0402, + "num_input_tokens_seen": 17084056, + "step": 1061 + }, + { + "epoch": 0.07439123696445928, + "grad_norm": 4.4460039138793945, + "learning_rate": 9.256785989492119e-05, + "loss": 1.2294, + "num_input_tokens_seen": 17100096, + "step": 1062 + }, + { + "epoch": 0.07446128521018852, + "grad_norm": 4.634500503540039, + "learning_rate": 9.256086164623468e-05, + "loss": 1.1479, + "num_input_tokens_seen": 17116440, + "step": 1063 + }, + { + "epoch": 0.07453133345591777, + "grad_norm": 4.146971702575684, + "learning_rate": 9.255386339754817e-05, + "loss": 0.9052, + "num_input_tokens_seen": 17132592, + "step": 1064 + }, + { + "epoch": 0.07460138170164701, + "grad_norm": 6.171874523162842, + "learning_rate": 9.254686514886165e-05, + "loss": 1.1135, + "num_input_tokens_seen": 17148704, + "step": 1065 + }, + { + "epoch": 0.07467142994737626, + "grad_norm": 6.25461483001709, + "learning_rate": 9.253986690017514e-05, + "loss": 1.0003, + "num_input_tokens_seen": 17164920, + "step": 1066 + }, + { + "epoch": 0.0747414781931055, + "grad_norm": 3.886582851409912, + "learning_rate": 9.253286865148862e-05, + "loss": 1.1917, + "num_input_tokens_seen": 17181304, + "step": 1067 + }, + { + "epoch": 0.07481152643883475, + "grad_norm": 5.067885398864746, + "learning_rate": 9.25258704028021e-05, + "loss": 1.4475, + "num_input_tokens_seen": 17197208, + "step": 1068 + }, + { + "epoch": 0.074881574684564, + "grad_norm": 4.186190128326416, + "learning_rate": 9.251887215411558e-05, + "loss": 1.1255, + "num_input_tokens_seen": 17212680, + "step": 1069 + }, + { + "epoch": 0.07495162293029324, + "grad_norm": 4.059047698974609, + "learning_rate": 9.251187390542908e-05, + "loss": 1.1467, + "num_input_tokens_seen": 17229064, + "step": 1070 + }, + { + "epoch": 0.07502167117602249, + "grad_norm": 4.154530048370361, + "learning_rate": 9.250487565674257e-05, + "loss": 1.0811, + "num_input_tokens_seen": 17245448, + "step": 1071 + }, + { + "epoch": 0.07509171942175173, + "grad_norm": 3.760453701019287, + "learning_rate": 9.249787740805605e-05, + "loss": 1.1493, + "num_input_tokens_seen": 17261832, + "step": 1072 + }, + { + "epoch": 0.07516176766748098, + "grad_norm": 3.8155417442321777, + "learning_rate": 9.249087915936954e-05, + "loss": 1.0934, + "num_input_tokens_seen": 17278216, + "step": 1073 + }, + { + "epoch": 0.07523181591321022, + "grad_norm": 4.807973384857178, + "learning_rate": 9.248388091068301e-05, + "loss": 1.0704, + "num_input_tokens_seen": 17294600, + "step": 1074 + }, + { + "epoch": 0.07530186415893947, + "grad_norm": 11.421661376953125, + "learning_rate": 9.24768826619965e-05, + "loss": 0.9472, + "num_input_tokens_seen": 17308960, + "step": 1075 + }, + { + "epoch": 0.07537191240466871, + "grad_norm": 3.7491819858551025, + "learning_rate": 9.246988441330999e-05, + "loss": 1.1395, + "num_input_tokens_seen": 17324536, + "step": 1076 + }, + { + "epoch": 0.07544196065039796, + "grad_norm": 3.6289992332458496, + "learning_rate": 9.246288616462348e-05, + "loss": 0.9375, + "num_input_tokens_seen": 17340920, + "step": 1077 + }, + { + "epoch": 0.0755120088961272, + "grad_norm": 5.741896629333496, + "learning_rate": 9.245588791593696e-05, + "loss": 1.1656, + "num_input_tokens_seen": 17357304, + "step": 1078 + }, + { + "epoch": 0.07558205714185645, + "grad_norm": 3.5879697799682617, + "learning_rate": 9.244888966725044e-05, + "loss": 0.9421, + "num_input_tokens_seen": 17373592, + "step": 1079 + }, + { + "epoch": 0.0756521053875857, + "grad_norm": 7.3384504318237305, + "learning_rate": 9.244189141856393e-05, + "loss": 1.1358, + "num_input_tokens_seen": 17387872, + "step": 1080 + }, + { + "epoch": 0.07572215363331494, + "grad_norm": 3.6677255630493164, + "learning_rate": 9.24348931698774e-05, + "loss": 0.892, + "num_input_tokens_seen": 17403088, + "step": 1081 + }, + { + "epoch": 0.07579220187904419, + "grad_norm": 3.953216075897217, + "learning_rate": 9.242789492119089e-05, + "loss": 0.9757, + "num_input_tokens_seen": 17419392, + "step": 1082 + }, + { + "epoch": 0.07586225012477343, + "grad_norm": 4.827987194061279, + "learning_rate": 9.242089667250438e-05, + "loss": 1.1493, + "num_input_tokens_seen": 17435776, + "step": 1083 + }, + { + "epoch": 0.07593229837050268, + "grad_norm": 4.416223526000977, + "learning_rate": 9.241389842381787e-05, + "loss": 0.9913, + "num_input_tokens_seen": 17452080, + "step": 1084 + }, + { + "epoch": 0.07600234661623193, + "grad_norm": 3.7776753902435303, + "learning_rate": 9.240690017513136e-05, + "loss": 1.0589, + "num_input_tokens_seen": 17468160, + "step": 1085 + }, + { + "epoch": 0.07607239486196117, + "grad_norm": 4.139477252960205, + "learning_rate": 9.239990192644485e-05, + "loss": 0.9475, + "num_input_tokens_seen": 17484544, + "step": 1086 + }, + { + "epoch": 0.07614244310769042, + "grad_norm": 5.218942642211914, + "learning_rate": 9.239290367775832e-05, + "loss": 1.1626, + "num_input_tokens_seen": 17500928, + "step": 1087 + }, + { + "epoch": 0.07621249135341966, + "grad_norm": 4.773080348968506, + "learning_rate": 9.238590542907181e-05, + "loss": 1.154, + "num_input_tokens_seen": 17517312, + "step": 1088 + }, + { + "epoch": 0.07628253959914891, + "grad_norm": 3.840151309967041, + "learning_rate": 9.237890718038528e-05, + "loss": 1.0862, + "num_input_tokens_seen": 17533696, + "step": 1089 + }, + { + "epoch": 0.07635258784487815, + "grad_norm": 4.201962471008301, + "learning_rate": 9.237190893169879e-05, + "loss": 1.0945, + "num_input_tokens_seen": 17549512, + "step": 1090 + }, + { + "epoch": 0.07642263609060741, + "grad_norm": 4.4583001136779785, + "learning_rate": 9.236491068301226e-05, + "loss": 1.074, + "num_input_tokens_seen": 17565896, + "step": 1091 + }, + { + "epoch": 0.07649268433633666, + "grad_norm": 4.013672351837158, + "learning_rate": 9.235791243432575e-05, + "loss": 1.2545, + "num_input_tokens_seen": 17582264, + "step": 1092 + }, + { + "epoch": 0.0765627325820659, + "grad_norm": 3.69555926322937, + "learning_rate": 9.235091418563924e-05, + "loss": 1.1615, + "num_input_tokens_seen": 17597888, + "step": 1093 + }, + { + "epoch": 0.07663278082779515, + "grad_norm": 4.341784954071045, + "learning_rate": 9.234391593695271e-05, + "loss": 1.0369, + "num_input_tokens_seen": 17613392, + "step": 1094 + }, + { + "epoch": 0.0767028290735244, + "grad_norm": 4.043522357940674, + "learning_rate": 9.23369176882662e-05, + "loss": 1.0509, + "num_input_tokens_seen": 17629216, + "step": 1095 + }, + { + "epoch": 0.07677287731925364, + "grad_norm": 4.330739498138428, + "learning_rate": 9.232991943957969e-05, + "loss": 1.2208, + "num_input_tokens_seen": 17645600, + "step": 1096 + }, + { + "epoch": 0.07684292556498289, + "grad_norm": 4.8433122634887695, + "learning_rate": 9.232292119089318e-05, + "loss": 0.9492, + "num_input_tokens_seen": 17660952, + "step": 1097 + }, + { + "epoch": 0.07691297381071213, + "grad_norm": 3.9039859771728516, + "learning_rate": 9.231592294220667e-05, + "loss": 1.0601, + "num_input_tokens_seen": 17677336, + "step": 1098 + }, + { + "epoch": 0.07698302205644138, + "grad_norm": 3.814103126525879, + "learning_rate": 9.230892469352014e-05, + "loss": 0.9902, + "num_input_tokens_seen": 17693720, + "step": 1099 + }, + { + "epoch": 0.07705307030217062, + "grad_norm": 3.9864039421081543, + "learning_rate": 9.230192644483363e-05, + "loss": 1.1622, + "num_input_tokens_seen": 17710104, + "step": 1100 + }, + { + "epoch": 0.07712311854789987, + "grad_norm": 4.469820499420166, + "learning_rate": 9.229492819614711e-05, + "loss": 1.044, + "num_input_tokens_seen": 17726488, + "step": 1101 + }, + { + "epoch": 0.07719316679362911, + "grad_norm": 3.8044216632843018, + "learning_rate": 9.22879299474606e-05, + "loss": 1.1283, + "num_input_tokens_seen": 17742648, + "step": 1102 + }, + { + "epoch": 0.07726321503935836, + "grad_norm": 4.859435558319092, + "learning_rate": 9.228093169877408e-05, + "loss": 1.0995, + "num_input_tokens_seen": 17759032, + "step": 1103 + }, + { + "epoch": 0.0773332632850876, + "grad_norm": 3.830214023590088, + "learning_rate": 9.227393345008757e-05, + "loss": 1.1731, + "num_input_tokens_seen": 17774872, + "step": 1104 + }, + { + "epoch": 0.07740331153081685, + "grad_norm": 4.196676254272461, + "learning_rate": 9.226693520140106e-05, + "loss": 1.2055, + "num_input_tokens_seen": 17790832, + "step": 1105 + }, + { + "epoch": 0.0774733597765461, + "grad_norm": 4.50007438659668, + "learning_rate": 9.225993695271454e-05, + "loss": 0.952, + "num_input_tokens_seen": 17805024, + "step": 1106 + }, + { + "epoch": 0.07754340802227534, + "grad_norm": 4.392070293426514, + "learning_rate": 9.225293870402803e-05, + "loss": 1.1548, + "num_input_tokens_seen": 17820008, + "step": 1107 + }, + { + "epoch": 0.07761345626800459, + "grad_norm": 4.09447717666626, + "learning_rate": 9.22459404553415e-05, + "loss": 1.1233, + "num_input_tokens_seen": 17836392, + "step": 1108 + }, + { + "epoch": 0.07768350451373383, + "grad_norm": 4.591554641723633, + "learning_rate": 9.223894220665499e-05, + "loss": 1.2772, + "num_input_tokens_seen": 17852776, + "step": 1109 + }, + { + "epoch": 0.07775355275946308, + "grad_norm": 5.629931926727295, + "learning_rate": 9.223194395796849e-05, + "loss": 1.1453, + "num_input_tokens_seen": 17869160, + "step": 1110 + }, + { + "epoch": 0.07782360100519232, + "grad_norm": 4.307553768157959, + "learning_rate": 9.222494570928197e-05, + "loss": 1.1479, + "num_input_tokens_seen": 17885544, + "step": 1111 + }, + { + "epoch": 0.07789364925092157, + "grad_norm": 4.599300384521484, + "learning_rate": 9.221794746059545e-05, + "loss": 1.1304, + "num_input_tokens_seen": 17901848, + "step": 1112 + }, + { + "epoch": 0.07796369749665082, + "grad_norm": 4.217408657073975, + "learning_rate": 9.221094921190894e-05, + "loss": 1.1611, + "num_input_tokens_seen": 17918232, + "step": 1113 + }, + { + "epoch": 0.07803374574238006, + "grad_norm": 3.885847568511963, + "learning_rate": 9.220395096322242e-05, + "loss": 0.968, + "num_input_tokens_seen": 17934504, + "step": 1114 + }, + { + "epoch": 0.07810379398810931, + "grad_norm": 4.280134677886963, + "learning_rate": 9.219695271453591e-05, + "loss": 1.0944, + "num_input_tokens_seen": 17950888, + "step": 1115 + }, + { + "epoch": 0.07817384223383855, + "grad_norm": 4.081259727478027, + "learning_rate": 9.21899544658494e-05, + "loss": 1.0872, + "num_input_tokens_seen": 17967088, + "step": 1116 + }, + { + "epoch": 0.0782438904795678, + "grad_norm": 4.206293106079102, + "learning_rate": 9.218295621716288e-05, + "loss": 1.2013, + "num_input_tokens_seen": 17983312, + "step": 1117 + }, + { + "epoch": 0.07831393872529704, + "grad_norm": 4.837226390838623, + "learning_rate": 9.217595796847636e-05, + "loss": 1.2628, + "num_input_tokens_seen": 17998768, + "step": 1118 + }, + { + "epoch": 0.07838398697102629, + "grad_norm": 4.344440460205078, + "learning_rate": 9.216895971978985e-05, + "loss": 1.0389, + "num_input_tokens_seen": 18014840, + "step": 1119 + }, + { + "epoch": 0.07845403521675554, + "grad_norm": 4.357896327972412, + "learning_rate": 9.216196147110334e-05, + "loss": 1.2444, + "num_input_tokens_seen": 18030696, + "step": 1120 + }, + { + "epoch": 0.07852408346248478, + "grad_norm": 3.6449878215789795, + "learning_rate": 9.215496322241681e-05, + "loss": 1.0622, + "num_input_tokens_seen": 18047024, + "step": 1121 + }, + { + "epoch": 0.07859413170821403, + "grad_norm": 4.154385566711426, + "learning_rate": 9.21479649737303e-05, + "loss": 1.1551, + "num_input_tokens_seen": 18063408, + "step": 1122 + }, + { + "epoch": 0.07866417995394327, + "grad_norm": 3.5929031372070312, + "learning_rate": 9.214096672504379e-05, + "loss": 0.9682, + "num_input_tokens_seen": 18079280, + "step": 1123 + }, + { + "epoch": 0.07873422819967252, + "grad_norm": 3.5724170207977295, + "learning_rate": 9.213396847635728e-05, + "loss": 0.8952, + "num_input_tokens_seen": 18094488, + "step": 1124 + }, + { + "epoch": 0.07880427644540176, + "grad_norm": 4.100067615509033, + "learning_rate": 9.212697022767077e-05, + "loss": 0.9066, + "num_input_tokens_seen": 18110872, + "step": 1125 + }, + { + "epoch": 0.07887432469113102, + "grad_norm": 4.431338787078857, + "learning_rate": 9.211997197898424e-05, + "loss": 1.0116, + "num_input_tokens_seen": 18127256, + "step": 1126 + }, + { + "epoch": 0.07894437293686027, + "grad_norm": 3.9577043056488037, + "learning_rate": 9.211297373029773e-05, + "loss": 1.1299, + "num_input_tokens_seen": 18143208, + "step": 1127 + }, + { + "epoch": 0.07901442118258951, + "grad_norm": 4.753921985626221, + "learning_rate": 9.21059754816112e-05, + "loss": 1.0686, + "num_input_tokens_seen": 18158888, + "step": 1128 + }, + { + "epoch": 0.07908446942831876, + "grad_norm": 3.763982057571411, + "learning_rate": 9.209897723292469e-05, + "loss": 1.0467, + "num_input_tokens_seen": 18175192, + "step": 1129 + }, + { + "epoch": 0.079154517674048, + "grad_norm": 3.729553699493408, + "learning_rate": 9.20919789842382e-05, + "loss": 1.1152, + "num_input_tokens_seen": 18191384, + "step": 1130 + }, + { + "epoch": 0.07922456591977725, + "grad_norm": 3.7760956287384033, + "learning_rate": 9.208498073555167e-05, + "loss": 1.0994, + "num_input_tokens_seen": 18207768, + "step": 1131 + }, + { + "epoch": 0.0792946141655065, + "grad_norm": 4.64035177230835, + "learning_rate": 9.207798248686516e-05, + "loss": 1.1037, + "num_input_tokens_seen": 18224152, + "step": 1132 + }, + { + "epoch": 0.07936466241123574, + "grad_norm": 4.1443352699279785, + "learning_rate": 9.207098423817863e-05, + "loss": 1.2329, + "num_input_tokens_seen": 18240536, + "step": 1133 + }, + { + "epoch": 0.07943471065696499, + "grad_norm": 5.332706451416016, + "learning_rate": 9.206398598949212e-05, + "loss": 1.1303, + "num_input_tokens_seen": 18255528, + "step": 1134 + }, + { + "epoch": 0.07950475890269423, + "grad_norm": 3.914705514907837, + "learning_rate": 9.20569877408056e-05, + "loss": 1.1182, + "num_input_tokens_seen": 18271768, + "step": 1135 + }, + { + "epoch": 0.07957480714842348, + "grad_norm": 4.994162559509277, + "learning_rate": 9.20499894921191e-05, + "loss": 1.175, + "num_input_tokens_seen": 18288152, + "step": 1136 + }, + { + "epoch": 0.07964485539415272, + "grad_norm": 4.132298946380615, + "learning_rate": 9.204299124343259e-05, + "loss": 0.9402, + "num_input_tokens_seen": 18303784, + "step": 1137 + }, + { + "epoch": 0.07971490363988197, + "grad_norm": 3.9048449993133545, + "learning_rate": 9.203599299474606e-05, + "loss": 1.1283, + "num_input_tokens_seen": 18319968, + "step": 1138 + }, + { + "epoch": 0.07978495188561122, + "grad_norm": 3.981844425201416, + "learning_rate": 9.202899474605955e-05, + "loss": 1.0472, + "num_input_tokens_seen": 18335976, + "step": 1139 + }, + { + "epoch": 0.07985500013134046, + "grad_norm": 4.491240501403809, + "learning_rate": 9.202199649737304e-05, + "loss": 1.1022, + "num_input_tokens_seen": 18352360, + "step": 1140 + }, + { + "epoch": 0.07992504837706971, + "grad_norm": 4.152430534362793, + "learning_rate": 9.201499824868652e-05, + "loss": 1.0688, + "num_input_tokens_seen": 18368736, + "step": 1141 + }, + { + "epoch": 0.07999509662279895, + "grad_norm": 4.337832450866699, + "learning_rate": 9.2008e-05, + "loss": 1.0397, + "num_input_tokens_seen": 18385120, + "step": 1142 + }, + { + "epoch": 0.0800651448685282, + "grad_norm": 4.865042209625244, + "learning_rate": 9.200100175131349e-05, + "loss": 0.9616, + "num_input_tokens_seen": 18401504, + "step": 1143 + }, + { + "epoch": 0.08013519311425744, + "grad_norm": 3.783113479614258, + "learning_rate": 9.199400350262698e-05, + "loss": 1.0001, + "num_input_tokens_seen": 18417176, + "step": 1144 + }, + { + "epoch": 0.08020524135998669, + "grad_norm": 4.98455286026001, + "learning_rate": 9.198700525394046e-05, + "loss": 1.2139, + "num_input_tokens_seen": 18432584, + "step": 1145 + }, + { + "epoch": 0.08027528960571594, + "grad_norm": 4.1859517097473145, + "learning_rate": 9.198000700525394e-05, + "loss": 1.1333, + "num_input_tokens_seen": 18448968, + "step": 1146 + }, + { + "epoch": 0.08034533785144518, + "grad_norm": 3.7193386554718018, + "learning_rate": 9.197300875656743e-05, + "loss": 1.0055, + "num_input_tokens_seen": 18465352, + "step": 1147 + }, + { + "epoch": 0.08041538609717443, + "grad_norm": 4.280893325805664, + "learning_rate": 9.196601050788091e-05, + "loss": 1.1261, + "num_input_tokens_seen": 18481736, + "step": 1148 + }, + { + "epoch": 0.08048543434290367, + "grad_norm": 3.9979352951049805, + "learning_rate": 9.19590122591944e-05, + "loss": 1.025, + "num_input_tokens_seen": 18498120, + "step": 1149 + }, + { + "epoch": 0.08055548258863292, + "grad_norm": 5.594225883483887, + "learning_rate": 9.195201401050789e-05, + "loss": 1.0527, + "num_input_tokens_seen": 18513944, + "step": 1150 + }, + { + "epoch": 0.08062553083436216, + "grad_norm": 4.758842468261719, + "learning_rate": 9.194501576182137e-05, + "loss": 1.0915, + "num_input_tokens_seen": 18530328, + "step": 1151 + }, + { + "epoch": 0.08069557908009141, + "grad_norm": 5.597489356994629, + "learning_rate": 9.193801751313486e-05, + "loss": 1.0673, + "num_input_tokens_seen": 18546632, + "step": 1152 + }, + { + "epoch": 0.08076562732582065, + "grad_norm": 5.279472827911377, + "learning_rate": 9.193101926444834e-05, + "loss": 1.2897, + "num_input_tokens_seen": 18561856, + "step": 1153 + }, + { + "epoch": 0.0808356755715499, + "grad_norm": 4.672069072723389, + "learning_rate": 9.192402101576183e-05, + "loss": 1.0298, + "num_input_tokens_seen": 18577944, + "step": 1154 + }, + { + "epoch": 0.08090572381727915, + "grad_norm": 3.65533447265625, + "learning_rate": 9.19170227670753e-05, + "loss": 0.933, + "num_input_tokens_seen": 18593720, + "step": 1155 + }, + { + "epoch": 0.08097577206300839, + "grad_norm": 4.212414741516113, + "learning_rate": 9.19100245183888e-05, + "loss": 1.0496, + "num_input_tokens_seen": 18609864, + "step": 1156 + }, + { + "epoch": 0.08104582030873764, + "grad_norm": 4.471503734588623, + "learning_rate": 9.190302626970229e-05, + "loss": 1.2261, + "num_input_tokens_seen": 18626248, + "step": 1157 + }, + { + "epoch": 0.08111586855446688, + "grad_norm": 4.952723979949951, + "learning_rate": 9.189602802101577e-05, + "loss": 1.056, + "num_input_tokens_seen": 18642632, + "step": 1158 + }, + { + "epoch": 0.08118591680019613, + "grad_norm": 3.921449661254883, + "learning_rate": 9.188902977232926e-05, + "loss": 1.1617, + "num_input_tokens_seen": 18659016, + "step": 1159 + }, + { + "epoch": 0.08125596504592539, + "grad_norm": 3.728752374649048, + "learning_rate": 9.188203152364273e-05, + "loss": 1.1217, + "num_input_tokens_seen": 18675400, + "step": 1160 + }, + { + "epoch": 0.08132601329165463, + "grad_norm": 3.8742613792419434, + "learning_rate": 9.187503327495622e-05, + "loss": 1.1538, + "num_input_tokens_seen": 18691232, + "step": 1161 + }, + { + "epoch": 0.08139606153738388, + "grad_norm": 3.827157735824585, + "learning_rate": 9.186803502626971e-05, + "loss": 1.1457, + "num_input_tokens_seen": 18707616, + "step": 1162 + }, + { + "epoch": 0.08146610978311312, + "grad_norm": 3.8507778644561768, + "learning_rate": 9.18610367775832e-05, + "loss": 1.0317, + "num_input_tokens_seen": 18724000, + "step": 1163 + }, + { + "epoch": 0.08153615802884237, + "grad_norm": 5.328095436096191, + "learning_rate": 9.185403852889669e-05, + "loss": 1.0921, + "num_input_tokens_seen": 18740384, + "step": 1164 + }, + { + "epoch": 0.08160620627457162, + "grad_norm": 4.8900322914123535, + "learning_rate": 9.184704028021016e-05, + "loss": 1.1308, + "num_input_tokens_seen": 18756768, + "step": 1165 + }, + { + "epoch": 0.08167625452030086, + "grad_norm": 3.810084104537964, + "learning_rate": 9.184004203152365e-05, + "loss": 1.1244, + "num_input_tokens_seen": 18772632, + "step": 1166 + }, + { + "epoch": 0.08174630276603011, + "grad_norm": 4.318419456481934, + "learning_rate": 9.183304378283714e-05, + "loss": 1.0372, + "num_input_tokens_seen": 18788272, + "step": 1167 + }, + { + "epoch": 0.08181635101175935, + "grad_norm": 4.093379020690918, + "learning_rate": 9.182604553415061e-05, + "loss": 1.18, + "num_input_tokens_seen": 18803672, + "step": 1168 + }, + { + "epoch": 0.0818863992574886, + "grad_norm": 4.630450248718262, + "learning_rate": 9.18190472854641e-05, + "loss": 1.1439, + "num_input_tokens_seen": 18820056, + "step": 1169 + }, + { + "epoch": 0.08195644750321784, + "grad_norm": 4.388457775115967, + "learning_rate": 9.181204903677759e-05, + "loss": 1.0971, + "num_input_tokens_seen": 18836440, + "step": 1170 + }, + { + "epoch": 0.08202649574894709, + "grad_norm": 3.6942262649536133, + "learning_rate": 9.180505078809108e-05, + "loss": 1.1594, + "num_input_tokens_seen": 18852824, + "step": 1171 + }, + { + "epoch": 0.08209654399467634, + "grad_norm": 3.937696933746338, + "learning_rate": 9.179805253940455e-05, + "loss": 1.1841, + "num_input_tokens_seen": 18869208, + "step": 1172 + }, + { + "epoch": 0.08216659224040558, + "grad_norm": 4.062703609466553, + "learning_rate": 9.179105429071804e-05, + "loss": 1.083, + "num_input_tokens_seen": 18885320, + "step": 1173 + }, + { + "epoch": 0.08223664048613483, + "grad_norm": 7.794081211090088, + "learning_rate": 9.178405604203153e-05, + "loss": 1.2287, + "num_input_tokens_seen": 18900224, + "step": 1174 + }, + { + "epoch": 0.08230668873186407, + "grad_norm": 4.429391860961914, + "learning_rate": 9.1777057793345e-05, + "loss": 1.0504, + "num_input_tokens_seen": 18916456, + "step": 1175 + }, + { + "epoch": 0.08237673697759332, + "grad_norm": 3.954869508743286, + "learning_rate": 9.17700595446585e-05, + "loss": 1.1558, + "num_input_tokens_seen": 18932840, + "step": 1176 + }, + { + "epoch": 0.08244678522332256, + "grad_norm": 5.555337429046631, + "learning_rate": 9.176306129597198e-05, + "loss": 1.3628, + "num_input_tokens_seen": 18949224, + "step": 1177 + }, + { + "epoch": 0.08251683346905181, + "grad_norm": 3.575295925140381, + "learning_rate": 9.175606304728547e-05, + "loss": 1.0651, + "num_input_tokens_seen": 18965552, + "step": 1178 + }, + { + "epoch": 0.08258688171478105, + "grad_norm": 5.927703380584717, + "learning_rate": 9.174906479859896e-05, + "loss": 1.0582, + "num_input_tokens_seen": 18981496, + "step": 1179 + }, + { + "epoch": 0.0826569299605103, + "grad_norm": 6.553986549377441, + "learning_rate": 9.174206654991243e-05, + "loss": 1.4058, + "num_input_tokens_seen": 18996808, + "step": 1180 + }, + { + "epoch": 0.08272697820623955, + "grad_norm": 4.315832138061523, + "learning_rate": 9.173506830122592e-05, + "loss": 1.1166, + "num_input_tokens_seen": 19013192, + "step": 1181 + }, + { + "epoch": 0.08279702645196879, + "grad_norm": 3.818033218383789, + "learning_rate": 9.172807005253941e-05, + "loss": 1.0744, + "num_input_tokens_seen": 19029464, + "step": 1182 + }, + { + "epoch": 0.08286707469769804, + "grad_norm": 3.4207711219787598, + "learning_rate": 9.17210718038529e-05, + "loss": 0.8952, + "num_input_tokens_seen": 19045592, + "step": 1183 + }, + { + "epoch": 0.08293712294342728, + "grad_norm": 4.3305864334106445, + "learning_rate": 9.171407355516639e-05, + "loss": 0.9617, + "num_input_tokens_seen": 19061864, + "step": 1184 + }, + { + "epoch": 0.08300717118915653, + "grad_norm": 5.365218162536621, + "learning_rate": 9.170707530647986e-05, + "loss": 1.1669, + "num_input_tokens_seen": 19075448, + "step": 1185 + }, + { + "epoch": 0.08307721943488577, + "grad_norm": 3.9939708709716797, + "learning_rate": 9.170007705779335e-05, + "loss": 1.1325, + "num_input_tokens_seen": 19091832, + "step": 1186 + }, + { + "epoch": 0.08314726768061502, + "grad_norm": 3.8088884353637695, + "learning_rate": 9.169307880910683e-05, + "loss": 1.0132, + "num_input_tokens_seen": 19107920, + "step": 1187 + }, + { + "epoch": 0.08321731592634427, + "grad_norm": 3.858799457550049, + "learning_rate": 9.168608056042032e-05, + "loss": 0.9805, + "num_input_tokens_seen": 19123776, + "step": 1188 + }, + { + "epoch": 0.08328736417207351, + "grad_norm": 4.042770862579346, + "learning_rate": 9.16790823117338e-05, + "loss": 1.1668, + "num_input_tokens_seen": 19139752, + "step": 1189 + }, + { + "epoch": 0.08335741241780276, + "grad_norm": 4.2054762840271, + "learning_rate": 9.16720840630473e-05, + "loss": 1.0702, + "num_input_tokens_seen": 19156136, + "step": 1190 + }, + { + "epoch": 0.083427460663532, + "grad_norm": 4.450238227844238, + "learning_rate": 9.166508581436078e-05, + "loss": 1.0751, + "num_input_tokens_seen": 19172240, + "step": 1191 + }, + { + "epoch": 0.08349750890926125, + "grad_norm": 4.126129627227783, + "learning_rate": 9.165808756567426e-05, + "loss": 0.9957, + "num_input_tokens_seen": 19188624, + "step": 1192 + }, + { + "epoch": 0.0835675571549905, + "grad_norm": 4.131893157958984, + "learning_rate": 9.165108931698775e-05, + "loss": 1.2004, + "num_input_tokens_seen": 19205008, + "step": 1193 + }, + { + "epoch": 0.08363760540071974, + "grad_norm": 4.25187873840332, + "learning_rate": 9.164409106830123e-05, + "loss": 1.3571, + "num_input_tokens_seen": 19220856, + "step": 1194 + }, + { + "epoch": 0.083707653646449, + "grad_norm": 3.842498302459717, + "learning_rate": 9.163709281961471e-05, + "loss": 1.0963, + "num_input_tokens_seen": 19237208, + "step": 1195 + }, + { + "epoch": 0.08377770189217824, + "grad_norm": 3.694279432296753, + "learning_rate": 9.16300945709282e-05, + "loss": 1.1177, + "num_input_tokens_seen": 19253592, + "step": 1196 + }, + { + "epoch": 0.08384775013790749, + "grad_norm": 4.382254123687744, + "learning_rate": 9.162309632224169e-05, + "loss": 1.0344, + "num_input_tokens_seen": 19269976, + "step": 1197 + }, + { + "epoch": 0.08391779838363674, + "grad_norm": 4.267289161682129, + "learning_rate": 9.161609807355518e-05, + "loss": 1.1211, + "num_input_tokens_seen": 19286360, + "step": 1198 + }, + { + "epoch": 0.08398784662936598, + "grad_norm": 5.554534435272217, + "learning_rate": 9.160909982486865e-05, + "loss": 0.9674, + "num_input_tokens_seen": 19301800, + "step": 1199 + }, + { + "epoch": 0.08405789487509523, + "grad_norm": 4.1479668617248535, + "learning_rate": 9.160210157618214e-05, + "loss": 1.2334, + "num_input_tokens_seen": 19317392, + "step": 1200 + }, + { + "epoch": 0.08405789487509523, + "eval_loss": 1.1600490808486938, + "eval_runtime": 0.2015, + "eval_samples_per_second": 4.962, + "eval_steps_per_second": 4.962, + "num_input_tokens_seen": 19317392, + "step": 1200 + }, + { + "epoch": 0.08412794312082447, + "grad_norm": 4.1876349449157715, + "learning_rate": 9.159510332749563e-05, + "loss": 1.2036, + "num_input_tokens_seen": 19333776, + "step": 1201 + }, + { + "epoch": 0.08419799136655372, + "grad_norm": 4.031203746795654, + "learning_rate": 9.15881050788091e-05, + "loss": 1.2127, + "num_input_tokens_seen": 19349616, + "step": 1202 + }, + { + "epoch": 0.08426803961228296, + "grad_norm": 4.013350963592529, + "learning_rate": 9.15811068301226e-05, + "loss": 1.2147, + "num_input_tokens_seen": 19366000, + "step": 1203 + }, + { + "epoch": 0.08433808785801221, + "grad_norm": 4.509790897369385, + "learning_rate": 9.157410858143608e-05, + "loss": 1.3484, + "num_input_tokens_seen": 19381904, + "step": 1204 + }, + { + "epoch": 0.08440813610374145, + "grad_norm": 4.630336761474609, + "learning_rate": 9.156711033274957e-05, + "loss": 1.0246, + "num_input_tokens_seen": 19398288, + "step": 1205 + }, + { + "epoch": 0.0844781843494707, + "grad_norm": 3.819884777069092, + "learning_rate": 9.156011208406304e-05, + "loss": 1.1242, + "num_input_tokens_seen": 19414248, + "step": 1206 + }, + { + "epoch": 0.08454823259519995, + "grad_norm": 3.7933132648468018, + "learning_rate": 9.155311383537653e-05, + "loss": 1.0766, + "num_input_tokens_seen": 19430632, + "step": 1207 + }, + { + "epoch": 0.08461828084092919, + "grad_norm": 5.7384934425354, + "learning_rate": 9.154611558669002e-05, + "loss": 1.0691, + "num_input_tokens_seen": 19446248, + "step": 1208 + }, + { + "epoch": 0.08468832908665844, + "grad_norm": 3.9594175815582275, + "learning_rate": 9.153911733800351e-05, + "loss": 1.2029, + "num_input_tokens_seen": 19462632, + "step": 1209 + }, + { + "epoch": 0.08475837733238768, + "grad_norm": 3.8251891136169434, + "learning_rate": 9.1532119089317e-05, + "loss": 0.9994, + "num_input_tokens_seen": 19479016, + "step": 1210 + }, + { + "epoch": 0.08482842557811693, + "grad_norm": 3.9750332832336426, + "learning_rate": 9.152512084063049e-05, + "loss": 1.1737, + "num_input_tokens_seen": 19495112, + "step": 1211 + }, + { + "epoch": 0.08489847382384617, + "grad_norm": 3.986170530319214, + "learning_rate": 9.151812259194396e-05, + "loss": 1.1441, + "num_input_tokens_seen": 19511216, + "step": 1212 + }, + { + "epoch": 0.08496852206957542, + "grad_norm": 3.914065361022949, + "learning_rate": 9.151112434325745e-05, + "loss": 1.2233, + "num_input_tokens_seen": 19527600, + "step": 1213 + }, + { + "epoch": 0.08503857031530467, + "grad_norm": 4.328094482421875, + "learning_rate": 9.150412609457093e-05, + "loss": 1.2076, + "num_input_tokens_seen": 19543984, + "step": 1214 + }, + { + "epoch": 0.08510861856103391, + "grad_norm": 4.112467288970947, + "learning_rate": 9.149712784588441e-05, + "loss": 1.1732, + "num_input_tokens_seen": 19560368, + "step": 1215 + }, + { + "epoch": 0.08517866680676316, + "grad_norm": 4.680009365081787, + "learning_rate": 9.14901295971979e-05, + "loss": 0.985, + "num_input_tokens_seen": 19575616, + "step": 1216 + }, + { + "epoch": 0.0852487150524924, + "grad_norm": 4.4872660636901855, + "learning_rate": 9.148313134851139e-05, + "loss": 1.1799, + "num_input_tokens_seen": 19592000, + "step": 1217 + }, + { + "epoch": 0.08531876329822165, + "grad_norm": 3.7546637058258057, + "learning_rate": 9.147613309982488e-05, + "loss": 1.1989, + "num_input_tokens_seen": 19608384, + "step": 1218 + }, + { + "epoch": 0.0853888115439509, + "grad_norm": 5.590888500213623, + "learning_rate": 9.146913485113835e-05, + "loss": 1.1411, + "num_input_tokens_seen": 19624768, + "step": 1219 + }, + { + "epoch": 0.08545885978968014, + "grad_norm": 3.958021640777588, + "learning_rate": 9.146213660245184e-05, + "loss": 0.9309, + "num_input_tokens_seen": 19641152, + "step": 1220 + }, + { + "epoch": 0.08552890803540938, + "grad_norm": 3.7641196250915527, + "learning_rate": 9.145513835376533e-05, + "loss": 1.0299, + "num_input_tokens_seen": 19657536, + "step": 1221 + }, + { + "epoch": 0.08559895628113863, + "grad_norm": 4.395461559295654, + "learning_rate": 9.14481401050788e-05, + "loss": 1.1404, + "num_input_tokens_seen": 19673712, + "step": 1222 + }, + { + "epoch": 0.08566900452686788, + "grad_norm": 3.8162319660186768, + "learning_rate": 9.144114185639231e-05, + "loss": 1.1638, + "num_input_tokens_seen": 19689336, + "step": 1223 + }, + { + "epoch": 0.08573905277259712, + "grad_norm": 3.7025444507598877, + "learning_rate": 9.143414360770578e-05, + "loss": 0.9995, + "num_input_tokens_seen": 19705464, + "step": 1224 + }, + { + "epoch": 0.08580910101832637, + "grad_norm": 3.8621439933776855, + "learning_rate": 9.142714535901927e-05, + "loss": 1.1639, + "num_input_tokens_seen": 19721848, + "step": 1225 + }, + { + "epoch": 0.08587914926405561, + "grad_norm": 4.243250846862793, + "learning_rate": 9.142014711033275e-05, + "loss": 1.0104, + "num_input_tokens_seen": 19738072, + "step": 1226 + }, + { + "epoch": 0.08594919750978486, + "grad_norm": 4.05800724029541, + "learning_rate": 9.141314886164624e-05, + "loss": 1.0257, + "num_input_tokens_seen": 19754456, + "step": 1227 + }, + { + "epoch": 0.0860192457555141, + "grad_norm": 4.0894455909729, + "learning_rate": 9.140615061295972e-05, + "loss": 1.254, + "num_input_tokens_seen": 19770840, + "step": 1228 + }, + { + "epoch": 0.08608929400124336, + "grad_norm": 4.296894073486328, + "learning_rate": 9.139915236427321e-05, + "loss": 1.1298, + "num_input_tokens_seen": 19786864, + "step": 1229 + }, + { + "epoch": 0.08615934224697261, + "grad_norm": 4.0352888107299805, + "learning_rate": 9.13921541155867e-05, + "loss": 1.0611, + "num_input_tokens_seen": 19801800, + "step": 1230 + }, + { + "epoch": 0.08622939049270185, + "grad_norm": 4.087375640869141, + "learning_rate": 9.138515586690018e-05, + "loss": 0.9686, + "num_input_tokens_seen": 19818184, + "step": 1231 + }, + { + "epoch": 0.0862994387384311, + "grad_norm": 4.045078754425049, + "learning_rate": 9.137815761821367e-05, + "loss": 1.0915, + "num_input_tokens_seen": 19833016, + "step": 1232 + }, + { + "epoch": 0.08636948698416035, + "grad_norm": 4.399363040924072, + "learning_rate": 9.137115936952714e-05, + "loss": 1.1875, + "num_input_tokens_seen": 19848912, + "step": 1233 + }, + { + "epoch": 0.08643953522988959, + "grad_norm": 4.420406818389893, + "learning_rate": 9.136416112084063e-05, + "loss": 1.0534, + "num_input_tokens_seen": 19865296, + "step": 1234 + }, + { + "epoch": 0.08650958347561884, + "grad_norm": 4.131808280944824, + "learning_rate": 9.135716287215412e-05, + "loss": 1.1865, + "num_input_tokens_seen": 19881376, + "step": 1235 + }, + { + "epoch": 0.08657963172134808, + "grad_norm": 3.8256850242614746, + "learning_rate": 9.13501646234676e-05, + "loss": 1.0539, + "num_input_tokens_seen": 19897704, + "step": 1236 + }, + { + "epoch": 0.08664967996707733, + "grad_norm": 4.3497233390808105, + "learning_rate": 9.13431663747811e-05, + "loss": 1.191, + "num_input_tokens_seen": 19914088, + "step": 1237 + }, + { + "epoch": 0.08671972821280657, + "grad_norm": 4.18136739730835, + "learning_rate": 9.133616812609458e-05, + "loss": 1.0539, + "num_input_tokens_seen": 19930128, + "step": 1238 + }, + { + "epoch": 0.08678977645853582, + "grad_norm": 4.782970905303955, + "learning_rate": 9.132916987740806e-05, + "loss": 1.1992, + "num_input_tokens_seen": 19946512, + "step": 1239 + }, + { + "epoch": 0.08685982470426507, + "grad_norm": 4.16589879989624, + "learning_rate": 9.132217162872155e-05, + "loss": 1.1463, + "num_input_tokens_seen": 19962488, + "step": 1240 + }, + { + "epoch": 0.08692987294999431, + "grad_norm": 3.73541522026062, + "learning_rate": 9.131517338003502e-05, + "loss": 1.0272, + "num_input_tokens_seen": 19978584, + "step": 1241 + }, + { + "epoch": 0.08699992119572356, + "grad_norm": 4.225815773010254, + "learning_rate": 9.130817513134851e-05, + "loss": 1.177, + "num_input_tokens_seen": 19994816, + "step": 1242 + }, + { + "epoch": 0.0870699694414528, + "grad_norm": 7.807470321655273, + "learning_rate": 9.1301176882662e-05, + "loss": 1.1635, + "num_input_tokens_seen": 20010576, + "step": 1243 + }, + { + "epoch": 0.08714001768718205, + "grad_norm": 4.818174839019775, + "learning_rate": 9.129417863397549e-05, + "loss": 1.1892, + "num_input_tokens_seen": 20025712, + "step": 1244 + }, + { + "epoch": 0.0872100659329113, + "grad_norm": 3.8367979526519775, + "learning_rate": 9.128718038528898e-05, + "loss": 1.0096, + "num_input_tokens_seen": 20041904, + "step": 1245 + }, + { + "epoch": 0.08728011417864054, + "grad_norm": 3.9912586212158203, + "learning_rate": 9.128018213660245e-05, + "loss": 1.097, + "num_input_tokens_seen": 20058288, + "step": 1246 + }, + { + "epoch": 0.08735016242436978, + "grad_norm": 4.842557907104492, + "learning_rate": 9.127318388791594e-05, + "loss": 1.2012, + "num_input_tokens_seen": 20074672, + "step": 1247 + }, + { + "epoch": 0.08742021067009903, + "grad_norm": 3.816938877105713, + "learning_rate": 9.126618563922943e-05, + "loss": 1.1683, + "num_input_tokens_seen": 20090664, + "step": 1248 + }, + { + "epoch": 0.08749025891582828, + "grad_norm": 3.712480306625366, + "learning_rate": 9.125918739054292e-05, + "loss": 1.1978, + "num_input_tokens_seen": 20107048, + "step": 1249 + }, + { + "epoch": 0.08756030716155752, + "grad_norm": 4.185492515563965, + "learning_rate": 9.12521891418564e-05, + "loss": 1.2042, + "num_input_tokens_seen": 20123432, + "step": 1250 + }, + { + "epoch": 0.08763035540728677, + "grad_norm": 5.510714530944824, + "learning_rate": 9.124519089316988e-05, + "loss": 0.9757, + "num_input_tokens_seen": 20139112, + "step": 1251 + }, + { + "epoch": 0.08770040365301601, + "grad_norm": 3.9170289039611816, + "learning_rate": 9.123819264448337e-05, + "loss": 1.0213, + "num_input_tokens_seen": 20155496, + "step": 1252 + }, + { + "epoch": 0.08777045189874526, + "grad_norm": 3.738008975982666, + "learning_rate": 9.123119439579684e-05, + "loss": 0.9446, + "num_input_tokens_seen": 20171760, + "step": 1253 + }, + { + "epoch": 0.0878405001444745, + "grad_norm": 4.845873832702637, + "learning_rate": 9.122419614711033e-05, + "loss": 1.2135, + "num_input_tokens_seen": 20188056, + "step": 1254 + }, + { + "epoch": 0.08791054839020375, + "grad_norm": 4.166906356811523, + "learning_rate": 9.121719789842382e-05, + "loss": 1.1558, + "num_input_tokens_seen": 20204440, + "step": 1255 + }, + { + "epoch": 0.087980596635933, + "grad_norm": 4.039194107055664, + "learning_rate": 9.121019964973731e-05, + "loss": 1.0297, + "num_input_tokens_seen": 20220824, + "step": 1256 + }, + { + "epoch": 0.08805064488166224, + "grad_norm": 3.545482635498047, + "learning_rate": 9.12032014010508e-05, + "loss": 0.9757, + "num_input_tokens_seen": 20236888, + "step": 1257 + }, + { + "epoch": 0.08812069312739149, + "grad_norm": 3.82114839553833, + "learning_rate": 9.119620315236427e-05, + "loss": 1.1637, + "num_input_tokens_seen": 20253272, + "step": 1258 + }, + { + "epoch": 0.08819074137312073, + "grad_norm": 4.770678997039795, + "learning_rate": 9.118920490367776e-05, + "loss": 1.1421, + "num_input_tokens_seen": 20269656, + "step": 1259 + }, + { + "epoch": 0.08826078961884998, + "grad_norm": 4.4319539070129395, + "learning_rate": 9.118220665499124e-05, + "loss": 1.1565, + "num_input_tokens_seen": 20285456, + "step": 1260 + }, + { + "epoch": 0.08833083786457922, + "grad_norm": 4.0923357009887695, + "learning_rate": 9.117520840630473e-05, + "loss": 1.2328, + "num_input_tokens_seen": 20301232, + "step": 1261 + }, + { + "epoch": 0.08840088611030847, + "grad_norm": 5.8347344398498535, + "learning_rate": 9.116821015761821e-05, + "loss": 0.8824, + "num_input_tokens_seen": 20317224, + "step": 1262 + }, + { + "epoch": 0.08847093435603771, + "grad_norm": 4.525367259979248, + "learning_rate": 9.11612119089317e-05, + "loss": 1.1554, + "num_input_tokens_seen": 20332616, + "step": 1263 + }, + { + "epoch": 0.08854098260176697, + "grad_norm": 3.9754436016082764, + "learning_rate": 9.115421366024519e-05, + "loss": 1.0423, + "num_input_tokens_seen": 20348336, + "step": 1264 + }, + { + "epoch": 0.08861103084749622, + "grad_norm": 4.40745735168457, + "learning_rate": 9.114721541155868e-05, + "loss": 1.0485, + "num_input_tokens_seen": 20364312, + "step": 1265 + }, + { + "epoch": 0.08868107909322547, + "grad_norm": 7.126221179962158, + "learning_rate": 9.114021716287216e-05, + "loss": 1.2035, + "num_input_tokens_seen": 20380696, + "step": 1266 + }, + { + "epoch": 0.08875112733895471, + "grad_norm": 4.306386947631836, + "learning_rate": 9.113321891418564e-05, + "loss": 1.0399, + "num_input_tokens_seen": 20397080, + "step": 1267 + }, + { + "epoch": 0.08882117558468396, + "grad_norm": 3.566943407058716, + "learning_rate": 9.112622066549912e-05, + "loss": 1.0463, + "num_input_tokens_seen": 20413464, + "step": 1268 + }, + { + "epoch": 0.0888912238304132, + "grad_norm": 3.975228786468506, + "learning_rate": 9.111922241681262e-05, + "loss": 1.2576, + "num_input_tokens_seen": 20429848, + "step": 1269 + }, + { + "epoch": 0.08896127207614245, + "grad_norm": 4.928854465484619, + "learning_rate": 9.11122241681261e-05, + "loss": 1.1555, + "num_input_tokens_seen": 20446192, + "step": 1270 + }, + { + "epoch": 0.0890313203218717, + "grad_norm": 4.288821697235107, + "learning_rate": 9.110522591943958e-05, + "loss": 1.2559, + "num_input_tokens_seen": 20462576, + "step": 1271 + }, + { + "epoch": 0.08910136856760094, + "grad_norm": 3.9346396923065186, + "learning_rate": 9.109822767075307e-05, + "loss": 1.1479, + "num_input_tokens_seen": 20478520, + "step": 1272 + }, + { + "epoch": 0.08917141681333018, + "grad_norm": 3.7976620197296143, + "learning_rate": 9.109122942206655e-05, + "loss": 0.9903, + "num_input_tokens_seen": 20494408, + "step": 1273 + }, + { + "epoch": 0.08924146505905943, + "grad_norm": 5.373577117919922, + "learning_rate": 9.108423117338004e-05, + "loss": 0.8863, + "num_input_tokens_seen": 20510792, + "step": 1274 + }, + { + "epoch": 0.08931151330478868, + "grad_norm": 4.248324394226074, + "learning_rate": 9.107723292469353e-05, + "loss": 1.3492, + "num_input_tokens_seen": 20527064, + "step": 1275 + }, + { + "epoch": 0.08938156155051792, + "grad_norm": 4.453672885894775, + "learning_rate": 9.107023467600701e-05, + "loss": 0.9763, + "num_input_tokens_seen": 20543448, + "step": 1276 + }, + { + "epoch": 0.08945160979624717, + "grad_norm": 4.8721184730529785, + "learning_rate": 9.10632364273205e-05, + "loss": 0.9455, + "num_input_tokens_seen": 20559832, + "step": 1277 + }, + { + "epoch": 0.08952165804197641, + "grad_norm": 5.0173540115356445, + "learning_rate": 9.105623817863398e-05, + "loss": 1.0303, + "num_input_tokens_seen": 20576216, + "step": 1278 + }, + { + "epoch": 0.08959170628770566, + "grad_norm": 5.00100040435791, + "learning_rate": 9.104923992994747e-05, + "loss": 1.0393, + "num_input_tokens_seen": 20592600, + "step": 1279 + }, + { + "epoch": 0.0896617545334349, + "grad_norm": 4.271099090576172, + "learning_rate": 9.104224168126094e-05, + "loss": 1.2307, + "num_input_tokens_seen": 20608632, + "step": 1280 + }, + { + "epoch": 0.08973180277916415, + "grad_norm": 4.246976852416992, + "learning_rate": 9.103524343257443e-05, + "loss": 1.1405, + "num_input_tokens_seen": 20625016, + "step": 1281 + }, + { + "epoch": 0.0898018510248934, + "grad_norm": 5.033923149108887, + "learning_rate": 9.102824518388792e-05, + "loss": 1.0849, + "num_input_tokens_seen": 20641400, + "step": 1282 + }, + { + "epoch": 0.08987189927062264, + "grad_norm": 4.4118571281433105, + "learning_rate": 9.102124693520141e-05, + "loss": 1.118, + "num_input_tokens_seen": 20657448, + "step": 1283 + }, + { + "epoch": 0.08994194751635189, + "grad_norm": 4.150144577026367, + "learning_rate": 9.10142486865149e-05, + "loss": 1.0676, + "num_input_tokens_seen": 20673080, + "step": 1284 + }, + { + "epoch": 0.09001199576208113, + "grad_norm": 3.767683744430542, + "learning_rate": 9.100725043782837e-05, + "loss": 0.8968, + "num_input_tokens_seen": 20689464, + "step": 1285 + }, + { + "epoch": 0.09008204400781038, + "grad_norm": 4.816582202911377, + "learning_rate": 9.100025218914186e-05, + "loss": 1.0039, + "num_input_tokens_seen": 20703896, + "step": 1286 + }, + { + "epoch": 0.09015209225353962, + "grad_norm": 3.8913414478302, + "learning_rate": 9.099325394045533e-05, + "loss": 1.0077, + "num_input_tokens_seen": 20720280, + "step": 1287 + }, + { + "epoch": 0.09022214049926887, + "grad_norm": 4.305298328399658, + "learning_rate": 9.098625569176882e-05, + "loss": 1.1555, + "num_input_tokens_seen": 20735944, + "step": 1288 + }, + { + "epoch": 0.09029218874499811, + "grad_norm": 3.3120992183685303, + "learning_rate": 9.097925744308233e-05, + "loss": 0.8591, + "num_input_tokens_seen": 20752128, + "step": 1289 + }, + { + "epoch": 0.09036223699072736, + "grad_norm": 4.705013751983643, + "learning_rate": 9.09722591943958e-05, + "loss": 1.4579, + "num_input_tokens_seen": 20768512, + "step": 1290 + }, + { + "epoch": 0.0904322852364566, + "grad_norm": 5.08630895614624, + "learning_rate": 9.096526094570929e-05, + "loss": 1.1049, + "num_input_tokens_seen": 20783976, + "step": 1291 + }, + { + "epoch": 0.09050233348218585, + "grad_norm": 3.634686231613159, + "learning_rate": 9.095826269702278e-05, + "loss": 1.0344, + "num_input_tokens_seen": 20800360, + "step": 1292 + }, + { + "epoch": 0.0905723817279151, + "grad_norm": 4.220744609832764, + "learning_rate": 9.095126444833625e-05, + "loss": 1.1843, + "num_input_tokens_seen": 20816744, + "step": 1293 + }, + { + "epoch": 0.09064242997364434, + "grad_norm": 4.724472522735596, + "learning_rate": 9.094426619964974e-05, + "loss": 1.1365, + "num_input_tokens_seen": 20833128, + "step": 1294 + }, + { + "epoch": 0.09071247821937359, + "grad_norm": 3.9398090839385986, + "learning_rate": 9.093726795096323e-05, + "loss": 1.0703, + "num_input_tokens_seen": 20849448, + "step": 1295 + }, + { + "epoch": 0.09078252646510283, + "grad_norm": 4.260062217712402, + "learning_rate": 9.093026970227672e-05, + "loss": 1.0968, + "num_input_tokens_seen": 20865832, + "step": 1296 + }, + { + "epoch": 0.09085257471083208, + "grad_norm": 4.383310317993164, + "learning_rate": 9.09232714535902e-05, + "loss": 1.2542, + "num_input_tokens_seen": 20881288, + "step": 1297 + }, + { + "epoch": 0.09092262295656132, + "grad_norm": 4.479433059692383, + "learning_rate": 9.091627320490368e-05, + "loss": 0.9533, + "num_input_tokens_seen": 20897328, + "step": 1298 + }, + { + "epoch": 0.09099267120229058, + "grad_norm": 4.911858081817627, + "learning_rate": 9.090927495621717e-05, + "loss": 1.3399, + "num_input_tokens_seen": 20913712, + "step": 1299 + }, + { + "epoch": 0.09106271944801983, + "grad_norm": 4.015485763549805, + "learning_rate": 9.090227670753065e-05, + "loss": 1.1156, + "num_input_tokens_seen": 20929984, + "step": 1300 + }, + { + "epoch": 0.09113276769374908, + "grad_norm": 3.8690338134765625, + "learning_rate": 9.089527845884413e-05, + "loss": 1.0634, + "num_input_tokens_seen": 20946368, + "step": 1301 + }, + { + "epoch": 0.09120281593947832, + "grad_norm": 5.142012596130371, + "learning_rate": 9.088828021015762e-05, + "loss": 1.0579, + "num_input_tokens_seen": 20962752, + "step": 1302 + }, + { + "epoch": 0.09127286418520757, + "grad_norm": 3.954049587249756, + "learning_rate": 9.088128196147111e-05, + "loss": 1.0862, + "num_input_tokens_seen": 20979136, + "step": 1303 + }, + { + "epoch": 0.09134291243093681, + "grad_norm": 4.13312292098999, + "learning_rate": 9.08742837127846e-05, + "loss": 1.0548, + "num_input_tokens_seen": 20995520, + "step": 1304 + }, + { + "epoch": 0.09141296067666606, + "grad_norm": 4.24699592590332, + "learning_rate": 9.086728546409808e-05, + "loss": 1.0126, + "num_input_tokens_seen": 21011904, + "step": 1305 + }, + { + "epoch": 0.0914830089223953, + "grad_norm": 4.847048759460449, + "learning_rate": 9.086028721541156e-05, + "loss": 0.9973, + "num_input_tokens_seen": 21027664, + "step": 1306 + }, + { + "epoch": 0.09155305716812455, + "grad_norm": 4.573661804199219, + "learning_rate": 9.085328896672504e-05, + "loss": 1.005, + "num_input_tokens_seen": 21044048, + "step": 1307 + }, + { + "epoch": 0.0916231054138538, + "grad_norm": 4.13530158996582, + "learning_rate": 9.084629071803853e-05, + "loss": 1.1033, + "num_input_tokens_seen": 21060432, + "step": 1308 + }, + { + "epoch": 0.09169315365958304, + "grad_norm": 4.017937183380127, + "learning_rate": 9.083929246935203e-05, + "loss": 1.1971, + "num_input_tokens_seen": 21076816, + "step": 1309 + }, + { + "epoch": 0.09176320190531229, + "grad_norm": 5.928586483001709, + "learning_rate": 9.08322942206655e-05, + "loss": 1.0547, + "num_input_tokens_seen": 21093200, + "step": 1310 + }, + { + "epoch": 0.09183325015104153, + "grad_norm": 4.2442169189453125, + "learning_rate": 9.082529597197899e-05, + "loss": 1.2794, + "num_input_tokens_seen": 21109256, + "step": 1311 + }, + { + "epoch": 0.09190329839677078, + "grad_norm": 4.891444683074951, + "learning_rate": 9.081829772329247e-05, + "loss": 1.1833, + "num_input_tokens_seen": 21124848, + "step": 1312 + }, + { + "epoch": 0.09197334664250002, + "grad_norm": 4.323850154876709, + "learning_rate": 9.081129947460596e-05, + "loss": 1.1683, + "num_input_tokens_seen": 21141176, + "step": 1313 + }, + { + "epoch": 0.09204339488822927, + "grad_norm": 4.239765644073486, + "learning_rate": 9.080430122591943e-05, + "loss": 1.1073, + "num_input_tokens_seen": 21157240, + "step": 1314 + }, + { + "epoch": 0.09211344313395851, + "grad_norm": 4.12881326675415, + "learning_rate": 9.079730297723293e-05, + "loss": 1.2522, + "num_input_tokens_seen": 21173624, + "step": 1315 + }, + { + "epoch": 0.09218349137968776, + "grad_norm": 4.238161087036133, + "learning_rate": 9.079030472854642e-05, + "loss": 1.1828, + "num_input_tokens_seen": 21190008, + "step": 1316 + }, + { + "epoch": 0.092253539625417, + "grad_norm": 4.124176502227783, + "learning_rate": 9.07833064798599e-05, + "loss": 1.1388, + "num_input_tokens_seen": 21206392, + "step": 1317 + }, + { + "epoch": 0.09232358787114625, + "grad_norm": 3.772136926651001, + "learning_rate": 9.077630823117339e-05, + "loss": 1.068, + "num_input_tokens_seen": 21222776, + "step": 1318 + }, + { + "epoch": 0.0923936361168755, + "grad_norm": 4.628321170806885, + "learning_rate": 9.076930998248687e-05, + "loss": 1.2363, + "num_input_tokens_seen": 21239160, + "step": 1319 + }, + { + "epoch": 0.09246368436260474, + "grad_norm": 5.3034348487854, + "learning_rate": 9.076231173380035e-05, + "loss": 1.0638, + "num_input_tokens_seen": 21255544, + "step": 1320 + }, + { + "epoch": 0.09253373260833399, + "grad_norm": 3.6543760299682617, + "learning_rate": 9.075531348511384e-05, + "loss": 1.0071, + "num_input_tokens_seen": 21271928, + "step": 1321 + }, + { + "epoch": 0.09260378085406323, + "grad_norm": 4.1335062980651855, + "learning_rate": 9.074831523642733e-05, + "loss": 1.084, + "num_input_tokens_seen": 21288312, + "step": 1322 + }, + { + "epoch": 0.09267382909979248, + "grad_norm": 3.6392204761505127, + "learning_rate": 9.074131698774082e-05, + "loss": 1.1146, + "num_input_tokens_seen": 21304696, + "step": 1323 + }, + { + "epoch": 0.09274387734552172, + "grad_norm": 4.035269737243652, + "learning_rate": 9.073431873905429e-05, + "loss": 0.9578, + "num_input_tokens_seen": 21321080, + "step": 1324 + }, + { + "epoch": 0.09281392559125097, + "grad_norm": 4.650269508361816, + "learning_rate": 9.072732049036778e-05, + "loss": 1.0242, + "num_input_tokens_seen": 21337464, + "step": 1325 + }, + { + "epoch": 0.09288397383698022, + "grad_norm": 5.850543022155762, + "learning_rate": 9.072032224168127e-05, + "loss": 1.1196, + "num_input_tokens_seen": 21352968, + "step": 1326 + }, + { + "epoch": 0.09295402208270946, + "grad_norm": 4.177901744842529, + "learning_rate": 9.071332399299474e-05, + "loss": 1.1351, + "num_input_tokens_seen": 21368968, + "step": 1327 + }, + { + "epoch": 0.09302407032843871, + "grad_norm": 4.582173824310303, + "learning_rate": 9.070632574430823e-05, + "loss": 0.9115, + "num_input_tokens_seen": 21385352, + "step": 1328 + }, + { + "epoch": 0.09309411857416795, + "grad_norm": 4.7911787033081055, + "learning_rate": 9.069932749562173e-05, + "loss": 1.0413, + "num_input_tokens_seen": 21401144, + "step": 1329 + }, + { + "epoch": 0.0931641668198972, + "grad_norm": 4.058457374572754, + "learning_rate": 9.069232924693521e-05, + "loss": 1.0611, + "num_input_tokens_seen": 21416640, + "step": 1330 + }, + { + "epoch": 0.09323421506562644, + "grad_norm": 4.972208499908447, + "learning_rate": 9.06853309982487e-05, + "loss": 1.016, + "num_input_tokens_seen": 21433024, + "step": 1331 + }, + { + "epoch": 0.09330426331135569, + "grad_norm": 4.0875091552734375, + "learning_rate": 9.067833274956217e-05, + "loss": 1.089, + "num_input_tokens_seen": 21448888, + "step": 1332 + }, + { + "epoch": 0.09337431155708495, + "grad_norm": 3.923112154006958, + "learning_rate": 9.067133450087566e-05, + "loss": 0.9824, + "num_input_tokens_seen": 21465272, + "step": 1333 + }, + { + "epoch": 0.0934443598028142, + "grad_norm": 4.067697525024414, + "learning_rate": 9.066433625218914e-05, + "loss": 1.0492, + "num_input_tokens_seen": 21481656, + "step": 1334 + }, + { + "epoch": 0.09351440804854344, + "grad_norm": 4.185417652130127, + "learning_rate": 9.065733800350264e-05, + "loss": 1.1073, + "num_input_tokens_seen": 21498040, + "step": 1335 + }, + { + "epoch": 0.09358445629427269, + "grad_norm": 7.31542444229126, + "learning_rate": 9.065033975481613e-05, + "loss": 1.4322, + "num_input_tokens_seen": 21514088, + "step": 1336 + }, + { + "epoch": 0.09365450454000193, + "grad_norm": 4.754745006561279, + "learning_rate": 9.06433415061296e-05, + "loss": 0.9953, + "num_input_tokens_seen": 21530472, + "step": 1337 + }, + { + "epoch": 0.09372455278573118, + "grad_norm": 5.81265926361084, + "learning_rate": 9.063634325744309e-05, + "loss": 1.1434, + "num_input_tokens_seen": 21545728, + "step": 1338 + }, + { + "epoch": 0.09379460103146042, + "grad_norm": 5.586238861083984, + "learning_rate": 9.062934500875657e-05, + "loss": 0.9818, + "num_input_tokens_seen": 21562112, + "step": 1339 + }, + { + "epoch": 0.09386464927718967, + "grad_norm": 4.096534729003906, + "learning_rate": 9.062234676007005e-05, + "loss": 1.1856, + "num_input_tokens_seen": 21578496, + "step": 1340 + }, + { + "epoch": 0.09393469752291891, + "grad_norm": 4.913814544677734, + "learning_rate": 9.061534851138354e-05, + "loss": 1.041, + "num_input_tokens_seen": 21594792, + "step": 1341 + }, + { + "epoch": 0.09400474576864816, + "grad_norm": 3.8853912353515625, + "learning_rate": 9.060835026269703e-05, + "loss": 1.1651, + "num_input_tokens_seen": 21611176, + "step": 1342 + }, + { + "epoch": 0.0940747940143774, + "grad_norm": 4.187959671020508, + "learning_rate": 9.060135201401052e-05, + "loss": 1.1757, + "num_input_tokens_seen": 21627560, + "step": 1343 + }, + { + "epoch": 0.09414484226010665, + "grad_norm": 4.128627777099609, + "learning_rate": 9.0594353765324e-05, + "loss": 0.9243, + "num_input_tokens_seen": 21643576, + "step": 1344 + }, + { + "epoch": 0.0942148905058359, + "grad_norm": 4.7016825675964355, + "learning_rate": 9.058735551663748e-05, + "loss": 1.2425, + "num_input_tokens_seen": 21658600, + "step": 1345 + }, + { + "epoch": 0.09428493875156514, + "grad_norm": 3.970548391342163, + "learning_rate": 9.058035726795097e-05, + "loss": 1.0495, + "num_input_tokens_seen": 21674264, + "step": 1346 + }, + { + "epoch": 0.09435498699729439, + "grad_norm": 3.812196731567383, + "learning_rate": 9.057335901926445e-05, + "loss": 0.9558, + "num_input_tokens_seen": 21690112, + "step": 1347 + }, + { + "epoch": 0.09442503524302363, + "grad_norm": 3.6845176219940186, + "learning_rate": 9.056636077057794e-05, + "loss": 0.9758, + "num_input_tokens_seen": 21705744, + "step": 1348 + }, + { + "epoch": 0.09449508348875288, + "grad_norm": 4.119202136993408, + "learning_rate": 9.055936252189142e-05, + "loss": 1.0948, + "num_input_tokens_seen": 21721776, + "step": 1349 + }, + { + "epoch": 0.09456513173448212, + "grad_norm": 4.176985740661621, + "learning_rate": 9.055236427320491e-05, + "loss": 0.9475, + "num_input_tokens_seen": 21737912, + "step": 1350 + }, + { + "epoch": 0.09463517998021137, + "grad_norm": 4.057264804840088, + "learning_rate": 9.054536602451839e-05, + "loss": 1.1746, + "num_input_tokens_seen": 21754296, + "step": 1351 + }, + { + "epoch": 0.09470522822594062, + "grad_norm": 4.5631914138793945, + "learning_rate": 9.053836777583188e-05, + "loss": 1.0894, + "num_input_tokens_seen": 21770680, + "step": 1352 + }, + { + "epoch": 0.09477527647166986, + "grad_norm": 4.854849815368652, + "learning_rate": 9.053136952714536e-05, + "loss": 1.0686, + "num_input_tokens_seen": 21787064, + "step": 1353 + }, + { + "epoch": 0.09484532471739911, + "grad_norm": 5.326946258544922, + "learning_rate": 9.052437127845884e-05, + "loss": 0.872, + "num_input_tokens_seen": 21803448, + "step": 1354 + }, + { + "epoch": 0.09491537296312835, + "grad_norm": 4.283742904663086, + "learning_rate": 9.051737302977234e-05, + "loss": 1.2683, + "num_input_tokens_seen": 21819832, + "step": 1355 + }, + { + "epoch": 0.0949854212088576, + "grad_norm": 4.165935039520264, + "learning_rate": 9.051037478108582e-05, + "loss": 0.977, + "num_input_tokens_seen": 21836216, + "step": 1356 + }, + { + "epoch": 0.09505546945458684, + "grad_norm": 4.502480983734131, + "learning_rate": 9.05033765323993e-05, + "loss": 1.2854, + "num_input_tokens_seen": 21852600, + "step": 1357 + }, + { + "epoch": 0.09512551770031609, + "grad_norm": 4.185445308685303, + "learning_rate": 9.04963782837128e-05, + "loss": 1.2225, + "num_input_tokens_seen": 21868984, + "step": 1358 + }, + { + "epoch": 0.09519556594604534, + "grad_norm": 7.288909435272217, + "learning_rate": 9.048938003502627e-05, + "loss": 1.154, + "num_input_tokens_seen": 21884648, + "step": 1359 + }, + { + "epoch": 0.09526561419177458, + "grad_norm": 4.038896560668945, + "learning_rate": 9.048238178633976e-05, + "loss": 1.1437, + "num_input_tokens_seen": 21900704, + "step": 1360 + }, + { + "epoch": 0.09533566243750383, + "grad_norm": 4.216241836547852, + "learning_rate": 9.047538353765325e-05, + "loss": 1.1379, + "num_input_tokens_seen": 21916520, + "step": 1361 + }, + { + "epoch": 0.09540571068323307, + "grad_norm": 4.2549147605896, + "learning_rate": 9.046838528896673e-05, + "loss": 1.2578, + "num_input_tokens_seen": 21932904, + "step": 1362 + }, + { + "epoch": 0.09547575892896232, + "grad_norm": 3.6919445991516113, + "learning_rate": 9.046138704028022e-05, + "loss": 0.9876, + "num_input_tokens_seen": 21949288, + "step": 1363 + }, + { + "epoch": 0.09554580717469156, + "grad_norm": 5.467876434326172, + "learning_rate": 9.04543887915937e-05, + "loss": 0.9735, + "num_input_tokens_seen": 21965672, + "step": 1364 + }, + { + "epoch": 0.09561585542042081, + "grad_norm": 4.036736011505127, + "learning_rate": 9.044739054290719e-05, + "loss": 1.0712, + "num_input_tokens_seen": 21980792, + "step": 1365 + }, + { + "epoch": 0.09568590366615005, + "grad_norm": 4.083346843719482, + "learning_rate": 9.044039229422066e-05, + "loss": 1.0883, + "num_input_tokens_seen": 21996888, + "step": 1366 + }, + { + "epoch": 0.0957559519118793, + "grad_norm": 3.553262948989868, + "learning_rate": 9.043339404553415e-05, + "loss": 1.0116, + "num_input_tokens_seen": 22013160, + "step": 1367 + }, + { + "epoch": 0.09582600015760856, + "grad_norm": 4.787721633911133, + "learning_rate": 9.042639579684764e-05, + "loss": 1.1444, + "num_input_tokens_seen": 22029544, + "step": 1368 + }, + { + "epoch": 0.0958960484033378, + "grad_norm": 3.8053700923919678, + "learning_rate": 9.041939754816113e-05, + "loss": 1.1654, + "num_input_tokens_seen": 22045888, + "step": 1369 + }, + { + "epoch": 0.09596609664906705, + "grad_norm": 3.7679660320281982, + "learning_rate": 9.041239929947462e-05, + "loss": 1.1753, + "num_input_tokens_seen": 22062272, + "step": 1370 + }, + { + "epoch": 0.0960361448947963, + "grad_norm": 5.086554527282715, + "learning_rate": 9.040540105078809e-05, + "loss": 0.9579, + "num_input_tokens_seen": 22078080, + "step": 1371 + }, + { + "epoch": 0.09610619314052554, + "grad_norm": 4.255527496337891, + "learning_rate": 9.039840280210158e-05, + "loss": 1.0953, + "num_input_tokens_seen": 22093808, + "step": 1372 + }, + { + "epoch": 0.09617624138625479, + "grad_norm": 6.081700325012207, + "learning_rate": 9.039140455341507e-05, + "loss": 1.0363, + "num_input_tokens_seen": 22110192, + "step": 1373 + }, + { + "epoch": 0.09624628963198403, + "grad_norm": 4.376565456390381, + "learning_rate": 9.038440630472854e-05, + "loss": 1.1737, + "num_input_tokens_seen": 22126576, + "step": 1374 + }, + { + "epoch": 0.09631633787771328, + "grad_norm": 4.051114559173584, + "learning_rate": 9.037740805604205e-05, + "loss": 1.1921, + "num_input_tokens_seen": 22142768, + "step": 1375 + }, + { + "epoch": 0.09638638612344252, + "grad_norm": 4.46164083480835, + "learning_rate": 9.037040980735552e-05, + "loss": 1.1541, + "num_input_tokens_seen": 22158600, + "step": 1376 + }, + { + "epoch": 0.09645643436917177, + "grad_norm": 4.242503643035889, + "learning_rate": 9.036341155866901e-05, + "loss": 1.1314, + "num_input_tokens_seen": 22174984, + "step": 1377 + }, + { + "epoch": 0.09652648261490102, + "grad_norm": 3.6338908672332764, + "learning_rate": 9.035641330998248e-05, + "loss": 0.9257, + "num_input_tokens_seen": 22190880, + "step": 1378 + }, + { + "epoch": 0.09659653086063026, + "grad_norm": 4.73402738571167, + "learning_rate": 9.034941506129597e-05, + "loss": 1.1981, + "num_input_tokens_seen": 22206632, + "step": 1379 + }, + { + "epoch": 0.09666657910635951, + "grad_norm": 4.450289726257324, + "learning_rate": 9.034241681260946e-05, + "loss": 1.0851, + "num_input_tokens_seen": 22222896, + "step": 1380 + }, + { + "epoch": 0.09673662735208875, + "grad_norm": 5.578179359436035, + "learning_rate": 9.033541856392295e-05, + "loss": 1.2856, + "num_input_tokens_seen": 22238280, + "step": 1381 + }, + { + "epoch": 0.096806675597818, + "grad_norm": 3.8745546340942383, + "learning_rate": 9.032842031523644e-05, + "loss": 0.9841, + "num_input_tokens_seen": 22254664, + "step": 1382 + }, + { + "epoch": 0.09687672384354724, + "grad_norm": 5.7268548011779785, + "learning_rate": 9.032142206654991e-05, + "loss": 1.2024, + "num_input_tokens_seen": 22270000, + "step": 1383 + }, + { + "epoch": 0.09694677208927649, + "grad_norm": 4.380898952484131, + "learning_rate": 9.03144238178634e-05, + "loss": 1.0589, + "num_input_tokens_seen": 22286384, + "step": 1384 + }, + { + "epoch": 0.09701682033500574, + "grad_norm": 5.762500762939453, + "learning_rate": 9.030742556917689e-05, + "loss": 1.2061, + "num_input_tokens_seen": 22302272, + "step": 1385 + }, + { + "epoch": 0.09708686858073498, + "grad_norm": 3.739488363265991, + "learning_rate": 9.030042732049037e-05, + "loss": 0.9867, + "num_input_tokens_seen": 22318656, + "step": 1386 + }, + { + "epoch": 0.09715691682646423, + "grad_norm": 4.584897994995117, + "learning_rate": 9.029342907180385e-05, + "loss": 1.1934, + "num_input_tokens_seen": 22334704, + "step": 1387 + }, + { + "epoch": 0.09722696507219347, + "grad_norm": 4.161139488220215, + "learning_rate": 9.028643082311734e-05, + "loss": 1.1638, + "num_input_tokens_seen": 22349800, + "step": 1388 + }, + { + "epoch": 0.09729701331792272, + "grad_norm": 4.115293979644775, + "learning_rate": 9.027943257443083e-05, + "loss": 1.0181, + "num_input_tokens_seen": 22366184, + "step": 1389 + }, + { + "epoch": 0.09736706156365196, + "grad_norm": 3.7355988025665283, + "learning_rate": 9.027243432574432e-05, + "loss": 1.1182, + "num_input_tokens_seen": 22382568, + "step": 1390 + }, + { + "epoch": 0.09743710980938121, + "grad_norm": 4.15507173538208, + "learning_rate": 9.02654360770578e-05, + "loss": 1.0272, + "num_input_tokens_seen": 22398480, + "step": 1391 + }, + { + "epoch": 0.09750715805511045, + "grad_norm": 3.770918607711792, + "learning_rate": 9.025843782837128e-05, + "loss": 0.9834, + "num_input_tokens_seen": 22414864, + "step": 1392 + }, + { + "epoch": 0.0975772063008397, + "grad_norm": 4.214321136474609, + "learning_rate": 9.025143957968476e-05, + "loss": 1.1738, + "num_input_tokens_seen": 22429752, + "step": 1393 + }, + { + "epoch": 0.09764725454656895, + "grad_norm": 3.9854986667633057, + "learning_rate": 9.024444133099825e-05, + "loss": 1.2832, + "num_input_tokens_seen": 22446136, + "step": 1394 + }, + { + "epoch": 0.09771730279229819, + "grad_norm": 4.996057510375977, + "learning_rate": 9.023744308231174e-05, + "loss": 1.1691, + "num_input_tokens_seen": 22461160, + "step": 1395 + }, + { + "epoch": 0.09778735103802744, + "grad_norm": 3.682765007019043, + "learning_rate": 9.023044483362523e-05, + "loss": 0.9548, + "num_input_tokens_seen": 22477336, + "step": 1396 + }, + { + "epoch": 0.09785739928375668, + "grad_norm": 4.367272853851318, + "learning_rate": 9.022344658493871e-05, + "loss": 1.0512, + "num_input_tokens_seen": 22492952, + "step": 1397 + }, + { + "epoch": 0.09792744752948593, + "grad_norm": 3.9716336727142334, + "learning_rate": 9.021644833625219e-05, + "loss": 1.103, + "num_input_tokens_seen": 22509336, + "step": 1398 + }, + { + "epoch": 0.09799749577521517, + "grad_norm": 4.043631553649902, + "learning_rate": 9.020945008756568e-05, + "loss": 1.1439, + "num_input_tokens_seen": 22525568, + "step": 1399 + }, + { + "epoch": 0.09806754402094442, + "grad_norm": 4.343166351318359, + "learning_rate": 9.020245183887917e-05, + "loss": 1.1948, + "num_input_tokens_seen": 22541328, + "step": 1400 + }, + { + "epoch": 0.09806754402094442, + "eval_loss": 1.1561514139175415, + "eval_runtime": 0.1977, + "eval_samples_per_second": 5.058, + "eval_steps_per_second": 5.058, + "num_input_tokens_seen": 22541328, + "step": 1400 + }, + { + "epoch": 0.09813759226667366, + "grad_norm": 4.709417819976807, + "learning_rate": 9.019545359019265e-05, + "loss": 1.1398, + "num_input_tokens_seen": 22557304, + "step": 1401 + }, + { + "epoch": 0.09820764051240291, + "grad_norm": 7.022638320922852, + "learning_rate": 9.018845534150614e-05, + "loss": 1.0342, + "num_input_tokens_seen": 22573688, + "step": 1402 + }, + { + "epoch": 0.09827768875813217, + "grad_norm": 3.7976694107055664, + "learning_rate": 9.018145709281962e-05, + "loss": 0.9829, + "num_input_tokens_seen": 22589848, + "step": 1403 + }, + { + "epoch": 0.09834773700386142, + "grad_norm": 3.70877742767334, + "learning_rate": 9.01744588441331e-05, + "loss": 0.9707, + "num_input_tokens_seen": 22606232, + "step": 1404 + }, + { + "epoch": 0.09841778524959066, + "grad_norm": 7.724960803985596, + "learning_rate": 9.016746059544658e-05, + "loss": 0.9602, + "num_input_tokens_seen": 22621912, + "step": 1405 + }, + { + "epoch": 0.09848783349531991, + "grad_norm": 3.9619522094726562, + "learning_rate": 9.016046234676007e-05, + "loss": 0.998, + "num_input_tokens_seen": 22638296, + "step": 1406 + }, + { + "epoch": 0.09855788174104915, + "grad_norm": 3.8303041458129883, + "learning_rate": 9.015346409807356e-05, + "loss": 1.0762, + "num_input_tokens_seen": 22654496, + "step": 1407 + }, + { + "epoch": 0.0986279299867784, + "grad_norm": 4.029507637023926, + "learning_rate": 9.014646584938705e-05, + "loss": 1.2072, + "num_input_tokens_seen": 22670544, + "step": 1408 + }, + { + "epoch": 0.09869797823250764, + "grad_norm": 3.8487346172332764, + "learning_rate": 9.013946760070054e-05, + "loss": 1.1834, + "num_input_tokens_seen": 22686592, + "step": 1409 + }, + { + "epoch": 0.09876802647823689, + "grad_norm": 3.700751543045044, + "learning_rate": 9.013246935201401e-05, + "loss": 0.8698, + "num_input_tokens_seen": 22702976, + "step": 1410 + }, + { + "epoch": 0.09883807472396614, + "grad_norm": 3.686884641647339, + "learning_rate": 9.01254711033275e-05, + "loss": 0.9591, + "num_input_tokens_seen": 22719360, + "step": 1411 + }, + { + "epoch": 0.09890812296969538, + "grad_norm": 4.176409721374512, + "learning_rate": 9.011847285464099e-05, + "loss": 1.1578, + "num_input_tokens_seen": 22735744, + "step": 1412 + }, + { + "epoch": 0.09897817121542463, + "grad_norm": 4.331852912902832, + "learning_rate": 9.011147460595446e-05, + "loss": 0.9769, + "num_input_tokens_seen": 22752128, + "step": 1413 + }, + { + "epoch": 0.09904821946115387, + "grad_norm": 3.8534255027770996, + "learning_rate": 9.010447635726795e-05, + "loss": 1.1536, + "num_input_tokens_seen": 22768512, + "step": 1414 + }, + { + "epoch": 0.09911826770688312, + "grad_norm": 4.066548824310303, + "learning_rate": 9.009747810858144e-05, + "loss": 1.1199, + "num_input_tokens_seen": 22784760, + "step": 1415 + }, + { + "epoch": 0.09918831595261236, + "grad_norm": 4.076517581939697, + "learning_rate": 9.009047985989493e-05, + "loss": 1.1132, + "num_input_tokens_seen": 22801144, + "step": 1416 + }, + { + "epoch": 0.09925836419834161, + "grad_norm": 3.8858346939086914, + "learning_rate": 9.008348161120842e-05, + "loss": 0.9509, + "num_input_tokens_seen": 22817320, + "step": 1417 + }, + { + "epoch": 0.09932841244407085, + "grad_norm": 6.4605584144592285, + "learning_rate": 9.007648336252189e-05, + "loss": 1.2701, + "num_input_tokens_seen": 22833704, + "step": 1418 + }, + { + "epoch": 0.0993984606898001, + "grad_norm": 4.157481670379639, + "learning_rate": 9.006948511383538e-05, + "loss": 1.0169, + "num_input_tokens_seen": 22850088, + "step": 1419 + }, + { + "epoch": 0.09946850893552935, + "grad_norm": 3.725755214691162, + "learning_rate": 9.006248686514886e-05, + "loss": 1.0183, + "num_input_tokens_seen": 22866472, + "step": 1420 + }, + { + "epoch": 0.09953855718125859, + "grad_norm": 4.012838363647461, + "learning_rate": 9.005548861646236e-05, + "loss": 0.8425, + "num_input_tokens_seen": 22882856, + "step": 1421 + }, + { + "epoch": 0.09960860542698784, + "grad_norm": 3.8754239082336426, + "learning_rate": 9.004849036777583e-05, + "loss": 1.1375, + "num_input_tokens_seen": 22899240, + "step": 1422 + }, + { + "epoch": 0.09967865367271708, + "grad_norm": 3.90873384475708, + "learning_rate": 9.004149211908932e-05, + "loss": 1.0574, + "num_input_tokens_seen": 22915160, + "step": 1423 + }, + { + "epoch": 0.09974870191844633, + "grad_norm": 5.698948860168457, + "learning_rate": 9.003449387040281e-05, + "loss": 1.1338, + "num_input_tokens_seen": 22930592, + "step": 1424 + }, + { + "epoch": 0.09981875016417557, + "grad_norm": 4.103662014007568, + "learning_rate": 9.002749562171629e-05, + "loss": 1.2384, + "num_input_tokens_seen": 22946976, + "step": 1425 + }, + { + "epoch": 0.09988879840990482, + "grad_norm": 4.404048442840576, + "learning_rate": 9.002049737302977e-05, + "loss": 1.3855, + "num_input_tokens_seen": 22963360, + "step": 1426 + }, + { + "epoch": 0.09995884665563406, + "grad_norm": 4.043710708618164, + "learning_rate": 9.001349912434326e-05, + "loss": 1.2713, + "num_input_tokens_seen": 22979544, + "step": 1427 + }, + { + "epoch": 0.10002889490136331, + "grad_norm": 4.169802188873291, + "learning_rate": 9.000650087565675e-05, + "loss": 1.0777, + "num_input_tokens_seen": 22995072, + "step": 1428 + }, + { + "epoch": 0.10009894314709256, + "grad_norm": 4.010350227355957, + "learning_rate": 8.999950262697024e-05, + "loss": 1.1245, + "num_input_tokens_seen": 23010904, + "step": 1429 + }, + { + "epoch": 0.1001689913928218, + "grad_norm": 4.496591567993164, + "learning_rate": 8.999250437828372e-05, + "loss": 1.3372, + "num_input_tokens_seen": 23027288, + "step": 1430 + }, + { + "epoch": 0.10023903963855105, + "grad_norm": 4.2428765296936035, + "learning_rate": 8.99855061295972e-05, + "loss": 1.0258, + "num_input_tokens_seen": 23043352, + "step": 1431 + }, + { + "epoch": 0.10030908788428029, + "grad_norm": 4.083342552185059, + "learning_rate": 8.997850788091068e-05, + "loss": 1.227, + "num_input_tokens_seen": 23059736, + "step": 1432 + }, + { + "epoch": 0.10037913613000954, + "grad_norm": 3.860734462738037, + "learning_rate": 8.997150963222417e-05, + "loss": 1.0791, + "num_input_tokens_seen": 23075400, + "step": 1433 + }, + { + "epoch": 0.10044918437573878, + "grad_norm": 3.985151767730713, + "learning_rate": 8.996451138353766e-05, + "loss": 1.0486, + "num_input_tokens_seen": 23091704, + "step": 1434 + }, + { + "epoch": 0.10051923262146803, + "grad_norm": 4.039731502532959, + "learning_rate": 8.995751313485114e-05, + "loss": 0.9793, + "num_input_tokens_seen": 23108088, + "step": 1435 + }, + { + "epoch": 0.10058928086719728, + "grad_norm": 6.1780619621276855, + "learning_rate": 8.995051488616463e-05, + "loss": 1.0645, + "num_input_tokens_seen": 23123128, + "step": 1436 + }, + { + "epoch": 0.10065932911292653, + "grad_norm": 4.5783586502075195, + "learning_rate": 8.994351663747811e-05, + "loss": 1.1634, + "num_input_tokens_seen": 23139168, + "step": 1437 + }, + { + "epoch": 0.10072937735865578, + "grad_norm": 3.889927864074707, + "learning_rate": 8.99365183887916e-05, + "loss": 0.97, + "num_input_tokens_seen": 23154952, + "step": 1438 + }, + { + "epoch": 0.10079942560438503, + "grad_norm": 3.927945852279663, + "learning_rate": 8.992952014010509e-05, + "loss": 1.2428, + "num_input_tokens_seen": 23170688, + "step": 1439 + }, + { + "epoch": 0.10086947385011427, + "grad_norm": 3.8991434574127197, + "learning_rate": 8.992252189141856e-05, + "loss": 0.9519, + "num_input_tokens_seen": 23186432, + "step": 1440 + }, + { + "epoch": 0.10093952209584352, + "grad_norm": 3.6479310989379883, + "learning_rate": 8.991552364273206e-05, + "loss": 0.9656, + "num_input_tokens_seen": 23202816, + "step": 1441 + }, + { + "epoch": 0.10100957034157276, + "grad_norm": 4.637960910797119, + "learning_rate": 8.990852539404554e-05, + "loss": 1.2853, + "num_input_tokens_seen": 23218304, + "step": 1442 + }, + { + "epoch": 0.10107961858730201, + "grad_norm": 4.000091552734375, + "learning_rate": 8.990152714535903e-05, + "loss": 1.0421, + "num_input_tokens_seen": 23234688, + "step": 1443 + }, + { + "epoch": 0.10114966683303125, + "grad_norm": 4.959738731384277, + "learning_rate": 8.989452889667251e-05, + "loss": 1.0904, + "num_input_tokens_seen": 23250656, + "step": 1444 + }, + { + "epoch": 0.1012197150787605, + "grad_norm": 3.9251675605773926, + "learning_rate": 8.988753064798599e-05, + "loss": 0.9219, + "num_input_tokens_seen": 23266984, + "step": 1445 + }, + { + "epoch": 0.10128976332448975, + "grad_norm": 4.28665828704834, + "learning_rate": 8.988053239929948e-05, + "loss": 1.1465, + "num_input_tokens_seen": 23283368, + "step": 1446 + }, + { + "epoch": 0.10135981157021899, + "grad_norm": 4.421731472015381, + "learning_rate": 8.987353415061297e-05, + "loss": 1.1098, + "num_input_tokens_seen": 23298728, + "step": 1447 + }, + { + "epoch": 0.10142985981594824, + "grad_norm": 5.080065727233887, + "learning_rate": 8.986653590192646e-05, + "loss": 1.1172, + "num_input_tokens_seen": 23315112, + "step": 1448 + }, + { + "epoch": 0.10149990806167748, + "grad_norm": 5.618803977966309, + "learning_rate": 8.985953765323993e-05, + "loss": 0.9551, + "num_input_tokens_seen": 23329864, + "step": 1449 + }, + { + "epoch": 0.10156995630740673, + "grad_norm": 3.756836414337158, + "learning_rate": 8.985253940455342e-05, + "loss": 1.0981, + "num_input_tokens_seen": 23345672, + "step": 1450 + }, + { + "epoch": 0.10164000455313597, + "grad_norm": 4.461424827575684, + "learning_rate": 8.984554115586691e-05, + "loss": 1.1914, + "num_input_tokens_seen": 23362056, + "step": 1451 + }, + { + "epoch": 0.10171005279886522, + "grad_norm": 5.267919063568115, + "learning_rate": 8.983854290718038e-05, + "loss": 1.1928, + "num_input_tokens_seen": 23378440, + "step": 1452 + }, + { + "epoch": 0.10178010104459446, + "grad_norm": 5.513551235198975, + "learning_rate": 8.983154465849387e-05, + "loss": 1.2405, + "num_input_tokens_seen": 23394824, + "step": 1453 + }, + { + "epoch": 0.10185014929032371, + "grad_norm": 4.46366548538208, + "learning_rate": 8.982454640980736e-05, + "loss": 1.1436, + "num_input_tokens_seen": 23410568, + "step": 1454 + }, + { + "epoch": 0.10192019753605296, + "grad_norm": 5.066692352294922, + "learning_rate": 8.981754816112085e-05, + "loss": 1.1389, + "num_input_tokens_seen": 23426952, + "step": 1455 + }, + { + "epoch": 0.1019902457817822, + "grad_norm": 3.980743169784546, + "learning_rate": 8.981054991243434e-05, + "loss": 1.0623, + "num_input_tokens_seen": 23443256, + "step": 1456 + }, + { + "epoch": 0.10206029402751145, + "grad_norm": 4.088611125946045, + "learning_rate": 8.980355166374781e-05, + "loss": 1.0388, + "num_input_tokens_seen": 23459640, + "step": 1457 + }, + { + "epoch": 0.10213034227324069, + "grad_norm": 3.9585626125335693, + "learning_rate": 8.97965534150613e-05, + "loss": 1.2051, + "num_input_tokens_seen": 23475176, + "step": 1458 + }, + { + "epoch": 0.10220039051896994, + "grad_norm": 3.7923290729522705, + "learning_rate": 8.978955516637478e-05, + "loss": 1.0001, + "num_input_tokens_seen": 23490704, + "step": 1459 + }, + { + "epoch": 0.10227043876469918, + "grad_norm": 3.9089629650115967, + "learning_rate": 8.978255691768826e-05, + "loss": 0.9786, + "num_input_tokens_seen": 23506168, + "step": 1460 + }, + { + "epoch": 0.10234048701042843, + "grad_norm": 6.2259039878845215, + "learning_rate": 8.977555866900175e-05, + "loss": 1.2854, + "num_input_tokens_seen": 23522552, + "step": 1461 + }, + { + "epoch": 0.10241053525615768, + "grad_norm": 4.071867942810059, + "learning_rate": 8.976856042031524e-05, + "loss": 1.0724, + "num_input_tokens_seen": 23538936, + "step": 1462 + }, + { + "epoch": 0.10248058350188692, + "grad_norm": 4.587897777557373, + "learning_rate": 8.976156217162873e-05, + "loss": 1.1307, + "num_input_tokens_seen": 23554536, + "step": 1463 + }, + { + "epoch": 0.10255063174761617, + "grad_norm": 3.944937229156494, + "learning_rate": 8.97545639229422e-05, + "loss": 1.1503, + "num_input_tokens_seen": 23570888, + "step": 1464 + }, + { + "epoch": 0.10262067999334541, + "grad_norm": 3.7418766021728516, + "learning_rate": 8.97475656742557e-05, + "loss": 1.0414, + "num_input_tokens_seen": 23587272, + "step": 1465 + }, + { + "epoch": 0.10269072823907466, + "grad_norm": 3.9055676460266113, + "learning_rate": 8.974056742556918e-05, + "loss": 1.2284, + "num_input_tokens_seen": 23603640, + "step": 1466 + }, + { + "epoch": 0.1027607764848039, + "grad_norm": 3.9338066577911377, + "learning_rate": 8.973356917688267e-05, + "loss": 1.2389, + "num_input_tokens_seen": 23620024, + "step": 1467 + }, + { + "epoch": 0.10283082473053315, + "grad_norm": 4.024602890014648, + "learning_rate": 8.972657092819616e-05, + "loss": 1.1358, + "num_input_tokens_seen": 23636408, + "step": 1468 + }, + { + "epoch": 0.1029008729762624, + "grad_norm": 4.09812068939209, + "learning_rate": 8.971957267950963e-05, + "loss": 1.0734, + "num_input_tokens_seen": 23652480, + "step": 1469 + }, + { + "epoch": 0.10297092122199164, + "grad_norm": 4.7382025718688965, + "learning_rate": 8.971257443082312e-05, + "loss": 1.4112, + "num_input_tokens_seen": 23668424, + "step": 1470 + }, + { + "epoch": 0.10304096946772089, + "grad_norm": 4.518669605255127, + "learning_rate": 8.970557618213661e-05, + "loss": 1.3466, + "num_input_tokens_seen": 23684808, + "step": 1471 + }, + { + "epoch": 0.10311101771345015, + "grad_norm": 4.023036003112793, + "learning_rate": 8.969857793345009e-05, + "loss": 1.0246, + "num_input_tokens_seen": 23701192, + "step": 1472 + }, + { + "epoch": 0.10318106595917939, + "grad_norm": 4.6244215965271, + "learning_rate": 8.969157968476358e-05, + "loss": 1.2391, + "num_input_tokens_seen": 23717576, + "step": 1473 + }, + { + "epoch": 0.10325111420490864, + "grad_norm": 4.517683506011963, + "learning_rate": 8.968458143607706e-05, + "loss": 1.3872, + "num_input_tokens_seen": 23733960, + "step": 1474 + }, + { + "epoch": 0.10332116245063788, + "grad_norm": 4.048764705657959, + "learning_rate": 8.967758318739055e-05, + "loss": 1.0453, + "num_input_tokens_seen": 23750344, + "step": 1475 + }, + { + "epoch": 0.10339121069636713, + "grad_norm": 4.248376369476318, + "learning_rate": 8.967058493870403e-05, + "loss": 1.176, + "num_input_tokens_seen": 23766160, + "step": 1476 + }, + { + "epoch": 0.10346125894209637, + "grad_norm": 3.780548095703125, + "learning_rate": 8.966358669001752e-05, + "loss": 0.9048, + "num_input_tokens_seen": 23782544, + "step": 1477 + }, + { + "epoch": 0.10353130718782562, + "grad_norm": 4.26375675201416, + "learning_rate": 8.9656588441331e-05, + "loss": 0.8651, + "num_input_tokens_seen": 23798928, + "step": 1478 + }, + { + "epoch": 0.10360135543355486, + "grad_norm": 3.9202687740325928, + "learning_rate": 8.964959019264448e-05, + "loss": 1.1058, + "num_input_tokens_seen": 23815312, + "step": 1479 + }, + { + "epoch": 0.10367140367928411, + "grad_norm": 3.983797788619995, + "learning_rate": 8.964259194395797e-05, + "loss": 1.0778, + "num_input_tokens_seen": 23831696, + "step": 1480 + }, + { + "epoch": 0.10374145192501336, + "grad_norm": 4.471195220947266, + "learning_rate": 8.963559369527146e-05, + "loss": 1.1858, + "num_input_tokens_seen": 23847768, + "step": 1481 + }, + { + "epoch": 0.1038115001707426, + "grad_norm": 3.560317039489746, + "learning_rate": 8.962859544658495e-05, + "loss": 1.0205, + "num_input_tokens_seen": 23864152, + "step": 1482 + }, + { + "epoch": 0.10388154841647185, + "grad_norm": 3.8699846267700195, + "learning_rate": 8.962159719789843e-05, + "loss": 1.1438, + "num_input_tokens_seen": 23880536, + "step": 1483 + }, + { + "epoch": 0.10395159666220109, + "grad_norm": 4.547862529754639, + "learning_rate": 8.961459894921191e-05, + "loss": 1.0303, + "num_input_tokens_seen": 23896704, + "step": 1484 + }, + { + "epoch": 0.10402164490793034, + "grad_norm": 4.669456481933594, + "learning_rate": 8.96076007005254e-05, + "loss": 1.1994, + "num_input_tokens_seen": 23913088, + "step": 1485 + }, + { + "epoch": 0.10409169315365958, + "grad_norm": 4.346285343170166, + "learning_rate": 8.960060245183887e-05, + "loss": 1.2677, + "num_input_tokens_seen": 23929472, + "step": 1486 + }, + { + "epoch": 0.10416174139938883, + "grad_norm": 6.5028581619262695, + "learning_rate": 8.959360420315236e-05, + "loss": 0.989, + "num_input_tokens_seen": 23945216, + "step": 1487 + }, + { + "epoch": 0.10423178964511808, + "grad_norm": 3.935488224029541, + "learning_rate": 8.958660595446586e-05, + "loss": 1.2657, + "num_input_tokens_seen": 23961600, + "step": 1488 + }, + { + "epoch": 0.10430183789084732, + "grad_norm": 3.772397518157959, + "learning_rate": 8.957960770577934e-05, + "loss": 1.1038, + "num_input_tokens_seen": 23977984, + "step": 1489 + }, + { + "epoch": 0.10437188613657657, + "grad_norm": 4.508286476135254, + "learning_rate": 8.957260945709283e-05, + "loss": 1.2694, + "num_input_tokens_seen": 23993752, + "step": 1490 + }, + { + "epoch": 0.10444193438230581, + "grad_norm": 4.667380332946777, + "learning_rate": 8.95656112084063e-05, + "loss": 1.2837, + "num_input_tokens_seen": 24009832, + "step": 1491 + }, + { + "epoch": 0.10451198262803506, + "grad_norm": 7.675503730773926, + "learning_rate": 8.955861295971979e-05, + "loss": 1.121, + "num_input_tokens_seen": 24025784, + "step": 1492 + }, + { + "epoch": 0.1045820308737643, + "grad_norm": 4.427548408508301, + "learning_rate": 8.955161471103328e-05, + "loss": 0.835, + "num_input_tokens_seen": 24041568, + "step": 1493 + }, + { + "epoch": 0.10465207911949355, + "grad_norm": 3.9065396785736084, + "learning_rate": 8.954461646234677e-05, + "loss": 1.1322, + "num_input_tokens_seen": 24057952, + "step": 1494 + }, + { + "epoch": 0.1047221273652228, + "grad_norm": 4.052605628967285, + "learning_rate": 8.953761821366026e-05, + "loss": 1.1133, + "num_input_tokens_seen": 24074336, + "step": 1495 + }, + { + "epoch": 0.10479217561095204, + "grad_norm": 3.758476734161377, + "learning_rate": 8.953061996497373e-05, + "loss": 1.1302, + "num_input_tokens_seen": 24090720, + "step": 1496 + }, + { + "epoch": 0.10486222385668129, + "grad_norm": 4.4470014572143555, + "learning_rate": 8.952362171628722e-05, + "loss": 1.0969, + "num_input_tokens_seen": 24107024, + "step": 1497 + }, + { + "epoch": 0.10493227210241053, + "grad_norm": 4.222001075744629, + "learning_rate": 8.951662346760071e-05, + "loss": 1.147, + "num_input_tokens_seen": 24123408, + "step": 1498 + }, + { + "epoch": 0.10500232034813978, + "grad_norm": 4.72997522354126, + "learning_rate": 8.950962521891418e-05, + "loss": 1.1086, + "num_input_tokens_seen": 24137672, + "step": 1499 + }, + { + "epoch": 0.10507236859386902, + "grad_norm": 4.342312812805176, + "learning_rate": 8.950262697022767e-05, + "loss": 1.2044, + "num_input_tokens_seen": 24153248, + "step": 1500 + }, + { + "epoch": 0.10514241683959827, + "grad_norm": 4.723706245422363, + "learning_rate": 8.949562872154116e-05, + "loss": 1.1075, + "num_input_tokens_seen": 24169240, + "step": 1501 + }, + { + "epoch": 0.10521246508532751, + "grad_norm": 4.244345188140869, + "learning_rate": 8.948863047285465e-05, + "loss": 1.1839, + "num_input_tokens_seen": 24184608, + "step": 1502 + }, + { + "epoch": 0.10528251333105676, + "grad_norm": 3.6271615028381348, + "learning_rate": 8.948163222416812e-05, + "loss": 1.0755, + "num_input_tokens_seen": 24200992, + "step": 1503 + }, + { + "epoch": 0.105352561576786, + "grad_norm": 3.858696937561035, + "learning_rate": 8.947463397548161e-05, + "loss": 1.0598, + "num_input_tokens_seen": 24217376, + "step": 1504 + }, + { + "epoch": 0.10542260982251525, + "grad_norm": 7.14077091217041, + "learning_rate": 8.94676357267951e-05, + "loss": 1.0362, + "num_input_tokens_seen": 24232368, + "step": 1505 + }, + { + "epoch": 0.10549265806824451, + "grad_norm": 4.203495979309082, + "learning_rate": 8.946063747810858e-05, + "loss": 1.2491, + "num_input_tokens_seen": 24248520, + "step": 1506 + }, + { + "epoch": 0.10556270631397376, + "grad_norm": 4.344188213348389, + "learning_rate": 8.945363922942207e-05, + "loss": 0.905, + "num_input_tokens_seen": 24264824, + "step": 1507 + }, + { + "epoch": 0.105632754559703, + "grad_norm": 6.156280517578125, + "learning_rate": 8.944664098073557e-05, + "loss": 1.3046, + "num_input_tokens_seen": 24281208, + "step": 1508 + }, + { + "epoch": 0.10570280280543225, + "grad_norm": 4.687212944030762, + "learning_rate": 8.943964273204904e-05, + "loss": 1.1898, + "num_input_tokens_seen": 24297592, + "step": 1509 + }, + { + "epoch": 0.10577285105116149, + "grad_norm": 3.9128546714782715, + "learning_rate": 8.943264448336253e-05, + "loss": 1.0506, + "num_input_tokens_seen": 24313976, + "step": 1510 + }, + { + "epoch": 0.10584289929689074, + "grad_norm": 5.766979694366455, + "learning_rate": 8.9425646234676e-05, + "loss": 1.119, + "num_input_tokens_seen": 24330296, + "step": 1511 + }, + { + "epoch": 0.10591294754261998, + "grad_norm": 3.9610238075256348, + "learning_rate": 8.94186479859895e-05, + "loss": 1.279, + "num_input_tokens_seen": 24346680, + "step": 1512 + }, + { + "epoch": 0.10598299578834923, + "grad_norm": 4.262688636779785, + "learning_rate": 8.941164973730297e-05, + "loss": 1.3217, + "num_input_tokens_seen": 24362408, + "step": 1513 + }, + { + "epoch": 0.10605304403407848, + "grad_norm": 5.190121173858643, + "learning_rate": 8.940465148861647e-05, + "loss": 1.0615, + "num_input_tokens_seen": 24378248, + "step": 1514 + }, + { + "epoch": 0.10612309227980772, + "grad_norm": 4.5859479904174805, + "learning_rate": 8.939765323992996e-05, + "loss": 1.1377, + "num_input_tokens_seen": 24394632, + "step": 1515 + }, + { + "epoch": 0.10619314052553697, + "grad_norm": 4.021294593811035, + "learning_rate": 8.939065499124344e-05, + "loss": 0.9913, + "num_input_tokens_seen": 24411016, + "step": 1516 + }, + { + "epoch": 0.10626318877126621, + "grad_norm": 4.296265602111816, + "learning_rate": 8.938365674255692e-05, + "loss": 1.1753, + "num_input_tokens_seen": 24426792, + "step": 1517 + }, + { + "epoch": 0.10633323701699546, + "grad_norm": 3.4397289752960205, + "learning_rate": 8.93766584938704e-05, + "loss": 0.8159, + "num_input_tokens_seen": 24443176, + "step": 1518 + }, + { + "epoch": 0.1064032852627247, + "grad_norm": 4.009952545166016, + "learning_rate": 8.936966024518389e-05, + "loss": 1.0728, + "num_input_tokens_seen": 24459384, + "step": 1519 + }, + { + "epoch": 0.10647333350845395, + "grad_norm": 4.786280632019043, + "learning_rate": 8.936266199649738e-05, + "loss": 1.1303, + "num_input_tokens_seen": 24474904, + "step": 1520 + }, + { + "epoch": 0.1065433817541832, + "grad_norm": 3.869297981262207, + "learning_rate": 8.935566374781087e-05, + "loss": 1.0829, + "num_input_tokens_seen": 24490456, + "step": 1521 + }, + { + "epoch": 0.10661342999991244, + "grad_norm": 3.995553731918335, + "learning_rate": 8.934866549912435e-05, + "loss": 1.0813, + "num_input_tokens_seen": 24506840, + "step": 1522 + }, + { + "epoch": 0.10668347824564169, + "grad_norm": 4.195018291473389, + "learning_rate": 8.934166725043783e-05, + "loss": 1.0585, + "num_input_tokens_seen": 24522440, + "step": 1523 + }, + { + "epoch": 0.10675352649137093, + "grad_norm": 4.0432515144348145, + "learning_rate": 8.933466900175132e-05, + "loss": 1.0757, + "num_input_tokens_seen": 24538824, + "step": 1524 + }, + { + "epoch": 0.10682357473710018, + "grad_norm": 5.120638847351074, + "learning_rate": 8.93276707530648e-05, + "loss": 1.1328, + "num_input_tokens_seen": 24555208, + "step": 1525 + }, + { + "epoch": 0.10689362298282942, + "grad_norm": 3.925096035003662, + "learning_rate": 8.932067250437828e-05, + "loss": 1.1569, + "num_input_tokens_seen": 24571544, + "step": 1526 + }, + { + "epoch": 0.10696367122855867, + "grad_norm": 3.930328130722046, + "learning_rate": 8.931367425569177e-05, + "loss": 0.9385, + "num_input_tokens_seen": 24587736, + "step": 1527 + }, + { + "epoch": 0.10703371947428791, + "grad_norm": 3.7056055068969727, + "learning_rate": 8.930667600700526e-05, + "loss": 0.8675, + "num_input_tokens_seen": 24604120, + "step": 1528 + }, + { + "epoch": 0.10710376772001716, + "grad_norm": 5.945568561553955, + "learning_rate": 8.929967775831875e-05, + "loss": 1.0395, + "num_input_tokens_seen": 24620504, + "step": 1529 + }, + { + "epoch": 0.1071738159657464, + "grad_norm": 3.7765939235687256, + "learning_rate": 8.929267950963222e-05, + "loss": 0.8796, + "num_input_tokens_seen": 24635440, + "step": 1530 + }, + { + "epoch": 0.10724386421147565, + "grad_norm": 4.229284286499023, + "learning_rate": 8.928568126094571e-05, + "loss": 1.0941, + "num_input_tokens_seen": 24651824, + "step": 1531 + }, + { + "epoch": 0.1073139124572049, + "grad_norm": 4.198834419250488, + "learning_rate": 8.92786830122592e-05, + "loss": 1.118, + "num_input_tokens_seen": 24668208, + "step": 1532 + }, + { + "epoch": 0.10738396070293414, + "grad_norm": 8.091620445251465, + "learning_rate": 8.927168476357267e-05, + "loss": 1.1515, + "num_input_tokens_seen": 24684248, + "step": 1533 + }, + { + "epoch": 0.10745400894866339, + "grad_norm": 4.091879844665527, + "learning_rate": 8.926468651488618e-05, + "loss": 1.1283, + "num_input_tokens_seen": 24700632, + "step": 1534 + }, + { + "epoch": 0.10752405719439263, + "grad_norm": 3.90326189994812, + "learning_rate": 8.925768826619966e-05, + "loss": 1.047, + "num_input_tokens_seen": 24717016, + "step": 1535 + }, + { + "epoch": 0.10759410544012188, + "grad_norm": 4.097111225128174, + "learning_rate": 8.925069001751314e-05, + "loss": 1.1623, + "num_input_tokens_seen": 24732776, + "step": 1536 + }, + { + "epoch": 0.10766415368585112, + "grad_norm": 3.5537095069885254, + "learning_rate": 8.924369176882663e-05, + "loss": 0.989, + "num_input_tokens_seen": 24749064, + "step": 1537 + }, + { + "epoch": 0.10773420193158037, + "grad_norm": 4.3086256980896, + "learning_rate": 8.92366935201401e-05, + "loss": 1.0864, + "num_input_tokens_seen": 24765448, + "step": 1538 + }, + { + "epoch": 0.10780425017730962, + "grad_norm": 4.177425861358643, + "learning_rate": 8.922969527145359e-05, + "loss": 1.0652, + "num_input_tokens_seen": 24780816, + "step": 1539 + }, + { + "epoch": 0.10787429842303886, + "grad_norm": 3.6013338565826416, + "learning_rate": 8.922269702276708e-05, + "loss": 1.1045, + "num_input_tokens_seen": 24796600, + "step": 1540 + }, + { + "epoch": 0.10794434666876812, + "grad_norm": 4.05686092376709, + "learning_rate": 8.921569877408057e-05, + "loss": 1.1408, + "num_input_tokens_seen": 24812984, + "step": 1541 + }, + { + "epoch": 0.10801439491449737, + "grad_norm": 4.245424747467041, + "learning_rate": 8.920870052539406e-05, + "loss": 1.2634, + "num_input_tokens_seen": 24829368, + "step": 1542 + }, + { + "epoch": 0.10808444316022661, + "grad_norm": 3.9563350677490234, + "learning_rate": 8.920170227670753e-05, + "loss": 1.1015, + "num_input_tokens_seen": 24845752, + "step": 1543 + }, + { + "epoch": 0.10815449140595586, + "grad_norm": 4.209373474121094, + "learning_rate": 8.919470402802102e-05, + "loss": 1.2394, + "num_input_tokens_seen": 24862136, + "step": 1544 + }, + { + "epoch": 0.1082245396516851, + "grad_norm": 3.6590163707733154, + "learning_rate": 8.91877057793345e-05, + "loss": 1.0168, + "num_input_tokens_seen": 24878520, + "step": 1545 + }, + { + "epoch": 0.10829458789741435, + "grad_norm": 3.937568187713623, + "learning_rate": 8.918070753064799e-05, + "loss": 1.0999, + "num_input_tokens_seen": 24894696, + "step": 1546 + }, + { + "epoch": 0.1083646361431436, + "grad_norm": 3.948453426361084, + "learning_rate": 8.917370928196147e-05, + "loss": 1.0565, + "num_input_tokens_seen": 24910208, + "step": 1547 + }, + { + "epoch": 0.10843468438887284, + "grad_norm": 3.61549711227417, + "learning_rate": 8.916671103327496e-05, + "loss": 1.0294, + "num_input_tokens_seen": 24926592, + "step": 1548 + }, + { + "epoch": 0.10850473263460209, + "grad_norm": 4.091664791107178, + "learning_rate": 8.915971278458845e-05, + "loss": 1.0596, + "num_input_tokens_seen": 24942976, + "step": 1549 + }, + { + "epoch": 0.10857478088033133, + "grad_norm": 5.494830131530762, + "learning_rate": 8.915271453590193e-05, + "loss": 1.1564, + "num_input_tokens_seen": 24957984, + "step": 1550 + }, + { + "epoch": 0.10864482912606058, + "grad_norm": 4.546476364135742, + "learning_rate": 8.914571628721541e-05, + "loss": 1.0753, + "num_input_tokens_seen": 24974368, + "step": 1551 + }, + { + "epoch": 0.10871487737178982, + "grad_norm": 3.775996446609497, + "learning_rate": 8.91387180385289e-05, + "loss": 1.11, + "num_input_tokens_seen": 24990200, + "step": 1552 + }, + { + "epoch": 0.10878492561751907, + "grad_norm": 3.9989728927612305, + "learning_rate": 8.913171978984238e-05, + "loss": 1.0121, + "num_input_tokens_seen": 25006584, + "step": 1553 + }, + { + "epoch": 0.10885497386324831, + "grad_norm": 4.417224884033203, + "learning_rate": 8.912472154115588e-05, + "loss": 1.1891, + "num_input_tokens_seen": 25022464, + "step": 1554 + }, + { + "epoch": 0.10892502210897756, + "grad_norm": 4.604903697967529, + "learning_rate": 8.911772329246936e-05, + "loss": 0.9414, + "num_input_tokens_seen": 25038848, + "step": 1555 + }, + { + "epoch": 0.1089950703547068, + "grad_norm": 4.823176860809326, + "learning_rate": 8.911072504378284e-05, + "loss": 1.1259, + "num_input_tokens_seen": 25053776, + "step": 1556 + }, + { + "epoch": 0.10906511860043605, + "grad_norm": 3.6778531074523926, + "learning_rate": 8.910372679509632e-05, + "loss": 0.9995, + "num_input_tokens_seen": 25069872, + "step": 1557 + }, + { + "epoch": 0.1091351668461653, + "grad_norm": 4.344213485717773, + "learning_rate": 8.909672854640981e-05, + "loss": 1.1984, + "num_input_tokens_seen": 25086256, + "step": 1558 + }, + { + "epoch": 0.10920521509189454, + "grad_norm": 4.592464923858643, + "learning_rate": 8.90897302977233e-05, + "loss": 1.502, + "num_input_tokens_seen": 25102640, + "step": 1559 + }, + { + "epoch": 0.10927526333762379, + "grad_norm": 4.103248119354248, + "learning_rate": 8.908273204903678e-05, + "loss": 0.9454, + "num_input_tokens_seen": 25118328, + "step": 1560 + }, + { + "epoch": 0.10934531158335303, + "grad_norm": 4.637456893920898, + "learning_rate": 8.907573380035027e-05, + "loss": 1.3611, + "num_input_tokens_seen": 25134712, + "step": 1561 + }, + { + "epoch": 0.10941535982908228, + "grad_norm": 4.4709930419921875, + "learning_rate": 8.906873555166376e-05, + "loss": 1.1147, + "num_input_tokens_seen": 25149304, + "step": 1562 + }, + { + "epoch": 0.10948540807481152, + "grad_norm": 4.154660701751709, + "learning_rate": 8.906173730297724e-05, + "loss": 1.2855, + "num_input_tokens_seen": 25165360, + "step": 1563 + }, + { + "epoch": 0.10955545632054077, + "grad_norm": 4.1212334632873535, + "learning_rate": 8.905473905429073e-05, + "loss": 1.2015, + "num_input_tokens_seen": 25181744, + "step": 1564 + }, + { + "epoch": 0.10962550456627002, + "grad_norm": 3.8060882091522217, + "learning_rate": 8.90477408056042e-05, + "loss": 1.0333, + "num_input_tokens_seen": 25197800, + "step": 1565 + }, + { + "epoch": 0.10969555281199926, + "grad_norm": 3.4948956966400146, + "learning_rate": 8.904074255691769e-05, + "loss": 0.941, + "num_input_tokens_seen": 25214008, + "step": 1566 + }, + { + "epoch": 0.1097656010577285, + "grad_norm": 4.181606292724609, + "learning_rate": 8.903374430823118e-05, + "loss": 1.1185, + "num_input_tokens_seen": 25229496, + "step": 1567 + }, + { + "epoch": 0.10983564930345775, + "grad_norm": 4.206098556518555, + "learning_rate": 8.902674605954467e-05, + "loss": 1.0363, + "num_input_tokens_seen": 25244864, + "step": 1568 + }, + { + "epoch": 0.109905697549187, + "grad_norm": 3.797475576400757, + "learning_rate": 8.901974781085815e-05, + "loss": 1.0443, + "num_input_tokens_seen": 25261248, + "step": 1569 + }, + { + "epoch": 0.10997574579491624, + "grad_norm": 4.131814479827881, + "learning_rate": 8.901274956217163e-05, + "loss": 0.9977, + "num_input_tokens_seen": 25277632, + "step": 1570 + }, + { + "epoch": 0.11004579404064549, + "grad_norm": 3.9447309970855713, + "learning_rate": 8.900575131348512e-05, + "loss": 1.0839, + "num_input_tokens_seen": 25294016, + "step": 1571 + }, + { + "epoch": 0.11011584228637473, + "grad_norm": 3.916949510574341, + "learning_rate": 8.89987530647986e-05, + "loss": 1.1793, + "num_input_tokens_seen": 25309912, + "step": 1572 + }, + { + "epoch": 0.11018589053210398, + "grad_norm": 3.7132885456085205, + "learning_rate": 8.899175481611208e-05, + "loss": 1.081, + "num_input_tokens_seen": 25326296, + "step": 1573 + }, + { + "epoch": 0.11025593877783323, + "grad_norm": 4.5842390060424805, + "learning_rate": 8.898475656742558e-05, + "loss": 0.926, + "num_input_tokens_seen": 25342328, + "step": 1574 + }, + { + "epoch": 0.11032598702356247, + "grad_norm": 3.578962802886963, + "learning_rate": 8.897775831873906e-05, + "loss": 1.0599, + "num_input_tokens_seen": 25357640, + "step": 1575 + }, + { + "epoch": 0.11039603526929173, + "grad_norm": 3.5823471546173096, + "learning_rate": 8.897076007005255e-05, + "loss": 0.9519, + "num_input_tokens_seen": 25373424, + "step": 1576 + }, + { + "epoch": 0.11046608351502098, + "grad_norm": 3.721482515335083, + "learning_rate": 8.896376182136602e-05, + "loss": 0.976, + "num_input_tokens_seen": 25389808, + "step": 1577 + }, + { + "epoch": 0.11053613176075022, + "grad_norm": 4.874295711517334, + "learning_rate": 8.895676357267951e-05, + "loss": 1.3507, + "num_input_tokens_seen": 25406192, + "step": 1578 + }, + { + "epoch": 0.11060618000647947, + "grad_norm": 3.8547258377075195, + "learning_rate": 8.8949765323993e-05, + "loss": 0.9444, + "num_input_tokens_seen": 25421632, + "step": 1579 + }, + { + "epoch": 0.11067622825220871, + "grad_norm": 4.847586631774902, + "learning_rate": 8.894276707530649e-05, + "loss": 1.0526, + "num_input_tokens_seen": 25438016, + "step": 1580 + }, + { + "epoch": 0.11074627649793796, + "grad_norm": 3.950594425201416, + "learning_rate": 8.893576882661998e-05, + "loss": 1.0688, + "num_input_tokens_seen": 25454400, + "step": 1581 + }, + { + "epoch": 0.1108163247436672, + "grad_norm": 3.7372758388519287, + "learning_rate": 8.892877057793345e-05, + "loss": 1.2211, + "num_input_tokens_seen": 25470304, + "step": 1582 + }, + { + "epoch": 0.11088637298939645, + "grad_norm": 3.8695788383483887, + "learning_rate": 8.892177232924694e-05, + "loss": 1.1006, + "num_input_tokens_seen": 25486688, + "step": 1583 + }, + { + "epoch": 0.1109564212351257, + "grad_norm": 4.623810768127441, + "learning_rate": 8.891477408056042e-05, + "loss": 1.034, + "num_input_tokens_seen": 25503072, + "step": 1584 + }, + { + "epoch": 0.11102646948085494, + "grad_norm": 4.03538179397583, + "learning_rate": 8.89077758318739e-05, + "loss": 1.0915, + "num_input_tokens_seen": 25519008, + "step": 1585 + }, + { + "epoch": 0.11109651772658419, + "grad_norm": 7.486603736877441, + "learning_rate": 8.890077758318739e-05, + "loss": 1.0137, + "num_input_tokens_seen": 25533808, + "step": 1586 + }, + { + "epoch": 0.11116656597231343, + "grad_norm": 4.660414218902588, + "learning_rate": 8.889377933450088e-05, + "loss": 1.0172, + "num_input_tokens_seen": 25549784, + "step": 1587 + }, + { + "epoch": 0.11123661421804268, + "grad_norm": 3.9375548362731934, + "learning_rate": 8.888678108581437e-05, + "loss": 0.9843, + "num_input_tokens_seen": 25566168, + "step": 1588 + }, + { + "epoch": 0.11130666246377192, + "grad_norm": 4.275035858154297, + "learning_rate": 8.887978283712786e-05, + "loss": 1.1802, + "num_input_tokens_seen": 25582552, + "step": 1589 + }, + { + "epoch": 0.11137671070950117, + "grad_norm": 4.799124240875244, + "learning_rate": 8.887278458844133e-05, + "loss": 1.2702, + "num_input_tokens_seen": 25598936, + "step": 1590 + }, + { + "epoch": 0.11144675895523042, + "grad_norm": 4.143614768981934, + "learning_rate": 8.886578633975482e-05, + "loss": 1.1797, + "num_input_tokens_seen": 25615320, + "step": 1591 + }, + { + "epoch": 0.11151680720095966, + "grad_norm": 4.490556716918945, + "learning_rate": 8.88587880910683e-05, + "loss": 1.1351, + "num_input_tokens_seen": 25630624, + "step": 1592 + }, + { + "epoch": 0.1115868554466889, + "grad_norm": 6.010688781738281, + "learning_rate": 8.885178984238179e-05, + "loss": 1.059, + "num_input_tokens_seen": 25646048, + "step": 1593 + }, + { + "epoch": 0.11165690369241815, + "grad_norm": 3.7447726726531982, + "learning_rate": 8.884479159369527e-05, + "loss": 0.9902, + "num_input_tokens_seen": 25661528, + "step": 1594 + }, + { + "epoch": 0.1117269519381474, + "grad_norm": 4.77920389175415, + "learning_rate": 8.883779334500876e-05, + "loss": 1.1158, + "num_input_tokens_seen": 25677912, + "step": 1595 + }, + { + "epoch": 0.11179700018387664, + "grad_norm": 3.9812231063842773, + "learning_rate": 8.883079509632225e-05, + "loss": 1.096, + "num_input_tokens_seen": 25694296, + "step": 1596 + }, + { + "epoch": 0.11186704842960589, + "grad_norm": 3.7404634952545166, + "learning_rate": 8.882379684763573e-05, + "loss": 0.9965, + "num_input_tokens_seen": 25710448, + "step": 1597 + }, + { + "epoch": 0.11193709667533513, + "grad_norm": 4.466211318969727, + "learning_rate": 8.881679859894922e-05, + "loss": 1.1495, + "num_input_tokens_seen": 25726624, + "step": 1598 + }, + { + "epoch": 0.11200714492106438, + "grad_norm": 3.6850225925445557, + "learning_rate": 8.880980035026269e-05, + "loss": 0.9685, + "num_input_tokens_seen": 25742456, + "step": 1599 + }, + { + "epoch": 0.11207719316679363, + "grad_norm": 4.128363609313965, + "learning_rate": 8.880280210157619e-05, + "loss": 1.1052, + "num_input_tokens_seen": 25758840, + "step": 1600 + }, + { + "epoch": 0.11207719316679363, + "eval_loss": 1.1512293815612793, + "eval_runtime": 0.1988, + "eval_samples_per_second": 5.031, + "eval_steps_per_second": 5.031, + "num_input_tokens_seen": 25758840, + "step": 1600 + }, + { + "epoch": 0.11214724141252287, + "grad_norm": 4.852661609649658, + "learning_rate": 8.879580385288968e-05, + "loss": 1.0778, + "num_input_tokens_seen": 25774312, + "step": 1601 + }, + { + "epoch": 0.11221728965825212, + "grad_norm": 4.501857280731201, + "learning_rate": 8.878880560420316e-05, + "loss": 1.302, + "num_input_tokens_seen": 25790696, + "step": 1602 + }, + { + "epoch": 0.11228733790398136, + "grad_norm": 4.142490863800049, + "learning_rate": 8.878180735551665e-05, + "loss": 1.0375, + "num_input_tokens_seen": 25807080, + "step": 1603 + }, + { + "epoch": 0.11235738614971061, + "grad_norm": 3.606905698776245, + "learning_rate": 8.877480910683012e-05, + "loss": 0.9254, + "num_input_tokens_seen": 25822552, + "step": 1604 + }, + { + "epoch": 0.11242743439543985, + "grad_norm": 3.837010145187378, + "learning_rate": 8.876781085814361e-05, + "loss": 1.1756, + "num_input_tokens_seen": 25838088, + "step": 1605 + }, + { + "epoch": 0.1124974826411691, + "grad_norm": 3.9082963466644287, + "learning_rate": 8.87608126094571e-05, + "loss": 1.0201, + "num_input_tokens_seen": 25854240, + "step": 1606 + }, + { + "epoch": 0.11256753088689835, + "grad_norm": 4.062923908233643, + "learning_rate": 8.875381436077059e-05, + "loss": 1.1034, + "num_input_tokens_seen": 25870624, + "step": 1607 + }, + { + "epoch": 0.11263757913262759, + "grad_norm": 4.331594944000244, + "learning_rate": 8.874681611208407e-05, + "loss": 1.2043, + "num_input_tokens_seen": 25886656, + "step": 1608 + }, + { + "epoch": 0.11270762737835684, + "grad_norm": 3.77466082572937, + "learning_rate": 8.873981786339755e-05, + "loss": 0.936, + "num_input_tokens_seen": 25902704, + "step": 1609 + }, + { + "epoch": 0.1127776756240861, + "grad_norm": 3.3747365474700928, + "learning_rate": 8.873281961471104e-05, + "loss": 0.9071, + "num_input_tokens_seen": 25919088, + "step": 1610 + }, + { + "epoch": 0.11284772386981534, + "grad_norm": 5.377493381500244, + "learning_rate": 8.872582136602451e-05, + "loss": 0.9246, + "num_input_tokens_seen": 25935472, + "step": 1611 + }, + { + "epoch": 0.11291777211554459, + "grad_norm": 5.506969451904297, + "learning_rate": 8.8718823117338e-05, + "loss": 0.9211, + "num_input_tokens_seen": 25951664, + "step": 1612 + }, + { + "epoch": 0.11298782036127383, + "grad_norm": 4.874104976654053, + "learning_rate": 8.871182486865149e-05, + "loss": 1.1654, + "num_input_tokens_seen": 25968048, + "step": 1613 + }, + { + "epoch": 0.11305786860700308, + "grad_norm": 4.666824817657471, + "learning_rate": 8.870482661996498e-05, + "loss": 1.2155, + "num_input_tokens_seen": 25983784, + "step": 1614 + }, + { + "epoch": 0.11312791685273232, + "grad_norm": 3.949862241744995, + "learning_rate": 8.869782837127847e-05, + "loss": 1.1243, + "num_input_tokens_seen": 26000168, + "step": 1615 + }, + { + "epoch": 0.11319796509846157, + "grad_norm": 3.866542339324951, + "learning_rate": 8.869083012259196e-05, + "loss": 1.1302, + "num_input_tokens_seen": 26015456, + "step": 1616 + }, + { + "epoch": 0.11326801334419082, + "grad_norm": 3.8679909706115723, + "learning_rate": 8.868383187390543e-05, + "loss": 1.0886, + "num_input_tokens_seen": 26031224, + "step": 1617 + }, + { + "epoch": 0.11333806158992006, + "grad_norm": 4.7508087158203125, + "learning_rate": 8.867683362521892e-05, + "loss": 1.2837, + "num_input_tokens_seen": 26046952, + "step": 1618 + }, + { + "epoch": 0.1134081098356493, + "grad_norm": 3.878549337387085, + "learning_rate": 8.86698353765324e-05, + "loss": 0.99, + "num_input_tokens_seen": 26063280, + "step": 1619 + }, + { + "epoch": 0.11347815808137855, + "grad_norm": 3.8016276359558105, + "learning_rate": 8.86628371278459e-05, + "loss": 1.1682, + "num_input_tokens_seen": 26079616, + "step": 1620 + }, + { + "epoch": 0.1135482063271078, + "grad_norm": 4.040102481842041, + "learning_rate": 8.865583887915937e-05, + "loss": 1.1008, + "num_input_tokens_seen": 26095232, + "step": 1621 + }, + { + "epoch": 0.11361825457283704, + "grad_norm": 3.932529926300049, + "learning_rate": 8.864884063047286e-05, + "loss": 1.1663, + "num_input_tokens_seen": 26111616, + "step": 1622 + }, + { + "epoch": 0.11368830281856629, + "grad_norm": 4.568112373352051, + "learning_rate": 8.864184238178635e-05, + "loss": 1.1932, + "num_input_tokens_seen": 26128000, + "step": 1623 + }, + { + "epoch": 0.11375835106429553, + "grad_norm": 4.23036527633667, + "learning_rate": 8.863484413309982e-05, + "loss": 1.0223, + "num_input_tokens_seen": 26144384, + "step": 1624 + }, + { + "epoch": 0.11382839931002478, + "grad_norm": 4.209012031555176, + "learning_rate": 8.862784588441331e-05, + "loss": 1.0992, + "num_input_tokens_seen": 26160768, + "step": 1625 + }, + { + "epoch": 0.11389844755575403, + "grad_norm": 3.865983724594116, + "learning_rate": 8.86208476357268e-05, + "loss": 1.1213, + "num_input_tokens_seen": 26177152, + "step": 1626 + }, + { + "epoch": 0.11396849580148327, + "grad_norm": 3.781083822250366, + "learning_rate": 8.861384938704029e-05, + "loss": 1.0132, + "num_input_tokens_seen": 26193536, + "step": 1627 + }, + { + "epoch": 0.11403854404721252, + "grad_norm": 4.330471038818359, + "learning_rate": 8.860685113835378e-05, + "loss": 0.9749, + "num_input_tokens_seen": 26208976, + "step": 1628 + }, + { + "epoch": 0.11410859229294176, + "grad_norm": 4.772238254547119, + "learning_rate": 8.859985288966725e-05, + "loss": 1.2796, + "num_input_tokens_seen": 26225360, + "step": 1629 + }, + { + "epoch": 0.11417864053867101, + "grad_norm": 4.0468668937683105, + "learning_rate": 8.859285464098074e-05, + "loss": 1.0056, + "num_input_tokens_seen": 26241744, + "step": 1630 + }, + { + "epoch": 0.11424868878440025, + "grad_norm": 3.9648735523223877, + "learning_rate": 8.858585639229422e-05, + "loss": 1.2185, + "num_input_tokens_seen": 26258128, + "step": 1631 + }, + { + "epoch": 0.1143187370301295, + "grad_norm": 4.7014079093933105, + "learning_rate": 8.85788581436077e-05, + "loss": 1.1795, + "num_input_tokens_seen": 26274512, + "step": 1632 + }, + { + "epoch": 0.11438878527585875, + "grad_norm": 4.6375627517700195, + "learning_rate": 8.85718598949212e-05, + "loss": 1.0074, + "num_input_tokens_seen": 26290008, + "step": 1633 + }, + { + "epoch": 0.11445883352158799, + "grad_norm": 4.427719593048096, + "learning_rate": 8.856486164623468e-05, + "loss": 1.2769, + "num_input_tokens_seen": 26305512, + "step": 1634 + }, + { + "epoch": 0.11452888176731724, + "grad_norm": 6.001821994781494, + "learning_rate": 8.855786339754817e-05, + "loss": 1.0606, + "num_input_tokens_seen": 26319504, + "step": 1635 + }, + { + "epoch": 0.11459893001304648, + "grad_norm": 3.970672369003296, + "learning_rate": 8.855086514886165e-05, + "loss": 1.1944, + "num_input_tokens_seen": 26335888, + "step": 1636 + }, + { + "epoch": 0.11466897825877573, + "grad_norm": 3.924450635910034, + "learning_rate": 8.854386690017514e-05, + "loss": 0.9607, + "num_input_tokens_seen": 26351536, + "step": 1637 + }, + { + "epoch": 0.11473902650450497, + "grad_norm": 4.400977611541748, + "learning_rate": 8.853686865148861e-05, + "loss": 1.0641, + "num_input_tokens_seen": 26367808, + "step": 1638 + }, + { + "epoch": 0.11480907475023422, + "grad_norm": 3.9734365940093994, + "learning_rate": 8.85298704028021e-05, + "loss": 1.2258, + "num_input_tokens_seen": 26383864, + "step": 1639 + }, + { + "epoch": 0.11487912299596346, + "grad_norm": 3.792949914932251, + "learning_rate": 8.85228721541156e-05, + "loss": 1.0401, + "num_input_tokens_seen": 26400248, + "step": 1640 + }, + { + "epoch": 0.11494917124169271, + "grad_norm": 5.14591121673584, + "learning_rate": 8.851587390542908e-05, + "loss": 1.0484, + "num_input_tokens_seen": 26416056, + "step": 1641 + }, + { + "epoch": 0.11501921948742196, + "grad_norm": 5.0158162117004395, + "learning_rate": 8.850887565674256e-05, + "loss": 1.2823, + "num_input_tokens_seen": 26431400, + "step": 1642 + }, + { + "epoch": 0.1150892677331512, + "grad_norm": 4.459201812744141, + "learning_rate": 8.850187740805605e-05, + "loss": 1.2371, + "num_input_tokens_seen": 26446920, + "step": 1643 + }, + { + "epoch": 0.11515931597888045, + "grad_norm": 3.717949867248535, + "learning_rate": 8.849487915936953e-05, + "loss": 1.1299, + "num_input_tokens_seen": 26463304, + "step": 1644 + }, + { + "epoch": 0.1152293642246097, + "grad_norm": 3.7555253505706787, + "learning_rate": 8.848788091068302e-05, + "loss": 1.0835, + "num_input_tokens_seen": 26479296, + "step": 1645 + }, + { + "epoch": 0.11529941247033895, + "grad_norm": 4.3726325035095215, + "learning_rate": 8.84808826619965e-05, + "loss": 0.9606, + "num_input_tokens_seen": 26495024, + "step": 1646 + }, + { + "epoch": 0.1153694607160682, + "grad_norm": 3.728700876235962, + "learning_rate": 8.847388441331e-05, + "loss": 1.0486, + "num_input_tokens_seen": 26511408, + "step": 1647 + }, + { + "epoch": 0.11543950896179744, + "grad_norm": 4.276855945587158, + "learning_rate": 8.846688616462347e-05, + "loss": 0.9869, + "num_input_tokens_seen": 26527688, + "step": 1648 + }, + { + "epoch": 0.11550955720752669, + "grad_norm": 5.386009693145752, + "learning_rate": 8.845988791593696e-05, + "loss": 1.0021, + "num_input_tokens_seen": 26544072, + "step": 1649 + }, + { + "epoch": 0.11557960545325593, + "grad_norm": 4.978610992431641, + "learning_rate": 8.845288966725045e-05, + "loss": 1.2531, + "num_input_tokens_seen": 26560456, + "step": 1650 + }, + { + "epoch": 0.11564965369898518, + "grad_norm": 5.325594425201416, + "learning_rate": 8.844589141856392e-05, + "loss": 0.9983, + "num_input_tokens_seen": 26576840, + "step": 1651 + }, + { + "epoch": 0.11571970194471443, + "grad_norm": 4.359868049621582, + "learning_rate": 8.843889316987741e-05, + "loss": 0.9652, + "num_input_tokens_seen": 26593224, + "step": 1652 + }, + { + "epoch": 0.11578975019044367, + "grad_norm": 7.921500205993652, + "learning_rate": 8.84318949211909e-05, + "loss": 1.0767, + "num_input_tokens_seen": 26607352, + "step": 1653 + }, + { + "epoch": 0.11585979843617292, + "grad_norm": 3.51788330078125, + "learning_rate": 8.842489667250439e-05, + "loss": 1.0677, + "num_input_tokens_seen": 26623696, + "step": 1654 + }, + { + "epoch": 0.11592984668190216, + "grad_norm": 4.120747089385986, + "learning_rate": 8.841789842381788e-05, + "loss": 1.2139, + "num_input_tokens_seen": 26639832, + "step": 1655 + }, + { + "epoch": 0.11599989492763141, + "grad_norm": 4.077361106872559, + "learning_rate": 8.841090017513135e-05, + "loss": 1.0639, + "num_input_tokens_seen": 26655432, + "step": 1656 + }, + { + "epoch": 0.11606994317336065, + "grad_norm": 3.9629955291748047, + "learning_rate": 8.840390192644484e-05, + "loss": 1.0846, + "num_input_tokens_seen": 26671816, + "step": 1657 + }, + { + "epoch": 0.1161399914190899, + "grad_norm": 3.933544635772705, + "learning_rate": 8.839690367775831e-05, + "loss": 1.1543, + "num_input_tokens_seen": 26688096, + "step": 1658 + }, + { + "epoch": 0.11621003966481915, + "grad_norm": 4.702983379364014, + "learning_rate": 8.83899054290718e-05, + "loss": 1.0699, + "num_input_tokens_seen": 26704480, + "step": 1659 + }, + { + "epoch": 0.11628008791054839, + "grad_norm": 4.536739826202393, + "learning_rate": 8.83829071803853e-05, + "loss": 1.149, + "num_input_tokens_seen": 26720864, + "step": 1660 + }, + { + "epoch": 0.11635013615627764, + "grad_norm": 4.419711589813232, + "learning_rate": 8.837590893169878e-05, + "loss": 1.1994, + "num_input_tokens_seen": 26737248, + "step": 1661 + }, + { + "epoch": 0.11642018440200688, + "grad_norm": 4.106175899505615, + "learning_rate": 8.836891068301227e-05, + "loss": 1.0682, + "num_input_tokens_seen": 26753632, + "step": 1662 + }, + { + "epoch": 0.11649023264773613, + "grad_norm": 3.469658374786377, + "learning_rate": 8.836191243432574e-05, + "loss": 1.0356, + "num_input_tokens_seen": 26769944, + "step": 1663 + }, + { + "epoch": 0.11656028089346537, + "grad_norm": 7.273227691650391, + "learning_rate": 8.835491418563923e-05, + "loss": 1.1699, + "num_input_tokens_seen": 26784520, + "step": 1664 + }, + { + "epoch": 0.11663032913919462, + "grad_norm": 3.611165761947632, + "learning_rate": 8.834791593695271e-05, + "loss": 0.8595, + "num_input_tokens_seen": 26800360, + "step": 1665 + }, + { + "epoch": 0.11670037738492386, + "grad_norm": 4.405304908752441, + "learning_rate": 8.834091768826621e-05, + "loss": 1.2055, + "num_input_tokens_seen": 26816744, + "step": 1666 + }, + { + "epoch": 0.11677042563065311, + "grad_norm": 3.897247791290283, + "learning_rate": 8.83339194395797e-05, + "loss": 0.9599, + "num_input_tokens_seen": 26832520, + "step": 1667 + }, + { + "epoch": 0.11684047387638236, + "grad_norm": 3.898019313812256, + "learning_rate": 8.832692119089317e-05, + "loss": 1.0838, + "num_input_tokens_seen": 26848080, + "step": 1668 + }, + { + "epoch": 0.1169105221221116, + "grad_norm": 4.6351542472839355, + "learning_rate": 8.831992294220666e-05, + "loss": 1.2776, + "num_input_tokens_seen": 26864464, + "step": 1669 + }, + { + "epoch": 0.11698057036784085, + "grad_norm": 4.020237922668457, + "learning_rate": 8.831292469352015e-05, + "loss": 0.9955, + "num_input_tokens_seen": 26880848, + "step": 1670 + }, + { + "epoch": 0.11705061861357009, + "grad_norm": 5.813192367553711, + "learning_rate": 8.830592644483363e-05, + "loss": 1.2867, + "num_input_tokens_seen": 26897232, + "step": 1671 + }, + { + "epoch": 0.11712066685929934, + "grad_norm": 4.058423042297363, + "learning_rate": 8.829892819614711e-05, + "loss": 1.0697, + "num_input_tokens_seen": 26912872, + "step": 1672 + }, + { + "epoch": 0.11719071510502858, + "grad_norm": 4.76987361907959, + "learning_rate": 8.82919299474606e-05, + "loss": 0.9226, + "num_input_tokens_seen": 26929256, + "step": 1673 + }, + { + "epoch": 0.11726076335075783, + "grad_norm": 3.8400967121124268, + "learning_rate": 8.828493169877409e-05, + "loss": 1.0089, + "num_input_tokens_seen": 26945624, + "step": 1674 + }, + { + "epoch": 0.11733081159648708, + "grad_norm": 4.49709415435791, + "learning_rate": 8.827793345008757e-05, + "loss": 1.0898, + "num_input_tokens_seen": 26961464, + "step": 1675 + }, + { + "epoch": 0.11740085984221632, + "grad_norm": 4.143093109130859, + "learning_rate": 8.827093520140105e-05, + "loss": 1.0493, + "num_input_tokens_seen": 26976720, + "step": 1676 + }, + { + "epoch": 0.11747090808794557, + "grad_norm": 4.138030529022217, + "learning_rate": 8.826393695271454e-05, + "loss": 1.1555, + "num_input_tokens_seen": 26993056, + "step": 1677 + }, + { + "epoch": 0.11754095633367481, + "grad_norm": 3.8191847801208496, + "learning_rate": 8.825693870402802e-05, + "loss": 1.0993, + "num_input_tokens_seen": 27009440, + "step": 1678 + }, + { + "epoch": 0.11761100457940406, + "grad_norm": 3.8392176628112793, + "learning_rate": 8.824994045534151e-05, + "loss": 1.1067, + "num_input_tokens_seen": 27024880, + "step": 1679 + }, + { + "epoch": 0.11768105282513332, + "grad_norm": 4.468568801879883, + "learning_rate": 8.8242942206655e-05, + "loss": 1.1424, + "num_input_tokens_seen": 27040672, + "step": 1680 + }, + { + "epoch": 0.11775110107086256, + "grad_norm": 3.6515510082244873, + "learning_rate": 8.823594395796848e-05, + "loss": 1.0659, + "num_input_tokens_seen": 27057056, + "step": 1681 + }, + { + "epoch": 0.11782114931659181, + "grad_norm": 4.479739189147949, + "learning_rate": 8.822894570928197e-05, + "loss": 1.0399, + "num_input_tokens_seen": 27073440, + "step": 1682 + }, + { + "epoch": 0.11789119756232105, + "grad_norm": 3.762479782104492, + "learning_rate": 8.822194746059545e-05, + "loss": 1.1041, + "num_input_tokens_seen": 27089824, + "step": 1683 + }, + { + "epoch": 0.1179612458080503, + "grad_norm": 4.694389343261719, + "learning_rate": 8.821494921190894e-05, + "loss": 1.2785, + "num_input_tokens_seen": 27106208, + "step": 1684 + }, + { + "epoch": 0.11803129405377955, + "grad_norm": 3.738931179046631, + "learning_rate": 8.820795096322241e-05, + "loss": 0.9039, + "num_input_tokens_seen": 27122352, + "step": 1685 + }, + { + "epoch": 0.11810134229950879, + "grad_norm": 4.065624237060547, + "learning_rate": 8.820095271453591e-05, + "loss": 1.0048, + "num_input_tokens_seen": 27138160, + "step": 1686 + }, + { + "epoch": 0.11817139054523804, + "grad_norm": 3.5373826026916504, + "learning_rate": 8.81939544658494e-05, + "loss": 0.8786, + "num_input_tokens_seen": 27154544, + "step": 1687 + }, + { + "epoch": 0.11824143879096728, + "grad_norm": 3.773066282272339, + "learning_rate": 8.818695621716288e-05, + "loss": 1.0043, + "num_input_tokens_seen": 27170928, + "step": 1688 + }, + { + "epoch": 0.11831148703669653, + "grad_norm": 3.3876242637634277, + "learning_rate": 8.817995796847637e-05, + "loss": 0.9909, + "num_input_tokens_seen": 27187312, + "step": 1689 + }, + { + "epoch": 0.11838153528242577, + "grad_norm": 4.526343822479248, + "learning_rate": 8.817295971978984e-05, + "loss": 1.0899, + "num_input_tokens_seen": 27202208, + "step": 1690 + }, + { + "epoch": 0.11845158352815502, + "grad_norm": 4.691114902496338, + "learning_rate": 8.816596147110333e-05, + "loss": 1.0823, + "num_input_tokens_seen": 27218592, + "step": 1691 + }, + { + "epoch": 0.11852163177388426, + "grad_norm": 3.90531849861145, + "learning_rate": 8.815896322241682e-05, + "loss": 1.1438, + "num_input_tokens_seen": 27234976, + "step": 1692 + }, + { + "epoch": 0.11859168001961351, + "grad_norm": 3.5546317100524902, + "learning_rate": 8.81519649737303e-05, + "loss": 1.0326, + "num_input_tokens_seen": 27251360, + "step": 1693 + }, + { + "epoch": 0.11866172826534276, + "grad_norm": 5.117360591888428, + "learning_rate": 8.81449667250438e-05, + "loss": 1.1921, + "num_input_tokens_seen": 27267744, + "step": 1694 + }, + { + "epoch": 0.118731776511072, + "grad_norm": 4.055267810821533, + "learning_rate": 8.813796847635727e-05, + "loss": 1.0607, + "num_input_tokens_seen": 27283688, + "step": 1695 + }, + { + "epoch": 0.11880182475680125, + "grad_norm": 4.04268741607666, + "learning_rate": 8.813097022767076e-05, + "loss": 1.1862, + "num_input_tokens_seen": 27300072, + "step": 1696 + }, + { + "epoch": 0.11887187300253049, + "grad_norm": 4.048800945281982, + "learning_rate": 8.812397197898425e-05, + "loss": 0.9231, + "num_input_tokens_seen": 27316456, + "step": 1697 + }, + { + "epoch": 0.11894192124825974, + "grad_norm": 4.445494174957275, + "learning_rate": 8.811697373029772e-05, + "loss": 1.241, + "num_input_tokens_seen": 27332464, + "step": 1698 + }, + { + "epoch": 0.11901196949398898, + "grad_norm": 4.522054672241211, + "learning_rate": 8.810997548161121e-05, + "loss": 1.3945, + "num_input_tokens_seen": 27348848, + "step": 1699 + }, + { + "epoch": 0.11908201773971823, + "grad_norm": 4.106349468231201, + "learning_rate": 8.81029772329247e-05, + "loss": 1.1457, + "num_input_tokens_seen": 27365232, + "step": 1700 + }, + { + "epoch": 0.11915206598544748, + "grad_norm": 6.059356689453125, + "learning_rate": 8.809597898423819e-05, + "loss": 1.3381, + "num_input_tokens_seen": 27380448, + "step": 1701 + }, + { + "epoch": 0.11922211423117672, + "grad_norm": 3.8089959621429443, + "learning_rate": 8.808898073555166e-05, + "loss": 1.0699, + "num_input_tokens_seen": 27396832, + "step": 1702 + }, + { + "epoch": 0.11929216247690597, + "grad_norm": 4.21024227142334, + "learning_rate": 8.808198248686515e-05, + "loss": 1.306, + "num_input_tokens_seen": 27413096, + "step": 1703 + }, + { + "epoch": 0.11936221072263521, + "grad_norm": 4.286004066467285, + "learning_rate": 8.807498423817864e-05, + "loss": 1.2325, + "num_input_tokens_seen": 27429480, + "step": 1704 + }, + { + "epoch": 0.11943225896836446, + "grad_norm": 3.512561559677124, + "learning_rate": 8.806798598949212e-05, + "loss": 0.8804, + "num_input_tokens_seen": 27445864, + "step": 1705 + }, + { + "epoch": 0.1195023072140937, + "grad_norm": 4.096526145935059, + "learning_rate": 8.806098774080562e-05, + "loss": 1.0591, + "num_input_tokens_seen": 27462248, + "step": 1706 + }, + { + "epoch": 0.11957235545982295, + "grad_norm": 5.032350063323975, + "learning_rate": 8.805398949211909e-05, + "loss": 0.8948, + "num_input_tokens_seen": 27478312, + "step": 1707 + }, + { + "epoch": 0.1196424037055522, + "grad_norm": 4.756420612335205, + "learning_rate": 8.804699124343258e-05, + "loss": 1.0584, + "num_input_tokens_seen": 27494696, + "step": 1708 + }, + { + "epoch": 0.11971245195128144, + "grad_norm": 4.869518756866455, + "learning_rate": 8.803999299474607e-05, + "loss": 0.9394, + "num_input_tokens_seen": 27511080, + "step": 1709 + }, + { + "epoch": 0.11978250019701069, + "grad_norm": 3.451759099960327, + "learning_rate": 8.803299474605954e-05, + "loss": 0.9171, + "num_input_tokens_seen": 27527328, + "step": 1710 + }, + { + "epoch": 0.11985254844273993, + "grad_norm": 4.247021675109863, + "learning_rate": 8.802599649737303e-05, + "loss": 1.1204, + "num_input_tokens_seen": 27543712, + "step": 1711 + }, + { + "epoch": 0.11992259668846918, + "grad_norm": 4.597024917602539, + "learning_rate": 8.801899824868652e-05, + "loss": 1.196, + "num_input_tokens_seen": 27560096, + "step": 1712 + }, + { + "epoch": 0.11999264493419842, + "grad_norm": 4.242952823638916, + "learning_rate": 8.801200000000001e-05, + "loss": 1.1747, + "num_input_tokens_seen": 27576320, + "step": 1713 + }, + { + "epoch": 0.12006269317992768, + "grad_norm": 5.1166486740112305, + "learning_rate": 8.80050017513135e-05, + "loss": 1.4222, + "num_input_tokens_seen": 27591024, + "step": 1714 + }, + { + "epoch": 0.12013274142565693, + "grad_norm": 4.6713714599609375, + "learning_rate": 8.799800350262697e-05, + "loss": 1.1869, + "num_input_tokens_seen": 27606352, + "step": 1715 + }, + { + "epoch": 0.12020278967138617, + "grad_norm": 4.62678861618042, + "learning_rate": 8.799100525394046e-05, + "loss": 1.1524, + "num_input_tokens_seen": 27622736, + "step": 1716 + }, + { + "epoch": 0.12027283791711542, + "grad_norm": 3.611985206604004, + "learning_rate": 8.798400700525394e-05, + "loss": 1.1179, + "num_input_tokens_seen": 27639120, + "step": 1717 + }, + { + "epoch": 0.12034288616284466, + "grad_norm": 4.165099143981934, + "learning_rate": 8.797700875656743e-05, + "loss": 1.0104, + "num_input_tokens_seen": 27654024, + "step": 1718 + }, + { + "epoch": 0.12041293440857391, + "grad_norm": 4.532061576843262, + "learning_rate": 8.797001050788091e-05, + "loss": 1.05, + "num_input_tokens_seen": 27670408, + "step": 1719 + }, + { + "epoch": 0.12048298265430316, + "grad_norm": 4.880197048187256, + "learning_rate": 8.79630122591944e-05, + "loss": 1.0321, + "num_input_tokens_seen": 27686792, + "step": 1720 + }, + { + "epoch": 0.1205530309000324, + "grad_norm": 3.521052360534668, + "learning_rate": 8.795601401050789e-05, + "loss": 0.9048, + "num_input_tokens_seen": 27703176, + "step": 1721 + }, + { + "epoch": 0.12062307914576165, + "grad_norm": 3.965725898742676, + "learning_rate": 8.794901576182137e-05, + "loss": 1.1348, + "num_input_tokens_seen": 27719024, + "step": 1722 + }, + { + "epoch": 0.12069312739149089, + "grad_norm": 3.936962842941284, + "learning_rate": 8.794201751313486e-05, + "loss": 1.1531, + "num_input_tokens_seen": 27734736, + "step": 1723 + }, + { + "epoch": 0.12076317563722014, + "grad_norm": 5.225526332855225, + "learning_rate": 8.793501926444834e-05, + "loss": 1.2784, + "num_input_tokens_seen": 27751120, + "step": 1724 + }, + { + "epoch": 0.12083322388294938, + "grad_norm": 4.125289440155029, + "learning_rate": 8.792802101576182e-05, + "loss": 1.1893, + "num_input_tokens_seen": 27767288, + "step": 1725 + }, + { + "epoch": 0.12090327212867863, + "grad_norm": 3.9352405071258545, + "learning_rate": 8.792102276707532e-05, + "loss": 1.1867, + "num_input_tokens_seen": 27783672, + "step": 1726 + }, + { + "epoch": 0.12097332037440787, + "grad_norm": 3.908578634262085, + "learning_rate": 8.79140245183888e-05, + "loss": 1.0024, + "num_input_tokens_seen": 27799640, + "step": 1727 + }, + { + "epoch": 0.12104336862013712, + "grad_norm": 3.694387435913086, + "learning_rate": 8.790702626970229e-05, + "loss": 1.0652, + "num_input_tokens_seen": 27816024, + "step": 1728 + }, + { + "epoch": 0.12111341686586637, + "grad_norm": 4.0100016593933105, + "learning_rate": 8.790002802101576e-05, + "loss": 1.0511, + "num_input_tokens_seen": 27832408, + "step": 1729 + }, + { + "epoch": 0.12118346511159561, + "grad_norm": 5.454882621765137, + "learning_rate": 8.789302977232925e-05, + "loss": 1.1096, + "num_input_tokens_seen": 27848792, + "step": 1730 + }, + { + "epoch": 0.12125351335732486, + "grad_norm": 5.065526485443115, + "learning_rate": 8.788603152364274e-05, + "loss": 1.0354, + "num_input_tokens_seen": 27864688, + "step": 1731 + }, + { + "epoch": 0.1213235616030541, + "grad_norm": 3.73103666305542, + "learning_rate": 8.787903327495623e-05, + "loss": 1.0328, + "num_input_tokens_seen": 27881072, + "step": 1732 + }, + { + "epoch": 0.12139360984878335, + "grad_norm": 3.971198081970215, + "learning_rate": 8.787203502626971e-05, + "loss": 1.1908, + "num_input_tokens_seen": 27896912, + "step": 1733 + }, + { + "epoch": 0.1214636580945126, + "grad_norm": 3.933809518814087, + "learning_rate": 8.786503677758319e-05, + "loss": 1.1125, + "num_input_tokens_seen": 27913104, + "step": 1734 + }, + { + "epoch": 0.12153370634024184, + "grad_norm": 3.92167329788208, + "learning_rate": 8.785803852889668e-05, + "loss": 1.0007, + "num_input_tokens_seen": 27929488, + "step": 1735 + }, + { + "epoch": 0.12160375458597109, + "grad_norm": 4.441089630126953, + "learning_rate": 8.785104028021017e-05, + "loss": 0.9748, + "num_input_tokens_seen": 27945504, + "step": 1736 + }, + { + "epoch": 0.12167380283170033, + "grad_norm": 4.023623466491699, + "learning_rate": 8.784404203152364e-05, + "loss": 0.8826, + "num_input_tokens_seen": 27961888, + "step": 1737 + }, + { + "epoch": 0.12174385107742958, + "grad_norm": 4.0328826904296875, + "learning_rate": 8.783704378283713e-05, + "loss": 1.2769, + "num_input_tokens_seen": 27978024, + "step": 1738 + }, + { + "epoch": 0.12181389932315882, + "grad_norm": 4.5445733070373535, + "learning_rate": 8.783004553415062e-05, + "loss": 1.3745, + "num_input_tokens_seen": 27993840, + "step": 1739 + }, + { + "epoch": 0.12188394756888807, + "grad_norm": 3.609834671020508, + "learning_rate": 8.782304728546411e-05, + "loss": 0.916, + "num_input_tokens_seen": 28010224, + "step": 1740 + }, + { + "epoch": 0.12195399581461731, + "grad_norm": 3.849306344985962, + "learning_rate": 8.78160490367776e-05, + "loss": 1.1135, + "num_input_tokens_seen": 28026232, + "step": 1741 + }, + { + "epoch": 0.12202404406034656, + "grad_norm": 4.11102294921875, + "learning_rate": 8.780905078809107e-05, + "loss": 1.2269, + "num_input_tokens_seen": 28041880, + "step": 1742 + }, + { + "epoch": 0.1220940923060758, + "grad_norm": 4.156986713409424, + "learning_rate": 8.780205253940456e-05, + "loss": 1.0321, + "num_input_tokens_seen": 28058264, + "step": 1743 + }, + { + "epoch": 0.12216414055180505, + "grad_norm": 3.9670159816741943, + "learning_rate": 8.779505429071803e-05, + "loss": 0.9752, + "num_input_tokens_seen": 28073168, + "step": 1744 + }, + { + "epoch": 0.1222341887975343, + "grad_norm": 5.342650890350342, + "learning_rate": 8.778805604203152e-05, + "loss": 1.1416, + "num_input_tokens_seen": 28089552, + "step": 1745 + }, + { + "epoch": 0.12230423704326354, + "grad_norm": 4.031285285949707, + "learning_rate": 8.778105779334501e-05, + "loss": 1.1134, + "num_input_tokens_seen": 28105264, + "step": 1746 + }, + { + "epoch": 0.12237428528899279, + "grad_norm": 3.5976450443267822, + "learning_rate": 8.77740595446585e-05, + "loss": 1.0342, + "num_input_tokens_seen": 28121648, + "step": 1747 + }, + { + "epoch": 0.12244433353472203, + "grad_norm": 4.947859764099121, + "learning_rate": 8.776706129597199e-05, + "loss": 1.0809, + "num_input_tokens_seen": 28137640, + "step": 1748 + }, + { + "epoch": 0.12251438178045129, + "grad_norm": 4.004949569702148, + "learning_rate": 8.776006304728546e-05, + "loss": 1.0921, + "num_input_tokens_seen": 28154024, + "step": 1749 + }, + { + "epoch": 0.12258443002618054, + "grad_norm": 3.9022445678710938, + "learning_rate": 8.775306479859895e-05, + "loss": 1.0844, + "num_input_tokens_seen": 28170408, + "step": 1750 + }, + { + "epoch": 0.12265447827190978, + "grad_norm": 4.171925067901611, + "learning_rate": 8.774606654991244e-05, + "loss": 1.1894, + "num_input_tokens_seen": 28186792, + "step": 1751 + }, + { + "epoch": 0.12272452651763903, + "grad_norm": 3.9387433528900146, + "learning_rate": 8.773906830122592e-05, + "loss": 1.0303, + "num_input_tokens_seen": 28203176, + "step": 1752 + }, + { + "epoch": 0.12279457476336827, + "grad_norm": 5.067278861999512, + "learning_rate": 8.773207005253942e-05, + "loss": 1.1924, + "num_input_tokens_seen": 28219192, + "step": 1753 + }, + { + "epoch": 0.12286462300909752, + "grad_norm": 3.673807144165039, + "learning_rate": 8.77250718038529e-05, + "loss": 1.0438, + "num_input_tokens_seen": 28235576, + "step": 1754 + }, + { + "epoch": 0.12293467125482677, + "grad_norm": 5.303588390350342, + "learning_rate": 8.771807355516638e-05, + "loss": 1.2601, + "num_input_tokens_seen": 28251960, + "step": 1755 + }, + { + "epoch": 0.12300471950055601, + "grad_norm": 5.343825340270996, + "learning_rate": 8.771107530647986e-05, + "loss": 1.1126, + "num_input_tokens_seen": 28268344, + "step": 1756 + }, + { + "epoch": 0.12307476774628526, + "grad_norm": 4.125874996185303, + "learning_rate": 8.770407705779335e-05, + "loss": 1.1497, + "num_input_tokens_seen": 28284144, + "step": 1757 + }, + { + "epoch": 0.1231448159920145, + "grad_norm": 4.628546714782715, + "learning_rate": 8.769707880910683e-05, + "loss": 1.1757, + "num_input_tokens_seen": 28299896, + "step": 1758 + }, + { + "epoch": 0.12321486423774375, + "grad_norm": 3.946603775024414, + "learning_rate": 8.769008056042032e-05, + "loss": 1.2739, + "num_input_tokens_seen": 28316280, + "step": 1759 + }, + { + "epoch": 0.123284912483473, + "grad_norm": 3.4837770462036133, + "learning_rate": 8.768308231173381e-05, + "loss": 0.9682, + "num_input_tokens_seen": 28332128, + "step": 1760 + }, + { + "epoch": 0.12335496072920224, + "grad_norm": 3.9601573944091797, + "learning_rate": 8.767608406304729e-05, + "loss": 1.2647, + "num_input_tokens_seen": 28347488, + "step": 1761 + }, + { + "epoch": 0.12342500897493149, + "grad_norm": 4.178001403808594, + "learning_rate": 8.766908581436078e-05, + "loss": 1.0055, + "num_input_tokens_seen": 28363872, + "step": 1762 + }, + { + "epoch": 0.12349505722066073, + "grad_norm": 3.9182498455047607, + "learning_rate": 8.766208756567426e-05, + "loss": 1.1407, + "num_input_tokens_seen": 28380208, + "step": 1763 + }, + { + "epoch": 0.12356510546638998, + "grad_norm": 4.071939468383789, + "learning_rate": 8.765508931698774e-05, + "loss": 1.3196, + "num_input_tokens_seen": 28396592, + "step": 1764 + }, + { + "epoch": 0.12363515371211922, + "grad_norm": 4.657908916473389, + "learning_rate": 8.764809106830123e-05, + "loss": 1.0739, + "num_input_tokens_seen": 28412976, + "step": 1765 + }, + { + "epoch": 0.12370520195784847, + "grad_norm": 3.9706201553344727, + "learning_rate": 8.764109281961472e-05, + "loss": 1.0904, + "num_input_tokens_seen": 28429088, + "step": 1766 + }, + { + "epoch": 0.12377525020357771, + "grad_norm": 4.571341514587402, + "learning_rate": 8.76340945709282e-05, + "loss": 1.1314, + "num_input_tokens_seen": 28445472, + "step": 1767 + }, + { + "epoch": 0.12384529844930696, + "grad_norm": 4.197002410888672, + "learning_rate": 8.762709632224169e-05, + "loss": 0.8251, + "num_input_tokens_seen": 28461656, + "step": 1768 + }, + { + "epoch": 0.1239153466950362, + "grad_norm": 5.376040935516357, + "learning_rate": 8.762009807355517e-05, + "loss": 1.1626, + "num_input_tokens_seen": 28477088, + "step": 1769 + }, + { + "epoch": 0.12398539494076545, + "grad_norm": 3.987495183944702, + "learning_rate": 8.761309982486866e-05, + "loss": 1.2449, + "num_input_tokens_seen": 28493472, + "step": 1770 + }, + { + "epoch": 0.1240554431864947, + "grad_norm": 4.379208564758301, + "learning_rate": 8.760610157618213e-05, + "loss": 1.2834, + "num_input_tokens_seen": 28509856, + "step": 1771 + }, + { + "epoch": 0.12412549143222394, + "grad_norm": 3.7258729934692383, + "learning_rate": 8.759910332749562e-05, + "loss": 1.1115, + "num_input_tokens_seen": 28525664, + "step": 1772 + }, + { + "epoch": 0.12419553967795319, + "grad_norm": 4.0574774742126465, + "learning_rate": 8.759210507880911e-05, + "loss": 1.1005, + "num_input_tokens_seen": 28541920, + "step": 1773 + }, + { + "epoch": 0.12426558792368243, + "grad_norm": 3.8423895835876465, + "learning_rate": 8.75851068301226e-05, + "loss": 1.1067, + "num_input_tokens_seen": 28558216, + "step": 1774 + }, + { + "epoch": 0.12433563616941168, + "grad_norm": 3.8898398876190186, + "learning_rate": 8.757810858143609e-05, + "loss": 1.1963, + "num_input_tokens_seen": 28574536, + "step": 1775 + }, + { + "epoch": 0.12440568441514092, + "grad_norm": 3.286412000656128, + "learning_rate": 8.757111033274956e-05, + "loss": 0.9159, + "num_input_tokens_seen": 28590920, + "step": 1776 + }, + { + "epoch": 0.12447573266087017, + "grad_norm": 3.7219464778900146, + "learning_rate": 8.756411208406305e-05, + "loss": 1.0883, + "num_input_tokens_seen": 28607192, + "step": 1777 + }, + { + "epoch": 0.12454578090659942, + "grad_norm": 3.8907012939453125, + "learning_rate": 8.755711383537654e-05, + "loss": 1.0226, + "num_input_tokens_seen": 28623176, + "step": 1778 + }, + { + "epoch": 0.12461582915232866, + "grad_norm": 3.8087925910949707, + "learning_rate": 8.755011558669003e-05, + "loss": 1.0115, + "num_input_tokens_seen": 28639528, + "step": 1779 + }, + { + "epoch": 0.1246858773980579, + "grad_norm": 4.8956217765808105, + "learning_rate": 8.754311733800352e-05, + "loss": 1.0108, + "num_input_tokens_seen": 28654976, + "step": 1780 + }, + { + "epoch": 0.12475592564378715, + "grad_norm": 3.7400572299957275, + "learning_rate": 8.753611908931699e-05, + "loss": 0.8787, + "num_input_tokens_seen": 28671064, + "step": 1781 + }, + { + "epoch": 0.1248259738895164, + "grad_norm": 4.689199924468994, + "learning_rate": 8.752912084063048e-05, + "loss": 1.2326, + "num_input_tokens_seen": 28686664, + "step": 1782 + }, + { + "epoch": 0.12489602213524566, + "grad_norm": 3.6594929695129395, + "learning_rate": 8.752212259194395e-05, + "loss": 1.1626, + "num_input_tokens_seen": 28703048, + "step": 1783 + }, + { + "epoch": 0.1249660703809749, + "grad_norm": 4.6070356369018555, + "learning_rate": 8.751512434325744e-05, + "loss": 1.358, + "num_input_tokens_seen": 28719000, + "step": 1784 + }, + { + "epoch": 0.12503611862670413, + "grad_norm": 4.658362865447998, + "learning_rate": 8.750812609457093e-05, + "loss": 1.2852, + "num_input_tokens_seen": 28735384, + "step": 1785 + }, + { + "epoch": 0.1251061668724334, + "grad_norm": 3.6963465213775635, + "learning_rate": 8.750112784588442e-05, + "loss": 1.1068, + "num_input_tokens_seen": 28750856, + "step": 1786 + }, + { + "epoch": 0.12517621511816263, + "grad_norm": 4.419562816619873, + "learning_rate": 8.749412959719791e-05, + "loss": 1.1559, + "num_input_tokens_seen": 28766824, + "step": 1787 + }, + { + "epoch": 0.12524626336389189, + "grad_norm": 4.601676940917969, + "learning_rate": 8.74871313485114e-05, + "loss": 1.0642, + "num_input_tokens_seen": 28783208, + "step": 1788 + }, + { + "epoch": 0.12531631160962112, + "grad_norm": 3.8597445487976074, + "learning_rate": 8.748013309982487e-05, + "loss": 1.1149, + "num_input_tokens_seen": 28799160, + "step": 1789 + }, + { + "epoch": 0.12538635985535038, + "grad_norm": 3.654649257659912, + "learning_rate": 8.747313485113835e-05, + "loss": 1.3127, + "num_input_tokens_seen": 28815440, + "step": 1790 + }, + { + "epoch": 0.1254564081010796, + "grad_norm": 4.043321132659912, + "learning_rate": 8.746613660245184e-05, + "loss": 1.0844, + "num_input_tokens_seen": 28831824, + "step": 1791 + }, + { + "epoch": 0.12552645634680887, + "grad_norm": 4.5223894119262695, + "learning_rate": 8.745913835376532e-05, + "loss": 1.0627, + "num_input_tokens_seen": 28846984, + "step": 1792 + }, + { + "epoch": 0.1255965045925381, + "grad_norm": 4.074361801147461, + "learning_rate": 8.745214010507881e-05, + "loss": 0.9772, + "num_input_tokens_seen": 28863368, + "step": 1793 + }, + { + "epoch": 0.12566655283826736, + "grad_norm": 4.661183834075928, + "learning_rate": 8.74451418563923e-05, + "loss": 1.152, + "num_input_tokens_seen": 28879752, + "step": 1794 + }, + { + "epoch": 0.1257366010839966, + "grad_norm": 3.95831561088562, + "learning_rate": 8.743814360770579e-05, + "loss": 1.117, + "num_input_tokens_seen": 28895728, + "step": 1795 + }, + { + "epoch": 0.12580664932972585, + "grad_norm": 4.271726131439209, + "learning_rate": 8.743114535901927e-05, + "loss": 1.0935, + "num_input_tokens_seen": 28912112, + "step": 1796 + }, + { + "epoch": 0.12587669757545508, + "grad_norm": 4.079075336456299, + "learning_rate": 8.742414711033275e-05, + "loss": 1.1397, + "num_input_tokens_seen": 28928496, + "step": 1797 + }, + { + "epoch": 0.12594674582118434, + "grad_norm": 4.030980587005615, + "learning_rate": 8.741714886164623e-05, + "loss": 0.9405, + "num_input_tokens_seen": 28943968, + "step": 1798 + }, + { + "epoch": 0.12601679406691357, + "grad_norm": 3.7285454273223877, + "learning_rate": 8.741015061295973e-05, + "loss": 0.8448, + "num_input_tokens_seen": 28959800, + "step": 1799 + }, + { + "epoch": 0.12608684231264283, + "grad_norm": 3.964663028717041, + "learning_rate": 8.74031523642732e-05, + "loss": 1.1614, + "num_input_tokens_seen": 28976184, + "step": 1800 + }, + { + "epoch": 0.12608684231264283, + "eval_loss": 1.1493111848831177, + "eval_runtime": 0.196, + "eval_samples_per_second": 5.102, + "eval_steps_per_second": 5.102, + "num_input_tokens_seen": 28976184, + "step": 1800 + }, + { + "epoch": 0.1261568905583721, + "grad_norm": 4.2887396812438965, + "learning_rate": 8.73961541155867e-05, + "loss": 0.9047, + "num_input_tokens_seen": 28992552, + "step": 1801 + }, + { + "epoch": 0.12622693880410132, + "grad_norm": 5.139194011688232, + "learning_rate": 8.738915586690018e-05, + "loss": 1.1656, + "num_input_tokens_seen": 29007480, + "step": 1802 + }, + { + "epoch": 0.12629698704983058, + "grad_norm": 4.023421287536621, + "learning_rate": 8.738215761821366e-05, + "loss": 1.0585, + "num_input_tokens_seen": 29023864, + "step": 1803 + }, + { + "epoch": 0.12636703529555982, + "grad_norm": 3.6131162643432617, + "learning_rate": 8.737515936952715e-05, + "loss": 1.0964, + "num_input_tokens_seen": 29039640, + "step": 1804 + }, + { + "epoch": 0.12643708354128907, + "grad_norm": 4.477705478668213, + "learning_rate": 8.736816112084064e-05, + "loss": 0.8054, + "num_input_tokens_seen": 29055816, + "step": 1805 + }, + { + "epoch": 0.1265071317870183, + "grad_norm": 3.7637252807617188, + "learning_rate": 8.736116287215412e-05, + "loss": 1.0389, + "num_input_tokens_seen": 29071456, + "step": 1806 + }, + { + "epoch": 0.12657718003274757, + "grad_norm": 3.9611611366271973, + "learning_rate": 8.735416462346761e-05, + "loss": 1.1907, + "num_input_tokens_seen": 29087840, + "step": 1807 + }, + { + "epoch": 0.1266472282784768, + "grad_norm": 3.6022791862487793, + "learning_rate": 8.734716637478109e-05, + "loss": 0.9538, + "num_input_tokens_seen": 29104224, + "step": 1808 + }, + { + "epoch": 0.12671727652420606, + "grad_norm": 3.7403485774993896, + "learning_rate": 8.734016812609458e-05, + "loss": 1.12, + "num_input_tokens_seen": 29120608, + "step": 1809 + }, + { + "epoch": 0.1267873247699353, + "grad_norm": 3.5624709129333496, + "learning_rate": 8.733316987740805e-05, + "loss": 1.0931, + "num_input_tokens_seen": 29136840, + "step": 1810 + }, + { + "epoch": 0.12685737301566455, + "grad_norm": 3.961516857147217, + "learning_rate": 8.732617162872154e-05, + "loss": 0.9529, + "num_input_tokens_seen": 29153224, + "step": 1811 + }, + { + "epoch": 0.12692742126139378, + "grad_norm": 4.895046234130859, + "learning_rate": 8.731917338003503e-05, + "loss": 1.0697, + "num_input_tokens_seen": 29168336, + "step": 1812 + }, + { + "epoch": 0.12699746950712304, + "grad_norm": 4.290217876434326, + "learning_rate": 8.731217513134852e-05, + "loss": 0.8945, + "num_input_tokens_seen": 29184720, + "step": 1813 + }, + { + "epoch": 0.12706751775285227, + "grad_norm": 3.6602399349212646, + "learning_rate": 8.7305176882662e-05, + "loss": 1.0465, + "num_input_tokens_seen": 29200920, + "step": 1814 + }, + { + "epoch": 0.12713756599858153, + "grad_norm": 3.7980921268463135, + "learning_rate": 8.72981786339755e-05, + "loss": 0.8915, + "num_input_tokens_seen": 29217304, + "step": 1815 + }, + { + "epoch": 0.12720761424431076, + "grad_norm": 3.646242141723633, + "learning_rate": 8.729118038528897e-05, + "loss": 1.0058, + "num_input_tokens_seen": 29233688, + "step": 1816 + }, + { + "epoch": 0.12727766249004002, + "grad_norm": 5.226564884185791, + "learning_rate": 8.728418213660244e-05, + "loss": 0.9569, + "num_input_tokens_seen": 29247896, + "step": 1817 + }, + { + "epoch": 0.12734771073576925, + "grad_norm": 3.8191912174224854, + "learning_rate": 8.727718388791593e-05, + "loss": 1.1548, + "num_input_tokens_seen": 29263896, + "step": 1818 + }, + { + "epoch": 0.1274177589814985, + "grad_norm": 4.349045276641846, + "learning_rate": 8.727018563922944e-05, + "loss": 1.1368, + "num_input_tokens_seen": 29280224, + "step": 1819 + }, + { + "epoch": 0.12748780722722775, + "grad_norm": 3.842888116836548, + "learning_rate": 8.726318739054291e-05, + "loss": 1.0052, + "num_input_tokens_seen": 29296608, + "step": 1820 + }, + { + "epoch": 0.127557855472957, + "grad_norm": 3.8854012489318848, + "learning_rate": 8.72561891418564e-05, + "loss": 1.0584, + "num_input_tokens_seen": 29312992, + "step": 1821 + }, + { + "epoch": 0.12762790371868624, + "grad_norm": 4.102949619293213, + "learning_rate": 8.724919089316989e-05, + "loss": 0.9004, + "num_input_tokens_seen": 29328416, + "step": 1822 + }, + { + "epoch": 0.1276979519644155, + "grad_norm": 5.0174336433410645, + "learning_rate": 8.724219264448336e-05, + "loss": 1.0837, + "num_input_tokens_seen": 29344800, + "step": 1823 + }, + { + "epoch": 0.12776800021014473, + "grad_norm": 3.6122186183929443, + "learning_rate": 8.723519439579685e-05, + "loss": 0.924, + "num_input_tokens_seen": 29361184, + "step": 1824 + }, + { + "epoch": 0.127838048455874, + "grad_norm": 4.086683750152588, + "learning_rate": 8.722819614711034e-05, + "loss": 1.0945, + "num_input_tokens_seen": 29376840, + "step": 1825 + }, + { + "epoch": 0.12790809670160322, + "grad_norm": 4.279770851135254, + "learning_rate": 8.722119789842383e-05, + "loss": 0.9831, + "num_input_tokens_seen": 29393016, + "step": 1826 + }, + { + "epoch": 0.12797814494733248, + "grad_norm": 5.032819747924805, + "learning_rate": 8.72141996497373e-05, + "loss": 1.1691, + "num_input_tokens_seen": 29409400, + "step": 1827 + }, + { + "epoch": 0.1280481931930617, + "grad_norm": 4.480144023895264, + "learning_rate": 8.720720140105079e-05, + "loss": 1.1481, + "num_input_tokens_seen": 29425472, + "step": 1828 + }, + { + "epoch": 0.12811824143879097, + "grad_norm": 3.6843478679656982, + "learning_rate": 8.720020315236428e-05, + "loss": 1.1302, + "num_input_tokens_seen": 29441472, + "step": 1829 + }, + { + "epoch": 0.1281882896845202, + "grad_norm": 3.7091941833496094, + "learning_rate": 8.719320490367776e-05, + "loss": 1.0351, + "num_input_tokens_seen": 29457600, + "step": 1830 + }, + { + "epoch": 0.12825833793024946, + "grad_norm": 4.122303009033203, + "learning_rate": 8.718620665499124e-05, + "loss": 1.0791, + "num_input_tokens_seen": 29473984, + "step": 1831 + }, + { + "epoch": 0.1283283861759787, + "grad_norm": 5.282047748565674, + "learning_rate": 8.717920840630473e-05, + "loss": 1.4479, + "num_input_tokens_seen": 29490336, + "step": 1832 + }, + { + "epoch": 0.12839843442170795, + "grad_norm": 4.0706586837768555, + "learning_rate": 8.717221015761822e-05, + "loss": 1.0026, + "num_input_tokens_seen": 29506432, + "step": 1833 + }, + { + "epoch": 0.12846848266743718, + "grad_norm": 3.856018543243408, + "learning_rate": 8.716521190893171e-05, + "loss": 1.0545, + "num_input_tokens_seen": 29521744, + "step": 1834 + }, + { + "epoch": 0.12853853091316644, + "grad_norm": 3.7059905529022217, + "learning_rate": 8.715821366024518e-05, + "loss": 0.9876, + "num_input_tokens_seen": 29537104, + "step": 1835 + }, + { + "epoch": 0.1286085791588957, + "grad_norm": 3.915038585662842, + "learning_rate": 8.715121541155867e-05, + "loss": 1.2072, + "num_input_tokens_seen": 29552928, + "step": 1836 + }, + { + "epoch": 0.12867862740462493, + "grad_norm": 3.6828839778900146, + "learning_rate": 8.714421716287215e-05, + "loss": 0.9849, + "num_input_tokens_seen": 29569312, + "step": 1837 + }, + { + "epoch": 0.1287486756503542, + "grad_norm": 4.3285441398620605, + "learning_rate": 8.713721891418564e-05, + "loss": 1.2812, + "num_input_tokens_seen": 29584376, + "step": 1838 + }, + { + "epoch": 0.12881872389608343, + "grad_norm": 4.646363258361816, + "learning_rate": 8.713022066549914e-05, + "loss": 1.1107, + "num_input_tokens_seen": 29599856, + "step": 1839 + }, + { + "epoch": 0.12888877214181269, + "grad_norm": 4.180859088897705, + "learning_rate": 8.712322241681261e-05, + "loss": 1.0751, + "num_input_tokens_seen": 29616224, + "step": 1840 + }, + { + "epoch": 0.12895882038754192, + "grad_norm": 3.666090250015259, + "learning_rate": 8.71162241681261e-05, + "loss": 1.0568, + "num_input_tokens_seen": 29632608, + "step": 1841 + }, + { + "epoch": 0.12902886863327118, + "grad_norm": 3.4623513221740723, + "learning_rate": 8.710922591943959e-05, + "loss": 0.9662, + "num_input_tokens_seen": 29648992, + "step": 1842 + }, + { + "epoch": 0.1290989168790004, + "grad_norm": 4.720603942871094, + "learning_rate": 8.710222767075307e-05, + "loss": 1.0566, + "num_input_tokens_seen": 29665136, + "step": 1843 + }, + { + "epoch": 0.12916896512472967, + "grad_norm": 4.208099365234375, + "learning_rate": 8.709522942206654e-05, + "loss": 1.1878, + "num_input_tokens_seen": 29681520, + "step": 1844 + }, + { + "epoch": 0.1292390133704589, + "grad_norm": 4.145462989807129, + "learning_rate": 8.708823117338004e-05, + "loss": 1.0159, + "num_input_tokens_seen": 29697480, + "step": 1845 + }, + { + "epoch": 0.12930906161618816, + "grad_norm": 3.9043843746185303, + "learning_rate": 8.708123292469353e-05, + "loss": 1.0809, + "num_input_tokens_seen": 29713560, + "step": 1846 + }, + { + "epoch": 0.1293791098619174, + "grad_norm": 4.092489242553711, + "learning_rate": 8.707423467600701e-05, + "loss": 1.0432, + "num_input_tokens_seen": 29729944, + "step": 1847 + }, + { + "epoch": 0.12944915810764665, + "grad_norm": 4.73677396774292, + "learning_rate": 8.70672364273205e-05, + "loss": 1.0276, + "num_input_tokens_seen": 29746328, + "step": 1848 + }, + { + "epoch": 0.12951920635337588, + "grad_norm": 6.134850025177002, + "learning_rate": 8.706023817863398e-05, + "loss": 0.9749, + "num_input_tokens_seen": 29762584, + "step": 1849 + }, + { + "epoch": 0.12958925459910514, + "grad_norm": 3.4841954708099365, + "learning_rate": 8.705323992994746e-05, + "loss": 0.9534, + "num_input_tokens_seen": 29778968, + "step": 1850 + }, + { + "epoch": 0.12965930284483437, + "grad_norm": 3.8816237449645996, + "learning_rate": 8.704624168126095e-05, + "loss": 0.7471, + "num_input_tokens_seen": 29795352, + "step": 1851 + }, + { + "epoch": 0.12972935109056363, + "grad_norm": 3.596538543701172, + "learning_rate": 8.703924343257444e-05, + "loss": 0.9753, + "num_input_tokens_seen": 29811608, + "step": 1852 + }, + { + "epoch": 0.12979939933629286, + "grad_norm": 3.9403269290924072, + "learning_rate": 8.703224518388793e-05, + "loss": 1.0667, + "num_input_tokens_seen": 29827608, + "step": 1853 + }, + { + "epoch": 0.12986944758202212, + "grad_norm": 4.586714744567871, + "learning_rate": 8.70252469352014e-05, + "loss": 0.9335, + "num_input_tokens_seen": 29843992, + "step": 1854 + }, + { + "epoch": 0.12993949582775136, + "grad_norm": 3.905280590057373, + "learning_rate": 8.701824868651489e-05, + "loss": 0.9115, + "num_input_tokens_seen": 29860376, + "step": 1855 + }, + { + "epoch": 0.13000954407348062, + "grad_norm": 4.974122524261475, + "learning_rate": 8.701125043782838e-05, + "loss": 0.9887, + "num_input_tokens_seen": 29875880, + "step": 1856 + }, + { + "epoch": 0.13007959231920985, + "grad_norm": 4.33966064453125, + "learning_rate": 8.700425218914185e-05, + "loss": 1.1955, + "num_input_tokens_seen": 29891088, + "step": 1857 + }, + { + "epoch": 0.1301496405649391, + "grad_norm": 4.593107223510742, + "learning_rate": 8.699725394045534e-05, + "loss": 0.9012, + "num_input_tokens_seen": 29907472, + "step": 1858 + }, + { + "epoch": 0.13021968881066834, + "grad_norm": 4.036941051483154, + "learning_rate": 8.699025569176884e-05, + "loss": 1.048, + "num_input_tokens_seen": 29923856, + "step": 1859 + }, + { + "epoch": 0.1302897370563976, + "grad_norm": 3.887981653213501, + "learning_rate": 8.698325744308232e-05, + "loss": 1.2116, + "num_input_tokens_seen": 29939872, + "step": 1860 + }, + { + "epoch": 0.13035978530212683, + "grad_norm": 3.796053886413574, + "learning_rate": 8.697625919439581e-05, + "loss": 1.1678, + "num_input_tokens_seen": 29955928, + "step": 1861 + }, + { + "epoch": 0.1304298335478561, + "grad_norm": 4.5357184410095215, + "learning_rate": 8.696926094570928e-05, + "loss": 0.9246, + "num_input_tokens_seen": 29970760, + "step": 1862 + }, + { + "epoch": 0.13049988179358532, + "grad_norm": 5.54911994934082, + "learning_rate": 8.696226269702277e-05, + "loss": 1.1874, + "num_input_tokens_seen": 29986408, + "step": 1863 + }, + { + "epoch": 0.13056993003931458, + "grad_norm": 3.6517300605773926, + "learning_rate": 8.695526444833625e-05, + "loss": 1.0949, + "num_input_tokens_seen": 30002792, + "step": 1864 + }, + { + "epoch": 0.1306399782850438, + "grad_norm": 3.6885063648223877, + "learning_rate": 8.694826619964975e-05, + "loss": 1.0027, + "num_input_tokens_seen": 30019176, + "step": 1865 + }, + { + "epoch": 0.13071002653077307, + "grad_norm": 4.417117118835449, + "learning_rate": 8.694126795096324e-05, + "loss": 1.1017, + "num_input_tokens_seen": 30034856, + "step": 1866 + }, + { + "epoch": 0.1307800747765023, + "grad_norm": 4.070515155792236, + "learning_rate": 8.693426970227671e-05, + "loss": 1.0393, + "num_input_tokens_seen": 30051240, + "step": 1867 + }, + { + "epoch": 0.13085012302223156, + "grad_norm": 4.135226726531982, + "learning_rate": 8.69272714535902e-05, + "loss": 1.0886, + "num_input_tokens_seen": 30067392, + "step": 1868 + }, + { + "epoch": 0.1309201712679608, + "grad_norm": 4.304529666900635, + "learning_rate": 8.692027320490369e-05, + "loss": 1.0851, + "num_input_tokens_seen": 30083640, + "step": 1869 + }, + { + "epoch": 0.13099021951369005, + "grad_norm": 4.633643627166748, + "learning_rate": 8.691327495621716e-05, + "loss": 1.1934, + "num_input_tokens_seen": 30099968, + "step": 1870 + }, + { + "epoch": 0.1310602677594193, + "grad_norm": 3.6481478214263916, + "learning_rate": 8.690627670753065e-05, + "loss": 1.0661, + "num_input_tokens_seen": 30116352, + "step": 1871 + }, + { + "epoch": 0.13113031600514855, + "grad_norm": 4.15482234954834, + "learning_rate": 8.689927845884414e-05, + "loss": 1.1083, + "num_input_tokens_seen": 30132256, + "step": 1872 + }, + { + "epoch": 0.1312003642508778, + "grad_norm": 3.6562340259552, + "learning_rate": 8.689228021015763e-05, + "loss": 0.9322, + "num_input_tokens_seen": 30147520, + "step": 1873 + }, + { + "epoch": 0.13127041249660704, + "grad_norm": 5.323586463928223, + "learning_rate": 8.68852819614711e-05, + "loss": 1.4077, + "num_input_tokens_seen": 30163880, + "step": 1874 + }, + { + "epoch": 0.1313404607423363, + "grad_norm": 4.068235397338867, + "learning_rate": 8.687828371278459e-05, + "loss": 1.144, + "num_input_tokens_seen": 30180264, + "step": 1875 + }, + { + "epoch": 0.13141050898806553, + "grad_norm": 3.743837594985962, + "learning_rate": 8.687128546409808e-05, + "loss": 0.9754, + "num_input_tokens_seen": 30196520, + "step": 1876 + }, + { + "epoch": 0.1314805572337948, + "grad_norm": 4.344557285308838, + "learning_rate": 8.686428721541156e-05, + "loss": 1.2745, + "num_input_tokens_seen": 30212904, + "step": 1877 + }, + { + "epoch": 0.13155060547952402, + "grad_norm": 4.048375129699707, + "learning_rate": 8.685728896672505e-05, + "loss": 1.1916, + "num_input_tokens_seen": 30228464, + "step": 1878 + }, + { + "epoch": 0.13162065372525328, + "grad_norm": 3.893768548965454, + "learning_rate": 8.685029071803853e-05, + "loss": 1.1462, + "num_input_tokens_seen": 30244848, + "step": 1879 + }, + { + "epoch": 0.1316907019709825, + "grad_norm": 4.469354629516602, + "learning_rate": 8.684329246935202e-05, + "loss": 1.0267, + "num_input_tokens_seen": 30260744, + "step": 1880 + }, + { + "epoch": 0.13176075021671177, + "grad_norm": 3.8471877574920654, + "learning_rate": 8.68362942206655e-05, + "loss": 0.8467, + "num_input_tokens_seen": 30277128, + "step": 1881 + }, + { + "epoch": 0.131830798462441, + "grad_norm": 4.37143611907959, + "learning_rate": 8.682929597197899e-05, + "loss": 0.9103, + "num_input_tokens_seen": 30293184, + "step": 1882 + }, + { + "epoch": 0.13190084670817026, + "grad_norm": 4.4709601402282715, + "learning_rate": 8.682229772329247e-05, + "loss": 0.9975, + "num_input_tokens_seen": 30309568, + "step": 1883 + }, + { + "epoch": 0.1319708949538995, + "grad_norm": 4.016445159912109, + "learning_rate": 8.681529947460595e-05, + "loss": 1.1499, + "num_input_tokens_seen": 30325952, + "step": 1884 + }, + { + "epoch": 0.13204094319962875, + "grad_norm": 3.6610453128814697, + "learning_rate": 8.680830122591945e-05, + "loss": 1.1407, + "num_input_tokens_seen": 30341608, + "step": 1885 + }, + { + "epoch": 0.13211099144535798, + "grad_norm": 4.226510524749756, + "learning_rate": 8.680130297723294e-05, + "loss": 0.8327, + "num_input_tokens_seen": 30357992, + "step": 1886 + }, + { + "epoch": 0.13218103969108724, + "grad_norm": 4.135020732879639, + "learning_rate": 8.679430472854642e-05, + "loss": 1.0807, + "num_input_tokens_seen": 30373464, + "step": 1887 + }, + { + "epoch": 0.13225108793681647, + "grad_norm": 3.858785629272461, + "learning_rate": 8.67873064798599e-05, + "loss": 0.9305, + "num_input_tokens_seen": 30389336, + "step": 1888 + }, + { + "epoch": 0.13232113618254573, + "grad_norm": 3.5424365997314453, + "learning_rate": 8.678030823117338e-05, + "loss": 1.0885, + "num_input_tokens_seen": 30405720, + "step": 1889 + }, + { + "epoch": 0.13239118442827497, + "grad_norm": 4.177000522613525, + "learning_rate": 8.677330998248687e-05, + "loss": 1.2172, + "num_input_tokens_seen": 30422104, + "step": 1890 + }, + { + "epoch": 0.13246123267400423, + "grad_norm": 4.08710241317749, + "learning_rate": 8.676631173380036e-05, + "loss": 1.0063, + "num_input_tokens_seen": 30437560, + "step": 1891 + }, + { + "epoch": 0.13253128091973346, + "grad_norm": 3.889277219772339, + "learning_rate": 8.675931348511384e-05, + "loss": 1.0227, + "num_input_tokens_seen": 30453944, + "step": 1892 + }, + { + "epoch": 0.13260132916546272, + "grad_norm": 3.7967042922973633, + "learning_rate": 8.675231523642733e-05, + "loss": 0.8988, + "num_input_tokens_seen": 30469480, + "step": 1893 + }, + { + "epoch": 0.13267137741119195, + "grad_norm": 4.2189202308654785, + "learning_rate": 8.674531698774081e-05, + "loss": 1.0591, + "num_input_tokens_seen": 30485536, + "step": 1894 + }, + { + "epoch": 0.1327414256569212, + "grad_norm": 4.682656764984131, + "learning_rate": 8.67383187390543e-05, + "loss": 1.2001, + "num_input_tokens_seen": 30501720, + "step": 1895 + }, + { + "epoch": 0.13281147390265044, + "grad_norm": 4.151151657104492, + "learning_rate": 8.673132049036779e-05, + "loss": 1.027, + "num_input_tokens_seen": 30518104, + "step": 1896 + }, + { + "epoch": 0.1328815221483797, + "grad_norm": 3.700916290283203, + "learning_rate": 8.672432224168126e-05, + "loss": 1.0545, + "num_input_tokens_seen": 30534488, + "step": 1897 + }, + { + "epoch": 0.13295157039410893, + "grad_norm": 3.512343406677246, + "learning_rate": 8.671732399299475e-05, + "loss": 1.0569, + "num_input_tokens_seen": 30550872, + "step": 1898 + }, + { + "epoch": 0.1330216186398382, + "grad_norm": 3.5579488277435303, + "learning_rate": 8.671032574430824e-05, + "loss": 0.9725, + "num_input_tokens_seen": 30567256, + "step": 1899 + }, + { + "epoch": 0.13309166688556742, + "grad_norm": 3.7006070613861084, + "learning_rate": 8.670332749562173e-05, + "loss": 0.9628, + "num_input_tokens_seen": 30582784, + "step": 1900 + }, + { + "epoch": 0.13316171513129668, + "grad_norm": 4.373071670532227, + "learning_rate": 8.66963292469352e-05, + "loss": 1.2223, + "num_input_tokens_seen": 30599168, + "step": 1901 + }, + { + "epoch": 0.1332317633770259, + "grad_norm": 4.459958076477051, + "learning_rate": 8.668933099824869e-05, + "loss": 1.2149, + "num_input_tokens_seen": 30615552, + "step": 1902 + }, + { + "epoch": 0.13330181162275517, + "grad_norm": 4.919619560241699, + "learning_rate": 8.668233274956218e-05, + "loss": 1.069, + "num_input_tokens_seen": 30631936, + "step": 1903 + }, + { + "epoch": 0.1333718598684844, + "grad_norm": 3.709568977355957, + "learning_rate": 8.667533450087565e-05, + "loss": 0.9867, + "num_input_tokens_seen": 30648320, + "step": 1904 + }, + { + "epoch": 0.13344190811421366, + "grad_norm": 4.097365379333496, + "learning_rate": 8.666833625218916e-05, + "loss": 1.2128, + "num_input_tokens_seen": 30664704, + "step": 1905 + }, + { + "epoch": 0.13351195635994292, + "grad_norm": 4.702358722686768, + "learning_rate": 8.666133800350263e-05, + "loss": 1.2809, + "num_input_tokens_seen": 30681088, + "step": 1906 + }, + { + "epoch": 0.13358200460567216, + "grad_norm": 3.7732086181640625, + "learning_rate": 8.665433975481612e-05, + "loss": 1.1529, + "num_input_tokens_seen": 30697472, + "step": 1907 + }, + { + "epoch": 0.13365205285140142, + "grad_norm": 5.318485260009766, + "learning_rate": 8.66473415061296e-05, + "loss": 1.0414, + "num_input_tokens_seen": 30712336, + "step": 1908 + }, + { + "epoch": 0.13372210109713065, + "grad_norm": 4.364311695098877, + "learning_rate": 8.664034325744308e-05, + "loss": 1.0634, + "num_input_tokens_seen": 30728600, + "step": 1909 + }, + { + "epoch": 0.1337921493428599, + "grad_norm": 4.860876083374023, + "learning_rate": 8.663334500875657e-05, + "loss": 1.0945, + "num_input_tokens_seen": 30744832, + "step": 1910 + }, + { + "epoch": 0.13386219758858914, + "grad_norm": 4.455454349517822, + "learning_rate": 8.662634676007006e-05, + "loss": 1.1765, + "num_input_tokens_seen": 30761216, + "step": 1911 + }, + { + "epoch": 0.1339322458343184, + "grad_norm": 4.70845365524292, + "learning_rate": 8.661934851138355e-05, + "loss": 1.2774, + "num_input_tokens_seen": 30776600, + "step": 1912 + }, + { + "epoch": 0.13400229408004763, + "grad_norm": 3.9769747257232666, + "learning_rate": 8.661235026269704e-05, + "loss": 1.006, + "num_input_tokens_seen": 30792632, + "step": 1913 + }, + { + "epoch": 0.1340723423257769, + "grad_norm": 4.387015342712402, + "learning_rate": 8.660535201401051e-05, + "loss": 1.1839, + "num_input_tokens_seen": 30809016, + "step": 1914 + }, + { + "epoch": 0.13414239057150612, + "grad_norm": 4.786890506744385, + "learning_rate": 8.6598353765324e-05, + "loss": 1.2352, + "num_input_tokens_seen": 30825136, + "step": 1915 + }, + { + "epoch": 0.13421243881723538, + "grad_norm": 3.502570629119873, + "learning_rate": 8.659135551663748e-05, + "loss": 1.0175, + "num_input_tokens_seen": 30841472, + "step": 1916 + }, + { + "epoch": 0.1342824870629646, + "grad_norm": 4.2404913902282715, + "learning_rate": 8.658435726795096e-05, + "loss": 1.1882, + "num_input_tokens_seen": 30857856, + "step": 1917 + }, + { + "epoch": 0.13435253530869387, + "grad_norm": 4.230425834655762, + "learning_rate": 8.657735901926445e-05, + "loss": 1.098, + "num_input_tokens_seen": 30874240, + "step": 1918 + }, + { + "epoch": 0.1344225835544231, + "grad_norm": 3.9034597873687744, + "learning_rate": 8.657036077057794e-05, + "loss": 1.0441, + "num_input_tokens_seen": 30890560, + "step": 1919 + }, + { + "epoch": 0.13449263180015236, + "grad_norm": 3.829190492630005, + "learning_rate": 8.656336252189143e-05, + "loss": 1.0675, + "num_input_tokens_seen": 30906480, + "step": 1920 + }, + { + "epoch": 0.1345626800458816, + "grad_norm": 3.9801993370056152, + "learning_rate": 8.65563642732049e-05, + "loss": 1.0407, + "num_input_tokens_seen": 30922160, + "step": 1921 + }, + { + "epoch": 0.13463272829161085, + "grad_norm": 5.018815994262695, + "learning_rate": 8.65493660245184e-05, + "loss": 1.1155, + "num_input_tokens_seen": 30938544, + "step": 1922 + }, + { + "epoch": 0.13470277653734009, + "grad_norm": 3.6515283584594727, + "learning_rate": 8.654236777583188e-05, + "loss": 1.0436, + "num_input_tokens_seen": 30954088, + "step": 1923 + }, + { + "epoch": 0.13477282478306934, + "grad_norm": 4.440131664276123, + "learning_rate": 8.653536952714536e-05, + "loss": 1.002, + "num_input_tokens_seen": 30970472, + "step": 1924 + }, + { + "epoch": 0.13484287302879858, + "grad_norm": 5.27577543258667, + "learning_rate": 8.652837127845885e-05, + "loss": 1.0783, + "num_input_tokens_seen": 30985544, + "step": 1925 + }, + { + "epoch": 0.13491292127452784, + "grad_norm": 4.632978916168213, + "learning_rate": 8.652137302977233e-05, + "loss": 1.1539, + "num_input_tokens_seen": 31001928, + "step": 1926 + }, + { + "epoch": 0.13498296952025707, + "grad_norm": 3.9239861965179443, + "learning_rate": 8.651437478108582e-05, + "loss": 1.0231, + "num_input_tokens_seen": 31018312, + "step": 1927 + }, + { + "epoch": 0.13505301776598633, + "grad_norm": 4.819107532501221, + "learning_rate": 8.65073765323993e-05, + "loss": 1.1631, + "num_input_tokens_seen": 31033568, + "step": 1928 + }, + { + "epoch": 0.13512306601171556, + "grad_norm": 3.5287766456604004, + "learning_rate": 8.650037828371279e-05, + "loss": 1.0172, + "num_input_tokens_seen": 31049952, + "step": 1929 + }, + { + "epoch": 0.13519311425744482, + "grad_norm": 3.536736488342285, + "learning_rate": 8.649338003502628e-05, + "loss": 0.9576, + "num_input_tokens_seen": 31066336, + "step": 1930 + }, + { + "epoch": 0.13526316250317405, + "grad_norm": 5.148278713226318, + "learning_rate": 8.648638178633976e-05, + "loss": 1.2137, + "num_input_tokens_seen": 31082136, + "step": 1931 + }, + { + "epoch": 0.1353332107489033, + "grad_norm": 4.076564788818359, + "learning_rate": 8.647938353765325e-05, + "loss": 1.081, + "num_input_tokens_seen": 31098520, + "step": 1932 + }, + { + "epoch": 0.13540325899463254, + "grad_norm": 4.747740745544434, + "learning_rate": 8.647238528896673e-05, + "loss": 1.1989, + "num_input_tokens_seen": 31114560, + "step": 1933 + }, + { + "epoch": 0.1354733072403618, + "grad_norm": 3.662280797958374, + "learning_rate": 8.646538704028022e-05, + "loss": 1.0797, + "num_input_tokens_seen": 31130944, + "step": 1934 + }, + { + "epoch": 0.13554335548609103, + "grad_norm": 3.8747782707214355, + "learning_rate": 8.645838879159369e-05, + "loss": 0.9258, + "num_input_tokens_seen": 31146544, + "step": 1935 + }, + { + "epoch": 0.1356134037318203, + "grad_norm": 3.465095281600952, + "learning_rate": 8.645139054290718e-05, + "loss": 1.0582, + "num_input_tokens_seen": 31162928, + "step": 1936 + }, + { + "epoch": 0.13568345197754952, + "grad_norm": 4.640190124511719, + "learning_rate": 8.644439229422067e-05, + "loss": 1.1265, + "num_input_tokens_seen": 31177712, + "step": 1937 + }, + { + "epoch": 0.13575350022327878, + "grad_norm": 3.88620924949646, + "learning_rate": 8.643739404553416e-05, + "loss": 1.0244, + "num_input_tokens_seen": 31193640, + "step": 1938 + }, + { + "epoch": 0.13582354846900804, + "grad_norm": 3.657331705093384, + "learning_rate": 8.643039579684765e-05, + "loss": 0.9715, + "num_input_tokens_seen": 31209112, + "step": 1939 + }, + { + "epoch": 0.13589359671473727, + "grad_norm": 6.8866448402404785, + "learning_rate": 8.642339754816113e-05, + "loss": 0.9734, + "num_input_tokens_seen": 31223968, + "step": 1940 + }, + { + "epoch": 0.13596364496046653, + "grad_norm": 5.0794172286987305, + "learning_rate": 8.641639929947461e-05, + "loss": 1.1988, + "num_input_tokens_seen": 31240352, + "step": 1941 + }, + { + "epoch": 0.13603369320619577, + "grad_norm": 4.631995677947998, + "learning_rate": 8.64094010507881e-05, + "loss": 1.1814, + "num_input_tokens_seen": 31256736, + "step": 1942 + }, + { + "epoch": 0.13610374145192503, + "grad_norm": 5.566014766693115, + "learning_rate": 8.640240280210157e-05, + "loss": 1.1769, + "num_input_tokens_seen": 31273120, + "step": 1943 + }, + { + "epoch": 0.13617378969765426, + "grad_norm": 3.940988302230835, + "learning_rate": 8.639540455341506e-05, + "loss": 1.0196, + "num_input_tokens_seen": 31289504, + "step": 1944 + }, + { + "epoch": 0.13624383794338352, + "grad_norm": 3.9979453086853027, + "learning_rate": 8.638840630472855e-05, + "loss": 1.0467, + "num_input_tokens_seen": 31305888, + "step": 1945 + }, + { + "epoch": 0.13631388618911275, + "grad_norm": 5.303500175476074, + "learning_rate": 8.638140805604204e-05, + "loss": 1.0938, + "num_input_tokens_seen": 31321512, + "step": 1946 + }, + { + "epoch": 0.136383934434842, + "grad_norm": 4.6745429039001465, + "learning_rate": 8.637440980735553e-05, + "loss": 1.3665, + "num_input_tokens_seen": 31337896, + "step": 1947 + }, + { + "epoch": 0.13645398268057124, + "grad_norm": 4.203839302062988, + "learning_rate": 8.6367411558669e-05, + "loss": 0.8949, + "num_input_tokens_seen": 31354176, + "step": 1948 + }, + { + "epoch": 0.1365240309263005, + "grad_norm": 4.802511215209961, + "learning_rate": 8.636041330998249e-05, + "loss": 1.2427, + "num_input_tokens_seen": 31369976, + "step": 1949 + }, + { + "epoch": 0.13659407917202973, + "grad_norm": 4.077885627746582, + "learning_rate": 8.635341506129598e-05, + "loss": 1.1259, + "num_input_tokens_seen": 31386360, + "step": 1950 + }, + { + "epoch": 0.136664127417759, + "grad_norm": 5.009285926818848, + "learning_rate": 8.634641681260947e-05, + "loss": 1.0278, + "num_input_tokens_seen": 31402744, + "step": 1951 + }, + { + "epoch": 0.13673417566348822, + "grad_norm": 3.539872646331787, + "learning_rate": 8.633941856392294e-05, + "loss": 1.0522, + "num_input_tokens_seen": 31419128, + "step": 1952 + }, + { + "epoch": 0.13680422390921748, + "grad_norm": 4.664520740509033, + "learning_rate": 8.633242031523643e-05, + "loss": 1.1559, + "num_input_tokens_seen": 31435400, + "step": 1953 + }, + { + "epoch": 0.1368742721549467, + "grad_norm": 3.8469269275665283, + "learning_rate": 8.632542206654992e-05, + "loss": 1.1237, + "num_input_tokens_seen": 31451408, + "step": 1954 + }, + { + "epoch": 0.13694432040067597, + "grad_norm": 4.064670085906982, + "learning_rate": 8.63184238178634e-05, + "loss": 0.8825, + "num_input_tokens_seen": 31467504, + "step": 1955 + }, + { + "epoch": 0.1370143686464052, + "grad_norm": 3.9931817054748535, + "learning_rate": 8.631142556917688e-05, + "loss": 1.17, + "num_input_tokens_seen": 31483528, + "step": 1956 + }, + { + "epoch": 0.13708441689213446, + "grad_norm": 4.136581897735596, + "learning_rate": 8.630442732049037e-05, + "loss": 1.069, + "num_input_tokens_seen": 31499912, + "step": 1957 + }, + { + "epoch": 0.1371544651378637, + "grad_norm": 3.7189536094665527, + "learning_rate": 8.629742907180386e-05, + "loss": 1.0509, + "num_input_tokens_seen": 31515560, + "step": 1958 + }, + { + "epoch": 0.13722451338359296, + "grad_norm": 3.7821719646453857, + "learning_rate": 8.629043082311735e-05, + "loss": 1.0583, + "num_input_tokens_seen": 31531944, + "step": 1959 + }, + { + "epoch": 0.1372945616293222, + "grad_norm": 6.815886497497559, + "learning_rate": 8.628343257443082e-05, + "loss": 0.9118, + "num_input_tokens_seen": 31548248, + "step": 1960 + }, + { + "epoch": 0.13736460987505145, + "grad_norm": 7.490451812744141, + "learning_rate": 8.627643432574431e-05, + "loss": 1.1145, + "num_input_tokens_seen": 31562560, + "step": 1961 + }, + { + "epoch": 0.13743465812078068, + "grad_norm": 4.918768405914307, + "learning_rate": 8.626943607705779e-05, + "loss": 1.2198, + "num_input_tokens_seen": 31578944, + "step": 1962 + }, + { + "epoch": 0.13750470636650994, + "grad_norm": 5.567696571350098, + "learning_rate": 8.626243782837128e-05, + "loss": 1.1083, + "num_input_tokens_seen": 31594312, + "step": 1963 + }, + { + "epoch": 0.13757475461223917, + "grad_norm": 4.24015474319458, + "learning_rate": 8.625543957968477e-05, + "loss": 1.1807, + "num_input_tokens_seen": 31609656, + "step": 1964 + }, + { + "epoch": 0.13764480285796843, + "grad_norm": 5.664759635925293, + "learning_rate": 8.624844133099825e-05, + "loss": 1.1775, + "num_input_tokens_seen": 31626040, + "step": 1965 + }, + { + "epoch": 0.13771485110369766, + "grad_norm": 3.7281267642974854, + "learning_rate": 8.624144308231174e-05, + "loss": 1.0994, + "num_input_tokens_seen": 31642424, + "step": 1966 + }, + { + "epoch": 0.13778489934942692, + "grad_norm": 4.112753391265869, + "learning_rate": 8.623444483362523e-05, + "loss": 1.2113, + "num_input_tokens_seen": 31658808, + "step": 1967 + }, + { + "epoch": 0.13785494759515615, + "grad_norm": 3.8851754665374756, + "learning_rate": 8.62274465849387e-05, + "loss": 1.0596, + "num_input_tokens_seen": 31675192, + "step": 1968 + }, + { + "epoch": 0.1379249958408854, + "grad_norm": 4.161825656890869, + "learning_rate": 8.62204483362522e-05, + "loss": 1.03, + "num_input_tokens_seen": 31691576, + "step": 1969 + }, + { + "epoch": 0.13799504408661464, + "grad_norm": 4.802804470062256, + "learning_rate": 8.621345008756567e-05, + "loss": 1.4374, + "num_input_tokens_seen": 31707960, + "step": 1970 + }, + { + "epoch": 0.1380650923323439, + "grad_norm": 3.752012252807617, + "learning_rate": 8.620645183887917e-05, + "loss": 1.124, + "num_input_tokens_seen": 31724344, + "step": 1971 + }, + { + "epoch": 0.13813514057807313, + "grad_norm": 3.8039815425872803, + "learning_rate": 8.619945359019265e-05, + "loss": 1.0051, + "num_input_tokens_seen": 31740456, + "step": 1972 + }, + { + "epoch": 0.1382051888238024, + "grad_norm": 4.029634952545166, + "learning_rate": 8.619245534150614e-05, + "loss": 1.2221, + "num_input_tokens_seen": 31756776, + "step": 1973 + }, + { + "epoch": 0.13827523706953165, + "grad_norm": 5.531665802001953, + "learning_rate": 8.618545709281962e-05, + "loss": 1.1534, + "num_input_tokens_seen": 31772480, + "step": 1974 + }, + { + "epoch": 0.13834528531526089, + "grad_norm": 4.6494646072387695, + "learning_rate": 8.61784588441331e-05, + "loss": 0.9723, + "num_input_tokens_seen": 31788504, + "step": 1975 + }, + { + "epoch": 0.13841533356099014, + "grad_norm": 4.201340675354004, + "learning_rate": 8.617146059544659e-05, + "loss": 1.0648, + "num_input_tokens_seen": 31804888, + "step": 1976 + }, + { + "epoch": 0.13848538180671938, + "grad_norm": 4.272038459777832, + "learning_rate": 8.616446234676008e-05, + "loss": 1.2557, + "num_input_tokens_seen": 31821272, + "step": 1977 + }, + { + "epoch": 0.13855543005244864, + "grad_norm": 3.729841947555542, + "learning_rate": 8.615746409807357e-05, + "loss": 1.0346, + "num_input_tokens_seen": 31837656, + "step": 1978 + }, + { + "epoch": 0.13862547829817787, + "grad_norm": 3.5615944862365723, + "learning_rate": 8.615046584938704e-05, + "loss": 0.9986, + "num_input_tokens_seen": 31854040, + "step": 1979 + }, + { + "epoch": 0.13869552654390713, + "grad_norm": 3.7658376693725586, + "learning_rate": 8.614346760070053e-05, + "loss": 1.3268, + "num_input_tokens_seen": 31870424, + "step": 1980 + }, + { + "epoch": 0.13876557478963636, + "grad_norm": 4.124275207519531, + "learning_rate": 8.613646935201402e-05, + "loss": 1.2736, + "num_input_tokens_seen": 31886808, + "step": 1981 + }, + { + "epoch": 0.13883562303536562, + "grad_norm": 5.348685264587402, + "learning_rate": 8.612947110332749e-05, + "loss": 1.0492, + "num_input_tokens_seen": 31902880, + "step": 1982 + }, + { + "epoch": 0.13890567128109485, + "grad_norm": 5.311651706695557, + "learning_rate": 8.612247285464098e-05, + "loss": 1.2034, + "num_input_tokens_seen": 31918704, + "step": 1983 + }, + { + "epoch": 0.1389757195268241, + "grad_norm": 4.194555759429932, + "learning_rate": 8.611547460595447e-05, + "loss": 1.2802, + "num_input_tokens_seen": 31935088, + "step": 1984 + }, + { + "epoch": 0.13904576777255334, + "grad_norm": 3.6576390266418457, + "learning_rate": 8.610847635726796e-05, + "loss": 1.0618, + "num_input_tokens_seen": 31951472, + "step": 1985 + }, + { + "epoch": 0.1391158160182826, + "grad_norm": 4.169801235198975, + "learning_rate": 8.610147810858145e-05, + "loss": 1.1668, + "num_input_tokens_seen": 31967856, + "step": 1986 + }, + { + "epoch": 0.13918586426401183, + "grad_norm": 3.79791259765625, + "learning_rate": 8.609447985989492e-05, + "loss": 1.2546, + "num_input_tokens_seen": 31984232, + "step": 1987 + }, + { + "epoch": 0.1392559125097411, + "grad_norm": 3.726701021194458, + "learning_rate": 8.608748161120841e-05, + "loss": 1.177, + "num_input_tokens_seen": 31999720, + "step": 1988 + }, + { + "epoch": 0.13932596075547032, + "grad_norm": 3.7376129627227783, + "learning_rate": 8.608048336252189e-05, + "loss": 1.0174, + "num_input_tokens_seen": 32016104, + "step": 1989 + }, + { + "epoch": 0.13939600900119958, + "grad_norm": 4.290423393249512, + "learning_rate": 8.607348511383537e-05, + "loss": 1.1556, + "num_input_tokens_seen": 32031992, + "step": 1990 + }, + { + "epoch": 0.13946605724692882, + "grad_norm": 3.592384099960327, + "learning_rate": 8.606648686514888e-05, + "loss": 1.0629, + "num_input_tokens_seen": 32047904, + "step": 1991 + }, + { + "epoch": 0.13953610549265807, + "grad_norm": 3.753692626953125, + "learning_rate": 8.605948861646235e-05, + "loss": 1.0111, + "num_input_tokens_seen": 32063720, + "step": 1992 + }, + { + "epoch": 0.1396061537383873, + "grad_norm": 4.698465347290039, + "learning_rate": 8.605249036777584e-05, + "loss": 1.0255, + "num_input_tokens_seen": 32079768, + "step": 1993 + }, + { + "epoch": 0.13967620198411657, + "grad_norm": 4.187407970428467, + "learning_rate": 8.604549211908933e-05, + "loss": 0.9006, + "num_input_tokens_seen": 32095120, + "step": 1994 + }, + { + "epoch": 0.1397462502298458, + "grad_norm": 4.256275653839111, + "learning_rate": 8.60384938704028e-05, + "loss": 1.1607, + "num_input_tokens_seen": 32111408, + "step": 1995 + }, + { + "epoch": 0.13981629847557506, + "grad_norm": 6.693331241607666, + "learning_rate": 8.603149562171629e-05, + "loss": 1.2317, + "num_input_tokens_seen": 32127792, + "step": 1996 + }, + { + "epoch": 0.1398863467213043, + "grad_norm": 3.69393253326416, + "learning_rate": 8.602449737302978e-05, + "loss": 0.9747, + "num_input_tokens_seen": 32143792, + "step": 1997 + }, + { + "epoch": 0.13995639496703355, + "grad_norm": 4.117836952209473, + "learning_rate": 8.601749912434327e-05, + "loss": 1.0732, + "num_input_tokens_seen": 32158624, + "step": 1998 + }, + { + "epoch": 0.14002644321276278, + "grad_norm": 5.14541482925415, + "learning_rate": 8.601050087565674e-05, + "loss": 1.1787, + "num_input_tokens_seen": 32175008, + "step": 1999 + }, + { + "epoch": 0.14009649145849204, + "grad_norm": 4.0103535652160645, + "learning_rate": 8.600350262697023e-05, + "loss": 1.1308, + "num_input_tokens_seen": 32191392, + "step": 2000 + }, + { + "epoch": 0.14009649145849204, + "eval_loss": 1.1461617946624756, + "eval_runtime": 0.1945, + "eval_samples_per_second": 5.141, + "eval_steps_per_second": 5.141, + "num_input_tokens_seen": 32191392, + "step": 2000 + }, + { + "epoch": 0.14016653970422127, + "grad_norm": 3.8072049617767334, + "learning_rate": 8.599650437828372e-05, + "loss": 0.8617, + "num_input_tokens_seen": 32207712, + "step": 2001 + }, + { + "epoch": 0.14023658794995053, + "grad_norm": 4.034494400024414, + "learning_rate": 8.59895061295972e-05, + "loss": 1.1719, + "num_input_tokens_seen": 32223440, + "step": 2002 + }, + { + "epoch": 0.14030663619567976, + "grad_norm": 3.9485251903533936, + "learning_rate": 8.598250788091069e-05, + "loss": 1.2242, + "num_input_tokens_seen": 32239824, + "step": 2003 + }, + { + "epoch": 0.14037668444140902, + "grad_norm": 5.427109241485596, + "learning_rate": 8.597550963222417e-05, + "loss": 1.1922, + "num_input_tokens_seen": 32255976, + "step": 2004 + }, + { + "epoch": 0.14044673268713825, + "grad_norm": 4.4832000732421875, + "learning_rate": 8.596851138353766e-05, + "loss": 1.2791, + "num_input_tokens_seen": 32272304, + "step": 2005 + }, + { + "epoch": 0.1405167809328675, + "grad_norm": 4.4699859619140625, + "learning_rate": 8.596151313485114e-05, + "loss": 1.0175, + "num_input_tokens_seen": 32288688, + "step": 2006 + }, + { + "epoch": 0.14058682917859674, + "grad_norm": 6.007316589355469, + "learning_rate": 8.595451488616463e-05, + "loss": 1.2402, + "num_input_tokens_seen": 32304992, + "step": 2007 + }, + { + "epoch": 0.140656877424326, + "grad_norm": 5.460748195648193, + "learning_rate": 8.594751663747811e-05, + "loss": 1.2683, + "num_input_tokens_seen": 32320104, + "step": 2008 + }, + { + "epoch": 0.14072692567005526, + "grad_norm": 4.430675029754639, + "learning_rate": 8.594051838879159e-05, + "loss": 0.9664, + "num_input_tokens_seen": 32336040, + "step": 2009 + }, + { + "epoch": 0.1407969739157845, + "grad_norm": 4.469089508056641, + "learning_rate": 8.593352014010508e-05, + "loss": 1.0335, + "num_input_tokens_seen": 32352424, + "step": 2010 + }, + { + "epoch": 0.14086702216151376, + "grad_norm": 4.94099760055542, + "learning_rate": 8.592652189141858e-05, + "loss": 1.1091, + "num_input_tokens_seen": 32367944, + "step": 2011 + }, + { + "epoch": 0.140937070407243, + "grad_norm": 5.430322170257568, + "learning_rate": 8.591952364273206e-05, + "loss": 1.2256, + "num_input_tokens_seen": 32384328, + "step": 2012 + }, + { + "epoch": 0.14100711865297225, + "grad_norm": 3.847569704055786, + "learning_rate": 8.591252539404554e-05, + "loss": 1.019, + "num_input_tokens_seen": 32400712, + "step": 2013 + }, + { + "epoch": 0.14107716689870148, + "grad_norm": 3.7531189918518066, + "learning_rate": 8.590552714535902e-05, + "loss": 0.9409, + "num_input_tokens_seen": 32417096, + "step": 2014 + }, + { + "epoch": 0.14114721514443074, + "grad_norm": 4.070606708526611, + "learning_rate": 8.589852889667251e-05, + "loss": 1.0857, + "num_input_tokens_seen": 32432504, + "step": 2015 + }, + { + "epoch": 0.14121726339015997, + "grad_norm": 4.791952610015869, + "learning_rate": 8.589153064798598e-05, + "loss": 0.8467, + "num_input_tokens_seen": 32448008, + "step": 2016 + }, + { + "epoch": 0.14128731163588923, + "grad_norm": 4.672977924346924, + "learning_rate": 8.588453239929948e-05, + "loss": 1.081, + "num_input_tokens_seen": 32463792, + "step": 2017 + }, + { + "epoch": 0.14135735988161846, + "grad_norm": 6.187239170074463, + "learning_rate": 8.587753415061297e-05, + "loss": 1.0416, + "num_input_tokens_seen": 32480104, + "step": 2018 + }, + { + "epoch": 0.14142740812734772, + "grad_norm": 4.058189392089844, + "learning_rate": 8.587053590192645e-05, + "loss": 1.0598, + "num_input_tokens_seen": 32495824, + "step": 2019 + }, + { + "epoch": 0.14149745637307695, + "grad_norm": 3.862661838531494, + "learning_rate": 8.586353765323994e-05, + "loss": 0.9371, + "num_input_tokens_seen": 32512208, + "step": 2020 + }, + { + "epoch": 0.1415675046188062, + "grad_norm": 3.7348716259002686, + "learning_rate": 8.585653940455343e-05, + "loss": 1.1021, + "num_input_tokens_seen": 32528592, + "step": 2021 + }, + { + "epoch": 0.14163755286453544, + "grad_norm": 4.405923843383789, + "learning_rate": 8.58495411558669e-05, + "loss": 1.1405, + "num_input_tokens_seen": 32544120, + "step": 2022 + }, + { + "epoch": 0.1417076011102647, + "grad_norm": 3.73984694480896, + "learning_rate": 8.584254290718039e-05, + "loss": 1.0797, + "num_input_tokens_seen": 32560504, + "step": 2023 + }, + { + "epoch": 0.14177764935599393, + "grad_norm": 5.73613166809082, + "learning_rate": 8.583554465849388e-05, + "loss": 1.2119, + "num_input_tokens_seen": 32576888, + "step": 2024 + }, + { + "epoch": 0.1418476976017232, + "grad_norm": 6.435116291046143, + "learning_rate": 8.582854640980737e-05, + "loss": 1.3408, + "num_input_tokens_seen": 32591592, + "step": 2025 + }, + { + "epoch": 0.14191774584745243, + "grad_norm": 4.520002365112305, + "learning_rate": 8.582154816112084e-05, + "loss": 1.1654, + "num_input_tokens_seen": 32607448, + "step": 2026 + }, + { + "epoch": 0.14198779409318169, + "grad_norm": 4.01891565322876, + "learning_rate": 8.581454991243433e-05, + "loss": 1.1203, + "num_input_tokens_seen": 32623104, + "step": 2027 + }, + { + "epoch": 0.14205784233891092, + "grad_norm": 3.8237030506134033, + "learning_rate": 8.580755166374782e-05, + "loss": 1.0548, + "num_input_tokens_seen": 32639376, + "step": 2028 + }, + { + "epoch": 0.14212789058464018, + "grad_norm": 4.893499851226807, + "learning_rate": 8.58005534150613e-05, + "loss": 1.1629, + "num_input_tokens_seen": 32654800, + "step": 2029 + }, + { + "epoch": 0.1421979388303694, + "grad_norm": 3.6075315475463867, + "learning_rate": 8.579355516637478e-05, + "loss": 1.0889, + "num_input_tokens_seen": 32671184, + "step": 2030 + }, + { + "epoch": 0.14226798707609867, + "grad_norm": 4.696410179138184, + "learning_rate": 8.578655691768827e-05, + "loss": 1.1777, + "num_input_tokens_seen": 32687360, + "step": 2031 + }, + { + "epoch": 0.1423380353218279, + "grad_norm": 3.9465558528900146, + "learning_rate": 8.577955866900176e-05, + "loss": 1.1378, + "num_input_tokens_seen": 32703744, + "step": 2032 + }, + { + "epoch": 0.14240808356755716, + "grad_norm": 3.933898448944092, + "learning_rate": 8.577256042031523e-05, + "loss": 0.8353, + "num_input_tokens_seen": 32720128, + "step": 2033 + }, + { + "epoch": 0.1424781318132864, + "grad_norm": 3.865894317626953, + "learning_rate": 8.576556217162872e-05, + "loss": 0.9827, + "num_input_tokens_seen": 32735976, + "step": 2034 + }, + { + "epoch": 0.14254818005901565, + "grad_norm": 3.9533474445343018, + "learning_rate": 8.575856392294221e-05, + "loss": 1.1028, + "num_input_tokens_seen": 32752240, + "step": 2035 + }, + { + "epoch": 0.14261822830474488, + "grad_norm": 3.5534164905548096, + "learning_rate": 8.575156567425569e-05, + "loss": 1.0887, + "num_input_tokens_seen": 32768624, + "step": 2036 + }, + { + "epoch": 0.14268827655047414, + "grad_norm": 5.689724922180176, + "learning_rate": 8.574456742556918e-05, + "loss": 1.0588, + "num_input_tokens_seen": 32784600, + "step": 2037 + }, + { + "epoch": 0.14275832479620337, + "grad_norm": 4.010136604309082, + "learning_rate": 8.573756917688268e-05, + "loss": 0.8989, + "num_input_tokens_seen": 32799824, + "step": 2038 + }, + { + "epoch": 0.14282837304193263, + "grad_norm": 4.153547763824463, + "learning_rate": 8.573057092819615e-05, + "loss": 1.162, + "num_input_tokens_seen": 32815744, + "step": 2039 + }, + { + "epoch": 0.14289842128766186, + "grad_norm": 3.976120948791504, + "learning_rate": 8.572357267950964e-05, + "loss": 1.2234, + "num_input_tokens_seen": 32831664, + "step": 2040 + }, + { + "epoch": 0.14296846953339112, + "grad_norm": 3.9593231678009033, + "learning_rate": 8.571657443082312e-05, + "loss": 1.0482, + "num_input_tokens_seen": 32848048, + "step": 2041 + }, + { + "epoch": 0.14303851777912036, + "grad_norm": 3.920823097229004, + "learning_rate": 8.57095761821366e-05, + "loss": 1.1891, + "num_input_tokens_seen": 32863168, + "step": 2042 + }, + { + "epoch": 0.14310856602484961, + "grad_norm": 4.754055976867676, + "learning_rate": 8.57025779334501e-05, + "loss": 1.1123, + "num_input_tokens_seen": 32879552, + "step": 2043 + }, + { + "epoch": 0.14317861427057887, + "grad_norm": 3.6835105419158936, + "learning_rate": 8.569557968476358e-05, + "loss": 1.0919, + "num_input_tokens_seen": 32895864, + "step": 2044 + }, + { + "epoch": 0.1432486625163081, + "grad_norm": 4.115698337554932, + "learning_rate": 8.568858143607707e-05, + "loss": 1.166, + "num_input_tokens_seen": 32912232, + "step": 2045 + }, + { + "epoch": 0.14331871076203737, + "grad_norm": 6.536626815795898, + "learning_rate": 8.568158318739055e-05, + "loss": 1.1534, + "num_input_tokens_seen": 32928616, + "step": 2046 + }, + { + "epoch": 0.1433887590077666, + "grad_norm": 5.43113899230957, + "learning_rate": 8.567458493870403e-05, + "loss": 0.9645, + "num_input_tokens_seen": 32945000, + "step": 2047 + }, + { + "epoch": 0.14345880725349586, + "grad_norm": 3.8677239418029785, + "learning_rate": 8.566758669001752e-05, + "loss": 1.2213, + "num_input_tokens_seen": 32961384, + "step": 2048 + }, + { + "epoch": 0.1435288554992251, + "grad_norm": 6.913444995880127, + "learning_rate": 8.5660588441331e-05, + "loss": 1.2204, + "num_input_tokens_seen": 32977768, + "step": 2049 + }, + { + "epoch": 0.14359890374495435, + "grad_norm": 4.870579719543457, + "learning_rate": 8.565359019264449e-05, + "loss": 1.1022, + "num_input_tokens_seen": 32994152, + "step": 2050 + }, + { + "epoch": 0.14366895199068358, + "grad_norm": 4.057044982910156, + "learning_rate": 8.564659194395797e-05, + "loss": 1.0599, + "num_input_tokens_seen": 33010536, + "step": 2051 + }, + { + "epoch": 0.14373900023641284, + "grad_norm": 8.405828475952148, + "learning_rate": 8.563959369527146e-05, + "loss": 1.0928, + "num_input_tokens_seen": 33025192, + "step": 2052 + }, + { + "epoch": 0.14380904848214207, + "grad_norm": 4.188510894775391, + "learning_rate": 8.563259544658494e-05, + "loss": 1.1207, + "num_input_tokens_seen": 33041576, + "step": 2053 + }, + { + "epoch": 0.14387909672787133, + "grad_norm": 6.505815505981445, + "learning_rate": 8.562559719789843e-05, + "loss": 1.1196, + "num_input_tokens_seen": 33057800, + "step": 2054 + }, + { + "epoch": 0.14394914497360056, + "grad_norm": 4.021209716796875, + "learning_rate": 8.561859894921192e-05, + "loss": 0.953, + "num_input_tokens_seen": 33073872, + "step": 2055 + }, + { + "epoch": 0.14401919321932982, + "grad_norm": 3.924671173095703, + "learning_rate": 8.561160070052539e-05, + "loss": 1.022, + "num_input_tokens_seen": 33090256, + "step": 2056 + }, + { + "epoch": 0.14408924146505905, + "grad_norm": 4.1323418617248535, + "learning_rate": 8.560460245183888e-05, + "loss": 1.0995, + "num_input_tokens_seen": 33106256, + "step": 2057 + }, + { + "epoch": 0.1441592897107883, + "grad_norm": 4.236043930053711, + "learning_rate": 8.559760420315237e-05, + "loss": 1.0842, + "num_input_tokens_seen": 33122352, + "step": 2058 + }, + { + "epoch": 0.14422933795651754, + "grad_norm": 3.4836020469665527, + "learning_rate": 8.559060595446586e-05, + "loss": 1.0136, + "num_input_tokens_seen": 33138736, + "step": 2059 + }, + { + "epoch": 0.1442993862022468, + "grad_norm": 4.363439083099365, + "learning_rate": 8.558360770577933e-05, + "loss": 1.1382, + "num_input_tokens_seen": 33153936, + "step": 2060 + }, + { + "epoch": 0.14436943444797604, + "grad_norm": 5.099925994873047, + "learning_rate": 8.557660945709282e-05, + "loss": 1.0027, + "num_input_tokens_seen": 33170320, + "step": 2061 + }, + { + "epoch": 0.1444394826937053, + "grad_norm": 4.438295364379883, + "learning_rate": 8.556961120840631e-05, + "loss": 1.11, + "num_input_tokens_seen": 33186704, + "step": 2062 + }, + { + "epoch": 0.14450953093943453, + "grad_norm": 3.7912747859954834, + "learning_rate": 8.556261295971978e-05, + "loss": 1.0708, + "num_input_tokens_seen": 33203088, + "step": 2063 + }, + { + "epoch": 0.1445795791851638, + "grad_norm": 4.679794788360596, + "learning_rate": 8.555561471103329e-05, + "loss": 0.9707, + "num_input_tokens_seen": 33218936, + "step": 2064 + }, + { + "epoch": 0.14464962743089302, + "grad_norm": 4.092919826507568, + "learning_rate": 8.554861646234677e-05, + "loss": 1.2103, + "num_input_tokens_seen": 33235320, + "step": 2065 + }, + { + "epoch": 0.14471967567662228, + "grad_norm": 4.13189172744751, + "learning_rate": 8.554161821366025e-05, + "loss": 0.9919, + "num_input_tokens_seen": 33251704, + "step": 2066 + }, + { + "epoch": 0.1447897239223515, + "grad_norm": 3.618739366531372, + "learning_rate": 8.553461996497374e-05, + "loss": 1.0026, + "num_input_tokens_seen": 33268088, + "step": 2067 + }, + { + "epoch": 0.14485977216808077, + "grad_norm": 4.197813034057617, + "learning_rate": 8.552762171628721e-05, + "loss": 1.3134, + "num_input_tokens_seen": 33284472, + "step": 2068 + }, + { + "epoch": 0.14492982041381, + "grad_norm": 4.159245491027832, + "learning_rate": 8.55206234676007e-05, + "loss": 0.9579, + "num_input_tokens_seen": 33300560, + "step": 2069 + }, + { + "epoch": 0.14499986865953926, + "grad_norm": 3.970898389816284, + "learning_rate": 8.551362521891419e-05, + "loss": 1.0587, + "num_input_tokens_seen": 33316744, + "step": 2070 + }, + { + "epoch": 0.1450699169052685, + "grad_norm": 5.635775089263916, + "learning_rate": 8.550662697022768e-05, + "loss": 1.2284, + "num_input_tokens_seen": 33333128, + "step": 2071 + }, + { + "epoch": 0.14513996515099775, + "grad_norm": 5.239542007446289, + "learning_rate": 8.549962872154117e-05, + "loss": 1.166, + "num_input_tokens_seen": 33348392, + "step": 2072 + }, + { + "epoch": 0.14521001339672698, + "grad_norm": 3.8646957874298096, + "learning_rate": 8.549263047285464e-05, + "loss": 1.0343, + "num_input_tokens_seen": 33364504, + "step": 2073 + }, + { + "epoch": 0.14528006164245624, + "grad_norm": 4.49400520324707, + "learning_rate": 8.548563222416813e-05, + "loss": 0.9953, + "num_input_tokens_seen": 33379680, + "step": 2074 + }, + { + "epoch": 0.14535010988818547, + "grad_norm": 3.782107353210449, + "learning_rate": 8.547863397548162e-05, + "loss": 1.1396, + "num_input_tokens_seen": 33396064, + "step": 2075 + }, + { + "epoch": 0.14542015813391473, + "grad_norm": 3.8171703815460205, + "learning_rate": 8.54716357267951e-05, + "loss": 1.1364, + "num_input_tokens_seen": 33411640, + "step": 2076 + }, + { + "epoch": 0.14549020637964397, + "grad_norm": 3.56487774848938, + "learning_rate": 8.546463747810858e-05, + "loss": 1.0396, + "num_input_tokens_seen": 33428024, + "step": 2077 + }, + { + "epoch": 0.14556025462537323, + "grad_norm": 5.169209003448486, + "learning_rate": 8.545763922942207e-05, + "loss": 1.1262, + "num_input_tokens_seen": 33444408, + "step": 2078 + }, + { + "epoch": 0.14563030287110248, + "grad_norm": 3.718086004257202, + "learning_rate": 8.545064098073556e-05, + "loss": 1.0769, + "num_input_tokens_seen": 33460416, + "step": 2079 + }, + { + "epoch": 0.14570035111683172, + "grad_norm": 4.2451372146606445, + "learning_rate": 8.544364273204904e-05, + "loss": 1.0298, + "num_input_tokens_seen": 33476800, + "step": 2080 + }, + { + "epoch": 0.14577039936256098, + "grad_norm": 3.7441632747650146, + "learning_rate": 8.543664448336252e-05, + "loss": 0.9785, + "num_input_tokens_seen": 33492536, + "step": 2081 + }, + { + "epoch": 0.1458404476082902, + "grad_norm": 3.8453383445739746, + "learning_rate": 8.542964623467601e-05, + "loss": 1.2527, + "num_input_tokens_seen": 33508920, + "step": 2082 + }, + { + "epoch": 0.14591049585401947, + "grad_norm": 3.6744494438171387, + "learning_rate": 8.542264798598949e-05, + "loss": 1.0739, + "num_input_tokens_seen": 33525304, + "step": 2083 + }, + { + "epoch": 0.1459805440997487, + "grad_norm": 4.209956645965576, + "learning_rate": 8.541564973730299e-05, + "loss": 1.1538, + "num_input_tokens_seen": 33541544, + "step": 2084 + }, + { + "epoch": 0.14605059234547796, + "grad_norm": 4.347019672393799, + "learning_rate": 8.540865148861647e-05, + "loss": 1.1078, + "num_input_tokens_seen": 33557928, + "step": 2085 + }, + { + "epoch": 0.1461206405912072, + "grad_norm": 5.323390483856201, + "learning_rate": 8.540165323992995e-05, + "loss": 1.0776, + "num_input_tokens_seen": 33573152, + "step": 2086 + }, + { + "epoch": 0.14619068883693645, + "grad_norm": 3.632425308227539, + "learning_rate": 8.539465499124343e-05, + "loss": 1.0595, + "num_input_tokens_seen": 33588848, + "step": 2087 + }, + { + "epoch": 0.14626073708266568, + "grad_norm": 4.460893154144287, + "learning_rate": 8.538765674255692e-05, + "loss": 1.1131, + "num_input_tokens_seen": 33604984, + "step": 2088 + }, + { + "epoch": 0.14633078532839494, + "grad_norm": 4.059104919433594, + "learning_rate": 8.53806584938704e-05, + "loss": 1.1818, + "num_input_tokens_seen": 33620384, + "step": 2089 + }, + { + "epoch": 0.14640083357412417, + "grad_norm": 6.023964881896973, + "learning_rate": 8.53736602451839e-05, + "loss": 1.1589, + "num_input_tokens_seen": 33636416, + "step": 2090 + }, + { + "epoch": 0.14647088181985343, + "grad_norm": 4.462921619415283, + "learning_rate": 8.536666199649738e-05, + "loss": 0.9362, + "num_input_tokens_seen": 33652504, + "step": 2091 + }, + { + "epoch": 0.14654093006558266, + "grad_norm": 4.003902435302734, + "learning_rate": 8.535966374781087e-05, + "loss": 1.1062, + "num_input_tokens_seen": 33668888, + "step": 2092 + }, + { + "epoch": 0.14661097831131192, + "grad_norm": 4.161351680755615, + "learning_rate": 8.535266549912435e-05, + "loss": 1.0252, + "num_input_tokens_seen": 33685272, + "step": 2093 + }, + { + "epoch": 0.14668102655704116, + "grad_norm": 4.424163341522217, + "learning_rate": 8.534566725043784e-05, + "loss": 1.0225, + "num_input_tokens_seen": 33700872, + "step": 2094 + }, + { + "epoch": 0.14675107480277041, + "grad_norm": 4.2255072593688965, + "learning_rate": 8.533866900175131e-05, + "loss": 1.2044, + "num_input_tokens_seen": 33717256, + "step": 2095 + }, + { + "epoch": 0.14682112304849965, + "grad_norm": 4.204975605010986, + "learning_rate": 8.53316707530648e-05, + "loss": 1.1861, + "num_input_tokens_seen": 33732544, + "step": 2096 + }, + { + "epoch": 0.1468911712942289, + "grad_norm": 3.7058298587799072, + "learning_rate": 8.532467250437829e-05, + "loss": 1.1568, + "num_input_tokens_seen": 33748928, + "step": 2097 + }, + { + "epoch": 0.14696121953995814, + "grad_norm": 6.157133102416992, + "learning_rate": 8.531767425569178e-05, + "loss": 1.0704, + "num_input_tokens_seen": 33765312, + "step": 2098 + }, + { + "epoch": 0.1470312677856874, + "grad_norm": 4.3684210777282715, + "learning_rate": 8.531067600700526e-05, + "loss": 1.0977, + "num_input_tokens_seen": 33781552, + "step": 2099 + }, + { + "epoch": 0.14710131603141663, + "grad_norm": 3.957848310470581, + "learning_rate": 8.530367775831874e-05, + "loss": 1.1412, + "num_input_tokens_seen": 33797464, + "step": 2100 + }, + { + "epoch": 0.1471713642771459, + "grad_norm": 4.9368486404418945, + "learning_rate": 8.529667950963223e-05, + "loss": 0.9986, + "num_input_tokens_seen": 33812672, + "step": 2101 + }, + { + "epoch": 0.14724141252287512, + "grad_norm": 3.8515660762786865, + "learning_rate": 8.528968126094572e-05, + "loss": 0.8715, + "num_input_tokens_seen": 33829024, + "step": 2102 + }, + { + "epoch": 0.14731146076860438, + "grad_norm": 3.961448907852173, + "learning_rate": 8.528268301225919e-05, + "loss": 1.1416, + "num_input_tokens_seen": 33845408, + "step": 2103 + }, + { + "epoch": 0.1473815090143336, + "grad_norm": 4.101677894592285, + "learning_rate": 8.52756847635727e-05, + "loss": 1.001, + "num_input_tokens_seen": 33861240, + "step": 2104 + }, + { + "epoch": 0.14745155726006287, + "grad_norm": 3.886634349822998, + "learning_rate": 8.526868651488617e-05, + "loss": 1.1546, + "num_input_tokens_seen": 33876832, + "step": 2105 + }, + { + "epoch": 0.1475216055057921, + "grad_norm": 3.7241156101226807, + "learning_rate": 8.526168826619966e-05, + "loss": 1.2116, + "num_input_tokens_seen": 33893216, + "step": 2106 + }, + { + "epoch": 0.14759165375152136, + "grad_norm": 3.829458236694336, + "learning_rate": 8.525469001751313e-05, + "loss": 0.8324, + "num_input_tokens_seen": 33909224, + "step": 2107 + }, + { + "epoch": 0.1476617019972506, + "grad_norm": 5.275660514831543, + "learning_rate": 8.524769176882662e-05, + "loss": 1.0253, + "num_input_tokens_seen": 33924768, + "step": 2108 + }, + { + "epoch": 0.14773175024297985, + "grad_norm": 4.207718372344971, + "learning_rate": 8.524069352014011e-05, + "loss": 1.1871, + "num_input_tokens_seen": 33940288, + "step": 2109 + }, + { + "epoch": 0.14780179848870909, + "grad_norm": 4.205242156982422, + "learning_rate": 8.52336952714536e-05, + "loss": 1.0834, + "num_input_tokens_seen": 33956512, + "step": 2110 + }, + { + "epoch": 0.14787184673443834, + "grad_norm": 4.365423202514648, + "learning_rate": 8.522669702276709e-05, + "loss": 1.2043, + "num_input_tokens_seen": 33972896, + "step": 2111 + }, + { + "epoch": 0.1479418949801676, + "grad_norm": 4.416136741638184, + "learning_rate": 8.521969877408056e-05, + "loss": 1.001, + "num_input_tokens_seen": 33989280, + "step": 2112 + }, + { + "epoch": 0.14801194322589684, + "grad_norm": 4.375226974487305, + "learning_rate": 8.521270052539405e-05, + "loss": 1.1186, + "num_input_tokens_seen": 34005664, + "step": 2113 + }, + { + "epoch": 0.1480819914716261, + "grad_norm": 5.2603840827941895, + "learning_rate": 8.520570227670753e-05, + "loss": 1.0723, + "num_input_tokens_seen": 34021576, + "step": 2114 + }, + { + "epoch": 0.14815203971735533, + "grad_norm": 4.02445125579834, + "learning_rate": 8.519870402802101e-05, + "loss": 1.11, + "num_input_tokens_seen": 34037960, + "step": 2115 + }, + { + "epoch": 0.1482220879630846, + "grad_norm": 3.6527910232543945, + "learning_rate": 8.51917057793345e-05, + "loss": 1.0293, + "num_input_tokens_seen": 34053240, + "step": 2116 + }, + { + "epoch": 0.14829213620881382, + "grad_norm": 4.170680999755859, + "learning_rate": 8.518470753064799e-05, + "loss": 1.2068, + "num_input_tokens_seen": 34068896, + "step": 2117 + }, + { + "epoch": 0.14836218445454308, + "grad_norm": 4.366664886474609, + "learning_rate": 8.517770928196148e-05, + "loss": 0.9541, + "num_input_tokens_seen": 34085280, + "step": 2118 + }, + { + "epoch": 0.1484322327002723, + "grad_norm": 3.50757098197937, + "learning_rate": 8.517071103327497e-05, + "loss": 0.9992, + "num_input_tokens_seen": 34101664, + "step": 2119 + }, + { + "epoch": 0.14850228094600157, + "grad_norm": 4.607417106628418, + "learning_rate": 8.516371278458844e-05, + "loss": 1.1974, + "num_input_tokens_seen": 34117752, + "step": 2120 + }, + { + "epoch": 0.1485723291917308, + "grad_norm": 3.959874391555786, + "learning_rate": 8.515671453590193e-05, + "loss": 0.9902, + "num_input_tokens_seen": 34133576, + "step": 2121 + }, + { + "epoch": 0.14864237743746006, + "grad_norm": 4.708366870880127, + "learning_rate": 8.514971628721541e-05, + "loss": 1.1201, + "num_input_tokens_seen": 34149952, + "step": 2122 + }, + { + "epoch": 0.1487124256831893, + "grad_norm": 3.6237339973449707, + "learning_rate": 8.51427180385289e-05, + "loss": 1.1091, + "num_input_tokens_seen": 34166336, + "step": 2123 + }, + { + "epoch": 0.14878247392891855, + "grad_norm": 4.606329917907715, + "learning_rate": 8.513571978984238e-05, + "loss": 1.0986, + "num_input_tokens_seen": 34181128, + "step": 2124 + }, + { + "epoch": 0.14885252217464778, + "grad_norm": 4.559760093688965, + "learning_rate": 8.512872154115587e-05, + "loss": 1.1022, + "num_input_tokens_seen": 34197512, + "step": 2125 + }, + { + "epoch": 0.14892257042037704, + "grad_norm": 3.870089292526245, + "learning_rate": 8.512172329246936e-05, + "loss": 1.0378, + "num_input_tokens_seen": 34213896, + "step": 2126 + }, + { + "epoch": 0.14899261866610627, + "grad_norm": 4.441296100616455, + "learning_rate": 8.511472504378284e-05, + "loss": 1.1473, + "num_input_tokens_seen": 34229472, + "step": 2127 + }, + { + "epoch": 0.14906266691183553, + "grad_norm": 3.8565545082092285, + "learning_rate": 8.510772679509633e-05, + "loss": 1.1465, + "num_input_tokens_seen": 34245856, + "step": 2128 + }, + { + "epoch": 0.14913271515756477, + "grad_norm": 3.563889741897583, + "learning_rate": 8.510072854640981e-05, + "loss": 0.8612, + "num_input_tokens_seen": 34262240, + "step": 2129 + }, + { + "epoch": 0.14920276340329403, + "grad_norm": 4.2634429931640625, + "learning_rate": 8.50937302977233e-05, + "loss": 1.1825, + "num_input_tokens_seen": 34278624, + "step": 2130 + }, + { + "epoch": 0.14927281164902326, + "grad_norm": 5.418450355529785, + "learning_rate": 8.508673204903679e-05, + "loss": 0.9869, + "num_input_tokens_seen": 34294216, + "step": 2131 + }, + { + "epoch": 0.14934285989475252, + "grad_norm": 3.511815309524536, + "learning_rate": 8.507973380035027e-05, + "loss": 0.9725, + "num_input_tokens_seen": 34310592, + "step": 2132 + }, + { + "epoch": 0.14941290814048175, + "grad_norm": 4.088070392608643, + "learning_rate": 8.507273555166375e-05, + "loss": 1.1299, + "num_input_tokens_seen": 34326352, + "step": 2133 + }, + { + "epoch": 0.149482956386211, + "grad_norm": 3.8594932556152344, + "learning_rate": 8.506573730297723e-05, + "loss": 1.0281, + "num_input_tokens_seen": 34342512, + "step": 2134 + }, + { + "epoch": 0.14955300463194024, + "grad_norm": 5.410063743591309, + "learning_rate": 8.505873905429072e-05, + "loss": 1.1376, + "num_input_tokens_seen": 34358896, + "step": 2135 + }, + { + "epoch": 0.1496230528776695, + "grad_norm": 4.02821159362793, + "learning_rate": 8.505174080560421e-05, + "loss": 0.9707, + "num_input_tokens_seen": 34375280, + "step": 2136 + }, + { + "epoch": 0.14969310112339873, + "grad_norm": 3.866480827331543, + "learning_rate": 8.50447425569177e-05, + "loss": 1.0727, + "num_input_tokens_seen": 34391584, + "step": 2137 + }, + { + "epoch": 0.149763149369128, + "grad_norm": 3.667064905166626, + "learning_rate": 8.503774430823118e-05, + "loss": 1.0609, + "num_input_tokens_seen": 34407264, + "step": 2138 + }, + { + "epoch": 0.14983319761485722, + "grad_norm": 5.41308069229126, + "learning_rate": 8.503074605954466e-05, + "loss": 1.0779, + "num_input_tokens_seen": 34423648, + "step": 2139 + }, + { + "epoch": 0.14990324586058648, + "grad_norm": 4.1716485023498535, + "learning_rate": 8.502374781085815e-05, + "loss": 1.2112, + "num_input_tokens_seen": 34439512, + "step": 2140 + }, + { + "epoch": 0.1499732941063157, + "grad_norm": 4.1403913497924805, + "learning_rate": 8.501674956217162e-05, + "loss": 1.0773, + "num_input_tokens_seen": 34455896, + "step": 2141 + }, + { + "epoch": 0.15004334235204497, + "grad_norm": 3.75219988822937, + "learning_rate": 8.500975131348511e-05, + "loss": 1.0685, + "num_input_tokens_seen": 34472280, + "step": 2142 + }, + { + "epoch": 0.1501133905977742, + "grad_norm": 4.339532852172852, + "learning_rate": 8.50027530647986e-05, + "loss": 1.0439, + "num_input_tokens_seen": 34488664, + "step": 2143 + }, + { + "epoch": 0.15018343884350346, + "grad_norm": 4.259124755859375, + "learning_rate": 8.499575481611209e-05, + "loss": 1.0576, + "num_input_tokens_seen": 34505048, + "step": 2144 + }, + { + "epoch": 0.1502534870892327, + "grad_norm": 5.031396865844727, + "learning_rate": 8.498875656742558e-05, + "loss": 0.9932, + "num_input_tokens_seen": 34521432, + "step": 2145 + }, + { + "epoch": 0.15032353533496196, + "grad_norm": 5.313172340393066, + "learning_rate": 8.498175831873907e-05, + "loss": 1.1737, + "num_input_tokens_seen": 34536344, + "step": 2146 + }, + { + "epoch": 0.15039358358069121, + "grad_norm": 4.844740390777588, + "learning_rate": 8.497476007005254e-05, + "loss": 1.4095, + "num_input_tokens_seen": 34552728, + "step": 2147 + }, + { + "epoch": 0.15046363182642045, + "grad_norm": 4.231154441833496, + "learning_rate": 8.496776182136603e-05, + "loss": 1.1196, + "num_input_tokens_seen": 34569016, + "step": 2148 + }, + { + "epoch": 0.1505336800721497, + "grad_norm": 4.176802635192871, + "learning_rate": 8.49607635726795e-05, + "loss": 1.0856, + "num_input_tokens_seen": 34585376, + "step": 2149 + }, + { + "epoch": 0.15060372831787894, + "grad_norm": 4.710334777832031, + "learning_rate": 8.4953765323993e-05, + "loss": 1.0085, + "num_input_tokens_seen": 34600400, + "step": 2150 + }, + { + "epoch": 0.1506737765636082, + "grad_norm": 3.9053258895874023, + "learning_rate": 8.494676707530648e-05, + "loss": 1.2191, + "num_input_tokens_seen": 34616688, + "step": 2151 + }, + { + "epoch": 0.15074382480933743, + "grad_norm": 4.043003559112549, + "learning_rate": 8.493976882661997e-05, + "loss": 1.0541, + "num_input_tokens_seen": 34631920, + "step": 2152 + }, + { + "epoch": 0.1508138730550667, + "grad_norm": 5.230721473693848, + "learning_rate": 8.493277057793346e-05, + "loss": 1.1491, + "num_input_tokens_seen": 34648128, + "step": 2153 + }, + { + "epoch": 0.15088392130079592, + "grad_norm": 4.098349094390869, + "learning_rate": 8.492577232924693e-05, + "loss": 1.1302, + "num_input_tokens_seen": 34664512, + "step": 2154 + }, + { + "epoch": 0.15095396954652518, + "grad_norm": 4.803813457489014, + "learning_rate": 8.491877408056042e-05, + "loss": 0.9653, + "num_input_tokens_seen": 34680560, + "step": 2155 + }, + { + "epoch": 0.1510240177922544, + "grad_norm": 4.25751256942749, + "learning_rate": 8.491177583187391e-05, + "loss": 1.2481, + "num_input_tokens_seen": 34696944, + "step": 2156 + }, + { + "epoch": 0.15109406603798367, + "grad_norm": 6.600613117218018, + "learning_rate": 8.49047775831874e-05, + "loss": 1.1786, + "num_input_tokens_seen": 34712416, + "step": 2157 + }, + { + "epoch": 0.1511641142837129, + "grad_norm": 5.649744987487793, + "learning_rate": 8.489777933450087e-05, + "loss": 1.3045, + "num_input_tokens_seen": 34728520, + "step": 2158 + }, + { + "epoch": 0.15123416252944216, + "grad_norm": 5.778639316558838, + "learning_rate": 8.489078108581436e-05, + "loss": 1.1224, + "num_input_tokens_seen": 34744776, + "step": 2159 + }, + { + "epoch": 0.1513042107751714, + "grad_norm": 5.944733619689941, + "learning_rate": 8.488378283712785e-05, + "loss": 1.3293, + "num_input_tokens_seen": 34761160, + "step": 2160 + }, + { + "epoch": 0.15137425902090065, + "grad_norm": 3.7783594131469727, + "learning_rate": 8.487678458844133e-05, + "loss": 1.0975, + "num_input_tokens_seen": 34777544, + "step": 2161 + }, + { + "epoch": 0.15144430726662989, + "grad_norm": 5.126344680786133, + "learning_rate": 8.486978633975482e-05, + "loss": 1.0509, + "num_input_tokens_seen": 34793072, + "step": 2162 + }, + { + "epoch": 0.15151435551235914, + "grad_norm": 4.689150333404541, + "learning_rate": 8.48627880910683e-05, + "loss": 1.1454, + "num_input_tokens_seen": 34809456, + "step": 2163 + }, + { + "epoch": 0.15158440375808838, + "grad_norm": 3.7559547424316406, + "learning_rate": 8.485578984238179e-05, + "loss": 1.1414, + "num_input_tokens_seen": 34825208, + "step": 2164 + }, + { + "epoch": 0.15165445200381764, + "grad_norm": 3.9225172996520996, + "learning_rate": 8.484879159369528e-05, + "loss": 1.1771, + "num_input_tokens_seen": 34841592, + "step": 2165 + }, + { + "epoch": 0.15172450024954687, + "grad_norm": 4.264125347137451, + "learning_rate": 8.484179334500876e-05, + "loss": 1.0046, + "num_input_tokens_seen": 34857928, + "step": 2166 + }, + { + "epoch": 0.15179454849527613, + "grad_norm": 4.0784382820129395, + "learning_rate": 8.483479509632224e-05, + "loss": 1.0638, + "num_input_tokens_seen": 34873224, + "step": 2167 + }, + { + "epoch": 0.15186459674100536, + "grad_norm": 4.371130466461182, + "learning_rate": 8.482779684763572e-05, + "loss": 1.3854, + "num_input_tokens_seen": 34889608, + "step": 2168 + }, + { + "epoch": 0.15193464498673462, + "grad_norm": 3.7022883892059326, + "learning_rate": 8.482079859894921e-05, + "loss": 0.9892, + "num_input_tokens_seen": 34905984, + "step": 2169 + }, + { + "epoch": 0.15200469323246385, + "grad_norm": 4.196985721588135, + "learning_rate": 8.481380035026271e-05, + "loss": 0.9674, + "num_input_tokens_seen": 34922368, + "step": 2170 + }, + { + "epoch": 0.1520747414781931, + "grad_norm": 4.0252580642700195, + "learning_rate": 8.480680210157619e-05, + "loss": 1.0478, + "num_input_tokens_seen": 34938752, + "step": 2171 + }, + { + "epoch": 0.15214478972392234, + "grad_norm": 4.03692626953125, + "learning_rate": 8.479980385288967e-05, + "loss": 1.1801, + "num_input_tokens_seen": 34954176, + "step": 2172 + }, + { + "epoch": 0.1522148379696516, + "grad_norm": 4.183175563812256, + "learning_rate": 8.479280560420316e-05, + "loss": 1.1117, + "num_input_tokens_seen": 34969880, + "step": 2173 + }, + { + "epoch": 0.15228488621538083, + "grad_norm": 3.757636070251465, + "learning_rate": 8.478580735551664e-05, + "loss": 1.1507, + "num_input_tokens_seen": 34985576, + "step": 2174 + }, + { + "epoch": 0.1523549344611101, + "grad_norm": 3.9442903995513916, + "learning_rate": 8.477880910683013e-05, + "loss": 1.034, + "num_input_tokens_seen": 35001896, + "step": 2175 + }, + { + "epoch": 0.15242498270683932, + "grad_norm": 4.092566013336182, + "learning_rate": 8.477181085814362e-05, + "loss": 1.0542, + "num_input_tokens_seen": 35018280, + "step": 2176 + }, + { + "epoch": 0.15249503095256858, + "grad_norm": 5.494921684265137, + "learning_rate": 8.47648126094571e-05, + "loss": 1.0988, + "num_input_tokens_seen": 35034544, + "step": 2177 + }, + { + "epoch": 0.15256507919829781, + "grad_norm": 7.327289581298828, + "learning_rate": 8.475781436077058e-05, + "loss": 1.1879, + "num_input_tokens_seen": 35050928, + "step": 2178 + }, + { + "epoch": 0.15263512744402707, + "grad_norm": 4.048150539398193, + "learning_rate": 8.475081611208407e-05, + "loss": 1.1071, + "num_input_tokens_seen": 35067000, + "step": 2179 + }, + { + "epoch": 0.1527051756897563, + "grad_norm": 6.388006210327148, + "learning_rate": 8.474381786339756e-05, + "loss": 0.9821, + "num_input_tokens_seen": 35082064, + "step": 2180 + }, + { + "epoch": 0.15277522393548557, + "grad_norm": 4.289052963256836, + "learning_rate": 8.473681961471103e-05, + "loss": 1.077, + "num_input_tokens_seen": 35098448, + "step": 2181 + }, + { + "epoch": 0.15284527218121483, + "grad_norm": 4.288560390472412, + "learning_rate": 8.472982136602452e-05, + "loss": 1.2723, + "num_input_tokens_seen": 35114832, + "step": 2182 + }, + { + "epoch": 0.15291532042694406, + "grad_norm": 4.17701530456543, + "learning_rate": 8.472282311733801e-05, + "loss": 1.1691, + "num_input_tokens_seen": 35131216, + "step": 2183 + }, + { + "epoch": 0.15298536867267332, + "grad_norm": 4.975949764251709, + "learning_rate": 8.47158248686515e-05, + "loss": 1.057, + "num_input_tokens_seen": 35147600, + "step": 2184 + }, + { + "epoch": 0.15305541691840255, + "grad_norm": 5.465437889099121, + "learning_rate": 8.470882661996497e-05, + "loss": 1.0328, + "num_input_tokens_seen": 35162464, + "step": 2185 + }, + { + "epoch": 0.1531254651641318, + "grad_norm": 3.329401731491089, + "learning_rate": 8.470182837127846e-05, + "loss": 1.0596, + "num_input_tokens_seen": 35178744, + "step": 2186 + }, + { + "epoch": 0.15319551340986104, + "grad_norm": 5.962124824523926, + "learning_rate": 8.469483012259195e-05, + "loss": 1.2799, + "num_input_tokens_seen": 35194736, + "step": 2187 + }, + { + "epoch": 0.1532655616555903, + "grad_norm": 3.897841691970825, + "learning_rate": 8.468783187390542e-05, + "loss": 1.1701, + "num_input_tokens_seen": 35211120, + "step": 2188 + }, + { + "epoch": 0.15333560990131953, + "grad_norm": 3.9668943881988525, + "learning_rate": 8.468083362521891e-05, + "loss": 1.1302, + "num_input_tokens_seen": 35227504, + "step": 2189 + }, + { + "epoch": 0.1534056581470488, + "grad_norm": 3.8960444927215576, + "learning_rate": 8.467383537653241e-05, + "loss": 0.891, + "num_input_tokens_seen": 35243584, + "step": 2190 + }, + { + "epoch": 0.15347570639277802, + "grad_norm": 3.7700982093811035, + "learning_rate": 8.466683712784589e-05, + "loss": 1.1744, + "num_input_tokens_seen": 35259968, + "step": 2191 + }, + { + "epoch": 0.15354575463850728, + "grad_norm": 4.65008020401001, + "learning_rate": 8.465983887915938e-05, + "loss": 1.2807, + "num_input_tokens_seen": 35276352, + "step": 2192 + }, + { + "epoch": 0.1536158028842365, + "grad_norm": 3.5371146202087402, + "learning_rate": 8.465284063047285e-05, + "loss": 0.9699, + "num_input_tokens_seen": 35292736, + "step": 2193 + }, + { + "epoch": 0.15368585112996577, + "grad_norm": 4.395732879638672, + "learning_rate": 8.464584238178634e-05, + "loss": 0.9862, + "num_input_tokens_seen": 35309120, + "step": 2194 + }, + { + "epoch": 0.153755899375695, + "grad_norm": 5.01919412612915, + "learning_rate": 8.463884413309982e-05, + "loss": 1.0143, + "num_input_tokens_seen": 35325504, + "step": 2195 + }, + { + "epoch": 0.15382594762142426, + "grad_norm": 3.7417054176330566, + "learning_rate": 8.463184588441332e-05, + "loss": 1.0712, + "num_input_tokens_seen": 35341376, + "step": 2196 + }, + { + "epoch": 0.1538959958671535, + "grad_norm": 4.119459629058838, + "learning_rate": 8.462484763572681e-05, + "loss": 1.0919, + "num_input_tokens_seen": 35357520, + "step": 2197 + }, + { + "epoch": 0.15396604411288276, + "grad_norm": 6.938751220703125, + "learning_rate": 8.461784938704028e-05, + "loss": 1.1272, + "num_input_tokens_seen": 35372920, + "step": 2198 + }, + { + "epoch": 0.154036092358612, + "grad_norm": 5.000339984893799, + "learning_rate": 8.461085113835377e-05, + "loss": 1.1508, + "num_input_tokens_seen": 35389304, + "step": 2199 + }, + { + "epoch": 0.15410614060434125, + "grad_norm": 3.6554362773895264, + "learning_rate": 8.460385288966726e-05, + "loss": 1.0765, + "num_input_tokens_seen": 35405688, + "step": 2200 + }, + { + "epoch": 0.15410614060434125, + "eval_loss": 1.145054578781128, + "eval_runtime": 0.1886, + "eval_samples_per_second": 5.303, + "eval_steps_per_second": 5.303, + "num_input_tokens_seen": 35405688, + "step": 2200 + }, + { + "epoch": 0.15417618885007048, + "grad_norm": 3.718207836151123, + "learning_rate": 8.459685464098073e-05, + "loss": 0.8814, + "num_input_tokens_seen": 35422072, + "step": 2201 + }, + { + "epoch": 0.15424623709579974, + "grad_norm": 4.98813533782959, + "learning_rate": 8.458985639229422e-05, + "loss": 1.1814, + "num_input_tokens_seen": 35438456, + "step": 2202 + }, + { + "epoch": 0.15431628534152897, + "grad_norm": 3.550008535385132, + "learning_rate": 8.458285814360771e-05, + "loss": 1.1281, + "num_input_tokens_seen": 35454840, + "step": 2203 + }, + { + "epoch": 0.15438633358725823, + "grad_norm": 3.8408641815185547, + "learning_rate": 8.45758598949212e-05, + "loss": 0.9759, + "num_input_tokens_seen": 35471080, + "step": 2204 + }, + { + "epoch": 0.15445638183298746, + "grad_norm": 4.515852451324463, + "learning_rate": 8.456886164623468e-05, + "loss": 0.9394, + "num_input_tokens_seen": 35486904, + "step": 2205 + }, + { + "epoch": 0.15452643007871672, + "grad_norm": 3.6536715030670166, + "learning_rate": 8.456186339754816e-05, + "loss": 0.9649, + "num_input_tokens_seen": 35503064, + "step": 2206 + }, + { + "epoch": 0.15459647832444595, + "grad_norm": 4.071808338165283, + "learning_rate": 8.455486514886165e-05, + "loss": 1.0972, + "num_input_tokens_seen": 35518880, + "step": 2207 + }, + { + "epoch": 0.1546665265701752, + "grad_norm": 4.329566955566406, + "learning_rate": 8.454786690017513e-05, + "loss": 1.0843, + "num_input_tokens_seen": 35535256, + "step": 2208 + }, + { + "epoch": 0.15473657481590444, + "grad_norm": 4.243298053741455, + "learning_rate": 8.454086865148862e-05, + "loss": 1.1688, + "num_input_tokens_seen": 35551376, + "step": 2209 + }, + { + "epoch": 0.1548066230616337, + "grad_norm": 4.154253959655762, + "learning_rate": 8.453387040280212e-05, + "loss": 1.0458, + "num_input_tokens_seen": 35567696, + "step": 2210 + }, + { + "epoch": 0.15487667130736293, + "grad_norm": 4.0564494132995605, + "learning_rate": 8.45268721541156e-05, + "loss": 1.0585, + "num_input_tokens_seen": 35583576, + "step": 2211 + }, + { + "epoch": 0.1549467195530922, + "grad_norm": 3.735724687576294, + "learning_rate": 8.451987390542907e-05, + "loss": 0.92, + "num_input_tokens_seen": 35599536, + "step": 2212 + }, + { + "epoch": 0.15501676779882143, + "grad_norm": 4.651454925537109, + "learning_rate": 8.451287565674256e-05, + "loss": 1.2097, + "num_input_tokens_seen": 35615920, + "step": 2213 + }, + { + "epoch": 0.15508681604455068, + "grad_norm": 5.01883602142334, + "learning_rate": 8.450587740805605e-05, + "loss": 0.9275, + "num_input_tokens_seen": 35631208, + "step": 2214 + }, + { + "epoch": 0.15515686429027992, + "grad_norm": 4.435250282287598, + "learning_rate": 8.449887915936952e-05, + "loss": 1.003, + "num_input_tokens_seen": 35647328, + "step": 2215 + }, + { + "epoch": 0.15522691253600918, + "grad_norm": 3.495476245880127, + "learning_rate": 8.449188091068302e-05, + "loss": 0.9968, + "num_input_tokens_seen": 35663472, + "step": 2216 + }, + { + "epoch": 0.15529696078173844, + "grad_norm": 4.461013317108154, + "learning_rate": 8.448488266199651e-05, + "loss": 1.1098, + "num_input_tokens_seen": 35679856, + "step": 2217 + }, + { + "epoch": 0.15536700902746767, + "grad_norm": 5.4857683181762695, + "learning_rate": 8.447788441330999e-05, + "loss": 1.143, + "num_input_tokens_seen": 35695616, + "step": 2218 + }, + { + "epoch": 0.15543705727319693, + "grad_norm": 4.20158052444458, + "learning_rate": 8.447088616462348e-05, + "loss": 1.1643, + "num_input_tokens_seen": 35711432, + "step": 2219 + }, + { + "epoch": 0.15550710551892616, + "grad_norm": 4.289988040924072, + "learning_rate": 8.446388791593695e-05, + "loss": 1.1582, + "num_input_tokens_seen": 35727552, + "step": 2220 + }, + { + "epoch": 0.15557715376465542, + "grad_norm": 3.7897555828094482, + "learning_rate": 8.445688966725044e-05, + "loss": 1.255, + "num_input_tokens_seen": 35743800, + "step": 2221 + }, + { + "epoch": 0.15564720201038465, + "grad_norm": 4.405816078186035, + "learning_rate": 8.444989141856393e-05, + "loss": 1.1057, + "num_input_tokens_seen": 35760184, + "step": 2222 + }, + { + "epoch": 0.1557172502561139, + "grad_norm": 4.2683610916137695, + "learning_rate": 8.444289316987742e-05, + "loss": 1.1042, + "num_input_tokens_seen": 35776568, + "step": 2223 + }, + { + "epoch": 0.15578729850184314, + "grad_norm": 3.9999659061431885, + "learning_rate": 8.44358949211909e-05, + "loss": 1.0504, + "num_input_tokens_seen": 35792952, + "step": 2224 + }, + { + "epoch": 0.1558573467475724, + "grad_norm": 3.6252965927124023, + "learning_rate": 8.442889667250438e-05, + "loss": 0.9755, + "num_input_tokens_seen": 35809176, + "step": 2225 + }, + { + "epoch": 0.15592739499330163, + "grad_norm": 3.9726274013519287, + "learning_rate": 8.442189842381787e-05, + "loss": 1.1104, + "num_input_tokens_seen": 35825560, + "step": 2226 + }, + { + "epoch": 0.1559974432390309, + "grad_norm": 5.004739761352539, + "learning_rate": 8.441490017513136e-05, + "loss": 1.2484, + "num_input_tokens_seen": 35841936, + "step": 2227 + }, + { + "epoch": 0.15606749148476012, + "grad_norm": 5.432271480560303, + "learning_rate": 8.440790192644483e-05, + "loss": 0.9799, + "num_input_tokens_seen": 35857944, + "step": 2228 + }, + { + "epoch": 0.15613753973048938, + "grad_norm": 4.553518295288086, + "learning_rate": 8.440090367775832e-05, + "loss": 1.1077, + "num_input_tokens_seen": 35873920, + "step": 2229 + }, + { + "epoch": 0.15620758797621861, + "grad_norm": 5.924668312072754, + "learning_rate": 8.439390542907181e-05, + "loss": 1.2937, + "num_input_tokens_seen": 35888872, + "step": 2230 + }, + { + "epoch": 0.15627763622194787, + "grad_norm": 4.276167392730713, + "learning_rate": 8.43869071803853e-05, + "loss": 1.1883, + "num_input_tokens_seen": 35905256, + "step": 2231 + }, + { + "epoch": 0.1563476844676771, + "grad_norm": 3.719632863998413, + "learning_rate": 8.437990893169877e-05, + "loss": 1.0713, + "num_input_tokens_seen": 35921640, + "step": 2232 + }, + { + "epoch": 0.15641773271340637, + "grad_norm": 4.769368648529053, + "learning_rate": 8.437291068301226e-05, + "loss": 1.079, + "num_input_tokens_seen": 35936256, + "step": 2233 + }, + { + "epoch": 0.1564877809591356, + "grad_norm": 4.957282543182373, + "learning_rate": 8.436591243432575e-05, + "loss": 1.0535, + "num_input_tokens_seen": 35952640, + "step": 2234 + }, + { + "epoch": 0.15655782920486486, + "grad_norm": 4.782018661499023, + "learning_rate": 8.435891418563923e-05, + "loss": 1.0799, + "num_input_tokens_seen": 35967880, + "step": 2235 + }, + { + "epoch": 0.1566278774505941, + "grad_norm": 4.716582775115967, + "learning_rate": 8.435191593695273e-05, + "loss": 1.1388, + "num_input_tokens_seen": 35984016, + "step": 2236 + }, + { + "epoch": 0.15669792569632335, + "grad_norm": 4.36606502532959, + "learning_rate": 8.434491768826622e-05, + "loss": 0.954, + "num_input_tokens_seen": 35999904, + "step": 2237 + }, + { + "epoch": 0.15676797394205258, + "grad_norm": 3.8300321102142334, + "learning_rate": 8.433791943957969e-05, + "loss": 1.0903, + "num_input_tokens_seen": 36016216, + "step": 2238 + }, + { + "epoch": 0.15683802218778184, + "grad_norm": 3.7595677375793457, + "learning_rate": 8.433092119089317e-05, + "loss": 1.0214, + "num_input_tokens_seen": 36032600, + "step": 2239 + }, + { + "epoch": 0.15690807043351107, + "grad_norm": 4.783555030822754, + "learning_rate": 8.432392294220665e-05, + "loss": 1.1621, + "num_input_tokens_seen": 36048984, + "step": 2240 + }, + { + "epoch": 0.15697811867924033, + "grad_norm": 4.393221855163574, + "learning_rate": 8.431692469352014e-05, + "loss": 1.2196, + "num_input_tokens_seen": 36065368, + "step": 2241 + }, + { + "epoch": 0.15704816692496956, + "grad_norm": 3.8634722232818604, + "learning_rate": 8.430992644483363e-05, + "loss": 1.0227, + "num_input_tokens_seen": 36081752, + "step": 2242 + }, + { + "epoch": 0.15711821517069882, + "grad_norm": 4.5091233253479, + "learning_rate": 8.430292819614712e-05, + "loss": 0.9261, + "num_input_tokens_seen": 36097672, + "step": 2243 + }, + { + "epoch": 0.15718826341642805, + "grad_norm": 3.89699387550354, + "learning_rate": 8.429592994746061e-05, + "loss": 1.0023, + "num_input_tokens_seen": 36114048, + "step": 2244 + }, + { + "epoch": 0.1572583116621573, + "grad_norm": 3.8859546184539795, + "learning_rate": 8.428893169877408e-05, + "loss": 0.9597, + "num_input_tokens_seen": 36130024, + "step": 2245 + }, + { + "epoch": 0.15732835990788654, + "grad_norm": 4.236848831176758, + "learning_rate": 8.428193345008757e-05, + "loss": 1.1777, + "num_input_tokens_seen": 36146408, + "step": 2246 + }, + { + "epoch": 0.1573984081536158, + "grad_norm": 6.742307662963867, + "learning_rate": 8.427493520140105e-05, + "loss": 0.9674, + "num_input_tokens_seen": 36161440, + "step": 2247 + }, + { + "epoch": 0.15746845639934504, + "grad_norm": 3.332416534423828, + "learning_rate": 8.426793695271454e-05, + "loss": 0.7694, + "num_input_tokens_seen": 36177824, + "step": 2248 + }, + { + "epoch": 0.1575385046450743, + "grad_norm": 4.672734260559082, + "learning_rate": 8.426093870402802e-05, + "loss": 0.9228, + "num_input_tokens_seen": 36193320, + "step": 2249 + }, + { + "epoch": 0.15760855289080353, + "grad_norm": 4.437155246734619, + "learning_rate": 8.425394045534151e-05, + "loss": 1.2712, + "num_input_tokens_seen": 36209704, + "step": 2250 + }, + { + "epoch": 0.1576786011365328, + "grad_norm": 4.112512111663818, + "learning_rate": 8.4246942206655e-05, + "loss": 1.3494, + "num_input_tokens_seen": 36226088, + "step": 2251 + }, + { + "epoch": 0.15774864938226205, + "grad_norm": 4.432194709777832, + "learning_rate": 8.423994395796848e-05, + "loss": 1.1303, + "num_input_tokens_seen": 36242472, + "step": 2252 + }, + { + "epoch": 0.15781869762799128, + "grad_norm": 4.322375297546387, + "learning_rate": 8.423294570928197e-05, + "loss": 1.084, + "num_input_tokens_seen": 36258680, + "step": 2253 + }, + { + "epoch": 0.15788874587372054, + "grad_norm": 3.848836660385132, + "learning_rate": 8.422594746059545e-05, + "loss": 1.2057, + "num_input_tokens_seen": 36274512, + "step": 2254 + }, + { + "epoch": 0.15795879411944977, + "grad_norm": 4.022729396820068, + "learning_rate": 8.421894921190893e-05, + "loss": 1.0584, + "num_input_tokens_seen": 36289568, + "step": 2255 + }, + { + "epoch": 0.15802884236517903, + "grad_norm": 3.8060622215270996, + "learning_rate": 8.421195096322243e-05, + "loss": 1.1144, + "num_input_tokens_seen": 36305256, + "step": 2256 + }, + { + "epoch": 0.15809889061090826, + "grad_norm": 4.685004234313965, + "learning_rate": 8.42049527145359e-05, + "loss": 1.1341, + "num_input_tokens_seen": 36321008, + "step": 2257 + }, + { + "epoch": 0.15816893885663752, + "grad_norm": 3.4483463764190674, + "learning_rate": 8.41979544658494e-05, + "loss": 0.9563, + "num_input_tokens_seen": 36337000, + "step": 2258 + }, + { + "epoch": 0.15823898710236675, + "grad_norm": 3.7172203063964844, + "learning_rate": 8.419095621716287e-05, + "loss": 1.1463, + "num_input_tokens_seen": 36353160, + "step": 2259 + }, + { + "epoch": 0.158309035348096, + "grad_norm": 5.734589099884033, + "learning_rate": 8.418395796847636e-05, + "loss": 0.9321, + "num_input_tokens_seen": 36369248, + "step": 2260 + }, + { + "epoch": 0.15837908359382524, + "grad_norm": 4.060257911682129, + "learning_rate": 8.417695971978985e-05, + "loss": 1.2162, + "num_input_tokens_seen": 36384736, + "step": 2261 + }, + { + "epoch": 0.1584491318395545, + "grad_norm": 5.240515232086182, + "learning_rate": 8.416996147110334e-05, + "loss": 0.9652, + "num_input_tokens_seen": 36401120, + "step": 2262 + }, + { + "epoch": 0.15851918008528373, + "grad_norm": 5.482649803161621, + "learning_rate": 8.416296322241682e-05, + "loss": 1.207, + "num_input_tokens_seen": 36417504, + "step": 2263 + }, + { + "epoch": 0.158589228331013, + "grad_norm": 3.9862253665924072, + "learning_rate": 8.415596497373031e-05, + "loss": 1.1354, + "num_input_tokens_seen": 36433888, + "step": 2264 + }, + { + "epoch": 0.15865927657674223, + "grad_norm": 6.322808742523193, + "learning_rate": 8.414896672504379e-05, + "loss": 1.1144, + "num_input_tokens_seen": 36449552, + "step": 2265 + }, + { + "epoch": 0.15872932482247148, + "grad_norm": 4.312921524047852, + "learning_rate": 8.414196847635726e-05, + "loss": 1.1254, + "num_input_tokens_seen": 36465936, + "step": 2266 + }, + { + "epoch": 0.15879937306820072, + "grad_norm": 4.178677082061768, + "learning_rate": 8.413497022767075e-05, + "loss": 1.2539, + "num_input_tokens_seen": 36482184, + "step": 2267 + }, + { + "epoch": 0.15886942131392998, + "grad_norm": 4.304810523986816, + "learning_rate": 8.412797197898424e-05, + "loss": 1.199, + "num_input_tokens_seen": 36498320, + "step": 2268 + }, + { + "epoch": 0.1589394695596592, + "grad_norm": 3.723483085632324, + "learning_rate": 8.412097373029773e-05, + "loss": 1.0335, + "num_input_tokens_seen": 36514704, + "step": 2269 + }, + { + "epoch": 0.15900951780538847, + "grad_norm": 4.285789489746094, + "learning_rate": 8.411397548161122e-05, + "loss": 1.2463, + "num_input_tokens_seen": 36531032, + "step": 2270 + }, + { + "epoch": 0.1590795660511177, + "grad_norm": 3.5788466930389404, + "learning_rate": 8.41069772329247e-05, + "loss": 0.7809, + "num_input_tokens_seen": 36547416, + "step": 2271 + }, + { + "epoch": 0.15914961429684696, + "grad_norm": 5.785874366760254, + "learning_rate": 8.409997898423818e-05, + "loss": 1.2832, + "num_input_tokens_seen": 36563800, + "step": 2272 + }, + { + "epoch": 0.1592196625425762, + "grad_norm": 3.914402723312378, + "learning_rate": 8.409298073555167e-05, + "loss": 1.2065, + "num_input_tokens_seen": 36580184, + "step": 2273 + }, + { + "epoch": 0.15928971078830545, + "grad_norm": 3.878512144088745, + "learning_rate": 8.408598248686514e-05, + "loss": 1.1457, + "num_input_tokens_seen": 36596568, + "step": 2274 + }, + { + "epoch": 0.15935975903403468, + "grad_norm": 4.195454120635986, + "learning_rate": 8.407898423817863e-05, + "loss": 1.2628, + "num_input_tokens_seen": 36612952, + "step": 2275 + }, + { + "epoch": 0.15942980727976394, + "grad_norm": 3.847649097442627, + "learning_rate": 8.407198598949212e-05, + "loss": 1.0678, + "num_input_tokens_seen": 36628752, + "step": 2276 + }, + { + "epoch": 0.15949985552549317, + "grad_norm": 5.284397125244141, + "learning_rate": 8.406498774080561e-05, + "loss": 1.0508, + "num_input_tokens_seen": 36645136, + "step": 2277 + }, + { + "epoch": 0.15956990377122243, + "grad_norm": 4.10982084274292, + "learning_rate": 8.40579894921191e-05, + "loss": 1.0558, + "num_input_tokens_seen": 36661392, + "step": 2278 + }, + { + "epoch": 0.15963995201695166, + "grad_norm": 3.8282828330993652, + "learning_rate": 8.405099124343257e-05, + "loss": 1.1064, + "num_input_tokens_seen": 36676856, + "step": 2279 + }, + { + "epoch": 0.15971000026268092, + "grad_norm": 4.115365028381348, + "learning_rate": 8.404399299474606e-05, + "loss": 1.0081, + "num_input_tokens_seen": 36693080, + "step": 2280 + }, + { + "epoch": 0.15978004850841016, + "grad_norm": 3.6131088733673096, + "learning_rate": 8.403699474605955e-05, + "loss": 0.8565, + "num_input_tokens_seen": 36709440, + "step": 2281 + }, + { + "epoch": 0.15985009675413941, + "grad_norm": 3.83146071434021, + "learning_rate": 8.402999649737304e-05, + "loss": 1.0762, + "num_input_tokens_seen": 36725496, + "step": 2282 + }, + { + "epoch": 0.15992014499986865, + "grad_norm": 3.8456339836120605, + "learning_rate": 8.402299824868653e-05, + "loss": 1.053, + "num_input_tokens_seen": 36741544, + "step": 2283 + }, + { + "epoch": 0.1599901932455979, + "grad_norm": 3.717014789581299, + "learning_rate": 8.4016e-05, + "loss": 1.0053, + "num_input_tokens_seen": 36757928, + "step": 2284 + }, + { + "epoch": 0.16006024149132717, + "grad_norm": 4.3730854988098145, + "learning_rate": 8.400900175131349e-05, + "loss": 1.1639, + "num_input_tokens_seen": 36774144, + "step": 2285 + }, + { + "epoch": 0.1601302897370564, + "grad_norm": 3.6635241508483887, + "learning_rate": 8.400200350262697e-05, + "loss": 0.9721, + "num_input_tokens_seen": 36790248, + "step": 2286 + }, + { + "epoch": 0.16020033798278566, + "grad_norm": 3.9058330059051514, + "learning_rate": 8.399500525394046e-05, + "loss": 1.0814, + "num_input_tokens_seen": 36806632, + "step": 2287 + }, + { + "epoch": 0.1602703862285149, + "grad_norm": 3.60127854347229, + "learning_rate": 8.398800700525394e-05, + "loss": 1.1541, + "num_input_tokens_seen": 36823016, + "step": 2288 + }, + { + "epoch": 0.16034043447424415, + "grad_norm": 5.762889385223389, + "learning_rate": 8.398100875656743e-05, + "loss": 0.9572, + "num_input_tokens_seen": 36838576, + "step": 2289 + }, + { + "epoch": 0.16041048271997338, + "grad_norm": 3.495436191558838, + "learning_rate": 8.397401050788092e-05, + "loss": 1.0156, + "num_input_tokens_seen": 36854960, + "step": 2290 + }, + { + "epoch": 0.16048053096570264, + "grad_norm": 4.083384037017822, + "learning_rate": 8.396701225919441e-05, + "loss": 1.1724, + "num_input_tokens_seen": 36870672, + "step": 2291 + }, + { + "epoch": 0.16055057921143187, + "grad_norm": 3.7010245323181152, + "learning_rate": 8.396001401050788e-05, + "loss": 0.8871, + "num_input_tokens_seen": 36887056, + "step": 2292 + }, + { + "epoch": 0.16062062745716113, + "grad_norm": 3.419485330581665, + "learning_rate": 8.395301576182136e-05, + "loss": 0.9586, + "num_input_tokens_seen": 36903144, + "step": 2293 + }, + { + "epoch": 0.16069067570289036, + "grad_norm": 3.593970537185669, + "learning_rate": 8.394601751313485e-05, + "loss": 1.0109, + "num_input_tokens_seen": 36919192, + "step": 2294 + }, + { + "epoch": 0.16076072394861962, + "grad_norm": 3.729038953781128, + "learning_rate": 8.393901926444834e-05, + "loss": 1.288, + "num_input_tokens_seen": 36935576, + "step": 2295 + }, + { + "epoch": 0.16083077219434885, + "grad_norm": 3.60687255859375, + "learning_rate": 8.393202101576183e-05, + "loss": 0.9423, + "num_input_tokens_seen": 36951960, + "step": 2296 + }, + { + "epoch": 0.1609008204400781, + "grad_norm": 3.4520435333251953, + "learning_rate": 8.392502276707531e-05, + "loss": 0.9515, + "num_input_tokens_seen": 36968344, + "step": 2297 + }, + { + "epoch": 0.16097086868580734, + "grad_norm": 3.71907639503479, + "learning_rate": 8.39180245183888e-05, + "loss": 1.1141, + "num_input_tokens_seen": 36984440, + "step": 2298 + }, + { + "epoch": 0.1610409169315366, + "grad_norm": 3.8897864818573, + "learning_rate": 8.391102626970228e-05, + "loss": 1.1124, + "num_input_tokens_seen": 37000824, + "step": 2299 + }, + { + "epoch": 0.16111096517726584, + "grad_norm": 3.579921245574951, + "learning_rate": 8.390402802101577e-05, + "loss": 1.0998, + "num_input_tokens_seen": 37017088, + "step": 2300 + }, + { + "epoch": 0.1611810134229951, + "grad_norm": 3.9658427238464355, + "learning_rate": 8.389702977232924e-05, + "loss": 1.034, + "num_input_tokens_seen": 37033232, + "step": 2301 + }, + { + "epoch": 0.16125106166872433, + "grad_norm": 4.2862725257873535, + "learning_rate": 8.389003152364274e-05, + "loss": 0.9662, + "num_input_tokens_seen": 37049616, + "step": 2302 + }, + { + "epoch": 0.1613211099144536, + "grad_norm": 3.7523694038391113, + "learning_rate": 8.388303327495622e-05, + "loss": 1.0806, + "num_input_tokens_seen": 37065784, + "step": 2303 + }, + { + "epoch": 0.16139115816018282, + "grad_norm": 3.9068679809570312, + "learning_rate": 8.387603502626971e-05, + "loss": 1.0985, + "num_input_tokens_seen": 37082168, + "step": 2304 + }, + { + "epoch": 0.16146120640591208, + "grad_norm": 5.876891613006592, + "learning_rate": 8.38690367775832e-05, + "loss": 1.2938, + "num_input_tokens_seen": 37097072, + "step": 2305 + }, + { + "epoch": 0.1615312546516413, + "grad_norm": 4.040335655212402, + "learning_rate": 8.386203852889667e-05, + "loss": 1.1376, + "num_input_tokens_seen": 37112936, + "step": 2306 + }, + { + "epoch": 0.16160130289737057, + "grad_norm": 3.566763401031494, + "learning_rate": 8.385504028021016e-05, + "loss": 0.9164, + "num_input_tokens_seen": 37129320, + "step": 2307 + }, + { + "epoch": 0.1616713511430998, + "grad_norm": 3.7780325412750244, + "learning_rate": 8.384804203152365e-05, + "loss": 0.9541, + "num_input_tokens_seen": 37144832, + "step": 2308 + }, + { + "epoch": 0.16174139938882906, + "grad_norm": 4.291510105133057, + "learning_rate": 8.384104378283714e-05, + "loss": 1.2579, + "num_input_tokens_seen": 37160312, + "step": 2309 + }, + { + "epoch": 0.1618114476345583, + "grad_norm": 3.721531629562378, + "learning_rate": 8.383404553415063e-05, + "loss": 1.0108, + "num_input_tokens_seen": 37176696, + "step": 2310 + }, + { + "epoch": 0.16188149588028755, + "grad_norm": 3.883301258087158, + "learning_rate": 8.38270472854641e-05, + "loss": 1.17, + "num_input_tokens_seen": 37192632, + "step": 2311 + }, + { + "epoch": 0.16195154412601678, + "grad_norm": 4.240591049194336, + "learning_rate": 8.382004903677759e-05, + "loss": 1.2263, + "num_input_tokens_seen": 37208448, + "step": 2312 + }, + { + "epoch": 0.16202159237174604, + "grad_norm": 4.711728572845459, + "learning_rate": 8.381305078809106e-05, + "loss": 1.1743, + "num_input_tokens_seen": 37223176, + "step": 2313 + }, + { + "epoch": 0.16209164061747527, + "grad_norm": 4.733399391174316, + "learning_rate": 8.380605253940455e-05, + "loss": 1.2512, + "num_input_tokens_seen": 37239560, + "step": 2314 + }, + { + "epoch": 0.16216168886320453, + "grad_norm": 5.842257976531982, + "learning_rate": 8.379905429071804e-05, + "loss": 1.029, + "num_input_tokens_seen": 37255536, + "step": 2315 + }, + { + "epoch": 0.16223173710893377, + "grad_norm": 3.9891135692596436, + "learning_rate": 8.379205604203153e-05, + "loss": 1.0767, + "num_input_tokens_seen": 37271920, + "step": 2316 + }, + { + "epoch": 0.16230178535466303, + "grad_norm": 3.3596630096435547, + "learning_rate": 8.378505779334502e-05, + "loss": 0.8963, + "num_input_tokens_seen": 37288024, + "step": 2317 + }, + { + "epoch": 0.16237183360039226, + "grad_norm": 4.346104621887207, + "learning_rate": 8.377805954465851e-05, + "loss": 1.0947, + "num_input_tokens_seen": 37304264, + "step": 2318 + }, + { + "epoch": 0.16244188184612152, + "grad_norm": 3.5524039268493652, + "learning_rate": 8.377106129597198e-05, + "loss": 0.9435, + "num_input_tokens_seen": 37320648, + "step": 2319 + }, + { + "epoch": 0.16251193009185078, + "grad_norm": 4.335781574249268, + "learning_rate": 8.376406304728546e-05, + "loss": 0.9151, + "num_input_tokens_seen": 37336104, + "step": 2320 + }, + { + "epoch": 0.16258197833758, + "grad_norm": 3.7356534004211426, + "learning_rate": 8.375706479859895e-05, + "loss": 1.0195, + "num_input_tokens_seen": 37352488, + "step": 2321 + }, + { + "epoch": 0.16265202658330927, + "grad_norm": 3.842710494995117, + "learning_rate": 8.375006654991243e-05, + "loss": 1.0543, + "num_input_tokens_seen": 37368872, + "step": 2322 + }, + { + "epoch": 0.1627220748290385, + "grad_norm": 3.9485390186309814, + "learning_rate": 8.374306830122592e-05, + "loss": 1.2149, + "num_input_tokens_seen": 37385256, + "step": 2323 + }, + { + "epoch": 0.16279212307476776, + "grad_norm": 3.9196622371673584, + "learning_rate": 8.373607005253941e-05, + "loss": 1.0907, + "num_input_tokens_seen": 37401224, + "step": 2324 + }, + { + "epoch": 0.162862171320497, + "grad_norm": 4.2444844245910645, + "learning_rate": 8.37290718038529e-05, + "loss": 1.0201, + "num_input_tokens_seen": 37417016, + "step": 2325 + }, + { + "epoch": 0.16293221956622625, + "grad_norm": 3.974438190460205, + "learning_rate": 8.372207355516638e-05, + "loss": 1.0733, + "num_input_tokens_seen": 37433400, + "step": 2326 + }, + { + "epoch": 0.16300226781195548, + "grad_norm": 3.833350658416748, + "learning_rate": 8.371507530647986e-05, + "loss": 1.1536, + "num_input_tokens_seen": 37449784, + "step": 2327 + }, + { + "epoch": 0.16307231605768474, + "grad_norm": 4.566055774688721, + "learning_rate": 8.370807705779335e-05, + "loss": 1.093, + "num_input_tokens_seen": 37465720, + "step": 2328 + }, + { + "epoch": 0.16314236430341397, + "grad_norm": 3.455068588256836, + "learning_rate": 8.370107880910684e-05, + "loss": 0.9396, + "num_input_tokens_seen": 37482104, + "step": 2329 + }, + { + "epoch": 0.16321241254914323, + "grad_norm": 4.584096908569336, + "learning_rate": 8.369408056042032e-05, + "loss": 1.0109, + "num_input_tokens_seen": 37498488, + "step": 2330 + }, + { + "epoch": 0.16328246079487246, + "grad_norm": 4.0225958824157715, + "learning_rate": 8.36870823117338e-05, + "loss": 1.1507, + "num_input_tokens_seen": 37514264, + "step": 2331 + }, + { + "epoch": 0.16335250904060172, + "grad_norm": 5.311272144317627, + "learning_rate": 8.368008406304729e-05, + "loss": 1.2248, + "num_input_tokens_seen": 37529280, + "step": 2332 + }, + { + "epoch": 0.16342255728633096, + "grad_norm": 3.752720594406128, + "learning_rate": 8.367308581436077e-05, + "loss": 0.964, + "num_input_tokens_seen": 37545664, + "step": 2333 + }, + { + "epoch": 0.16349260553206021, + "grad_norm": 3.8337442874908447, + "learning_rate": 8.366608756567426e-05, + "loss": 1.1928, + "num_input_tokens_seen": 37562048, + "step": 2334 + }, + { + "epoch": 0.16356265377778945, + "grad_norm": 3.818251132965088, + "learning_rate": 8.365908931698775e-05, + "loss": 1.0032, + "num_input_tokens_seen": 37577848, + "step": 2335 + }, + { + "epoch": 0.1636327020235187, + "grad_norm": 3.7170960903167725, + "learning_rate": 8.365209106830123e-05, + "loss": 1.2297, + "num_input_tokens_seen": 37594232, + "step": 2336 + }, + { + "epoch": 0.16370275026924794, + "grad_norm": 3.984950304031372, + "learning_rate": 8.364509281961472e-05, + "loss": 1.0744, + "num_input_tokens_seen": 37610248, + "step": 2337 + }, + { + "epoch": 0.1637727985149772, + "grad_norm": 3.4384636878967285, + "learning_rate": 8.36380945709282e-05, + "loss": 1.0015, + "num_input_tokens_seen": 37626632, + "step": 2338 + }, + { + "epoch": 0.16384284676070643, + "grad_norm": 3.952625274658203, + "learning_rate": 8.363109632224169e-05, + "loss": 1.1604, + "num_input_tokens_seen": 37643016, + "step": 2339 + }, + { + "epoch": 0.1639128950064357, + "grad_norm": 3.7193119525909424, + "learning_rate": 8.362409807355516e-05, + "loss": 0.9054, + "num_input_tokens_seen": 37658216, + "step": 2340 + }, + { + "epoch": 0.16398294325216492, + "grad_norm": 3.977997303009033, + "learning_rate": 8.361709982486865e-05, + "loss": 1.2904, + "num_input_tokens_seen": 37674600, + "step": 2341 + }, + { + "epoch": 0.16405299149789418, + "grad_norm": 5.108094215393066, + "learning_rate": 8.361010157618214e-05, + "loss": 1.0664, + "num_input_tokens_seen": 37690184, + "step": 2342 + }, + { + "epoch": 0.1641230397436234, + "grad_norm": 4.881065845489502, + "learning_rate": 8.360310332749563e-05, + "loss": 1.0787, + "num_input_tokens_seen": 37705352, + "step": 2343 + }, + { + "epoch": 0.16419308798935267, + "grad_norm": 4.128891468048096, + "learning_rate": 8.359610507880912e-05, + "loss": 0.8745, + "num_input_tokens_seen": 37721736, + "step": 2344 + }, + { + "epoch": 0.1642631362350819, + "grad_norm": 4.006495475769043, + "learning_rate": 8.35891068301226e-05, + "loss": 0.9992, + "num_input_tokens_seen": 37738120, + "step": 2345 + }, + { + "epoch": 0.16433318448081116, + "grad_norm": 3.877427101135254, + "learning_rate": 8.358210858143608e-05, + "loss": 0.9334, + "num_input_tokens_seen": 37754504, + "step": 2346 + }, + { + "epoch": 0.1644032327265404, + "grad_norm": 3.7013916969299316, + "learning_rate": 8.357511033274955e-05, + "loss": 1.069, + "num_input_tokens_seen": 37770792, + "step": 2347 + }, + { + "epoch": 0.16447328097226965, + "grad_norm": 3.675049066543579, + "learning_rate": 8.356811208406304e-05, + "loss": 0.9863, + "num_input_tokens_seen": 37786800, + "step": 2348 + }, + { + "epoch": 0.16454332921799888, + "grad_norm": 4.831826210021973, + "learning_rate": 8.356111383537654e-05, + "loss": 0.9077, + "num_input_tokens_seen": 37801760, + "step": 2349 + }, + { + "epoch": 0.16461337746372814, + "grad_norm": 4.207952499389648, + "learning_rate": 8.355411558669002e-05, + "loss": 1.0585, + "num_input_tokens_seen": 37818144, + "step": 2350 + }, + { + "epoch": 0.16468342570945738, + "grad_norm": 3.9083497524261475, + "learning_rate": 8.354711733800351e-05, + "loss": 1.1437, + "num_input_tokens_seen": 37833896, + "step": 2351 + }, + { + "epoch": 0.16475347395518664, + "grad_norm": 4.307275295257568, + "learning_rate": 8.3540119089317e-05, + "loss": 1.0692, + "num_input_tokens_seen": 37850280, + "step": 2352 + }, + { + "epoch": 0.16482352220091587, + "grad_norm": 3.9434409141540527, + "learning_rate": 8.353312084063047e-05, + "loss": 0.9842, + "num_input_tokens_seen": 37866664, + "step": 2353 + }, + { + "epoch": 0.16489357044664513, + "grad_norm": 4.162476539611816, + "learning_rate": 8.352612259194396e-05, + "loss": 1.2, + "num_input_tokens_seen": 37883048, + "step": 2354 + }, + { + "epoch": 0.1649636186923744, + "grad_norm": 4.3073506355285645, + "learning_rate": 8.351912434325745e-05, + "loss": 1.2625, + "num_input_tokens_seen": 37899264, + "step": 2355 + }, + { + "epoch": 0.16503366693810362, + "grad_norm": 3.9900870323181152, + "learning_rate": 8.351212609457094e-05, + "loss": 1.079, + "num_input_tokens_seen": 37915648, + "step": 2356 + }, + { + "epoch": 0.16510371518383288, + "grad_norm": 3.599282741546631, + "learning_rate": 8.350512784588441e-05, + "loss": 0.9226, + "num_input_tokens_seen": 37932032, + "step": 2357 + }, + { + "epoch": 0.1651737634295621, + "grad_norm": 3.796546697616577, + "learning_rate": 8.34981295971979e-05, + "loss": 0.9095, + "num_input_tokens_seen": 37948416, + "step": 2358 + }, + { + "epoch": 0.16524381167529137, + "grad_norm": 4.0810017585754395, + "learning_rate": 8.349113134851139e-05, + "loss": 0.9083, + "num_input_tokens_seen": 37964072, + "step": 2359 + }, + { + "epoch": 0.1653138599210206, + "grad_norm": 4.155765533447266, + "learning_rate": 8.348413309982487e-05, + "loss": 1.1827, + "num_input_tokens_seen": 37980320, + "step": 2360 + }, + { + "epoch": 0.16538390816674986, + "grad_norm": 4.131893634796143, + "learning_rate": 8.347713485113835e-05, + "loss": 1.1245, + "num_input_tokens_seen": 37995872, + "step": 2361 + }, + { + "epoch": 0.1654539564124791, + "grad_norm": 4.266848564147949, + "learning_rate": 8.347013660245184e-05, + "loss": 1.1084, + "num_input_tokens_seen": 38011856, + "step": 2362 + }, + { + "epoch": 0.16552400465820835, + "grad_norm": 3.8229875564575195, + "learning_rate": 8.346313835376533e-05, + "loss": 1.0592, + "num_input_tokens_seen": 38028080, + "step": 2363 + }, + { + "epoch": 0.16559405290393758, + "grad_norm": 4.0808234214782715, + "learning_rate": 8.345614010507882e-05, + "loss": 0.9667, + "num_input_tokens_seen": 38043992, + "step": 2364 + }, + { + "epoch": 0.16566410114966684, + "grad_norm": 4.470417022705078, + "learning_rate": 8.34491418563923e-05, + "loss": 1.2859, + "num_input_tokens_seen": 38059848, + "step": 2365 + }, + { + "epoch": 0.16573414939539607, + "grad_norm": 3.459963798522949, + "learning_rate": 8.344214360770578e-05, + "loss": 1.0801, + "num_input_tokens_seen": 38076232, + "step": 2366 + }, + { + "epoch": 0.16580419764112533, + "grad_norm": 3.6845312118530273, + "learning_rate": 8.343514535901926e-05, + "loss": 1.1277, + "num_input_tokens_seen": 38092616, + "step": 2367 + }, + { + "epoch": 0.16587424588685457, + "grad_norm": 3.683866500854492, + "learning_rate": 8.342814711033275e-05, + "loss": 1.0821, + "num_input_tokens_seen": 38108880, + "step": 2368 + }, + { + "epoch": 0.16594429413258382, + "grad_norm": 4.3266191482543945, + "learning_rate": 8.342114886164625e-05, + "loss": 1.1432, + "num_input_tokens_seen": 38125264, + "step": 2369 + }, + { + "epoch": 0.16601434237831306, + "grad_norm": 3.9031660556793213, + "learning_rate": 8.341415061295972e-05, + "loss": 1.0378, + "num_input_tokens_seen": 38141648, + "step": 2370 + }, + { + "epoch": 0.16608439062404232, + "grad_norm": 5.415440082550049, + "learning_rate": 8.340715236427321e-05, + "loss": 1.2011, + "num_input_tokens_seen": 38157328, + "step": 2371 + }, + { + "epoch": 0.16615443886977155, + "grad_norm": 4.017500877380371, + "learning_rate": 8.34001541155867e-05, + "loss": 1.0771, + "num_input_tokens_seen": 38173096, + "step": 2372 + }, + { + "epoch": 0.1662244871155008, + "grad_norm": 3.855212926864624, + "learning_rate": 8.339315586690018e-05, + "loss": 1.173, + "num_input_tokens_seen": 38189480, + "step": 2373 + }, + { + "epoch": 0.16629453536123004, + "grad_norm": 3.8502743244171143, + "learning_rate": 8.338615761821365e-05, + "loss": 1.0241, + "num_input_tokens_seen": 38205416, + "step": 2374 + }, + { + "epoch": 0.1663645836069593, + "grad_norm": 6.8746867179870605, + "learning_rate": 8.337915936952715e-05, + "loss": 1.0459, + "num_input_tokens_seen": 38221800, + "step": 2375 + }, + { + "epoch": 0.16643463185268853, + "grad_norm": 3.9708571434020996, + "learning_rate": 8.337216112084064e-05, + "loss": 0.9832, + "num_input_tokens_seen": 38237208, + "step": 2376 + }, + { + "epoch": 0.1665046800984178, + "grad_norm": 4.927229404449463, + "learning_rate": 8.336516287215412e-05, + "loss": 1.1103, + "num_input_tokens_seen": 38253592, + "step": 2377 + }, + { + "epoch": 0.16657472834414702, + "grad_norm": 3.9976963996887207, + "learning_rate": 8.33581646234676e-05, + "loss": 1.1451, + "num_input_tokens_seen": 38269184, + "step": 2378 + }, + { + "epoch": 0.16664477658987628, + "grad_norm": 3.680177927017212, + "learning_rate": 8.33511663747811e-05, + "loss": 1.0602, + "num_input_tokens_seen": 38285568, + "step": 2379 + }, + { + "epoch": 0.1667148248356055, + "grad_norm": 3.768069267272949, + "learning_rate": 8.334416812609457e-05, + "loss": 1.0822, + "num_input_tokens_seen": 38301952, + "step": 2380 + }, + { + "epoch": 0.16678487308133477, + "grad_norm": 4.554010391235352, + "learning_rate": 8.333716987740806e-05, + "loss": 1.3037, + "num_input_tokens_seen": 38318336, + "step": 2381 + }, + { + "epoch": 0.166854921327064, + "grad_norm": 3.6799368858337402, + "learning_rate": 8.333017162872155e-05, + "loss": 1.0152, + "num_input_tokens_seen": 38333544, + "step": 2382 + }, + { + "epoch": 0.16692496957279326, + "grad_norm": 3.5584356784820557, + "learning_rate": 8.332317338003503e-05, + "loss": 0.9617, + "num_input_tokens_seen": 38349632, + "step": 2383 + }, + { + "epoch": 0.1669950178185225, + "grad_norm": 5.978849411010742, + "learning_rate": 8.331617513134851e-05, + "loss": 0.9975, + "num_input_tokens_seen": 38364872, + "step": 2384 + }, + { + "epoch": 0.16706506606425175, + "grad_norm": 4.641121864318848, + "learning_rate": 8.3309176882662e-05, + "loss": 1.0021, + "num_input_tokens_seen": 38379800, + "step": 2385 + }, + { + "epoch": 0.167135114309981, + "grad_norm": 3.895772695541382, + "learning_rate": 8.330217863397549e-05, + "loss": 1.1187, + "num_input_tokens_seen": 38395744, + "step": 2386 + }, + { + "epoch": 0.16720516255571025, + "grad_norm": 3.48437762260437, + "learning_rate": 8.329518038528896e-05, + "loss": 1.0527, + "num_input_tokens_seen": 38412056, + "step": 2387 + }, + { + "epoch": 0.16727521080143948, + "grad_norm": 4.2831549644470215, + "learning_rate": 8.328818213660245e-05, + "loss": 0.8967, + "num_input_tokens_seen": 38426768, + "step": 2388 + }, + { + "epoch": 0.16734525904716874, + "grad_norm": 3.7090001106262207, + "learning_rate": 8.328118388791595e-05, + "loss": 0.9903, + "num_input_tokens_seen": 38442296, + "step": 2389 + }, + { + "epoch": 0.167415307292898, + "grad_norm": 4.253223896026611, + "learning_rate": 8.327418563922943e-05, + "loss": 1.0169, + "num_input_tokens_seen": 38458664, + "step": 2390 + }, + { + "epoch": 0.16748535553862723, + "grad_norm": 4.919910907745361, + "learning_rate": 8.326718739054292e-05, + "loss": 1.233, + "num_input_tokens_seen": 38475048, + "step": 2391 + }, + { + "epoch": 0.1675554037843565, + "grad_norm": 7.881314277648926, + "learning_rate": 8.326018914185639e-05, + "loss": 1.1, + "num_input_tokens_seen": 38491432, + "step": 2392 + }, + { + "epoch": 0.16762545203008572, + "grad_norm": 6.979029655456543, + "learning_rate": 8.325319089316988e-05, + "loss": 0.9189, + "num_input_tokens_seen": 38506312, + "step": 2393 + }, + { + "epoch": 0.16769550027581498, + "grad_norm": 4.141571044921875, + "learning_rate": 8.324619264448336e-05, + "loss": 1.0821, + "num_input_tokens_seen": 38522696, + "step": 2394 + }, + { + "epoch": 0.1677655485215442, + "grad_norm": 4.306760311126709, + "learning_rate": 8.323919439579686e-05, + "loss": 1.1857, + "num_input_tokens_seen": 38539080, + "step": 2395 + }, + { + "epoch": 0.16783559676727347, + "grad_norm": 4.089770793914795, + "learning_rate": 8.323219614711035e-05, + "loss": 1.0994, + "num_input_tokens_seen": 38555464, + "step": 2396 + }, + { + "epoch": 0.1679056450130027, + "grad_norm": 3.648800849914551, + "learning_rate": 8.322519789842382e-05, + "loss": 1.1015, + "num_input_tokens_seen": 38571848, + "step": 2397 + }, + { + "epoch": 0.16797569325873196, + "grad_norm": 4.310317516326904, + "learning_rate": 8.321819964973731e-05, + "loss": 1.171, + "num_input_tokens_seen": 38587616, + "step": 2398 + }, + { + "epoch": 0.1680457415044612, + "grad_norm": 5.373032093048096, + "learning_rate": 8.32112014010508e-05, + "loss": 0.9952, + "num_input_tokens_seen": 38604000, + "step": 2399 + }, + { + "epoch": 0.16811578975019045, + "grad_norm": 3.7830634117126465, + "learning_rate": 8.320420315236427e-05, + "loss": 0.9953, + "num_input_tokens_seen": 38620384, + "step": 2400 + }, + { + "epoch": 0.16811578975019045, + "eval_loss": 1.1429402828216553, + "eval_runtime": 0.205, + "eval_samples_per_second": 4.878, + "eval_steps_per_second": 4.878, + "num_input_tokens_seen": 38620384, + "step": 2400 + }, + { + "epoch": 0.16818583799591968, + "grad_norm": 6.3896684646606445, + "learning_rate": 8.319720490367776e-05, + "loss": 1.184, + "num_input_tokens_seen": 38636288, + "step": 2401 + }, + { + "epoch": 0.16825588624164894, + "grad_norm": 4.178726673126221, + "learning_rate": 8.319020665499125e-05, + "loss": 1.0362, + "num_input_tokens_seen": 38652352, + "step": 2402 + }, + { + "epoch": 0.16832593448737818, + "grad_norm": 3.7572708129882812, + "learning_rate": 8.318320840630474e-05, + "loss": 0.9756, + "num_input_tokens_seen": 38668712, + "step": 2403 + }, + { + "epoch": 0.16839598273310744, + "grad_norm": 3.688552141189575, + "learning_rate": 8.317621015761821e-05, + "loss": 1.0644, + "num_input_tokens_seen": 38685096, + "step": 2404 + }, + { + "epoch": 0.16846603097883667, + "grad_norm": 4.2040510177612305, + "learning_rate": 8.31692119089317e-05, + "loss": 1.1251, + "num_input_tokens_seen": 38701480, + "step": 2405 + }, + { + "epoch": 0.16853607922456593, + "grad_norm": 3.9412119388580322, + "learning_rate": 8.316221366024519e-05, + "loss": 1.0243, + "num_input_tokens_seen": 38716904, + "step": 2406 + }, + { + "epoch": 0.16860612747029516, + "grad_norm": 3.9538826942443848, + "learning_rate": 8.315521541155867e-05, + "loss": 1.0361, + "num_input_tokens_seen": 38733288, + "step": 2407 + }, + { + "epoch": 0.16867617571602442, + "grad_norm": 3.803135871887207, + "learning_rate": 8.314821716287215e-05, + "loss": 1.0684, + "num_input_tokens_seen": 38749672, + "step": 2408 + }, + { + "epoch": 0.16874622396175365, + "grad_norm": 4.323539733886719, + "learning_rate": 8.314121891418564e-05, + "loss": 1.1091, + "num_input_tokens_seen": 38766056, + "step": 2409 + }, + { + "epoch": 0.1688162722074829, + "grad_norm": 3.84000825881958, + "learning_rate": 8.313422066549913e-05, + "loss": 1.0052, + "num_input_tokens_seen": 38782440, + "step": 2410 + }, + { + "epoch": 0.16888632045321214, + "grad_norm": 6.76428747177124, + "learning_rate": 8.312722241681261e-05, + "loss": 1.1773, + "num_input_tokens_seen": 38798824, + "step": 2411 + }, + { + "epoch": 0.1689563686989414, + "grad_norm": 5.8638224601745605, + "learning_rate": 8.31202241681261e-05, + "loss": 0.9515, + "num_input_tokens_seen": 38815112, + "step": 2412 + }, + { + "epoch": 0.16902641694467063, + "grad_norm": 4.254051685333252, + "learning_rate": 8.311322591943958e-05, + "loss": 1.1365, + "num_input_tokens_seen": 38831192, + "step": 2413 + }, + { + "epoch": 0.1690964651903999, + "grad_norm": 3.641663074493408, + "learning_rate": 8.310622767075306e-05, + "loss": 0.9888, + "num_input_tokens_seen": 38847360, + "step": 2414 + }, + { + "epoch": 0.16916651343612912, + "grad_norm": 3.594768762588501, + "learning_rate": 8.309922942206656e-05, + "loss": 1.1156, + "num_input_tokens_seen": 38863744, + "step": 2415 + }, + { + "epoch": 0.16923656168185838, + "grad_norm": 3.6955742835998535, + "learning_rate": 8.309223117338005e-05, + "loss": 0.9514, + "num_input_tokens_seen": 38879880, + "step": 2416 + }, + { + "epoch": 0.16930660992758761, + "grad_norm": 3.64803409576416, + "learning_rate": 8.308523292469353e-05, + "loss": 1.0045, + "num_input_tokens_seen": 38896264, + "step": 2417 + }, + { + "epoch": 0.16937665817331687, + "grad_norm": 3.7921512126922607, + "learning_rate": 8.307823467600701e-05, + "loss": 1.0838, + "num_input_tokens_seen": 38912648, + "step": 2418 + }, + { + "epoch": 0.1694467064190461, + "grad_norm": 4.777346611022949, + "learning_rate": 8.307123642732049e-05, + "loss": 1.239, + "num_input_tokens_seen": 38929032, + "step": 2419 + }, + { + "epoch": 0.16951675466477537, + "grad_norm": 4.417767524719238, + "learning_rate": 8.306423817863398e-05, + "loss": 1.0101, + "num_input_tokens_seen": 38945416, + "step": 2420 + }, + { + "epoch": 0.1695868029105046, + "grad_norm": 4.257672309875488, + "learning_rate": 8.305723992994747e-05, + "loss": 0.8461, + "num_input_tokens_seen": 38961800, + "step": 2421 + }, + { + "epoch": 0.16965685115623386, + "grad_norm": 4.098975658416748, + "learning_rate": 8.305024168126095e-05, + "loss": 1.158, + "num_input_tokens_seen": 38978184, + "step": 2422 + }, + { + "epoch": 0.1697268994019631, + "grad_norm": 5.206361293792725, + "learning_rate": 8.304324343257444e-05, + "loss": 1.041, + "num_input_tokens_seen": 38994568, + "step": 2423 + }, + { + "epoch": 0.16979694764769235, + "grad_norm": 3.638395309448242, + "learning_rate": 8.303624518388792e-05, + "loss": 0.8883, + "num_input_tokens_seen": 39010136, + "step": 2424 + }, + { + "epoch": 0.1698669958934216, + "grad_norm": 3.4154045581817627, + "learning_rate": 8.30292469352014e-05, + "loss": 1.0024, + "num_input_tokens_seen": 39026520, + "step": 2425 + }, + { + "epoch": 0.16993704413915084, + "grad_norm": 3.923617362976074, + "learning_rate": 8.30222486865149e-05, + "loss": 1.1696, + "num_input_tokens_seen": 39042816, + "step": 2426 + }, + { + "epoch": 0.1700070923848801, + "grad_norm": 4.469310760498047, + "learning_rate": 8.301525043782837e-05, + "loss": 1.3424, + "num_input_tokens_seen": 39059040, + "step": 2427 + }, + { + "epoch": 0.17007714063060933, + "grad_norm": 4.111564636230469, + "learning_rate": 8.300825218914186e-05, + "loss": 0.9867, + "num_input_tokens_seen": 39074992, + "step": 2428 + }, + { + "epoch": 0.1701471888763386, + "grad_norm": 3.7809438705444336, + "learning_rate": 8.300125394045535e-05, + "loss": 0.965, + "num_input_tokens_seen": 39090840, + "step": 2429 + }, + { + "epoch": 0.17021723712206782, + "grad_norm": 3.704542875289917, + "learning_rate": 8.299425569176884e-05, + "loss": 1.1784, + "num_input_tokens_seen": 39107136, + "step": 2430 + }, + { + "epoch": 0.17028728536779708, + "grad_norm": 4.356417179107666, + "learning_rate": 8.298725744308231e-05, + "loss": 1.149, + "num_input_tokens_seen": 39123520, + "step": 2431 + }, + { + "epoch": 0.1703573336135263, + "grad_norm": 3.400228500366211, + "learning_rate": 8.29802591943958e-05, + "loss": 0.867, + "num_input_tokens_seen": 39139904, + "step": 2432 + }, + { + "epoch": 0.17042738185925557, + "grad_norm": 4.777987480163574, + "learning_rate": 8.297326094570929e-05, + "loss": 1.1159, + "num_input_tokens_seen": 39156288, + "step": 2433 + }, + { + "epoch": 0.1704974301049848, + "grad_norm": 5.600007057189941, + "learning_rate": 8.296626269702276e-05, + "loss": 0.8863, + "num_input_tokens_seen": 39171928, + "step": 2434 + }, + { + "epoch": 0.17056747835071406, + "grad_norm": 3.72717022895813, + "learning_rate": 8.295926444833627e-05, + "loss": 1.079, + "num_input_tokens_seen": 39188032, + "step": 2435 + }, + { + "epoch": 0.1706375265964433, + "grad_norm": 7.264038562774658, + "learning_rate": 8.295226619964974e-05, + "loss": 0.8546, + "num_input_tokens_seen": 39203816, + "step": 2436 + }, + { + "epoch": 0.17070757484217255, + "grad_norm": 4.103509426116943, + "learning_rate": 8.294526795096323e-05, + "loss": 1.0138, + "num_input_tokens_seen": 39220200, + "step": 2437 + }, + { + "epoch": 0.1707776230879018, + "grad_norm": 3.6456661224365234, + "learning_rate": 8.29382697022767e-05, + "loss": 0.9107, + "num_input_tokens_seen": 39236584, + "step": 2438 + }, + { + "epoch": 0.17084767133363105, + "grad_norm": 3.750075340270996, + "learning_rate": 8.293127145359019e-05, + "loss": 1.0773, + "num_input_tokens_seen": 39252968, + "step": 2439 + }, + { + "epoch": 0.17091771957936028, + "grad_norm": 4.5003581047058105, + "learning_rate": 8.292427320490368e-05, + "loss": 1.1834, + "num_input_tokens_seen": 39269192, + "step": 2440 + }, + { + "epoch": 0.17098776782508954, + "grad_norm": 4.513885498046875, + "learning_rate": 8.291727495621717e-05, + "loss": 1.0337, + "num_input_tokens_seen": 39285576, + "step": 2441 + }, + { + "epoch": 0.17105781607081877, + "grad_norm": 4.220343589782715, + "learning_rate": 8.291027670753066e-05, + "loss": 1.3044, + "num_input_tokens_seen": 39300864, + "step": 2442 + }, + { + "epoch": 0.17112786431654803, + "grad_norm": 4.986631393432617, + "learning_rate": 8.290327845884415e-05, + "loss": 1.0377, + "num_input_tokens_seen": 39317208, + "step": 2443 + }, + { + "epoch": 0.17119791256227726, + "grad_norm": 7.632670879364014, + "learning_rate": 8.289628021015762e-05, + "loss": 1.1749, + "num_input_tokens_seen": 39332392, + "step": 2444 + }, + { + "epoch": 0.17126796080800652, + "grad_norm": 3.588841199874878, + "learning_rate": 8.288928196147111e-05, + "loss": 0.8124, + "num_input_tokens_seen": 39348600, + "step": 2445 + }, + { + "epoch": 0.17133800905373575, + "grad_norm": 4.311728477478027, + "learning_rate": 8.288228371278459e-05, + "loss": 1.035, + "num_input_tokens_seen": 39364456, + "step": 2446 + }, + { + "epoch": 0.171408057299465, + "grad_norm": 6.236140251159668, + "learning_rate": 8.287528546409807e-05, + "loss": 1.1243, + "num_input_tokens_seen": 39379496, + "step": 2447 + }, + { + "epoch": 0.17147810554519424, + "grad_norm": 4.228808403015137, + "learning_rate": 8.286828721541156e-05, + "loss": 1.0185, + "num_input_tokens_seen": 39395880, + "step": 2448 + }, + { + "epoch": 0.1715481537909235, + "grad_norm": 3.873366117477417, + "learning_rate": 8.286128896672505e-05, + "loss": 0.9684, + "num_input_tokens_seen": 39412264, + "step": 2449 + }, + { + "epoch": 0.17161820203665273, + "grad_norm": 3.797846794128418, + "learning_rate": 8.285429071803854e-05, + "loss": 1.0562, + "num_input_tokens_seen": 39428648, + "step": 2450 + }, + { + "epoch": 0.171688250282382, + "grad_norm": 3.798875093460083, + "learning_rate": 8.284729246935202e-05, + "loss": 1.0409, + "num_input_tokens_seen": 39445032, + "step": 2451 + }, + { + "epoch": 0.17175829852811123, + "grad_norm": 5.118900299072266, + "learning_rate": 8.28402942206655e-05, + "loss": 1.14, + "num_input_tokens_seen": 39460168, + "step": 2452 + }, + { + "epoch": 0.17182834677384048, + "grad_norm": 4.157371520996094, + "learning_rate": 8.283329597197899e-05, + "loss": 1.1676, + "num_input_tokens_seen": 39476544, + "step": 2453 + }, + { + "epoch": 0.17189839501956972, + "grad_norm": 3.760786771774292, + "learning_rate": 8.282629772329247e-05, + "loss": 1.0482, + "num_input_tokens_seen": 39492928, + "step": 2454 + }, + { + "epoch": 0.17196844326529898, + "grad_norm": 4.252779960632324, + "learning_rate": 8.281929947460596e-05, + "loss": 1.1538, + "num_input_tokens_seen": 39509312, + "step": 2455 + }, + { + "epoch": 0.1720384915110282, + "grad_norm": 4.374740123748779, + "learning_rate": 8.281230122591944e-05, + "loss": 1.0132, + "num_input_tokens_seen": 39525696, + "step": 2456 + }, + { + "epoch": 0.17210853975675747, + "grad_norm": 4.460380554199219, + "learning_rate": 8.280530297723293e-05, + "loss": 1.1876, + "num_input_tokens_seen": 39541864, + "step": 2457 + }, + { + "epoch": 0.17217858800248673, + "grad_norm": 4.22148323059082, + "learning_rate": 8.279830472854641e-05, + "loss": 1.11, + "num_input_tokens_seen": 39557944, + "step": 2458 + }, + { + "epoch": 0.17224863624821596, + "grad_norm": 4.310081481933594, + "learning_rate": 8.27913064798599e-05, + "loss": 1.0506, + "num_input_tokens_seen": 39574328, + "step": 2459 + }, + { + "epoch": 0.17231868449394522, + "grad_norm": 4.15192174911499, + "learning_rate": 8.278430823117339e-05, + "loss": 0.7793, + "num_input_tokens_seen": 39589312, + "step": 2460 + }, + { + "epoch": 0.17238873273967445, + "grad_norm": 4.6561455726623535, + "learning_rate": 8.277730998248687e-05, + "loss": 1.2239, + "num_input_tokens_seen": 39605456, + "step": 2461 + }, + { + "epoch": 0.1724587809854037, + "grad_norm": 4.273087978363037, + "learning_rate": 8.277031173380036e-05, + "loss": 1.1436, + "num_input_tokens_seen": 39621840, + "step": 2462 + }, + { + "epoch": 0.17252882923113294, + "grad_norm": 4.575830459594727, + "learning_rate": 8.276331348511384e-05, + "loss": 1.2589, + "num_input_tokens_seen": 39638224, + "step": 2463 + }, + { + "epoch": 0.1725988774768622, + "grad_norm": 3.9122824668884277, + "learning_rate": 8.275631523642733e-05, + "loss": 1.0634, + "num_input_tokens_seen": 39654608, + "step": 2464 + }, + { + "epoch": 0.17266892572259143, + "grad_norm": 4.991362571716309, + "learning_rate": 8.27493169877408e-05, + "loss": 1.2077, + "num_input_tokens_seen": 39669824, + "step": 2465 + }, + { + "epoch": 0.1727389739683207, + "grad_norm": 4.688175678253174, + "learning_rate": 8.274231873905429e-05, + "loss": 1.0955, + "num_input_tokens_seen": 39686208, + "step": 2466 + }, + { + "epoch": 0.17280902221404992, + "grad_norm": 3.779524087905884, + "learning_rate": 8.273532049036778e-05, + "loss": 1.004, + "num_input_tokens_seen": 39702336, + "step": 2467 + }, + { + "epoch": 0.17287907045977918, + "grad_norm": 4.117679595947266, + "learning_rate": 8.272832224168127e-05, + "loss": 1.0321, + "num_input_tokens_seen": 39718232, + "step": 2468 + }, + { + "epoch": 0.17294911870550841, + "grad_norm": 3.810084819793701, + "learning_rate": 8.272132399299476e-05, + "loss": 1.0325, + "num_input_tokens_seen": 39733584, + "step": 2469 + }, + { + "epoch": 0.17301916695123767, + "grad_norm": 3.7730038166046143, + "learning_rate": 8.271432574430824e-05, + "loss": 0.9207, + "num_input_tokens_seen": 39749968, + "step": 2470 + }, + { + "epoch": 0.1730892151969669, + "grad_norm": 7.299304008483887, + "learning_rate": 8.270732749562172e-05, + "loss": 1.3425, + "num_input_tokens_seen": 39765552, + "step": 2471 + }, + { + "epoch": 0.17315926344269617, + "grad_norm": 4.079380512237549, + "learning_rate": 8.270032924693521e-05, + "loss": 1.0336, + "num_input_tokens_seen": 39781936, + "step": 2472 + }, + { + "epoch": 0.1732293116884254, + "grad_norm": 3.736607789993286, + "learning_rate": 8.269333099824868e-05, + "loss": 1.0126, + "num_input_tokens_seen": 39797688, + "step": 2473 + }, + { + "epoch": 0.17329935993415466, + "grad_norm": 5.587291240692139, + "learning_rate": 8.268633274956217e-05, + "loss": 1.1422, + "num_input_tokens_seen": 39814072, + "step": 2474 + }, + { + "epoch": 0.1733694081798839, + "grad_norm": 3.5963592529296875, + "learning_rate": 8.267933450087566e-05, + "loss": 0.9947, + "num_input_tokens_seen": 39830456, + "step": 2475 + }, + { + "epoch": 0.17343945642561315, + "grad_norm": 5.241317272186279, + "learning_rate": 8.267233625218915e-05, + "loss": 1.0661, + "num_input_tokens_seen": 39846728, + "step": 2476 + }, + { + "epoch": 0.17350950467134238, + "grad_norm": 4.194108009338379, + "learning_rate": 8.266533800350264e-05, + "loss": 1.1659, + "num_input_tokens_seen": 39863112, + "step": 2477 + }, + { + "epoch": 0.17357955291707164, + "grad_norm": 4.698538780212402, + "learning_rate": 8.265833975481611e-05, + "loss": 1.3673, + "num_input_tokens_seen": 39878624, + "step": 2478 + }, + { + "epoch": 0.17364960116280087, + "grad_norm": 5.960018634796143, + "learning_rate": 8.26513415061296e-05, + "loss": 1.104, + "num_input_tokens_seen": 39894944, + "step": 2479 + }, + { + "epoch": 0.17371964940853013, + "grad_norm": 4.386090278625488, + "learning_rate": 8.264434325744309e-05, + "loss": 1.1284, + "num_input_tokens_seen": 39911040, + "step": 2480 + }, + { + "epoch": 0.17378969765425936, + "grad_norm": 3.7272467613220215, + "learning_rate": 8.263734500875658e-05, + "loss": 1.1066, + "num_input_tokens_seen": 39927408, + "step": 2481 + }, + { + "epoch": 0.17385974589998862, + "grad_norm": 4.296888828277588, + "learning_rate": 8.263034676007005e-05, + "loss": 1.1014, + "num_input_tokens_seen": 39943792, + "step": 2482 + }, + { + "epoch": 0.17392979414571785, + "grad_norm": 5.469056606292725, + "learning_rate": 8.262334851138354e-05, + "loss": 1.1672, + "num_input_tokens_seen": 39958176, + "step": 2483 + }, + { + "epoch": 0.1739998423914471, + "grad_norm": 5.6080498695373535, + "learning_rate": 8.261635026269703e-05, + "loss": 1.2713, + "num_input_tokens_seen": 39973592, + "step": 2484 + }, + { + "epoch": 0.17406989063717634, + "grad_norm": 3.6164181232452393, + "learning_rate": 8.26093520140105e-05, + "loss": 0.9019, + "num_input_tokens_seen": 39989792, + "step": 2485 + }, + { + "epoch": 0.1741399388829056, + "grad_norm": 3.757291078567505, + "learning_rate": 8.2602353765324e-05, + "loss": 1.1038, + "num_input_tokens_seen": 40005672, + "step": 2486 + }, + { + "epoch": 0.17420998712863484, + "grad_norm": 5.1490559577941895, + "learning_rate": 8.259535551663748e-05, + "loss": 1.1524, + "num_input_tokens_seen": 40021816, + "step": 2487 + }, + { + "epoch": 0.1742800353743641, + "grad_norm": 3.9055886268615723, + "learning_rate": 8.258835726795097e-05, + "loss": 1.1, + "num_input_tokens_seen": 40038200, + "step": 2488 + }, + { + "epoch": 0.17435008362009333, + "grad_norm": 5.496553897857666, + "learning_rate": 8.258135901926446e-05, + "loss": 1.3214, + "num_input_tokens_seen": 40054584, + "step": 2489 + }, + { + "epoch": 0.1744201318658226, + "grad_norm": 4.069197177886963, + "learning_rate": 8.257436077057793e-05, + "loss": 0.888, + "num_input_tokens_seen": 40070968, + "step": 2490 + }, + { + "epoch": 0.17449018011155182, + "grad_norm": 5.098565101623535, + "learning_rate": 8.256736252189142e-05, + "loss": 0.9918, + "num_input_tokens_seen": 40087352, + "step": 2491 + }, + { + "epoch": 0.17456022835728108, + "grad_norm": 4.083621025085449, + "learning_rate": 8.25603642732049e-05, + "loss": 0.9506, + "num_input_tokens_seen": 40103736, + "step": 2492 + }, + { + "epoch": 0.17463027660301034, + "grad_norm": 3.8676462173461914, + "learning_rate": 8.255336602451839e-05, + "loss": 1.0746, + "num_input_tokens_seen": 40120120, + "step": 2493 + }, + { + "epoch": 0.17470032484873957, + "grad_norm": 3.8799197673797607, + "learning_rate": 8.254636777583188e-05, + "loss": 1.0207, + "num_input_tokens_seen": 40136504, + "step": 2494 + }, + { + "epoch": 0.17477037309446883, + "grad_norm": 5.469006538391113, + "learning_rate": 8.253936952714536e-05, + "loss": 1.0081, + "num_input_tokens_seen": 40152888, + "step": 2495 + }, + { + "epoch": 0.17484042134019806, + "grad_norm": 4.163306713104248, + "learning_rate": 8.253237127845885e-05, + "loss": 1.2059, + "num_input_tokens_seen": 40169272, + "step": 2496 + }, + { + "epoch": 0.17491046958592732, + "grad_norm": 3.792062282562256, + "learning_rate": 8.252537302977234e-05, + "loss": 1.0806, + "num_input_tokens_seen": 40185656, + "step": 2497 + }, + { + "epoch": 0.17498051783165655, + "grad_norm": 3.6881046295166016, + "learning_rate": 8.251837478108582e-05, + "loss": 1.1557, + "num_input_tokens_seen": 40202040, + "step": 2498 + }, + { + "epoch": 0.1750505660773858, + "grad_norm": 3.818491220474243, + "learning_rate": 8.25113765323993e-05, + "loss": 1.2193, + "num_input_tokens_seen": 40218424, + "step": 2499 + }, + { + "epoch": 0.17512061432311504, + "grad_norm": 3.77933931350708, + "learning_rate": 8.250437828371278e-05, + "loss": 1.0508, + "num_input_tokens_seen": 40234216, + "step": 2500 + }, + { + "epoch": 0.1751906625688443, + "grad_norm": 4.106552600860596, + "learning_rate": 8.249738003502628e-05, + "loss": 0.8558, + "num_input_tokens_seen": 40250368, + "step": 2501 + }, + { + "epoch": 0.17526071081457353, + "grad_norm": 4.9382710456848145, + "learning_rate": 8.249038178633976e-05, + "loss": 1.3082, + "num_input_tokens_seen": 40266600, + "step": 2502 + }, + { + "epoch": 0.1753307590603028, + "grad_norm": 3.8894200325012207, + "learning_rate": 8.248338353765325e-05, + "loss": 1.035, + "num_input_tokens_seen": 40282984, + "step": 2503 + }, + { + "epoch": 0.17540080730603202, + "grad_norm": 3.793044328689575, + "learning_rate": 8.247638528896673e-05, + "loss": 1.0376, + "num_input_tokens_seen": 40299368, + "step": 2504 + }, + { + "epoch": 0.17547085555176128, + "grad_norm": 4.874731540679932, + "learning_rate": 8.246938704028021e-05, + "loss": 1.2598, + "num_input_tokens_seen": 40315752, + "step": 2505 + }, + { + "epoch": 0.17554090379749052, + "grad_norm": 3.908191680908203, + "learning_rate": 8.24623887915937e-05, + "loss": 0.9739, + "num_input_tokens_seen": 40332136, + "step": 2506 + }, + { + "epoch": 0.17561095204321978, + "grad_norm": 3.585002899169922, + "learning_rate": 8.245539054290719e-05, + "loss": 0.9736, + "num_input_tokens_seen": 40348520, + "step": 2507 + }, + { + "epoch": 0.175681000288949, + "grad_norm": 3.9742348194122314, + "learning_rate": 8.244839229422068e-05, + "loss": 1.0278, + "num_input_tokens_seen": 40364760, + "step": 2508 + }, + { + "epoch": 0.17575104853467827, + "grad_norm": 5.1725921630859375, + "learning_rate": 8.244139404553415e-05, + "loss": 1.1488, + "num_input_tokens_seen": 40380072, + "step": 2509 + }, + { + "epoch": 0.1758210967804075, + "grad_norm": 4.038326263427734, + "learning_rate": 8.243439579684764e-05, + "loss": 1.2252, + "num_input_tokens_seen": 40395472, + "step": 2510 + }, + { + "epoch": 0.17589114502613676, + "grad_norm": 3.7381017208099365, + "learning_rate": 8.242739754816113e-05, + "loss": 1.041, + "num_input_tokens_seen": 40411280, + "step": 2511 + }, + { + "epoch": 0.175961193271866, + "grad_norm": 4.327959060668945, + "learning_rate": 8.24203992994746e-05, + "loss": 1.0272, + "num_input_tokens_seen": 40427664, + "step": 2512 + }, + { + "epoch": 0.17603124151759525, + "grad_norm": 3.720078706741333, + "learning_rate": 8.241340105078809e-05, + "loss": 1.2306, + "num_input_tokens_seen": 40443760, + "step": 2513 + }, + { + "epoch": 0.17610128976332448, + "grad_norm": 4.0901618003845215, + "learning_rate": 8.240640280210158e-05, + "loss": 1.0098, + "num_input_tokens_seen": 40460144, + "step": 2514 + }, + { + "epoch": 0.17617133800905374, + "grad_norm": 4.013705730438232, + "learning_rate": 8.239940455341507e-05, + "loss": 1.0817, + "num_input_tokens_seen": 40476528, + "step": 2515 + }, + { + "epoch": 0.17624138625478297, + "grad_norm": 3.8833489418029785, + "learning_rate": 8.239240630472856e-05, + "loss": 1.119, + "num_input_tokens_seen": 40492768, + "step": 2516 + }, + { + "epoch": 0.17631143450051223, + "grad_norm": 7.381611347198486, + "learning_rate": 8.238540805604203e-05, + "loss": 1.3033, + "num_input_tokens_seen": 40507344, + "step": 2517 + }, + { + "epoch": 0.17638148274624146, + "grad_norm": 3.8792364597320557, + "learning_rate": 8.237840980735552e-05, + "loss": 1.1113, + "num_input_tokens_seen": 40523552, + "step": 2518 + }, + { + "epoch": 0.17645153099197072, + "grad_norm": 5.19634485244751, + "learning_rate": 8.2371411558669e-05, + "loss": 1.2186, + "num_input_tokens_seen": 40538640, + "step": 2519 + }, + { + "epoch": 0.17652157923769995, + "grad_norm": 4.081907749176025, + "learning_rate": 8.236441330998248e-05, + "loss": 1.1075, + "num_input_tokens_seen": 40555024, + "step": 2520 + }, + { + "epoch": 0.17659162748342921, + "grad_norm": 4.296377182006836, + "learning_rate": 8.235741506129599e-05, + "loss": 0.9319, + "num_input_tokens_seen": 40570480, + "step": 2521 + }, + { + "epoch": 0.17666167572915845, + "grad_norm": 4.143492221832275, + "learning_rate": 8.235041681260946e-05, + "loss": 0.964, + "num_input_tokens_seen": 40586664, + "step": 2522 + }, + { + "epoch": 0.1767317239748877, + "grad_norm": 3.9894590377807617, + "learning_rate": 8.234341856392295e-05, + "loss": 0.913, + "num_input_tokens_seen": 40603048, + "step": 2523 + }, + { + "epoch": 0.17680177222061694, + "grad_norm": 4.283662796020508, + "learning_rate": 8.233642031523644e-05, + "loss": 1.0709, + "num_input_tokens_seen": 40618440, + "step": 2524 + }, + { + "epoch": 0.1768718204663462, + "grad_norm": 4.126082420349121, + "learning_rate": 8.232942206654991e-05, + "loss": 1.1371, + "num_input_tokens_seen": 40634824, + "step": 2525 + }, + { + "epoch": 0.17694186871207543, + "grad_norm": 4.252981662750244, + "learning_rate": 8.23224238178634e-05, + "loss": 1.0351, + "num_input_tokens_seen": 40650640, + "step": 2526 + }, + { + "epoch": 0.1770119169578047, + "grad_norm": 3.768542528152466, + "learning_rate": 8.231542556917689e-05, + "loss": 0.8221, + "num_input_tokens_seen": 40667000, + "step": 2527 + }, + { + "epoch": 0.17708196520353395, + "grad_norm": 4.067849636077881, + "learning_rate": 8.230842732049038e-05, + "loss": 1.2117, + "num_input_tokens_seen": 40683288, + "step": 2528 + }, + { + "epoch": 0.17715201344926318, + "grad_norm": 4.7552995681762695, + "learning_rate": 8.230142907180385e-05, + "loss": 1.0001, + "num_input_tokens_seen": 40699304, + "step": 2529 + }, + { + "epoch": 0.17722206169499244, + "grad_norm": 4.099888324737549, + "learning_rate": 8.229443082311734e-05, + "loss": 1.3335, + "num_input_tokens_seen": 40715688, + "step": 2530 + }, + { + "epoch": 0.17729210994072167, + "grad_norm": 4.219737529754639, + "learning_rate": 8.228743257443083e-05, + "loss": 1.3004, + "num_input_tokens_seen": 40731640, + "step": 2531 + }, + { + "epoch": 0.17736215818645093, + "grad_norm": 4.125600337982178, + "learning_rate": 8.22804343257443e-05, + "loss": 1.1828, + "num_input_tokens_seen": 40747664, + "step": 2532 + }, + { + "epoch": 0.17743220643218016, + "grad_norm": 3.7761423587799072, + "learning_rate": 8.22734360770578e-05, + "loss": 1.1082, + "num_input_tokens_seen": 40764048, + "step": 2533 + }, + { + "epoch": 0.17750225467790942, + "grad_norm": 5.0669026374816895, + "learning_rate": 8.226643782837128e-05, + "loss": 1.1434, + "num_input_tokens_seen": 40779160, + "step": 2534 + }, + { + "epoch": 0.17757230292363865, + "grad_norm": 4.688200950622559, + "learning_rate": 8.225943957968477e-05, + "loss": 1.2135, + "num_input_tokens_seen": 40795416, + "step": 2535 + }, + { + "epoch": 0.1776423511693679, + "grad_norm": 3.62204647064209, + "learning_rate": 8.225244133099825e-05, + "loss": 1.0816, + "num_input_tokens_seen": 40811800, + "step": 2536 + }, + { + "epoch": 0.17771239941509714, + "grad_norm": 4.086390495300293, + "learning_rate": 8.224544308231174e-05, + "loss": 1.067, + "num_input_tokens_seen": 40826960, + "step": 2537 + }, + { + "epoch": 0.1777824476608264, + "grad_norm": 5.574249744415283, + "learning_rate": 8.223844483362522e-05, + "loss": 1.2678, + "num_input_tokens_seen": 40843344, + "step": 2538 + }, + { + "epoch": 0.17785249590655564, + "grad_norm": 3.9721264839172363, + "learning_rate": 8.22314465849387e-05, + "loss": 1.0381, + "num_input_tokens_seen": 40859448, + "step": 2539 + }, + { + "epoch": 0.1779225441522849, + "grad_norm": 4.220152854919434, + "learning_rate": 8.222444833625219e-05, + "loss": 1.1014, + "num_input_tokens_seen": 40875128, + "step": 2540 + }, + { + "epoch": 0.17799259239801413, + "grad_norm": 3.905205011367798, + "learning_rate": 8.221745008756569e-05, + "loss": 0.9568, + "num_input_tokens_seen": 40890624, + "step": 2541 + }, + { + "epoch": 0.1780626406437434, + "grad_norm": 4.114316463470459, + "learning_rate": 8.221045183887917e-05, + "loss": 0.9885, + "num_input_tokens_seen": 40905624, + "step": 2542 + }, + { + "epoch": 0.17813268888947262, + "grad_norm": 4.810879230499268, + "learning_rate": 8.220345359019265e-05, + "loss": 0.9447, + "num_input_tokens_seen": 40922008, + "step": 2543 + }, + { + "epoch": 0.17820273713520188, + "grad_norm": 4.224065780639648, + "learning_rate": 8.219645534150613e-05, + "loss": 1.1176, + "num_input_tokens_seen": 40938392, + "step": 2544 + }, + { + "epoch": 0.1782727853809311, + "grad_norm": 3.7086703777313232, + "learning_rate": 8.218945709281962e-05, + "loss": 0.8931, + "num_input_tokens_seen": 40954776, + "step": 2545 + }, + { + "epoch": 0.17834283362666037, + "grad_norm": 4.346426963806152, + "learning_rate": 8.218245884413309e-05, + "loss": 0.9808, + "num_input_tokens_seen": 40971160, + "step": 2546 + }, + { + "epoch": 0.1784128818723896, + "grad_norm": 3.9295589923858643, + "learning_rate": 8.21754605954466e-05, + "loss": 1.1054, + "num_input_tokens_seen": 40987544, + "step": 2547 + }, + { + "epoch": 0.17848293011811886, + "grad_norm": 4.224534034729004, + "learning_rate": 8.216846234676008e-05, + "loss": 1.1131, + "num_input_tokens_seen": 41002816, + "step": 2548 + }, + { + "epoch": 0.1785529783638481, + "grad_norm": 3.940401315689087, + "learning_rate": 8.216146409807356e-05, + "loss": 1.1551, + "num_input_tokens_seen": 41018560, + "step": 2549 + }, + { + "epoch": 0.17862302660957735, + "grad_norm": 4.010072231292725, + "learning_rate": 8.215446584938705e-05, + "loss": 1.0915, + "num_input_tokens_seen": 41033976, + "step": 2550 + }, + { + "epoch": 0.17869307485530658, + "grad_norm": 4.192416191101074, + "learning_rate": 8.214746760070054e-05, + "loss": 1.0954, + "num_input_tokens_seen": 41049384, + "step": 2551 + }, + { + "epoch": 0.17876312310103584, + "grad_norm": 3.765962600708008, + "learning_rate": 8.214046935201401e-05, + "loss": 1.1029, + "num_input_tokens_seen": 41065528, + "step": 2552 + }, + { + "epoch": 0.17883317134676507, + "grad_norm": 3.7856082916259766, + "learning_rate": 8.21334711033275e-05, + "loss": 1.1063, + "num_input_tokens_seen": 41081912, + "step": 2553 + }, + { + "epoch": 0.17890321959249433, + "grad_norm": 4.845935821533203, + "learning_rate": 8.212647285464099e-05, + "loss": 1.2907, + "num_input_tokens_seen": 41098056, + "step": 2554 + }, + { + "epoch": 0.17897326783822357, + "grad_norm": 4.835206985473633, + "learning_rate": 8.211947460595448e-05, + "loss": 1.0591, + "num_input_tokens_seen": 41114376, + "step": 2555 + }, + { + "epoch": 0.17904331608395282, + "grad_norm": 3.9637155532836914, + "learning_rate": 8.211247635726795e-05, + "loss": 1.1689, + "num_input_tokens_seen": 41130760, + "step": 2556 + }, + { + "epoch": 0.17911336432968206, + "grad_norm": 3.5001652240753174, + "learning_rate": 8.210547810858144e-05, + "loss": 0.9798, + "num_input_tokens_seen": 41147040, + "step": 2557 + }, + { + "epoch": 0.17918341257541132, + "grad_norm": 5.54505729675293, + "learning_rate": 8.209847985989493e-05, + "loss": 1.3004, + "num_input_tokens_seen": 41163312, + "step": 2558 + }, + { + "epoch": 0.17925346082114055, + "grad_norm": 4.122933387756348, + "learning_rate": 8.20914816112084e-05, + "loss": 1.1754, + "num_input_tokens_seen": 41179632, + "step": 2559 + }, + { + "epoch": 0.1793235090668698, + "grad_norm": 4.166035175323486, + "learning_rate": 8.208448336252189e-05, + "loss": 1.0022, + "num_input_tokens_seen": 41196000, + "step": 2560 + }, + { + "epoch": 0.17939355731259904, + "grad_norm": 4.129281520843506, + "learning_rate": 8.20774851138354e-05, + "loss": 1.2342, + "num_input_tokens_seen": 41211944, + "step": 2561 + }, + { + "epoch": 0.1794636055583283, + "grad_norm": 3.9011406898498535, + "learning_rate": 8.207048686514887e-05, + "loss": 1.0238, + "num_input_tokens_seen": 41227680, + "step": 2562 + }, + { + "epoch": 0.17953365380405756, + "grad_norm": 3.717945098876953, + "learning_rate": 8.206348861646234e-05, + "loss": 0.9601, + "num_input_tokens_seen": 41244064, + "step": 2563 + }, + { + "epoch": 0.1796037020497868, + "grad_norm": 5.05475378036499, + "learning_rate": 8.205649036777583e-05, + "loss": 1.1192, + "num_input_tokens_seen": 41260448, + "step": 2564 + }, + { + "epoch": 0.17967375029551605, + "grad_norm": 4.52910041809082, + "learning_rate": 8.204949211908932e-05, + "loss": 0.9443, + "num_input_tokens_seen": 41276832, + "step": 2565 + }, + { + "epoch": 0.17974379854124528, + "grad_norm": 4.6492695808410645, + "learning_rate": 8.20424938704028e-05, + "loss": 1.0729, + "num_input_tokens_seen": 41293216, + "step": 2566 + }, + { + "epoch": 0.17981384678697454, + "grad_norm": 4.7587456703186035, + "learning_rate": 8.20354956217163e-05, + "loss": 0.9702, + "num_input_tokens_seen": 41309600, + "step": 2567 + }, + { + "epoch": 0.17988389503270377, + "grad_norm": 6.8467817306518555, + "learning_rate": 8.202849737302979e-05, + "loss": 1.1385, + "num_input_tokens_seen": 41325984, + "step": 2568 + }, + { + "epoch": 0.17995394327843303, + "grad_norm": 3.7771074771881104, + "learning_rate": 8.202149912434326e-05, + "loss": 1.1603, + "num_input_tokens_seen": 41342368, + "step": 2569 + }, + { + "epoch": 0.18002399152416226, + "grad_norm": 3.8494906425476074, + "learning_rate": 8.201450087565675e-05, + "loss": 1.056, + "num_input_tokens_seen": 41357992, + "step": 2570 + }, + { + "epoch": 0.18009403976989152, + "grad_norm": 4.079790115356445, + "learning_rate": 8.200750262697023e-05, + "loss": 1.1159, + "num_input_tokens_seen": 41374256, + "step": 2571 + }, + { + "epoch": 0.18016408801562075, + "grad_norm": 7.093918800354004, + "learning_rate": 8.200050437828371e-05, + "loss": 1.1756, + "num_input_tokens_seen": 41388728, + "step": 2572 + }, + { + "epoch": 0.18023413626135001, + "grad_norm": 4.636250972747803, + "learning_rate": 8.19935061295972e-05, + "loss": 1.1599, + "num_input_tokens_seen": 41404488, + "step": 2573 + }, + { + "epoch": 0.18030418450707925, + "grad_norm": 3.789625644683838, + "learning_rate": 8.198650788091069e-05, + "loss": 1.162, + "num_input_tokens_seen": 41420200, + "step": 2574 + }, + { + "epoch": 0.1803742327528085, + "grad_norm": 3.849637508392334, + "learning_rate": 8.197950963222418e-05, + "loss": 1.1399, + "num_input_tokens_seen": 41436496, + "step": 2575 + }, + { + "epoch": 0.18044428099853774, + "grad_norm": 3.6819775104522705, + "learning_rate": 8.197251138353766e-05, + "loss": 1.1467, + "num_input_tokens_seen": 41452736, + "step": 2576 + }, + { + "epoch": 0.180514329244267, + "grad_norm": 4.505229473114014, + "learning_rate": 8.196551313485114e-05, + "loss": 1.0336, + "num_input_tokens_seen": 41468976, + "step": 2577 + }, + { + "epoch": 0.18058437748999623, + "grad_norm": 5.465007781982422, + "learning_rate": 8.195851488616463e-05, + "loss": 0.983, + "num_input_tokens_seen": 41485064, + "step": 2578 + }, + { + "epoch": 0.1806544257357255, + "grad_norm": 3.993953227996826, + "learning_rate": 8.195151663747811e-05, + "loss": 1.3406, + "num_input_tokens_seen": 41501448, + "step": 2579 + }, + { + "epoch": 0.18072447398145472, + "grad_norm": 5.29327392578125, + "learning_rate": 8.19445183887916e-05, + "loss": 1.2397, + "num_input_tokens_seen": 41517832, + "step": 2580 + }, + { + "epoch": 0.18079452222718398, + "grad_norm": 4.132434844970703, + "learning_rate": 8.193752014010508e-05, + "loss": 1.2522, + "num_input_tokens_seen": 41532976, + "step": 2581 + }, + { + "epoch": 0.1808645704729132, + "grad_norm": 5.620279788970947, + "learning_rate": 8.193052189141857e-05, + "loss": 1.06, + "num_input_tokens_seen": 41548784, + "step": 2582 + }, + { + "epoch": 0.18093461871864247, + "grad_norm": 3.9721081256866455, + "learning_rate": 8.192352364273205e-05, + "loss": 1.0458, + "num_input_tokens_seen": 41565168, + "step": 2583 + }, + { + "epoch": 0.1810046669643717, + "grad_norm": 5.015312194824219, + "learning_rate": 8.191652539404554e-05, + "loss": 0.9813, + "num_input_tokens_seen": 41580584, + "step": 2584 + }, + { + "epoch": 0.18107471521010096, + "grad_norm": 5.385783672332764, + "learning_rate": 8.190952714535903e-05, + "loss": 1.0853, + "num_input_tokens_seen": 41596656, + "step": 2585 + }, + { + "epoch": 0.1811447634558302, + "grad_norm": 4.1005120277404785, + "learning_rate": 8.19025288966725e-05, + "loss": 1.0509, + "num_input_tokens_seen": 41611752, + "step": 2586 + }, + { + "epoch": 0.18121481170155945, + "grad_norm": 3.6853153705596924, + "learning_rate": 8.1895530647986e-05, + "loss": 1.0736, + "num_input_tokens_seen": 41627408, + "step": 2587 + }, + { + "epoch": 0.18128485994728868, + "grad_norm": 3.7818400859832764, + "learning_rate": 8.188853239929949e-05, + "loss": 1.1182, + "num_input_tokens_seen": 41643792, + "step": 2588 + }, + { + "epoch": 0.18135490819301794, + "grad_norm": 4.564868450164795, + "learning_rate": 8.188153415061297e-05, + "loss": 1.1408, + "num_input_tokens_seen": 41658768, + "step": 2589 + }, + { + "epoch": 0.18142495643874718, + "grad_norm": 4.092021465301514, + "learning_rate": 8.187453590192644e-05, + "loss": 1.0978, + "num_input_tokens_seen": 41675088, + "step": 2590 + }, + { + "epoch": 0.18149500468447644, + "grad_norm": 5.051564693450928, + "learning_rate": 8.186753765323993e-05, + "loss": 0.9746, + "num_input_tokens_seen": 41690376, + "step": 2591 + }, + { + "epoch": 0.18156505293020567, + "grad_norm": 3.5786261558532715, + "learning_rate": 8.186053940455342e-05, + "loss": 0.9638, + "num_input_tokens_seen": 41706760, + "step": 2592 + }, + { + "epoch": 0.18163510117593493, + "grad_norm": 4.11420202255249, + "learning_rate": 8.185354115586691e-05, + "loss": 1.1234, + "num_input_tokens_seen": 41721760, + "step": 2593 + }, + { + "epoch": 0.18170514942166416, + "grad_norm": 4.445348262786865, + "learning_rate": 8.18465429071804e-05, + "loss": 1.0846, + "num_input_tokens_seen": 41737640, + "step": 2594 + }, + { + "epoch": 0.18177519766739342, + "grad_norm": 5.705301284790039, + "learning_rate": 8.183954465849388e-05, + "loss": 1.2254, + "num_input_tokens_seen": 41753784, + "step": 2595 + }, + { + "epoch": 0.18184524591312265, + "grad_norm": 3.7948646545410156, + "learning_rate": 8.183254640980736e-05, + "loss": 0.9929, + "num_input_tokens_seen": 41770120, + "step": 2596 + }, + { + "epoch": 0.1819152941588519, + "grad_norm": 4.296072959899902, + "learning_rate": 8.182554816112085e-05, + "loss": 1.1365, + "num_input_tokens_seen": 41786504, + "step": 2597 + }, + { + "epoch": 0.18198534240458117, + "grad_norm": 4.750889778137207, + "learning_rate": 8.181854991243432e-05, + "loss": 1.1295, + "num_input_tokens_seen": 41802888, + "step": 2598 + }, + { + "epoch": 0.1820553906503104, + "grad_norm": 4.031731128692627, + "learning_rate": 8.181155166374781e-05, + "loss": 1.1096, + "num_input_tokens_seen": 41819264, + "step": 2599 + }, + { + "epoch": 0.18212543889603966, + "grad_norm": 4.620563507080078, + "learning_rate": 8.18045534150613e-05, + "loss": 1.1862, + "num_input_tokens_seen": 41835016, + "step": 2600 + }, + { + "epoch": 0.18212543889603966, + "eval_loss": 1.1404880285263062, + "eval_runtime": 0.189, + "eval_samples_per_second": 5.291, + "eval_steps_per_second": 5.291, + "num_input_tokens_seen": 41835016, + "step": 2600 + }, + { + "epoch": 0.1821954871417689, + "grad_norm": 3.8487257957458496, + "learning_rate": 8.179755516637479e-05, + "loss": 1.0103, + "num_input_tokens_seen": 41851400, + "step": 2601 + }, + { + "epoch": 0.18226553538749815, + "grad_norm": 4.221493244171143, + "learning_rate": 8.179055691768828e-05, + "loss": 1.1346, + "num_input_tokens_seen": 41867784, + "step": 2602 + }, + { + "epoch": 0.18233558363322738, + "grad_norm": 3.88747239112854, + "learning_rate": 8.178355866900175e-05, + "loss": 1.0679, + "num_input_tokens_seen": 41884024, + "step": 2603 + }, + { + "epoch": 0.18240563187895664, + "grad_norm": 3.845551013946533, + "learning_rate": 8.177656042031524e-05, + "loss": 0.9442, + "num_input_tokens_seen": 41899936, + "step": 2604 + }, + { + "epoch": 0.18247568012468587, + "grad_norm": 3.6964564323425293, + "learning_rate": 8.176956217162873e-05, + "loss": 0.815, + "num_input_tokens_seen": 41915512, + "step": 2605 + }, + { + "epoch": 0.18254572837041513, + "grad_norm": 4.105105400085449, + "learning_rate": 8.17625639229422e-05, + "loss": 1.0891, + "num_input_tokens_seen": 41931728, + "step": 2606 + }, + { + "epoch": 0.18261577661614437, + "grad_norm": 5.0245842933654785, + "learning_rate": 8.175556567425569e-05, + "loss": 0.9657, + "num_input_tokens_seen": 41947528, + "step": 2607 + }, + { + "epoch": 0.18268582486187362, + "grad_norm": 3.4683709144592285, + "learning_rate": 8.174856742556918e-05, + "loss": 0.8183, + "num_input_tokens_seen": 41963912, + "step": 2608 + }, + { + "epoch": 0.18275587310760286, + "grad_norm": 4.603201866149902, + "learning_rate": 8.174156917688267e-05, + "loss": 1.1339, + "num_input_tokens_seen": 41979976, + "step": 2609 + }, + { + "epoch": 0.18282592135333212, + "grad_norm": 3.9904422760009766, + "learning_rate": 8.173457092819615e-05, + "loss": 1.0661, + "num_input_tokens_seen": 41996360, + "step": 2610 + }, + { + "epoch": 0.18289596959906135, + "grad_norm": 6.363785743713379, + "learning_rate": 8.172757267950963e-05, + "loss": 0.9569, + "num_input_tokens_seen": 42011712, + "step": 2611 + }, + { + "epoch": 0.1829660178447906, + "grad_norm": 3.7257959842681885, + "learning_rate": 8.172057443082312e-05, + "loss": 1.0227, + "num_input_tokens_seen": 42028096, + "step": 2612 + }, + { + "epoch": 0.18303606609051984, + "grad_norm": 3.8486809730529785, + "learning_rate": 8.171357618213661e-05, + "loss": 1.0442, + "num_input_tokens_seen": 42044480, + "step": 2613 + }, + { + "epoch": 0.1831061143362491, + "grad_norm": 4.620292663574219, + "learning_rate": 8.17065779334501e-05, + "loss": 0.9917, + "num_input_tokens_seen": 42060864, + "step": 2614 + }, + { + "epoch": 0.18317616258197833, + "grad_norm": 3.52644944190979, + "learning_rate": 8.169957968476359e-05, + "loss": 1.1402, + "num_input_tokens_seen": 42077072, + "step": 2615 + }, + { + "epoch": 0.1832462108277076, + "grad_norm": 3.800718069076538, + "learning_rate": 8.169258143607706e-05, + "loss": 0.9864, + "num_input_tokens_seen": 42093128, + "step": 2616 + }, + { + "epoch": 0.18331625907343682, + "grad_norm": 3.9447405338287354, + "learning_rate": 8.168558318739054e-05, + "loss": 0.9923, + "num_input_tokens_seen": 42109512, + "step": 2617 + }, + { + "epoch": 0.18338630731916608, + "grad_norm": 4.109864234924316, + "learning_rate": 8.167858493870403e-05, + "loss": 0.9583, + "num_input_tokens_seen": 42125776, + "step": 2618 + }, + { + "epoch": 0.1834563555648953, + "grad_norm": 3.6538870334625244, + "learning_rate": 8.167158669001752e-05, + "loss": 1.0731, + "num_input_tokens_seen": 42141760, + "step": 2619 + }, + { + "epoch": 0.18352640381062457, + "grad_norm": 5.139223098754883, + "learning_rate": 8.1664588441331e-05, + "loss": 1.2108, + "num_input_tokens_seen": 42157096, + "step": 2620 + }, + { + "epoch": 0.1835964520563538, + "grad_norm": 4.420098781585693, + "learning_rate": 8.165759019264449e-05, + "loss": 1.0652, + "num_input_tokens_seen": 42173480, + "step": 2621 + }, + { + "epoch": 0.18366650030208306, + "grad_norm": 5.559954643249512, + "learning_rate": 8.165059194395798e-05, + "loss": 0.9224, + "num_input_tokens_seen": 42188944, + "step": 2622 + }, + { + "epoch": 0.1837365485478123, + "grad_norm": 3.827627420425415, + "learning_rate": 8.164359369527146e-05, + "loss": 0.9185, + "num_input_tokens_seen": 42204952, + "step": 2623 + }, + { + "epoch": 0.18380659679354155, + "grad_norm": 7.454338550567627, + "learning_rate": 8.163659544658494e-05, + "loss": 1.2441, + "num_input_tokens_seen": 42221336, + "step": 2624 + }, + { + "epoch": 0.1838766450392708, + "grad_norm": 4.34182071685791, + "learning_rate": 8.162959719789842e-05, + "loss": 1.122, + "num_input_tokens_seen": 42237720, + "step": 2625 + }, + { + "epoch": 0.18394669328500005, + "grad_norm": 3.9157843589782715, + "learning_rate": 8.162259894921191e-05, + "loss": 1.1206, + "num_input_tokens_seen": 42253584, + "step": 2626 + }, + { + "epoch": 0.18401674153072928, + "grad_norm": 3.952451467514038, + "learning_rate": 8.16156007005254e-05, + "loss": 1.2001, + "num_input_tokens_seen": 42269968, + "step": 2627 + }, + { + "epoch": 0.18408678977645854, + "grad_norm": 5.0545148849487305, + "learning_rate": 8.160860245183889e-05, + "loss": 1.0629, + "num_input_tokens_seen": 42286232, + "step": 2628 + }, + { + "epoch": 0.18415683802218777, + "grad_norm": 7.176907062530518, + "learning_rate": 8.160160420315237e-05, + "loss": 1.1248, + "num_input_tokens_seen": 42302616, + "step": 2629 + }, + { + "epoch": 0.18422688626791703, + "grad_norm": 3.994748830795288, + "learning_rate": 8.159460595446585e-05, + "loss": 0.9938, + "num_input_tokens_seen": 42319000, + "step": 2630 + }, + { + "epoch": 0.18429693451364626, + "grad_norm": 3.5744547843933105, + "learning_rate": 8.158760770577934e-05, + "loss": 1.0644, + "num_input_tokens_seen": 42335384, + "step": 2631 + }, + { + "epoch": 0.18436698275937552, + "grad_norm": 3.528723955154419, + "learning_rate": 8.158060945709283e-05, + "loss": 0.9955, + "num_input_tokens_seen": 42351768, + "step": 2632 + }, + { + "epoch": 0.18443703100510478, + "grad_norm": 3.9958291053771973, + "learning_rate": 8.15736112084063e-05, + "loss": 1.076, + "num_input_tokens_seen": 42368152, + "step": 2633 + }, + { + "epoch": 0.184507079250834, + "grad_norm": 4.1659650802612305, + "learning_rate": 8.15666129597198e-05, + "loss": 1.1427, + "num_input_tokens_seen": 42384536, + "step": 2634 + }, + { + "epoch": 0.18457712749656327, + "grad_norm": 5.116000652313232, + "learning_rate": 8.155961471103328e-05, + "loss": 1.1418, + "num_input_tokens_seen": 42399704, + "step": 2635 + }, + { + "epoch": 0.1846471757422925, + "grad_norm": 4.01514196395874, + "learning_rate": 8.155261646234677e-05, + "loss": 0.9521, + "num_input_tokens_seen": 42416056, + "step": 2636 + }, + { + "epoch": 0.18471722398802176, + "grad_norm": 4.290152072906494, + "learning_rate": 8.154561821366024e-05, + "loss": 1.06, + "num_input_tokens_seen": 42431544, + "step": 2637 + }, + { + "epoch": 0.184787272233751, + "grad_norm": 4.267684459686279, + "learning_rate": 8.153861996497373e-05, + "loss": 1.0247, + "num_input_tokens_seen": 42447928, + "step": 2638 + }, + { + "epoch": 0.18485732047948025, + "grad_norm": 3.593191385269165, + "learning_rate": 8.153162171628722e-05, + "loss": 0.9917, + "num_input_tokens_seen": 42464312, + "step": 2639 + }, + { + "epoch": 0.18492736872520948, + "grad_norm": 4.322700023651123, + "learning_rate": 8.152462346760071e-05, + "loss": 1.1686, + "num_input_tokens_seen": 42480696, + "step": 2640 + }, + { + "epoch": 0.18499741697093874, + "grad_norm": 4.176753520965576, + "learning_rate": 8.15176252189142e-05, + "loss": 1.046, + "num_input_tokens_seen": 42496520, + "step": 2641 + }, + { + "epoch": 0.18506746521666798, + "grad_norm": 4.405294895172119, + "learning_rate": 8.151062697022769e-05, + "loss": 1.0884, + "num_input_tokens_seen": 42512904, + "step": 2642 + }, + { + "epoch": 0.18513751346239724, + "grad_norm": 3.8770217895507812, + "learning_rate": 8.150362872154116e-05, + "loss": 1.124, + "num_input_tokens_seen": 42529288, + "step": 2643 + }, + { + "epoch": 0.18520756170812647, + "grad_norm": 3.5909271240234375, + "learning_rate": 8.149663047285464e-05, + "loss": 1.107, + "num_input_tokens_seen": 42545672, + "step": 2644 + }, + { + "epoch": 0.18527760995385573, + "grad_norm": 3.73958420753479, + "learning_rate": 8.148963222416812e-05, + "loss": 0.9943, + "num_input_tokens_seen": 42562056, + "step": 2645 + }, + { + "epoch": 0.18534765819958496, + "grad_norm": 3.6813879013061523, + "learning_rate": 8.148263397548161e-05, + "loss": 0.9861, + "num_input_tokens_seen": 42577720, + "step": 2646 + }, + { + "epoch": 0.18541770644531422, + "grad_norm": 4.13958215713501, + "learning_rate": 8.14756357267951e-05, + "loss": 1.0882, + "num_input_tokens_seen": 42594104, + "step": 2647 + }, + { + "epoch": 0.18548775469104345, + "grad_norm": 3.757805109024048, + "learning_rate": 8.146863747810859e-05, + "loss": 1.0872, + "num_input_tokens_seen": 42610256, + "step": 2648 + }, + { + "epoch": 0.1855578029367727, + "grad_norm": 4.57798957824707, + "learning_rate": 8.146163922942208e-05, + "loss": 0.9471, + "num_input_tokens_seen": 42626424, + "step": 2649 + }, + { + "epoch": 0.18562785118250194, + "grad_norm": 3.797257423400879, + "learning_rate": 8.145464098073555e-05, + "loss": 0.9336, + "num_input_tokens_seen": 42642200, + "step": 2650 + }, + { + "epoch": 0.1856978994282312, + "grad_norm": 4.258513450622559, + "learning_rate": 8.144764273204904e-05, + "loss": 1.1557, + "num_input_tokens_seen": 42657416, + "step": 2651 + }, + { + "epoch": 0.18576794767396043, + "grad_norm": 4.369161605834961, + "learning_rate": 8.144064448336252e-05, + "loss": 1.0013, + "num_input_tokens_seen": 42673752, + "step": 2652 + }, + { + "epoch": 0.1858379959196897, + "grad_norm": 4.159987926483154, + "learning_rate": 8.1433646234676e-05, + "loss": 1.151, + "num_input_tokens_seen": 42690136, + "step": 2653 + }, + { + "epoch": 0.18590804416541892, + "grad_norm": 7.164428234100342, + "learning_rate": 8.14266479859895e-05, + "loss": 1.1637, + "num_input_tokens_seen": 42706520, + "step": 2654 + }, + { + "epoch": 0.18597809241114818, + "grad_norm": 3.4230172634124756, + "learning_rate": 8.141964973730298e-05, + "loss": 0.9291, + "num_input_tokens_seen": 42722904, + "step": 2655 + }, + { + "epoch": 0.18604814065687741, + "grad_norm": 4.316817283630371, + "learning_rate": 8.141265148861647e-05, + "loss": 1.2645, + "num_input_tokens_seen": 42738640, + "step": 2656 + }, + { + "epoch": 0.18611818890260667, + "grad_norm": 3.7894318103790283, + "learning_rate": 8.140565323992995e-05, + "loss": 1.1287, + "num_input_tokens_seen": 42754848, + "step": 2657 + }, + { + "epoch": 0.1861882371483359, + "grad_norm": 4.198835849761963, + "learning_rate": 8.139865499124344e-05, + "loss": 1.1525, + "num_input_tokens_seen": 42771232, + "step": 2658 + }, + { + "epoch": 0.18625828539406517, + "grad_norm": 3.796414852142334, + "learning_rate": 8.139165674255692e-05, + "loss": 1.0313, + "num_input_tokens_seen": 42787344, + "step": 2659 + }, + { + "epoch": 0.1863283336397944, + "grad_norm": 3.6421244144439697, + "learning_rate": 8.138465849387041e-05, + "loss": 1.0497, + "num_input_tokens_seen": 42803728, + "step": 2660 + }, + { + "epoch": 0.18639838188552366, + "grad_norm": 4.391780376434326, + "learning_rate": 8.13776602451839e-05, + "loss": 1.0564, + "num_input_tokens_seen": 42820112, + "step": 2661 + }, + { + "epoch": 0.1864684301312529, + "grad_norm": 4.187370777130127, + "learning_rate": 8.137066199649738e-05, + "loss": 1.0289, + "num_input_tokens_seen": 42836496, + "step": 2662 + }, + { + "epoch": 0.18653847837698215, + "grad_norm": 3.794281244277954, + "learning_rate": 8.136366374781086e-05, + "loss": 1.172, + "num_input_tokens_seen": 42852880, + "step": 2663 + }, + { + "epoch": 0.18660852662271138, + "grad_norm": 4.386116981506348, + "learning_rate": 8.135666549912434e-05, + "loss": 1.1443, + "num_input_tokens_seen": 42869264, + "step": 2664 + }, + { + "epoch": 0.18667857486844064, + "grad_norm": 4.223747253417969, + "learning_rate": 8.134966725043783e-05, + "loss": 1.074, + "num_input_tokens_seen": 42885528, + "step": 2665 + }, + { + "epoch": 0.1867486231141699, + "grad_norm": 5.020680904388428, + "learning_rate": 8.134266900175132e-05, + "loss": 1.1927, + "num_input_tokens_seen": 42901912, + "step": 2666 + }, + { + "epoch": 0.18681867135989913, + "grad_norm": 8.58757495880127, + "learning_rate": 8.13356707530648e-05, + "loss": 1.1377, + "num_input_tokens_seen": 42917072, + "step": 2667 + }, + { + "epoch": 0.1868887196056284, + "grad_norm": 3.6986234188079834, + "learning_rate": 8.13286725043783e-05, + "loss": 1.0536, + "num_input_tokens_seen": 42933296, + "step": 2668 + }, + { + "epoch": 0.18695876785135762, + "grad_norm": 4.196423053741455, + "learning_rate": 8.132167425569178e-05, + "loss": 1.0484, + "num_input_tokens_seen": 42948968, + "step": 2669 + }, + { + "epoch": 0.18702881609708688, + "grad_norm": 4.019235610961914, + "learning_rate": 8.131467600700526e-05, + "loss": 1.1241, + "num_input_tokens_seen": 42965352, + "step": 2670 + }, + { + "epoch": 0.1870988643428161, + "grad_norm": 4.035778045654297, + "learning_rate": 8.130767775831873e-05, + "loss": 0.8962, + "num_input_tokens_seen": 42980872, + "step": 2671 + }, + { + "epoch": 0.18716891258854537, + "grad_norm": 4.193873882293701, + "learning_rate": 8.130067950963222e-05, + "loss": 1.0494, + "num_input_tokens_seen": 42996848, + "step": 2672 + }, + { + "epoch": 0.1872389608342746, + "grad_norm": 4.011183738708496, + "learning_rate": 8.129368126094571e-05, + "loss": 1.1151, + "num_input_tokens_seen": 43012728, + "step": 2673 + }, + { + "epoch": 0.18730900908000386, + "grad_norm": 5.662332534790039, + "learning_rate": 8.12866830122592e-05, + "loss": 1.1238, + "num_input_tokens_seen": 43028728, + "step": 2674 + }, + { + "epoch": 0.1873790573257331, + "grad_norm": 4.4699387550354, + "learning_rate": 8.127968476357269e-05, + "loss": 1.0712, + "num_input_tokens_seen": 43044504, + "step": 2675 + }, + { + "epoch": 0.18744910557146235, + "grad_norm": 3.857011556625366, + "learning_rate": 8.127268651488618e-05, + "loss": 0.9866, + "num_input_tokens_seen": 43060496, + "step": 2676 + }, + { + "epoch": 0.1875191538171916, + "grad_norm": 3.5458414554595947, + "learning_rate": 8.126568826619965e-05, + "loss": 0.9317, + "num_input_tokens_seen": 43076880, + "step": 2677 + }, + { + "epoch": 0.18758920206292085, + "grad_norm": 5.4007744789123535, + "learning_rate": 8.125869001751314e-05, + "loss": 1.2016, + "num_input_tokens_seen": 43091368, + "step": 2678 + }, + { + "epoch": 0.18765925030865008, + "grad_norm": 5.15717077255249, + "learning_rate": 8.125169176882661e-05, + "loss": 1.0662, + "num_input_tokens_seen": 43107752, + "step": 2679 + }, + { + "epoch": 0.18772929855437934, + "grad_norm": 4.891427516937256, + "learning_rate": 8.124469352014012e-05, + "loss": 1.1684, + "num_input_tokens_seen": 43122808, + "step": 2680 + }, + { + "epoch": 0.18779934680010857, + "grad_norm": 4.651966571807861, + "learning_rate": 8.123769527145359e-05, + "loss": 1.1457, + "num_input_tokens_seen": 43139056, + "step": 2681 + }, + { + "epoch": 0.18786939504583783, + "grad_norm": 3.844129800796509, + "learning_rate": 8.123069702276708e-05, + "loss": 0.9282, + "num_input_tokens_seen": 43155440, + "step": 2682 + }, + { + "epoch": 0.18793944329156706, + "grad_norm": 3.669360876083374, + "learning_rate": 8.122369877408057e-05, + "loss": 1.1418, + "num_input_tokens_seen": 43171824, + "step": 2683 + }, + { + "epoch": 0.18800949153729632, + "grad_norm": 3.6102185249328613, + "learning_rate": 8.121670052539404e-05, + "loss": 1.0786, + "num_input_tokens_seen": 43188208, + "step": 2684 + }, + { + "epoch": 0.18807953978302555, + "grad_norm": 3.593414783477783, + "learning_rate": 8.120970227670753e-05, + "loss": 0.9982, + "num_input_tokens_seen": 43204248, + "step": 2685 + }, + { + "epoch": 0.1881495880287548, + "grad_norm": 5.017848491668701, + "learning_rate": 8.120270402802102e-05, + "loss": 0.9573, + "num_input_tokens_seen": 43219808, + "step": 2686 + }, + { + "epoch": 0.18821963627448404, + "grad_norm": 4.083794593811035, + "learning_rate": 8.119570577933451e-05, + "loss": 1.0678, + "num_input_tokens_seen": 43235712, + "step": 2687 + }, + { + "epoch": 0.1882896845202133, + "grad_norm": 4.265167713165283, + "learning_rate": 8.1188707530648e-05, + "loss": 1.2967, + "num_input_tokens_seen": 43252048, + "step": 2688 + }, + { + "epoch": 0.18835973276594253, + "grad_norm": 4.24991512298584, + "learning_rate": 8.118170928196147e-05, + "loss": 1.0267, + "num_input_tokens_seen": 43268152, + "step": 2689 + }, + { + "epoch": 0.1884297810116718, + "grad_norm": 4.059658050537109, + "learning_rate": 8.117471103327496e-05, + "loss": 1.1356, + "num_input_tokens_seen": 43284240, + "step": 2690 + }, + { + "epoch": 0.18849982925740102, + "grad_norm": 4.807305812835693, + "learning_rate": 8.116771278458844e-05, + "loss": 1.0424, + "num_input_tokens_seen": 43299368, + "step": 2691 + }, + { + "epoch": 0.18856987750313028, + "grad_norm": 5.590726852416992, + "learning_rate": 8.116071453590193e-05, + "loss": 1.1008, + "num_input_tokens_seen": 43315648, + "step": 2692 + }, + { + "epoch": 0.18863992574885952, + "grad_norm": 5.114964485168457, + "learning_rate": 8.115371628721541e-05, + "loss": 0.8916, + "num_input_tokens_seen": 43331688, + "step": 2693 + }, + { + "epoch": 0.18870997399458878, + "grad_norm": 4.323836803436279, + "learning_rate": 8.11467180385289e-05, + "loss": 1.1858, + "num_input_tokens_seen": 43346672, + "step": 2694 + }, + { + "epoch": 0.188780022240318, + "grad_norm": 4.290014743804932, + "learning_rate": 8.113971978984239e-05, + "loss": 1.2498, + "num_input_tokens_seen": 43362872, + "step": 2695 + }, + { + "epoch": 0.18885007048604727, + "grad_norm": 3.5292484760284424, + "learning_rate": 8.113272154115588e-05, + "loss": 1.0045, + "num_input_tokens_seen": 43379256, + "step": 2696 + }, + { + "epoch": 0.1889201187317765, + "grad_norm": 4.21523380279541, + "learning_rate": 8.112572329246935e-05, + "loss": 1.0515, + "num_input_tokens_seen": 43395152, + "step": 2697 + }, + { + "epoch": 0.18899016697750576, + "grad_norm": 4.900782108306885, + "learning_rate": 8.111872504378283e-05, + "loss": 1.1038, + "num_input_tokens_seen": 43411536, + "step": 2698 + }, + { + "epoch": 0.189060215223235, + "grad_norm": 3.613231658935547, + "learning_rate": 8.111172679509632e-05, + "loss": 1.017, + "num_input_tokens_seen": 43427920, + "step": 2699 + }, + { + "epoch": 0.18913026346896425, + "grad_norm": 3.681725263595581, + "learning_rate": 8.110472854640982e-05, + "loss": 1.1396, + "num_input_tokens_seen": 43444304, + "step": 2700 + }, + { + "epoch": 0.1892003117146935, + "grad_norm": 3.801785707473755, + "learning_rate": 8.10977302977233e-05, + "loss": 0.9856, + "num_input_tokens_seen": 43459960, + "step": 2701 + }, + { + "epoch": 0.18927035996042274, + "grad_norm": 3.4208626747131348, + "learning_rate": 8.109073204903678e-05, + "loss": 1.0048, + "num_input_tokens_seen": 43476344, + "step": 2702 + }, + { + "epoch": 0.189340408206152, + "grad_norm": 4.169189453125, + "learning_rate": 8.108373380035027e-05, + "loss": 1.0014, + "num_input_tokens_seen": 43492728, + "step": 2703 + }, + { + "epoch": 0.18941045645188123, + "grad_norm": 3.7125117778778076, + "learning_rate": 8.107673555166375e-05, + "loss": 0.9707, + "num_input_tokens_seen": 43508168, + "step": 2704 + }, + { + "epoch": 0.1894805046976105, + "grad_norm": 4.550642490386963, + "learning_rate": 8.106973730297724e-05, + "loss": 1.0832, + "num_input_tokens_seen": 43524480, + "step": 2705 + }, + { + "epoch": 0.18955055294333972, + "grad_norm": 4.219499588012695, + "learning_rate": 8.106273905429072e-05, + "loss": 1.148, + "num_input_tokens_seen": 43540864, + "step": 2706 + }, + { + "epoch": 0.18962060118906898, + "grad_norm": 4.605996131896973, + "learning_rate": 8.105574080560421e-05, + "loss": 1.0564, + "num_input_tokens_seen": 43557248, + "step": 2707 + }, + { + "epoch": 0.18969064943479821, + "grad_norm": 3.740314245223999, + "learning_rate": 8.104874255691769e-05, + "loss": 1.0194, + "num_input_tokens_seen": 43573632, + "step": 2708 + }, + { + "epoch": 0.18976069768052747, + "grad_norm": 3.92555832862854, + "learning_rate": 8.104174430823118e-05, + "loss": 1.1663, + "num_input_tokens_seen": 43589728, + "step": 2709 + }, + { + "epoch": 0.1898307459262567, + "grad_norm": 3.5653927326202393, + "learning_rate": 8.103474605954467e-05, + "loss": 1.1165, + "num_input_tokens_seen": 43606112, + "step": 2710 + }, + { + "epoch": 0.18990079417198596, + "grad_norm": 5.943650245666504, + "learning_rate": 8.102774781085814e-05, + "loss": 1.309, + "num_input_tokens_seen": 43621072, + "step": 2711 + }, + { + "epoch": 0.1899708424177152, + "grad_norm": 3.7632322311401367, + "learning_rate": 8.102074956217163e-05, + "loss": 1.0963, + "num_input_tokens_seen": 43636976, + "step": 2712 + }, + { + "epoch": 0.19004089066344446, + "grad_norm": 3.605536699295044, + "learning_rate": 8.101375131348512e-05, + "loss": 0.9509, + "num_input_tokens_seen": 43653360, + "step": 2713 + }, + { + "epoch": 0.1901109389091737, + "grad_norm": 3.7717363834381104, + "learning_rate": 8.10067530647986e-05, + "loss": 0.9407, + "num_input_tokens_seen": 43669488, + "step": 2714 + }, + { + "epoch": 0.19018098715490295, + "grad_norm": 4.55484676361084, + "learning_rate": 8.09997548161121e-05, + "loss": 0.8501, + "num_input_tokens_seen": 43684704, + "step": 2715 + }, + { + "epoch": 0.19025103540063218, + "grad_norm": 4.155830383300781, + "learning_rate": 8.099275656742557e-05, + "loss": 0.9936, + "num_input_tokens_seen": 43700112, + "step": 2716 + }, + { + "epoch": 0.19032108364636144, + "grad_norm": 5.615505695343018, + "learning_rate": 8.098575831873906e-05, + "loss": 1.2055, + "num_input_tokens_seen": 43716136, + "step": 2717 + }, + { + "epoch": 0.19039113189209067, + "grad_norm": 4.60966157913208, + "learning_rate": 8.097876007005253e-05, + "loss": 1.0531, + "num_input_tokens_seen": 43731576, + "step": 2718 + }, + { + "epoch": 0.19046118013781993, + "grad_norm": 5.698062896728516, + "learning_rate": 8.097176182136602e-05, + "loss": 0.9692, + "num_input_tokens_seen": 43747960, + "step": 2719 + }, + { + "epoch": 0.19053122838354916, + "grad_norm": 3.760756492614746, + "learning_rate": 8.096476357267952e-05, + "loss": 0.9638, + "num_input_tokens_seen": 43764304, + "step": 2720 + }, + { + "epoch": 0.19060127662927842, + "grad_norm": 4.084067344665527, + "learning_rate": 8.0957765323993e-05, + "loss": 1.083, + "num_input_tokens_seen": 43780688, + "step": 2721 + }, + { + "epoch": 0.19067132487500765, + "grad_norm": 3.9934301376342773, + "learning_rate": 8.095076707530649e-05, + "loss": 0.9757, + "num_input_tokens_seen": 43797072, + "step": 2722 + }, + { + "epoch": 0.1907413731207369, + "grad_norm": 3.915512800216675, + "learning_rate": 8.094376882661998e-05, + "loss": 1.1031, + "num_input_tokens_seen": 43813456, + "step": 2723 + }, + { + "epoch": 0.19081142136646614, + "grad_norm": 3.967040777206421, + "learning_rate": 8.093677057793345e-05, + "loss": 0.9821, + "num_input_tokens_seen": 43829656, + "step": 2724 + }, + { + "epoch": 0.1908814696121954, + "grad_norm": 3.707667827606201, + "learning_rate": 8.092977232924693e-05, + "loss": 1.1489, + "num_input_tokens_seen": 43846040, + "step": 2725 + }, + { + "epoch": 0.19095151785792464, + "grad_norm": 3.3822734355926514, + "learning_rate": 8.092277408056043e-05, + "loss": 1.0051, + "num_input_tokens_seen": 43862144, + "step": 2726 + }, + { + "epoch": 0.1910215661036539, + "grad_norm": 3.7703781127929688, + "learning_rate": 8.091577583187392e-05, + "loss": 1.0363, + "num_input_tokens_seen": 43878328, + "step": 2727 + }, + { + "epoch": 0.19109161434938313, + "grad_norm": 3.902003049850464, + "learning_rate": 8.090877758318739e-05, + "loss": 1.0051, + "num_input_tokens_seen": 43893480, + "step": 2728 + }, + { + "epoch": 0.19116166259511239, + "grad_norm": 3.971395969390869, + "learning_rate": 8.090177933450088e-05, + "loss": 1.0469, + "num_input_tokens_seen": 43909752, + "step": 2729 + }, + { + "epoch": 0.19123171084084162, + "grad_norm": 3.4233641624450684, + "learning_rate": 8.089478108581437e-05, + "loss": 0.8821, + "num_input_tokens_seen": 43926136, + "step": 2730 + }, + { + "epoch": 0.19130175908657088, + "grad_norm": 5.967614650726318, + "learning_rate": 8.088778283712784e-05, + "loss": 1.1995, + "num_input_tokens_seen": 43941592, + "step": 2731 + }, + { + "epoch": 0.1913718073323001, + "grad_norm": 4.431912899017334, + "learning_rate": 8.088078458844133e-05, + "loss": 1.2471, + "num_input_tokens_seen": 43957784, + "step": 2732 + }, + { + "epoch": 0.19144185557802937, + "grad_norm": 3.659182071685791, + "learning_rate": 8.087378633975482e-05, + "loss": 0.9701, + "num_input_tokens_seen": 43973648, + "step": 2733 + }, + { + "epoch": 0.1915119038237586, + "grad_norm": 4.983634948730469, + "learning_rate": 8.086678809106831e-05, + "loss": 1.1023, + "num_input_tokens_seen": 43990032, + "step": 2734 + }, + { + "epoch": 0.19158195206948786, + "grad_norm": 4.236748695373535, + "learning_rate": 8.085978984238179e-05, + "loss": 1.0724, + "num_input_tokens_seen": 44005064, + "step": 2735 + }, + { + "epoch": 0.19165200031521712, + "grad_norm": 3.3617727756500244, + "learning_rate": 8.085279159369527e-05, + "loss": 0.9986, + "num_input_tokens_seen": 44021448, + "step": 2736 + }, + { + "epoch": 0.19172204856094635, + "grad_norm": 3.4514083862304688, + "learning_rate": 8.084579334500876e-05, + "loss": 0.8738, + "num_input_tokens_seen": 44037832, + "step": 2737 + }, + { + "epoch": 0.1917920968066756, + "grad_norm": 4.126194000244141, + "learning_rate": 8.083879509632224e-05, + "loss": 1.1142, + "num_input_tokens_seen": 44053384, + "step": 2738 + }, + { + "epoch": 0.19186214505240484, + "grad_norm": 5.12385368347168, + "learning_rate": 8.083179684763573e-05, + "loss": 1.251, + "num_input_tokens_seen": 44068728, + "step": 2739 + }, + { + "epoch": 0.1919321932981341, + "grad_norm": 3.457253932952881, + "learning_rate": 8.082479859894923e-05, + "loss": 0.8251, + "num_input_tokens_seen": 44085112, + "step": 2740 + }, + { + "epoch": 0.19200224154386333, + "grad_norm": 3.8708858489990234, + "learning_rate": 8.08178003502627e-05, + "loss": 1.1838, + "num_input_tokens_seen": 44101456, + "step": 2741 + }, + { + "epoch": 0.1920722897895926, + "grad_norm": 4.175468921661377, + "learning_rate": 8.081080210157619e-05, + "loss": 1.0062, + "num_input_tokens_seen": 44116640, + "step": 2742 + }, + { + "epoch": 0.19214233803532182, + "grad_norm": 4.141748428344727, + "learning_rate": 8.080380385288967e-05, + "loss": 1.1609, + "num_input_tokens_seen": 44132328, + "step": 2743 + }, + { + "epoch": 0.19221238628105108, + "grad_norm": 5.1061692237854, + "learning_rate": 8.079680560420316e-05, + "loss": 1.172, + "num_input_tokens_seen": 44148712, + "step": 2744 + }, + { + "epoch": 0.19228243452678032, + "grad_norm": 3.990196704864502, + "learning_rate": 8.078980735551663e-05, + "loss": 0.9997, + "num_input_tokens_seen": 44164600, + "step": 2745 + }, + { + "epoch": 0.19235248277250958, + "grad_norm": 4.365367412567139, + "learning_rate": 8.078280910683013e-05, + "loss": 1.0672, + "num_input_tokens_seen": 44180984, + "step": 2746 + }, + { + "epoch": 0.1924225310182388, + "grad_norm": 4.092031002044678, + "learning_rate": 8.077581085814362e-05, + "loss": 1.1405, + "num_input_tokens_seen": 44196400, + "step": 2747 + }, + { + "epoch": 0.19249257926396807, + "grad_norm": 3.4052438735961914, + "learning_rate": 8.07688126094571e-05, + "loss": 1.0128, + "num_input_tokens_seen": 44212736, + "step": 2748 + }, + { + "epoch": 0.1925626275096973, + "grad_norm": 4.703436374664307, + "learning_rate": 8.076181436077059e-05, + "loss": 1.2058, + "num_input_tokens_seen": 44229120, + "step": 2749 + }, + { + "epoch": 0.19263267575542656, + "grad_norm": 3.7579853534698486, + "learning_rate": 8.075481611208407e-05, + "loss": 0.9081, + "num_input_tokens_seen": 44245144, + "step": 2750 + }, + { + "epoch": 0.1927027240011558, + "grad_norm": 3.6251869201660156, + "learning_rate": 8.074781786339755e-05, + "loss": 0.9854, + "num_input_tokens_seen": 44260920, + "step": 2751 + }, + { + "epoch": 0.19277277224688505, + "grad_norm": 3.4949889183044434, + "learning_rate": 8.074081961471104e-05, + "loss": 1.1115, + "num_input_tokens_seen": 44277280, + "step": 2752 + }, + { + "epoch": 0.19284282049261428, + "grad_norm": 4.28520393371582, + "learning_rate": 8.073382136602453e-05, + "loss": 1.2536, + "num_input_tokens_seen": 44293664, + "step": 2753 + }, + { + "epoch": 0.19291286873834354, + "grad_norm": 3.9574859142303467, + "learning_rate": 8.072682311733801e-05, + "loss": 1.1584, + "num_input_tokens_seen": 44309328, + "step": 2754 + }, + { + "epoch": 0.19298291698407277, + "grad_norm": 3.6340646743774414, + "learning_rate": 8.071982486865149e-05, + "loss": 1.0116, + "num_input_tokens_seen": 44325336, + "step": 2755 + }, + { + "epoch": 0.19305296522980203, + "grad_norm": 5.131178855895996, + "learning_rate": 8.071282661996498e-05, + "loss": 1.1226, + "num_input_tokens_seen": 44341264, + "step": 2756 + }, + { + "epoch": 0.19312301347553126, + "grad_norm": 4.273870944976807, + "learning_rate": 8.070582837127847e-05, + "loss": 1.0953, + "num_input_tokens_seen": 44357648, + "step": 2757 + }, + { + "epoch": 0.19319306172126052, + "grad_norm": 3.883690118789673, + "learning_rate": 8.069883012259194e-05, + "loss": 1.2978, + "num_input_tokens_seen": 44373984, + "step": 2758 + }, + { + "epoch": 0.19326310996698975, + "grad_norm": 4.284129619598389, + "learning_rate": 8.069183187390543e-05, + "loss": 1.0356, + "num_input_tokens_seen": 44389160, + "step": 2759 + }, + { + "epoch": 0.19333315821271901, + "grad_norm": 4.517998695373535, + "learning_rate": 8.068483362521892e-05, + "loss": 1.0378, + "num_input_tokens_seen": 44405544, + "step": 2760 + }, + { + "epoch": 0.19340320645844825, + "grad_norm": 4.098707675933838, + "learning_rate": 8.067783537653241e-05, + "loss": 1.2235, + "num_input_tokens_seen": 44421560, + "step": 2761 + }, + { + "epoch": 0.1934732547041775, + "grad_norm": 3.656461477279663, + "learning_rate": 8.067083712784588e-05, + "loss": 0.8462, + "num_input_tokens_seen": 44437944, + "step": 2762 + }, + { + "epoch": 0.19354330294990674, + "grad_norm": 3.8305914402008057, + "learning_rate": 8.066383887915937e-05, + "loss": 1.1084, + "num_input_tokens_seen": 44454208, + "step": 2763 + }, + { + "epoch": 0.193613351195636, + "grad_norm": 4.0582990646362305, + "learning_rate": 8.065684063047286e-05, + "loss": 1.2152, + "num_input_tokens_seen": 44470592, + "step": 2764 + }, + { + "epoch": 0.19368339944136523, + "grad_norm": 4.159184455871582, + "learning_rate": 8.064984238178633e-05, + "loss": 1.0183, + "num_input_tokens_seen": 44486976, + "step": 2765 + }, + { + "epoch": 0.1937534476870945, + "grad_norm": 3.7490620613098145, + "learning_rate": 8.064284413309984e-05, + "loss": 1.0883, + "num_input_tokens_seen": 44503360, + "step": 2766 + }, + { + "epoch": 0.19382349593282372, + "grad_norm": 4.3000288009643555, + "learning_rate": 8.063584588441333e-05, + "loss": 1.2323, + "num_input_tokens_seen": 44519744, + "step": 2767 + }, + { + "epoch": 0.19389354417855298, + "grad_norm": 3.9175477027893066, + "learning_rate": 8.06288476357268e-05, + "loss": 0.8758, + "num_input_tokens_seen": 44535664, + "step": 2768 + }, + { + "epoch": 0.1939635924242822, + "grad_norm": 4.4328293800354, + "learning_rate": 8.062184938704029e-05, + "loss": 1.0173, + "num_input_tokens_seen": 44550984, + "step": 2769 + }, + { + "epoch": 0.19403364067001147, + "grad_norm": 4.556321620941162, + "learning_rate": 8.061485113835376e-05, + "loss": 1.1389, + "num_input_tokens_seen": 44566808, + "step": 2770 + }, + { + "epoch": 0.19410368891574073, + "grad_norm": 4.382159233093262, + "learning_rate": 8.060785288966725e-05, + "loss": 1.1211, + "num_input_tokens_seen": 44583192, + "step": 2771 + }, + { + "epoch": 0.19417373716146996, + "grad_norm": 3.920137405395508, + "learning_rate": 8.060085464098074e-05, + "loss": 0.9815, + "num_input_tokens_seen": 44599480, + "step": 2772 + }, + { + "epoch": 0.19424378540719922, + "grad_norm": 4.23013162612915, + "learning_rate": 8.059385639229423e-05, + "loss": 1.2268, + "num_input_tokens_seen": 44615240, + "step": 2773 + }, + { + "epoch": 0.19431383365292845, + "grad_norm": 3.7917346954345703, + "learning_rate": 8.058685814360772e-05, + "loss": 1.0935, + "num_input_tokens_seen": 44630952, + "step": 2774 + }, + { + "epoch": 0.1943838818986577, + "grad_norm": 4.798681259155273, + "learning_rate": 8.05798598949212e-05, + "loss": 1.1321, + "num_input_tokens_seen": 44647336, + "step": 2775 + }, + { + "epoch": 0.19445393014438694, + "grad_norm": 3.563124418258667, + "learning_rate": 8.057286164623468e-05, + "loss": 1.1231, + "num_input_tokens_seen": 44663720, + "step": 2776 + }, + { + "epoch": 0.1945239783901162, + "grad_norm": 6.6064019203186035, + "learning_rate": 8.056586339754817e-05, + "loss": 0.8685, + "num_input_tokens_seen": 44679616, + "step": 2777 + }, + { + "epoch": 0.19459402663584544, + "grad_norm": 4.1651291847229, + "learning_rate": 8.055886514886165e-05, + "loss": 1.1634, + "num_input_tokens_seen": 44695800, + "step": 2778 + }, + { + "epoch": 0.1946640748815747, + "grad_norm": 3.929474353790283, + "learning_rate": 8.055186690017513e-05, + "loss": 1.1127, + "num_input_tokens_seen": 44711744, + "step": 2779 + }, + { + "epoch": 0.19473412312730393, + "grad_norm": 3.758721351623535, + "learning_rate": 8.054486865148862e-05, + "loss": 0.9218, + "num_input_tokens_seen": 44728128, + "step": 2780 + }, + { + "epoch": 0.19480417137303319, + "grad_norm": 4.988550662994385, + "learning_rate": 8.053787040280211e-05, + "loss": 1.222, + "num_input_tokens_seen": 44744512, + "step": 2781 + }, + { + "epoch": 0.19487421961876242, + "grad_norm": 3.7875940799713135, + "learning_rate": 8.053087215411559e-05, + "loss": 1.0393, + "num_input_tokens_seen": 44760896, + "step": 2782 + }, + { + "epoch": 0.19494426786449168, + "grad_norm": 3.877729654312134, + "learning_rate": 8.052387390542908e-05, + "loss": 1.1748, + "num_input_tokens_seen": 44777280, + "step": 2783 + }, + { + "epoch": 0.1950143161102209, + "grad_norm": 4.979894161224365, + "learning_rate": 8.051687565674256e-05, + "loss": 1.1506, + "num_input_tokens_seen": 44793664, + "step": 2784 + }, + { + "epoch": 0.19508436435595017, + "grad_norm": 4.3148579597473145, + "learning_rate": 8.050987740805604e-05, + "loss": 1.1587, + "num_input_tokens_seen": 44809688, + "step": 2785 + }, + { + "epoch": 0.1951544126016794, + "grad_norm": 4.082404136657715, + "learning_rate": 8.050287915936954e-05, + "loss": 1.1488, + "num_input_tokens_seen": 44825600, + "step": 2786 + }, + { + "epoch": 0.19522446084740866, + "grad_norm": 3.6951189041137695, + "learning_rate": 8.049588091068302e-05, + "loss": 1.1542, + "num_input_tokens_seen": 44841984, + "step": 2787 + }, + { + "epoch": 0.1952945090931379, + "grad_norm": 3.797136068344116, + "learning_rate": 8.04888826619965e-05, + "loss": 0.964, + "num_input_tokens_seen": 44858368, + "step": 2788 + }, + { + "epoch": 0.19536455733886715, + "grad_norm": 3.8912811279296875, + "learning_rate": 8.048188441330998e-05, + "loss": 0.8985, + "num_input_tokens_seen": 44873752, + "step": 2789 + }, + { + "epoch": 0.19543460558459638, + "grad_norm": 4.355793476104736, + "learning_rate": 8.047488616462347e-05, + "loss": 1.1546, + "num_input_tokens_seen": 44889336, + "step": 2790 + }, + { + "epoch": 0.19550465383032564, + "grad_norm": 4.216153144836426, + "learning_rate": 8.046788791593696e-05, + "loss": 1.0922, + "num_input_tokens_seen": 44905720, + "step": 2791 + }, + { + "epoch": 0.19557470207605487, + "grad_norm": 3.5346696376800537, + "learning_rate": 8.046088966725045e-05, + "loss": 1.1628, + "num_input_tokens_seen": 44921864, + "step": 2792 + }, + { + "epoch": 0.19564475032178413, + "grad_norm": 4.2197489738464355, + "learning_rate": 8.045389141856393e-05, + "loss": 1.0177, + "num_input_tokens_seen": 44938248, + "step": 2793 + }, + { + "epoch": 0.19571479856751337, + "grad_norm": 3.66995906829834, + "learning_rate": 8.044689316987742e-05, + "loss": 1.0401, + "num_input_tokens_seen": 44954632, + "step": 2794 + }, + { + "epoch": 0.19578484681324262, + "grad_norm": 5.062297821044922, + "learning_rate": 8.04398949211909e-05, + "loss": 1.2106, + "num_input_tokens_seen": 44971016, + "step": 2795 + }, + { + "epoch": 0.19585489505897186, + "grad_norm": 4.473872661590576, + "learning_rate": 8.043289667250439e-05, + "loss": 1.153, + "num_input_tokens_seen": 44987400, + "step": 2796 + }, + { + "epoch": 0.19592494330470112, + "grad_norm": 4.724556922912598, + "learning_rate": 8.042589842381786e-05, + "loss": 1.23, + "num_input_tokens_seen": 45002968, + "step": 2797 + }, + { + "epoch": 0.19599499155043035, + "grad_norm": 4.324196815490723, + "learning_rate": 8.041890017513135e-05, + "loss": 0.8708, + "num_input_tokens_seen": 45019352, + "step": 2798 + }, + { + "epoch": 0.1960650397961596, + "grad_norm": 4.309204578399658, + "learning_rate": 8.041190192644484e-05, + "loss": 1.0769, + "num_input_tokens_seen": 45034960, + "step": 2799 + }, + { + "epoch": 0.19613508804188884, + "grad_norm": 3.4928808212280273, + "learning_rate": 8.040490367775833e-05, + "loss": 0.9394, + "num_input_tokens_seen": 45051344, + "step": 2800 + }, + { + "epoch": 0.19613508804188884, + "eval_loss": 1.1401225328445435, + "eval_runtime": 0.185, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 5.405, + "num_input_tokens_seen": 45051344, + "step": 2800 + }, + { + "epoch": 0.1962051362876181, + "grad_norm": 6.388762474060059, + "learning_rate": 8.039790542907182e-05, + "loss": 1.047, + "num_input_tokens_seen": 45066712, + "step": 2801 + }, + { + "epoch": 0.19627518453334733, + "grad_norm": 3.8386781215667725, + "learning_rate": 8.039090718038529e-05, + "loss": 1.0248, + "num_input_tokens_seen": 45082472, + "step": 2802 + }, + { + "epoch": 0.1963452327790766, + "grad_norm": 3.540064573287964, + "learning_rate": 8.038390893169878e-05, + "loss": 0.846, + "num_input_tokens_seen": 45098072, + "step": 2803 + }, + { + "epoch": 0.19641528102480582, + "grad_norm": 3.9858322143554688, + "learning_rate": 8.037691068301227e-05, + "loss": 1.1443, + "num_input_tokens_seen": 45114456, + "step": 2804 + }, + { + "epoch": 0.19648532927053508, + "grad_norm": 4.418299674987793, + "learning_rate": 8.036991243432574e-05, + "loss": 1.0391, + "num_input_tokens_seen": 45130416, + "step": 2805 + }, + { + "epoch": 0.19655537751626434, + "grad_norm": 4.6108880043029785, + "learning_rate": 8.036291418563923e-05, + "loss": 0.9911, + "num_input_tokens_seen": 45146800, + "step": 2806 + }, + { + "epoch": 0.19662542576199357, + "grad_norm": 3.686781883239746, + "learning_rate": 8.035591593695272e-05, + "loss": 0.904, + "num_input_tokens_seen": 45163016, + "step": 2807 + }, + { + "epoch": 0.19669547400772283, + "grad_norm": 3.7459771633148193, + "learning_rate": 8.034891768826621e-05, + "loss": 1.0635, + "num_input_tokens_seen": 45178912, + "step": 2808 + }, + { + "epoch": 0.19676552225345206, + "grad_norm": 4.955589771270752, + "learning_rate": 8.034191943957968e-05, + "loss": 0.951, + "num_input_tokens_seen": 45193928, + "step": 2809 + }, + { + "epoch": 0.19683557049918132, + "grad_norm": 4.901642322540283, + "learning_rate": 8.033492119089317e-05, + "loss": 1.0751, + "num_input_tokens_seen": 45209080, + "step": 2810 + }, + { + "epoch": 0.19690561874491055, + "grad_norm": 3.685493230819702, + "learning_rate": 8.032792294220666e-05, + "loss": 1.0408, + "num_input_tokens_seen": 45225400, + "step": 2811 + }, + { + "epoch": 0.19697566699063981, + "grad_norm": 4.731873512268066, + "learning_rate": 8.032092469352015e-05, + "loss": 0.9684, + "num_input_tokens_seen": 45241152, + "step": 2812 + }, + { + "epoch": 0.19704571523636905, + "grad_norm": 4.52595853805542, + "learning_rate": 8.031392644483364e-05, + "loss": 1.142, + "num_input_tokens_seen": 45256976, + "step": 2813 + }, + { + "epoch": 0.1971157634820983, + "grad_norm": 4.4693074226379395, + "learning_rate": 8.030692819614711e-05, + "loss": 1.0846, + "num_input_tokens_seen": 45273360, + "step": 2814 + }, + { + "epoch": 0.19718581172782754, + "grad_norm": 5.599058151245117, + "learning_rate": 8.02999299474606e-05, + "loss": 1.1544, + "num_input_tokens_seen": 45289744, + "step": 2815 + }, + { + "epoch": 0.1972558599735568, + "grad_norm": 3.758751153945923, + "learning_rate": 8.029293169877408e-05, + "loss": 1.1877, + "num_input_tokens_seen": 45305960, + "step": 2816 + }, + { + "epoch": 0.19732590821928603, + "grad_norm": 4.059335231781006, + "learning_rate": 8.028593345008757e-05, + "loss": 1.0294, + "num_input_tokens_seen": 45321536, + "step": 2817 + }, + { + "epoch": 0.1973959564650153, + "grad_norm": 3.8090553283691406, + "learning_rate": 8.027893520140105e-05, + "loss": 1.1264, + "num_input_tokens_seen": 45337920, + "step": 2818 + }, + { + "epoch": 0.19746600471074452, + "grad_norm": 3.7900006771087646, + "learning_rate": 8.027193695271454e-05, + "loss": 1.2042, + "num_input_tokens_seen": 45353632, + "step": 2819 + }, + { + "epoch": 0.19753605295647378, + "grad_norm": 4.279977321624756, + "learning_rate": 8.026493870402803e-05, + "loss": 1.0786, + "num_input_tokens_seen": 45369712, + "step": 2820 + }, + { + "epoch": 0.197606101202203, + "grad_norm": 3.7417356967926025, + "learning_rate": 8.025794045534152e-05, + "loss": 1.0756, + "num_input_tokens_seen": 45384816, + "step": 2821 + }, + { + "epoch": 0.19767614944793227, + "grad_norm": 4.084759712219238, + "learning_rate": 8.0250942206655e-05, + "loss": 0.9187, + "num_input_tokens_seen": 45401200, + "step": 2822 + }, + { + "epoch": 0.1977461976936615, + "grad_norm": 4.963731288909912, + "learning_rate": 8.024394395796848e-05, + "loss": 1.2548, + "num_input_tokens_seen": 45417096, + "step": 2823 + }, + { + "epoch": 0.19781624593939076, + "grad_norm": 4.115303993225098, + "learning_rate": 8.023694570928196e-05, + "loss": 1.2127, + "num_input_tokens_seen": 45433480, + "step": 2824 + }, + { + "epoch": 0.19788629418512, + "grad_norm": 3.908439874649048, + "learning_rate": 8.022994746059545e-05, + "loss": 1.0171, + "num_input_tokens_seen": 45448984, + "step": 2825 + }, + { + "epoch": 0.19795634243084925, + "grad_norm": 4.0723090171813965, + "learning_rate": 8.022294921190894e-05, + "loss": 0.9883, + "num_input_tokens_seen": 45465192, + "step": 2826 + }, + { + "epoch": 0.19802639067657848, + "grad_norm": 4.219478607177734, + "learning_rate": 8.021595096322242e-05, + "loss": 1.109, + "num_input_tokens_seen": 45480904, + "step": 2827 + }, + { + "epoch": 0.19809643892230774, + "grad_norm": 4.246188163757324, + "learning_rate": 8.020895271453591e-05, + "loss": 1.3058, + "num_input_tokens_seen": 45497288, + "step": 2828 + }, + { + "epoch": 0.19816648716803698, + "grad_norm": 4.898525714874268, + "learning_rate": 8.020195446584939e-05, + "loss": 1.1058, + "num_input_tokens_seen": 45513456, + "step": 2829 + }, + { + "epoch": 0.19823653541376623, + "grad_norm": 4.1247239112854, + "learning_rate": 8.019495621716288e-05, + "loss": 1.031, + "num_input_tokens_seen": 45528752, + "step": 2830 + }, + { + "epoch": 0.19830658365949547, + "grad_norm": 4.352110385894775, + "learning_rate": 8.018795796847636e-05, + "loss": 1.3602, + "num_input_tokens_seen": 45545136, + "step": 2831 + }, + { + "epoch": 0.19837663190522473, + "grad_norm": 3.731719732284546, + "learning_rate": 8.018095971978985e-05, + "loss": 0.9833, + "num_input_tokens_seen": 45561160, + "step": 2832 + }, + { + "epoch": 0.19844668015095396, + "grad_norm": 4.234768867492676, + "learning_rate": 8.017396147110333e-05, + "loss": 1.2279, + "num_input_tokens_seen": 45577288, + "step": 2833 + }, + { + "epoch": 0.19851672839668322, + "grad_norm": 4.682285308837891, + "learning_rate": 8.016696322241682e-05, + "loss": 1.0376, + "num_input_tokens_seen": 45593152, + "step": 2834 + }, + { + "epoch": 0.19858677664241245, + "grad_norm": 4.576408863067627, + "learning_rate": 8.01599649737303e-05, + "loss": 1.225, + "num_input_tokens_seen": 45609408, + "step": 2835 + }, + { + "epoch": 0.1986568248881417, + "grad_norm": 4.209808826446533, + "learning_rate": 8.015296672504378e-05, + "loss": 1.0308, + "num_input_tokens_seen": 45625792, + "step": 2836 + }, + { + "epoch": 0.19872687313387094, + "grad_norm": 4.383143901824951, + "learning_rate": 8.014596847635727e-05, + "loss": 1.2079, + "num_input_tokens_seen": 45642176, + "step": 2837 + }, + { + "epoch": 0.1987969213796002, + "grad_norm": 4.105413913726807, + "learning_rate": 8.013897022767076e-05, + "loss": 1.0623, + "num_input_tokens_seen": 45657480, + "step": 2838 + }, + { + "epoch": 0.19886696962532946, + "grad_norm": 5.339532852172852, + "learning_rate": 8.013197197898425e-05, + "loss": 1.1131, + "num_input_tokens_seen": 45673168, + "step": 2839 + }, + { + "epoch": 0.1989370178710587, + "grad_norm": 3.8160016536712646, + "learning_rate": 8.012497373029774e-05, + "loss": 1.1392, + "num_input_tokens_seen": 45689088, + "step": 2840 + }, + { + "epoch": 0.19900706611678795, + "grad_norm": 3.763986587524414, + "learning_rate": 8.011797548161121e-05, + "loss": 1.1852, + "num_input_tokens_seen": 45705472, + "step": 2841 + }, + { + "epoch": 0.19907711436251718, + "grad_norm": 4.034756183624268, + "learning_rate": 8.01109772329247e-05, + "loss": 1.1856, + "num_input_tokens_seen": 45721168, + "step": 2842 + }, + { + "epoch": 0.19914716260824644, + "grad_norm": 3.971479654312134, + "learning_rate": 8.010397898423817e-05, + "loss": 1.1443, + "num_input_tokens_seen": 45737312, + "step": 2843 + }, + { + "epoch": 0.19921721085397567, + "grad_norm": 4.118296146392822, + "learning_rate": 8.009698073555166e-05, + "loss": 0.9964, + "num_input_tokens_seen": 45752792, + "step": 2844 + }, + { + "epoch": 0.19928725909970493, + "grad_norm": 3.628143310546875, + "learning_rate": 8.008998248686515e-05, + "loss": 1.1102, + "num_input_tokens_seen": 45769008, + "step": 2845 + }, + { + "epoch": 0.19935730734543416, + "grad_norm": 3.9946494102478027, + "learning_rate": 8.008298423817864e-05, + "loss": 1.1199, + "num_input_tokens_seen": 45785392, + "step": 2846 + }, + { + "epoch": 0.19942735559116342, + "grad_norm": 3.7445459365844727, + "learning_rate": 8.007598598949213e-05, + "loss": 1.1245, + "num_input_tokens_seen": 45801320, + "step": 2847 + }, + { + "epoch": 0.19949740383689266, + "grad_norm": 3.745481491088867, + "learning_rate": 8.006898774080562e-05, + "loss": 1.0969, + "num_input_tokens_seen": 45817504, + "step": 2848 + }, + { + "epoch": 0.19956745208262192, + "grad_norm": 4.1305766105651855, + "learning_rate": 8.006198949211909e-05, + "loss": 1.0953, + "num_input_tokens_seen": 45833888, + "step": 2849 + }, + { + "epoch": 0.19963750032835115, + "grad_norm": 3.7843470573425293, + "learning_rate": 8.005499124343258e-05, + "loss": 1.111, + "num_input_tokens_seen": 45850272, + "step": 2850 + }, + { + "epoch": 0.1997075485740804, + "grad_norm": 3.9884989261627197, + "learning_rate": 8.004799299474606e-05, + "loss": 1.083, + "num_input_tokens_seen": 45866656, + "step": 2851 + }, + { + "epoch": 0.19977759681980964, + "grad_norm": 3.7280545234680176, + "learning_rate": 8.004099474605956e-05, + "loss": 1.0036, + "num_input_tokens_seen": 45882776, + "step": 2852 + }, + { + "epoch": 0.1998476450655389, + "grad_norm": 5.151428699493408, + "learning_rate": 8.003399649737303e-05, + "loss": 1.2988, + "num_input_tokens_seen": 45898520, + "step": 2853 + }, + { + "epoch": 0.19991769331126813, + "grad_norm": 6.738519191741943, + "learning_rate": 8.002699824868652e-05, + "loss": 1.1934, + "num_input_tokens_seen": 45914904, + "step": 2854 + }, + { + "epoch": 0.1999877415569974, + "grad_norm": 4.689775466918945, + "learning_rate": 8.002000000000001e-05, + "loss": 1.3534, + "num_input_tokens_seen": 45931288, + "step": 2855 + }, + { + "epoch": 0.20005778980272662, + "grad_norm": 4.047792911529541, + "learning_rate": 8.001300175131348e-05, + "loss": 1.2926, + "num_input_tokens_seen": 45947672, + "step": 2856 + }, + { + "epoch": 0.20012783804845588, + "grad_norm": 4.609661102294922, + "learning_rate": 8.000600350262697e-05, + "loss": 1.0717, + "num_input_tokens_seen": 45964056, + "step": 2857 + }, + { + "epoch": 0.2001978862941851, + "grad_norm": 4.188840389251709, + "learning_rate": 7.999900525394046e-05, + "loss": 1.0872, + "num_input_tokens_seen": 45980152, + "step": 2858 + }, + { + "epoch": 0.20026793453991437, + "grad_norm": 3.558335781097412, + "learning_rate": 7.999200700525395e-05, + "loss": 1.1207, + "num_input_tokens_seen": 45996536, + "step": 2859 + }, + { + "epoch": 0.2003379827856436, + "grad_norm": 10.145834922790527, + "learning_rate": 7.998500875656743e-05, + "loss": 1.0649, + "num_input_tokens_seen": 46011616, + "step": 2860 + }, + { + "epoch": 0.20040803103137286, + "grad_norm": 5.534536838531494, + "learning_rate": 7.997801050788091e-05, + "loss": 1.3019, + "num_input_tokens_seen": 46027016, + "step": 2861 + }, + { + "epoch": 0.2004780792771021, + "grad_norm": 4.258336544036865, + "learning_rate": 7.99710122591944e-05, + "loss": 1.1192, + "num_input_tokens_seen": 46043400, + "step": 2862 + }, + { + "epoch": 0.20054812752283135, + "grad_norm": 5.266301155090332, + "learning_rate": 7.996401401050788e-05, + "loss": 1.0048, + "num_input_tokens_seen": 46059784, + "step": 2863 + }, + { + "epoch": 0.20061817576856059, + "grad_norm": 4.502764701843262, + "learning_rate": 7.995701576182137e-05, + "loss": 0.9435, + "num_input_tokens_seen": 46075584, + "step": 2864 + }, + { + "epoch": 0.20068822401428985, + "grad_norm": 4.39752197265625, + "learning_rate": 7.995001751313485e-05, + "loss": 0.9992, + "num_input_tokens_seen": 46091520, + "step": 2865 + }, + { + "epoch": 0.20075827226001908, + "grad_norm": 3.9562480449676514, + "learning_rate": 7.994301926444834e-05, + "loss": 0.9935, + "num_input_tokens_seen": 46107568, + "step": 2866 + }, + { + "epoch": 0.20082832050574834, + "grad_norm": 4.466681957244873, + "learning_rate": 7.993602101576183e-05, + "loss": 1.0067, + "num_input_tokens_seen": 46123952, + "step": 2867 + }, + { + "epoch": 0.20089836875147757, + "grad_norm": 3.9317095279693604, + "learning_rate": 7.992902276707531e-05, + "loss": 1.0353, + "num_input_tokens_seen": 46140336, + "step": 2868 + }, + { + "epoch": 0.20096841699720683, + "grad_norm": 5.025266170501709, + "learning_rate": 7.99220245183888e-05, + "loss": 1.1297, + "num_input_tokens_seen": 46155504, + "step": 2869 + }, + { + "epoch": 0.20103846524293606, + "grad_norm": 3.82340931892395, + "learning_rate": 7.991502626970227e-05, + "loss": 1.1677, + "num_input_tokens_seen": 46171888, + "step": 2870 + }, + { + "epoch": 0.20110851348866532, + "grad_norm": 4.017914295196533, + "learning_rate": 7.990802802101576e-05, + "loss": 1.0779, + "num_input_tokens_seen": 46187712, + "step": 2871 + }, + { + "epoch": 0.20117856173439455, + "grad_norm": 4.053089618682861, + "learning_rate": 7.990102977232926e-05, + "loss": 0.9687, + "num_input_tokens_seen": 46202912, + "step": 2872 + }, + { + "epoch": 0.2012486099801238, + "grad_norm": 3.5664076805114746, + "learning_rate": 7.989403152364274e-05, + "loss": 1.0047, + "num_input_tokens_seen": 46219296, + "step": 2873 + }, + { + "epoch": 0.20131865822585307, + "grad_norm": 4.039318084716797, + "learning_rate": 7.988703327495623e-05, + "loss": 1.107, + "num_input_tokens_seen": 46235680, + "step": 2874 + }, + { + "epoch": 0.2013887064715823, + "grad_norm": 3.8851678371429443, + "learning_rate": 7.988003502626971e-05, + "loss": 1.0268, + "num_input_tokens_seen": 46251408, + "step": 2875 + }, + { + "epoch": 0.20145875471731156, + "grad_norm": 3.581632137298584, + "learning_rate": 7.987303677758319e-05, + "loss": 0.9255, + "num_input_tokens_seen": 46267696, + "step": 2876 + }, + { + "epoch": 0.2015288029630408, + "grad_norm": 4.135960102081299, + "learning_rate": 7.986603852889668e-05, + "loss": 1.1763, + "num_input_tokens_seen": 46284080, + "step": 2877 + }, + { + "epoch": 0.20159885120877005, + "grad_norm": 3.649959087371826, + "learning_rate": 7.985904028021017e-05, + "loss": 0.8932, + "num_input_tokens_seen": 46300456, + "step": 2878 + }, + { + "epoch": 0.20166889945449928, + "grad_norm": 4.564159393310547, + "learning_rate": 7.985204203152365e-05, + "loss": 0.9239, + "num_input_tokens_seen": 46315928, + "step": 2879 + }, + { + "epoch": 0.20173894770022854, + "grad_norm": 3.806626796722412, + "learning_rate": 7.984504378283713e-05, + "loss": 1.0011, + "num_input_tokens_seen": 46331520, + "step": 2880 + }, + { + "epoch": 0.20180899594595778, + "grad_norm": 6.621458530426025, + "learning_rate": 7.983804553415062e-05, + "loss": 1.045, + "num_input_tokens_seen": 46347904, + "step": 2881 + }, + { + "epoch": 0.20187904419168703, + "grad_norm": 4.554089546203613, + "learning_rate": 7.983104728546411e-05, + "loss": 0.9472, + "num_input_tokens_seen": 46364288, + "step": 2882 + }, + { + "epoch": 0.20194909243741627, + "grad_norm": 4.206694602966309, + "learning_rate": 7.982404903677758e-05, + "loss": 1.1913, + "num_input_tokens_seen": 46380672, + "step": 2883 + }, + { + "epoch": 0.20201914068314553, + "grad_norm": 6.333064079284668, + "learning_rate": 7.981705078809107e-05, + "loss": 1.1189, + "num_input_tokens_seen": 46396384, + "step": 2884 + }, + { + "epoch": 0.20208918892887476, + "grad_norm": 3.6293835639953613, + "learning_rate": 7.981005253940456e-05, + "loss": 0.9825, + "num_input_tokens_seen": 46412712, + "step": 2885 + }, + { + "epoch": 0.20215923717460402, + "grad_norm": 6.282841682434082, + "learning_rate": 7.980305429071805e-05, + "loss": 1.0498, + "num_input_tokens_seen": 46429096, + "step": 2886 + }, + { + "epoch": 0.20222928542033325, + "grad_norm": 3.661564350128174, + "learning_rate": 7.979605604203152e-05, + "loss": 0.9022, + "num_input_tokens_seen": 46445480, + "step": 2887 + }, + { + "epoch": 0.2022993336660625, + "grad_norm": 4.232359409332275, + "learning_rate": 7.978905779334501e-05, + "loss": 1.3196, + "num_input_tokens_seen": 46461344, + "step": 2888 + }, + { + "epoch": 0.20236938191179174, + "grad_norm": 3.9777348041534424, + "learning_rate": 7.97820595446585e-05, + "loss": 1.1121, + "num_input_tokens_seen": 46477728, + "step": 2889 + }, + { + "epoch": 0.202439430157521, + "grad_norm": 4.221210479736328, + "learning_rate": 7.977506129597197e-05, + "loss": 1.1899, + "num_input_tokens_seen": 46493680, + "step": 2890 + }, + { + "epoch": 0.20250947840325023, + "grad_norm": 4.210818767547607, + "learning_rate": 7.976806304728546e-05, + "loss": 1.1003, + "num_input_tokens_seen": 46510064, + "step": 2891 + }, + { + "epoch": 0.2025795266489795, + "grad_norm": 5.012551307678223, + "learning_rate": 7.976106479859895e-05, + "loss": 0.9933, + "num_input_tokens_seen": 46526448, + "step": 2892 + }, + { + "epoch": 0.20264957489470872, + "grad_norm": 3.4867520332336426, + "learning_rate": 7.975406654991244e-05, + "loss": 0.8495, + "num_input_tokens_seen": 46542832, + "step": 2893 + }, + { + "epoch": 0.20271962314043798, + "grad_norm": 4.74222993850708, + "learning_rate": 7.974706830122593e-05, + "loss": 1.1398, + "num_input_tokens_seen": 46559048, + "step": 2894 + }, + { + "epoch": 0.20278967138616721, + "grad_norm": 5.358060359954834, + "learning_rate": 7.97400700525394e-05, + "loss": 1.0004, + "num_input_tokens_seen": 46575400, + "step": 2895 + }, + { + "epoch": 0.20285971963189647, + "grad_norm": 4.2599053382873535, + "learning_rate": 7.973307180385289e-05, + "loss": 1.0021, + "num_input_tokens_seen": 46591064, + "step": 2896 + }, + { + "epoch": 0.2029297678776257, + "grad_norm": 5.993118762969971, + "learning_rate": 7.972607355516637e-05, + "loss": 1.2017, + "num_input_tokens_seen": 46606504, + "step": 2897 + }, + { + "epoch": 0.20299981612335496, + "grad_norm": 4.129568576812744, + "learning_rate": 7.971907530647987e-05, + "loss": 1.2929, + "num_input_tokens_seen": 46622400, + "step": 2898 + }, + { + "epoch": 0.2030698643690842, + "grad_norm": 3.8486111164093018, + "learning_rate": 7.971207705779336e-05, + "loss": 1.0113, + "num_input_tokens_seen": 46638752, + "step": 2899 + }, + { + "epoch": 0.20313991261481346, + "grad_norm": 4.262311935424805, + "learning_rate": 7.970507880910683e-05, + "loss": 1.1222, + "num_input_tokens_seen": 46655136, + "step": 2900 + }, + { + "epoch": 0.2032099608605427, + "grad_norm": 4.065335750579834, + "learning_rate": 7.969808056042032e-05, + "loss": 1.2965, + "num_input_tokens_seen": 46671520, + "step": 2901 + }, + { + "epoch": 0.20328000910627195, + "grad_norm": 3.8313064575195312, + "learning_rate": 7.969108231173381e-05, + "loss": 1.1245, + "num_input_tokens_seen": 46687904, + "step": 2902 + }, + { + "epoch": 0.20335005735200118, + "grad_norm": 3.711580276489258, + "learning_rate": 7.968408406304729e-05, + "loss": 1.1688, + "num_input_tokens_seen": 46704088, + "step": 2903 + }, + { + "epoch": 0.20342010559773044, + "grad_norm": 4.172581672668457, + "learning_rate": 7.967708581436077e-05, + "loss": 1.1609, + "num_input_tokens_seen": 46720360, + "step": 2904 + }, + { + "epoch": 0.20349015384345967, + "grad_norm": 4.7567267417907715, + "learning_rate": 7.967008756567426e-05, + "loss": 1.169, + "num_input_tokens_seen": 46735560, + "step": 2905 + }, + { + "epoch": 0.20356020208918893, + "grad_norm": 4.304897308349609, + "learning_rate": 7.966308931698775e-05, + "loss": 0.9359, + "num_input_tokens_seen": 46751720, + "step": 2906 + }, + { + "epoch": 0.20363025033491816, + "grad_norm": 4.0556864738464355, + "learning_rate": 7.965609106830123e-05, + "loss": 1.0763, + "num_input_tokens_seen": 46767432, + "step": 2907 + }, + { + "epoch": 0.20370029858064742, + "grad_norm": 3.7381911277770996, + "learning_rate": 7.964909281961472e-05, + "loss": 1.0158, + "num_input_tokens_seen": 46783488, + "step": 2908 + }, + { + "epoch": 0.20377034682637668, + "grad_norm": 4.363048553466797, + "learning_rate": 7.96420945709282e-05, + "loss": 0.9627, + "num_input_tokens_seen": 46799016, + "step": 2909 + }, + { + "epoch": 0.2038403950721059, + "grad_norm": 4.04617166519165, + "learning_rate": 7.963509632224168e-05, + "loss": 1.1312, + "num_input_tokens_seen": 46815400, + "step": 2910 + }, + { + "epoch": 0.20391044331783517, + "grad_norm": 3.8854830265045166, + "learning_rate": 7.962809807355517e-05, + "loss": 1.0525, + "num_input_tokens_seen": 46831784, + "step": 2911 + }, + { + "epoch": 0.2039804915635644, + "grad_norm": 4.197749614715576, + "learning_rate": 7.962109982486866e-05, + "loss": 1.0839, + "num_input_tokens_seen": 46848168, + "step": 2912 + }, + { + "epoch": 0.20405053980929366, + "grad_norm": 4.414098739624023, + "learning_rate": 7.961410157618214e-05, + "loss": 1.1576, + "num_input_tokens_seen": 46864552, + "step": 2913 + }, + { + "epoch": 0.2041205880550229, + "grad_norm": 3.7771573066711426, + "learning_rate": 7.960710332749562e-05, + "loss": 0.9597, + "num_input_tokens_seen": 46880936, + "step": 2914 + }, + { + "epoch": 0.20419063630075215, + "grad_norm": 4.179026126861572, + "learning_rate": 7.960010507880911e-05, + "loss": 1.0754, + "num_input_tokens_seen": 46897192, + "step": 2915 + }, + { + "epoch": 0.20426068454648139, + "grad_norm": 4.017509460449219, + "learning_rate": 7.95931068301226e-05, + "loss": 1.0476, + "num_input_tokens_seen": 46913576, + "step": 2916 + }, + { + "epoch": 0.20433073279221065, + "grad_norm": 5.863056182861328, + "learning_rate": 7.958610858143607e-05, + "loss": 1.235, + "num_input_tokens_seen": 46929960, + "step": 2917 + }, + { + "epoch": 0.20440078103793988, + "grad_norm": 5.267307281494141, + "learning_rate": 7.957911033274956e-05, + "loss": 1.2414, + "num_input_tokens_seen": 46946344, + "step": 2918 + }, + { + "epoch": 0.20447082928366914, + "grad_norm": 5.20788049697876, + "learning_rate": 7.957211208406306e-05, + "loss": 1.1215, + "num_input_tokens_seen": 46961712, + "step": 2919 + }, + { + "epoch": 0.20454087752939837, + "grad_norm": 4.609791278839111, + "learning_rate": 7.956511383537654e-05, + "loss": 1.0219, + "num_input_tokens_seen": 46977752, + "step": 2920 + }, + { + "epoch": 0.20461092577512763, + "grad_norm": 3.9752824306488037, + "learning_rate": 7.955811558669003e-05, + "loss": 1.1427, + "num_input_tokens_seen": 46994136, + "step": 2921 + }, + { + "epoch": 0.20468097402085686, + "grad_norm": 3.8456339836120605, + "learning_rate": 7.95511173380035e-05, + "loss": 1.1006, + "num_input_tokens_seen": 47010520, + "step": 2922 + }, + { + "epoch": 0.20475102226658612, + "grad_norm": 4.087759494781494, + "learning_rate": 7.954411908931699e-05, + "loss": 1.0535, + "num_input_tokens_seen": 47026904, + "step": 2923 + }, + { + "epoch": 0.20482107051231535, + "grad_norm": 3.9754104614257812, + "learning_rate": 7.953712084063048e-05, + "loss": 1.0334, + "num_input_tokens_seen": 47043288, + "step": 2924 + }, + { + "epoch": 0.2048911187580446, + "grad_norm": 3.61798357963562, + "learning_rate": 7.953012259194397e-05, + "loss": 1.1471, + "num_input_tokens_seen": 47059672, + "step": 2925 + }, + { + "epoch": 0.20496116700377384, + "grad_norm": 4.015439510345459, + "learning_rate": 7.952312434325746e-05, + "loss": 1.0836, + "num_input_tokens_seen": 47074232, + "step": 2926 + }, + { + "epoch": 0.2050312152495031, + "grad_norm": 5.869642734527588, + "learning_rate": 7.951612609457093e-05, + "loss": 1.275, + "num_input_tokens_seen": 47090616, + "step": 2927 + }, + { + "epoch": 0.20510126349523233, + "grad_norm": 4.0500922203063965, + "learning_rate": 7.950912784588442e-05, + "loss": 1.1142, + "num_input_tokens_seen": 47106656, + "step": 2928 + }, + { + "epoch": 0.2051713117409616, + "grad_norm": 5.468737602233887, + "learning_rate": 7.950212959719791e-05, + "loss": 1.2679, + "num_input_tokens_seen": 47122648, + "step": 2929 + }, + { + "epoch": 0.20524135998669082, + "grad_norm": 3.842905282974243, + "learning_rate": 7.949513134851138e-05, + "loss": 1.0889, + "num_input_tokens_seen": 47139032, + "step": 2930 + }, + { + "epoch": 0.20531140823242008, + "grad_norm": 4.24273681640625, + "learning_rate": 7.948813309982487e-05, + "loss": 1.0533, + "num_input_tokens_seen": 47154344, + "step": 2931 + }, + { + "epoch": 0.20538145647814932, + "grad_norm": 3.977433443069458, + "learning_rate": 7.948113485113836e-05, + "loss": 0.9184, + "num_input_tokens_seen": 47170728, + "step": 2932 + }, + { + "epoch": 0.20545150472387858, + "grad_norm": 3.8441646099090576, + "learning_rate": 7.947413660245185e-05, + "loss": 1.1266, + "num_input_tokens_seen": 47187112, + "step": 2933 + }, + { + "epoch": 0.2055215529696078, + "grad_norm": 3.3789381980895996, + "learning_rate": 7.946713835376532e-05, + "loss": 0.9244, + "num_input_tokens_seen": 47203400, + "step": 2934 + }, + { + "epoch": 0.20559160121533707, + "grad_norm": 3.817631483078003, + "learning_rate": 7.946014010507881e-05, + "loss": 1.198, + "num_input_tokens_seen": 47219784, + "step": 2935 + }, + { + "epoch": 0.2056616494610663, + "grad_norm": 3.788300037384033, + "learning_rate": 7.94531418563923e-05, + "loss": 1.1565, + "num_input_tokens_seen": 47236168, + "step": 2936 + }, + { + "epoch": 0.20573169770679556, + "grad_norm": 3.852132558822632, + "learning_rate": 7.944614360770578e-05, + "loss": 1.1259, + "num_input_tokens_seen": 47252288, + "step": 2937 + }, + { + "epoch": 0.2058017459525248, + "grad_norm": 3.8631093502044678, + "learning_rate": 7.943914535901926e-05, + "loss": 1.091, + "num_input_tokens_seen": 47267000, + "step": 2938 + }, + { + "epoch": 0.20587179419825405, + "grad_norm": 3.72165846824646, + "learning_rate": 7.943214711033275e-05, + "loss": 0.7975, + "num_input_tokens_seen": 47282832, + "step": 2939 + }, + { + "epoch": 0.20594184244398328, + "grad_norm": 4.04188871383667, + "learning_rate": 7.942514886164624e-05, + "loss": 1.0953, + "num_input_tokens_seen": 47298320, + "step": 2940 + }, + { + "epoch": 0.20601189068971254, + "grad_norm": 3.5907206535339355, + "learning_rate": 7.941815061295972e-05, + "loss": 0.9766, + "num_input_tokens_seen": 47314704, + "step": 2941 + }, + { + "epoch": 0.20608193893544177, + "grad_norm": 5.023667335510254, + "learning_rate": 7.94111523642732e-05, + "loss": 1.2083, + "num_input_tokens_seen": 47331088, + "step": 2942 + }, + { + "epoch": 0.20615198718117103, + "grad_norm": 3.8885724544525146, + "learning_rate": 7.94041541155867e-05, + "loss": 0.9374, + "num_input_tokens_seen": 47347424, + "step": 2943 + }, + { + "epoch": 0.2062220354269003, + "grad_norm": 4.289493560791016, + "learning_rate": 7.939715586690017e-05, + "loss": 1.0399, + "num_input_tokens_seen": 47363808, + "step": 2944 + }, + { + "epoch": 0.20629208367262952, + "grad_norm": 4.976572513580322, + "learning_rate": 7.939015761821367e-05, + "loss": 0.8901, + "num_input_tokens_seen": 47379152, + "step": 2945 + }, + { + "epoch": 0.20636213191835878, + "grad_norm": 4.0893425941467285, + "learning_rate": 7.938315936952716e-05, + "loss": 1.0622, + "num_input_tokens_seen": 47395536, + "step": 2946 + }, + { + "epoch": 0.206432180164088, + "grad_norm": 3.799873113632202, + "learning_rate": 7.937616112084063e-05, + "loss": 1.1433, + "num_input_tokens_seen": 47410968, + "step": 2947 + }, + { + "epoch": 0.20650222840981727, + "grad_norm": 4.688945293426514, + "learning_rate": 7.936916287215412e-05, + "loss": 1.1424, + "num_input_tokens_seen": 47427352, + "step": 2948 + }, + { + "epoch": 0.2065722766555465, + "grad_norm": 3.6503846645355225, + "learning_rate": 7.93621646234676e-05, + "loss": 0.9236, + "num_input_tokens_seen": 47443736, + "step": 2949 + }, + { + "epoch": 0.20664232490127576, + "grad_norm": 4.2314324378967285, + "learning_rate": 7.935516637478109e-05, + "loss": 1.2795, + "num_input_tokens_seen": 47460120, + "step": 2950 + }, + { + "epoch": 0.206712373147005, + "grad_norm": 5.159674644470215, + "learning_rate": 7.934816812609458e-05, + "loss": 0.8852, + "num_input_tokens_seen": 47476256, + "step": 2951 + }, + { + "epoch": 0.20678242139273426, + "grad_norm": 3.798804759979248, + "learning_rate": 7.934116987740806e-05, + "loss": 1.1161, + "num_input_tokens_seen": 47492208, + "step": 2952 + }, + { + "epoch": 0.2068524696384635, + "grad_norm": 4.233975887298584, + "learning_rate": 7.933417162872155e-05, + "loss": 1.0927, + "num_input_tokens_seen": 47507728, + "step": 2953 + }, + { + "epoch": 0.20692251788419275, + "grad_norm": 3.38350772857666, + "learning_rate": 7.932717338003503e-05, + "loss": 1.0429, + "num_input_tokens_seen": 47523992, + "step": 2954 + }, + { + "epoch": 0.20699256612992198, + "grad_norm": 3.94380521774292, + "learning_rate": 7.932017513134852e-05, + "loss": 0.9227, + "num_input_tokens_seen": 47540376, + "step": 2955 + }, + { + "epoch": 0.20706261437565124, + "grad_norm": 3.887354612350464, + "learning_rate": 7.9313176882662e-05, + "loss": 0.9709, + "num_input_tokens_seen": 47555336, + "step": 2956 + }, + { + "epoch": 0.20713266262138047, + "grad_norm": 4.271602153778076, + "learning_rate": 7.930617863397548e-05, + "loss": 1.3089, + "num_input_tokens_seen": 47570520, + "step": 2957 + }, + { + "epoch": 0.20720271086710973, + "grad_norm": 4.119933605194092, + "learning_rate": 7.929918038528897e-05, + "loss": 1.0162, + "num_input_tokens_seen": 47586904, + "step": 2958 + }, + { + "epoch": 0.20727275911283896, + "grad_norm": 6.137136936187744, + "learning_rate": 7.929218213660246e-05, + "loss": 0.7847, + "num_input_tokens_seen": 47602424, + "step": 2959 + }, + { + "epoch": 0.20734280735856822, + "grad_norm": 3.5264923572540283, + "learning_rate": 7.928518388791595e-05, + "loss": 1.0751, + "num_input_tokens_seen": 47618808, + "step": 2960 + }, + { + "epoch": 0.20741285560429745, + "grad_norm": 4.183988094329834, + "learning_rate": 7.927818563922942e-05, + "loss": 1.1901, + "num_input_tokens_seen": 47634576, + "step": 2961 + }, + { + "epoch": 0.2074829038500267, + "grad_norm": 3.486311197280884, + "learning_rate": 7.927118739054291e-05, + "loss": 0.8559, + "num_input_tokens_seen": 47649920, + "step": 2962 + }, + { + "epoch": 0.20755295209575594, + "grad_norm": 4.561336994171143, + "learning_rate": 7.92641891418564e-05, + "loss": 0.9521, + "num_input_tokens_seen": 47666304, + "step": 2963 + }, + { + "epoch": 0.2076230003414852, + "grad_norm": 4.002289295196533, + "learning_rate": 7.925719089316987e-05, + "loss": 1.1708, + "num_input_tokens_seen": 47682688, + "step": 2964 + }, + { + "epoch": 0.20769304858721443, + "grad_norm": 3.694175958633423, + "learning_rate": 7.925019264448338e-05, + "loss": 0.9635, + "num_input_tokens_seen": 47699072, + "step": 2965 + }, + { + "epoch": 0.2077630968329437, + "grad_norm": 3.7827298641204834, + "learning_rate": 7.924319439579685e-05, + "loss": 1.0921, + "num_input_tokens_seen": 47714720, + "step": 2966 + }, + { + "epoch": 0.20783314507867293, + "grad_norm": 3.8371527194976807, + "learning_rate": 7.923619614711034e-05, + "loss": 1.12, + "num_input_tokens_seen": 47730904, + "step": 2967 + }, + { + "epoch": 0.20790319332440219, + "grad_norm": 4.20089054107666, + "learning_rate": 7.922919789842381e-05, + "loss": 1.0999, + "num_input_tokens_seen": 47747288, + "step": 2968 + }, + { + "epoch": 0.20797324157013142, + "grad_norm": 3.978065252304077, + "learning_rate": 7.92221996497373e-05, + "loss": 1.0472, + "num_input_tokens_seen": 47763672, + "step": 2969 + }, + { + "epoch": 0.20804328981586068, + "grad_norm": 4.882012844085693, + "learning_rate": 7.921520140105079e-05, + "loss": 1.0838, + "num_input_tokens_seen": 47778888, + "step": 2970 + }, + { + "epoch": 0.2081133380615899, + "grad_norm": 4.202088356018066, + "learning_rate": 7.920820315236428e-05, + "loss": 1.178, + "num_input_tokens_seen": 47795272, + "step": 2971 + }, + { + "epoch": 0.20818338630731917, + "grad_norm": 3.623647928237915, + "learning_rate": 7.920120490367777e-05, + "loss": 0.9782, + "num_input_tokens_seen": 47811656, + "step": 2972 + }, + { + "epoch": 0.2082534345530484, + "grad_norm": 4.158148765563965, + "learning_rate": 7.919420665499126e-05, + "loss": 1.0585, + "num_input_tokens_seen": 47827520, + "step": 2973 + }, + { + "epoch": 0.20832348279877766, + "grad_norm": 4.016353130340576, + "learning_rate": 7.918720840630473e-05, + "loss": 1.0176, + "num_input_tokens_seen": 47843904, + "step": 2974 + }, + { + "epoch": 0.2083935310445069, + "grad_norm": 5.862729072570801, + "learning_rate": 7.918021015761822e-05, + "loss": 1.0233, + "num_input_tokens_seen": 47860288, + "step": 2975 + }, + { + "epoch": 0.20846357929023615, + "grad_norm": 4.194519519805908, + "learning_rate": 7.91732119089317e-05, + "loss": 1.13, + "num_input_tokens_seen": 47876536, + "step": 2976 + }, + { + "epoch": 0.20853362753596538, + "grad_norm": 3.925144672393799, + "learning_rate": 7.916621366024518e-05, + "loss": 1.0069, + "num_input_tokens_seen": 47892216, + "step": 2977 + }, + { + "epoch": 0.20860367578169464, + "grad_norm": 4.005881309509277, + "learning_rate": 7.915921541155867e-05, + "loss": 1.1126, + "num_input_tokens_seen": 47907840, + "step": 2978 + }, + { + "epoch": 0.2086737240274239, + "grad_norm": 3.6061627864837646, + "learning_rate": 7.915221716287216e-05, + "loss": 0.8235, + "num_input_tokens_seen": 47923832, + "step": 2979 + }, + { + "epoch": 0.20874377227315313, + "grad_norm": 4.407896041870117, + "learning_rate": 7.914521891418565e-05, + "loss": 0.962, + "num_input_tokens_seen": 47940216, + "step": 2980 + }, + { + "epoch": 0.2088138205188824, + "grad_norm": 4.089472770690918, + "learning_rate": 7.913822066549912e-05, + "loss": 1.0691, + "num_input_tokens_seen": 47956600, + "step": 2981 + }, + { + "epoch": 0.20888386876461162, + "grad_norm": 4.384250640869141, + "learning_rate": 7.913122241681261e-05, + "loss": 1.1681, + "num_input_tokens_seen": 47972984, + "step": 2982 + }, + { + "epoch": 0.20895391701034088, + "grad_norm": 3.881756544113159, + "learning_rate": 7.91242241681261e-05, + "loss": 1.1473, + "num_input_tokens_seen": 47989368, + "step": 2983 + }, + { + "epoch": 0.20902396525607012, + "grad_norm": 3.9435884952545166, + "learning_rate": 7.911722591943958e-05, + "loss": 1.0328, + "num_input_tokens_seen": 48005608, + "step": 2984 + }, + { + "epoch": 0.20909401350179938, + "grad_norm": 4.1196794509887695, + "learning_rate": 7.911022767075308e-05, + "loss": 1.0287, + "num_input_tokens_seen": 48021992, + "step": 2985 + }, + { + "epoch": 0.2091640617475286, + "grad_norm": 4.482571125030518, + "learning_rate": 7.910322942206655e-05, + "loss": 1.0663, + "num_input_tokens_seen": 48037816, + "step": 2986 + }, + { + "epoch": 0.20923410999325787, + "grad_norm": 5.359109401702881, + "learning_rate": 7.909623117338004e-05, + "loss": 1.2157, + "num_input_tokens_seen": 48054200, + "step": 2987 + }, + { + "epoch": 0.2093041582389871, + "grad_norm": 5.712708950042725, + "learning_rate": 7.908923292469352e-05, + "loss": 1.1454, + "num_input_tokens_seen": 48070008, + "step": 2988 + }, + { + "epoch": 0.20937420648471636, + "grad_norm": 3.980526924133301, + "learning_rate": 7.9082234676007e-05, + "loss": 1.1933, + "num_input_tokens_seen": 48084864, + "step": 2989 + }, + { + "epoch": 0.2094442547304456, + "grad_norm": 4.963679790496826, + "learning_rate": 7.90752364273205e-05, + "loss": 1.1465, + "num_input_tokens_seen": 48101248, + "step": 2990 + }, + { + "epoch": 0.20951430297617485, + "grad_norm": 6.20939302444458, + "learning_rate": 7.906823817863398e-05, + "loss": 1.1187, + "num_input_tokens_seen": 48114984, + "step": 2991 + }, + { + "epoch": 0.20958435122190408, + "grad_norm": 13.218465805053711, + "learning_rate": 7.906123992994747e-05, + "loss": 1.0589, + "num_input_tokens_seen": 48129704, + "step": 2992 + }, + { + "epoch": 0.20965439946763334, + "grad_norm": 6.285522937774658, + "learning_rate": 7.905424168126095e-05, + "loss": 1.0993, + "num_input_tokens_seen": 48144280, + "step": 2993 + }, + { + "epoch": 0.20972444771336257, + "grad_norm": 5.113750457763672, + "learning_rate": 7.904724343257444e-05, + "loss": 1.0187, + "num_input_tokens_seen": 48160664, + "step": 2994 + }, + { + "epoch": 0.20979449595909183, + "grad_norm": 3.5571322441101074, + "learning_rate": 7.904024518388791e-05, + "loss": 0.9789, + "num_input_tokens_seen": 48177048, + "step": 2995 + }, + { + "epoch": 0.20986454420482106, + "grad_norm": 4.965229511260986, + "learning_rate": 7.90332469352014e-05, + "loss": 1.0934, + "num_input_tokens_seen": 48193400, + "step": 2996 + }, + { + "epoch": 0.20993459245055032, + "grad_norm": 4.466450214385986, + "learning_rate": 7.902624868651489e-05, + "loss": 1.2786, + "num_input_tokens_seen": 48209784, + "step": 2997 + }, + { + "epoch": 0.21000464069627955, + "grad_norm": 3.556642770767212, + "learning_rate": 7.901925043782838e-05, + "loss": 1.0579, + "num_input_tokens_seen": 48226096, + "step": 2998 + }, + { + "epoch": 0.2100746889420088, + "grad_norm": 5.175073146820068, + "learning_rate": 7.901225218914187e-05, + "loss": 1.0822, + "num_input_tokens_seen": 48242384, + "step": 2999 + }, + { + "epoch": 0.21014473718773805, + "grad_norm": 4.901797771453857, + "learning_rate": 7.900525394045535e-05, + "loss": 0.9413, + "num_input_tokens_seen": 48257944, + "step": 3000 + }, + { + "epoch": 0.21014473718773805, + "eval_loss": 1.137844204902649, + "eval_runtime": 0.2151, + "eval_samples_per_second": 4.649, + "eval_steps_per_second": 4.649, + "num_input_tokens_seen": 48257944, + "step": 3000 + }, + { + "epoch": 0.2102147854334673, + "grad_norm": 3.8474860191345215, + "learning_rate": 7.899825569176883e-05, + "loss": 0.9454, + "num_input_tokens_seen": 48273144, + "step": 3001 + }, + { + "epoch": 0.21028483367919654, + "grad_norm": 4.4164347648620605, + "learning_rate": 7.899125744308232e-05, + "loss": 1.2554, + "num_input_tokens_seen": 48288896, + "step": 3002 + }, + { + "epoch": 0.2103548819249258, + "grad_norm": 4.560143947601318, + "learning_rate": 7.898425919439579e-05, + "loss": 1.1129, + "num_input_tokens_seen": 48305168, + "step": 3003 + }, + { + "epoch": 0.21042493017065503, + "grad_norm": 4.310809135437012, + "learning_rate": 7.897726094570928e-05, + "loss": 1.1215, + "num_input_tokens_seen": 48320936, + "step": 3004 + }, + { + "epoch": 0.2104949784163843, + "grad_norm": 5.8606367111206055, + "learning_rate": 7.897026269702277e-05, + "loss": 0.7859, + "num_input_tokens_seen": 48334752, + "step": 3005 + }, + { + "epoch": 0.21056502666211352, + "grad_norm": 4.533644676208496, + "learning_rate": 7.896326444833626e-05, + "loss": 1.3134, + "num_input_tokens_seen": 48351136, + "step": 3006 + }, + { + "epoch": 0.21063507490784278, + "grad_norm": 3.955151081085205, + "learning_rate": 7.895626619964975e-05, + "loss": 1.3093, + "num_input_tokens_seen": 48367520, + "step": 3007 + }, + { + "epoch": 0.210705123153572, + "grad_norm": 4.857527732849121, + "learning_rate": 7.894926795096322e-05, + "loss": 0.9838, + "num_input_tokens_seen": 48383584, + "step": 3008 + }, + { + "epoch": 0.21077517139930127, + "grad_norm": 4.2091593742370605, + "learning_rate": 7.894226970227671e-05, + "loss": 0.9278, + "num_input_tokens_seen": 48399968, + "step": 3009 + }, + { + "epoch": 0.2108452196450305, + "grad_norm": 4.02255916595459, + "learning_rate": 7.89352714535902e-05, + "loss": 1.086, + "num_input_tokens_seen": 48416016, + "step": 3010 + }, + { + "epoch": 0.21091526789075976, + "grad_norm": 4.021467208862305, + "learning_rate": 7.892827320490369e-05, + "loss": 1.1088, + "num_input_tokens_seen": 48432400, + "step": 3011 + }, + { + "epoch": 0.21098531613648902, + "grad_norm": 4.211849212646484, + "learning_rate": 7.892127495621716e-05, + "loss": 1.1698, + "num_input_tokens_seen": 48448784, + "step": 3012 + }, + { + "epoch": 0.21105536438221825, + "grad_norm": 3.890512704849243, + "learning_rate": 7.891427670753065e-05, + "loss": 1.1048, + "num_input_tokens_seen": 48465168, + "step": 3013 + }, + { + "epoch": 0.2111254126279475, + "grad_norm": 3.9605376720428467, + "learning_rate": 7.890727845884414e-05, + "loss": 0.9904, + "num_input_tokens_seen": 48481024, + "step": 3014 + }, + { + "epoch": 0.21119546087367674, + "grad_norm": 3.6985483169555664, + "learning_rate": 7.890028021015761e-05, + "loss": 1.1033, + "num_input_tokens_seen": 48497408, + "step": 3015 + }, + { + "epoch": 0.211265509119406, + "grad_norm": 4.245354652404785, + "learning_rate": 7.88932819614711e-05, + "loss": 1.0609, + "num_input_tokens_seen": 48513640, + "step": 3016 + }, + { + "epoch": 0.21133555736513523, + "grad_norm": 4.163609027862549, + "learning_rate": 7.888628371278459e-05, + "loss": 1.2399, + "num_input_tokens_seen": 48529704, + "step": 3017 + }, + { + "epoch": 0.2114056056108645, + "grad_norm": 4.139742374420166, + "learning_rate": 7.887928546409808e-05, + "loss": 1.1029, + "num_input_tokens_seen": 48545808, + "step": 3018 + }, + { + "epoch": 0.21147565385659373, + "grad_norm": 4.119020938873291, + "learning_rate": 7.887228721541157e-05, + "loss": 1.233, + "num_input_tokens_seen": 48561584, + "step": 3019 + }, + { + "epoch": 0.21154570210232299, + "grad_norm": 3.467578172683716, + "learning_rate": 7.886528896672504e-05, + "loss": 0.9757, + "num_input_tokens_seen": 48577912, + "step": 3020 + }, + { + "epoch": 0.21161575034805222, + "grad_norm": 4.891791820526123, + "learning_rate": 7.885829071803853e-05, + "loss": 1.0507, + "num_input_tokens_seen": 48591792, + "step": 3021 + }, + { + "epoch": 0.21168579859378148, + "grad_norm": 3.8184545040130615, + "learning_rate": 7.885129246935201e-05, + "loss": 0.9845, + "num_input_tokens_seen": 48606656, + "step": 3022 + }, + { + "epoch": 0.2117558468395107, + "grad_norm": 3.909607410430908, + "learning_rate": 7.88442942206655e-05, + "loss": 1.2735, + "num_input_tokens_seen": 48622608, + "step": 3023 + }, + { + "epoch": 0.21182589508523997, + "grad_norm": 3.780740737915039, + "learning_rate": 7.883729597197899e-05, + "loss": 0.9796, + "num_input_tokens_seen": 48638992, + "step": 3024 + }, + { + "epoch": 0.2118959433309692, + "grad_norm": 3.95491099357605, + "learning_rate": 7.883029772329247e-05, + "loss": 1.0265, + "num_input_tokens_seen": 48654344, + "step": 3025 + }, + { + "epoch": 0.21196599157669846, + "grad_norm": 3.724346876144409, + "learning_rate": 7.882329947460596e-05, + "loss": 0.9352, + "num_input_tokens_seen": 48670728, + "step": 3026 + }, + { + "epoch": 0.2120360398224277, + "grad_norm": 4.314544200897217, + "learning_rate": 7.881630122591945e-05, + "loss": 1.145, + "num_input_tokens_seen": 48685424, + "step": 3027 + }, + { + "epoch": 0.21210608806815695, + "grad_norm": 3.9340150356292725, + "learning_rate": 7.880930297723293e-05, + "loss": 1.0337, + "num_input_tokens_seen": 48700416, + "step": 3028 + }, + { + "epoch": 0.21217613631388618, + "grad_norm": 4.978084087371826, + "learning_rate": 7.880230472854641e-05, + "loss": 1.2418, + "num_input_tokens_seen": 48716800, + "step": 3029 + }, + { + "epoch": 0.21224618455961544, + "grad_norm": 3.7038094997406006, + "learning_rate": 7.879530647985989e-05, + "loss": 1.0618, + "num_input_tokens_seen": 48732616, + "step": 3030 + }, + { + "epoch": 0.21231623280534467, + "grad_norm": 5.743021011352539, + "learning_rate": 7.878830823117339e-05, + "loss": 1.033, + "num_input_tokens_seen": 48748656, + "step": 3031 + }, + { + "epoch": 0.21238628105107393, + "grad_norm": 5.655540943145752, + "learning_rate": 7.878130998248687e-05, + "loss": 1.3541, + "num_input_tokens_seen": 48765040, + "step": 3032 + }, + { + "epoch": 0.21245632929680316, + "grad_norm": 4.291803359985352, + "learning_rate": 7.877431173380036e-05, + "loss": 1.1966, + "num_input_tokens_seen": 48781424, + "step": 3033 + }, + { + "epoch": 0.21252637754253242, + "grad_norm": 5.103096961975098, + "learning_rate": 7.876731348511384e-05, + "loss": 1.0543, + "num_input_tokens_seen": 48797808, + "step": 3034 + }, + { + "epoch": 0.21259642578826166, + "grad_norm": 5.048161029815674, + "learning_rate": 7.876031523642732e-05, + "loss": 0.9595, + "num_input_tokens_seen": 48814192, + "step": 3035 + }, + { + "epoch": 0.21266647403399092, + "grad_norm": 4.086791038513184, + "learning_rate": 7.875331698774081e-05, + "loss": 1.1128, + "num_input_tokens_seen": 48829816, + "step": 3036 + }, + { + "epoch": 0.21273652227972015, + "grad_norm": 3.8422605991363525, + "learning_rate": 7.87463187390543e-05, + "loss": 1.175, + "num_input_tokens_seen": 48846200, + "step": 3037 + }, + { + "epoch": 0.2128065705254494, + "grad_norm": 3.7120776176452637, + "learning_rate": 7.873932049036778e-05, + "loss": 1.0748, + "num_input_tokens_seen": 48862584, + "step": 3038 + }, + { + "epoch": 0.21287661877117864, + "grad_norm": 5.051353454589844, + "learning_rate": 7.873232224168126e-05, + "loss": 1.0278, + "num_input_tokens_seen": 48878368, + "step": 3039 + }, + { + "epoch": 0.2129466670169079, + "grad_norm": 3.9874653816223145, + "learning_rate": 7.872532399299475e-05, + "loss": 1.256, + "num_input_tokens_seen": 48894696, + "step": 3040 + }, + { + "epoch": 0.21301671526263713, + "grad_norm": 4.455258369445801, + "learning_rate": 7.871832574430824e-05, + "loss": 1.2226, + "num_input_tokens_seen": 48911080, + "step": 3041 + }, + { + "epoch": 0.2130867635083664, + "grad_norm": 5.521103382110596, + "learning_rate": 7.871132749562171e-05, + "loss": 1.2116, + "num_input_tokens_seen": 48927464, + "step": 3042 + }, + { + "epoch": 0.21315681175409562, + "grad_norm": 3.80818510055542, + "learning_rate": 7.87043292469352e-05, + "loss": 1.2213, + "num_input_tokens_seen": 48943848, + "step": 3043 + }, + { + "epoch": 0.21322685999982488, + "grad_norm": 4.319914817810059, + "learning_rate": 7.869733099824869e-05, + "loss": 0.9786, + "num_input_tokens_seen": 48960232, + "step": 3044 + }, + { + "epoch": 0.2132969082455541, + "grad_norm": 4.196371078491211, + "learning_rate": 7.869033274956218e-05, + "loss": 0.9782, + "num_input_tokens_seen": 48976616, + "step": 3045 + }, + { + "epoch": 0.21336695649128337, + "grad_norm": 3.988114595413208, + "learning_rate": 7.868333450087567e-05, + "loss": 1.0923, + "num_input_tokens_seen": 48992248, + "step": 3046 + }, + { + "epoch": 0.21343700473701263, + "grad_norm": 3.887589454650879, + "learning_rate": 7.867633625218914e-05, + "loss": 1.068, + "num_input_tokens_seen": 49008632, + "step": 3047 + }, + { + "epoch": 0.21350705298274186, + "grad_norm": 3.7942206859588623, + "learning_rate": 7.866933800350263e-05, + "loss": 1.1917, + "num_input_tokens_seen": 49024560, + "step": 3048 + }, + { + "epoch": 0.21357710122847112, + "grad_norm": 4.464767932891846, + "learning_rate": 7.86623397548161e-05, + "loss": 0.9137, + "num_input_tokens_seen": 49040200, + "step": 3049 + }, + { + "epoch": 0.21364714947420035, + "grad_norm": 4.411591529846191, + "learning_rate": 7.86553415061296e-05, + "loss": 1.2315, + "num_input_tokens_seen": 49056328, + "step": 3050 + }, + { + "epoch": 0.2137171977199296, + "grad_norm": 4.895592212677002, + "learning_rate": 7.86483432574431e-05, + "loss": 1.0756, + "num_input_tokens_seen": 49072696, + "step": 3051 + }, + { + "epoch": 0.21378724596565885, + "grad_norm": 4.46630859375, + "learning_rate": 7.864134500875657e-05, + "loss": 0.837, + "num_input_tokens_seen": 49087256, + "step": 3052 + }, + { + "epoch": 0.2138572942113881, + "grad_norm": 4.975766658782959, + "learning_rate": 7.863434676007006e-05, + "loss": 1.0508, + "num_input_tokens_seen": 49103640, + "step": 3053 + }, + { + "epoch": 0.21392734245711734, + "grad_norm": 4.441544532775879, + "learning_rate": 7.862734851138355e-05, + "loss": 0.9917, + "num_input_tokens_seen": 49119032, + "step": 3054 + }, + { + "epoch": 0.2139973907028466, + "grad_norm": 3.797757148742676, + "learning_rate": 7.862035026269702e-05, + "loss": 0.8701, + "num_input_tokens_seen": 49134960, + "step": 3055 + }, + { + "epoch": 0.21406743894857583, + "grad_norm": 4.021834373474121, + "learning_rate": 7.861335201401051e-05, + "loss": 1.0355, + "num_input_tokens_seen": 49151344, + "step": 3056 + }, + { + "epoch": 0.2141374871943051, + "grad_norm": 3.772587537765503, + "learning_rate": 7.8606353765324e-05, + "loss": 0.9717, + "num_input_tokens_seen": 49167424, + "step": 3057 + }, + { + "epoch": 0.21420753544003432, + "grad_norm": 5.356356143951416, + "learning_rate": 7.859935551663749e-05, + "loss": 1.027, + "num_input_tokens_seen": 49183504, + "step": 3058 + }, + { + "epoch": 0.21427758368576358, + "grad_norm": 4.314568042755127, + "learning_rate": 7.859235726795096e-05, + "loss": 1.0233, + "num_input_tokens_seen": 49199320, + "step": 3059 + }, + { + "epoch": 0.2143476319314928, + "grad_norm": 3.777794122695923, + "learning_rate": 7.858535901926445e-05, + "loss": 1.1218, + "num_input_tokens_seen": 49215032, + "step": 3060 + }, + { + "epoch": 0.21441768017722207, + "grad_norm": 3.788496732711792, + "learning_rate": 7.857836077057794e-05, + "loss": 0.9121, + "num_input_tokens_seen": 49230248, + "step": 3061 + }, + { + "epoch": 0.2144877284229513, + "grad_norm": 3.776698589324951, + "learning_rate": 7.857136252189142e-05, + "loss": 1.0687, + "num_input_tokens_seen": 49246264, + "step": 3062 + }, + { + "epoch": 0.21455777666868056, + "grad_norm": 3.8229172229766846, + "learning_rate": 7.85643642732049e-05, + "loss": 0.9773, + "num_input_tokens_seen": 49262648, + "step": 3063 + }, + { + "epoch": 0.2146278249144098, + "grad_norm": 3.7620902061462402, + "learning_rate": 7.85573660245184e-05, + "loss": 1.0162, + "num_input_tokens_seen": 49278640, + "step": 3064 + }, + { + "epoch": 0.21469787316013905, + "grad_norm": 3.953148126602173, + "learning_rate": 7.855036777583188e-05, + "loss": 1.1277, + "num_input_tokens_seen": 49295024, + "step": 3065 + }, + { + "epoch": 0.21476792140586828, + "grad_norm": 4.1923441886901855, + "learning_rate": 7.854336952714536e-05, + "loss": 0.9317, + "num_input_tokens_seen": 49311408, + "step": 3066 + }, + { + "epoch": 0.21483796965159754, + "grad_norm": 4.922461986541748, + "learning_rate": 7.853637127845885e-05, + "loss": 1.2234, + "num_input_tokens_seen": 49327120, + "step": 3067 + }, + { + "epoch": 0.21490801789732678, + "grad_norm": 3.7414777278900146, + "learning_rate": 7.852937302977233e-05, + "loss": 0.8628, + "num_input_tokens_seen": 49343504, + "step": 3068 + }, + { + "epoch": 0.21497806614305603, + "grad_norm": 6.1490912437438965, + "learning_rate": 7.852237478108581e-05, + "loss": 0.9836, + "num_input_tokens_seen": 49359336, + "step": 3069 + }, + { + "epoch": 0.21504811438878527, + "grad_norm": 4.232786178588867, + "learning_rate": 7.85153765323993e-05, + "loss": 1.1071, + "num_input_tokens_seen": 49374888, + "step": 3070 + }, + { + "epoch": 0.21511816263451453, + "grad_norm": 4.170281887054443, + "learning_rate": 7.85083782837128e-05, + "loss": 1.1863, + "num_input_tokens_seen": 49391272, + "step": 3071 + }, + { + "epoch": 0.21518821088024376, + "grad_norm": 4.096348285675049, + "learning_rate": 7.850138003502627e-05, + "loss": 1.1574, + "num_input_tokens_seen": 49407656, + "step": 3072 + }, + { + "epoch": 0.21525825912597302, + "grad_norm": 4.523014068603516, + "learning_rate": 7.849438178633976e-05, + "loss": 0.9481, + "num_input_tokens_seen": 49424040, + "step": 3073 + }, + { + "epoch": 0.21532830737170225, + "grad_norm": 5.029306888580322, + "learning_rate": 7.848738353765324e-05, + "loss": 1.2744, + "num_input_tokens_seen": 49440208, + "step": 3074 + }, + { + "epoch": 0.2153983556174315, + "grad_norm": 3.5349771976470947, + "learning_rate": 7.848038528896673e-05, + "loss": 0.8675, + "num_input_tokens_seen": 49456520, + "step": 3075 + }, + { + "epoch": 0.21546840386316074, + "grad_norm": 3.544787645339966, + "learning_rate": 7.84733870402802e-05, + "loss": 1.0082, + "num_input_tokens_seen": 49472904, + "step": 3076 + }, + { + "epoch": 0.21553845210889, + "grad_norm": 4.602756500244141, + "learning_rate": 7.84663887915937e-05, + "loss": 1.0747, + "num_input_tokens_seen": 49489264, + "step": 3077 + }, + { + "epoch": 0.21560850035461923, + "grad_norm": 6.479659080505371, + "learning_rate": 7.845939054290719e-05, + "loss": 1.0437, + "num_input_tokens_seen": 49505232, + "step": 3078 + }, + { + "epoch": 0.2156785486003485, + "grad_norm": 4.584348201751709, + "learning_rate": 7.845239229422067e-05, + "loss": 1.1054, + "num_input_tokens_seen": 49521616, + "step": 3079 + }, + { + "epoch": 0.21574859684607772, + "grad_norm": 4.339470386505127, + "learning_rate": 7.844539404553416e-05, + "loss": 1.2386, + "num_input_tokens_seen": 49537376, + "step": 3080 + }, + { + "epoch": 0.21581864509180698, + "grad_norm": 4.098686218261719, + "learning_rate": 7.843839579684765e-05, + "loss": 0.9376, + "num_input_tokens_seen": 49552256, + "step": 3081 + }, + { + "epoch": 0.21588869333753624, + "grad_norm": 4.619485855102539, + "learning_rate": 7.843139754816112e-05, + "loss": 1.0066, + "num_input_tokens_seen": 49568640, + "step": 3082 + }, + { + "epoch": 0.21595874158326547, + "grad_norm": 4.018712997436523, + "learning_rate": 7.842439929947461e-05, + "loss": 1.0062, + "num_input_tokens_seen": 49584816, + "step": 3083 + }, + { + "epoch": 0.21602878982899473, + "grad_norm": 5.898901462554932, + "learning_rate": 7.84174010507881e-05, + "loss": 1.1956, + "num_input_tokens_seen": 49600872, + "step": 3084 + }, + { + "epoch": 0.21609883807472396, + "grad_norm": 4.794529438018799, + "learning_rate": 7.841040280210159e-05, + "loss": 1.0035, + "num_input_tokens_seen": 49616840, + "step": 3085 + }, + { + "epoch": 0.21616888632045322, + "grad_norm": 4.934964656829834, + "learning_rate": 7.840340455341506e-05, + "loss": 1.0033, + "num_input_tokens_seen": 49633224, + "step": 3086 + }, + { + "epoch": 0.21623893456618246, + "grad_norm": 3.6171560287475586, + "learning_rate": 7.839640630472855e-05, + "loss": 1.1165, + "num_input_tokens_seen": 49649056, + "step": 3087 + }, + { + "epoch": 0.21630898281191172, + "grad_norm": 4.032123565673828, + "learning_rate": 7.838940805604204e-05, + "loss": 1.2411, + "num_input_tokens_seen": 49665440, + "step": 3088 + }, + { + "epoch": 0.21637903105764095, + "grad_norm": 3.4669382572174072, + "learning_rate": 7.838240980735551e-05, + "loss": 0.9666, + "num_input_tokens_seen": 49681824, + "step": 3089 + }, + { + "epoch": 0.2164490793033702, + "grad_norm": 3.6899688243865967, + "learning_rate": 7.8375411558669e-05, + "loss": 0.9657, + "num_input_tokens_seen": 49698208, + "step": 3090 + }, + { + "epoch": 0.21651912754909944, + "grad_norm": 4.231171131134033, + "learning_rate": 7.83684133099825e-05, + "loss": 1.1459, + "num_input_tokens_seen": 49713664, + "step": 3091 + }, + { + "epoch": 0.2165891757948287, + "grad_norm": 4.792253017425537, + "learning_rate": 7.836141506129598e-05, + "loss": 0.9982, + "num_input_tokens_seen": 49730048, + "step": 3092 + }, + { + "epoch": 0.21665922404055793, + "grad_norm": 5.7171478271484375, + "learning_rate": 7.835441681260945e-05, + "loss": 1.189, + "num_input_tokens_seen": 49746432, + "step": 3093 + }, + { + "epoch": 0.2167292722862872, + "grad_norm": 4.393872261047363, + "learning_rate": 7.834741856392294e-05, + "loss": 0.9969, + "num_input_tokens_seen": 49762816, + "step": 3094 + }, + { + "epoch": 0.21679932053201642, + "grad_norm": 6.388276100158691, + "learning_rate": 7.834042031523643e-05, + "loss": 1.2192, + "num_input_tokens_seen": 49778680, + "step": 3095 + }, + { + "epoch": 0.21686936877774568, + "grad_norm": 3.8204843997955322, + "learning_rate": 7.83334220665499e-05, + "loss": 1.0601, + "num_input_tokens_seen": 49794344, + "step": 3096 + }, + { + "epoch": 0.2169394170234749, + "grad_norm": 10.573785781860352, + "learning_rate": 7.832642381786341e-05, + "loss": 0.9257, + "num_input_tokens_seen": 49810208, + "step": 3097 + }, + { + "epoch": 0.21700946526920417, + "grad_norm": 3.437734603881836, + "learning_rate": 7.83194255691769e-05, + "loss": 0.8757, + "num_input_tokens_seen": 49826448, + "step": 3098 + }, + { + "epoch": 0.2170795135149334, + "grad_norm": 3.476918935775757, + "learning_rate": 7.831242732049037e-05, + "loss": 0.908, + "num_input_tokens_seen": 49842832, + "step": 3099 + }, + { + "epoch": 0.21714956176066266, + "grad_norm": 4.037630558013916, + "learning_rate": 7.830542907180386e-05, + "loss": 1.1305, + "num_input_tokens_seen": 49859216, + "step": 3100 + }, + { + "epoch": 0.2172196100063919, + "grad_norm": 3.7424814701080322, + "learning_rate": 7.829843082311734e-05, + "loss": 1.1701, + "num_input_tokens_seen": 49875528, + "step": 3101 + }, + { + "epoch": 0.21728965825212115, + "grad_norm": 4.222198486328125, + "learning_rate": 7.829143257443082e-05, + "loss": 1.0539, + "num_input_tokens_seen": 49891912, + "step": 3102 + }, + { + "epoch": 0.21735970649785039, + "grad_norm": 4.064510822296143, + "learning_rate": 7.828443432574431e-05, + "loss": 1.0524, + "num_input_tokens_seen": 49908064, + "step": 3103 + }, + { + "epoch": 0.21742975474357965, + "grad_norm": 3.822498083114624, + "learning_rate": 7.82774360770578e-05, + "loss": 0.9085, + "num_input_tokens_seen": 49923776, + "step": 3104 + }, + { + "epoch": 0.21749980298930888, + "grad_norm": 4.368459224700928, + "learning_rate": 7.827043782837129e-05, + "loss": 0.9599, + "num_input_tokens_seen": 49940104, + "step": 3105 + }, + { + "epoch": 0.21756985123503814, + "grad_norm": 3.722587823867798, + "learning_rate": 7.826343957968477e-05, + "loss": 1.0286, + "num_input_tokens_seen": 49955624, + "step": 3106 + }, + { + "epoch": 0.21763989948076737, + "grad_norm": 4.277473449707031, + "learning_rate": 7.825644133099825e-05, + "loss": 1.1797, + "num_input_tokens_seen": 49971784, + "step": 3107 + }, + { + "epoch": 0.21770994772649663, + "grad_norm": 4.586781024932861, + "learning_rate": 7.824944308231174e-05, + "loss": 1.0395, + "num_input_tokens_seen": 49988168, + "step": 3108 + }, + { + "epoch": 0.21777999597222586, + "grad_norm": 4.456960201263428, + "learning_rate": 7.824244483362522e-05, + "loss": 0.9449, + "num_input_tokens_seen": 50003392, + "step": 3109 + }, + { + "epoch": 0.21785004421795512, + "grad_norm": 4.115220069885254, + "learning_rate": 7.82354465849387e-05, + "loss": 1.1393, + "num_input_tokens_seen": 50019776, + "step": 3110 + }, + { + "epoch": 0.21792009246368435, + "grad_norm": 3.5760059356689453, + "learning_rate": 7.82284483362522e-05, + "loss": 1.0421, + "num_input_tokens_seen": 50036160, + "step": 3111 + }, + { + "epoch": 0.2179901407094136, + "grad_norm": 5.757627487182617, + "learning_rate": 7.822145008756568e-05, + "loss": 1.1382, + "num_input_tokens_seen": 50052544, + "step": 3112 + }, + { + "epoch": 0.21806018895514284, + "grad_norm": 3.4349796772003174, + "learning_rate": 7.821445183887916e-05, + "loss": 0.8474, + "num_input_tokens_seen": 50068872, + "step": 3113 + }, + { + "epoch": 0.2181302372008721, + "grad_norm": 5.546512603759766, + "learning_rate": 7.820745359019265e-05, + "loss": 1.4564, + "num_input_tokens_seen": 50085256, + "step": 3114 + }, + { + "epoch": 0.21820028544660133, + "grad_norm": 3.4954123497009277, + "learning_rate": 7.820045534150614e-05, + "loss": 1.0479, + "num_input_tokens_seen": 50101640, + "step": 3115 + }, + { + "epoch": 0.2182703336923306, + "grad_norm": 5.396134376525879, + "learning_rate": 7.819345709281961e-05, + "loss": 1.0834, + "num_input_tokens_seen": 50117040, + "step": 3116 + }, + { + "epoch": 0.21834038193805985, + "grad_norm": 3.7895803451538086, + "learning_rate": 7.818645884413311e-05, + "loss": 0.9567, + "num_input_tokens_seen": 50133424, + "step": 3117 + }, + { + "epoch": 0.21841043018378908, + "grad_norm": 5.321155548095703, + "learning_rate": 7.81794605954466e-05, + "loss": 1.2246, + "num_input_tokens_seen": 50148520, + "step": 3118 + }, + { + "epoch": 0.21848047842951834, + "grad_norm": 4.993834495544434, + "learning_rate": 7.817246234676008e-05, + "loss": 1.0944, + "num_input_tokens_seen": 50164904, + "step": 3119 + }, + { + "epoch": 0.21855052667524758, + "grad_norm": 3.69236159324646, + "learning_rate": 7.816546409807355e-05, + "loss": 0.9759, + "num_input_tokens_seen": 50181288, + "step": 3120 + }, + { + "epoch": 0.21862057492097683, + "grad_norm": 3.689748764038086, + "learning_rate": 7.815846584938704e-05, + "loss": 1.0594, + "num_input_tokens_seen": 50197672, + "step": 3121 + }, + { + "epoch": 0.21869062316670607, + "grad_norm": 5.904501914978027, + "learning_rate": 7.815146760070053e-05, + "loss": 1.2348, + "num_input_tokens_seen": 50214016, + "step": 3122 + }, + { + "epoch": 0.21876067141243533, + "grad_norm": 4.018721103668213, + "learning_rate": 7.814446935201402e-05, + "loss": 1.2688, + "num_input_tokens_seen": 50229984, + "step": 3123 + }, + { + "epoch": 0.21883071965816456, + "grad_norm": 3.6527509689331055, + "learning_rate": 7.81374711033275e-05, + "loss": 0.8508, + "num_input_tokens_seen": 50246368, + "step": 3124 + }, + { + "epoch": 0.21890076790389382, + "grad_norm": 3.6600260734558105, + "learning_rate": 7.8130472854641e-05, + "loss": 1.0952, + "num_input_tokens_seen": 50262208, + "step": 3125 + }, + { + "epoch": 0.21897081614962305, + "grad_norm": 3.415969133377075, + "learning_rate": 7.812347460595447e-05, + "loss": 1.1055, + "num_input_tokens_seen": 50278592, + "step": 3126 + }, + { + "epoch": 0.2190408643953523, + "grad_norm": 5.027013301849365, + "learning_rate": 7.811647635726796e-05, + "loss": 1.2189, + "num_input_tokens_seen": 50294976, + "step": 3127 + }, + { + "epoch": 0.21911091264108154, + "grad_norm": 3.806324005126953, + "learning_rate": 7.810947810858143e-05, + "loss": 1.094, + "num_input_tokens_seen": 50311360, + "step": 3128 + }, + { + "epoch": 0.2191809608868108, + "grad_norm": 5.208338260650635, + "learning_rate": 7.810247985989492e-05, + "loss": 1.0917, + "num_input_tokens_seen": 50327744, + "step": 3129 + }, + { + "epoch": 0.21925100913254003, + "grad_norm": 3.5902316570281982, + "learning_rate": 7.809548161120841e-05, + "loss": 1.0894, + "num_input_tokens_seen": 50343616, + "step": 3130 + }, + { + "epoch": 0.2193210573782693, + "grad_norm": 3.7159717082977295, + "learning_rate": 7.80884833625219e-05, + "loss": 1.1168, + "num_input_tokens_seen": 50360000, + "step": 3131 + }, + { + "epoch": 0.21939110562399852, + "grad_norm": 4.03640079498291, + "learning_rate": 7.808148511383539e-05, + "loss": 0.8906, + "num_input_tokens_seen": 50376384, + "step": 3132 + }, + { + "epoch": 0.21946115386972778, + "grad_norm": 3.763805627822876, + "learning_rate": 7.807448686514886e-05, + "loss": 1.0922, + "num_input_tokens_seen": 50392328, + "step": 3133 + }, + { + "epoch": 0.219531202115457, + "grad_norm": 4.242026329040527, + "learning_rate": 7.806748861646235e-05, + "loss": 1.1286, + "num_input_tokens_seen": 50408712, + "step": 3134 + }, + { + "epoch": 0.21960125036118627, + "grad_norm": 3.5783863067626953, + "learning_rate": 7.806049036777584e-05, + "loss": 0.946, + "num_input_tokens_seen": 50424816, + "step": 3135 + }, + { + "epoch": 0.2196712986069155, + "grad_norm": 3.8409011363983154, + "learning_rate": 7.805349211908931e-05, + "loss": 1.0901, + "num_input_tokens_seen": 50440464, + "step": 3136 + }, + { + "epoch": 0.21974134685264476, + "grad_norm": 3.642411231994629, + "learning_rate": 7.804649387040282e-05, + "loss": 1.097, + "num_input_tokens_seen": 50456552, + "step": 3137 + }, + { + "epoch": 0.219811395098374, + "grad_norm": 3.702481985092163, + "learning_rate": 7.803949562171629e-05, + "loss": 1.0843, + "num_input_tokens_seen": 50472936, + "step": 3138 + }, + { + "epoch": 0.21988144334410326, + "grad_norm": 3.776094913482666, + "learning_rate": 7.803249737302978e-05, + "loss": 1.0058, + "num_input_tokens_seen": 50488760, + "step": 3139 + }, + { + "epoch": 0.2199514915898325, + "grad_norm": 4.29668664932251, + "learning_rate": 7.802549912434326e-05, + "loss": 1.3095, + "num_input_tokens_seen": 50505144, + "step": 3140 + }, + { + "epoch": 0.22002153983556175, + "grad_norm": 3.8290088176727295, + "learning_rate": 7.801850087565674e-05, + "loss": 0.9331, + "num_input_tokens_seen": 50521520, + "step": 3141 + }, + { + "epoch": 0.22009158808129098, + "grad_norm": 3.9471163749694824, + "learning_rate": 7.801150262697023e-05, + "loss": 1.1064, + "num_input_tokens_seen": 50537688, + "step": 3142 + }, + { + "epoch": 0.22016163632702024, + "grad_norm": 4.3500657081604, + "learning_rate": 7.800450437828372e-05, + "loss": 1.0843, + "num_input_tokens_seen": 50554072, + "step": 3143 + }, + { + "epoch": 0.22023168457274947, + "grad_norm": 4.257317066192627, + "learning_rate": 7.799750612959721e-05, + "loss": 1.2822, + "num_input_tokens_seen": 50570456, + "step": 3144 + }, + { + "epoch": 0.22030173281847873, + "grad_norm": 3.881340265274048, + "learning_rate": 7.79905078809107e-05, + "loss": 1.2797, + "num_input_tokens_seen": 50586840, + "step": 3145 + }, + { + "epoch": 0.22037178106420796, + "grad_norm": 4.07082986831665, + "learning_rate": 7.798350963222417e-05, + "loss": 1.1659, + "num_input_tokens_seen": 50603224, + "step": 3146 + }, + { + "epoch": 0.22044182930993722, + "grad_norm": 3.740081310272217, + "learning_rate": 7.797651138353765e-05, + "loss": 1.1638, + "num_input_tokens_seen": 50619608, + "step": 3147 + }, + { + "epoch": 0.22051187755566645, + "grad_norm": 3.9368820190429688, + "learning_rate": 7.796951313485114e-05, + "loss": 1.2122, + "num_input_tokens_seen": 50635240, + "step": 3148 + }, + { + "epoch": 0.2205819258013957, + "grad_norm": 4.027481555938721, + "learning_rate": 7.796251488616463e-05, + "loss": 1.1479, + "num_input_tokens_seen": 50651144, + "step": 3149 + }, + { + "epoch": 0.22065197404712494, + "grad_norm": 3.53271222114563, + "learning_rate": 7.795551663747811e-05, + "loss": 0.8352, + "num_input_tokens_seen": 50667320, + "step": 3150 + }, + { + "epoch": 0.2207220222928542, + "grad_norm": 3.9494400024414062, + "learning_rate": 7.79485183887916e-05, + "loss": 1.1353, + "num_input_tokens_seen": 50683704, + "step": 3151 + }, + { + "epoch": 0.22079207053858346, + "grad_norm": 4.944929122924805, + "learning_rate": 7.794152014010509e-05, + "loss": 1.0833, + "num_input_tokens_seen": 50699544, + "step": 3152 + }, + { + "epoch": 0.2208621187843127, + "grad_norm": 4.625114440917969, + "learning_rate": 7.793452189141857e-05, + "loss": 1.0123, + "num_input_tokens_seen": 50715096, + "step": 3153 + }, + { + "epoch": 0.22093216703004195, + "grad_norm": 4.543829441070557, + "learning_rate": 7.792752364273205e-05, + "loss": 0.9669, + "num_input_tokens_seen": 50731480, + "step": 3154 + }, + { + "epoch": 0.22100221527577119, + "grad_norm": 4.038646221160889, + "learning_rate": 7.792052539404553e-05, + "loss": 1.1399, + "num_input_tokens_seen": 50747864, + "step": 3155 + }, + { + "epoch": 0.22107226352150045, + "grad_norm": 5.269920825958252, + "learning_rate": 7.791352714535902e-05, + "loss": 1.1412, + "num_input_tokens_seen": 50764248, + "step": 3156 + }, + { + "epoch": 0.22114231176722968, + "grad_norm": 3.661792278289795, + "learning_rate": 7.790652889667251e-05, + "loss": 0.8881, + "num_input_tokens_seen": 50780296, + "step": 3157 + }, + { + "epoch": 0.22121236001295894, + "grad_norm": 5.119567394256592, + "learning_rate": 7.7899530647986e-05, + "loss": 1.2316, + "num_input_tokens_seen": 50796680, + "step": 3158 + }, + { + "epoch": 0.22128240825868817, + "grad_norm": 4.011631965637207, + "learning_rate": 7.789253239929948e-05, + "loss": 1.0854, + "num_input_tokens_seen": 50812648, + "step": 3159 + }, + { + "epoch": 0.22135245650441743, + "grad_norm": 4.292233467102051, + "learning_rate": 7.788553415061296e-05, + "loss": 0.8441, + "num_input_tokens_seen": 50829032, + "step": 3160 + }, + { + "epoch": 0.22142250475014666, + "grad_norm": 3.9228122234344482, + "learning_rate": 7.787853590192645e-05, + "loss": 1.0963, + "num_input_tokens_seen": 50844776, + "step": 3161 + }, + { + "epoch": 0.22149255299587592, + "grad_norm": 4.396078109741211, + "learning_rate": 7.787153765323994e-05, + "loss": 1.2647, + "num_input_tokens_seen": 50860792, + "step": 3162 + }, + { + "epoch": 0.22156260124160515, + "grad_norm": 3.6809213161468506, + "learning_rate": 7.786453940455342e-05, + "loss": 1.0172, + "num_input_tokens_seen": 50877176, + "step": 3163 + }, + { + "epoch": 0.2216326494873344, + "grad_norm": 3.7879207134246826, + "learning_rate": 7.785754115586691e-05, + "loss": 0.9708, + "num_input_tokens_seen": 50893560, + "step": 3164 + }, + { + "epoch": 0.22170269773306364, + "grad_norm": 5.248175621032715, + "learning_rate": 7.785054290718039e-05, + "loss": 0.9575, + "num_input_tokens_seen": 50909944, + "step": 3165 + }, + { + "epoch": 0.2217727459787929, + "grad_norm": 5.437406539916992, + "learning_rate": 7.784354465849388e-05, + "loss": 1.1873, + "num_input_tokens_seen": 50925848, + "step": 3166 + }, + { + "epoch": 0.22184279422452213, + "grad_norm": 4.390413761138916, + "learning_rate": 7.783654640980735e-05, + "loss": 1.143, + "num_input_tokens_seen": 50941488, + "step": 3167 + }, + { + "epoch": 0.2219128424702514, + "grad_norm": 3.6923863887786865, + "learning_rate": 7.782954816112084e-05, + "loss": 1.012, + "num_input_tokens_seen": 50956984, + "step": 3168 + }, + { + "epoch": 0.22198289071598062, + "grad_norm": 4.338325023651123, + "learning_rate": 7.782254991243433e-05, + "loss": 1.0984, + "num_input_tokens_seen": 50973096, + "step": 3169 + }, + { + "epoch": 0.22205293896170988, + "grad_norm": 5.631222248077393, + "learning_rate": 7.781555166374782e-05, + "loss": 1.0325, + "num_input_tokens_seen": 50989480, + "step": 3170 + }, + { + "epoch": 0.22212298720743912, + "grad_norm": 3.852337598800659, + "learning_rate": 7.78085534150613e-05, + "loss": 1.0279, + "num_input_tokens_seen": 51005864, + "step": 3171 + }, + { + "epoch": 0.22219303545316837, + "grad_norm": 3.6684298515319824, + "learning_rate": 7.78015551663748e-05, + "loss": 0.9906, + "num_input_tokens_seen": 51022248, + "step": 3172 + }, + { + "epoch": 0.2222630836988976, + "grad_norm": 3.7521257400512695, + "learning_rate": 7.779455691768827e-05, + "loss": 0.9641, + "num_input_tokens_seen": 51038632, + "step": 3173 + }, + { + "epoch": 0.22233313194462687, + "grad_norm": 4.571293354034424, + "learning_rate": 7.778755866900175e-05, + "loss": 1.1655, + "num_input_tokens_seen": 51055016, + "step": 3174 + }, + { + "epoch": 0.2224031801903561, + "grad_norm": 3.921743154525757, + "learning_rate": 7.778056042031523e-05, + "loss": 1.0815, + "num_input_tokens_seen": 51071288, + "step": 3175 + }, + { + "epoch": 0.22247322843608536, + "grad_norm": 6.1666083335876465, + "learning_rate": 7.777356217162872e-05, + "loss": 1.1025, + "num_input_tokens_seen": 51086712, + "step": 3176 + }, + { + "epoch": 0.2225432766818146, + "grad_norm": 4.170863151550293, + "learning_rate": 7.776656392294221e-05, + "loss": 1.0547, + "num_input_tokens_seen": 51102904, + "step": 3177 + }, + { + "epoch": 0.22261332492754385, + "grad_norm": 4.218405246734619, + "learning_rate": 7.77595656742557e-05, + "loss": 1.0685, + "num_input_tokens_seen": 51119288, + "step": 3178 + }, + { + "epoch": 0.22268337317327308, + "grad_norm": 4.158823490142822, + "learning_rate": 7.775256742556919e-05, + "loss": 1.0053, + "num_input_tokens_seen": 51135672, + "step": 3179 + }, + { + "epoch": 0.22275342141900234, + "grad_norm": 3.900827407836914, + "learning_rate": 7.774556917688266e-05, + "loss": 0.9212, + "num_input_tokens_seen": 51151880, + "step": 3180 + }, + { + "epoch": 0.22282346966473157, + "grad_norm": 3.6363813877105713, + "learning_rate": 7.773857092819615e-05, + "loss": 1.0602, + "num_input_tokens_seen": 51167712, + "step": 3181 + }, + { + "epoch": 0.22289351791046083, + "grad_norm": 6.452186584472656, + "learning_rate": 7.773157267950963e-05, + "loss": 1.3543, + "num_input_tokens_seen": 51184096, + "step": 3182 + }, + { + "epoch": 0.22296356615619006, + "grad_norm": 4.324470043182373, + "learning_rate": 7.772457443082313e-05, + "loss": 1.3328, + "num_input_tokens_seen": 51200480, + "step": 3183 + }, + { + "epoch": 0.22303361440191932, + "grad_norm": 4.093019485473633, + "learning_rate": 7.77175761821366e-05, + "loss": 1.2647, + "num_input_tokens_seen": 51216864, + "step": 3184 + }, + { + "epoch": 0.22310366264764858, + "grad_norm": 3.923771619796753, + "learning_rate": 7.771057793345009e-05, + "loss": 1.0121, + "num_input_tokens_seen": 51233248, + "step": 3185 + }, + { + "epoch": 0.2231737108933778, + "grad_norm": 3.3340275287628174, + "learning_rate": 7.770357968476358e-05, + "loss": 0.8954, + "num_input_tokens_seen": 51249400, + "step": 3186 + }, + { + "epoch": 0.22324375913910707, + "grad_norm": 5.360925197601318, + "learning_rate": 7.769658143607706e-05, + "loss": 1.0391, + "num_input_tokens_seen": 51264920, + "step": 3187 + }, + { + "epoch": 0.2233138073848363, + "grad_norm": 4.377450466156006, + "learning_rate": 7.768958318739054e-05, + "loss": 1.2148, + "num_input_tokens_seen": 51280528, + "step": 3188 + }, + { + "epoch": 0.22338385563056556, + "grad_norm": 4.01370906829834, + "learning_rate": 7.768258493870403e-05, + "loss": 1.0084, + "num_input_tokens_seen": 51296912, + "step": 3189 + }, + { + "epoch": 0.2234539038762948, + "grad_norm": 5.112427711486816, + "learning_rate": 7.767558669001752e-05, + "loss": 1.0388, + "num_input_tokens_seen": 51313296, + "step": 3190 + }, + { + "epoch": 0.22352395212202406, + "grad_norm": 3.5889225006103516, + "learning_rate": 7.766858844133101e-05, + "loss": 1.0018, + "num_input_tokens_seen": 51329680, + "step": 3191 + }, + { + "epoch": 0.2235940003677533, + "grad_norm": 3.6924920082092285, + "learning_rate": 7.766159019264449e-05, + "loss": 1.1056, + "num_input_tokens_seen": 51346064, + "step": 3192 + }, + { + "epoch": 0.22366404861348255, + "grad_norm": 3.9349400997161865, + "learning_rate": 7.765459194395797e-05, + "loss": 0.9785, + "num_input_tokens_seen": 51361200, + "step": 3193 + }, + { + "epoch": 0.22373409685921178, + "grad_norm": 3.6980738639831543, + "learning_rate": 7.764759369527145e-05, + "loss": 0.9112, + "num_input_tokens_seen": 51377584, + "step": 3194 + }, + { + "epoch": 0.22380414510494104, + "grad_norm": 4.400575637817383, + "learning_rate": 7.764059544658494e-05, + "loss": 1.2927, + "num_input_tokens_seen": 51393968, + "step": 3195 + }, + { + "epoch": 0.22387419335067027, + "grad_norm": 3.758664846420288, + "learning_rate": 7.763359719789843e-05, + "loss": 0.8743, + "num_input_tokens_seen": 51410160, + "step": 3196 + }, + { + "epoch": 0.22394424159639953, + "grad_norm": 4.376255512237549, + "learning_rate": 7.762659894921192e-05, + "loss": 1.1239, + "num_input_tokens_seen": 51426192, + "step": 3197 + }, + { + "epoch": 0.22401428984212876, + "grad_norm": 4.371212959289551, + "learning_rate": 7.76196007005254e-05, + "loss": 1.4918, + "num_input_tokens_seen": 51442576, + "step": 3198 + }, + { + "epoch": 0.22408433808785802, + "grad_norm": 3.5152950286865234, + "learning_rate": 7.761260245183889e-05, + "loss": 1.0344, + "num_input_tokens_seen": 51458648, + "step": 3199 + }, + { + "epoch": 0.22415438633358725, + "grad_norm": 4.100535869598389, + "learning_rate": 7.760560420315237e-05, + "loss": 0.9969, + "num_input_tokens_seen": 51475032, + "step": 3200 + }, + { + "epoch": 0.22415438633358725, + "eval_loss": 1.1358542442321777, + "eval_runtime": 0.2073, + "eval_samples_per_second": 4.825, + "eval_steps_per_second": 4.825, + "num_input_tokens_seen": 51475032, + "step": 3200 + }, + { + "epoch": 0.2242244345793165, + "grad_norm": 4.394073486328125, + "learning_rate": 7.759860595446584e-05, + "loss": 1.0951, + "num_input_tokens_seen": 51490544, + "step": 3201 + }, + { + "epoch": 0.22429448282504574, + "grad_norm": 4.041582107543945, + "learning_rate": 7.759160770577933e-05, + "loss": 1.1615, + "num_input_tokens_seen": 51506928, + "step": 3202 + }, + { + "epoch": 0.224364531070775, + "grad_norm": 4.268798351287842, + "learning_rate": 7.758460945709282e-05, + "loss": 1.0975, + "num_input_tokens_seen": 51523232, + "step": 3203 + }, + { + "epoch": 0.22443457931650423, + "grad_norm": 4.080141067504883, + "learning_rate": 7.757761120840631e-05, + "loss": 0.9809, + "num_input_tokens_seen": 51539616, + "step": 3204 + }, + { + "epoch": 0.2245046275622335, + "grad_norm": 7.690321445465088, + "learning_rate": 7.75706129597198e-05, + "loss": 1.1217, + "num_input_tokens_seen": 51556000, + "step": 3205 + }, + { + "epoch": 0.22457467580796273, + "grad_norm": 4.161118507385254, + "learning_rate": 7.756361471103329e-05, + "loss": 0.9672, + "num_input_tokens_seen": 51572384, + "step": 3206 + }, + { + "epoch": 0.22464472405369199, + "grad_norm": 3.922683000564575, + "learning_rate": 7.755661646234676e-05, + "loss": 1.0665, + "num_input_tokens_seen": 51588768, + "step": 3207 + }, + { + "epoch": 0.22471477229942122, + "grad_norm": 3.7474617958068848, + "learning_rate": 7.754961821366025e-05, + "loss": 1.1283, + "num_input_tokens_seen": 51604792, + "step": 3208 + }, + { + "epoch": 0.22478482054515048, + "grad_norm": 3.856959819793701, + "learning_rate": 7.754261996497374e-05, + "loss": 0.963, + "num_input_tokens_seen": 51621176, + "step": 3209 + }, + { + "epoch": 0.2248548687908797, + "grad_norm": 4.130929470062256, + "learning_rate": 7.753562171628723e-05, + "loss": 1.0563, + "num_input_tokens_seen": 51636864, + "step": 3210 + }, + { + "epoch": 0.22492491703660897, + "grad_norm": 3.5023388862609863, + "learning_rate": 7.75286234676007e-05, + "loss": 0.8926, + "num_input_tokens_seen": 51653248, + "step": 3211 + }, + { + "epoch": 0.2249949652823382, + "grad_norm": 3.736415386199951, + "learning_rate": 7.752162521891419e-05, + "loss": 1.08, + "num_input_tokens_seen": 51669632, + "step": 3212 + }, + { + "epoch": 0.22506501352806746, + "grad_norm": 4.355846881866455, + "learning_rate": 7.751462697022768e-05, + "loss": 1.0265, + "num_input_tokens_seen": 51684632, + "step": 3213 + }, + { + "epoch": 0.2251350617737967, + "grad_norm": 4.165436744689941, + "learning_rate": 7.750762872154115e-05, + "loss": 1.1594, + "num_input_tokens_seen": 51701016, + "step": 3214 + }, + { + "epoch": 0.22520511001952595, + "grad_norm": 4.4387946128845215, + "learning_rate": 7.750063047285464e-05, + "loss": 0.911, + "num_input_tokens_seen": 51716176, + "step": 3215 + }, + { + "epoch": 0.22527515826525518, + "grad_norm": 4.749145030975342, + "learning_rate": 7.749363222416813e-05, + "loss": 0.952, + "num_input_tokens_seen": 51732560, + "step": 3216 + }, + { + "epoch": 0.22534520651098444, + "grad_norm": 4.321863651275635, + "learning_rate": 7.748663397548162e-05, + "loss": 1.0974, + "num_input_tokens_seen": 51748944, + "step": 3217 + }, + { + "epoch": 0.22541525475671367, + "grad_norm": 5.319899082183838, + "learning_rate": 7.747963572679511e-05, + "loss": 0.9506, + "num_input_tokens_seen": 51765328, + "step": 3218 + }, + { + "epoch": 0.22548530300244293, + "grad_norm": 3.5695643424987793, + "learning_rate": 7.747263747810858e-05, + "loss": 1.1482, + "num_input_tokens_seen": 51781712, + "step": 3219 + }, + { + "epoch": 0.2255553512481722, + "grad_norm": 3.725698947906494, + "learning_rate": 7.746563922942207e-05, + "loss": 0.9205, + "num_input_tokens_seen": 51798096, + "step": 3220 + }, + { + "epoch": 0.22562539949390142, + "grad_norm": 3.795003652572632, + "learning_rate": 7.745864098073555e-05, + "loss": 1.0314, + "num_input_tokens_seen": 51814480, + "step": 3221 + }, + { + "epoch": 0.22569544773963068, + "grad_norm": 3.817578077316284, + "learning_rate": 7.745164273204903e-05, + "loss": 1.1218, + "num_input_tokens_seen": 51830864, + "step": 3222 + }, + { + "epoch": 0.22576549598535992, + "grad_norm": 5.982937812805176, + "learning_rate": 7.744464448336252e-05, + "loss": 0.9544, + "num_input_tokens_seen": 51846104, + "step": 3223 + }, + { + "epoch": 0.22583554423108917, + "grad_norm": 5.063079833984375, + "learning_rate": 7.743764623467601e-05, + "loss": 0.9191, + "num_input_tokens_seen": 51862488, + "step": 3224 + }, + { + "epoch": 0.2259055924768184, + "grad_norm": 3.620837450027466, + "learning_rate": 7.74306479859895e-05, + "loss": 1.0484, + "num_input_tokens_seen": 51878784, + "step": 3225 + }, + { + "epoch": 0.22597564072254767, + "grad_norm": 3.578369617462158, + "learning_rate": 7.742364973730299e-05, + "loss": 1.0146, + "num_input_tokens_seen": 51894832, + "step": 3226 + }, + { + "epoch": 0.2260456889682769, + "grad_norm": 4.0356974601745605, + "learning_rate": 7.741665148861646e-05, + "loss": 1.0664, + "num_input_tokens_seen": 51911216, + "step": 3227 + }, + { + "epoch": 0.22611573721400616, + "grad_norm": 4.133927822113037, + "learning_rate": 7.740965323992994e-05, + "loss": 1.1579, + "num_input_tokens_seen": 51927600, + "step": 3228 + }, + { + "epoch": 0.2261857854597354, + "grad_norm": 4.2958879470825195, + "learning_rate": 7.740265499124343e-05, + "loss": 1.0519, + "num_input_tokens_seen": 51943688, + "step": 3229 + }, + { + "epoch": 0.22625583370546465, + "grad_norm": 6.211035251617432, + "learning_rate": 7.739565674255693e-05, + "loss": 1.0097, + "num_input_tokens_seen": 51960072, + "step": 3230 + }, + { + "epoch": 0.22632588195119388, + "grad_norm": 4.073126316070557, + "learning_rate": 7.73886584938704e-05, + "loss": 1.0226, + "num_input_tokens_seen": 51976456, + "step": 3231 + }, + { + "epoch": 0.22639593019692314, + "grad_norm": 3.605041980743408, + "learning_rate": 7.73816602451839e-05, + "loss": 0.817, + "num_input_tokens_seen": 51992840, + "step": 3232 + }, + { + "epoch": 0.22646597844265237, + "grad_norm": 4.341184139251709, + "learning_rate": 7.737466199649738e-05, + "loss": 1.1391, + "num_input_tokens_seen": 52008696, + "step": 3233 + }, + { + "epoch": 0.22653602668838163, + "grad_norm": 4.676966667175293, + "learning_rate": 7.736766374781086e-05, + "loss": 1.0163, + "num_input_tokens_seen": 52024944, + "step": 3234 + }, + { + "epoch": 0.22660607493411086, + "grad_norm": 4.6688032150268555, + "learning_rate": 7.736066549912435e-05, + "loss": 0.972, + "num_input_tokens_seen": 52041104, + "step": 3235 + }, + { + "epoch": 0.22667612317984012, + "grad_norm": 4.6416916847229, + "learning_rate": 7.735366725043783e-05, + "loss": 1.1197, + "num_input_tokens_seen": 52055864, + "step": 3236 + }, + { + "epoch": 0.22674617142556935, + "grad_norm": 3.713846206665039, + "learning_rate": 7.734666900175132e-05, + "loss": 1.0498, + "num_input_tokens_seen": 52071992, + "step": 3237 + }, + { + "epoch": 0.2268162196712986, + "grad_norm": 3.694094657897949, + "learning_rate": 7.73396707530648e-05, + "loss": 1.083, + "num_input_tokens_seen": 52088376, + "step": 3238 + }, + { + "epoch": 0.22688626791702785, + "grad_norm": 4.250162601470947, + "learning_rate": 7.733267250437829e-05, + "loss": 0.9421, + "num_input_tokens_seen": 52104320, + "step": 3239 + }, + { + "epoch": 0.2269563161627571, + "grad_norm": 3.8184008598327637, + "learning_rate": 7.732567425569178e-05, + "loss": 1.0033, + "num_input_tokens_seen": 52120416, + "step": 3240 + }, + { + "epoch": 0.22702636440848634, + "grad_norm": 3.9957122802734375, + "learning_rate": 7.731867600700525e-05, + "loss": 0.9594, + "num_input_tokens_seen": 52136704, + "step": 3241 + }, + { + "epoch": 0.2270964126542156, + "grad_norm": 4.153292655944824, + "learning_rate": 7.731167775831874e-05, + "loss": 1.2315, + "num_input_tokens_seen": 52153088, + "step": 3242 + }, + { + "epoch": 0.22716646089994483, + "grad_norm": 3.628377914428711, + "learning_rate": 7.730467950963223e-05, + "loss": 0.9826, + "num_input_tokens_seen": 52169032, + "step": 3243 + }, + { + "epoch": 0.2272365091456741, + "grad_norm": 3.45796275138855, + "learning_rate": 7.729768126094572e-05, + "loss": 1.0942, + "num_input_tokens_seen": 52185416, + "step": 3244 + }, + { + "epoch": 0.22730655739140332, + "grad_norm": 3.9128968715667725, + "learning_rate": 7.72906830122592e-05, + "loss": 1.2954, + "num_input_tokens_seen": 52201504, + "step": 3245 + }, + { + "epoch": 0.22737660563713258, + "grad_norm": 4.4097394943237305, + "learning_rate": 7.728368476357268e-05, + "loss": 1.0171, + "num_input_tokens_seen": 52217184, + "step": 3246 + }, + { + "epoch": 0.2274466538828618, + "grad_norm": 4.110626220703125, + "learning_rate": 7.727668651488617e-05, + "loss": 1.0412, + "num_input_tokens_seen": 52233432, + "step": 3247 + }, + { + "epoch": 0.22751670212859107, + "grad_norm": 4.161354064941406, + "learning_rate": 7.726968826619964e-05, + "loss": 0.9371, + "num_input_tokens_seen": 52249816, + "step": 3248 + }, + { + "epoch": 0.2275867503743203, + "grad_norm": 5.910977363586426, + "learning_rate": 7.726269001751313e-05, + "loss": 0.8993, + "num_input_tokens_seen": 52266200, + "step": 3249 + }, + { + "epoch": 0.22765679862004956, + "grad_norm": 3.8264660835266113, + "learning_rate": 7.725569176882663e-05, + "loss": 1.0927, + "num_input_tokens_seen": 52282136, + "step": 3250 + }, + { + "epoch": 0.2277268468657788, + "grad_norm": 3.9992623329162598, + "learning_rate": 7.724869352014011e-05, + "loss": 0.9256, + "num_input_tokens_seen": 52297368, + "step": 3251 + }, + { + "epoch": 0.22779689511150805, + "grad_norm": 4.263967990875244, + "learning_rate": 7.72416952714536e-05, + "loss": 1.1708, + "num_input_tokens_seen": 52313200, + "step": 3252 + }, + { + "epoch": 0.22786694335723728, + "grad_norm": 3.8846871852874756, + "learning_rate": 7.723469702276709e-05, + "loss": 1.1445, + "num_input_tokens_seen": 52329584, + "step": 3253 + }, + { + "epoch": 0.22793699160296654, + "grad_norm": 4.3504533767700195, + "learning_rate": 7.722769877408056e-05, + "loss": 1.0332, + "num_input_tokens_seen": 52345968, + "step": 3254 + }, + { + "epoch": 0.2280070398486958, + "grad_norm": 3.9775991439819336, + "learning_rate": 7.722070052539404e-05, + "loss": 1.2149, + "num_input_tokens_seen": 52362352, + "step": 3255 + }, + { + "epoch": 0.22807708809442503, + "grad_norm": 4.098363399505615, + "learning_rate": 7.721370227670754e-05, + "loss": 1.1278, + "num_input_tokens_seen": 52378736, + "step": 3256 + }, + { + "epoch": 0.2281471363401543, + "grad_norm": 3.7094836235046387, + "learning_rate": 7.720670402802103e-05, + "loss": 1.0221, + "num_input_tokens_seen": 52394896, + "step": 3257 + }, + { + "epoch": 0.22821718458588353, + "grad_norm": 4.042232036590576, + "learning_rate": 7.71997057793345e-05, + "loss": 1.2902, + "num_input_tokens_seen": 52410952, + "step": 3258 + }, + { + "epoch": 0.22828723283161279, + "grad_norm": 3.725853443145752, + "learning_rate": 7.719270753064799e-05, + "loss": 1.0135, + "num_input_tokens_seen": 52427200, + "step": 3259 + }, + { + "epoch": 0.22835728107734202, + "grad_norm": 5.186229705810547, + "learning_rate": 7.718570928196148e-05, + "loss": 1.0539, + "num_input_tokens_seen": 52443584, + "step": 3260 + }, + { + "epoch": 0.22842732932307128, + "grad_norm": 3.8725364208221436, + "learning_rate": 7.717871103327495e-05, + "loss": 1.0782, + "num_input_tokens_seen": 52458272, + "step": 3261 + }, + { + "epoch": 0.2284973775688005, + "grad_norm": 5.006584644317627, + "learning_rate": 7.717171278458844e-05, + "loss": 1.0313, + "num_input_tokens_seen": 52474456, + "step": 3262 + }, + { + "epoch": 0.22856742581452977, + "grad_norm": 5.102536201477051, + "learning_rate": 7.716471453590193e-05, + "loss": 1.2077, + "num_input_tokens_seen": 52490464, + "step": 3263 + }, + { + "epoch": 0.228637474060259, + "grad_norm": 3.741029977798462, + "learning_rate": 7.715771628721542e-05, + "loss": 0.8978, + "num_input_tokens_seen": 52506112, + "step": 3264 + }, + { + "epoch": 0.22870752230598826, + "grad_norm": 5.656842231750488, + "learning_rate": 7.71507180385289e-05, + "loss": 1.1569, + "num_input_tokens_seen": 52522496, + "step": 3265 + }, + { + "epoch": 0.2287775705517175, + "grad_norm": 3.882403612136841, + "learning_rate": 7.714371978984238e-05, + "loss": 1.163, + "num_input_tokens_seen": 52538240, + "step": 3266 + }, + { + "epoch": 0.22884761879744675, + "grad_norm": 4.812796592712402, + "learning_rate": 7.713672154115587e-05, + "loss": 1.0478, + "num_input_tokens_seen": 52554024, + "step": 3267 + }, + { + "epoch": 0.22891766704317598, + "grad_norm": 3.9040687084198, + "learning_rate": 7.712972329246935e-05, + "loss": 1.0123, + "num_input_tokens_seen": 52570408, + "step": 3268 + }, + { + "epoch": 0.22898771528890524, + "grad_norm": 3.8387644290924072, + "learning_rate": 7.712272504378284e-05, + "loss": 0.9401, + "num_input_tokens_seen": 52586512, + "step": 3269 + }, + { + "epoch": 0.22905776353463447, + "grad_norm": 4.602542877197266, + "learning_rate": 7.711572679509634e-05, + "loss": 1.0196, + "num_input_tokens_seen": 52602896, + "step": 3270 + }, + { + "epoch": 0.22912781178036373, + "grad_norm": 4.209007263183594, + "learning_rate": 7.710872854640981e-05, + "loss": 1.1401, + "num_input_tokens_seen": 52619080, + "step": 3271 + }, + { + "epoch": 0.22919786002609296, + "grad_norm": 3.5082032680511475, + "learning_rate": 7.71017302977233e-05, + "loss": 0.9979, + "num_input_tokens_seen": 52635464, + "step": 3272 + }, + { + "epoch": 0.22926790827182222, + "grad_norm": 4.123980522155762, + "learning_rate": 7.709473204903678e-05, + "loss": 1.0201, + "num_input_tokens_seen": 52651848, + "step": 3273 + }, + { + "epoch": 0.22933795651755146, + "grad_norm": 4.267751216888428, + "learning_rate": 7.708773380035027e-05, + "loss": 1.1338, + "num_input_tokens_seen": 52668232, + "step": 3274 + }, + { + "epoch": 0.22940800476328072, + "grad_norm": 4.1165666580200195, + "learning_rate": 7.708073555166374e-05, + "loss": 1.1146, + "num_input_tokens_seen": 52684616, + "step": 3275 + }, + { + "epoch": 0.22947805300900995, + "grad_norm": 4.810427665710449, + "learning_rate": 7.707373730297724e-05, + "loss": 1.1785, + "num_input_tokens_seen": 52701000, + "step": 3276 + }, + { + "epoch": 0.2295481012547392, + "grad_norm": 6.566617488861084, + "learning_rate": 7.706673905429073e-05, + "loss": 0.8192, + "num_input_tokens_seen": 52715920, + "step": 3277 + }, + { + "epoch": 0.22961814950046844, + "grad_norm": 4.456092834472656, + "learning_rate": 7.70597408056042e-05, + "loss": 0.992, + "num_input_tokens_seen": 52732304, + "step": 3278 + }, + { + "epoch": 0.2296881977461977, + "grad_norm": 4.063642501831055, + "learning_rate": 7.70527425569177e-05, + "loss": 0.9306, + "num_input_tokens_seen": 52748688, + "step": 3279 + }, + { + "epoch": 0.22975824599192693, + "grad_norm": 3.337742567062378, + "learning_rate": 7.704574430823118e-05, + "loss": 0.8497, + "num_input_tokens_seen": 52764800, + "step": 3280 + }, + { + "epoch": 0.2298282942376562, + "grad_norm": 4.36488151550293, + "learning_rate": 7.703874605954466e-05, + "loss": 1.0851, + "num_input_tokens_seen": 52780952, + "step": 3281 + }, + { + "epoch": 0.22989834248338542, + "grad_norm": 4.948200702667236, + "learning_rate": 7.703174781085815e-05, + "loss": 0.9591, + "num_input_tokens_seen": 52795728, + "step": 3282 + }, + { + "epoch": 0.22996839072911468, + "grad_norm": 4.977625370025635, + "learning_rate": 7.702474956217164e-05, + "loss": 1.2094, + "num_input_tokens_seen": 52812112, + "step": 3283 + }, + { + "epoch": 0.2300384389748439, + "grad_norm": 3.7551944255828857, + "learning_rate": 7.701775131348512e-05, + "loss": 1.1018, + "num_input_tokens_seen": 52828184, + "step": 3284 + }, + { + "epoch": 0.23010848722057317, + "grad_norm": 3.700916051864624, + "learning_rate": 7.70107530647986e-05, + "loss": 1.0159, + "num_input_tokens_seen": 52844568, + "step": 3285 + }, + { + "epoch": 0.2301785354663024, + "grad_norm": 4.135788917541504, + "learning_rate": 7.700375481611209e-05, + "loss": 1.047, + "num_input_tokens_seen": 52860952, + "step": 3286 + }, + { + "epoch": 0.23024858371203166, + "grad_norm": 4.018477916717529, + "learning_rate": 7.699675656742558e-05, + "loss": 1.1124, + "num_input_tokens_seen": 52876808, + "step": 3287 + }, + { + "epoch": 0.2303186319577609, + "grad_norm": 5.230745315551758, + "learning_rate": 7.698975831873905e-05, + "loss": 1.0805, + "num_input_tokens_seen": 52893192, + "step": 3288 + }, + { + "epoch": 0.23038868020349015, + "grad_norm": 4.192041873931885, + "learning_rate": 7.698276007005254e-05, + "loss": 1.1476, + "num_input_tokens_seen": 52909576, + "step": 3289 + }, + { + "epoch": 0.2304587284492194, + "grad_norm": 4.28109073638916, + "learning_rate": 7.697576182136603e-05, + "loss": 0.9795, + "num_input_tokens_seen": 52925592, + "step": 3290 + }, + { + "epoch": 0.23052877669494864, + "grad_norm": 4.673538684844971, + "learning_rate": 7.696876357267952e-05, + "loss": 1.2104, + "num_input_tokens_seen": 52941784, + "step": 3291 + }, + { + "epoch": 0.2305988249406779, + "grad_norm": 3.791339159011841, + "learning_rate": 7.696176532399299e-05, + "loss": 1.0098, + "num_input_tokens_seen": 52958168, + "step": 3292 + }, + { + "epoch": 0.23066887318640714, + "grad_norm": 5.353015899658203, + "learning_rate": 7.695476707530648e-05, + "loss": 1.346, + "num_input_tokens_seen": 52974552, + "step": 3293 + }, + { + "epoch": 0.2307389214321364, + "grad_norm": 6.66793966293335, + "learning_rate": 7.694776882661997e-05, + "loss": 1.127, + "num_input_tokens_seen": 52990512, + "step": 3294 + }, + { + "epoch": 0.23080896967786563, + "grad_norm": 5.462240695953369, + "learning_rate": 7.694077057793344e-05, + "loss": 1.2397, + "num_input_tokens_seen": 53006768, + "step": 3295 + }, + { + "epoch": 0.2308790179235949, + "grad_norm": 4.212863445281982, + "learning_rate": 7.693377232924695e-05, + "loss": 0.9377, + "num_input_tokens_seen": 53023152, + "step": 3296 + }, + { + "epoch": 0.23094906616932412, + "grad_norm": 3.623929977416992, + "learning_rate": 7.692677408056044e-05, + "loss": 0.9086, + "num_input_tokens_seen": 53039536, + "step": 3297 + }, + { + "epoch": 0.23101911441505338, + "grad_norm": 4.791571617126465, + "learning_rate": 7.691977583187391e-05, + "loss": 1.0059, + "num_input_tokens_seen": 53055920, + "step": 3298 + }, + { + "epoch": 0.2310891626607826, + "grad_norm": 3.733243465423584, + "learning_rate": 7.69127775831874e-05, + "loss": 1.1729, + "num_input_tokens_seen": 53072304, + "step": 3299 + }, + { + "epoch": 0.23115921090651187, + "grad_norm": 3.916738986968994, + "learning_rate": 7.690577933450087e-05, + "loss": 1.2479, + "num_input_tokens_seen": 53088568, + "step": 3300 + }, + { + "epoch": 0.2312292591522411, + "grad_norm": 4.0346856117248535, + "learning_rate": 7.689878108581436e-05, + "loss": 1.0858, + "num_input_tokens_seen": 53103656, + "step": 3301 + }, + { + "epoch": 0.23129930739797036, + "grad_norm": 4.834316730499268, + "learning_rate": 7.689178283712785e-05, + "loss": 0.9328, + "num_input_tokens_seen": 53120040, + "step": 3302 + }, + { + "epoch": 0.2313693556436996, + "grad_norm": 4.5966291427612305, + "learning_rate": 7.688478458844134e-05, + "loss": 1.0108, + "num_input_tokens_seen": 53136424, + "step": 3303 + }, + { + "epoch": 0.23143940388942885, + "grad_norm": 5.17268705368042, + "learning_rate": 7.687778633975483e-05, + "loss": 1.1559, + "num_input_tokens_seen": 53152080, + "step": 3304 + }, + { + "epoch": 0.23150945213515808, + "grad_norm": 3.6322672367095947, + "learning_rate": 7.68707880910683e-05, + "loss": 1.0666, + "num_input_tokens_seen": 53168464, + "step": 3305 + }, + { + "epoch": 0.23157950038088734, + "grad_norm": 4.761613368988037, + "learning_rate": 7.686378984238179e-05, + "loss": 1.032, + "num_input_tokens_seen": 53184848, + "step": 3306 + }, + { + "epoch": 0.23164954862661657, + "grad_norm": 3.4870493412017822, + "learning_rate": 7.685679159369528e-05, + "loss": 1.026, + "num_input_tokens_seen": 53201232, + "step": 3307 + }, + { + "epoch": 0.23171959687234583, + "grad_norm": 4.122028827667236, + "learning_rate": 7.684979334500876e-05, + "loss": 1.2103, + "num_input_tokens_seen": 53217616, + "step": 3308 + }, + { + "epoch": 0.23178964511807507, + "grad_norm": 3.4486751556396484, + "learning_rate": 7.684279509632224e-05, + "loss": 0.6654, + "num_input_tokens_seen": 53233936, + "step": 3309 + }, + { + "epoch": 0.23185969336380433, + "grad_norm": 4.321650981903076, + "learning_rate": 7.683579684763573e-05, + "loss": 1.106, + "num_input_tokens_seen": 53250320, + "step": 3310 + }, + { + "epoch": 0.23192974160953356, + "grad_norm": 5.820108413696289, + "learning_rate": 7.682879859894922e-05, + "loss": 1.0225, + "num_input_tokens_seen": 53266592, + "step": 3311 + }, + { + "epoch": 0.23199978985526282, + "grad_norm": 5.5514912605285645, + "learning_rate": 7.68218003502627e-05, + "loss": 1.1083, + "num_input_tokens_seen": 53282976, + "step": 3312 + }, + { + "epoch": 0.23206983810099205, + "grad_norm": 4.108302116394043, + "learning_rate": 7.681480210157618e-05, + "loss": 1.1507, + "num_input_tokens_seen": 53299184, + "step": 3313 + }, + { + "epoch": 0.2321398863467213, + "grad_norm": 4.037779331207275, + "learning_rate": 7.680780385288967e-05, + "loss": 1.2858, + "num_input_tokens_seen": 53315000, + "step": 3314 + }, + { + "epoch": 0.23220993459245054, + "grad_norm": 4.5398383140563965, + "learning_rate": 7.680080560420315e-05, + "loss": 1.0374, + "num_input_tokens_seen": 53331104, + "step": 3315 + }, + { + "epoch": 0.2322799828381798, + "grad_norm": 4.2399067878723145, + "learning_rate": 7.679380735551665e-05, + "loss": 1.098, + "num_input_tokens_seen": 53347488, + "step": 3316 + }, + { + "epoch": 0.23235003108390903, + "grad_norm": 5.6600775718688965, + "learning_rate": 7.678680910683013e-05, + "loss": 0.9446, + "num_input_tokens_seen": 53363872, + "step": 3317 + }, + { + "epoch": 0.2324200793296383, + "grad_norm": 4.462069511413574, + "learning_rate": 7.677981085814361e-05, + "loss": 0.9313, + "num_input_tokens_seen": 53379424, + "step": 3318 + }, + { + "epoch": 0.23249012757536752, + "grad_norm": 4.644591808319092, + "learning_rate": 7.677281260945709e-05, + "loss": 1.3155, + "num_input_tokens_seen": 53395728, + "step": 3319 + }, + { + "epoch": 0.23256017582109678, + "grad_norm": 3.860954523086548, + "learning_rate": 7.676581436077058e-05, + "loss": 1.0917, + "num_input_tokens_seen": 53412112, + "step": 3320 + }, + { + "epoch": 0.232630224066826, + "grad_norm": 4.625146389007568, + "learning_rate": 7.675881611208407e-05, + "loss": 0.9253, + "num_input_tokens_seen": 53427992, + "step": 3321 + }, + { + "epoch": 0.23270027231255527, + "grad_norm": 6.473335266113281, + "learning_rate": 7.675181786339756e-05, + "loss": 0.9892, + "num_input_tokens_seen": 53444376, + "step": 3322 + }, + { + "epoch": 0.2327703205582845, + "grad_norm": 3.6846091747283936, + "learning_rate": 7.674481961471104e-05, + "loss": 0.9976, + "num_input_tokens_seen": 53460760, + "step": 3323 + }, + { + "epoch": 0.23284036880401376, + "grad_norm": 3.784900188446045, + "learning_rate": 7.673782136602453e-05, + "loss": 0.8865, + "num_input_tokens_seen": 53477144, + "step": 3324 + }, + { + "epoch": 0.23291041704974302, + "grad_norm": 4.175132751464844, + "learning_rate": 7.673082311733801e-05, + "loss": 1.1741, + "num_input_tokens_seen": 53493496, + "step": 3325 + }, + { + "epoch": 0.23298046529547226, + "grad_norm": 4.355600833892822, + "learning_rate": 7.67238248686515e-05, + "loss": 0.8686, + "num_input_tokens_seen": 53509560, + "step": 3326 + }, + { + "epoch": 0.23305051354120151, + "grad_norm": 4.32242488861084, + "learning_rate": 7.671682661996497e-05, + "loss": 0.9493, + "num_input_tokens_seen": 53525944, + "step": 3327 + }, + { + "epoch": 0.23312056178693075, + "grad_norm": 4.937814235687256, + "learning_rate": 7.670982837127846e-05, + "loss": 1.1617, + "num_input_tokens_seen": 53541312, + "step": 3328 + }, + { + "epoch": 0.23319061003266, + "grad_norm": 3.1939101219177246, + "learning_rate": 7.670283012259195e-05, + "loss": 0.8866, + "num_input_tokens_seen": 53557696, + "step": 3329 + }, + { + "epoch": 0.23326065827838924, + "grad_norm": 5.137113094329834, + "learning_rate": 7.669583187390544e-05, + "loss": 0.9911, + "num_input_tokens_seen": 53573600, + "step": 3330 + }, + { + "epoch": 0.2333307065241185, + "grad_norm": 3.777954578399658, + "learning_rate": 7.668883362521893e-05, + "loss": 1.0047, + "num_input_tokens_seen": 53588808, + "step": 3331 + }, + { + "epoch": 0.23340075476984773, + "grad_norm": 4.229750633239746, + "learning_rate": 7.66818353765324e-05, + "loss": 1.3247, + "num_input_tokens_seen": 53603416, + "step": 3332 + }, + { + "epoch": 0.233470803015577, + "grad_norm": 4.248676776885986, + "learning_rate": 7.667483712784589e-05, + "loss": 1.2149, + "num_input_tokens_seen": 53618896, + "step": 3333 + }, + { + "epoch": 0.23354085126130622, + "grad_norm": 3.7393991947174072, + "learning_rate": 7.666783887915938e-05, + "loss": 1.0339, + "num_input_tokens_seen": 53635280, + "step": 3334 + }, + { + "epoch": 0.23361089950703548, + "grad_norm": 3.6224875450134277, + "learning_rate": 7.666084063047285e-05, + "loss": 0.8727, + "num_input_tokens_seen": 53651664, + "step": 3335 + }, + { + "epoch": 0.2336809477527647, + "grad_norm": 4.2722063064575195, + "learning_rate": 7.665384238178634e-05, + "loss": 1.1982, + "num_input_tokens_seen": 53668048, + "step": 3336 + }, + { + "epoch": 0.23375099599849397, + "grad_norm": 3.4717535972595215, + "learning_rate": 7.664684413309983e-05, + "loss": 0.9695, + "num_input_tokens_seen": 53684432, + "step": 3337 + }, + { + "epoch": 0.2338210442442232, + "grad_norm": 3.6640021800994873, + "learning_rate": 7.663984588441332e-05, + "loss": 0.8621, + "num_input_tokens_seen": 53700816, + "step": 3338 + }, + { + "epoch": 0.23389109248995246, + "grad_norm": 5.14633321762085, + "learning_rate": 7.66328476357268e-05, + "loss": 1.1954, + "num_input_tokens_seen": 53717200, + "step": 3339 + }, + { + "epoch": 0.2339611407356817, + "grad_norm": 4.479960918426514, + "learning_rate": 7.662584938704028e-05, + "loss": 1.1001, + "num_input_tokens_seen": 53733584, + "step": 3340 + }, + { + "epoch": 0.23403118898141095, + "grad_norm": 5.33896017074585, + "learning_rate": 7.661885113835377e-05, + "loss": 0.8984, + "num_input_tokens_seen": 53749072, + "step": 3341 + }, + { + "epoch": 0.23410123722714019, + "grad_norm": 4.407443046569824, + "learning_rate": 7.661185288966726e-05, + "loss": 1.2437, + "num_input_tokens_seen": 53765088, + "step": 3342 + }, + { + "epoch": 0.23417128547286944, + "grad_norm": 3.8250956535339355, + "learning_rate": 7.660485464098075e-05, + "loss": 0.9243, + "num_input_tokens_seen": 53781000, + "step": 3343 + }, + { + "epoch": 0.23424133371859868, + "grad_norm": 4.316215515136719, + "learning_rate": 7.659785639229422e-05, + "loss": 1.0972, + "num_input_tokens_seen": 53796744, + "step": 3344 + }, + { + "epoch": 0.23431138196432794, + "grad_norm": 4.291647434234619, + "learning_rate": 7.659085814360771e-05, + "loss": 1.1376, + "num_input_tokens_seen": 53813128, + "step": 3345 + }, + { + "epoch": 0.23438143021005717, + "grad_norm": 3.704899787902832, + "learning_rate": 7.658385989492119e-05, + "loss": 1.2117, + "num_input_tokens_seen": 53829512, + "step": 3346 + }, + { + "epoch": 0.23445147845578643, + "grad_norm": 3.5979909896850586, + "learning_rate": 7.657686164623468e-05, + "loss": 0.9604, + "num_input_tokens_seen": 53845536, + "step": 3347 + }, + { + "epoch": 0.23452152670151566, + "grad_norm": 3.8820247650146484, + "learning_rate": 7.656986339754816e-05, + "loss": 1.2439, + "num_input_tokens_seen": 53861920, + "step": 3348 + }, + { + "epoch": 0.23459157494724492, + "grad_norm": 4.226894855499268, + "learning_rate": 7.656286514886165e-05, + "loss": 1.0884, + "num_input_tokens_seen": 53878304, + "step": 3349 + }, + { + "epoch": 0.23466162319297415, + "grad_norm": 4.507336616516113, + "learning_rate": 7.655586690017514e-05, + "loss": 1.0184, + "num_input_tokens_seen": 53894688, + "step": 3350 + }, + { + "epoch": 0.2347316714387034, + "grad_norm": 3.86645245552063, + "learning_rate": 7.654886865148863e-05, + "loss": 1.0895, + "num_input_tokens_seen": 53910736, + "step": 3351 + }, + { + "epoch": 0.23480171968443264, + "grad_norm": 3.8789820671081543, + "learning_rate": 7.65418704028021e-05, + "loss": 1.0078, + "num_input_tokens_seen": 53926688, + "step": 3352 + }, + { + "epoch": 0.2348717679301619, + "grad_norm": 3.893564462661743, + "learning_rate": 7.653487215411559e-05, + "loss": 1.0701, + "num_input_tokens_seen": 53942904, + "step": 3353 + }, + { + "epoch": 0.23494181617589113, + "grad_norm": 4.6554412841796875, + "learning_rate": 7.652787390542907e-05, + "loss": 1.1396, + "num_input_tokens_seen": 53957976, + "step": 3354 + }, + { + "epoch": 0.2350118644216204, + "grad_norm": 4.118137359619141, + "learning_rate": 7.652087565674256e-05, + "loss": 1.2019, + "num_input_tokens_seen": 53973520, + "step": 3355 + }, + { + "epoch": 0.23508191266734962, + "grad_norm": 5.099210262298584, + "learning_rate": 7.651387740805605e-05, + "loss": 0.892, + "num_input_tokens_seen": 53989280, + "step": 3356 + }, + { + "epoch": 0.23515196091307888, + "grad_norm": 3.868797779083252, + "learning_rate": 7.650687915936953e-05, + "loss": 1.0992, + "num_input_tokens_seen": 54005664, + "step": 3357 + }, + { + "epoch": 0.23522200915880812, + "grad_norm": 4.032477378845215, + "learning_rate": 7.649988091068302e-05, + "loss": 1.0356, + "num_input_tokens_seen": 54022048, + "step": 3358 + }, + { + "epoch": 0.23529205740453737, + "grad_norm": 3.907238483428955, + "learning_rate": 7.64928826619965e-05, + "loss": 1.0925, + "num_input_tokens_seen": 54038432, + "step": 3359 + }, + { + "epoch": 0.23536210565026663, + "grad_norm": 3.6504223346710205, + "learning_rate": 7.648588441330999e-05, + "loss": 0.9708, + "num_input_tokens_seen": 54054272, + "step": 3360 + }, + { + "epoch": 0.23543215389599587, + "grad_norm": 4.614812850952148, + "learning_rate": 7.647888616462347e-05, + "loss": 1.136, + "num_input_tokens_seen": 54070656, + "step": 3361 + }, + { + "epoch": 0.23550220214172513, + "grad_norm": 4.812591552734375, + "learning_rate": 7.647188791593696e-05, + "loss": 1.0714, + "num_input_tokens_seen": 54086416, + "step": 3362 + }, + { + "epoch": 0.23557225038745436, + "grad_norm": 3.709543466567993, + "learning_rate": 7.646488966725044e-05, + "loss": 1.106, + "num_input_tokens_seen": 54102800, + "step": 3363 + }, + { + "epoch": 0.23564229863318362, + "grad_norm": 3.9850802421569824, + "learning_rate": 7.645789141856393e-05, + "loss": 1.1509, + "num_input_tokens_seen": 54119184, + "step": 3364 + }, + { + "epoch": 0.23571234687891285, + "grad_norm": 4.59740686416626, + "learning_rate": 7.645089316987742e-05, + "loss": 1.1974, + "num_input_tokens_seen": 54135568, + "step": 3365 + }, + { + "epoch": 0.2357823951246421, + "grad_norm": 4.118459224700928, + "learning_rate": 7.644389492119089e-05, + "loss": 1.2196, + "num_input_tokens_seen": 54151952, + "step": 3366 + }, + { + "epoch": 0.23585244337037134, + "grad_norm": 4.172552108764648, + "learning_rate": 7.643689667250438e-05, + "loss": 1.0178, + "num_input_tokens_seen": 54167776, + "step": 3367 + }, + { + "epoch": 0.2359224916161006, + "grad_norm": 3.9671120643615723, + "learning_rate": 7.642989842381787e-05, + "loss": 1.0589, + "num_input_tokens_seen": 54184160, + "step": 3368 + }, + { + "epoch": 0.23599253986182983, + "grad_norm": 3.7376415729522705, + "learning_rate": 7.642290017513136e-05, + "loss": 1.1445, + "num_input_tokens_seen": 54200280, + "step": 3369 + }, + { + "epoch": 0.2360625881075591, + "grad_norm": 4.665002346038818, + "learning_rate": 7.641590192644484e-05, + "loss": 1.3347, + "num_input_tokens_seen": 54216664, + "step": 3370 + }, + { + "epoch": 0.23613263635328832, + "grad_norm": 3.669015884399414, + "learning_rate": 7.640890367775832e-05, + "loss": 0.8359, + "num_input_tokens_seen": 54232320, + "step": 3371 + }, + { + "epoch": 0.23620268459901758, + "grad_norm": 3.993393659591675, + "learning_rate": 7.640190542907181e-05, + "loss": 1.0298, + "num_input_tokens_seen": 54248704, + "step": 3372 + }, + { + "epoch": 0.2362727328447468, + "grad_norm": 3.808516263961792, + "learning_rate": 7.639490718038528e-05, + "loss": 1.1315, + "num_input_tokens_seen": 54265088, + "step": 3373 + }, + { + "epoch": 0.23634278109047607, + "grad_norm": 5.25230073928833, + "learning_rate": 7.638790893169877e-05, + "loss": 1.1273, + "num_input_tokens_seen": 54281256, + "step": 3374 + }, + { + "epoch": 0.2364128293362053, + "grad_norm": 5.724976062774658, + "learning_rate": 7.638091068301226e-05, + "loss": 1.3176, + "num_input_tokens_seen": 54296832, + "step": 3375 + }, + { + "epoch": 0.23648287758193456, + "grad_norm": 3.553737163543701, + "learning_rate": 7.637391243432575e-05, + "loss": 1.0288, + "num_input_tokens_seen": 54313120, + "step": 3376 + }, + { + "epoch": 0.2365529258276638, + "grad_norm": 6.614949703216553, + "learning_rate": 7.636691418563924e-05, + "loss": 1.0649, + "num_input_tokens_seen": 54328184, + "step": 3377 + }, + { + "epoch": 0.23662297407339306, + "grad_norm": 3.76234769821167, + "learning_rate": 7.635991593695273e-05, + "loss": 1.149, + "num_input_tokens_seen": 54344568, + "step": 3378 + }, + { + "epoch": 0.2366930223191223, + "grad_norm": 3.4564521312713623, + "learning_rate": 7.63529176882662e-05, + "loss": 0.9227, + "num_input_tokens_seen": 54360952, + "step": 3379 + }, + { + "epoch": 0.23676307056485155, + "grad_norm": 3.735978841781616, + "learning_rate": 7.634591943957969e-05, + "loss": 1.2159, + "num_input_tokens_seen": 54377336, + "step": 3380 + }, + { + "epoch": 0.23683311881058078, + "grad_norm": 4.106653690338135, + "learning_rate": 7.633892119089317e-05, + "loss": 1.0997, + "num_input_tokens_seen": 54393232, + "step": 3381 + }, + { + "epoch": 0.23690316705631004, + "grad_norm": 3.9169600009918213, + "learning_rate": 7.633192294220667e-05, + "loss": 1.247, + "num_input_tokens_seen": 54409616, + "step": 3382 + }, + { + "epoch": 0.23697321530203927, + "grad_norm": 3.8265388011932373, + "learning_rate": 7.632492469352014e-05, + "loss": 1.1391, + "num_input_tokens_seen": 54425312, + "step": 3383 + }, + { + "epoch": 0.23704326354776853, + "grad_norm": 3.6288204193115234, + "learning_rate": 7.631792644483363e-05, + "loss": 1.0445, + "num_input_tokens_seen": 54441696, + "step": 3384 + }, + { + "epoch": 0.23711331179349776, + "grad_norm": 4.207483291625977, + "learning_rate": 7.631092819614712e-05, + "loss": 1.2068, + "num_input_tokens_seen": 54457720, + "step": 3385 + }, + { + "epoch": 0.23718336003922702, + "grad_norm": 3.880786895751953, + "learning_rate": 7.63039299474606e-05, + "loss": 1.0471, + "num_input_tokens_seen": 54474104, + "step": 3386 + }, + { + "epoch": 0.23725340828495625, + "grad_norm": 4.493243217468262, + "learning_rate": 7.629693169877408e-05, + "loss": 1.1107, + "num_input_tokens_seen": 54490080, + "step": 3387 + }, + { + "epoch": 0.2373234565306855, + "grad_norm": 4.432561874389648, + "learning_rate": 7.628993345008757e-05, + "loss": 1.1474, + "num_input_tokens_seen": 54506464, + "step": 3388 + }, + { + "epoch": 0.23739350477641474, + "grad_norm": 4.210158824920654, + "learning_rate": 7.628293520140106e-05, + "loss": 1.1567, + "num_input_tokens_seen": 54522848, + "step": 3389 + }, + { + "epoch": 0.237463553022144, + "grad_norm": 4.561443328857422, + "learning_rate": 7.627593695271454e-05, + "loss": 1.2793, + "num_input_tokens_seen": 54538192, + "step": 3390 + }, + { + "epoch": 0.23753360126787323, + "grad_norm": 3.6792140007019043, + "learning_rate": 7.626893870402802e-05, + "loss": 0.9692, + "num_input_tokens_seen": 54554576, + "step": 3391 + }, + { + "epoch": 0.2376036495136025, + "grad_norm": 4.3415141105651855, + "learning_rate": 7.626194045534151e-05, + "loss": 1.1777, + "num_input_tokens_seen": 54570960, + "step": 3392 + }, + { + "epoch": 0.23767369775933175, + "grad_norm": 3.770224094390869, + "learning_rate": 7.625494220665499e-05, + "loss": 1.1923, + "num_input_tokens_seen": 54587344, + "step": 3393 + }, + { + "epoch": 0.23774374600506099, + "grad_norm": 3.7803759574890137, + "learning_rate": 7.624794395796848e-05, + "loss": 1.1631, + "num_input_tokens_seen": 54603728, + "step": 3394 + }, + { + "epoch": 0.23781379425079024, + "grad_norm": 4.559312343597412, + "learning_rate": 7.624094570928196e-05, + "loss": 1.0235, + "num_input_tokens_seen": 54619760, + "step": 3395 + }, + { + "epoch": 0.23788384249651948, + "grad_norm": 4.215981483459473, + "learning_rate": 7.623394746059545e-05, + "loss": 1.2803, + "num_input_tokens_seen": 54636144, + "step": 3396 + }, + { + "epoch": 0.23795389074224874, + "grad_norm": 4.108291149139404, + "learning_rate": 7.622694921190894e-05, + "loss": 1.0486, + "num_input_tokens_seen": 54652136, + "step": 3397 + }, + { + "epoch": 0.23802393898797797, + "grad_norm": 4.4075093269348145, + "learning_rate": 7.621995096322242e-05, + "loss": 1.0766, + "num_input_tokens_seen": 54668520, + "step": 3398 + }, + { + "epoch": 0.23809398723370723, + "grad_norm": 4.002575874328613, + "learning_rate": 7.62129527145359e-05, + "loss": 1.1793, + "num_input_tokens_seen": 54684544, + "step": 3399 + }, + { + "epoch": 0.23816403547943646, + "grad_norm": 3.5264174938201904, + "learning_rate": 7.620595446584938e-05, + "loss": 0.928, + "num_input_tokens_seen": 54700680, + "step": 3400 + }, + { + "epoch": 0.23816403547943646, + "eval_loss": 1.1361509561538696, + "eval_runtime": 0.1856, + "eval_samples_per_second": 5.389, + "eval_steps_per_second": 5.389, + "num_input_tokens_seen": 54700680, + "step": 3400 + }, + { + "epoch": 0.23823408372516572, + "grad_norm": 3.585204839706421, + "learning_rate": 7.619895621716287e-05, + "loss": 1.0865, + "num_input_tokens_seen": 54717064, + "step": 3401 + }, + { + "epoch": 0.23830413197089495, + "grad_norm": 4.442777633666992, + "learning_rate": 7.619195796847637e-05, + "loss": 0.9445, + "num_input_tokens_seen": 54732648, + "step": 3402 + }, + { + "epoch": 0.2383741802166242, + "grad_norm": 3.807063102722168, + "learning_rate": 7.618495971978985e-05, + "loss": 1.0127, + "num_input_tokens_seen": 54749032, + "step": 3403 + }, + { + "epoch": 0.23844422846235344, + "grad_norm": 4.984583854675293, + "learning_rate": 7.617796147110333e-05, + "loss": 1.349, + "num_input_tokens_seen": 54764192, + "step": 3404 + }, + { + "epoch": 0.2385142767080827, + "grad_norm": 4.326750755310059, + "learning_rate": 7.617096322241682e-05, + "loss": 1.0875, + "num_input_tokens_seen": 54780120, + "step": 3405 + }, + { + "epoch": 0.23858432495381193, + "grad_norm": 5.707291126251221, + "learning_rate": 7.61639649737303e-05, + "loss": 1.0816, + "num_input_tokens_seen": 54796168, + "step": 3406 + }, + { + "epoch": 0.2386543731995412, + "grad_norm": 4.450499534606934, + "learning_rate": 7.615696672504379e-05, + "loss": 1.139, + "num_input_tokens_seen": 54812056, + "step": 3407 + }, + { + "epoch": 0.23872442144527042, + "grad_norm": 4.253554821014404, + "learning_rate": 7.614996847635728e-05, + "loss": 1.1798, + "num_input_tokens_seen": 54828248, + "step": 3408 + }, + { + "epoch": 0.23879446969099968, + "grad_norm": 5.04890251159668, + "learning_rate": 7.614297022767076e-05, + "loss": 0.9968, + "num_input_tokens_seen": 54844632, + "step": 3409 + }, + { + "epoch": 0.23886451793672892, + "grad_norm": 3.24513578414917, + "learning_rate": 7.613597197898424e-05, + "loss": 0.8901, + "num_input_tokens_seen": 54861016, + "step": 3410 + }, + { + "epoch": 0.23893456618245817, + "grad_norm": 4.008625507354736, + "learning_rate": 7.612897373029773e-05, + "loss": 1.1048, + "num_input_tokens_seen": 54877168, + "step": 3411 + }, + { + "epoch": 0.2390046144281874, + "grad_norm": 5.393536567687988, + "learning_rate": 7.612197548161122e-05, + "loss": 1.1554, + "num_input_tokens_seen": 54892720, + "step": 3412 + }, + { + "epoch": 0.23907466267391667, + "grad_norm": 4.388333797454834, + "learning_rate": 7.611497723292469e-05, + "loss": 1.0478, + "num_input_tokens_seen": 54909104, + "step": 3413 + }, + { + "epoch": 0.2391447109196459, + "grad_norm": 3.8056883811950684, + "learning_rate": 7.610797898423818e-05, + "loss": 0.9235, + "num_input_tokens_seen": 54925280, + "step": 3414 + }, + { + "epoch": 0.23921475916537516, + "grad_norm": 6.9983062744140625, + "learning_rate": 7.610098073555167e-05, + "loss": 1.0766, + "num_input_tokens_seen": 54941384, + "step": 3415 + }, + { + "epoch": 0.2392848074111044, + "grad_norm": 3.485119581222534, + "learning_rate": 7.609398248686516e-05, + "loss": 1.0811, + "num_input_tokens_seen": 54957592, + "step": 3416 + }, + { + "epoch": 0.23935485565683365, + "grad_norm": 4.450938701629639, + "learning_rate": 7.608698423817863e-05, + "loss": 0.9354, + "num_input_tokens_seen": 54973976, + "step": 3417 + }, + { + "epoch": 0.23942490390256288, + "grad_norm": 4.142702579498291, + "learning_rate": 7.607998598949212e-05, + "loss": 1.0336, + "num_input_tokens_seen": 54990360, + "step": 3418 + }, + { + "epoch": 0.23949495214829214, + "grad_norm": 4.341495513916016, + "learning_rate": 7.607298774080561e-05, + "loss": 0.9722, + "num_input_tokens_seen": 55006744, + "step": 3419 + }, + { + "epoch": 0.23956500039402137, + "grad_norm": 4.355419158935547, + "learning_rate": 7.606598949211908e-05, + "loss": 0.9972, + "num_input_tokens_seen": 55022816, + "step": 3420 + }, + { + "epoch": 0.23963504863975063, + "grad_norm": 4.295046806335449, + "learning_rate": 7.605899124343257e-05, + "loss": 1.1881, + "num_input_tokens_seen": 55039200, + "step": 3421 + }, + { + "epoch": 0.23970509688547986, + "grad_norm": 3.9299042224884033, + "learning_rate": 7.605199299474608e-05, + "loss": 1.0959, + "num_input_tokens_seen": 55055552, + "step": 3422 + }, + { + "epoch": 0.23977514513120912, + "grad_norm": 3.7252607345581055, + "learning_rate": 7.604499474605955e-05, + "loss": 0.9151, + "num_input_tokens_seen": 55071936, + "step": 3423 + }, + { + "epoch": 0.23984519337693835, + "grad_norm": 4.723415851593018, + "learning_rate": 7.603799649737304e-05, + "loss": 0.9568, + "num_input_tokens_seen": 55088320, + "step": 3424 + }, + { + "epoch": 0.2399152416226676, + "grad_norm": 3.9923605918884277, + "learning_rate": 7.603099824868651e-05, + "loss": 1.1124, + "num_input_tokens_seen": 55104416, + "step": 3425 + }, + { + "epoch": 0.23998528986839684, + "grad_norm": 4.510697364807129, + "learning_rate": 7.6024e-05, + "loss": 1.1397, + "num_input_tokens_seen": 55120800, + "step": 3426 + }, + { + "epoch": 0.2400553381141261, + "grad_norm": 4.161818027496338, + "learning_rate": 7.601700175131348e-05, + "loss": 1.0915, + "num_input_tokens_seen": 55137184, + "step": 3427 + }, + { + "epoch": 0.24012538635985536, + "grad_norm": 5.871128082275391, + "learning_rate": 7.601000350262698e-05, + "loss": 0.9465, + "num_input_tokens_seen": 55152528, + "step": 3428 + }, + { + "epoch": 0.2401954346055846, + "grad_norm": 4.180598258972168, + "learning_rate": 7.600300525394047e-05, + "loss": 1.0132, + "num_input_tokens_seen": 55168552, + "step": 3429 + }, + { + "epoch": 0.24026548285131386, + "grad_norm": 5.575338363647461, + "learning_rate": 7.599600700525394e-05, + "loss": 1.2578, + "num_input_tokens_seen": 55184104, + "step": 3430 + }, + { + "epoch": 0.2403355310970431, + "grad_norm": 4.503122329711914, + "learning_rate": 7.598900875656743e-05, + "loss": 1.1367, + "num_input_tokens_seen": 55199768, + "step": 3431 + }, + { + "epoch": 0.24040557934277235, + "grad_norm": 3.6931769847869873, + "learning_rate": 7.598201050788092e-05, + "loss": 1.0977, + "num_input_tokens_seen": 55216016, + "step": 3432 + }, + { + "epoch": 0.24047562758850158, + "grad_norm": 4.138489723205566, + "learning_rate": 7.59750122591944e-05, + "loss": 1.1163, + "num_input_tokens_seen": 55232400, + "step": 3433 + }, + { + "epoch": 0.24054567583423084, + "grad_norm": 3.603297710418701, + "learning_rate": 7.596801401050788e-05, + "loss": 1.1277, + "num_input_tokens_seen": 55248784, + "step": 3434 + }, + { + "epoch": 0.24061572407996007, + "grad_norm": 4.072240352630615, + "learning_rate": 7.596101576182137e-05, + "loss": 1.3073, + "num_input_tokens_seen": 55264320, + "step": 3435 + }, + { + "epoch": 0.24068577232568933, + "grad_norm": 5.015305519104004, + "learning_rate": 7.595401751313486e-05, + "loss": 1.3236, + "num_input_tokens_seen": 55280528, + "step": 3436 + }, + { + "epoch": 0.24075582057141856, + "grad_norm": 5.135364055633545, + "learning_rate": 7.594701926444834e-05, + "loss": 1.0322, + "num_input_tokens_seen": 55296912, + "step": 3437 + }, + { + "epoch": 0.24082586881714782, + "grad_norm": 4.737668991088867, + "learning_rate": 7.594002101576183e-05, + "loss": 1.0069, + "num_input_tokens_seen": 55313296, + "step": 3438 + }, + { + "epoch": 0.24089591706287705, + "grad_norm": 4.380087375640869, + "learning_rate": 7.593302276707531e-05, + "loss": 1.267, + "num_input_tokens_seen": 55329152, + "step": 3439 + }, + { + "epoch": 0.2409659653086063, + "grad_norm": 4.472866535186768, + "learning_rate": 7.592602451838879e-05, + "loss": 1.1577, + "num_input_tokens_seen": 55345536, + "step": 3440 + }, + { + "epoch": 0.24103601355433554, + "grad_norm": 4.323402404785156, + "learning_rate": 7.591902626970228e-05, + "loss": 1.1872, + "num_input_tokens_seen": 55361920, + "step": 3441 + }, + { + "epoch": 0.2411060618000648, + "grad_norm": 3.7247276306152344, + "learning_rate": 7.591202802101578e-05, + "loss": 1.0906, + "num_input_tokens_seen": 55377344, + "step": 3442 + }, + { + "epoch": 0.24117611004579403, + "grad_norm": 6.503116607666016, + "learning_rate": 7.590502977232925e-05, + "loss": 1.2304, + "num_input_tokens_seen": 55393728, + "step": 3443 + }, + { + "epoch": 0.2412461582915233, + "grad_norm": 4.590184688568115, + "learning_rate": 7.589803152364273e-05, + "loss": 1.1369, + "num_input_tokens_seen": 55410112, + "step": 3444 + }, + { + "epoch": 0.24131620653725253, + "grad_norm": 3.718323230743408, + "learning_rate": 7.589103327495622e-05, + "loss": 1.09, + "num_input_tokens_seen": 55426392, + "step": 3445 + }, + { + "epoch": 0.24138625478298179, + "grad_norm": 4.8696465492248535, + "learning_rate": 7.58840350262697e-05, + "loss": 1.2361, + "num_input_tokens_seen": 55442632, + "step": 3446 + }, + { + "epoch": 0.24145630302871102, + "grad_norm": 3.7620716094970703, + "learning_rate": 7.587703677758318e-05, + "loss": 0.9411, + "num_input_tokens_seen": 55459016, + "step": 3447 + }, + { + "epoch": 0.24152635127444028, + "grad_norm": 3.8696882724761963, + "learning_rate": 7.587003852889668e-05, + "loss": 0.992, + "num_input_tokens_seen": 55474944, + "step": 3448 + }, + { + "epoch": 0.2415963995201695, + "grad_norm": 4.628901481628418, + "learning_rate": 7.586304028021017e-05, + "loss": 1.1376, + "num_input_tokens_seen": 55490416, + "step": 3449 + }, + { + "epoch": 0.24166644776589877, + "grad_norm": 4.1568121910095215, + "learning_rate": 7.585604203152365e-05, + "loss": 1.1596, + "num_input_tokens_seen": 55505760, + "step": 3450 + }, + { + "epoch": 0.241736496011628, + "grad_norm": 4.089991569519043, + "learning_rate": 7.584904378283714e-05, + "loss": 1.1707, + "num_input_tokens_seen": 55521528, + "step": 3451 + }, + { + "epoch": 0.24180654425735726, + "grad_norm": 7.870755195617676, + "learning_rate": 7.584204553415061e-05, + "loss": 1.3069, + "num_input_tokens_seen": 55536256, + "step": 3452 + }, + { + "epoch": 0.2418765925030865, + "grad_norm": 4.898053169250488, + "learning_rate": 7.58350472854641e-05, + "loss": 1.0912, + "num_input_tokens_seen": 55551904, + "step": 3453 + }, + { + "epoch": 0.24194664074881575, + "grad_norm": 4.515797138214111, + "learning_rate": 7.582804903677759e-05, + "loss": 1.2266, + "num_input_tokens_seen": 55567240, + "step": 3454 + }, + { + "epoch": 0.24201668899454498, + "grad_norm": 3.7202370166778564, + "learning_rate": 7.582105078809108e-05, + "loss": 1.0118, + "num_input_tokens_seen": 55583176, + "step": 3455 + }, + { + "epoch": 0.24208673724027424, + "grad_norm": 5.834963321685791, + "learning_rate": 7.581405253940457e-05, + "loss": 1.3757, + "num_input_tokens_seen": 55599144, + "step": 3456 + }, + { + "epoch": 0.24215678548600347, + "grad_norm": 4.450705528259277, + "learning_rate": 7.580705429071804e-05, + "loss": 0.9523, + "num_input_tokens_seen": 55615528, + "step": 3457 + }, + { + "epoch": 0.24222683373173273, + "grad_norm": 3.796229839324951, + "learning_rate": 7.580005604203153e-05, + "loss": 1.0415, + "num_input_tokens_seen": 55631912, + "step": 3458 + }, + { + "epoch": 0.24229688197746196, + "grad_norm": 4.004448413848877, + "learning_rate": 7.579305779334502e-05, + "loss": 1.1538, + "num_input_tokens_seen": 55647896, + "step": 3459 + }, + { + "epoch": 0.24236693022319122, + "grad_norm": 4.511063098907471, + "learning_rate": 7.578605954465849e-05, + "loss": 1.0616, + "num_input_tokens_seen": 55664280, + "step": 3460 + }, + { + "epoch": 0.24243697846892046, + "grad_norm": 6.866496562957764, + "learning_rate": 7.577906129597198e-05, + "loss": 1.1126, + "num_input_tokens_seen": 55679720, + "step": 3461 + }, + { + "epoch": 0.24250702671464971, + "grad_norm": 5.447164058685303, + "learning_rate": 7.577206304728547e-05, + "loss": 1.0812, + "num_input_tokens_seen": 55696104, + "step": 3462 + }, + { + "epoch": 0.24257707496037897, + "grad_norm": 6.401725769042969, + "learning_rate": 7.576506479859896e-05, + "loss": 1.0097, + "num_input_tokens_seen": 55712488, + "step": 3463 + }, + { + "epoch": 0.2426471232061082, + "grad_norm": 3.7833733558654785, + "learning_rate": 7.575806654991243e-05, + "loss": 1.0064, + "num_input_tokens_seen": 55728504, + "step": 3464 + }, + { + "epoch": 0.24271717145183747, + "grad_norm": 4.157958984375, + "learning_rate": 7.575106830122592e-05, + "loss": 1.109, + "num_input_tokens_seen": 55743904, + "step": 3465 + }, + { + "epoch": 0.2427872196975667, + "grad_norm": 4.657470703125, + "learning_rate": 7.574407005253941e-05, + "loss": 1.2033, + "num_input_tokens_seen": 55759920, + "step": 3466 + }, + { + "epoch": 0.24285726794329596, + "grad_norm": 5.129040718078613, + "learning_rate": 7.573707180385289e-05, + "loss": 1.2959, + "num_input_tokens_seen": 55776304, + "step": 3467 + }, + { + "epoch": 0.2429273161890252, + "grad_norm": 4.887351036071777, + "learning_rate": 7.573007355516639e-05, + "loss": 1.0568, + "num_input_tokens_seen": 55792688, + "step": 3468 + }, + { + "epoch": 0.24299736443475445, + "grad_norm": 4.042501926422119, + "learning_rate": 7.572307530647988e-05, + "loss": 1.0509, + "num_input_tokens_seen": 55809072, + "step": 3469 + }, + { + "epoch": 0.24306741268048368, + "grad_norm": 4.162355422973633, + "learning_rate": 7.571607705779335e-05, + "loss": 1.0168, + "num_input_tokens_seen": 55825456, + "step": 3470 + }, + { + "epoch": 0.24313746092621294, + "grad_norm": 3.8478844165802, + "learning_rate": 7.570907880910683e-05, + "loss": 1.2378, + "num_input_tokens_seen": 55841840, + "step": 3471 + }, + { + "epoch": 0.24320750917194217, + "grad_norm": 6.2065815925598145, + "learning_rate": 7.570208056042032e-05, + "loss": 1.3644, + "num_input_tokens_seen": 55858224, + "step": 3472 + }, + { + "epoch": 0.24327755741767143, + "grad_norm": 4.8233642578125, + "learning_rate": 7.56950823117338e-05, + "loss": 1.1363, + "num_input_tokens_seen": 55874608, + "step": 3473 + }, + { + "epoch": 0.24334760566340066, + "grad_norm": 3.534205198287964, + "learning_rate": 7.568808406304729e-05, + "loss": 1.0236, + "num_input_tokens_seen": 55890992, + "step": 3474 + }, + { + "epoch": 0.24341765390912992, + "grad_norm": 4.218345642089844, + "learning_rate": 7.568108581436078e-05, + "loss": 1.0921, + "num_input_tokens_seen": 55907376, + "step": 3475 + }, + { + "epoch": 0.24348770215485915, + "grad_norm": 3.7262325286865234, + "learning_rate": 7.567408756567427e-05, + "loss": 1.1182, + "num_input_tokens_seen": 55923752, + "step": 3476 + }, + { + "epoch": 0.2435577504005884, + "grad_norm": 3.7530906200408936, + "learning_rate": 7.566708931698774e-05, + "loss": 1.0766, + "num_input_tokens_seen": 55939176, + "step": 3477 + }, + { + "epoch": 0.24362779864631764, + "grad_norm": 4.452608585357666, + "learning_rate": 7.566009106830123e-05, + "loss": 0.9421, + "num_input_tokens_seen": 55955200, + "step": 3478 + }, + { + "epoch": 0.2436978468920469, + "grad_norm": 4.049906253814697, + "learning_rate": 7.565309281961471e-05, + "loss": 1.1022, + "num_input_tokens_seen": 55971584, + "step": 3479 + }, + { + "epoch": 0.24376789513777614, + "grad_norm": 4.956455230712891, + "learning_rate": 7.56460945709282e-05, + "loss": 1.2684, + "num_input_tokens_seen": 55987968, + "step": 3480 + }, + { + "epoch": 0.2438379433835054, + "grad_norm": 4.846863746643066, + "learning_rate": 7.563909632224169e-05, + "loss": 1.0492, + "num_input_tokens_seen": 56003000, + "step": 3481 + }, + { + "epoch": 0.24390799162923463, + "grad_norm": 4.678101539611816, + "learning_rate": 7.563209807355517e-05, + "loss": 0.8577, + "num_input_tokens_seen": 56019384, + "step": 3482 + }, + { + "epoch": 0.2439780398749639, + "grad_norm": 4.13012170791626, + "learning_rate": 7.562509982486866e-05, + "loss": 0.9508, + "num_input_tokens_seen": 56035768, + "step": 3483 + }, + { + "epoch": 0.24404808812069312, + "grad_norm": 3.7858669757843018, + "learning_rate": 7.561810157618214e-05, + "loss": 1.1034, + "num_input_tokens_seen": 56052152, + "step": 3484 + }, + { + "epoch": 0.24411813636642238, + "grad_norm": 3.7132198810577393, + "learning_rate": 7.561110332749563e-05, + "loss": 1.0665, + "num_input_tokens_seen": 56068536, + "step": 3485 + }, + { + "epoch": 0.2441881846121516, + "grad_norm": 4.093675136566162, + "learning_rate": 7.560410507880911e-05, + "loss": 1.0994, + "num_input_tokens_seen": 56084888, + "step": 3486 + }, + { + "epoch": 0.24425823285788087, + "grad_norm": 3.8601670265197754, + "learning_rate": 7.559710683012259e-05, + "loss": 0.9648, + "num_input_tokens_seen": 56101272, + "step": 3487 + }, + { + "epoch": 0.2443282811036101, + "grad_norm": 3.9332170486450195, + "learning_rate": 7.559010858143608e-05, + "loss": 0.9559, + "num_input_tokens_seen": 56117352, + "step": 3488 + }, + { + "epoch": 0.24439832934933936, + "grad_norm": 3.7619807720184326, + "learning_rate": 7.558311033274957e-05, + "loss": 1.0948, + "num_input_tokens_seen": 56133736, + "step": 3489 + }, + { + "epoch": 0.2444683775950686, + "grad_norm": 3.9035656452178955, + "learning_rate": 7.557611208406306e-05, + "loss": 1.1255, + "num_input_tokens_seen": 56149624, + "step": 3490 + }, + { + "epoch": 0.24453842584079785, + "grad_norm": 5.9505486488342285, + "learning_rate": 7.556911383537653e-05, + "loss": 1.0192, + "num_input_tokens_seen": 56163752, + "step": 3491 + }, + { + "epoch": 0.24460847408652708, + "grad_norm": 4.006525039672852, + "learning_rate": 7.556211558669002e-05, + "loss": 1.0859, + "num_input_tokens_seen": 56180136, + "step": 3492 + }, + { + "epoch": 0.24467852233225634, + "grad_norm": 5.28178071975708, + "learning_rate": 7.555511733800351e-05, + "loss": 1.3704, + "num_input_tokens_seen": 56196152, + "step": 3493 + }, + { + "epoch": 0.24474857057798557, + "grad_norm": 3.577709674835205, + "learning_rate": 7.5548119089317e-05, + "loss": 1.0015, + "num_input_tokens_seen": 56212528, + "step": 3494 + }, + { + "epoch": 0.24481861882371483, + "grad_norm": 4.6045002937316895, + "learning_rate": 7.554112084063048e-05, + "loss": 1.1895, + "num_input_tokens_seen": 56228912, + "step": 3495 + }, + { + "epoch": 0.24488866706944407, + "grad_norm": 4.160959720611572, + "learning_rate": 7.553412259194397e-05, + "loss": 1.1253, + "num_input_tokens_seen": 56244944, + "step": 3496 + }, + { + "epoch": 0.24495871531517333, + "grad_norm": 4.380669593811035, + "learning_rate": 7.552712434325745e-05, + "loss": 1.0171, + "num_input_tokens_seen": 56261072, + "step": 3497 + }, + { + "epoch": 0.24502876356090258, + "grad_norm": 3.568679094314575, + "learning_rate": 7.552012609457092e-05, + "loss": 1.0005, + "num_input_tokens_seen": 56277456, + "step": 3498 + }, + { + "epoch": 0.24509881180663182, + "grad_norm": 4.006386756896973, + "learning_rate": 7.551312784588441e-05, + "loss": 1.0756, + "num_input_tokens_seen": 56293840, + "step": 3499 + }, + { + "epoch": 0.24516886005236108, + "grad_norm": 4.180081844329834, + "learning_rate": 7.55061295971979e-05, + "loss": 1.303, + "num_input_tokens_seen": 56310224, + "step": 3500 + }, + { + "epoch": 0.2452389082980903, + "grad_norm": 5.228555202484131, + "learning_rate": 7.549913134851139e-05, + "loss": 1.0267, + "num_input_tokens_seen": 56326608, + "step": 3501 + }, + { + "epoch": 0.24530895654381957, + "grad_norm": 3.60235595703125, + "learning_rate": 7.549213309982488e-05, + "loss": 0.9258, + "num_input_tokens_seen": 56342752, + "step": 3502 + }, + { + "epoch": 0.2453790047895488, + "grad_norm": 5.305295467376709, + "learning_rate": 7.548513485113837e-05, + "loss": 1.0366, + "num_input_tokens_seen": 56358208, + "step": 3503 + }, + { + "epoch": 0.24544905303527806, + "grad_norm": 4.5955939292907715, + "learning_rate": 7.547813660245184e-05, + "loss": 1.1016, + "num_input_tokens_seen": 56374592, + "step": 3504 + }, + { + "epoch": 0.2455191012810073, + "grad_norm": 4.503798961639404, + "learning_rate": 7.547113835376533e-05, + "loss": 0.9045, + "num_input_tokens_seen": 56390320, + "step": 3505 + }, + { + "epoch": 0.24558914952673655, + "grad_norm": 3.79156231880188, + "learning_rate": 7.54641401050788e-05, + "loss": 0.9796, + "num_input_tokens_seen": 56406176, + "step": 3506 + }, + { + "epoch": 0.24565919777246578, + "grad_norm": 4.054116249084473, + "learning_rate": 7.54571418563923e-05, + "loss": 1.0002, + "num_input_tokens_seen": 56421120, + "step": 3507 + }, + { + "epoch": 0.24572924601819504, + "grad_norm": 3.57210636138916, + "learning_rate": 7.545014360770578e-05, + "loss": 0.982, + "num_input_tokens_seen": 56437504, + "step": 3508 + }, + { + "epoch": 0.24579929426392427, + "grad_norm": 4.02463960647583, + "learning_rate": 7.544314535901927e-05, + "loss": 1.0651, + "num_input_tokens_seen": 56452680, + "step": 3509 + }, + { + "epoch": 0.24586934250965353, + "grad_norm": 3.712689161300659, + "learning_rate": 7.543614711033276e-05, + "loss": 1.1449, + "num_input_tokens_seen": 56468424, + "step": 3510 + }, + { + "epoch": 0.24593939075538276, + "grad_norm": 4.943066596984863, + "learning_rate": 7.542914886164623e-05, + "loss": 1.2289, + "num_input_tokens_seen": 56484784, + "step": 3511 + }, + { + "epoch": 0.24600943900111202, + "grad_norm": 4.94294548034668, + "learning_rate": 7.542215061295972e-05, + "loss": 1.0088, + "num_input_tokens_seen": 56500992, + "step": 3512 + }, + { + "epoch": 0.24607948724684126, + "grad_norm": 4.003958225250244, + "learning_rate": 7.541515236427321e-05, + "loss": 0.9778, + "num_input_tokens_seen": 56516392, + "step": 3513 + }, + { + "epoch": 0.24614953549257051, + "grad_norm": 4.173887252807617, + "learning_rate": 7.540815411558669e-05, + "loss": 1.0291, + "num_input_tokens_seen": 56532776, + "step": 3514 + }, + { + "epoch": 0.24621958373829975, + "grad_norm": 5.028538227081299, + "learning_rate": 7.540115586690019e-05, + "loss": 1.0489, + "num_input_tokens_seen": 56549056, + "step": 3515 + }, + { + "epoch": 0.246289631984029, + "grad_norm": 3.781219959259033, + "learning_rate": 7.539415761821366e-05, + "loss": 0.9056, + "num_input_tokens_seen": 56564768, + "step": 3516 + }, + { + "epoch": 0.24635968022975824, + "grad_norm": 4.070143222808838, + "learning_rate": 7.538715936952715e-05, + "loss": 1.0561, + "num_input_tokens_seen": 56580856, + "step": 3517 + }, + { + "epoch": 0.2464297284754875, + "grad_norm": 4.322885036468506, + "learning_rate": 7.538016112084063e-05, + "loss": 0.9834, + "num_input_tokens_seen": 56596664, + "step": 3518 + }, + { + "epoch": 0.24649977672121673, + "grad_norm": 7.525569438934326, + "learning_rate": 7.537316287215412e-05, + "loss": 1.278, + "num_input_tokens_seen": 56611608, + "step": 3519 + }, + { + "epoch": 0.246569824966946, + "grad_norm": 3.6520745754241943, + "learning_rate": 7.53661646234676e-05, + "loss": 0.9472, + "num_input_tokens_seen": 56627992, + "step": 3520 + }, + { + "epoch": 0.24663987321267522, + "grad_norm": 6.346038341522217, + "learning_rate": 7.53591663747811e-05, + "loss": 1.0585, + "num_input_tokens_seen": 56644224, + "step": 3521 + }, + { + "epoch": 0.24670992145840448, + "grad_norm": 5.049849987030029, + "learning_rate": 7.535216812609458e-05, + "loss": 1.1843, + "num_input_tokens_seen": 56660464, + "step": 3522 + }, + { + "epoch": 0.2467799697041337, + "grad_norm": 5.948208332061768, + "learning_rate": 7.534516987740807e-05, + "loss": 1.1426, + "num_input_tokens_seen": 56676848, + "step": 3523 + }, + { + "epoch": 0.24685001794986297, + "grad_norm": 4.2648210525512695, + "learning_rate": 7.533817162872155e-05, + "loss": 1.0703, + "num_input_tokens_seen": 56692800, + "step": 3524 + }, + { + "epoch": 0.2469200661955922, + "grad_norm": 4.268098831176758, + "learning_rate": 7.533117338003502e-05, + "loss": 1.123, + "num_input_tokens_seen": 56709184, + "step": 3525 + }, + { + "epoch": 0.24699011444132146, + "grad_norm": 3.987408399581909, + "learning_rate": 7.532417513134851e-05, + "loss": 1.1312, + "num_input_tokens_seen": 56724744, + "step": 3526 + }, + { + "epoch": 0.2470601626870507, + "grad_norm": 4.087530612945557, + "learning_rate": 7.5317176882662e-05, + "loss": 0.9904, + "num_input_tokens_seen": 56741128, + "step": 3527 + }, + { + "epoch": 0.24713021093277995, + "grad_norm": 3.484837770462036, + "learning_rate": 7.531017863397549e-05, + "loss": 1.0385, + "num_input_tokens_seen": 56757512, + "step": 3528 + }, + { + "epoch": 0.24720025917850919, + "grad_norm": 4.382214546203613, + "learning_rate": 7.530318038528898e-05, + "loss": 1.1713, + "num_input_tokens_seen": 56773896, + "step": 3529 + }, + { + "epoch": 0.24727030742423844, + "grad_norm": 4.361959457397461, + "learning_rate": 7.529618213660246e-05, + "loss": 1.2548, + "num_input_tokens_seen": 56789184, + "step": 3530 + }, + { + "epoch": 0.24734035566996768, + "grad_norm": 4.029329776763916, + "learning_rate": 7.528918388791594e-05, + "loss": 1.0637, + "num_input_tokens_seen": 56804912, + "step": 3531 + }, + { + "epoch": 0.24741040391569694, + "grad_norm": 4.577064514160156, + "learning_rate": 7.528218563922943e-05, + "loss": 0.9591, + "num_input_tokens_seen": 56821296, + "step": 3532 + }, + { + "epoch": 0.2474804521614262, + "grad_norm": 3.6799368858337402, + "learning_rate": 7.52751873905429e-05, + "loss": 1.0508, + "num_input_tokens_seen": 56837680, + "step": 3533 + }, + { + "epoch": 0.24755050040715543, + "grad_norm": 3.962989568710327, + "learning_rate": 7.526818914185639e-05, + "loss": 1.1347, + "num_input_tokens_seen": 56853984, + "step": 3534 + }, + { + "epoch": 0.2476205486528847, + "grad_norm": 3.610877275466919, + "learning_rate": 7.526119089316988e-05, + "loss": 1.075, + "num_input_tokens_seen": 56870368, + "step": 3535 + }, + { + "epoch": 0.24769059689861392, + "grad_norm": 4.16568660736084, + "learning_rate": 7.525419264448337e-05, + "loss": 0.9326, + "num_input_tokens_seen": 56886032, + "step": 3536 + }, + { + "epoch": 0.24776064514434318, + "grad_norm": 4.645627021789551, + "learning_rate": 7.524719439579686e-05, + "loss": 1.0221, + "num_input_tokens_seen": 56900928, + "step": 3537 + }, + { + "epoch": 0.2478306933900724, + "grad_norm": 3.652317523956299, + "learning_rate": 7.524019614711033e-05, + "loss": 1.1641, + "num_input_tokens_seen": 56917048, + "step": 3538 + }, + { + "epoch": 0.24790074163580167, + "grad_norm": 5.583502769470215, + "learning_rate": 7.523319789842382e-05, + "loss": 1.0303, + "num_input_tokens_seen": 56933432, + "step": 3539 + }, + { + "epoch": 0.2479707898815309, + "grad_norm": 3.6924219131469727, + "learning_rate": 7.522619964973731e-05, + "loss": 0.9386, + "num_input_tokens_seen": 56949816, + "step": 3540 + }, + { + "epoch": 0.24804083812726016, + "grad_norm": 6.476202487945557, + "learning_rate": 7.52192014010508e-05, + "loss": 1.1841, + "num_input_tokens_seen": 56966064, + "step": 3541 + }, + { + "epoch": 0.2481108863729894, + "grad_norm": 4.052863121032715, + "learning_rate": 7.521220315236429e-05, + "loss": 1.1133, + "num_input_tokens_seen": 56982448, + "step": 3542 + }, + { + "epoch": 0.24818093461871865, + "grad_norm": 6.569397926330566, + "learning_rate": 7.520520490367776e-05, + "loss": 1.1061, + "num_input_tokens_seen": 56998832, + "step": 3543 + }, + { + "epoch": 0.24825098286444788, + "grad_norm": 4.026796817779541, + "learning_rate": 7.519820665499125e-05, + "loss": 1.0121, + "num_input_tokens_seen": 57014744, + "step": 3544 + }, + { + "epoch": 0.24832103111017714, + "grad_norm": 3.705080032348633, + "learning_rate": 7.519120840630472e-05, + "loss": 1.0041, + "num_input_tokens_seen": 57031128, + "step": 3545 + }, + { + "epoch": 0.24839107935590637, + "grad_norm": 4.828441143035889, + "learning_rate": 7.518421015761821e-05, + "loss": 1.1551, + "num_input_tokens_seen": 57047512, + "step": 3546 + }, + { + "epoch": 0.24846112760163563, + "grad_norm": 5.6117777824401855, + "learning_rate": 7.51772119089317e-05, + "loss": 1.1555, + "num_input_tokens_seen": 57063840, + "step": 3547 + }, + { + "epoch": 0.24853117584736487, + "grad_norm": 3.955193281173706, + "learning_rate": 7.517021366024519e-05, + "loss": 1.0514, + "num_input_tokens_seen": 57079936, + "step": 3548 + }, + { + "epoch": 0.24860122409309413, + "grad_norm": 3.8878116607666016, + "learning_rate": 7.516321541155868e-05, + "loss": 1.0335, + "num_input_tokens_seen": 57096320, + "step": 3549 + }, + { + "epoch": 0.24867127233882336, + "grad_norm": 6.119873046875, + "learning_rate": 7.515621716287217e-05, + "loss": 1.0798, + "num_input_tokens_seen": 57111632, + "step": 3550 + }, + { + "epoch": 0.24874132058455262, + "grad_norm": 3.757984161376953, + "learning_rate": 7.514921891418564e-05, + "loss": 0.9911, + "num_input_tokens_seen": 57128016, + "step": 3551 + }, + { + "epoch": 0.24881136883028185, + "grad_norm": 4.173069953918457, + "learning_rate": 7.514222066549912e-05, + "loss": 1.1443, + "num_input_tokens_seen": 57144400, + "step": 3552 + }, + { + "epoch": 0.2488814170760111, + "grad_norm": 3.6985576152801514, + "learning_rate": 7.51352224168126e-05, + "loss": 1.1084, + "num_input_tokens_seen": 57160784, + "step": 3553 + }, + { + "epoch": 0.24895146532174034, + "grad_norm": 4.464880466461182, + "learning_rate": 7.51282241681261e-05, + "loss": 1.2571, + "num_input_tokens_seen": 57177168, + "step": 3554 + }, + { + "epoch": 0.2490215135674696, + "grad_norm": 3.699873447418213, + "learning_rate": 7.512122591943958e-05, + "loss": 0.9722, + "num_input_tokens_seen": 57193208, + "step": 3555 + }, + { + "epoch": 0.24909156181319883, + "grad_norm": 5.011424541473389, + "learning_rate": 7.511422767075307e-05, + "loss": 0.962, + "num_input_tokens_seen": 57209592, + "step": 3556 + }, + { + "epoch": 0.2491616100589281, + "grad_norm": 4.302685260772705, + "learning_rate": 7.510722942206656e-05, + "loss": 1.0718, + "num_input_tokens_seen": 57225976, + "step": 3557 + }, + { + "epoch": 0.24923165830465732, + "grad_norm": 3.991840362548828, + "learning_rate": 7.510023117338004e-05, + "loss": 1.0826, + "num_input_tokens_seen": 57242168, + "step": 3558 + }, + { + "epoch": 0.24930170655038658, + "grad_norm": 3.9910435676574707, + "learning_rate": 7.509323292469352e-05, + "loss": 1.2494, + "num_input_tokens_seen": 57258552, + "step": 3559 + }, + { + "epoch": 0.2493717547961158, + "grad_norm": 4.170960426330566, + "learning_rate": 7.5086234676007e-05, + "loss": 1.0068, + "num_input_tokens_seen": 57274936, + "step": 3560 + }, + { + "epoch": 0.24944180304184507, + "grad_norm": 4.317671298980713, + "learning_rate": 7.50792364273205e-05, + "loss": 1.0835, + "num_input_tokens_seen": 57291320, + "step": 3561 + }, + { + "epoch": 0.2495118512875743, + "grad_norm": 3.871293783187866, + "learning_rate": 7.507223817863398e-05, + "loss": 0.9352, + "num_input_tokens_seen": 57307056, + "step": 3562 + }, + { + "epoch": 0.24958189953330356, + "grad_norm": 4.014804840087891, + "learning_rate": 7.506523992994747e-05, + "loss": 1.2893, + "num_input_tokens_seen": 57322376, + "step": 3563 + }, + { + "epoch": 0.2496519477790328, + "grad_norm": 5.13847017288208, + "learning_rate": 7.505824168126095e-05, + "loss": 1.1943, + "num_input_tokens_seen": 57338760, + "step": 3564 + }, + { + "epoch": 0.24972199602476206, + "grad_norm": 3.7801826000213623, + "learning_rate": 7.505124343257443e-05, + "loss": 1.021, + "num_input_tokens_seen": 57355144, + "step": 3565 + }, + { + "epoch": 0.24979204427049131, + "grad_norm": 3.662065029144287, + "learning_rate": 7.504424518388792e-05, + "loss": 1.1461, + "num_input_tokens_seen": 57371528, + "step": 3566 + }, + { + "epoch": 0.24986209251622055, + "grad_norm": 4.548840522766113, + "learning_rate": 7.50372469352014e-05, + "loss": 1.058, + "num_input_tokens_seen": 57387912, + "step": 3567 + }, + { + "epoch": 0.2499321407619498, + "grad_norm": 4.732056140899658, + "learning_rate": 7.50302486865149e-05, + "loss": 1.0513, + "num_input_tokens_seen": 57403120, + "step": 3568 + }, + { + "epoch": 0.25000218900767907, + "grad_norm": 3.7986674308776855, + "learning_rate": 7.502325043782837e-05, + "loss": 0.9574, + "num_input_tokens_seen": 57418800, + "step": 3569 + }, + { + "epoch": 0.25007223725340827, + "grad_norm": 6.760079860687256, + "learning_rate": 7.501625218914186e-05, + "loss": 0.9101, + "num_input_tokens_seen": 57432608, + "step": 3570 + }, + { + "epoch": 0.25014228549913753, + "grad_norm": 4.0666985511779785, + "learning_rate": 7.500925394045535e-05, + "loss": 1.0564, + "num_input_tokens_seen": 57448296, + "step": 3571 + }, + { + "epoch": 0.2502123337448668, + "grad_norm": 3.7505650520324707, + "learning_rate": 7.500225569176882e-05, + "loss": 1.1593, + "num_input_tokens_seen": 57464680, + "step": 3572 + }, + { + "epoch": 0.25028238199059605, + "grad_norm": 5.1084675788879395, + "learning_rate": 7.499525744308231e-05, + "loss": 1.317, + "num_input_tokens_seen": 57481032, + "step": 3573 + }, + { + "epoch": 0.25035243023632525, + "grad_norm": 6.083080768585205, + "learning_rate": 7.49882591943958e-05, + "loss": 0.9305, + "num_input_tokens_seen": 57497416, + "step": 3574 + }, + { + "epoch": 0.2504224784820545, + "grad_norm": 4.197649955749512, + "learning_rate": 7.498126094570929e-05, + "loss": 1.0191, + "num_input_tokens_seen": 57513800, + "step": 3575 + }, + { + "epoch": 0.25049252672778377, + "grad_norm": 4.637972831726074, + "learning_rate": 7.497426269702278e-05, + "loss": 0.9914, + "num_input_tokens_seen": 57529832, + "step": 3576 + }, + { + "epoch": 0.25056257497351303, + "grad_norm": 4.096358776092529, + "learning_rate": 7.496726444833626e-05, + "loss": 1.1909, + "num_input_tokens_seen": 57545432, + "step": 3577 + }, + { + "epoch": 0.25063262321924223, + "grad_norm": 3.9253315925598145, + "learning_rate": 7.496026619964974e-05, + "loss": 1.1383, + "num_input_tokens_seen": 57561816, + "step": 3578 + }, + { + "epoch": 0.2507026714649715, + "grad_norm": 5.603836536407471, + "learning_rate": 7.495326795096321e-05, + "loss": 1.0744, + "num_input_tokens_seen": 57577336, + "step": 3579 + }, + { + "epoch": 0.25077271971070075, + "grad_norm": 4.588653564453125, + "learning_rate": 7.49462697022767e-05, + "loss": 1.0896, + "num_input_tokens_seen": 57593720, + "step": 3580 + }, + { + "epoch": 0.25084276795643, + "grad_norm": 3.989229917526245, + "learning_rate": 7.49392714535902e-05, + "loss": 0.9605, + "num_input_tokens_seen": 57609656, + "step": 3581 + }, + { + "epoch": 0.2509128162021592, + "grad_norm": 4.728183269500732, + "learning_rate": 7.493227320490368e-05, + "loss": 1.2626, + "num_input_tokens_seen": 57626040, + "step": 3582 + }, + { + "epoch": 0.2509828644478885, + "grad_norm": 4.269988059997559, + "learning_rate": 7.492527495621717e-05, + "loss": 1.0987, + "num_input_tokens_seen": 57641280, + "step": 3583 + }, + { + "epoch": 0.25105291269361774, + "grad_norm": 6.506377696990967, + "learning_rate": 7.491827670753066e-05, + "loss": 0.9327, + "num_input_tokens_seen": 57657664, + "step": 3584 + }, + { + "epoch": 0.251122960939347, + "grad_norm": 6.415282726287842, + "learning_rate": 7.491127845884413e-05, + "loss": 0.9515, + "num_input_tokens_seen": 57672704, + "step": 3585 + }, + { + "epoch": 0.2511930091850762, + "grad_norm": 3.969257116317749, + "learning_rate": 7.490428021015761e-05, + "loss": 1.1255, + "num_input_tokens_seen": 57687504, + "step": 3586 + }, + { + "epoch": 0.25126305743080546, + "grad_norm": 3.493469476699829, + "learning_rate": 7.489728196147111e-05, + "loss": 0.95, + "num_input_tokens_seen": 57703512, + "step": 3587 + }, + { + "epoch": 0.2513331056765347, + "grad_norm": 5.777353763580322, + "learning_rate": 7.48902837127846e-05, + "loss": 1.0089, + "num_input_tokens_seen": 57719344, + "step": 3588 + }, + { + "epoch": 0.251403153922264, + "grad_norm": 3.6840991973876953, + "learning_rate": 7.488328546409807e-05, + "loss": 1.0351, + "num_input_tokens_seen": 57734848, + "step": 3589 + }, + { + "epoch": 0.2514732021679932, + "grad_norm": 6.526551246643066, + "learning_rate": 7.487628721541156e-05, + "loss": 1.1651, + "num_input_tokens_seen": 57751232, + "step": 3590 + }, + { + "epoch": 0.25154325041372244, + "grad_norm": 3.7879719734191895, + "learning_rate": 7.486928896672505e-05, + "loss": 1.0128, + "num_input_tokens_seen": 57767616, + "step": 3591 + }, + { + "epoch": 0.2516132986594517, + "grad_norm": 6.891875267028809, + "learning_rate": 7.486229071803853e-05, + "loss": 1.2037, + "num_input_tokens_seen": 57783592, + "step": 3592 + }, + { + "epoch": 0.25168334690518096, + "grad_norm": 4.700318336486816, + "learning_rate": 7.485529246935201e-05, + "loss": 1.0291, + "num_input_tokens_seen": 57799976, + "step": 3593 + }, + { + "epoch": 0.25175339515091016, + "grad_norm": 6.47390604019165, + "learning_rate": 7.48482942206655e-05, + "loss": 0.9828, + "num_input_tokens_seen": 57816360, + "step": 3594 + }, + { + "epoch": 0.2518234433966394, + "grad_norm": 5.045449733734131, + "learning_rate": 7.484129597197899e-05, + "loss": 0.9569, + "num_input_tokens_seen": 57832016, + "step": 3595 + }, + { + "epoch": 0.2518934916423687, + "grad_norm": 4.258456230163574, + "learning_rate": 7.483429772329247e-05, + "loss": 0.9804, + "num_input_tokens_seen": 57848400, + "step": 3596 + }, + { + "epoch": 0.25196353988809794, + "grad_norm": 3.948582649230957, + "learning_rate": 7.482729947460596e-05, + "loss": 0.9898, + "num_input_tokens_seen": 57864784, + "step": 3597 + }, + { + "epoch": 0.25203358813382715, + "grad_norm": 4.017141342163086, + "learning_rate": 7.482030122591944e-05, + "loss": 0.8644, + "num_input_tokens_seen": 57879696, + "step": 3598 + }, + { + "epoch": 0.2521036363795564, + "grad_norm": 3.7428297996520996, + "learning_rate": 7.481330297723292e-05, + "loss": 0.9318, + "num_input_tokens_seen": 57896080, + "step": 3599 + }, + { + "epoch": 0.25217368462528567, + "grad_norm": 4.883368968963623, + "learning_rate": 7.480630472854641e-05, + "loss": 0.9771, + "num_input_tokens_seen": 57911976, + "step": 3600 + }, + { + "epoch": 0.25217368462528567, + "eval_loss": 1.136000633239746, + "eval_runtime": 0.2016, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 4.959, + "num_input_tokens_seen": 57911976, + "step": 3600 + }, + { + "epoch": 0.2522437328710149, + "grad_norm": 4.399716377258301, + "learning_rate": 7.479930647985991e-05, + "loss": 0.9965, + "num_input_tokens_seen": 57927440, + "step": 3601 + }, + { + "epoch": 0.2523137811167442, + "grad_norm": 6.019199371337891, + "learning_rate": 7.479230823117338e-05, + "loss": 1.1172, + "num_input_tokens_seen": 57943824, + "step": 3602 + }, + { + "epoch": 0.2523838293624734, + "grad_norm": 4.42507266998291, + "learning_rate": 7.478530998248687e-05, + "loss": 1.1294, + "num_input_tokens_seen": 57960208, + "step": 3603 + }, + { + "epoch": 0.25245387760820265, + "grad_norm": 4.0232062339782715, + "learning_rate": 7.477831173380036e-05, + "loss": 1.031, + "num_input_tokens_seen": 57976560, + "step": 3604 + }, + { + "epoch": 0.2525239258539319, + "grad_norm": 3.6392862796783447, + "learning_rate": 7.477131348511384e-05, + "loss": 0.8717, + "num_input_tokens_seen": 57992944, + "step": 3605 + }, + { + "epoch": 0.25259397409966117, + "grad_norm": 3.849912643432617, + "learning_rate": 7.476431523642731e-05, + "loss": 0.994, + "num_input_tokens_seen": 58009328, + "step": 3606 + }, + { + "epoch": 0.25266402234539037, + "grad_norm": 3.5331156253814697, + "learning_rate": 7.475731698774081e-05, + "loss": 0.8999, + "num_input_tokens_seen": 58025152, + "step": 3607 + }, + { + "epoch": 0.25273407059111963, + "grad_norm": 4.343970775604248, + "learning_rate": 7.47503187390543e-05, + "loss": 1.0231, + "num_input_tokens_seen": 58041536, + "step": 3608 + }, + { + "epoch": 0.2528041188368489, + "grad_norm": 3.6736862659454346, + "learning_rate": 7.474332049036778e-05, + "loss": 1.161, + "num_input_tokens_seen": 58057920, + "step": 3609 + }, + { + "epoch": 0.25287416708257815, + "grad_norm": 6.599121570587158, + "learning_rate": 7.473632224168127e-05, + "loss": 1.2235, + "num_input_tokens_seen": 58073784, + "step": 3610 + }, + { + "epoch": 0.25294421532830735, + "grad_norm": 4.2448930740356445, + "learning_rate": 7.472932399299475e-05, + "loss": 1.0207, + "num_input_tokens_seen": 58088776, + "step": 3611 + }, + { + "epoch": 0.2530142635740366, + "grad_norm": 3.416584014892578, + "learning_rate": 7.472232574430823e-05, + "loss": 0.984, + "num_input_tokens_seen": 58105160, + "step": 3612 + }, + { + "epoch": 0.2530843118197659, + "grad_norm": 3.9348700046539307, + "learning_rate": 7.471532749562172e-05, + "loss": 1.0883, + "num_input_tokens_seen": 58121528, + "step": 3613 + }, + { + "epoch": 0.25315436006549513, + "grad_norm": 6.208236217498779, + "learning_rate": 7.470832924693521e-05, + "loss": 1.1842, + "num_input_tokens_seen": 58137912, + "step": 3614 + }, + { + "epoch": 0.25322440831122434, + "grad_norm": 3.9069888591766357, + "learning_rate": 7.47013309982487e-05, + "loss": 0.9958, + "num_input_tokens_seen": 58154056, + "step": 3615 + }, + { + "epoch": 0.2532944565569536, + "grad_norm": 4.482925891876221, + "learning_rate": 7.469433274956217e-05, + "loss": 1.0365, + "num_input_tokens_seen": 58168904, + "step": 3616 + }, + { + "epoch": 0.25336450480268286, + "grad_norm": 4.082488536834717, + "learning_rate": 7.468733450087566e-05, + "loss": 0.9116, + "num_input_tokens_seen": 58185288, + "step": 3617 + }, + { + "epoch": 0.2534345530484121, + "grad_norm": 5.994426250457764, + "learning_rate": 7.468033625218915e-05, + "loss": 1.1286, + "num_input_tokens_seen": 58201600, + "step": 3618 + }, + { + "epoch": 0.2535046012941413, + "grad_norm": 3.966487169265747, + "learning_rate": 7.467333800350262e-05, + "loss": 1.061, + "num_input_tokens_seen": 58217752, + "step": 3619 + }, + { + "epoch": 0.2535746495398706, + "grad_norm": 4.3370537757873535, + "learning_rate": 7.466633975481611e-05, + "loss": 0.9495, + "num_input_tokens_seen": 58233672, + "step": 3620 + }, + { + "epoch": 0.25364469778559984, + "grad_norm": 4.638936519622803, + "learning_rate": 7.465934150612961e-05, + "loss": 1.1593, + "num_input_tokens_seen": 58249904, + "step": 3621 + }, + { + "epoch": 0.2537147460313291, + "grad_norm": 3.42993426322937, + "learning_rate": 7.465234325744309e-05, + "loss": 0.9112, + "num_input_tokens_seen": 58265272, + "step": 3622 + }, + { + "epoch": 0.2537847942770583, + "grad_norm": 4.637670516967773, + "learning_rate": 7.464534500875656e-05, + "loss": 1.1578, + "num_input_tokens_seen": 58281656, + "step": 3623 + }, + { + "epoch": 0.25385484252278756, + "grad_norm": 4.470972061157227, + "learning_rate": 7.463834676007005e-05, + "loss": 0.9973, + "num_input_tokens_seen": 58297696, + "step": 3624 + }, + { + "epoch": 0.2539248907685168, + "grad_norm": 4.158536434173584, + "learning_rate": 7.463134851138354e-05, + "loss": 1.2625, + "num_input_tokens_seen": 58313960, + "step": 3625 + }, + { + "epoch": 0.2539949390142461, + "grad_norm": 5.2940850257873535, + "learning_rate": 7.462435026269702e-05, + "loss": 1.1649, + "num_input_tokens_seen": 58329928, + "step": 3626 + }, + { + "epoch": 0.2540649872599753, + "grad_norm": 4.270470142364502, + "learning_rate": 7.461735201401052e-05, + "loss": 0.9042, + "num_input_tokens_seen": 58345544, + "step": 3627 + }, + { + "epoch": 0.25413503550570454, + "grad_norm": 4.488008975982666, + "learning_rate": 7.4610353765324e-05, + "loss": 1.2652, + "num_input_tokens_seen": 58361736, + "step": 3628 + }, + { + "epoch": 0.2542050837514338, + "grad_norm": 3.9760642051696777, + "learning_rate": 7.460335551663748e-05, + "loss": 0.9522, + "num_input_tokens_seen": 58377888, + "step": 3629 + }, + { + "epoch": 0.25427513199716306, + "grad_norm": 4.022678852081299, + "learning_rate": 7.459635726795097e-05, + "loss": 1.0673, + "num_input_tokens_seen": 58393744, + "step": 3630 + }, + { + "epoch": 0.25434518024289227, + "grad_norm": 6.345690727233887, + "learning_rate": 7.458935901926446e-05, + "loss": 1.052, + "num_input_tokens_seen": 58410064, + "step": 3631 + }, + { + "epoch": 0.2544152284886215, + "grad_norm": 4.0159101486206055, + "learning_rate": 7.458236077057793e-05, + "loss": 1.1164, + "num_input_tokens_seen": 58426352, + "step": 3632 + }, + { + "epoch": 0.2544852767343508, + "grad_norm": 4.125208854675293, + "learning_rate": 7.457536252189142e-05, + "loss": 1.0113, + "num_input_tokens_seen": 58441936, + "step": 3633 + }, + { + "epoch": 0.25455532498008004, + "grad_norm": 4.429535865783691, + "learning_rate": 7.456836427320491e-05, + "loss": 1.158, + "num_input_tokens_seen": 58457136, + "step": 3634 + }, + { + "epoch": 0.2546253732258093, + "grad_norm": 3.655606269836426, + "learning_rate": 7.45613660245184e-05, + "loss": 1.0467, + "num_input_tokens_seen": 58473520, + "step": 3635 + }, + { + "epoch": 0.2546954214715385, + "grad_norm": 3.688188314437866, + "learning_rate": 7.455436777583187e-05, + "loss": 0.9309, + "num_input_tokens_seen": 58489904, + "step": 3636 + }, + { + "epoch": 0.25476546971726777, + "grad_norm": 3.95440411567688, + "learning_rate": 7.454736952714536e-05, + "loss": 1.2586, + "num_input_tokens_seen": 58506032, + "step": 3637 + }, + { + "epoch": 0.254835517962997, + "grad_norm": 3.950641632080078, + "learning_rate": 7.454037127845885e-05, + "loss": 0.9397, + "num_input_tokens_seen": 58521464, + "step": 3638 + }, + { + "epoch": 0.2549055662087263, + "grad_norm": 4.9607038497924805, + "learning_rate": 7.453337302977233e-05, + "loss": 1.0498, + "num_input_tokens_seen": 58537848, + "step": 3639 + }, + { + "epoch": 0.2549756144544555, + "grad_norm": 3.4168713092803955, + "learning_rate": 7.452637478108582e-05, + "loss": 0.8983, + "num_input_tokens_seen": 58554232, + "step": 3640 + }, + { + "epoch": 0.25504566270018475, + "grad_norm": 6.897549152374268, + "learning_rate": 7.451937653239932e-05, + "loss": 1.2782, + "num_input_tokens_seen": 58570616, + "step": 3641 + }, + { + "epoch": 0.255115710945914, + "grad_norm": 4.009060859680176, + "learning_rate": 7.451237828371279e-05, + "loss": 1.0205, + "num_input_tokens_seen": 58587000, + "step": 3642 + }, + { + "epoch": 0.25518575919164327, + "grad_norm": 4.245255470275879, + "learning_rate": 7.450538003502627e-05, + "loss": 0.98, + "num_input_tokens_seen": 58602768, + "step": 3643 + }, + { + "epoch": 0.2552558074373725, + "grad_norm": 3.7547385692596436, + "learning_rate": 7.449838178633976e-05, + "loss": 1.0763, + "num_input_tokens_seen": 58619024, + "step": 3644 + }, + { + "epoch": 0.25532585568310173, + "grad_norm": 5.7543745040893555, + "learning_rate": 7.449138353765324e-05, + "loss": 1.1535, + "num_input_tokens_seen": 58635408, + "step": 3645 + }, + { + "epoch": 0.255395903928831, + "grad_norm": 3.8786420822143555, + "learning_rate": 7.448438528896672e-05, + "loss": 1.0385, + "num_input_tokens_seen": 58651392, + "step": 3646 + }, + { + "epoch": 0.25546595217456025, + "grad_norm": 4.290858745574951, + "learning_rate": 7.447738704028022e-05, + "loss": 0.9459, + "num_input_tokens_seen": 58667712, + "step": 3647 + }, + { + "epoch": 0.25553600042028946, + "grad_norm": 3.8005576133728027, + "learning_rate": 7.447038879159371e-05, + "loss": 1.1709, + "num_input_tokens_seen": 58683512, + "step": 3648 + }, + { + "epoch": 0.2556060486660187, + "grad_norm": 3.574735403060913, + "learning_rate": 7.446339054290719e-05, + "loss": 1.0276, + "num_input_tokens_seen": 58699296, + "step": 3649 + }, + { + "epoch": 0.255676096911748, + "grad_norm": 4.487549304962158, + "learning_rate": 7.445639229422066e-05, + "loss": 1.0608, + "num_input_tokens_seen": 58715680, + "step": 3650 + }, + { + "epoch": 0.25574614515747723, + "grad_norm": 3.80549955368042, + "learning_rate": 7.444939404553415e-05, + "loss": 1.0916, + "num_input_tokens_seen": 58732064, + "step": 3651 + }, + { + "epoch": 0.25581619340320644, + "grad_norm": 6.745276927947998, + "learning_rate": 7.444239579684764e-05, + "loss": 0.9649, + "num_input_tokens_seen": 58748416, + "step": 3652 + }, + { + "epoch": 0.2558862416489357, + "grad_norm": 5.366410732269287, + "learning_rate": 7.443539754816113e-05, + "loss": 1.1205, + "num_input_tokens_seen": 58764800, + "step": 3653 + }, + { + "epoch": 0.25595628989466496, + "grad_norm": 4.889951705932617, + "learning_rate": 7.442839929947462e-05, + "loss": 1.0447, + "num_input_tokens_seen": 58779776, + "step": 3654 + }, + { + "epoch": 0.2560263381403942, + "grad_norm": 3.776078462600708, + "learning_rate": 7.44214010507881e-05, + "loss": 0.9146, + "num_input_tokens_seen": 58796160, + "step": 3655 + }, + { + "epoch": 0.2560963863861234, + "grad_norm": 4.999850749969482, + "learning_rate": 7.441440280210158e-05, + "loss": 1.053, + "num_input_tokens_seen": 58812544, + "step": 3656 + }, + { + "epoch": 0.2561664346318527, + "grad_norm": 4.111214637756348, + "learning_rate": 7.440740455341507e-05, + "loss": 1.02, + "num_input_tokens_seen": 58828696, + "step": 3657 + }, + { + "epoch": 0.25623648287758194, + "grad_norm": 4.49043083190918, + "learning_rate": 7.440040630472856e-05, + "loss": 0.8889, + "num_input_tokens_seen": 58845080, + "step": 3658 + }, + { + "epoch": 0.2563065311233112, + "grad_norm": 4.440788745880127, + "learning_rate": 7.439340805604203e-05, + "loss": 1.0635, + "num_input_tokens_seen": 58861464, + "step": 3659 + }, + { + "epoch": 0.2563765793690404, + "grad_norm": 5.642586708068848, + "learning_rate": 7.438640980735552e-05, + "loss": 1.3676, + "num_input_tokens_seen": 58877624, + "step": 3660 + }, + { + "epoch": 0.25644662761476966, + "grad_norm": 3.8768467903137207, + "learning_rate": 7.437941155866901e-05, + "loss": 0.9737, + "num_input_tokens_seen": 58894008, + "step": 3661 + }, + { + "epoch": 0.2565166758604989, + "grad_norm": 3.9855473041534424, + "learning_rate": 7.43724133099825e-05, + "loss": 1.0987, + "num_input_tokens_seen": 58909600, + "step": 3662 + }, + { + "epoch": 0.2565867241062282, + "grad_norm": 3.6692938804626465, + "learning_rate": 7.436541506129597e-05, + "loss": 1.0541, + "num_input_tokens_seen": 58925776, + "step": 3663 + }, + { + "epoch": 0.2566567723519574, + "grad_norm": 3.87776517868042, + "learning_rate": 7.435841681260946e-05, + "loss": 1.0616, + "num_input_tokens_seen": 58941048, + "step": 3664 + }, + { + "epoch": 0.25672682059768664, + "grad_norm": 3.5173263549804688, + "learning_rate": 7.435141856392295e-05, + "loss": 0.9046, + "num_input_tokens_seen": 58957432, + "step": 3665 + }, + { + "epoch": 0.2567968688434159, + "grad_norm": 4.312611103057861, + "learning_rate": 7.434442031523642e-05, + "loss": 0.8224, + "num_input_tokens_seen": 58973816, + "step": 3666 + }, + { + "epoch": 0.25686691708914516, + "grad_norm": 3.7889907360076904, + "learning_rate": 7.433742206654991e-05, + "loss": 1.1431, + "num_input_tokens_seen": 58989472, + "step": 3667 + }, + { + "epoch": 0.25693696533487437, + "grad_norm": 4.997755527496338, + "learning_rate": 7.433042381786341e-05, + "loss": 1.2147, + "num_input_tokens_seen": 59005856, + "step": 3668 + }, + { + "epoch": 0.2570070135806036, + "grad_norm": 5.839511871337891, + "learning_rate": 7.432342556917689e-05, + "loss": 1.0974, + "num_input_tokens_seen": 59022176, + "step": 3669 + }, + { + "epoch": 0.2570770618263329, + "grad_norm": 4.185897350311279, + "learning_rate": 7.431642732049036e-05, + "loss": 1.0769, + "num_input_tokens_seen": 59038296, + "step": 3670 + }, + { + "epoch": 0.25714711007206215, + "grad_norm": 3.6666383743286133, + "learning_rate": 7.430942907180385e-05, + "loss": 1.0051, + "num_input_tokens_seen": 59054680, + "step": 3671 + }, + { + "epoch": 0.2572171583177914, + "grad_norm": 3.8587453365325928, + "learning_rate": 7.430243082311734e-05, + "loss": 1.1001, + "num_input_tokens_seen": 59070912, + "step": 3672 + }, + { + "epoch": 0.2572872065635206, + "grad_norm": 3.6518352031707764, + "learning_rate": 7.429543257443083e-05, + "loss": 1.042, + "num_input_tokens_seen": 59087296, + "step": 3673 + }, + { + "epoch": 0.25735725480924987, + "grad_norm": 4.629798412322998, + "learning_rate": 7.428843432574432e-05, + "loss": 1.2649, + "num_input_tokens_seen": 59103632, + "step": 3674 + }, + { + "epoch": 0.25742730305497913, + "grad_norm": 6.353034496307373, + "learning_rate": 7.428143607705781e-05, + "loss": 1.3823, + "num_input_tokens_seen": 59120016, + "step": 3675 + }, + { + "epoch": 0.2574973513007084, + "grad_norm": 6.1848273277282715, + "learning_rate": 7.427443782837128e-05, + "loss": 1.2275, + "num_input_tokens_seen": 59136232, + "step": 3676 + }, + { + "epoch": 0.2575673995464376, + "grad_norm": 3.6022186279296875, + "learning_rate": 7.426743957968476e-05, + "loss": 0.9513, + "num_input_tokens_seen": 59152616, + "step": 3677 + }, + { + "epoch": 0.25763744779216685, + "grad_norm": 3.6495468616485596, + "learning_rate": 7.426044133099825e-05, + "loss": 1.0282, + "num_input_tokens_seen": 59167792, + "step": 3678 + }, + { + "epoch": 0.2577074960378961, + "grad_norm": 4.675189018249512, + "learning_rate": 7.425344308231174e-05, + "loss": 1.1248, + "num_input_tokens_seen": 59184176, + "step": 3679 + }, + { + "epoch": 0.25777754428362537, + "grad_norm": 3.657700538635254, + "learning_rate": 7.424644483362522e-05, + "loss": 1.0445, + "num_input_tokens_seen": 59199632, + "step": 3680 + }, + { + "epoch": 0.2578475925293546, + "grad_norm": 3.9934394359588623, + "learning_rate": 7.423944658493871e-05, + "loss": 1.0598, + "num_input_tokens_seen": 59215720, + "step": 3681 + }, + { + "epoch": 0.25791764077508383, + "grad_norm": 3.777191400527954, + "learning_rate": 7.42324483362522e-05, + "loss": 1.2255, + "num_input_tokens_seen": 59231248, + "step": 3682 + }, + { + "epoch": 0.2579876890208131, + "grad_norm": 3.9812276363372803, + "learning_rate": 7.422545008756568e-05, + "loss": 1.1253, + "num_input_tokens_seen": 59247280, + "step": 3683 + }, + { + "epoch": 0.25805773726654235, + "grad_norm": 3.631455183029175, + "learning_rate": 7.421845183887916e-05, + "loss": 1.0559, + "num_input_tokens_seen": 59263664, + "step": 3684 + }, + { + "epoch": 0.25812778551227156, + "grad_norm": 3.803898334503174, + "learning_rate": 7.421145359019265e-05, + "loss": 0.9847, + "num_input_tokens_seen": 59279880, + "step": 3685 + }, + { + "epoch": 0.2581978337580008, + "grad_norm": 3.649956703186035, + "learning_rate": 7.420445534150613e-05, + "loss": 0.951, + "num_input_tokens_seen": 59296216, + "step": 3686 + }, + { + "epoch": 0.2582678820037301, + "grad_norm": 4.010924339294434, + "learning_rate": 7.419745709281962e-05, + "loss": 1.1987, + "num_input_tokens_seen": 59312448, + "step": 3687 + }, + { + "epoch": 0.25833793024945934, + "grad_norm": 4.2410759925842285, + "learning_rate": 7.41904588441331e-05, + "loss": 0.9677, + "num_input_tokens_seen": 59328456, + "step": 3688 + }, + { + "epoch": 0.25840797849518854, + "grad_norm": 3.9170684814453125, + "learning_rate": 7.41834605954466e-05, + "loss": 1.0795, + "num_input_tokens_seen": 59344840, + "step": 3689 + }, + { + "epoch": 0.2584780267409178, + "grad_norm": 3.935624837875366, + "learning_rate": 7.417646234676007e-05, + "loss": 1.1149, + "num_input_tokens_seen": 59360040, + "step": 3690 + }, + { + "epoch": 0.25854807498664706, + "grad_norm": 4.3747782707214355, + "learning_rate": 7.416946409807356e-05, + "loss": 1.2462, + "num_input_tokens_seen": 59375896, + "step": 3691 + }, + { + "epoch": 0.2586181232323763, + "grad_norm": 7.553433418273926, + "learning_rate": 7.416246584938705e-05, + "loss": 1.4753, + "num_input_tokens_seen": 59391144, + "step": 3692 + }, + { + "epoch": 0.2586881714781055, + "grad_norm": 3.4443981647491455, + "learning_rate": 7.415546760070053e-05, + "loss": 1.0629, + "num_input_tokens_seen": 59407528, + "step": 3693 + }, + { + "epoch": 0.2587582197238348, + "grad_norm": 4.02165412902832, + "learning_rate": 7.414846935201401e-05, + "loss": 0.9951, + "num_input_tokens_seen": 59422824, + "step": 3694 + }, + { + "epoch": 0.25882826796956404, + "grad_norm": 3.8880200386047363, + "learning_rate": 7.414147110332751e-05, + "loss": 0.941, + "num_input_tokens_seen": 59439208, + "step": 3695 + }, + { + "epoch": 0.2588983162152933, + "grad_norm": 5.463441371917725, + "learning_rate": 7.413447285464099e-05, + "loss": 0.9333, + "num_input_tokens_seen": 59455592, + "step": 3696 + }, + { + "epoch": 0.2589683644610225, + "grad_norm": 7.555225372314453, + "learning_rate": 7.412747460595446e-05, + "loss": 1.2278, + "num_input_tokens_seen": 59471976, + "step": 3697 + }, + { + "epoch": 0.25903841270675176, + "grad_norm": 5.7154436111450195, + "learning_rate": 7.412047635726795e-05, + "loss": 1.151, + "num_input_tokens_seen": 59488360, + "step": 3698 + }, + { + "epoch": 0.259108460952481, + "grad_norm": 5.09559965133667, + "learning_rate": 7.411347810858144e-05, + "loss": 1.0998, + "num_input_tokens_seen": 59504536, + "step": 3699 + }, + { + "epoch": 0.2591785091982103, + "grad_norm": 4.7749738693237305, + "learning_rate": 7.410647985989493e-05, + "loss": 1.2971, + "num_input_tokens_seen": 59520488, + "step": 3700 + }, + { + "epoch": 0.2592485574439395, + "grad_norm": 4.323631763458252, + "learning_rate": 7.409948161120842e-05, + "loss": 1.1687, + "num_input_tokens_seen": 59535384, + "step": 3701 + }, + { + "epoch": 0.25931860568966875, + "grad_norm": 3.511822462081909, + "learning_rate": 7.40924833625219e-05, + "loss": 1.0547, + "num_input_tokens_seen": 59550888, + "step": 3702 + }, + { + "epoch": 0.259388653935398, + "grad_norm": 4.039402008056641, + "learning_rate": 7.408548511383538e-05, + "loss": 0.8453, + "num_input_tokens_seen": 59567184, + "step": 3703 + }, + { + "epoch": 0.25945870218112727, + "grad_norm": 3.6692605018615723, + "learning_rate": 7.407848686514885e-05, + "loss": 0.9705, + "num_input_tokens_seen": 59583568, + "step": 3704 + }, + { + "epoch": 0.2595287504268565, + "grad_norm": 4.414707660675049, + "learning_rate": 7.407148861646234e-05, + "loss": 0.8734, + "num_input_tokens_seen": 59599088, + "step": 3705 + }, + { + "epoch": 0.25959879867258573, + "grad_norm": 4.073670387268066, + "learning_rate": 7.406449036777583e-05, + "loss": 1.2958, + "num_input_tokens_seen": 59615432, + "step": 3706 + }, + { + "epoch": 0.259668846918315, + "grad_norm": 4.436419486999512, + "learning_rate": 7.405749211908932e-05, + "loss": 1.0019, + "num_input_tokens_seen": 59631816, + "step": 3707 + }, + { + "epoch": 0.25973889516404425, + "grad_norm": 5.866218090057373, + "learning_rate": 7.405049387040281e-05, + "loss": 1.043, + "num_input_tokens_seen": 59648200, + "step": 3708 + }, + { + "epoch": 0.2598089434097735, + "grad_norm": 4.133188247680664, + "learning_rate": 7.40434956217163e-05, + "loss": 1.1168, + "num_input_tokens_seen": 59664584, + "step": 3709 + }, + { + "epoch": 0.2598789916555027, + "grad_norm": 4.1976213455200195, + "learning_rate": 7.403649737302977e-05, + "loss": 1.1118, + "num_input_tokens_seen": 59680288, + "step": 3710 + }, + { + "epoch": 0.25994903990123197, + "grad_norm": 3.990983009338379, + "learning_rate": 7.402949912434326e-05, + "loss": 0.9963, + "num_input_tokens_seen": 59696408, + "step": 3711 + }, + { + "epoch": 0.26001908814696123, + "grad_norm": 4.427793025970459, + "learning_rate": 7.402250087565675e-05, + "loss": 1.1771, + "num_input_tokens_seen": 59712792, + "step": 3712 + }, + { + "epoch": 0.2600891363926905, + "grad_norm": 5.360867023468018, + "learning_rate": 7.401550262697024e-05, + "loss": 1.1428, + "num_input_tokens_seen": 59728968, + "step": 3713 + }, + { + "epoch": 0.2601591846384197, + "grad_norm": 3.8442916870117188, + "learning_rate": 7.400850437828371e-05, + "loss": 0.9544, + "num_input_tokens_seen": 59745352, + "step": 3714 + }, + { + "epoch": 0.26022923288414895, + "grad_norm": 3.7610833644866943, + "learning_rate": 7.40015061295972e-05, + "loss": 0.9969, + "num_input_tokens_seen": 59761736, + "step": 3715 + }, + { + "epoch": 0.2602992811298782, + "grad_norm": 3.9050705432891846, + "learning_rate": 7.399450788091069e-05, + "loss": 1.2099, + "num_input_tokens_seen": 59778000, + "step": 3716 + }, + { + "epoch": 0.2603693293756075, + "grad_norm": 4.293839454650879, + "learning_rate": 7.398750963222417e-05, + "loss": 1.0274, + "num_input_tokens_seen": 59794216, + "step": 3717 + }, + { + "epoch": 0.2604393776213367, + "grad_norm": 3.7403993606567383, + "learning_rate": 7.398051138353765e-05, + "loss": 1.0172, + "num_input_tokens_seen": 59810600, + "step": 3718 + }, + { + "epoch": 0.26050942586706594, + "grad_norm": 5.266970157623291, + "learning_rate": 7.397351313485114e-05, + "loss": 0.8695, + "num_input_tokens_seen": 59826984, + "step": 3719 + }, + { + "epoch": 0.2605794741127952, + "grad_norm": 4.385645866394043, + "learning_rate": 7.396651488616463e-05, + "loss": 1.0625, + "num_input_tokens_seen": 59843368, + "step": 3720 + }, + { + "epoch": 0.26064952235852445, + "grad_norm": 4.349147796630859, + "learning_rate": 7.39595166374781e-05, + "loss": 1.2092, + "num_input_tokens_seen": 59859136, + "step": 3721 + }, + { + "epoch": 0.26071957060425366, + "grad_norm": 4.69277286529541, + "learning_rate": 7.395251838879161e-05, + "loss": 1.1171, + "num_input_tokens_seen": 59875024, + "step": 3722 + }, + { + "epoch": 0.2607896188499829, + "grad_norm": 3.602949857711792, + "learning_rate": 7.394552014010508e-05, + "loss": 1.0994, + "num_input_tokens_seen": 59891408, + "step": 3723 + }, + { + "epoch": 0.2608596670957122, + "grad_norm": 4.137026786804199, + "learning_rate": 7.393852189141856e-05, + "loss": 1.0414, + "num_input_tokens_seen": 59906360, + "step": 3724 + }, + { + "epoch": 0.26092971534144144, + "grad_norm": 4.558672904968262, + "learning_rate": 7.393152364273205e-05, + "loss": 1.2051, + "num_input_tokens_seen": 59922744, + "step": 3725 + }, + { + "epoch": 0.26099976358717064, + "grad_norm": 3.977217197418213, + "learning_rate": 7.392452539404554e-05, + "loss": 0.9036, + "num_input_tokens_seen": 59938448, + "step": 3726 + }, + { + "epoch": 0.2610698118328999, + "grad_norm": 6.573578834533691, + "learning_rate": 7.391752714535902e-05, + "loss": 0.9693, + "num_input_tokens_seen": 59954832, + "step": 3727 + }, + { + "epoch": 0.26113986007862916, + "grad_norm": 4.253365516662598, + "learning_rate": 7.391052889667251e-05, + "loss": 1.1001, + "num_input_tokens_seen": 59971216, + "step": 3728 + }, + { + "epoch": 0.2612099083243584, + "grad_norm": 4.279355525970459, + "learning_rate": 7.3903530647986e-05, + "loss": 1.0456, + "num_input_tokens_seen": 59987384, + "step": 3729 + }, + { + "epoch": 0.2612799565700876, + "grad_norm": 5.5035505294799805, + "learning_rate": 7.389653239929948e-05, + "loss": 1.236, + "num_input_tokens_seen": 60003720, + "step": 3730 + }, + { + "epoch": 0.2613500048158169, + "grad_norm": 5.064812660217285, + "learning_rate": 7.388953415061295e-05, + "loss": 0.8739, + "num_input_tokens_seen": 60020104, + "step": 3731 + }, + { + "epoch": 0.26142005306154614, + "grad_norm": 4.716748237609863, + "learning_rate": 7.388253590192644e-05, + "loss": 1.2417, + "num_input_tokens_seen": 60036488, + "step": 3732 + }, + { + "epoch": 0.2614901013072754, + "grad_norm": 4.0947489738464355, + "learning_rate": 7.387553765323994e-05, + "loss": 1.1332, + "num_input_tokens_seen": 60052384, + "step": 3733 + }, + { + "epoch": 0.2615601495530046, + "grad_norm": 3.757126808166504, + "learning_rate": 7.386853940455342e-05, + "loss": 1.0442, + "num_input_tokens_seen": 60068624, + "step": 3734 + }, + { + "epoch": 0.26163019779873387, + "grad_norm": 7.364987850189209, + "learning_rate": 7.38615411558669e-05, + "loss": 1.0285, + "num_input_tokens_seen": 60084248, + "step": 3735 + }, + { + "epoch": 0.2617002460444631, + "grad_norm": 4.630516052246094, + "learning_rate": 7.38545429071804e-05, + "loss": 1.1585, + "num_input_tokens_seen": 60100632, + "step": 3736 + }, + { + "epoch": 0.2617702942901924, + "grad_norm": 5.3436760902404785, + "learning_rate": 7.384754465849387e-05, + "loss": 0.9723, + "num_input_tokens_seen": 60116672, + "step": 3737 + }, + { + "epoch": 0.2618403425359216, + "grad_norm": 3.843344211578369, + "learning_rate": 7.384054640980736e-05, + "loss": 0.8992, + "num_input_tokens_seen": 60133056, + "step": 3738 + }, + { + "epoch": 0.26191039078165085, + "grad_norm": 4.561652183532715, + "learning_rate": 7.383354816112085e-05, + "loss": 1.2304, + "num_input_tokens_seen": 60149440, + "step": 3739 + }, + { + "epoch": 0.2619804390273801, + "grad_norm": 3.951719045639038, + "learning_rate": 7.382654991243434e-05, + "loss": 0.8449, + "num_input_tokens_seen": 60165824, + "step": 3740 + }, + { + "epoch": 0.26205048727310937, + "grad_norm": 3.702449321746826, + "learning_rate": 7.381955166374781e-05, + "loss": 1.1251, + "num_input_tokens_seen": 60181496, + "step": 3741 + }, + { + "epoch": 0.2621205355188386, + "grad_norm": 5.43525505065918, + "learning_rate": 7.38125534150613e-05, + "loss": 1.1107, + "num_input_tokens_seen": 60197040, + "step": 3742 + }, + { + "epoch": 0.26219058376456783, + "grad_norm": 3.9709503650665283, + "learning_rate": 7.380555516637479e-05, + "loss": 1.1172, + "num_input_tokens_seen": 60213424, + "step": 3743 + }, + { + "epoch": 0.2622606320102971, + "grad_norm": 3.7183797359466553, + "learning_rate": 7.379855691768826e-05, + "loss": 1.0234, + "num_input_tokens_seen": 60229696, + "step": 3744 + }, + { + "epoch": 0.26233068025602635, + "grad_norm": 3.933479070663452, + "learning_rate": 7.379155866900175e-05, + "loss": 1.0702, + "num_input_tokens_seen": 60246080, + "step": 3745 + }, + { + "epoch": 0.2624007285017556, + "grad_norm": 4.837695598602295, + "learning_rate": 7.378456042031524e-05, + "loss": 1.1017, + "num_input_tokens_seen": 60262464, + "step": 3746 + }, + { + "epoch": 0.2624707767474848, + "grad_norm": 4.791194438934326, + "learning_rate": 7.377756217162873e-05, + "loss": 1.2467, + "num_input_tokens_seen": 60278600, + "step": 3747 + }, + { + "epoch": 0.2625408249932141, + "grad_norm": 4.53259801864624, + "learning_rate": 7.37705639229422e-05, + "loss": 1.1742, + "num_input_tokens_seen": 60293856, + "step": 3748 + }, + { + "epoch": 0.26261087323894333, + "grad_norm": 3.87522554397583, + "learning_rate": 7.37635656742557e-05, + "loss": 1.1901, + "num_input_tokens_seen": 60309888, + "step": 3749 + }, + { + "epoch": 0.2626809214846726, + "grad_norm": 4.46868896484375, + "learning_rate": 7.375656742556918e-05, + "loss": 0.9265, + "num_input_tokens_seen": 60325784, + "step": 3750 + }, + { + "epoch": 0.2627509697304018, + "grad_norm": 3.938703775405884, + "learning_rate": 7.374956917688266e-05, + "loss": 0.9785, + "num_input_tokens_seen": 60340696, + "step": 3751 + }, + { + "epoch": 0.26282101797613105, + "grad_norm": 3.5147759914398193, + "learning_rate": 7.374257092819614e-05, + "loss": 1.0984, + "num_input_tokens_seen": 60357080, + "step": 3752 + }, + { + "epoch": 0.2628910662218603, + "grad_norm": 4.008304119110107, + "learning_rate": 7.373557267950965e-05, + "loss": 1.046, + "num_input_tokens_seen": 60373464, + "step": 3753 + }, + { + "epoch": 0.2629611144675896, + "grad_norm": 3.9318859577178955, + "learning_rate": 7.372857443082312e-05, + "loss": 1.001, + "num_input_tokens_seen": 60389848, + "step": 3754 + }, + { + "epoch": 0.2630311627133188, + "grad_norm": 4.046808242797852, + "learning_rate": 7.372157618213661e-05, + "loss": 1.0768, + "num_input_tokens_seen": 60406232, + "step": 3755 + }, + { + "epoch": 0.26310121095904804, + "grad_norm": 5.451204299926758, + "learning_rate": 7.37145779334501e-05, + "loss": 0.9567, + "num_input_tokens_seen": 60422544, + "step": 3756 + }, + { + "epoch": 0.2631712592047773, + "grad_norm": 4.395990371704102, + "learning_rate": 7.370757968476357e-05, + "loss": 0.9173, + "num_input_tokens_seen": 60438312, + "step": 3757 + }, + { + "epoch": 0.26324130745050656, + "grad_norm": 5.997600078582764, + "learning_rate": 7.370058143607705e-05, + "loss": 1.049, + "num_input_tokens_seen": 60454696, + "step": 3758 + }, + { + "epoch": 0.26331135569623576, + "grad_norm": 5.588560104370117, + "learning_rate": 7.369358318739055e-05, + "loss": 0.9015, + "num_input_tokens_seen": 60470232, + "step": 3759 + }, + { + "epoch": 0.263381403941965, + "grad_norm": 3.2995078563690186, + "learning_rate": 7.368658493870404e-05, + "loss": 0.9814, + "num_input_tokens_seen": 60486224, + "step": 3760 + }, + { + "epoch": 0.2634514521876943, + "grad_norm": 4.141932964324951, + "learning_rate": 7.367958669001751e-05, + "loss": 1.0069, + "num_input_tokens_seen": 60502608, + "step": 3761 + }, + { + "epoch": 0.26352150043342354, + "grad_norm": 5.010983943939209, + "learning_rate": 7.3672588441331e-05, + "loss": 1.1533, + "num_input_tokens_seen": 60518672, + "step": 3762 + }, + { + "epoch": 0.26359154867915274, + "grad_norm": 3.555612802505493, + "learning_rate": 7.366559019264449e-05, + "loss": 1.1037, + "num_input_tokens_seen": 60534408, + "step": 3763 + }, + { + "epoch": 0.263661596924882, + "grad_norm": 4.006901264190674, + "learning_rate": 7.365859194395797e-05, + "loss": 1.0086, + "num_input_tokens_seen": 60550760, + "step": 3764 + }, + { + "epoch": 0.26373164517061126, + "grad_norm": 5.055272579193115, + "learning_rate": 7.365159369527146e-05, + "loss": 0.9645, + "num_input_tokens_seen": 60567144, + "step": 3765 + }, + { + "epoch": 0.2638016934163405, + "grad_norm": 3.860630989074707, + "learning_rate": 7.364459544658494e-05, + "loss": 1.0371, + "num_input_tokens_seen": 60583528, + "step": 3766 + }, + { + "epoch": 0.2638717416620697, + "grad_norm": 4.644535541534424, + "learning_rate": 7.363759719789843e-05, + "loss": 1.1461, + "num_input_tokens_seen": 60599912, + "step": 3767 + }, + { + "epoch": 0.263941789907799, + "grad_norm": 3.7196872234344482, + "learning_rate": 7.363059894921191e-05, + "loss": 1.1025, + "num_input_tokens_seen": 60616296, + "step": 3768 + }, + { + "epoch": 0.26401183815352824, + "grad_norm": 4.477166175842285, + "learning_rate": 7.36236007005254e-05, + "loss": 1.2221, + "num_input_tokens_seen": 60631760, + "step": 3769 + }, + { + "epoch": 0.2640818863992575, + "grad_norm": 4.906933784484863, + "learning_rate": 7.361660245183889e-05, + "loss": 0.9398, + "num_input_tokens_seen": 60648144, + "step": 3770 + }, + { + "epoch": 0.2641519346449867, + "grad_norm": 3.784450054168701, + "learning_rate": 7.360960420315236e-05, + "loss": 0.9521, + "num_input_tokens_seen": 60664528, + "step": 3771 + }, + { + "epoch": 0.26422198289071597, + "grad_norm": 4.5654191970825195, + "learning_rate": 7.360260595446585e-05, + "loss": 0.9199, + "num_input_tokens_seen": 60680912, + "step": 3772 + }, + { + "epoch": 0.2642920311364452, + "grad_norm": 3.965175151824951, + "learning_rate": 7.359560770577934e-05, + "loss": 0.9469, + "num_input_tokens_seen": 60697296, + "step": 3773 + }, + { + "epoch": 0.2643620793821745, + "grad_norm": 5.112542152404785, + "learning_rate": 7.358860945709283e-05, + "loss": 0.946, + "num_input_tokens_seen": 60713328, + "step": 3774 + }, + { + "epoch": 0.26443212762790375, + "grad_norm": 3.8610634803771973, + "learning_rate": 7.35816112084063e-05, + "loss": 1.1243, + "num_input_tokens_seen": 60729712, + "step": 3775 + }, + { + "epoch": 0.26450217587363295, + "grad_norm": 3.794217348098755, + "learning_rate": 7.35746129597198e-05, + "loss": 1.1127, + "num_input_tokens_seen": 60745824, + "step": 3776 + }, + { + "epoch": 0.2645722241193622, + "grad_norm": 3.7547152042388916, + "learning_rate": 7.356761471103328e-05, + "loss": 1.0774, + "num_input_tokens_seen": 60762024, + "step": 3777 + }, + { + "epoch": 0.26464227236509147, + "grad_norm": 3.492917537689209, + "learning_rate": 7.356061646234675e-05, + "loss": 1.0505, + "num_input_tokens_seen": 60778096, + "step": 3778 + }, + { + "epoch": 0.26471232061082073, + "grad_norm": 3.856019973754883, + "learning_rate": 7.355361821366026e-05, + "loss": 0.9716, + "num_input_tokens_seen": 60794480, + "step": 3779 + }, + { + "epoch": 0.26478236885654993, + "grad_norm": 3.68072509765625, + "learning_rate": 7.354661996497374e-05, + "loss": 1.2316, + "num_input_tokens_seen": 60810584, + "step": 3780 + }, + { + "epoch": 0.2648524171022792, + "grad_norm": 4.4739909172058105, + "learning_rate": 7.353962171628722e-05, + "loss": 1.2492, + "num_input_tokens_seen": 60826240, + "step": 3781 + }, + { + "epoch": 0.26492246534800845, + "grad_norm": 5.2342610359191895, + "learning_rate": 7.353262346760071e-05, + "loss": 1.1018, + "num_input_tokens_seen": 60842216, + "step": 3782 + }, + { + "epoch": 0.2649925135937377, + "grad_norm": 4.408970355987549, + "learning_rate": 7.35256252189142e-05, + "loss": 1.0485, + "num_input_tokens_seen": 60857336, + "step": 3783 + }, + { + "epoch": 0.2650625618394669, + "grad_norm": 3.8172199726104736, + "learning_rate": 7.351862697022767e-05, + "loss": 1.1399, + "num_input_tokens_seen": 60873720, + "step": 3784 + }, + { + "epoch": 0.2651326100851962, + "grad_norm": 4.250039100646973, + "learning_rate": 7.351162872154116e-05, + "loss": 1.003, + "num_input_tokens_seen": 60890104, + "step": 3785 + }, + { + "epoch": 0.26520265833092543, + "grad_norm": 4.257120609283447, + "learning_rate": 7.350463047285465e-05, + "loss": 1.0466, + "num_input_tokens_seen": 60906488, + "step": 3786 + }, + { + "epoch": 0.2652727065766547, + "grad_norm": 4.205286026000977, + "learning_rate": 7.349763222416814e-05, + "loss": 1.2149, + "num_input_tokens_seen": 60922872, + "step": 3787 + }, + { + "epoch": 0.2653427548223839, + "grad_norm": 4.304909706115723, + "learning_rate": 7.349063397548161e-05, + "loss": 1.0023, + "num_input_tokens_seen": 60939256, + "step": 3788 + }, + { + "epoch": 0.26541280306811316, + "grad_norm": 4.793664455413818, + "learning_rate": 7.34836357267951e-05, + "loss": 1.0475, + "num_input_tokens_seen": 60955440, + "step": 3789 + }, + { + "epoch": 0.2654828513138424, + "grad_norm": 4.383579730987549, + "learning_rate": 7.347663747810859e-05, + "loss": 1.1924, + "num_input_tokens_seen": 60971824, + "step": 3790 + }, + { + "epoch": 0.2655528995595717, + "grad_norm": 3.9962210655212402, + "learning_rate": 7.346963922942206e-05, + "loss": 1.0429, + "num_input_tokens_seen": 60987168, + "step": 3791 + }, + { + "epoch": 0.2656229478053009, + "grad_norm": 4.356331825256348, + "learning_rate": 7.346264098073555e-05, + "loss": 0.9332, + "num_input_tokens_seen": 61002840, + "step": 3792 + }, + { + "epoch": 0.26569299605103014, + "grad_norm": 5.836807727813721, + "learning_rate": 7.345564273204904e-05, + "loss": 1.205, + "num_input_tokens_seen": 61019224, + "step": 3793 + }, + { + "epoch": 0.2657630442967594, + "grad_norm": 4.778296947479248, + "learning_rate": 7.344864448336253e-05, + "loss": 1.0227, + "num_input_tokens_seen": 61034712, + "step": 3794 + }, + { + "epoch": 0.26583309254248866, + "grad_norm": 6.723006248474121, + "learning_rate": 7.3441646234676e-05, + "loss": 0.955, + "num_input_tokens_seen": 61050328, + "step": 3795 + }, + { + "epoch": 0.26590314078821786, + "grad_norm": 3.773984670639038, + "learning_rate": 7.34346479859895e-05, + "loss": 1.1262, + "num_input_tokens_seen": 61066048, + "step": 3796 + }, + { + "epoch": 0.2659731890339471, + "grad_norm": 3.915708065032959, + "learning_rate": 7.342764973730298e-05, + "loss": 1.1027, + "num_input_tokens_seen": 61082136, + "step": 3797 + }, + { + "epoch": 0.2660432372796764, + "grad_norm": 6.568943977355957, + "learning_rate": 7.342065148861646e-05, + "loss": 1.0457, + "num_input_tokens_seen": 61097216, + "step": 3798 + }, + { + "epoch": 0.26611328552540564, + "grad_norm": 5.0017499923706055, + "learning_rate": 7.341365323992995e-05, + "loss": 1.0194, + "num_input_tokens_seen": 61112344, + "step": 3799 + }, + { + "epoch": 0.26618333377113484, + "grad_norm": 4.1988935470581055, + "learning_rate": 7.340665499124345e-05, + "loss": 1.0794, + "num_input_tokens_seen": 61128728, + "step": 3800 + }, + { + "epoch": 0.26618333377113484, + "eval_loss": 1.1352765560150146, + "eval_runtime": 0.2173, + "eval_samples_per_second": 4.603, + "eval_steps_per_second": 4.603, + "num_input_tokens_seen": 61128728, + "step": 3800 + }, + { + "epoch": 0.2662533820168641, + "grad_norm": 3.991041660308838, + "learning_rate": 7.339965674255692e-05, + "loss": 1.1468, + "num_input_tokens_seen": 61145112, + "step": 3801 + }, + { + "epoch": 0.26632343026259336, + "grad_norm": 4.921470642089844, + "learning_rate": 7.33926584938704e-05, + "loss": 1.1756, + "num_input_tokens_seen": 61160952, + "step": 3802 + }, + { + "epoch": 0.2663934785083226, + "grad_norm": 3.835486888885498, + "learning_rate": 7.33856602451839e-05, + "loss": 0.782, + "num_input_tokens_seen": 61177024, + "step": 3803 + }, + { + "epoch": 0.2664635267540518, + "grad_norm": 4.419501304626465, + "learning_rate": 7.337866199649738e-05, + "loss": 1.0029, + "num_input_tokens_seen": 61193408, + "step": 3804 + }, + { + "epoch": 0.2665335749997811, + "grad_norm": 4.003963947296143, + "learning_rate": 7.337166374781086e-05, + "loss": 1.0805, + "num_input_tokens_seen": 61209792, + "step": 3805 + }, + { + "epoch": 0.26660362324551035, + "grad_norm": 4.115198612213135, + "learning_rate": 7.336466549912435e-05, + "loss": 1.1718, + "num_input_tokens_seen": 61226176, + "step": 3806 + }, + { + "epoch": 0.2666736714912396, + "grad_norm": 3.663464307785034, + "learning_rate": 7.335766725043784e-05, + "loss": 1.1447, + "num_input_tokens_seen": 61242560, + "step": 3807 + }, + { + "epoch": 0.2667437197369688, + "grad_norm": 3.7513012886047363, + "learning_rate": 7.335066900175132e-05, + "loss": 1.1208, + "num_input_tokens_seen": 61258944, + "step": 3808 + }, + { + "epoch": 0.26681376798269807, + "grad_norm": 4.693987846374512, + "learning_rate": 7.33436707530648e-05, + "loss": 1.2823, + "num_input_tokens_seen": 61275048, + "step": 3809 + }, + { + "epoch": 0.26688381622842733, + "grad_norm": 6.161116600036621, + "learning_rate": 7.333667250437829e-05, + "loss": 1.1606, + "num_input_tokens_seen": 61291368, + "step": 3810 + }, + { + "epoch": 0.2669538644741566, + "grad_norm": 5.942180633544922, + "learning_rate": 7.332967425569177e-05, + "loss": 1.2382, + "num_input_tokens_seen": 61307680, + "step": 3811 + }, + { + "epoch": 0.26702391271988585, + "grad_norm": 4.940249443054199, + "learning_rate": 7.332267600700526e-05, + "loss": 1.0407, + "num_input_tokens_seen": 61324064, + "step": 3812 + }, + { + "epoch": 0.26709396096561505, + "grad_norm": 5.384439468383789, + "learning_rate": 7.331567775831875e-05, + "loss": 1.007, + "num_input_tokens_seen": 61340416, + "step": 3813 + }, + { + "epoch": 0.2671640092113443, + "grad_norm": 5.4137959480285645, + "learning_rate": 7.330867950963223e-05, + "loss": 1.0485, + "num_input_tokens_seen": 61356800, + "step": 3814 + }, + { + "epoch": 0.26723405745707357, + "grad_norm": 5.492247581481934, + "learning_rate": 7.330168126094571e-05, + "loss": 1.1623, + "num_input_tokens_seen": 61371736, + "step": 3815 + }, + { + "epoch": 0.26730410570280283, + "grad_norm": 5.316330909729004, + "learning_rate": 7.32946830122592e-05, + "loss": 1.0147, + "num_input_tokens_seen": 61388120, + "step": 3816 + }, + { + "epoch": 0.26737415394853203, + "grad_norm": 3.976797103881836, + "learning_rate": 7.328768476357269e-05, + "loss": 1.1049, + "num_input_tokens_seen": 61403672, + "step": 3817 + }, + { + "epoch": 0.2674442021942613, + "grad_norm": 7.333898544311523, + "learning_rate": 7.328068651488616e-05, + "loss": 1.0696, + "num_input_tokens_seen": 61420056, + "step": 3818 + }, + { + "epoch": 0.26751425043999055, + "grad_norm": 3.795746088027954, + "learning_rate": 7.327368826619965e-05, + "loss": 1.0545, + "num_input_tokens_seen": 61436440, + "step": 3819 + }, + { + "epoch": 0.2675842986857198, + "grad_norm": 6.624248027801514, + "learning_rate": 7.326669001751315e-05, + "loss": 1.0736, + "num_input_tokens_seen": 61452824, + "step": 3820 + }, + { + "epoch": 0.267654346931449, + "grad_norm": 4.991429805755615, + "learning_rate": 7.325969176882663e-05, + "loss": 1.0681, + "num_input_tokens_seen": 61469208, + "step": 3821 + }, + { + "epoch": 0.2677243951771783, + "grad_norm": 3.8505215644836426, + "learning_rate": 7.32526935201401e-05, + "loss": 1.0217, + "num_input_tokens_seen": 61485592, + "step": 3822 + }, + { + "epoch": 0.26779444342290754, + "grad_norm": 3.7079288959503174, + "learning_rate": 7.324569527145359e-05, + "loss": 1.049, + "num_input_tokens_seen": 61501976, + "step": 3823 + }, + { + "epoch": 0.2678644916686368, + "grad_norm": 3.8987131118774414, + "learning_rate": 7.323869702276708e-05, + "loss": 1.0152, + "num_input_tokens_seen": 61518360, + "step": 3824 + }, + { + "epoch": 0.267934539914366, + "grad_norm": 4.0447516441345215, + "learning_rate": 7.323169877408055e-05, + "loss": 1.0604, + "num_input_tokens_seen": 61534744, + "step": 3825 + }, + { + "epoch": 0.26800458816009526, + "grad_norm": 4.089504241943359, + "learning_rate": 7.322470052539406e-05, + "loss": 1.119, + "num_input_tokens_seen": 61551128, + "step": 3826 + }, + { + "epoch": 0.2680746364058245, + "grad_norm": 3.864943265914917, + "learning_rate": 7.321770227670754e-05, + "loss": 1.002, + "num_input_tokens_seen": 61566872, + "step": 3827 + }, + { + "epoch": 0.2681446846515538, + "grad_norm": 4.649239540100098, + "learning_rate": 7.321070402802102e-05, + "loss": 1.059, + "num_input_tokens_seen": 61582704, + "step": 3828 + }, + { + "epoch": 0.268214732897283, + "grad_norm": 7.537643909454346, + "learning_rate": 7.32037057793345e-05, + "loss": 1.289, + "num_input_tokens_seen": 61599088, + "step": 3829 + }, + { + "epoch": 0.26828478114301224, + "grad_norm": 3.312519073486328, + "learning_rate": 7.3196707530648e-05, + "loss": 0.872, + "num_input_tokens_seen": 61615472, + "step": 3830 + }, + { + "epoch": 0.2683548293887415, + "grad_norm": 7.833526134490967, + "learning_rate": 7.318970928196147e-05, + "loss": 1.0896, + "num_input_tokens_seen": 61631288, + "step": 3831 + }, + { + "epoch": 0.26842487763447076, + "grad_norm": 3.9574341773986816, + "learning_rate": 7.318271103327496e-05, + "loss": 1.1105, + "num_input_tokens_seen": 61646400, + "step": 3832 + }, + { + "epoch": 0.26849492588019996, + "grad_norm": 3.8763623237609863, + "learning_rate": 7.317571278458845e-05, + "loss": 1.0339, + "num_input_tokens_seen": 61662784, + "step": 3833 + }, + { + "epoch": 0.2685649741259292, + "grad_norm": 4.006046295166016, + "learning_rate": 7.316871453590194e-05, + "loss": 1.1266, + "num_input_tokens_seen": 61678296, + "step": 3834 + }, + { + "epoch": 0.2686350223716585, + "grad_norm": 4.0256500244140625, + "learning_rate": 7.316171628721541e-05, + "loss": 0.9773, + "num_input_tokens_seen": 61694680, + "step": 3835 + }, + { + "epoch": 0.26870507061738774, + "grad_norm": 4.045619964599609, + "learning_rate": 7.31547180385289e-05, + "loss": 1.0445, + "num_input_tokens_seen": 61711064, + "step": 3836 + }, + { + "epoch": 0.26877511886311695, + "grad_norm": 4.189207553863525, + "learning_rate": 7.314771978984239e-05, + "loss": 1.1357, + "num_input_tokens_seen": 61727448, + "step": 3837 + }, + { + "epoch": 0.2688451671088462, + "grad_norm": 6.098819255828857, + "learning_rate": 7.314072154115587e-05, + "loss": 1.0298, + "num_input_tokens_seen": 61743600, + "step": 3838 + }, + { + "epoch": 0.26891521535457547, + "grad_norm": 3.832962989807129, + "learning_rate": 7.313372329246935e-05, + "loss": 1.0985, + "num_input_tokens_seen": 61759984, + "step": 3839 + }, + { + "epoch": 0.2689852636003047, + "grad_norm": 4.448224067687988, + "learning_rate": 7.312672504378284e-05, + "loss": 0.9682, + "num_input_tokens_seen": 61776368, + "step": 3840 + }, + { + "epoch": 0.26905531184603393, + "grad_norm": 4.621326446533203, + "learning_rate": 7.311972679509633e-05, + "loss": 0.9866, + "num_input_tokens_seen": 61791992, + "step": 3841 + }, + { + "epoch": 0.2691253600917632, + "grad_norm": 4.979477882385254, + "learning_rate": 7.31127285464098e-05, + "loss": 1.1592, + "num_input_tokens_seen": 61807912, + "step": 3842 + }, + { + "epoch": 0.26919540833749245, + "grad_norm": 4.678060054779053, + "learning_rate": 7.31057302977233e-05, + "loss": 1.218, + "num_input_tokens_seen": 61824296, + "step": 3843 + }, + { + "epoch": 0.2692654565832217, + "grad_norm": 5.379042625427246, + "learning_rate": 7.309873204903678e-05, + "loss": 1.0687, + "num_input_tokens_seen": 61840680, + "step": 3844 + }, + { + "epoch": 0.26933550482895097, + "grad_norm": 5.836205005645752, + "learning_rate": 7.309173380035026e-05, + "loss": 1.0435, + "num_input_tokens_seen": 61856296, + "step": 3845 + }, + { + "epoch": 0.26940555307468017, + "grad_norm": 4.040728569030762, + "learning_rate": 7.308473555166376e-05, + "loss": 1.0494, + "num_input_tokens_seen": 61872680, + "step": 3846 + }, + { + "epoch": 0.26947560132040943, + "grad_norm": 5.207007884979248, + "learning_rate": 7.307773730297725e-05, + "loss": 0.9293, + "num_input_tokens_seen": 61889064, + "step": 3847 + }, + { + "epoch": 0.2695456495661387, + "grad_norm": 4.996053695678711, + "learning_rate": 7.307073905429072e-05, + "loss": 1.0765, + "num_input_tokens_seen": 61905448, + "step": 3848 + }, + { + "epoch": 0.26961569781186795, + "grad_norm": 3.9249801635742188, + "learning_rate": 7.30637408056042e-05, + "loss": 1.0971, + "num_input_tokens_seen": 61921832, + "step": 3849 + }, + { + "epoch": 0.26968574605759715, + "grad_norm": 4.512659072875977, + "learning_rate": 7.305674255691769e-05, + "loss": 1.0811, + "num_input_tokens_seen": 61937928, + "step": 3850 + }, + { + "epoch": 0.2697557943033264, + "grad_norm": 3.8067586421966553, + "learning_rate": 7.304974430823118e-05, + "loss": 1.0381, + "num_input_tokens_seen": 61953992, + "step": 3851 + }, + { + "epoch": 0.26982584254905567, + "grad_norm": 3.5481879711151123, + "learning_rate": 7.304274605954466e-05, + "loss": 0.9524, + "num_input_tokens_seen": 61969856, + "step": 3852 + }, + { + "epoch": 0.26989589079478493, + "grad_norm": 5.14021635055542, + "learning_rate": 7.303574781085815e-05, + "loss": 1.0893, + "num_input_tokens_seen": 61985448, + "step": 3853 + }, + { + "epoch": 0.26996593904051414, + "grad_norm": 4.729730606079102, + "learning_rate": 7.302874956217164e-05, + "loss": 0.955, + "num_input_tokens_seen": 62001832, + "step": 3854 + }, + { + "epoch": 0.2700359872862434, + "grad_norm": 4.081509113311768, + "learning_rate": 7.302175131348512e-05, + "loss": 1.3099, + "num_input_tokens_seen": 62018216, + "step": 3855 + }, + { + "epoch": 0.27010603553197265, + "grad_norm": 3.9220404624938965, + "learning_rate": 7.301475306479859e-05, + "loss": 1.256, + "num_input_tokens_seen": 62034600, + "step": 3856 + }, + { + "epoch": 0.2701760837777019, + "grad_norm": 3.9707326889038086, + "learning_rate": 7.30077548161121e-05, + "loss": 0.9347, + "num_input_tokens_seen": 62050984, + "step": 3857 + }, + { + "epoch": 0.2702461320234311, + "grad_norm": 3.985651731491089, + "learning_rate": 7.300075656742557e-05, + "loss": 1.0869, + "num_input_tokens_seen": 62066496, + "step": 3858 + }, + { + "epoch": 0.2703161802691604, + "grad_norm": 4.900750160217285, + "learning_rate": 7.299375831873906e-05, + "loss": 1.2112, + "num_input_tokens_seen": 62082880, + "step": 3859 + }, + { + "epoch": 0.27038622851488964, + "grad_norm": 3.7562901973724365, + "learning_rate": 7.298676007005255e-05, + "loss": 1.0372, + "num_input_tokens_seen": 62099264, + "step": 3860 + }, + { + "epoch": 0.2704562767606189, + "grad_norm": 4.3399271965026855, + "learning_rate": 7.297976182136604e-05, + "loss": 1.2113, + "num_input_tokens_seen": 62115648, + "step": 3861 + }, + { + "epoch": 0.2705263250063481, + "grad_norm": 3.792924642562866, + "learning_rate": 7.297276357267951e-05, + "loss": 1.0027, + "num_input_tokens_seen": 62132032, + "step": 3862 + }, + { + "epoch": 0.27059637325207736, + "grad_norm": 4.10078763961792, + "learning_rate": 7.2965765323993e-05, + "loss": 1.0485, + "num_input_tokens_seen": 62148416, + "step": 3863 + }, + { + "epoch": 0.2706664214978066, + "grad_norm": 3.6712818145751953, + "learning_rate": 7.295876707530649e-05, + "loss": 0.982, + "num_input_tokens_seen": 62164080, + "step": 3864 + }, + { + "epoch": 0.2707364697435359, + "grad_norm": 4.216330051422119, + "learning_rate": 7.295176882661996e-05, + "loss": 0.9988, + "num_input_tokens_seen": 62179952, + "step": 3865 + }, + { + "epoch": 0.2708065179892651, + "grad_norm": 3.803950548171997, + "learning_rate": 7.294477057793345e-05, + "loss": 1.1107, + "num_input_tokens_seen": 62196336, + "step": 3866 + }, + { + "epoch": 0.27087656623499434, + "grad_norm": 4.4687676429748535, + "learning_rate": 7.293777232924694e-05, + "loss": 1.1374, + "num_input_tokens_seen": 62212072, + "step": 3867 + }, + { + "epoch": 0.2709466144807236, + "grad_norm": 3.8923938274383545, + "learning_rate": 7.293077408056043e-05, + "loss": 1.0037, + "num_input_tokens_seen": 62227384, + "step": 3868 + }, + { + "epoch": 0.27101666272645286, + "grad_norm": 3.7378618717193604, + "learning_rate": 7.29237758318739e-05, + "loss": 0.9185, + "num_input_tokens_seen": 62243768, + "step": 3869 + }, + { + "epoch": 0.27108671097218207, + "grad_norm": 4.39946985244751, + "learning_rate": 7.291677758318739e-05, + "loss": 1.2908, + "num_input_tokens_seen": 62259760, + "step": 3870 + }, + { + "epoch": 0.2711567592179113, + "grad_norm": 4.526809215545654, + "learning_rate": 7.290977933450088e-05, + "loss": 1.1677, + "num_input_tokens_seen": 62275880, + "step": 3871 + }, + { + "epoch": 0.2712268074636406, + "grad_norm": 5.780641078948975, + "learning_rate": 7.290278108581437e-05, + "loss": 1.3366, + "num_input_tokens_seen": 62291992, + "step": 3872 + }, + { + "epoch": 0.27129685570936984, + "grad_norm": 3.932300329208374, + "learning_rate": 7.289578283712786e-05, + "loss": 0.9404, + "num_input_tokens_seen": 62308168, + "step": 3873 + }, + { + "epoch": 0.27136690395509905, + "grad_norm": 6.381493091583252, + "learning_rate": 7.288878458844135e-05, + "loss": 0.9909, + "num_input_tokens_seen": 62324552, + "step": 3874 + }, + { + "epoch": 0.2714369522008283, + "grad_norm": 6.920464515686035, + "learning_rate": 7.288178633975482e-05, + "loss": 1.0534, + "num_input_tokens_seen": 62340712, + "step": 3875 + }, + { + "epoch": 0.27150700044655757, + "grad_norm": 4.327527046203613, + "learning_rate": 7.28747880910683e-05, + "loss": 1.2133, + "num_input_tokens_seen": 62355904, + "step": 3876 + }, + { + "epoch": 0.2715770486922868, + "grad_norm": 6.8873610496521, + "learning_rate": 7.286778984238178e-05, + "loss": 1.1857, + "num_input_tokens_seen": 62372288, + "step": 3877 + }, + { + "epoch": 0.2716470969380161, + "grad_norm": 4.397764205932617, + "learning_rate": 7.286079159369527e-05, + "loss": 1.1458, + "num_input_tokens_seen": 62388672, + "step": 3878 + }, + { + "epoch": 0.2717171451837453, + "grad_norm": 4.200334072113037, + "learning_rate": 7.285379334500876e-05, + "loss": 1.1534, + "num_input_tokens_seen": 62403728, + "step": 3879 + }, + { + "epoch": 0.27178719342947455, + "grad_norm": 3.8102898597717285, + "learning_rate": 7.284679509632225e-05, + "loss": 1.2455, + "num_input_tokens_seen": 62419712, + "step": 3880 + }, + { + "epoch": 0.2718572416752038, + "grad_norm": 5.665886878967285, + "learning_rate": 7.283979684763574e-05, + "loss": 1.0506, + "num_input_tokens_seen": 62435648, + "step": 3881 + }, + { + "epoch": 0.27192728992093307, + "grad_norm": 5.59833288192749, + "learning_rate": 7.283279859894921e-05, + "loss": 1.1289, + "num_input_tokens_seen": 62451760, + "step": 3882 + }, + { + "epoch": 0.2719973381666623, + "grad_norm": 4.3096699714660645, + "learning_rate": 7.282580035026269e-05, + "loss": 1.1069, + "num_input_tokens_seen": 62468144, + "step": 3883 + }, + { + "epoch": 0.27206738641239153, + "grad_norm": 3.584202766418457, + "learning_rate": 7.281880210157619e-05, + "loss": 0.981, + "num_input_tokens_seen": 62484528, + "step": 3884 + }, + { + "epoch": 0.2721374346581208, + "grad_norm": 5.078696250915527, + "learning_rate": 7.281180385288967e-05, + "loss": 1.0727, + "num_input_tokens_seen": 62500912, + "step": 3885 + }, + { + "epoch": 0.27220748290385005, + "grad_norm": 3.4883761405944824, + "learning_rate": 7.280480560420315e-05, + "loss": 0.888, + "num_input_tokens_seen": 62517296, + "step": 3886 + }, + { + "epoch": 0.27227753114957925, + "grad_norm": 3.938286066055298, + "learning_rate": 7.279780735551664e-05, + "loss": 0.9736, + "num_input_tokens_seen": 62532896, + "step": 3887 + }, + { + "epoch": 0.2723475793953085, + "grad_norm": 3.7150652408599854, + "learning_rate": 7.279080910683013e-05, + "loss": 1.1163, + "num_input_tokens_seen": 62549072, + "step": 3888 + }, + { + "epoch": 0.2724176276410378, + "grad_norm": 5.31076717376709, + "learning_rate": 7.278381085814361e-05, + "loss": 0.9943, + "num_input_tokens_seen": 62564384, + "step": 3889 + }, + { + "epoch": 0.27248767588676703, + "grad_norm": 4.8600053787231445, + "learning_rate": 7.27768126094571e-05, + "loss": 1.1767, + "num_input_tokens_seen": 62580768, + "step": 3890 + }, + { + "epoch": 0.27255772413249624, + "grad_norm": 3.5890231132507324, + "learning_rate": 7.276981436077058e-05, + "loss": 1.0949, + "num_input_tokens_seen": 62596928, + "step": 3891 + }, + { + "epoch": 0.2726277723782255, + "grad_norm": 4.171263217926025, + "learning_rate": 7.276281611208407e-05, + "loss": 1.0013, + "num_input_tokens_seen": 62613312, + "step": 3892 + }, + { + "epoch": 0.27269782062395476, + "grad_norm": 5.907830715179443, + "learning_rate": 7.275581786339755e-05, + "loss": 1.0622, + "num_input_tokens_seen": 62627840, + "step": 3893 + }, + { + "epoch": 0.272767868869684, + "grad_norm": 3.912140369415283, + "learning_rate": 7.274881961471104e-05, + "loss": 1.1128, + "num_input_tokens_seen": 62643760, + "step": 3894 + }, + { + "epoch": 0.2728379171154132, + "grad_norm": 3.9871180057525635, + "learning_rate": 7.274182136602453e-05, + "loss": 1.0879, + "num_input_tokens_seen": 62660144, + "step": 3895 + }, + { + "epoch": 0.2729079653611425, + "grad_norm": 3.8014907836914062, + "learning_rate": 7.2734823117338e-05, + "loss": 1.0135, + "num_input_tokens_seen": 62676200, + "step": 3896 + }, + { + "epoch": 0.27297801360687174, + "grad_norm": 3.7584786415100098, + "learning_rate": 7.272782486865149e-05, + "loss": 1.0366, + "num_input_tokens_seen": 62692584, + "step": 3897 + }, + { + "epoch": 0.273048061852601, + "grad_norm": 3.573341131210327, + "learning_rate": 7.272082661996498e-05, + "loss": 0.8726, + "num_input_tokens_seen": 62708968, + "step": 3898 + }, + { + "epoch": 0.2731181100983302, + "grad_norm": 4.013971328735352, + "learning_rate": 7.271382837127847e-05, + "loss": 0.991, + "num_input_tokens_seen": 62725352, + "step": 3899 + }, + { + "epoch": 0.27318815834405946, + "grad_norm": 4.3081488609313965, + "learning_rate": 7.270683012259195e-05, + "loss": 1.0632, + "num_input_tokens_seen": 62741736, + "step": 3900 + }, + { + "epoch": 0.2732582065897887, + "grad_norm": 3.857982635498047, + "learning_rate": 7.269983187390544e-05, + "loss": 1.1116, + "num_input_tokens_seen": 62757624, + "step": 3901 + }, + { + "epoch": 0.273328254835518, + "grad_norm": 3.5167486667633057, + "learning_rate": 7.269283362521892e-05, + "loss": 0.9951, + "num_input_tokens_seen": 62774008, + "step": 3902 + }, + { + "epoch": 0.2733983030812472, + "grad_norm": 4.025612831115723, + "learning_rate": 7.26858353765324e-05, + "loss": 1.1632, + "num_input_tokens_seen": 62789560, + "step": 3903 + }, + { + "epoch": 0.27346835132697644, + "grad_norm": 3.6391422748565674, + "learning_rate": 7.267883712784588e-05, + "loss": 0.9442, + "num_input_tokens_seen": 62805824, + "step": 3904 + }, + { + "epoch": 0.2735383995727057, + "grad_norm": 4.352347373962402, + "learning_rate": 7.267183887915937e-05, + "loss": 1.0882, + "num_input_tokens_seen": 62821368, + "step": 3905 + }, + { + "epoch": 0.27360844781843496, + "grad_norm": 3.782601833343506, + "learning_rate": 7.266484063047286e-05, + "loss": 0.9795, + "num_input_tokens_seen": 62837024, + "step": 3906 + }, + { + "epoch": 0.27367849606416417, + "grad_norm": 3.860903263092041, + "learning_rate": 7.265784238178635e-05, + "loss": 1.1751, + "num_input_tokens_seen": 62853408, + "step": 3907 + }, + { + "epoch": 0.2737485443098934, + "grad_norm": 6.185113430023193, + "learning_rate": 7.265084413309984e-05, + "loss": 1.1976, + "num_input_tokens_seen": 62869792, + "step": 3908 + }, + { + "epoch": 0.2738185925556227, + "grad_norm": 6.02334451675415, + "learning_rate": 7.264384588441331e-05, + "loss": 1.0472, + "num_input_tokens_seen": 62886088, + "step": 3909 + }, + { + "epoch": 0.27388864080135195, + "grad_norm": 4.019417762756348, + "learning_rate": 7.263684763572679e-05, + "loss": 0.9597, + "num_input_tokens_seen": 62902472, + "step": 3910 + }, + { + "epoch": 0.27395868904708115, + "grad_norm": 4.0645527839660645, + "learning_rate": 7.262984938704029e-05, + "loss": 1.0267, + "num_input_tokens_seen": 62918552, + "step": 3911 + }, + { + "epoch": 0.2740287372928104, + "grad_norm": 3.978803873062134, + "learning_rate": 7.262285113835378e-05, + "loss": 1.1366, + "num_input_tokens_seen": 62934272, + "step": 3912 + }, + { + "epoch": 0.27409878553853967, + "grad_norm": 4.659839630126953, + "learning_rate": 7.261585288966725e-05, + "loss": 1.0485, + "num_input_tokens_seen": 62950656, + "step": 3913 + }, + { + "epoch": 0.27416883378426893, + "grad_norm": 4.378306865692139, + "learning_rate": 7.260885464098074e-05, + "loss": 0.9949, + "num_input_tokens_seen": 62966120, + "step": 3914 + }, + { + "epoch": 0.2742388820299982, + "grad_norm": 3.723999261856079, + "learning_rate": 7.260185639229423e-05, + "loss": 1.0575, + "num_input_tokens_seen": 62982504, + "step": 3915 + }, + { + "epoch": 0.2743089302757274, + "grad_norm": 4.133684158325195, + "learning_rate": 7.25948581436077e-05, + "loss": 0.9707, + "num_input_tokens_seen": 62998888, + "step": 3916 + }, + { + "epoch": 0.27437897852145665, + "grad_norm": 3.8377842903137207, + "learning_rate": 7.258785989492119e-05, + "loss": 1.1018, + "num_input_tokens_seen": 63015272, + "step": 3917 + }, + { + "epoch": 0.2744490267671859, + "grad_norm": 3.546846389770508, + "learning_rate": 7.258086164623468e-05, + "loss": 0.9544, + "num_input_tokens_seen": 63031656, + "step": 3918 + }, + { + "epoch": 0.27451907501291517, + "grad_norm": 3.8629097938537598, + "learning_rate": 7.257386339754817e-05, + "loss": 1.0174, + "num_input_tokens_seen": 63047208, + "step": 3919 + }, + { + "epoch": 0.2745891232586444, + "grad_norm": 3.780395984649658, + "learning_rate": 7.256686514886165e-05, + "loss": 1.0927, + "num_input_tokens_seen": 63063592, + "step": 3920 + }, + { + "epoch": 0.27465917150437363, + "grad_norm": 3.5188148021698, + "learning_rate": 7.255986690017513e-05, + "loss": 0.9973, + "num_input_tokens_seen": 63079976, + "step": 3921 + }, + { + "epoch": 0.2747292197501029, + "grad_norm": 4.295319080352783, + "learning_rate": 7.255286865148862e-05, + "loss": 1.1545, + "num_input_tokens_seen": 63096360, + "step": 3922 + }, + { + "epoch": 0.27479926799583215, + "grad_norm": 6.307181358337402, + "learning_rate": 7.25458704028021e-05, + "loss": 1.0283, + "num_input_tokens_seen": 63112744, + "step": 3923 + }, + { + "epoch": 0.27486931624156136, + "grad_norm": 4.0670342445373535, + "learning_rate": 7.253887215411559e-05, + "loss": 1.0834, + "num_input_tokens_seen": 63129000, + "step": 3924 + }, + { + "epoch": 0.2749393644872906, + "grad_norm": 4.441539287567139, + "learning_rate": 7.253187390542907e-05, + "loss": 1.1264, + "num_input_tokens_seen": 63145304, + "step": 3925 + }, + { + "epoch": 0.2750094127330199, + "grad_norm": 6.151254653930664, + "learning_rate": 7.252487565674256e-05, + "loss": 0.911, + "num_input_tokens_seen": 63161688, + "step": 3926 + }, + { + "epoch": 0.27507946097874914, + "grad_norm": 5.355491638183594, + "learning_rate": 7.251787740805605e-05, + "loss": 1.0604, + "num_input_tokens_seen": 63176128, + "step": 3927 + }, + { + "epoch": 0.27514950922447834, + "grad_norm": 3.4603800773620605, + "learning_rate": 7.251087915936954e-05, + "loss": 0.7811, + "num_input_tokens_seen": 63192512, + "step": 3928 + }, + { + "epoch": 0.2752195574702076, + "grad_norm": 5.412753105163574, + "learning_rate": 7.250388091068302e-05, + "loss": 0.9675, + "num_input_tokens_seen": 63208896, + "step": 3929 + }, + { + "epoch": 0.27528960571593686, + "grad_norm": 3.928074598312378, + "learning_rate": 7.249688266199649e-05, + "loss": 1.0562, + "num_input_tokens_seen": 63224296, + "step": 3930 + }, + { + "epoch": 0.2753596539616661, + "grad_norm": 4.239214897155762, + "learning_rate": 7.248988441330998e-05, + "loss": 0.9697, + "num_input_tokens_seen": 63239312, + "step": 3931 + }, + { + "epoch": 0.2754297022073953, + "grad_norm": 3.8074252605438232, + "learning_rate": 7.248288616462348e-05, + "loss": 1.0834, + "num_input_tokens_seen": 63255664, + "step": 3932 + }, + { + "epoch": 0.2754997504531246, + "grad_norm": 3.721026659011841, + "learning_rate": 7.247588791593696e-05, + "loss": 1.1663, + "num_input_tokens_seen": 63272048, + "step": 3933 + }, + { + "epoch": 0.27556979869885384, + "grad_norm": 4.076726913452148, + "learning_rate": 7.246888966725044e-05, + "loss": 1.1179, + "num_input_tokens_seen": 63288432, + "step": 3934 + }, + { + "epoch": 0.2756398469445831, + "grad_norm": 4.238835334777832, + "learning_rate": 7.246189141856393e-05, + "loss": 1.0894, + "num_input_tokens_seen": 63304168, + "step": 3935 + }, + { + "epoch": 0.2757098951903123, + "grad_norm": 4.4860148429870605, + "learning_rate": 7.245489316987741e-05, + "loss": 1.1763, + "num_input_tokens_seen": 63320552, + "step": 3936 + }, + { + "epoch": 0.27577994343604156, + "grad_norm": 6.002726078033447, + "learning_rate": 7.244789492119088e-05, + "loss": 1.158, + "num_input_tokens_seen": 63336792, + "step": 3937 + }, + { + "epoch": 0.2758499916817708, + "grad_norm": 3.799751043319702, + "learning_rate": 7.244089667250439e-05, + "loss": 1.0316, + "num_input_tokens_seen": 63353176, + "step": 3938 + }, + { + "epoch": 0.2759200399275001, + "grad_norm": 4.905911445617676, + "learning_rate": 7.243389842381787e-05, + "loss": 0.8847, + "num_input_tokens_seen": 63369560, + "step": 3939 + }, + { + "epoch": 0.2759900881732293, + "grad_norm": 5.141537666320801, + "learning_rate": 7.242690017513135e-05, + "loss": 1.109, + "num_input_tokens_seen": 63385944, + "step": 3940 + }, + { + "epoch": 0.27606013641895855, + "grad_norm": 5.276777267456055, + "learning_rate": 7.241990192644484e-05, + "loss": 0.9881, + "num_input_tokens_seen": 63401672, + "step": 3941 + }, + { + "epoch": 0.2761301846646878, + "grad_norm": 5.267075538635254, + "learning_rate": 7.241290367775833e-05, + "loss": 1.0048, + "num_input_tokens_seen": 63417792, + "step": 3942 + }, + { + "epoch": 0.27620023291041707, + "grad_norm": 4.065691947937012, + "learning_rate": 7.24059054290718e-05, + "loss": 1.0088, + "num_input_tokens_seen": 63434176, + "step": 3943 + }, + { + "epoch": 0.27627028115614627, + "grad_norm": 7.921762466430664, + "learning_rate": 7.239890718038529e-05, + "loss": 1.3552, + "num_input_tokens_seen": 63450032, + "step": 3944 + }, + { + "epoch": 0.27634032940187553, + "grad_norm": 3.55094313621521, + "learning_rate": 7.239190893169878e-05, + "loss": 0.9957, + "num_input_tokens_seen": 63466416, + "step": 3945 + }, + { + "epoch": 0.2764103776476048, + "grad_norm": 5.732813358306885, + "learning_rate": 7.238491068301227e-05, + "loss": 1.0968, + "num_input_tokens_seen": 63482296, + "step": 3946 + }, + { + "epoch": 0.27648042589333405, + "grad_norm": 3.9143989086151123, + "learning_rate": 7.237791243432574e-05, + "loss": 0.9218, + "num_input_tokens_seen": 63498472, + "step": 3947 + }, + { + "epoch": 0.2765504741390633, + "grad_norm": 4.123042106628418, + "learning_rate": 7.237091418563923e-05, + "loss": 1.0081, + "num_input_tokens_seen": 63513856, + "step": 3948 + }, + { + "epoch": 0.2766205223847925, + "grad_norm": 3.7550277709960938, + "learning_rate": 7.236391593695272e-05, + "loss": 1.0612, + "num_input_tokens_seen": 63529432, + "step": 3949 + }, + { + "epoch": 0.27669057063052177, + "grad_norm": 3.841831922531128, + "learning_rate": 7.23569176882662e-05, + "loss": 1.1208, + "num_input_tokens_seen": 63545816, + "step": 3950 + }, + { + "epoch": 0.27676061887625103, + "grad_norm": 4.626603126525879, + "learning_rate": 7.234991943957968e-05, + "loss": 1.3412, + "num_input_tokens_seen": 63561960, + "step": 3951 + }, + { + "epoch": 0.2768306671219803, + "grad_norm": 3.874140977859497, + "learning_rate": 7.234292119089319e-05, + "loss": 1.0549, + "num_input_tokens_seen": 63578344, + "step": 3952 + }, + { + "epoch": 0.2769007153677095, + "grad_norm": 3.6525163650512695, + "learning_rate": 7.233592294220666e-05, + "loss": 1.0905, + "num_input_tokens_seen": 63594520, + "step": 3953 + }, + { + "epoch": 0.27697076361343875, + "grad_norm": 5.065535068511963, + "learning_rate": 7.232892469352015e-05, + "loss": 1.1913, + "num_input_tokens_seen": 63610904, + "step": 3954 + }, + { + "epoch": 0.277040811859168, + "grad_norm": 7.97597599029541, + "learning_rate": 7.232192644483364e-05, + "loss": 0.9109, + "num_input_tokens_seen": 63625896, + "step": 3955 + }, + { + "epoch": 0.27711086010489727, + "grad_norm": 5.0254645347595215, + "learning_rate": 7.231492819614711e-05, + "loss": 1.0177, + "num_input_tokens_seen": 63642280, + "step": 3956 + }, + { + "epoch": 0.2771809083506265, + "grad_norm": 4.171605587005615, + "learning_rate": 7.230792994746059e-05, + "loss": 1.3166, + "num_input_tokens_seen": 63658400, + "step": 3957 + }, + { + "epoch": 0.27725095659635574, + "grad_norm": 4.036003589630127, + "learning_rate": 7.230093169877409e-05, + "loss": 1.0489, + "num_input_tokens_seen": 63674784, + "step": 3958 + }, + { + "epoch": 0.277321004842085, + "grad_norm": 4.664374828338623, + "learning_rate": 7.229393345008758e-05, + "loss": 1.3189, + "num_input_tokens_seen": 63691168, + "step": 3959 + }, + { + "epoch": 0.27739105308781425, + "grad_norm": 3.7217307090759277, + "learning_rate": 7.228693520140105e-05, + "loss": 1.2532, + "num_input_tokens_seen": 63707552, + "step": 3960 + }, + { + "epoch": 0.27746110133354346, + "grad_norm": 3.622593879699707, + "learning_rate": 7.227993695271454e-05, + "loss": 0.8604, + "num_input_tokens_seen": 63723936, + "step": 3961 + }, + { + "epoch": 0.2775311495792727, + "grad_norm": 4.154850006103516, + "learning_rate": 7.227293870402803e-05, + "loss": 1.1366, + "num_input_tokens_seen": 63740320, + "step": 3962 + }, + { + "epoch": 0.277601197825002, + "grad_norm": 4.157016754150391, + "learning_rate": 7.22659404553415e-05, + "loss": 0.8815, + "num_input_tokens_seen": 63756456, + "step": 3963 + }, + { + "epoch": 0.27767124607073124, + "grad_norm": 4.652394771575928, + "learning_rate": 7.2258942206655e-05, + "loss": 0.8966, + "num_input_tokens_seen": 63772840, + "step": 3964 + }, + { + "epoch": 0.27774129431646044, + "grad_norm": 7.87667989730835, + "learning_rate": 7.225194395796848e-05, + "loss": 1.1371, + "num_input_tokens_seen": 63788800, + "step": 3965 + }, + { + "epoch": 0.2778113425621897, + "grad_norm": 4.333608627319336, + "learning_rate": 7.224494570928197e-05, + "loss": 1.3465, + "num_input_tokens_seen": 63805088, + "step": 3966 + }, + { + "epoch": 0.27788139080791896, + "grad_norm": 7.2095184326171875, + "learning_rate": 7.223794746059545e-05, + "loss": 1.0276, + "num_input_tokens_seen": 63821472, + "step": 3967 + }, + { + "epoch": 0.2779514390536482, + "grad_norm": 3.9144251346588135, + "learning_rate": 7.223094921190893e-05, + "loss": 0.9954, + "num_input_tokens_seen": 63837048, + "step": 3968 + }, + { + "epoch": 0.2780214872993774, + "grad_norm": 4.380809783935547, + "learning_rate": 7.222395096322242e-05, + "loss": 0.9757, + "num_input_tokens_seen": 63852872, + "step": 3969 + }, + { + "epoch": 0.2780915355451067, + "grad_norm": 3.637685537338257, + "learning_rate": 7.22169527145359e-05, + "loss": 1.0264, + "num_input_tokens_seen": 63868864, + "step": 3970 + }, + { + "epoch": 0.27816158379083594, + "grad_norm": 4.742129802703857, + "learning_rate": 7.220995446584939e-05, + "loss": 1.2344, + "num_input_tokens_seen": 63885248, + "step": 3971 + }, + { + "epoch": 0.2782316320365652, + "grad_norm": 4.7221269607543945, + "learning_rate": 7.220295621716289e-05, + "loss": 1.1001, + "num_input_tokens_seen": 63901632, + "step": 3972 + }, + { + "epoch": 0.2783016802822944, + "grad_norm": 3.6607449054718018, + "learning_rate": 7.219595796847636e-05, + "loss": 1.1179, + "num_input_tokens_seen": 63917688, + "step": 3973 + }, + { + "epoch": 0.27837172852802367, + "grad_norm": 4.264851093292236, + "learning_rate": 7.218895971978984e-05, + "loss": 1.0158, + "num_input_tokens_seen": 63934072, + "step": 3974 + }, + { + "epoch": 0.2784417767737529, + "grad_norm": 5.0043511390686035, + "learning_rate": 7.218196147110333e-05, + "loss": 1.0359, + "num_input_tokens_seen": 63950200, + "step": 3975 + }, + { + "epoch": 0.2785118250194822, + "grad_norm": 4.323488235473633, + "learning_rate": 7.217496322241682e-05, + "loss": 1.1791, + "num_input_tokens_seen": 63966584, + "step": 3976 + }, + { + "epoch": 0.2785818732652114, + "grad_norm": 6.721888065338135, + "learning_rate": 7.216796497373029e-05, + "loss": 0.9446, + "num_input_tokens_seen": 63982440, + "step": 3977 + }, + { + "epoch": 0.27865192151094065, + "grad_norm": 6.3528289794921875, + "learning_rate": 7.21609667250438e-05, + "loss": 1.1506, + "num_input_tokens_seen": 63998824, + "step": 3978 + }, + { + "epoch": 0.2787219697566699, + "grad_norm": 5.293467044830322, + "learning_rate": 7.215396847635728e-05, + "loss": 1.2791, + "num_input_tokens_seen": 64014984, + "step": 3979 + }, + { + "epoch": 0.27879201800239917, + "grad_norm": 3.8228442668914795, + "learning_rate": 7.214697022767076e-05, + "loss": 1.1086, + "num_input_tokens_seen": 64031080, + "step": 3980 + }, + { + "epoch": 0.27886206624812837, + "grad_norm": 3.8407061100006104, + "learning_rate": 7.213997197898425e-05, + "loss": 1.1772, + "num_input_tokens_seen": 64046416, + "step": 3981 + }, + { + "epoch": 0.27893211449385763, + "grad_norm": 3.9471728801727295, + "learning_rate": 7.213297373029773e-05, + "loss": 0.9394, + "num_input_tokens_seen": 64062784, + "step": 3982 + }, + { + "epoch": 0.2790021627395869, + "grad_norm": 4.1796722412109375, + "learning_rate": 7.212597548161121e-05, + "loss": 0.9966, + "num_input_tokens_seen": 64077504, + "step": 3983 + }, + { + "epoch": 0.27907221098531615, + "grad_norm": 3.78998064994812, + "learning_rate": 7.21189772329247e-05, + "loss": 1.1219, + "num_input_tokens_seen": 64093888, + "step": 3984 + }, + { + "epoch": 0.2791422592310454, + "grad_norm": 3.383371591567993, + "learning_rate": 7.211197898423819e-05, + "loss": 0.8832, + "num_input_tokens_seen": 64110272, + "step": 3985 + }, + { + "epoch": 0.2792123074767746, + "grad_norm": 3.6502346992492676, + "learning_rate": 7.210498073555168e-05, + "loss": 1.0114, + "num_input_tokens_seen": 64125464, + "step": 3986 + }, + { + "epoch": 0.27928235572250387, + "grad_norm": 3.9421629905700684, + "learning_rate": 7.209798248686515e-05, + "loss": 1.1305, + "num_input_tokens_seen": 64141848, + "step": 3987 + }, + { + "epoch": 0.27935240396823313, + "grad_norm": 4.40875244140625, + "learning_rate": 7.209098423817864e-05, + "loss": 0.9603, + "num_input_tokens_seen": 64158232, + "step": 3988 + }, + { + "epoch": 0.2794224522139624, + "grad_norm": 5.909340858459473, + "learning_rate": 7.208398598949213e-05, + "loss": 1.121, + "num_input_tokens_seen": 64174616, + "step": 3989 + }, + { + "epoch": 0.2794925004596916, + "grad_norm": 4.548187732696533, + "learning_rate": 7.20769877408056e-05, + "loss": 0.9575, + "num_input_tokens_seen": 64191000, + "step": 3990 + }, + { + "epoch": 0.27956254870542085, + "grad_norm": 4.1479926109313965, + "learning_rate": 7.206998949211909e-05, + "loss": 1.1205, + "num_input_tokens_seen": 64207384, + "step": 3991 + }, + { + "epoch": 0.2796325969511501, + "grad_norm": 4.229100227355957, + "learning_rate": 7.206299124343259e-05, + "loss": 1.0597, + "num_input_tokens_seen": 64223304, + "step": 3992 + }, + { + "epoch": 0.2797026451968794, + "grad_norm": 7.431615352630615, + "learning_rate": 7.205599299474607e-05, + "loss": 1.2793, + "num_input_tokens_seen": 64239688, + "step": 3993 + }, + { + "epoch": 0.2797726934426086, + "grad_norm": 4.346622943878174, + "learning_rate": 7.204899474605954e-05, + "loss": 1.2385, + "num_input_tokens_seen": 64255752, + "step": 3994 + }, + { + "epoch": 0.27984274168833784, + "grad_norm": 3.771306276321411, + "learning_rate": 7.204199649737303e-05, + "loss": 1.0211, + "num_input_tokens_seen": 64271760, + "step": 3995 + }, + { + "epoch": 0.2799127899340671, + "grad_norm": 4.411479473114014, + "learning_rate": 7.203499824868652e-05, + "loss": 1.3369, + "num_input_tokens_seen": 64288144, + "step": 3996 + }, + { + "epoch": 0.27998283817979636, + "grad_norm": 4.591271877288818, + "learning_rate": 7.2028e-05, + "loss": 1.1021, + "num_input_tokens_seen": 64304528, + "step": 3997 + }, + { + "epoch": 0.28005288642552556, + "grad_norm": 3.88271427154541, + "learning_rate": 7.20210017513135e-05, + "loss": 0.9181, + "num_input_tokens_seen": 64320912, + "step": 3998 + }, + { + "epoch": 0.2801229346712548, + "grad_norm": 10.80846118927002, + "learning_rate": 7.201400350262699e-05, + "loss": 1.0922, + "num_input_tokens_seen": 64337296, + "step": 3999 + }, + { + "epoch": 0.2801929829169841, + "grad_norm": 3.7112953662872314, + "learning_rate": 7.200700525394046e-05, + "loss": 1.0157, + "num_input_tokens_seen": 64353680, + "step": 4000 + }, + { + "epoch": 0.2801929829169841, + "eval_loss": 1.1334750652313232, + "eval_runtime": 0.1958, + "eval_samples_per_second": 5.106, + "eval_steps_per_second": 5.106, + "num_input_tokens_seen": 64353680, + "step": 4000 + }, + { + "epoch": 0.28026303116271334, + "grad_norm": 7.529544830322266, + "learning_rate": 7.200000700525394e-05, + "loss": 1.1264, + "num_input_tokens_seen": 64368408, + "step": 4001 + }, + { + "epoch": 0.28033307940844254, + "grad_norm": 3.761939764022827, + "learning_rate": 7.199300875656742e-05, + "loss": 1.1027, + "num_input_tokens_seen": 64384792, + "step": 4002 + }, + { + "epoch": 0.2804031276541718, + "grad_norm": 4.091811656951904, + "learning_rate": 7.198601050788091e-05, + "loss": 1.0368, + "num_input_tokens_seen": 64400520, + "step": 4003 + }, + { + "epoch": 0.28047317589990106, + "grad_norm": 5.5972795486450195, + "learning_rate": 7.19790122591944e-05, + "loss": 1.0957, + "num_input_tokens_seen": 64416904, + "step": 4004 + }, + { + "epoch": 0.2805432241456303, + "grad_norm": 3.4631423950195312, + "learning_rate": 7.197201401050789e-05, + "loss": 0.9517, + "num_input_tokens_seen": 64432168, + "step": 4005 + }, + { + "epoch": 0.2806132723913595, + "grad_norm": 6.3156938552856445, + "learning_rate": 7.196501576182138e-05, + "loss": 1.0554, + "num_input_tokens_seen": 64447752, + "step": 4006 + }, + { + "epoch": 0.2806833206370888, + "grad_norm": 10.07819652557373, + "learning_rate": 7.195801751313485e-05, + "loss": 0.99, + "num_input_tokens_seen": 64464136, + "step": 4007 + }, + { + "epoch": 0.28075336888281804, + "grad_norm": 4.695057392120361, + "learning_rate": 7.195101926444834e-05, + "loss": 0.9745, + "num_input_tokens_seen": 64480520, + "step": 4008 + }, + { + "epoch": 0.2808234171285473, + "grad_norm": 4.74672269821167, + "learning_rate": 7.194402101576183e-05, + "loss": 1.0648, + "num_input_tokens_seen": 64496904, + "step": 4009 + }, + { + "epoch": 0.2808934653742765, + "grad_norm": 3.834928512573242, + "learning_rate": 7.19370227670753e-05, + "loss": 1.0163, + "num_input_tokens_seen": 64513288, + "step": 4010 + }, + { + "epoch": 0.28096351362000577, + "grad_norm": 4.1937103271484375, + "learning_rate": 7.19300245183888e-05, + "loss": 1.1351, + "num_input_tokens_seen": 64528992, + "step": 4011 + }, + { + "epoch": 0.281033561865735, + "grad_norm": 4.1531243324279785, + "learning_rate": 7.192302626970228e-05, + "loss": 1.0835, + "num_input_tokens_seen": 64544776, + "step": 4012 + }, + { + "epoch": 0.2811036101114643, + "grad_norm": 5.006285190582275, + "learning_rate": 7.191602802101577e-05, + "loss": 1.1282, + "num_input_tokens_seen": 64560944, + "step": 4013 + }, + { + "epoch": 0.2811736583571935, + "grad_norm": 3.433964252471924, + "learning_rate": 7.190902977232925e-05, + "loss": 1.1164, + "num_input_tokens_seen": 64577328, + "step": 4014 + }, + { + "epoch": 0.28124370660292275, + "grad_norm": 6.165640354156494, + "learning_rate": 7.190203152364274e-05, + "loss": 0.9713, + "num_input_tokens_seen": 64593672, + "step": 4015 + }, + { + "epoch": 0.281313754848652, + "grad_norm": 6.037381649017334, + "learning_rate": 7.189503327495622e-05, + "loss": 1.032, + "num_input_tokens_seen": 64610056, + "step": 4016 + }, + { + "epoch": 0.28138380309438127, + "grad_norm": 4.2639923095703125, + "learning_rate": 7.18880350262697e-05, + "loss": 1.1842, + "num_input_tokens_seen": 64625864, + "step": 4017 + }, + { + "epoch": 0.28145385134011053, + "grad_norm": 3.8862967491149902, + "learning_rate": 7.188103677758319e-05, + "loss": 0.9448, + "num_input_tokens_seen": 64642248, + "step": 4018 + }, + { + "epoch": 0.28152389958583973, + "grad_norm": 3.9584991931915283, + "learning_rate": 7.187403852889669e-05, + "loss": 0.9602, + "num_input_tokens_seen": 64658632, + "step": 4019 + }, + { + "epoch": 0.281593947831569, + "grad_norm": 6.037077903747559, + "learning_rate": 7.186704028021017e-05, + "loss": 1.0913, + "num_input_tokens_seen": 64675016, + "step": 4020 + }, + { + "epoch": 0.28166399607729825, + "grad_norm": 3.750059127807617, + "learning_rate": 7.186004203152364e-05, + "loss": 1.1294, + "num_input_tokens_seen": 64691400, + "step": 4021 + }, + { + "epoch": 0.2817340443230275, + "grad_norm": 4.364743709564209, + "learning_rate": 7.185304378283713e-05, + "loss": 1.0983, + "num_input_tokens_seen": 64706512, + "step": 4022 + }, + { + "epoch": 0.2818040925687567, + "grad_norm": 3.463717460632324, + "learning_rate": 7.184604553415062e-05, + "loss": 1.0773, + "num_input_tokens_seen": 64722688, + "step": 4023 + }, + { + "epoch": 0.281874140814486, + "grad_norm": 3.939438819885254, + "learning_rate": 7.18390472854641e-05, + "loss": 1.3356, + "num_input_tokens_seen": 64738856, + "step": 4024 + }, + { + "epoch": 0.28194418906021523, + "grad_norm": 3.813849687576294, + "learning_rate": 7.18320490367776e-05, + "loss": 1.0521, + "num_input_tokens_seen": 64755240, + "step": 4025 + }, + { + "epoch": 0.2820142373059445, + "grad_norm": 3.5874619483947754, + "learning_rate": 7.182505078809108e-05, + "loss": 1.0328, + "num_input_tokens_seen": 64771184, + "step": 4026 + }, + { + "epoch": 0.2820842855516737, + "grad_norm": 4.544376850128174, + "learning_rate": 7.181805253940456e-05, + "loss": 1.1132, + "num_input_tokens_seen": 64787568, + "step": 4027 + }, + { + "epoch": 0.28215433379740296, + "grad_norm": 3.6816799640655518, + "learning_rate": 7.181105429071803e-05, + "loss": 1.1088, + "num_input_tokens_seen": 64803064, + "step": 4028 + }, + { + "epoch": 0.2822243820431322, + "grad_norm": 7.1433939933776855, + "learning_rate": 7.180405604203152e-05, + "loss": 1.0069, + "num_input_tokens_seen": 64818736, + "step": 4029 + }, + { + "epoch": 0.2822944302888615, + "grad_norm": 4.308315753936768, + "learning_rate": 7.179705779334501e-05, + "loss": 1.1992, + "num_input_tokens_seen": 64834848, + "step": 4030 + }, + { + "epoch": 0.2823644785345907, + "grad_norm": 4.985830783843994, + "learning_rate": 7.17900595446585e-05, + "loss": 1.1996, + "num_input_tokens_seen": 64851224, + "step": 4031 + }, + { + "epoch": 0.28243452678031994, + "grad_norm": 4.884370803833008, + "learning_rate": 7.178306129597199e-05, + "loss": 1.0541, + "num_input_tokens_seen": 64867608, + "step": 4032 + }, + { + "epoch": 0.2825045750260492, + "grad_norm": 4.335781097412109, + "learning_rate": 7.177606304728548e-05, + "loss": 1.0596, + "num_input_tokens_seen": 64883840, + "step": 4033 + }, + { + "epoch": 0.28257462327177846, + "grad_norm": 3.729811191558838, + "learning_rate": 7.176906479859895e-05, + "loss": 1.0167, + "num_input_tokens_seen": 64899872, + "step": 4034 + }, + { + "epoch": 0.28264467151750766, + "grad_norm": 3.7386136054992676, + "learning_rate": 7.176206654991244e-05, + "loss": 0.7835, + "num_input_tokens_seen": 64916256, + "step": 4035 + }, + { + "epoch": 0.2827147197632369, + "grad_norm": 3.8022067546844482, + "learning_rate": 7.175506830122593e-05, + "loss": 1.0571, + "num_input_tokens_seen": 64932640, + "step": 4036 + }, + { + "epoch": 0.2827847680089662, + "grad_norm": 4.713296890258789, + "learning_rate": 7.17480700525394e-05, + "loss": 1.2877, + "num_input_tokens_seen": 64948520, + "step": 4037 + }, + { + "epoch": 0.28285481625469544, + "grad_norm": 3.682568073272705, + "learning_rate": 7.174107180385289e-05, + "loss": 1.0193, + "num_input_tokens_seen": 64964904, + "step": 4038 + }, + { + "epoch": 0.28292486450042464, + "grad_norm": 4.533677101135254, + "learning_rate": 7.173407355516638e-05, + "loss": 1.133, + "num_input_tokens_seen": 64981288, + "step": 4039 + }, + { + "epoch": 0.2829949127461539, + "grad_norm": 4.343021392822266, + "learning_rate": 7.172707530647987e-05, + "loss": 1.2843, + "num_input_tokens_seen": 64997640, + "step": 4040 + }, + { + "epoch": 0.28306496099188316, + "grad_norm": 4.942739009857178, + "learning_rate": 7.172007705779334e-05, + "loss": 1.1391, + "num_input_tokens_seen": 65012456, + "step": 4041 + }, + { + "epoch": 0.2831350092376124, + "grad_norm": 6.1112213134765625, + "learning_rate": 7.171307880910683e-05, + "loss": 1.0135, + "num_input_tokens_seen": 65028840, + "step": 4042 + }, + { + "epoch": 0.2832050574833416, + "grad_norm": 4.650609016418457, + "learning_rate": 7.170608056042032e-05, + "loss": 0.8585, + "num_input_tokens_seen": 65044608, + "step": 4043 + }, + { + "epoch": 0.2832751057290709, + "grad_norm": 5.383882522583008, + "learning_rate": 7.169908231173381e-05, + "loss": 1.3442, + "num_input_tokens_seen": 65060992, + "step": 4044 + }, + { + "epoch": 0.28334515397480015, + "grad_norm": 3.569399118423462, + "learning_rate": 7.169208406304729e-05, + "loss": 1.0645, + "num_input_tokens_seen": 65077096, + "step": 4045 + }, + { + "epoch": 0.2834152022205294, + "grad_norm": 5.199350833892822, + "learning_rate": 7.168508581436079e-05, + "loss": 1.0178, + "num_input_tokens_seen": 65093480, + "step": 4046 + }, + { + "epoch": 0.2834852504662586, + "grad_norm": 4.172554969787598, + "learning_rate": 7.167808756567426e-05, + "loss": 1.1721, + "num_input_tokens_seen": 65109864, + "step": 4047 + }, + { + "epoch": 0.28355529871198787, + "grad_norm": 3.822197437286377, + "learning_rate": 7.167108931698774e-05, + "loss": 0.9076, + "num_input_tokens_seen": 65126248, + "step": 4048 + }, + { + "epoch": 0.28362534695771713, + "grad_norm": 3.8899435997009277, + "learning_rate": 7.166409106830123e-05, + "loss": 1.1228, + "num_input_tokens_seen": 65141984, + "step": 4049 + }, + { + "epoch": 0.2836953952034464, + "grad_norm": 4.559451580047607, + "learning_rate": 7.165709281961471e-05, + "loss": 1.0732, + "num_input_tokens_seen": 65157984, + "step": 4050 + }, + { + "epoch": 0.28376544344917565, + "grad_norm": 5.253831386566162, + "learning_rate": 7.16500945709282e-05, + "loss": 1.1104, + "num_input_tokens_seen": 65174040, + "step": 4051 + }, + { + "epoch": 0.28383549169490485, + "grad_norm": 3.827268123626709, + "learning_rate": 7.164309632224169e-05, + "loss": 1.0689, + "num_input_tokens_seen": 65190424, + "step": 4052 + }, + { + "epoch": 0.2839055399406341, + "grad_norm": 4.432236194610596, + "learning_rate": 7.163609807355518e-05, + "loss": 0.9357, + "num_input_tokens_seen": 65206808, + "step": 4053 + }, + { + "epoch": 0.28397558818636337, + "grad_norm": 5.008002281188965, + "learning_rate": 7.162909982486866e-05, + "loss": 1.1584, + "num_input_tokens_seen": 65222744, + "step": 4054 + }, + { + "epoch": 0.28404563643209263, + "grad_norm": 3.748089551925659, + "learning_rate": 7.162210157618213e-05, + "loss": 1.0242, + "num_input_tokens_seen": 65238592, + "step": 4055 + }, + { + "epoch": 0.28411568467782183, + "grad_norm": 4.073843002319336, + "learning_rate": 7.161510332749562e-05, + "loss": 1.0629, + "num_input_tokens_seen": 65254464, + "step": 4056 + }, + { + "epoch": 0.2841857329235511, + "grad_norm": 4.0271100997924805, + "learning_rate": 7.160810507880911e-05, + "loss": 1.0191, + "num_input_tokens_seen": 65269744, + "step": 4057 + }, + { + "epoch": 0.28425578116928035, + "grad_norm": 4.266842365264893, + "learning_rate": 7.16011068301226e-05, + "loss": 1.0061, + "num_input_tokens_seen": 65286128, + "step": 4058 + }, + { + "epoch": 0.2843258294150096, + "grad_norm": 3.4473531246185303, + "learning_rate": 7.159410858143608e-05, + "loss": 0.8837, + "num_input_tokens_seen": 65301864, + "step": 4059 + }, + { + "epoch": 0.2843958776607388, + "grad_norm": 3.717029333114624, + "learning_rate": 7.158711033274957e-05, + "loss": 1.0704, + "num_input_tokens_seen": 65317880, + "step": 4060 + }, + { + "epoch": 0.2844659259064681, + "grad_norm": 4.008082866668701, + "learning_rate": 7.158011208406305e-05, + "loss": 1.0322, + "num_input_tokens_seen": 65334096, + "step": 4061 + }, + { + "epoch": 0.28453597415219734, + "grad_norm": 5.350658893585205, + "learning_rate": 7.157311383537654e-05, + "loss": 1.1277, + "num_input_tokens_seen": 65348288, + "step": 4062 + }, + { + "epoch": 0.2846060223979266, + "grad_norm": 8.911882400512695, + "learning_rate": 7.156611558669003e-05, + "loss": 1.0978, + "num_input_tokens_seen": 65364672, + "step": 4063 + }, + { + "epoch": 0.2846760706436558, + "grad_norm": 4.207833766937256, + "learning_rate": 7.155911733800351e-05, + "loss": 1.1248, + "num_input_tokens_seen": 65380600, + "step": 4064 + }, + { + "epoch": 0.28474611888938506, + "grad_norm": 3.492713689804077, + "learning_rate": 7.155211908931699e-05, + "loss": 0.9513, + "num_input_tokens_seen": 65396920, + "step": 4065 + }, + { + "epoch": 0.2848161671351143, + "grad_norm": 3.866763114929199, + "learning_rate": 7.154512084063048e-05, + "loss": 0.9899, + "num_input_tokens_seen": 65413136, + "step": 4066 + }, + { + "epoch": 0.2848862153808436, + "grad_norm": 4.352143287658691, + "learning_rate": 7.153812259194397e-05, + "loss": 1.097, + "num_input_tokens_seen": 65428368, + "step": 4067 + }, + { + "epoch": 0.2849562636265728, + "grad_norm": 5.335500717163086, + "learning_rate": 7.153112434325744e-05, + "loss": 1.1697, + "num_input_tokens_seen": 65444752, + "step": 4068 + }, + { + "epoch": 0.28502631187230204, + "grad_norm": 3.7467970848083496, + "learning_rate": 7.152412609457093e-05, + "loss": 0.9655, + "num_input_tokens_seen": 65461136, + "step": 4069 + }, + { + "epoch": 0.2850963601180313, + "grad_norm": 3.410472869873047, + "learning_rate": 7.151712784588442e-05, + "loss": 0.8464, + "num_input_tokens_seen": 65477520, + "step": 4070 + }, + { + "epoch": 0.28516640836376056, + "grad_norm": 6.551929950714111, + "learning_rate": 7.151012959719791e-05, + "loss": 1.0369, + "num_input_tokens_seen": 65493904, + "step": 4071 + }, + { + "epoch": 0.28523645660948976, + "grad_norm": 3.4140212535858154, + "learning_rate": 7.150313134851138e-05, + "loss": 1.0508, + "num_input_tokens_seen": 65510288, + "step": 4072 + }, + { + "epoch": 0.285306504855219, + "grad_norm": 4.227553367614746, + "learning_rate": 7.149613309982488e-05, + "loss": 1.0793, + "num_input_tokens_seen": 65526672, + "step": 4073 + }, + { + "epoch": 0.2853765531009483, + "grad_norm": 4.202794551849365, + "learning_rate": 7.148913485113836e-05, + "loss": 1.1393, + "num_input_tokens_seen": 65542456, + "step": 4074 + }, + { + "epoch": 0.28544660134667754, + "grad_norm": 5.172013759613037, + "learning_rate": 7.148213660245183e-05, + "loss": 1.2451, + "num_input_tokens_seen": 65558384, + "step": 4075 + }, + { + "epoch": 0.28551664959240675, + "grad_norm": 3.716113567352295, + "learning_rate": 7.147513835376532e-05, + "loss": 0.8515, + "num_input_tokens_seen": 65574768, + "step": 4076 + }, + { + "epoch": 0.285586697838136, + "grad_norm": 8.10258674621582, + "learning_rate": 7.146814010507881e-05, + "loss": 1.0737, + "num_input_tokens_seen": 65590632, + "step": 4077 + }, + { + "epoch": 0.28565674608386527, + "grad_norm": 3.649273157119751, + "learning_rate": 7.14611418563923e-05, + "loss": 1.0376, + "num_input_tokens_seen": 65607016, + "step": 4078 + }, + { + "epoch": 0.2857267943295945, + "grad_norm": 4.202502250671387, + "learning_rate": 7.145414360770579e-05, + "loss": 1.1102, + "num_input_tokens_seen": 65622856, + "step": 4079 + }, + { + "epoch": 0.28579684257532373, + "grad_norm": 4.027415752410889, + "learning_rate": 7.144714535901928e-05, + "loss": 1.26, + "num_input_tokens_seen": 65639240, + "step": 4080 + }, + { + "epoch": 0.285866890821053, + "grad_norm": 4.549161434173584, + "learning_rate": 7.144014711033275e-05, + "loss": 1.1598, + "num_input_tokens_seen": 65655624, + "step": 4081 + }, + { + "epoch": 0.28593693906678225, + "grad_norm": 4.43501615524292, + "learning_rate": 7.143314886164623e-05, + "loss": 1.0735, + "num_input_tokens_seen": 65671016, + "step": 4082 + }, + { + "epoch": 0.2860069873125115, + "grad_norm": 3.739610433578491, + "learning_rate": 7.142615061295972e-05, + "loss": 1.0321, + "num_input_tokens_seen": 65687072, + "step": 4083 + }, + { + "epoch": 0.2860770355582407, + "grad_norm": 3.725759506225586, + "learning_rate": 7.14191523642732e-05, + "loss": 1.0712, + "num_input_tokens_seen": 65703456, + "step": 4084 + }, + { + "epoch": 0.28614708380396997, + "grad_norm": 3.706056594848633, + "learning_rate": 7.14121541155867e-05, + "loss": 1.0643, + "num_input_tokens_seen": 65719552, + "step": 4085 + }, + { + "epoch": 0.28621713204969923, + "grad_norm": 4.971164703369141, + "learning_rate": 7.140515586690018e-05, + "loss": 1.2084, + "num_input_tokens_seen": 65735936, + "step": 4086 + }, + { + "epoch": 0.2862871802954285, + "grad_norm": 7.377131938934326, + "learning_rate": 7.139815761821367e-05, + "loss": 0.8867, + "num_input_tokens_seen": 65752320, + "step": 4087 + }, + { + "epoch": 0.28635722854115775, + "grad_norm": 4.293169975280762, + "learning_rate": 7.139115936952715e-05, + "loss": 1.0805, + "num_input_tokens_seen": 65768704, + "step": 4088 + }, + { + "epoch": 0.28642727678688695, + "grad_norm": 3.4757955074310303, + "learning_rate": 7.138416112084063e-05, + "loss": 0.9749, + "num_input_tokens_seen": 65785088, + "step": 4089 + }, + { + "epoch": 0.2864973250326162, + "grad_norm": 4.5705695152282715, + "learning_rate": 7.137716287215412e-05, + "loss": 1.209, + "num_input_tokens_seen": 65801472, + "step": 4090 + }, + { + "epoch": 0.28656737327834547, + "grad_norm": 5.240487575531006, + "learning_rate": 7.137016462346761e-05, + "loss": 0.9684, + "num_input_tokens_seen": 65817856, + "step": 4091 + }, + { + "epoch": 0.28663742152407473, + "grad_norm": 3.7815425395965576, + "learning_rate": 7.136316637478109e-05, + "loss": 0.9431, + "num_input_tokens_seen": 65833872, + "step": 4092 + }, + { + "epoch": 0.28670746976980394, + "grad_norm": 5.411090850830078, + "learning_rate": 7.135616812609457e-05, + "loss": 1.1237, + "num_input_tokens_seen": 65849064, + "step": 4093 + }, + { + "epoch": 0.2867775180155332, + "grad_norm": 4.07004451751709, + "learning_rate": 7.134916987740806e-05, + "loss": 1.0168, + "num_input_tokens_seen": 65865448, + "step": 4094 + }, + { + "epoch": 0.28684756626126245, + "grad_norm": 3.636051893234253, + "learning_rate": 7.134217162872154e-05, + "loss": 0.9363, + "num_input_tokens_seen": 65881320, + "step": 4095 + }, + { + "epoch": 0.2869176145069917, + "grad_norm": 4.265620708465576, + "learning_rate": 7.133517338003503e-05, + "loss": 1.2098, + "num_input_tokens_seen": 65896832, + "step": 4096 + }, + { + "epoch": 0.2869876627527209, + "grad_norm": 4.145105838775635, + "learning_rate": 7.132817513134852e-05, + "loss": 0.9785, + "num_input_tokens_seen": 65912960, + "step": 4097 + }, + { + "epoch": 0.2870577109984502, + "grad_norm": 3.6198408603668213, + "learning_rate": 7.1321176882662e-05, + "loss": 1.0276, + "num_input_tokens_seen": 65929344, + "step": 4098 + }, + { + "epoch": 0.28712775924417944, + "grad_norm": 4.000823497772217, + "learning_rate": 7.131417863397548e-05, + "loss": 1.2109, + "num_input_tokens_seen": 65945480, + "step": 4099 + }, + { + "epoch": 0.2871978074899087, + "grad_norm": 4.2647271156311035, + "learning_rate": 7.130718038528898e-05, + "loss": 1.1588, + "num_input_tokens_seen": 65961672, + "step": 4100 + }, + { + "epoch": 0.2872678557356379, + "grad_norm": 4.704364776611328, + "learning_rate": 7.130018213660246e-05, + "loss": 1.0707, + "num_input_tokens_seen": 65976848, + "step": 4101 + }, + { + "epoch": 0.28733790398136716, + "grad_norm": 3.8795642852783203, + "learning_rate": 7.129318388791593e-05, + "loss": 1.0087, + "num_input_tokens_seen": 65993120, + "step": 4102 + }, + { + "epoch": 0.2874079522270964, + "grad_norm": 4.356956958770752, + "learning_rate": 7.128618563922942e-05, + "loss": 1.4218, + "num_input_tokens_seen": 66008448, + "step": 4103 + }, + { + "epoch": 0.2874780004728257, + "grad_norm": 3.5145177841186523, + "learning_rate": 7.127918739054291e-05, + "loss": 0.9055, + "num_input_tokens_seen": 66024712, + "step": 4104 + }, + { + "epoch": 0.2875480487185549, + "grad_norm": 3.7384872436523438, + "learning_rate": 7.12721891418564e-05, + "loss": 1.0574, + "num_input_tokens_seen": 66041096, + "step": 4105 + }, + { + "epoch": 0.28761809696428414, + "grad_norm": 3.9706084728240967, + "learning_rate": 7.126519089316989e-05, + "loss": 1.1538, + "num_input_tokens_seen": 66056880, + "step": 4106 + }, + { + "epoch": 0.2876881452100134, + "grad_norm": 3.692093849182129, + "learning_rate": 7.125819264448337e-05, + "loss": 0.9421, + "num_input_tokens_seen": 66073264, + "step": 4107 + }, + { + "epoch": 0.28775819345574266, + "grad_norm": 4.967808246612549, + "learning_rate": 7.125119439579685e-05, + "loss": 0.8829, + "num_input_tokens_seen": 66089648, + "step": 4108 + }, + { + "epoch": 0.28782824170147187, + "grad_norm": 3.8627805709838867, + "learning_rate": 7.124419614711032e-05, + "loss": 1.1056, + "num_input_tokens_seen": 66105992, + "step": 4109 + }, + { + "epoch": 0.2878982899472011, + "grad_norm": 3.7407474517822266, + "learning_rate": 7.123719789842381e-05, + "loss": 1.0241, + "num_input_tokens_seen": 66122040, + "step": 4110 + }, + { + "epoch": 0.2879683381929304, + "grad_norm": 4.028223514556885, + "learning_rate": 7.123019964973732e-05, + "loss": 1.161, + "num_input_tokens_seen": 66138056, + "step": 4111 + }, + { + "epoch": 0.28803838643865964, + "grad_norm": 4.248149394989014, + "learning_rate": 7.122320140105079e-05, + "loss": 1.083, + "num_input_tokens_seen": 66154384, + "step": 4112 + }, + { + "epoch": 0.28810843468438885, + "grad_norm": 3.49904465675354, + "learning_rate": 7.121620315236428e-05, + "loss": 1.0217, + "num_input_tokens_seen": 66170016, + "step": 4113 + }, + { + "epoch": 0.2881784829301181, + "grad_norm": 5.039339542388916, + "learning_rate": 7.120920490367777e-05, + "loss": 0.8658, + "num_input_tokens_seen": 66185744, + "step": 4114 + }, + { + "epoch": 0.28824853117584737, + "grad_norm": 3.800870656967163, + "learning_rate": 7.120220665499124e-05, + "loss": 1.1031, + "num_input_tokens_seen": 66202128, + "step": 4115 + }, + { + "epoch": 0.2883185794215766, + "grad_norm": 4.8073530197143555, + "learning_rate": 7.119520840630473e-05, + "loss": 1.1191, + "num_input_tokens_seen": 66217840, + "step": 4116 + }, + { + "epoch": 0.28838862766730583, + "grad_norm": 3.495415210723877, + "learning_rate": 7.118821015761822e-05, + "loss": 0.8693, + "num_input_tokens_seen": 66234224, + "step": 4117 + }, + { + "epoch": 0.2884586759130351, + "grad_norm": 4.46912956237793, + "learning_rate": 7.118121190893171e-05, + "loss": 1.2077, + "num_input_tokens_seen": 66249968, + "step": 4118 + }, + { + "epoch": 0.28852872415876435, + "grad_norm": 4.553129196166992, + "learning_rate": 7.117421366024518e-05, + "loss": 1.1039, + "num_input_tokens_seen": 66265304, + "step": 4119 + }, + { + "epoch": 0.2885987724044936, + "grad_norm": 3.713836193084717, + "learning_rate": 7.116721541155867e-05, + "loss": 1.0833, + "num_input_tokens_seen": 66281680, + "step": 4120 + }, + { + "epoch": 0.28866882065022287, + "grad_norm": 3.9745819568634033, + "learning_rate": 7.116021716287216e-05, + "loss": 1.1524, + "num_input_tokens_seen": 66298064, + "step": 4121 + }, + { + "epoch": 0.28873886889595207, + "grad_norm": 6.237453937530518, + "learning_rate": 7.115321891418564e-05, + "loss": 1.3598, + "num_input_tokens_seen": 66314448, + "step": 4122 + }, + { + "epoch": 0.28880891714168133, + "grad_norm": 3.7947497367858887, + "learning_rate": 7.114622066549912e-05, + "loss": 0.9342, + "num_input_tokens_seen": 66330832, + "step": 4123 + }, + { + "epoch": 0.2888789653874106, + "grad_norm": 5.574815273284912, + "learning_rate": 7.113922241681261e-05, + "loss": 1.1212, + "num_input_tokens_seen": 66347216, + "step": 4124 + }, + { + "epoch": 0.28894901363313985, + "grad_norm": 3.538344144821167, + "learning_rate": 7.11322241681261e-05, + "loss": 1.0205, + "num_input_tokens_seen": 66363352, + "step": 4125 + }, + { + "epoch": 0.28901906187886905, + "grad_norm": 3.792769193649292, + "learning_rate": 7.112522591943958e-05, + "loss": 1.1266, + "num_input_tokens_seen": 66379736, + "step": 4126 + }, + { + "epoch": 0.2890891101245983, + "grad_norm": 4.527935981750488, + "learning_rate": 7.111822767075308e-05, + "loss": 1.0124, + "num_input_tokens_seen": 66396120, + "step": 4127 + }, + { + "epoch": 0.2891591583703276, + "grad_norm": 3.753326416015625, + "learning_rate": 7.111122942206655e-05, + "loss": 0.9993, + "num_input_tokens_seen": 66412424, + "step": 4128 + }, + { + "epoch": 0.28922920661605683, + "grad_norm": 4.310519218444824, + "learning_rate": 7.110423117338003e-05, + "loss": 1.0481, + "num_input_tokens_seen": 66428176, + "step": 4129 + }, + { + "epoch": 0.28929925486178604, + "grad_norm": 3.9848945140838623, + "learning_rate": 7.109723292469352e-05, + "loss": 1.2687, + "num_input_tokens_seen": 66444560, + "step": 4130 + }, + { + "epoch": 0.2893693031075153, + "grad_norm": 4.654316425323486, + "learning_rate": 7.109023467600702e-05, + "loss": 1.0025, + "num_input_tokens_seen": 66460944, + "step": 4131 + }, + { + "epoch": 0.28943935135324456, + "grad_norm": 4.566670894622803, + "learning_rate": 7.10832364273205e-05, + "loss": 0.9224, + "num_input_tokens_seen": 66475928, + "step": 4132 + }, + { + "epoch": 0.2895093995989738, + "grad_norm": 4.4292988777160645, + "learning_rate": 7.107623817863398e-05, + "loss": 1.0922, + "num_input_tokens_seen": 66491904, + "step": 4133 + }, + { + "epoch": 0.289579447844703, + "grad_norm": 6.520173072814941, + "learning_rate": 7.106923992994747e-05, + "loss": 0.9938, + "num_input_tokens_seen": 66507256, + "step": 4134 + }, + { + "epoch": 0.2896494960904323, + "grad_norm": 3.8424220085144043, + "learning_rate": 7.106224168126095e-05, + "loss": 1.0857, + "num_input_tokens_seen": 66522736, + "step": 4135 + }, + { + "epoch": 0.28971954433616154, + "grad_norm": 4.742796897888184, + "learning_rate": 7.105524343257442e-05, + "loss": 1.0296, + "num_input_tokens_seen": 66538480, + "step": 4136 + }, + { + "epoch": 0.2897895925818908, + "grad_norm": 3.552365779876709, + "learning_rate": 7.104824518388792e-05, + "loss": 1.0597, + "num_input_tokens_seen": 66554576, + "step": 4137 + }, + { + "epoch": 0.28985964082762, + "grad_norm": 6.649835109710693, + "learning_rate": 7.104124693520141e-05, + "loss": 0.9729, + "num_input_tokens_seen": 66570000, + "step": 4138 + }, + { + "epoch": 0.28992968907334926, + "grad_norm": 3.9890356063842773, + "learning_rate": 7.103424868651489e-05, + "loss": 0.9774, + "num_input_tokens_seen": 66585640, + "step": 4139 + }, + { + "epoch": 0.2899997373190785, + "grad_norm": 3.80637526512146, + "learning_rate": 7.102725043782838e-05, + "loss": 1.0373, + "num_input_tokens_seen": 66601696, + "step": 4140 + }, + { + "epoch": 0.2900697855648078, + "grad_norm": 4.089916706085205, + "learning_rate": 7.102025218914186e-05, + "loss": 1.0919, + "num_input_tokens_seen": 66618080, + "step": 4141 + }, + { + "epoch": 0.290139833810537, + "grad_norm": 3.2609710693359375, + "learning_rate": 7.101325394045534e-05, + "loss": 0.9409, + "num_input_tokens_seen": 66634216, + "step": 4142 + }, + { + "epoch": 0.29020988205626624, + "grad_norm": 4.3664093017578125, + "learning_rate": 7.100625569176883e-05, + "loss": 0.9031, + "num_input_tokens_seen": 66650600, + "step": 4143 + }, + { + "epoch": 0.2902799303019955, + "grad_norm": 4.460801124572754, + "learning_rate": 7.099925744308232e-05, + "loss": 1.0582, + "num_input_tokens_seen": 66666592, + "step": 4144 + }, + { + "epoch": 0.29034997854772476, + "grad_norm": 4.474677562713623, + "learning_rate": 7.09922591943958e-05, + "loss": 1.0016, + "num_input_tokens_seen": 66681544, + "step": 4145 + }, + { + "epoch": 0.29042002679345397, + "grad_norm": 3.6482129096984863, + "learning_rate": 7.098526094570928e-05, + "loss": 1.0823, + "num_input_tokens_seen": 66697928, + "step": 4146 + }, + { + "epoch": 0.2904900750391832, + "grad_norm": 3.483290195465088, + "learning_rate": 7.097826269702277e-05, + "loss": 0.8853, + "num_input_tokens_seen": 66714312, + "step": 4147 + }, + { + "epoch": 0.2905601232849125, + "grad_norm": 4.703539848327637, + "learning_rate": 7.097126444833626e-05, + "loss": 0.9718, + "num_input_tokens_seen": 66729632, + "step": 4148 + }, + { + "epoch": 0.29063017153064175, + "grad_norm": 3.8614907264709473, + "learning_rate": 7.096426619964973e-05, + "loss": 1.0047, + "num_input_tokens_seen": 66746016, + "step": 4149 + }, + { + "epoch": 0.29070021977637095, + "grad_norm": 3.612683057785034, + "learning_rate": 7.095726795096322e-05, + "loss": 1.1783, + "num_input_tokens_seen": 66762400, + "step": 4150 + }, + { + "epoch": 0.2907702680221002, + "grad_norm": 3.980149984359741, + "learning_rate": 7.095026970227672e-05, + "loss": 0.9993, + "num_input_tokens_seen": 66778392, + "step": 4151 + }, + { + "epoch": 0.29084031626782947, + "grad_norm": 3.857588052749634, + "learning_rate": 7.09432714535902e-05, + "loss": 1.0506, + "num_input_tokens_seen": 66794200, + "step": 4152 + }, + { + "epoch": 0.29091036451355873, + "grad_norm": 5.106949806213379, + "learning_rate": 7.093627320490367e-05, + "loss": 1.2222, + "num_input_tokens_seen": 66810584, + "step": 4153 + }, + { + "epoch": 0.29098041275928793, + "grad_norm": 4.338438987731934, + "learning_rate": 7.092927495621718e-05, + "loss": 1.1203, + "num_input_tokens_seen": 66826208, + "step": 4154 + }, + { + "epoch": 0.2910504610050172, + "grad_norm": 3.962877035140991, + "learning_rate": 7.092227670753065e-05, + "loss": 1.1026, + "num_input_tokens_seen": 66842592, + "step": 4155 + }, + { + "epoch": 0.29112050925074645, + "grad_norm": 3.8490965366363525, + "learning_rate": 7.091527845884413e-05, + "loss": 0.9551, + "num_input_tokens_seen": 66858832, + "step": 4156 + }, + { + "epoch": 0.2911905574964757, + "grad_norm": 4.559625148773193, + "learning_rate": 7.090828021015763e-05, + "loss": 1.3951, + "num_input_tokens_seen": 66875216, + "step": 4157 + }, + { + "epoch": 0.29126060574220497, + "grad_norm": 8.37543773651123, + "learning_rate": 7.090128196147112e-05, + "loss": 1.2365, + "num_input_tokens_seen": 66891600, + "step": 4158 + }, + { + "epoch": 0.2913306539879342, + "grad_norm": 4.128559112548828, + "learning_rate": 7.089428371278459e-05, + "loss": 0.8789, + "num_input_tokens_seen": 66907984, + "step": 4159 + }, + { + "epoch": 0.29140070223366343, + "grad_norm": 4.81403112411499, + "learning_rate": 7.088728546409808e-05, + "loss": 1.1149, + "num_input_tokens_seen": 66923240, + "step": 4160 + }, + { + "epoch": 0.2914707504793927, + "grad_norm": 4.534300804138184, + "learning_rate": 7.088028721541157e-05, + "loss": 0.8906, + "num_input_tokens_seen": 66939624, + "step": 4161 + }, + { + "epoch": 0.29154079872512195, + "grad_norm": 4.46708869934082, + "learning_rate": 7.087328896672504e-05, + "loss": 0.873, + "num_input_tokens_seen": 66955968, + "step": 4162 + }, + { + "epoch": 0.29161084697085116, + "grad_norm": 4.142822265625, + "learning_rate": 7.086629071803853e-05, + "loss": 0.8286, + "num_input_tokens_seen": 66971680, + "step": 4163 + }, + { + "epoch": 0.2916808952165804, + "grad_norm": 3.686167001724243, + "learning_rate": 7.085929246935202e-05, + "loss": 0.897, + "num_input_tokens_seen": 66987952, + "step": 4164 + }, + { + "epoch": 0.2917509434623097, + "grad_norm": 8.076430320739746, + "learning_rate": 7.085229422066551e-05, + "loss": 1.1215, + "num_input_tokens_seen": 67004336, + "step": 4165 + }, + { + "epoch": 0.29182099170803893, + "grad_norm": 8.69857120513916, + "learning_rate": 7.084529597197898e-05, + "loss": 1.2295, + "num_input_tokens_seen": 67020216, + "step": 4166 + }, + { + "epoch": 0.29189103995376814, + "grad_norm": 3.7867684364318848, + "learning_rate": 7.083829772329247e-05, + "loss": 1.058, + "num_input_tokens_seen": 67035600, + "step": 4167 + }, + { + "epoch": 0.2919610881994974, + "grad_norm": 5.560591697692871, + "learning_rate": 7.083129947460596e-05, + "loss": 1.0864, + "num_input_tokens_seen": 67051680, + "step": 4168 + }, + { + "epoch": 0.29203113644522666, + "grad_norm": 3.857120990753174, + "learning_rate": 7.082430122591944e-05, + "loss": 1.1991, + "num_input_tokens_seen": 67068064, + "step": 4169 + }, + { + "epoch": 0.2921011846909559, + "grad_norm": 4.343360900878906, + "learning_rate": 7.081730297723293e-05, + "loss": 1.0973, + "num_input_tokens_seen": 67084448, + "step": 4170 + }, + { + "epoch": 0.2921712329366851, + "grad_norm": 4.198531150817871, + "learning_rate": 7.081030472854643e-05, + "loss": 1.1271, + "num_input_tokens_seen": 67100832, + "step": 4171 + }, + { + "epoch": 0.2922412811824144, + "grad_norm": 3.539684772491455, + "learning_rate": 7.08033064798599e-05, + "loss": 0.9532, + "num_input_tokens_seen": 67117216, + "step": 4172 + }, + { + "epoch": 0.29231132942814364, + "grad_norm": 4.2374444007873535, + "learning_rate": 7.079630823117338e-05, + "loss": 0.9965, + "num_input_tokens_seen": 67133600, + "step": 4173 + }, + { + "epoch": 0.2923813776738729, + "grad_norm": 4.106996059417725, + "learning_rate": 7.078930998248687e-05, + "loss": 1.1141, + "num_input_tokens_seen": 67149984, + "step": 4174 + }, + { + "epoch": 0.2924514259196021, + "grad_norm": 3.7100484371185303, + "learning_rate": 7.078231173380035e-05, + "loss": 1.0702, + "num_input_tokens_seen": 67166168, + "step": 4175 + }, + { + "epoch": 0.29252147416533136, + "grad_norm": 5.189118385314941, + "learning_rate": 7.077531348511383e-05, + "loss": 0.9642, + "num_input_tokens_seen": 67181472, + "step": 4176 + }, + { + "epoch": 0.2925915224110606, + "grad_norm": 4.540155410766602, + "learning_rate": 7.076831523642733e-05, + "loss": 1.0558, + "num_input_tokens_seen": 67197856, + "step": 4177 + }, + { + "epoch": 0.2926615706567899, + "grad_norm": 4.748345375061035, + "learning_rate": 7.076131698774082e-05, + "loss": 0.8845, + "num_input_tokens_seen": 67214240, + "step": 4178 + }, + { + "epoch": 0.2927316189025191, + "grad_norm": 4.252089023590088, + "learning_rate": 7.07543187390543e-05, + "loss": 1.1002, + "num_input_tokens_seen": 67230312, + "step": 4179 + }, + { + "epoch": 0.29280166714824835, + "grad_norm": 4.273370742797852, + "learning_rate": 7.074732049036777e-05, + "loss": 1.1759, + "num_input_tokens_seen": 67246152, + "step": 4180 + }, + { + "epoch": 0.2928717153939776, + "grad_norm": 3.9271481037139893, + "learning_rate": 7.074032224168127e-05, + "loss": 1.0159, + "num_input_tokens_seen": 67261688, + "step": 4181 + }, + { + "epoch": 0.29294176363970686, + "grad_norm": 3.875622034072876, + "learning_rate": 7.073332399299475e-05, + "loss": 1.2345, + "num_input_tokens_seen": 67278072, + "step": 4182 + }, + { + "epoch": 0.29301181188543607, + "grad_norm": 3.8089005947113037, + "learning_rate": 7.072632574430824e-05, + "loss": 1.1025, + "num_input_tokens_seen": 67293760, + "step": 4183 + }, + { + "epoch": 0.29308186013116533, + "grad_norm": 4.402803421020508, + "learning_rate": 7.071932749562172e-05, + "loss": 1.0397, + "num_input_tokens_seen": 67310144, + "step": 4184 + }, + { + "epoch": 0.2931519083768946, + "grad_norm": 4.4534783363342285, + "learning_rate": 7.071232924693521e-05, + "loss": 1.0222, + "num_input_tokens_seen": 67326528, + "step": 4185 + }, + { + "epoch": 0.29322195662262385, + "grad_norm": 4.247747898101807, + "learning_rate": 7.070533099824869e-05, + "loss": 1.0667, + "num_input_tokens_seen": 67342080, + "step": 4186 + }, + { + "epoch": 0.29329200486835305, + "grad_norm": 5.280468463897705, + "learning_rate": 7.069833274956218e-05, + "loss": 1.0492, + "num_input_tokens_seen": 67357168, + "step": 4187 + }, + { + "epoch": 0.2933620531140823, + "grad_norm": 5.14320707321167, + "learning_rate": 7.069133450087567e-05, + "loss": 1.1073, + "num_input_tokens_seen": 67373552, + "step": 4188 + }, + { + "epoch": 0.29343210135981157, + "grad_norm": 4.131645679473877, + "learning_rate": 7.068433625218914e-05, + "loss": 1.3795, + "num_input_tokens_seen": 67389936, + "step": 4189 + }, + { + "epoch": 0.29350214960554083, + "grad_norm": 4.727990627288818, + "learning_rate": 7.067733800350263e-05, + "loss": 1.2066, + "num_input_tokens_seen": 67406320, + "step": 4190 + }, + { + "epoch": 0.2935721978512701, + "grad_norm": 5.857666969299316, + "learning_rate": 7.067033975481612e-05, + "loss": 1.028, + "num_input_tokens_seen": 67422680, + "step": 4191 + }, + { + "epoch": 0.2936422460969993, + "grad_norm": 4.185948371887207, + "learning_rate": 7.06633415061296e-05, + "loss": 1.2738, + "num_input_tokens_seen": 67439064, + "step": 4192 + }, + { + "epoch": 0.29371229434272855, + "grad_norm": 3.749274969100952, + "learning_rate": 7.065634325744308e-05, + "loss": 1.0327, + "num_input_tokens_seen": 67454680, + "step": 4193 + }, + { + "epoch": 0.2937823425884578, + "grad_norm": 4.332368850708008, + "learning_rate": 7.064934500875657e-05, + "loss": 1.0986, + "num_input_tokens_seen": 67470800, + "step": 4194 + }, + { + "epoch": 0.29385239083418707, + "grad_norm": 5.514054775238037, + "learning_rate": 7.064234676007006e-05, + "loss": 1.2602, + "num_input_tokens_seen": 67487184, + "step": 4195 + }, + { + "epoch": 0.2939224390799163, + "grad_norm": 4.534146785736084, + "learning_rate": 7.063534851138353e-05, + "loss": 1.2929, + "num_input_tokens_seen": 67503504, + "step": 4196 + }, + { + "epoch": 0.29399248732564554, + "grad_norm": 4.86776876449585, + "learning_rate": 7.062835026269702e-05, + "loss": 1.111, + "num_input_tokens_seen": 67519056, + "step": 4197 + }, + { + "epoch": 0.2940625355713748, + "grad_norm": 3.8528504371643066, + "learning_rate": 7.062135201401052e-05, + "loss": 0.9151, + "num_input_tokens_seen": 67535440, + "step": 4198 + }, + { + "epoch": 0.29413258381710405, + "grad_norm": 4.244069576263428, + "learning_rate": 7.0614353765324e-05, + "loss": 1.1733, + "num_input_tokens_seen": 67551264, + "step": 4199 + }, + { + "epoch": 0.29420263206283326, + "grad_norm": 3.5963211059570312, + "learning_rate": 7.060735551663747e-05, + "loss": 1.008, + "num_input_tokens_seen": 67567648, + "step": 4200 + }, + { + "epoch": 0.29420263206283326, + "eval_loss": 1.1331984996795654, + "eval_runtime": 0.203, + "eval_samples_per_second": 4.927, + "eval_steps_per_second": 4.927, + "num_input_tokens_seen": 67567648, + "step": 4200 + }, + { + "epoch": 0.2942726803085625, + "grad_norm": 4.51765775680542, + "learning_rate": 7.060035726795096e-05, + "loss": 1.1284, + "num_input_tokens_seen": 67583792, + "step": 4201 + }, + { + "epoch": 0.2943427285542918, + "grad_norm": 4.541067123413086, + "learning_rate": 7.059335901926445e-05, + "loss": 1.1246, + "num_input_tokens_seen": 67599856, + "step": 4202 + }, + { + "epoch": 0.29441277680002104, + "grad_norm": 4.095570087432861, + "learning_rate": 7.058636077057794e-05, + "loss": 1.0087, + "num_input_tokens_seen": 67616240, + "step": 4203 + }, + { + "epoch": 0.29448282504575024, + "grad_norm": 4.616795539855957, + "learning_rate": 7.057936252189143e-05, + "loss": 1.2549, + "num_input_tokens_seen": 67632496, + "step": 4204 + }, + { + "epoch": 0.2945528732914795, + "grad_norm": 3.8619420528411865, + "learning_rate": 7.057236427320492e-05, + "loss": 0.9626, + "num_input_tokens_seen": 67648880, + "step": 4205 + }, + { + "epoch": 0.29462292153720876, + "grad_norm": 4.194519996643066, + "learning_rate": 7.056536602451839e-05, + "loss": 0.958, + "num_input_tokens_seen": 67665264, + "step": 4206 + }, + { + "epoch": 0.294692969782938, + "grad_norm": 4.835122585296631, + "learning_rate": 7.055836777583187e-05, + "loss": 1.0201, + "num_input_tokens_seen": 67681648, + "step": 4207 + }, + { + "epoch": 0.2947630180286672, + "grad_norm": 4.2085280418396, + "learning_rate": 7.055136952714537e-05, + "loss": 0.9584, + "num_input_tokens_seen": 67697960, + "step": 4208 + }, + { + "epoch": 0.2948330662743965, + "grad_norm": 4.439855575561523, + "learning_rate": 7.054437127845884e-05, + "loss": 1.0693, + "num_input_tokens_seen": 67714344, + "step": 4209 + }, + { + "epoch": 0.29490311452012574, + "grad_norm": 5.427484035491943, + "learning_rate": 7.053737302977233e-05, + "loss": 1.1782, + "num_input_tokens_seen": 67730728, + "step": 4210 + }, + { + "epoch": 0.294973162765855, + "grad_norm": 3.6627275943756104, + "learning_rate": 7.053037478108582e-05, + "loss": 0.8507, + "num_input_tokens_seen": 67746704, + "step": 4211 + }, + { + "epoch": 0.2950432110115842, + "grad_norm": 4.450380325317383, + "learning_rate": 7.052337653239931e-05, + "loss": 1.0651, + "num_input_tokens_seen": 67762320, + "step": 4212 + }, + { + "epoch": 0.29511325925731346, + "grad_norm": 3.6644749641418457, + "learning_rate": 7.051637828371279e-05, + "loss": 1.1532, + "num_input_tokens_seen": 67778704, + "step": 4213 + }, + { + "epoch": 0.2951833075030427, + "grad_norm": 4.331392288208008, + "learning_rate": 7.050938003502627e-05, + "loss": 1.0854, + "num_input_tokens_seen": 67795088, + "step": 4214 + }, + { + "epoch": 0.295253355748772, + "grad_norm": 4.157777786254883, + "learning_rate": 7.050238178633976e-05, + "loss": 1.2039, + "num_input_tokens_seen": 67811472, + "step": 4215 + }, + { + "epoch": 0.2953234039945012, + "grad_norm": 3.858069896697998, + "learning_rate": 7.049538353765324e-05, + "loss": 1.1751, + "num_input_tokens_seen": 67827488, + "step": 4216 + }, + { + "epoch": 0.29539345224023045, + "grad_norm": 4.279262542724609, + "learning_rate": 7.048838528896673e-05, + "loss": 1.0344, + "num_input_tokens_seen": 67843872, + "step": 4217 + }, + { + "epoch": 0.2954635004859597, + "grad_norm": 4.539918422698975, + "learning_rate": 7.048138704028021e-05, + "loss": 1.0244, + "num_input_tokens_seen": 67860256, + "step": 4218 + }, + { + "epoch": 0.29553354873168897, + "grad_norm": 3.738811492919922, + "learning_rate": 7.04743887915937e-05, + "loss": 1.067, + "num_input_tokens_seen": 67876224, + "step": 4219 + }, + { + "epoch": 0.29560359697741817, + "grad_norm": 4.634495258331299, + "learning_rate": 7.046739054290718e-05, + "loss": 0.9273, + "num_input_tokens_seen": 67892040, + "step": 4220 + }, + { + "epoch": 0.29567364522314743, + "grad_norm": 5.988262176513672, + "learning_rate": 7.046039229422067e-05, + "loss": 0.956, + "num_input_tokens_seen": 67908424, + "step": 4221 + }, + { + "epoch": 0.2957436934688767, + "grad_norm": 7.2220258712768555, + "learning_rate": 7.045339404553416e-05, + "loss": 1.246, + "num_input_tokens_seen": 67924808, + "step": 4222 + }, + { + "epoch": 0.29581374171460595, + "grad_norm": 8.866394996643066, + "learning_rate": 7.044639579684764e-05, + "loss": 0.9932, + "num_input_tokens_seen": 67941192, + "step": 4223 + }, + { + "epoch": 0.2958837899603352, + "grad_norm": 4.791526794433594, + "learning_rate": 7.043939754816112e-05, + "loss": 1.1966, + "num_input_tokens_seen": 67957576, + "step": 4224 + }, + { + "epoch": 0.2959538382060644, + "grad_norm": 3.8345704078674316, + "learning_rate": 7.043239929947462e-05, + "loss": 0.9754, + "num_input_tokens_seen": 67973112, + "step": 4225 + }, + { + "epoch": 0.29602388645179367, + "grad_norm": 5.0572099685668945, + "learning_rate": 7.04254010507881e-05, + "loss": 1.3761, + "num_input_tokens_seen": 67989360, + "step": 4226 + }, + { + "epoch": 0.29609393469752293, + "grad_norm": 4.467088222503662, + "learning_rate": 7.041840280210157e-05, + "loss": 0.981, + "num_input_tokens_seen": 68005744, + "step": 4227 + }, + { + "epoch": 0.2961639829432522, + "grad_norm": 6.415910243988037, + "learning_rate": 7.041140455341506e-05, + "loss": 1.1376, + "num_input_tokens_seen": 68021592, + "step": 4228 + }, + { + "epoch": 0.2962340311889814, + "grad_norm": 4.432079315185547, + "learning_rate": 7.040440630472855e-05, + "loss": 1.1264, + "num_input_tokens_seen": 68037976, + "step": 4229 + }, + { + "epoch": 0.29630407943471065, + "grad_norm": 4.207062721252441, + "learning_rate": 7.039740805604204e-05, + "loss": 1.2702, + "num_input_tokens_seen": 68054328, + "step": 4230 + }, + { + "epoch": 0.2963741276804399, + "grad_norm": 4.825972557067871, + "learning_rate": 7.039040980735553e-05, + "loss": 1.3091, + "num_input_tokens_seen": 68070416, + "step": 4231 + }, + { + "epoch": 0.2964441759261692, + "grad_norm": 3.917593002319336, + "learning_rate": 7.038341155866901e-05, + "loss": 1.1863, + "num_input_tokens_seen": 68086800, + "step": 4232 + }, + { + "epoch": 0.2965142241718984, + "grad_norm": 3.8865675926208496, + "learning_rate": 7.037641330998249e-05, + "loss": 1.2023, + "num_input_tokens_seen": 68103184, + "step": 4233 + }, + { + "epoch": 0.29658427241762764, + "grad_norm": 3.8321971893310547, + "learning_rate": 7.036941506129596e-05, + "loss": 0.9507, + "num_input_tokens_seen": 68119568, + "step": 4234 + }, + { + "epoch": 0.2966543206633569, + "grad_norm": 5.020960807800293, + "learning_rate": 7.036241681260947e-05, + "loss": 1.0438, + "num_input_tokens_seen": 68135416, + "step": 4235 + }, + { + "epoch": 0.29672436890908616, + "grad_norm": 3.653468608856201, + "learning_rate": 7.035541856392294e-05, + "loss": 1.0664, + "num_input_tokens_seen": 68151800, + "step": 4236 + }, + { + "epoch": 0.29679441715481536, + "grad_norm": 3.8133575916290283, + "learning_rate": 7.034842031523643e-05, + "loss": 1.0867, + "num_input_tokens_seen": 68168184, + "step": 4237 + }, + { + "epoch": 0.2968644654005446, + "grad_norm": 3.6642141342163086, + "learning_rate": 7.034142206654992e-05, + "loss": 0.9505, + "num_input_tokens_seen": 68184080, + "step": 4238 + }, + { + "epoch": 0.2969345136462739, + "grad_norm": 4.362963676452637, + "learning_rate": 7.033442381786341e-05, + "loss": 1.0335, + "num_input_tokens_seen": 68199928, + "step": 4239 + }, + { + "epoch": 0.29700456189200314, + "grad_norm": 3.6831562519073486, + "learning_rate": 7.032742556917688e-05, + "loss": 0.9608, + "num_input_tokens_seen": 68215952, + "step": 4240 + }, + { + "epoch": 0.29707461013773234, + "grad_norm": 4.906534194946289, + "learning_rate": 7.032042732049037e-05, + "loss": 0.9434, + "num_input_tokens_seen": 68232336, + "step": 4241 + }, + { + "epoch": 0.2971446583834616, + "grad_norm": 3.446749687194824, + "learning_rate": 7.031342907180386e-05, + "loss": 0.8306, + "num_input_tokens_seen": 68247832, + "step": 4242 + }, + { + "epoch": 0.29721470662919086, + "grad_norm": 4.729014873504639, + "learning_rate": 7.030643082311735e-05, + "loss": 1.0787, + "num_input_tokens_seen": 68264216, + "step": 4243 + }, + { + "epoch": 0.2972847548749201, + "grad_norm": 4.196920871734619, + "learning_rate": 7.029943257443082e-05, + "loss": 1.1496, + "num_input_tokens_seen": 68280600, + "step": 4244 + }, + { + "epoch": 0.2973548031206493, + "grad_norm": 7.193357467651367, + "learning_rate": 7.029243432574431e-05, + "loss": 1.0509, + "num_input_tokens_seen": 68296984, + "step": 4245 + }, + { + "epoch": 0.2974248513663786, + "grad_norm": 4.00344181060791, + "learning_rate": 7.02854360770578e-05, + "loss": 0.9025, + "num_input_tokens_seen": 68312720, + "step": 4246 + }, + { + "epoch": 0.29749489961210784, + "grad_norm": 4.04103422164917, + "learning_rate": 7.027843782837128e-05, + "loss": 1.1307, + "num_input_tokens_seen": 68328608, + "step": 4247 + }, + { + "epoch": 0.2975649478578371, + "grad_norm": 4.010391712188721, + "learning_rate": 7.027143957968476e-05, + "loss": 1.153, + "num_input_tokens_seen": 68343288, + "step": 4248 + }, + { + "epoch": 0.2976349961035663, + "grad_norm": 6.364760398864746, + "learning_rate": 7.026444133099825e-05, + "loss": 1.1629, + "num_input_tokens_seen": 68359672, + "step": 4249 + }, + { + "epoch": 0.29770504434929557, + "grad_norm": 5.682034969329834, + "learning_rate": 7.025744308231174e-05, + "loss": 1.1388, + "num_input_tokens_seen": 68376056, + "step": 4250 + }, + { + "epoch": 0.2977750925950248, + "grad_norm": 3.6160550117492676, + "learning_rate": 7.025044483362522e-05, + "loss": 1.0105, + "num_input_tokens_seen": 68392440, + "step": 4251 + }, + { + "epoch": 0.2978451408407541, + "grad_norm": 4.839343070983887, + "learning_rate": 7.024344658493872e-05, + "loss": 0.9924, + "num_input_tokens_seen": 68408608, + "step": 4252 + }, + { + "epoch": 0.2979151890864833, + "grad_norm": 5.255819320678711, + "learning_rate": 7.02364483362522e-05, + "loss": 1.1425, + "num_input_tokens_seen": 68424944, + "step": 4253 + }, + { + "epoch": 0.29798523733221255, + "grad_norm": 3.7549142837524414, + "learning_rate": 7.022945008756567e-05, + "loss": 0.8801, + "num_input_tokens_seen": 68441328, + "step": 4254 + }, + { + "epoch": 0.2980552855779418, + "grad_norm": 5.159091472625732, + "learning_rate": 7.022245183887916e-05, + "loss": 1.0075, + "num_input_tokens_seen": 68457712, + "step": 4255 + }, + { + "epoch": 0.29812533382367107, + "grad_norm": 3.8031342029571533, + "learning_rate": 7.021545359019265e-05, + "loss": 0.9975, + "num_input_tokens_seen": 68474072, + "step": 4256 + }, + { + "epoch": 0.29819538206940027, + "grad_norm": 6.039318084716797, + "learning_rate": 7.020845534150613e-05, + "loss": 1.0791, + "num_input_tokens_seen": 68490456, + "step": 4257 + }, + { + "epoch": 0.29826543031512953, + "grad_norm": 3.9376237392425537, + "learning_rate": 7.020145709281962e-05, + "loss": 1.0753, + "num_input_tokens_seen": 68506760, + "step": 4258 + }, + { + "epoch": 0.2983354785608588, + "grad_norm": 4.599661827087402, + "learning_rate": 7.019445884413311e-05, + "loss": 0.9722, + "num_input_tokens_seen": 68523144, + "step": 4259 + }, + { + "epoch": 0.29840552680658805, + "grad_norm": 3.743640661239624, + "learning_rate": 7.018746059544659e-05, + "loss": 1.157, + "num_input_tokens_seen": 68539448, + "step": 4260 + }, + { + "epoch": 0.2984755750523173, + "grad_norm": 6.111955642700195, + "learning_rate": 7.018046234676006e-05, + "loss": 1.2148, + "num_input_tokens_seen": 68555832, + "step": 4261 + }, + { + "epoch": 0.2985456232980465, + "grad_norm": 4.297199249267578, + "learning_rate": 7.017346409807356e-05, + "loss": 0.9796, + "num_input_tokens_seen": 68572216, + "step": 4262 + }, + { + "epoch": 0.2986156715437758, + "grad_norm": 4.126640319824219, + "learning_rate": 7.016646584938705e-05, + "loss": 1.0781, + "num_input_tokens_seen": 68588600, + "step": 4263 + }, + { + "epoch": 0.29868571978950503, + "grad_norm": 3.8142640590667725, + "learning_rate": 7.015946760070053e-05, + "loss": 1.1943, + "num_input_tokens_seen": 68604336, + "step": 4264 + }, + { + "epoch": 0.2987557680352343, + "grad_norm": 3.9500539302825928, + "learning_rate": 7.015246935201402e-05, + "loss": 1.1179, + "num_input_tokens_seen": 68620056, + "step": 4265 + }, + { + "epoch": 0.2988258162809635, + "grad_norm": 4.431976318359375, + "learning_rate": 7.01454711033275e-05, + "loss": 1.3419, + "num_input_tokens_seen": 68636328, + "step": 4266 + }, + { + "epoch": 0.29889586452669276, + "grad_norm": 5.619480609893799, + "learning_rate": 7.013847285464098e-05, + "loss": 1.099, + "num_input_tokens_seen": 68651984, + "step": 4267 + }, + { + "epoch": 0.298965912772422, + "grad_norm": 3.8473827838897705, + "learning_rate": 7.013147460595447e-05, + "loss": 1.1273, + "num_input_tokens_seen": 68668176, + "step": 4268 + }, + { + "epoch": 0.2990359610181513, + "grad_norm": 5.942142486572266, + "learning_rate": 7.012447635726796e-05, + "loss": 1.1058, + "num_input_tokens_seen": 68684560, + "step": 4269 + }, + { + "epoch": 0.2991060092638805, + "grad_norm": 6.194666862487793, + "learning_rate": 7.011747810858145e-05, + "loss": 0.9782, + "num_input_tokens_seen": 68699816, + "step": 4270 + }, + { + "epoch": 0.29917605750960974, + "grad_norm": 4.336294651031494, + "learning_rate": 7.011047985989492e-05, + "loss": 1.0038, + "num_input_tokens_seen": 68716200, + "step": 4271 + }, + { + "epoch": 0.299246105755339, + "grad_norm": 4.277907371520996, + "learning_rate": 7.010348161120841e-05, + "loss": 1.0151, + "num_input_tokens_seen": 68732584, + "step": 4272 + }, + { + "epoch": 0.29931615400106826, + "grad_norm": 5.045118808746338, + "learning_rate": 7.00964833625219e-05, + "loss": 1.0992, + "num_input_tokens_seen": 68748840, + "step": 4273 + }, + { + "epoch": 0.29938620224679746, + "grad_norm": 4.3978400230407715, + "learning_rate": 7.008948511383537e-05, + "loss": 1.1796, + "num_input_tokens_seen": 68765224, + "step": 4274 + }, + { + "epoch": 0.2994562504925267, + "grad_norm": 5.052615165710449, + "learning_rate": 7.008248686514886e-05, + "loss": 1.0557, + "num_input_tokens_seen": 68780808, + "step": 4275 + }, + { + "epoch": 0.299526298738256, + "grad_norm": 6.902999401092529, + "learning_rate": 7.007548861646235e-05, + "loss": 1.0952, + "num_input_tokens_seen": 68797064, + "step": 4276 + }, + { + "epoch": 0.29959634698398524, + "grad_norm": 5.947190761566162, + "learning_rate": 7.006849036777584e-05, + "loss": 1.1163, + "num_input_tokens_seen": 68812904, + "step": 4277 + }, + { + "epoch": 0.29966639522971444, + "grad_norm": 5.443974018096924, + "learning_rate": 7.006149211908931e-05, + "loss": 1.146, + "num_input_tokens_seen": 68828736, + "step": 4278 + }, + { + "epoch": 0.2997364434754437, + "grad_norm": 3.9849112033843994, + "learning_rate": 7.005449387040282e-05, + "loss": 0.9636, + "num_input_tokens_seen": 68843928, + "step": 4279 + }, + { + "epoch": 0.29980649172117296, + "grad_norm": 5.787483215332031, + "learning_rate": 7.004749562171629e-05, + "loss": 1.194, + "num_input_tokens_seen": 68860312, + "step": 4280 + }, + { + "epoch": 0.2998765399669022, + "grad_norm": 3.8437387943267822, + "learning_rate": 7.004049737302977e-05, + "loss": 0.9214, + "num_input_tokens_seen": 68876696, + "step": 4281 + }, + { + "epoch": 0.2999465882126314, + "grad_norm": 3.94879150390625, + "learning_rate": 7.003349912434325e-05, + "loss": 1.1403, + "num_input_tokens_seen": 68893080, + "step": 4282 + }, + { + "epoch": 0.3000166364583607, + "grad_norm": 4.746649265289307, + "learning_rate": 7.002650087565676e-05, + "loss": 1.077, + "num_input_tokens_seen": 68909464, + "step": 4283 + }, + { + "epoch": 0.30008668470408995, + "grad_norm": 4.1024861335754395, + "learning_rate": 7.001950262697023e-05, + "loss": 1.0592, + "num_input_tokens_seen": 68925352, + "step": 4284 + }, + { + "epoch": 0.3001567329498192, + "grad_norm": 4.5073699951171875, + "learning_rate": 7.001250437828372e-05, + "loss": 1.2092, + "num_input_tokens_seen": 68941736, + "step": 4285 + }, + { + "epoch": 0.3002267811955484, + "grad_norm": 4.947534561157227, + "learning_rate": 7.000550612959721e-05, + "loss": 1.1389, + "num_input_tokens_seen": 68958120, + "step": 4286 + }, + { + "epoch": 0.30029682944127767, + "grad_norm": 3.8399429321289062, + "learning_rate": 6.999850788091068e-05, + "loss": 1.1268, + "num_input_tokens_seen": 68974232, + "step": 4287 + }, + { + "epoch": 0.30036687768700693, + "grad_norm": 3.9180405139923096, + "learning_rate": 6.999150963222416e-05, + "loss": 1.1666, + "num_input_tokens_seen": 68990616, + "step": 4288 + }, + { + "epoch": 0.3004369259327362, + "grad_norm": 3.9542794227600098, + "learning_rate": 6.998451138353766e-05, + "loss": 1.1474, + "num_input_tokens_seen": 69006952, + "step": 4289 + }, + { + "epoch": 0.3005069741784654, + "grad_norm": 3.5275325775146484, + "learning_rate": 6.997751313485115e-05, + "loss": 1.1239, + "num_input_tokens_seen": 69023336, + "step": 4290 + }, + { + "epoch": 0.30057702242419465, + "grad_norm": 3.9485349655151367, + "learning_rate": 6.997051488616462e-05, + "loss": 0.9736, + "num_input_tokens_seen": 69038392, + "step": 4291 + }, + { + "epoch": 0.3006470706699239, + "grad_norm": 3.4944114685058594, + "learning_rate": 6.996351663747811e-05, + "loss": 0.7473, + "num_input_tokens_seen": 69054160, + "step": 4292 + }, + { + "epoch": 0.30071711891565317, + "grad_norm": 3.387148380279541, + "learning_rate": 6.99565183887916e-05, + "loss": 0.9142, + "num_input_tokens_seen": 69070056, + "step": 4293 + }, + { + "epoch": 0.30078716716138243, + "grad_norm": 3.9591586589813232, + "learning_rate": 6.994952014010508e-05, + "loss": 1.133, + "num_input_tokens_seen": 69086240, + "step": 4294 + }, + { + "epoch": 0.30085721540711163, + "grad_norm": 8.32682991027832, + "learning_rate": 6.994252189141857e-05, + "loss": 1.1697, + "num_input_tokens_seen": 69102408, + "step": 4295 + }, + { + "epoch": 0.3009272636528409, + "grad_norm": 3.5885214805603027, + "learning_rate": 6.993552364273205e-05, + "loss": 1.0626, + "num_input_tokens_seen": 69118376, + "step": 4296 + }, + { + "epoch": 0.30099731189857015, + "grad_norm": 4.784765243530273, + "learning_rate": 6.992852539404554e-05, + "loss": 0.9771, + "num_input_tokens_seen": 69133664, + "step": 4297 + }, + { + "epoch": 0.3010673601442994, + "grad_norm": 6.456319808959961, + "learning_rate": 6.992152714535902e-05, + "loss": 1.3836, + "num_input_tokens_seen": 69148224, + "step": 4298 + }, + { + "epoch": 0.3011374083900286, + "grad_norm": 5.820954322814941, + "learning_rate": 6.99145288966725e-05, + "loss": 0.9987, + "num_input_tokens_seen": 69164440, + "step": 4299 + }, + { + "epoch": 0.3012074566357579, + "grad_norm": 6.690483570098877, + "learning_rate": 6.9907530647986e-05, + "loss": 1.1583, + "num_input_tokens_seen": 69180824, + "step": 4300 + }, + { + "epoch": 0.30127750488148713, + "grad_norm": 3.8018131256103516, + "learning_rate": 6.990053239929947e-05, + "loss": 1.1643, + "num_input_tokens_seen": 69197016, + "step": 4301 + }, + { + "epoch": 0.3013475531272164, + "grad_norm": 4.574918746948242, + "learning_rate": 6.989353415061296e-05, + "loss": 1.168, + "num_input_tokens_seen": 69213400, + "step": 4302 + }, + { + "epoch": 0.3014176013729456, + "grad_norm": 3.3843026161193848, + "learning_rate": 6.988653590192646e-05, + "loss": 0.9762, + "num_input_tokens_seen": 69229784, + "step": 4303 + }, + { + "epoch": 0.30148764961867486, + "grad_norm": 6.179981708526611, + "learning_rate": 6.987953765323994e-05, + "loss": 1.173, + "num_input_tokens_seen": 69246168, + "step": 4304 + }, + { + "epoch": 0.3015576978644041, + "grad_norm": 4.759994029998779, + "learning_rate": 6.987253940455341e-05, + "loss": 0.947, + "num_input_tokens_seen": 69262512, + "step": 4305 + }, + { + "epoch": 0.3016277461101334, + "grad_norm": 3.719902992248535, + "learning_rate": 6.986554115586691e-05, + "loss": 1.1882, + "num_input_tokens_seen": 69278496, + "step": 4306 + }, + { + "epoch": 0.3016977943558626, + "grad_norm": 3.6757240295410156, + "learning_rate": 6.985854290718039e-05, + "loss": 1.1506, + "num_input_tokens_seen": 69294880, + "step": 4307 + }, + { + "epoch": 0.30176784260159184, + "grad_norm": 4.316056251525879, + "learning_rate": 6.985154465849386e-05, + "loss": 0.8921, + "num_input_tokens_seen": 69311264, + "step": 4308 + }, + { + "epoch": 0.3018378908473211, + "grad_norm": 5.248560428619385, + "learning_rate": 6.984454640980736e-05, + "loss": 1.1127, + "num_input_tokens_seen": 69327648, + "step": 4309 + }, + { + "epoch": 0.30190793909305036, + "grad_norm": 3.601381540298462, + "learning_rate": 6.983754816112085e-05, + "loss": 1.0002, + "num_input_tokens_seen": 69344032, + "step": 4310 + }, + { + "epoch": 0.30197798733877956, + "grad_norm": 4.555902004241943, + "learning_rate": 6.983054991243433e-05, + "loss": 1.0674, + "num_input_tokens_seen": 69360416, + "step": 4311 + }, + { + "epoch": 0.3020480355845088, + "grad_norm": 4.615258693695068, + "learning_rate": 6.982355166374782e-05, + "loss": 1.1759, + "num_input_tokens_seen": 69375728, + "step": 4312 + }, + { + "epoch": 0.3021180838302381, + "grad_norm": 5.953250408172607, + "learning_rate": 6.98165534150613e-05, + "loss": 1.1161, + "num_input_tokens_seen": 69391768, + "step": 4313 + }, + { + "epoch": 0.30218813207596734, + "grad_norm": 4.049426555633545, + "learning_rate": 6.980955516637478e-05, + "loss": 1.0466, + "num_input_tokens_seen": 69407328, + "step": 4314 + }, + { + "epoch": 0.30225818032169655, + "grad_norm": 4.012260437011719, + "learning_rate": 6.980255691768827e-05, + "loss": 1.338, + "num_input_tokens_seen": 69423712, + "step": 4315 + }, + { + "epoch": 0.3023282285674258, + "grad_norm": 3.8932242393493652, + "learning_rate": 6.979555866900176e-05, + "loss": 1.0836, + "num_input_tokens_seen": 69440096, + "step": 4316 + }, + { + "epoch": 0.30239827681315506, + "grad_norm": 7.58411169052124, + "learning_rate": 6.978856042031525e-05, + "loss": 1.0849, + "num_input_tokens_seen": 69456088, + "step": 4317 + }, + { + "epoch": 0.3024683250588843, + "grad_norm": 5.275664806365967, + "learning_rate": 6.978156217162872e-05, + "loss": 0.9773, + "num_input_tokens_seen": 69471768, + "step": 4318 + }, + { + "epoch": 0.30253837330461353, + "grad_norm": 3.6384737491607666, + "learning_rate": 6.977456392294221e-05, + "loss": 1.1168, + "num_input_tokens_seen": 69488152, + "step": 4319 + }, + { + "epoch": 0.3026084215503428, + "grad_norm": 5.059805870056152, + "learning_rate": 6.97675656742557e-05, + "loss": 1.1221, + "num_input_tokens_seen": 69504536, + "step": 4320 + }, + { + "epoch": 0.30267846979607205, + "grad_norm": 5.672605037689209, + "learning_rate": 6.976056742556917e-05, + "loss": 0.8506, + "num_input_tokens_seen": 69520920, + "step": 4321 + }, + { + "epoch": 0.3027485180418013, + "grad_norm": 3.5066421031951904, + "learning_rate": 6.975356917688266e-05, + "loss": 1.1437, + "num_input_tokens_seen": 69537304, + "step": 4322 + }, + { + "epoch": 0.3028185662875305, + "grad_norm": 4.403011798858643, + "learning_rate": 6.974657092819616e-05, + "loss": 1.0946, + "num_input_tokens_seen": 69553688, + "step": 4323 + }, + { + "epoch": 0.30288861453325977, + "grad_norm": 3.87226939201355, + "learning_rate": 6.973957267950964e-05, + "loss": 0.9997, + "num_input_tokens_seen": 69570072, + "step": 4324 + }, + { + "epoch": 0.30295866277898903, + "grad_norm": 4.516434192657471, + "learning_rate": 6.973257443082311e-05, + "loss": 1.0816, + "num_input_tokens_seen": 69585056, + "step": 4325 + }, + { + "epoch": 0.3030287110247183, + "grad_norm": 4.07093620300293, + "learning_rate": 6.97255761821366e-05, + "loss": 1.1811, + "num_input_tokens_seen": 69601440, + "step": 4326 + }, + { + "epoch": 0.3030987592704475, + "grad_norm": 3.663632392883301, + "learning_rate": 6.971857793345009e-05, + "loss": 1.0103, + "num_input_tokens_seen": 69617824, + "step": 4327 + }, + { + "epoch": 0.30316880751617675, + "grad_norm": 3.791191577911377, + "learning_rate": 6.971157968476357e-05, + "loss": 1.1402, + "num_input_tokens_seen": 69634208, + "step": 4328 + }, + { + "epoch": 0.303238855761906, + "grad_norm": 4.766335964202881, + "learning_rate": 6.970458143607707e-05, + "loss": 1.057, + "num_input_tokens_seen": 69650592, + "step": 4329 + }, + { + "epoch": 0.30330890400763527, + "grad_norm": 3.6603240966796875, + "learning_rate": 6.969758318739056e-05, + "loss": 1.0052, + "num_input_tokens_seen": 69666976, + "step": 4330 + }, + { + "epoch": 0.30337895225336453, + "grad_norm": 4.231273174285889, + "learning_rate": 6.969058493870403e-05, + "loss": 1.1661, + "num_input_tokens_seen": 69683360, + "step": 4331 + }, + { + "epoch": 0.30344900049909374, + "grad_norm": 3.7526698112487793, + "learning_rate": 6.968358669001751e-05, + "loss": 1.0783, + "num_input_tokens_seen": 69699744, + "step": 4332 + }, + { + "epoch": 0.303519048744823, + "grad_norm": 3.8541617393493652, + "learning_rate": 6.967658844133101e-05, + "loss": 1.0485, + "num_input_tokens_seen": 69715960, + "step": 4333 + }, + { + "epoch": 0.30358909699055225, + "grad_norm": 3.914926767349243, + "learning_rate": 6.966959019264448e-05, + "loss": 0.9525, + "num_input_tokens_seen": 69732344, + "step": 4334 + }, + { + "epoch": 0.3036591452362815, + "grad_norm": 4.39329719543457, + "learning_rate": 6.966259194395797e-05, + "loss": 1.0234, + "num_input_tokens_seen": 69748728, + "step": 4335 + }, + { + "epoch": 0.3037291934820107, + "grad_norm": 3.914006233215332, + "learning_rate": 6.965559369527146e-05, + "loss": 1.1397, + "num_input_tokens_seen": 69765112, + "step": 4336 + }, + { + "epoch": 0.30379924172774, + "grad_norm": 4.536770343780518, + "learning_rate": 6.964859544658495e-05, + "loss": 1.1825, + "num_input_tokens_seen": 69781496, + "step": 4337 + }, + { + "epoch": 0.30386928997346924, + "grad_norm": 4.147655010223389, + "learning_rate": 6.964159719789843e-05, + "loss": 0.7535, + "num_input_tokens_seen": 69797880, + "step": 4338 + }, + { + "epoch": 0.3039393382191985, + "grad_norm": 4.224967956542969, + "learning_rate": 6.963459894921191e-05, + "loss": 1.1842, + "num_input_tokens_seen": 69814264, + "step": 4339 + }, + { + "epoch": 0.3040093864649277, + "grad_norm": 4.415369033813477, + "learning_rate": 6.96276007005254e-05, + "loss": 1.1396, + "num_input_tokens_seen": 69830648, + "step": 4340 + }, + { + "epoch": 0.30407943471065696, + "grad_norm": 3.5865182876586914, + "learning_rate": 6.962060245183888e-05, + "loss": 1.1649, + "num_input_tokens_seen": 69846608, + "step": 4341 + }, + { + "epoch": 0.3041494829563862, + "grad_norm": 6.16670560836792, + "learning_rate": 6.961360420315237e-05, + "loss": 1.0684, + "num_input_tokens_seen": 69862232, + "step": 4342 + }, + { + "epoch": 0.3042195312021155, + "grad_norm": 7.907288074493408, + "learning_rate": 6.960660595446586e-05, + "loss": 0.9661, + "num_input_tokens_seen": 69878616, + "step": 4343 + }, + { + "epoch": 0.3042895794478447, + "grad_norm": 3.7910618782043457, + "learning_rate": 6.959960770577934e-05, + "loss": 1.0852, + "num_input_tokens_seen": 69895000, + "step": 4344 + }, + { + "epoch": 0.30435962769357394, + "grad_norm": 3.4832661151885986, + "learning_rate": 6.959260945709282e-05, + "loss": 1.0312, + "num_input_tokens_seen": 69911384, + "step": 4345 + }, + { + "epoch": 0.3044296759393032, + "grad_norm": 3.563248872756958, + "learning_rate": 6.958561120840631e-05, + "loss": 1.1249, + "num_input_tokens_seen": 69927768, + "step": 4346 + }, + { + "epoch": 0.30449972418503246, + "grad_norm": 4.838014602661133, + "learning_rate": 6.95786129597198e-05, + "loss": 1.2449, + "num_input_tokens_seen": 69944152, + "step": 4347 + }, + { + "epoch": 0.30456977243076166, + "grad_norm": 3.6796975135803223, + "learning_rate": 6.957161471103327e-05, + "loss": 0.8156, + "num_input_tokens_seen": 69959968, + "step": 4348 + }, + { + "epoch": 0.3046398206764909, + "grad_norm": 4.028040885925293, + "learning_rate": 6.956461646234677e-05, + "loss": 1.0653, + "num_input_tokens_seen": 69975960, + "step": 4349 + }, + { + "epoch": 0.3047098689222202, + "grad_norm": 4.073189735412598, + "learning_rate": 6.955761821366026e-05, + "loss": 1.0004, + "num_input_tokens_seen": 69991656, + "step": 4350 + }, + { + "epoch": 0.30477991716794944, + "grad_norm": 5.757152080535889, + "learning_rate": 6.955061996497374e-05, + "loss": 1.2074, + "num_input_tokens_seen": 70008040, + "step": 4351 + }, + { + "epoch": 0.30484996541367865, + "grad_norm": 5.49181604385376, + "learning_rate": 6.954362171628721e-05, + "loss": 1.0235, + "num_input_tokens_seen": 70024424, + "step": 4352 + }, + { + "epoch": 0.3049200136594079, + "grad_norm": 5.573401927947998, + "learning_rate": 6.95366234676007e-05, + "loss": 0.9787, + "num_input_tokens_seen": 70040808, + "step": 4353 + }, + { + "epoch": 0.30499006190513717, + "grad_norm": 3.491823673248291, + "learning_rate": 6.952962521891419e-05, + "loss": 1.0254, + "num_input_tokens_seen": 70057192, + "step": 4354 + }, + { + "epoch": 0.3050601101508664, + "grad_norm": 6.05043888092041, + "learning_rate": 6.952262697022768e-05, + "loss": 1.0709, + "num_input_tokens_seen": 70073576, + "step": 4355 + }, + { + "epoch": 0.30513015839659563, + "grad_norm": 3.848910331726074, + "learning_rate": 6.951562872154117e-05, + "loss": 1.0267, + "num_input_tokens_seen": 70089960, + "step": 4356 + }, + { + "epoch": 0.3052002066423249, + "grad_norm": 4.134339332580566, + "learning_rate": 6.950863047285465e-05, + "loss": 1.2447, + "num_input_tokens_seen": 70106344, + "step": 4357 + }, + { + "epoch": 0.30527025488805415, + "grad_norm": 3.6560862064361572, + "learning_rate": 6.950163222416813e-05, + "loss": 1.1018, + "num_input_tokens_seen": 70122056, + "step": 4358 + }, + { + "epoch": 0.3053403031337834, + "grad_norm": 3.813434600830078, + "learning_rate": 6.94946339754816e-05, + "loss": 1.0149, + "num_input_tokens_seen": 70138408, + "step": 4359 + }, + { + "epoch": 0.3054103513795126, + "grad_norm": 5.002225875854492, + "learning_rate": 6.948763572679511e-05, + "loss": 1.1563, + "num_input_tokens_seen": 70154792, + "step": 4360 + }, + { + "epoch": 0.30548039962524187, + "grad_norm": 3.8483340740203857, + "learning_rate": 6.948063747810858e-05, + "loss": 0.9643, + "num_input_tokens_seen": 70171176, + "step": 4361 + }, + { + "epoch": 0.30555044787097113, + "grad_norm": 5.18534517288208, + "learning_rate": 6.947363922942207e-05, + "loss": 1.1841, + "num_input_tokens_seen": 70187336, + "step": 4362 + }, + { + "epoch": 0.3056204961167004, + "grad_norm": 3.92976713180542, + "learning_rate": 6.946664098073556e-05, + "loss": 1.0051, + "num_input_tokens_seen": 70203720, + "step": 4363 + }, + { + "epoch": 0.30569054436242965, + "grad_norm": 3.4534151554107666, + "learning_rate": 6.945964273204905e-05, + "loss": 0.9356, + "num_input_tokens_seen": 70220104, + "step": 4364 + }, + { + "epoch": 0.30576059260815885, + "grad_norm": 3.7937867641448975, + "learning_rate": 6.945264448336252e-05, + "loss": 1.1694, + "num_input_tokens_seen": 70236488, + "step": 4365 + }, + { + "epoch": 0.3058306408538881, + "grad_norm": 3.9063713550567627, + "learning_rate": 6.944564623467601e-05, + "loss": 1.0969, + "num_input_tokens_seen": 70252872, + "step": 4366 + }, + { + "epoch": 0.3059006890996174, + "grad_norm": 3.9363296031951904, + "learning_rate": 6.94386479859895e-05, + "loss": 0.9776, + "num_input_tokens_seen": 70269256, + "step": 4367 + }, + { + "epoch": 0.30597073734534663, + "grad_norm": 4.722838401794434, + "learning_rate": 6.943164973730297e-05, + "loss": 0.9503, + "num_input_tokens_seen": 70285640, + "step": 4368 + }, + { + "epoch": 0.30604078559107584, + "grad_norm": 4.053229808807373, + "learning_rate": 6.942465148861646e-05, + "loss": 1.2669, + "num_input_tokens_seen": 70301688, + "step": 4369 + }, + { + "epoch": 0.3061108338368051, + "grad_norm": 3.71604323387146, + "learning_rate": 6.941765323992995e-05, + "loss": 1.0619, + "num_input_tokens_seen": 70318072, + "step": 4370 + }, + { + "epoch": 0.30618088208253436, + "grad_norm": 3.8376901149749756, + "learning_rate": 6.941065499124344e-05, + "loss": 1.0007, + "num_input_tokens_seen": 70334456, + "step": 4371 + }, + { + "epoch": 0.3062509303282636, + "grad_norm": 4.157979488372803, + "learning_rate": 6.940365674255692e-05, + "loss": 1.2379, + "num_input_tokens_seen": 70350424, + "step": 4372 + }, + { + "epoch": 0.3063209785739928, + "grad_norm": 4.173924922943115, + "learning_rate": 6.93966584938704e-05, + "loss": 1.1111, + "num_input_tokens_seen": 70366808, + "step": 4373 + }, + { + "epoch": 0.3063910268197221, + "grad_norm": 4.114030838012695, + "learning_rate": 6.938966024518389e-05, + "loss": 1.0932, + "num_input_tokens_seen": 70383000, + "step": 4374 + }, + { + "epoch": 0.30646107506545134, + "grad_norm": 4.31168794631958, + "learning_rate": 6.938266199649738e-05, + "loss": 1.243, + "num_input_tokens_seen": 70399016, + "step": 4375 + }, + { + "epoch": 0.3065311233111806, + "grad_norm": 6.187852382659912, + "learning_rate": 6.937566374781087e-05, + "loss": 0.9274, + "num_input_tokens_seen": 70413016, + "step": 4376 + }, + { + "epoch": 0.3066011715569098, + "grad_norm": 4.700244903564453, + "learning_rate": 6.936866549912436e-05, + "loss": 1.0809, + "num_input_tokens_seen": 70427552, + "step": 4377 + }, + { + "epoch": 0.30667121980263906, + "grad_norm": 4.941024303436279, + "learning_rate": 6.936166725043783e-05, + "loss": 1.1653, + "num_input_tokens_seen": 70443936, + "step": 4378 + }, + { + "epoch": 0.3067412680483683, + "grad_norm": 3.8171792030334473, + "learning_rate": 6.935466900175131e-05, + "loss": 1.1128, + "num_input_tokens_seen": 70460320, + "step": 4379 + }, + { + "epoch": 0.3068113162940976, + "grad_norm": 5.006760597229004, + "learning_rate": 6.93476707530648e-05, + "loss": 1.1674, + "num_input_tokens_seen": 70476704, + "step": 4380 + }, + { + "epoch": 0.3068813645398268, + "grad_norm": 3.8567628860473633, + "learning_rate": 6.934067250437829e-05, + "loss": 1.1478, + "num_input_tokens_seen": 70493016, + "step": 4381 + }, + { + "epoch": 0.30695141278555604, + "grad_norm": 3.7168126106262207, + "learning_rate": 6.933367425569177e-05, + "loss": 0.9496, + "num_input_tokens_seen": 70509400, + "step": 4382 + }, + { + "epoch": 0.3070214610312853, + "grad_norm": 4.72265625, + "learning_rate": 6.932667600700526e-05, + "loss": 1.0319, + "num_input_tokens_seen": 70525592, + "step": 4383 + }, + { + "epoch": 0.30709150927701456, + "grad_norm": 4.502997875213623, + "learning_rate": 6.931967775831875e-05, + "loss": 1.0556, + "num_input_tokens_seen": 70541976, + "step": 4384 + }, + { + "epoch": 0.30716155752274377, + "grad_norm": 4.090621471405029, + "learning_rate": 6.931267950963223e-05, + "loss": 1.1441, + "num_input_tokens_seen": 70558360, + "step": 4385 + }, + { + "epoch": 0.307231605768473, + "grad_norm": 3.501185655593872, + "learning_rate": 6.93056812609457e-05, + "loss": 0.9005, + "num_input_tokens_seen": 70574640, + "step": 4386 + }, + { + "epoch": 0.3073016540142023, + "grad_norm": 3.937352180480957, + "learning_rate": 6.92986830122592e-05, + "loss": 1.1003, + "num_input_tokens_seen": 70591024, + "step": 4387 + }, + { + "epoch": 0.30737170225993155, + "grad_norm": 8.832700729370117, + "learning_rate": 6.929168476357268e-05, + "loss": 0.9773, + "num_input_tokens_seen": 70606720, + "step": 4388 + }, + { + "epoch": 0.30744175050566075, + "grad_norm": 3.8081719875335693, + "learning_rate": 6.928468651488617e-05, + "loss": 0.8348, + "num_input_tokens_seen": 70622864, + "step": 4389 + }, + { + "epoch": 0.30751179875139, + "grad_norm": 3.836366653442383, + "learning_rate": 6.927768826619966e-05, + "loss": 1.0858, + "num_input_tokens_seen": 70639248, + "step": 4390 + }, + { + "epoch": 0.30758184699711927, + "grad_norm": 5.150767803192139, + "learning_rate": 6.927069001751314e-05, + "loss": 1.0395, + "num_input_tokens_seen": 70655128, + "step": 4391 + }, + { + "epoch": 0.3076518952428485, + "grad_norm": 5.0762434005737305, + "learning_rate": 6.926369176882662e-05, + "loss": 0.9503, + "num_input_tokens_seen": 70671512, + "step": 4392 + }, + { + "epoch": 0.30772194348857773, + "grad_norm": 3.7713098526000977, + "learning_rate": 6.925669352014011e-05, + "loss": 0.9273, + "num_input_tokens_seen": 70687896, + "step": 4393 + }, + { + "epoch": 0.307791991734307, + "grad_norm": 5.246247291564941, + "learning_rate": 6.92496952714536e-05, + "loss": 1.1295, + "num_input_tokens_seen": 70704280, + "step": 4394 + }, + { + "epoch": 0.30786203998003625, + "grad_norm": 3.5723984241485596, + "learning_rate": 6.924269702276707e-05, + "loss": 0.9063, + "num_input_tokens_seen": 70720664, + "step": 4395 + }, + { + "epoch": 0.3079320882257655, + "grad_norm": 3.5165982246398926, + "learning_rate": 6.923569877408056e-05, + "loss": 0.963, + "num_input_tokens_seen": 70736968, + "step": 4396 + }, + { + "epoch": 0.30800213647149477, + "grad_norm": 4.140204429626465, + "learning_rate": 6.922870052539405e-05, + "loss": 0.9557, + "num_input_tokens_seen": 70753352, + "step": 4397 + }, + { + "epoch": 0.308072184717224, + "grad_norm": 7.949122428894043, + "learning_rate": 6.922170227670754e-05, + "loss": 0.9429, + "num_input_tokens_seen": 70769720, + "step": 4398 + }, + { + "epoch": 0.30814223296295323, + "grad_norm": 6.45367431640625, + "learning_rate": 6.921470402802101e-05, + "loss": 1.2214, + "num_input_tokens_seen": 70784984, + "step": 4399 + }, + { + "epoch": 0.3082122812086825, + "grad_norm": 4.139477252960205, + "learning_rate": 6.92077057793345e-05, + "loss": 1.2376, + "num_input_tokens_seen": 70800376, + "step": 4400 + }, + { + "epoch": 0.3082122812086825, + "eval_loss": 1.1308872699737549, + "eval_runtime": 0.2076, + "eval_samples_per_second": 4.818, + "eval_steps_per_second": 4.818, + "num_input_tokens_seen": 70800376, + "step": 4400 + }, + { + "epoch": 0.30828232945441175, + "grad_norm": 4.095129013061523, + "learning_rate": 6.920070753064799e-05, + "loss": 1.1774, + "num_input_tokens_seen": 70816560, + "step": 4401 + }, + { + "epoch": 0.30835237770014096, + "grad_norm": 3.6730854511260986, + "learning_rate": 6.919370928196148e-05, + "loss": 1.0242, + "num_input_tokens_seen": 70831848, + "step": 4402 + }, + { + "epoch": 0.3084224259458702, + "grad_norm": 4.013517379760742, + "learning_rate": 6.918671103327497e-05, + "loss": 0.9785, + "num_input_tokens_seen": 70847408, + "step": 4403 + }, + { + "epoch": 0.3084924741915995, + "grad_norm": 5.617120742797852, + "learning_rate": 6.917971278458846e-05, + "loss": 0.9883, + "num_input_tokens_seen": 70862080, + "step": 4404 + }, + { + "epoch": 0.30856252243732873, + "grad_norm": 3.5201385021209717, + "learning_rate": 6.917271453590193e-05, + "loss": 0.9537, + "num_input_tokens_seen": 70878464, + "step": 4405 + }, + { + "epoch": 0.30863257068305794, + "grad_norm": 5.116230010986328, + "learning_rate": 6.91657162872154e-05, + "loss": 1.0934, + "num_input_tokens_seen": 70894848, + "step": 4406 + }, + { + "epoch": 0.3087026189287872, + "grad_norm": 3.4510743618011475, + "learning_rate": 6.91587180385289e-05, + "loss": 1.0857, + "num_input_tokens_seen": 70911232, + "step": 4407 + }, + { + "epoch": 0.30877266717451646, + "grad_norm": 4.719654083251953, + "learning_rate": 6.915171978984238e-05, + "loss": 1.1565, + "num_input_tokens_seen": 70927616, + "step": 4408 + }, + { + "epoch": 0.3088427154202457, + "grad_norm": 4.52898645401001, + "learning_rate": 6.914472154115587e-05, + "loss": 0.9418, + "num_input_tokens_seen": 70944000, + "step": 4409 + }, + { + "epoch": 0.3089127636659749, + "grad_norm": 4.237354755401611, + "learning_rate": 6.913772329246936e-05, + "loss": 1.1614, + "num_input_tokens_seen": 70960384, + "step": 4410 + }, + { + "epoch": 0.3089828119117042, + "grad_norm": 5.489138126373291, + "learning_rate": 6.913072504378285e-05, + "loss": 0.9871, + "num_input_tokens_seen": 70976768, + "step": 4411 + }, + { + "epoch": 0.30905286015743344, + "grad_norm": 5.482370853424072, + "learning_rate": 6.912372679509632e-05, + "loss": 0.9962, + "num_input_tokens_seen": 70992496, + "step": 4412 + }, + { + "epoch": 0.3091229084031627, + "grad_norm": 3.8174126148223877, + "learning_rate": 6.91167285464098e-05, + "loss": 1.0605, + "num_input_tokens_seen": 71008880, + "step": 4413 + }, + { + "epoch": 0.3091929566488919, + "grad_norm": 4.064924716949463, + "learning_rate": 6.91097302977233e-05, + "loss": 0.8307, + "num_input_tokens_seen": 71023912, + "step": 4414 + }, + { + "epoch": 0.30926300489462116, + "grad_norm": 3.955643653869629, + "learning_rate": 6.910273204903678e-05, + "loss": 1.2599, + "num_input_tokens_seen": 71040296, + "step": 4415 + }, + { + "epoch": 0.3093330531403504, + "grad_norm": 3.771191358566284, + "learning_rate": 6.909573380035026e-05, + "loss": 1.0682, + "num_input_tokens_seen": 71056680, + "step": 4416 + }, + { + "epoch": 0.3094031013860797, + "grad_norm": 5.4105963706970215, + "learning_rate": 6.908873555166375e-05, + "loss": 1.1571, + "num_input_tokens_seen": 71072640, + "step": 4417 + }, + { + "epoch": 0.3094731496318089, + "grad_norm": 4.549078464508057, + "learning_rate": 6.908173730297724e-05, + "loss": 1.0837, + "num_input_tokens_seen": 71087336, + "step": 4418 + }, + { + "epoch": 0.30954319787753815, + "grad_norm": 3.998065233230591, + "learning_rate": 6.907473905429072e-05, + "loss": 1.2753, + "num_input_tokens_seen": 71102952, + "step": 4419 + }, + { + "epoch": 0.3096132461232674, + "grad_norm": 3.834508180618286, + "learning_rate": 6.90677408056042e-05, + "loss": 0.8886, + "num_input_tokens_seen": 71119328, + "step": 4420 + }, + { + "epoch": 0.30968329436899666, + "grad_norm": 3.932875156402588, + "learning_rate": 6.90607425569177e-05, + "loss": 1.1568, + "num_input_tokens_seen": 71134968, + "step": 4421 + }, + { + "epoch": 0.30975334261472587, + "grad_norm": 3.712484359741211, + "learning_rate": 6.905374430823118e-05, + "loss": 1.0686, + "num_input_tokens_seen": 71150976, + "step": 4422 + }, + { + "epoch": 0.30982339086045513, + "grad_norm": 3.6733663082122803, + "learning_rate": 6.904674605954466e-05, + "loss": 0.8884, + "num_input_tokens_seen": 71167232, + "step": 4423 + }, + { + "epoch": 0.3098934391061844, + "grad_norm": 3.9877066612243652, + "learning_rate": 6.903974781085815e-05, + "loss": 1.0473, + "num_input_tokens_seen": 71182704, + "step": 4424 + }, + { + "epoch": 0.30996348735191365, + "grad_norm": 3.908582925796509, + "learning_rate": 6.903274956217163e-05, + "loss": 0.9944, + "num_input_tokens_seen": 71198920, + "step": 4425 + }, + { + "epoch": 0.31003353559764285, + "grad_norm": 4.310460090637207, + "learning_rate": 6.902575131348511e-05, + "loss": 0.9651, + "num_input_tokens_seen": 71215256, + "step": 4426 + }, + { + "epoch": 0.3101035838433721, + "grad_norm": 3.8914272785186768, + "learning_rate": 6.90187530647986e-05, + "loss": 0.9858, + "num_input_tokens_seen": 71231432, + "step": 4427 + }, + { + "epoch": 0.31017363208910137, + "grad_norm": 5.774794578552246, + "learning_rate": 6.901175481611209e-05, + "loss": 0.9792, + "num_input_tokens_seen": 71246944, + "step": 4428 + }, + { + "epoch": 0.31024368033483063, + "grad_norm": 6.370543956756592, + "learning_rate": 6.900475656742558e-05, + "loss": 1.0283, + "num_input_tokens_seen": 71263120, + "step": 4429 + }, + { + "epoch": 0.31031372858055983, + "grad_norm": 3.8334455490112305, + "learning_rate": 6.899775831873906e-05, + "loss": 1.0173, + "num_input_tokens_seen": 71279040, + "step": 4430 + }, + { + "epoch": 0.3103837768262891, + "grad_norm": 3.624006509780884, + "learning_rate": 6.899076007005255e-05, + "loss": 0.9908, + "num_input_tokens_seen": 71295424, + "step": 4431 + }, + { + "epoch": 0.31045382507201835, + "grad_norm": 3.8340702056884766, + "learning_rate": 6.898376182136603e-05, + "loss": 1.1257, + "num_input_tokens_seen": 71311808, + "step": 4432 + }, + { + "epoch": 0.3105238733177476, + "grad_norm": 4.4179277420043945, + "learning_rate": 6.89767635726795e-05, + "loss": 1.0439, + "num_input_tokens_seen": 71327560, + "step": 4433 + }, + { + "epoch": 0.31059392156347687, + "grad_norm": 5.758373260498047, + "learning_rate": 6.896976532399299e-05, + "loss": 0.9307, + "num_input_tokens_seen": 71342848, + "step": 4434 + }, + { + "epoch": 0.3106639698092061, + "grad_norm": 3.7063519954681396, + "learning_rate": 6.896276707530648e-05, + "loss": 1.1769, + "num_input_tokens_seen": 71359232, + "step": 4435 + }, + { + "epoch": 0.31073401805493533, + "grad_norm": 4.19386625289917, + "learning_rate": 6.895576882661997e-05, + "loss": 1.1185, + "num_input_tokens_seen": 71375616, + "step": 4436 + }, + { + "epoch": 0.3108040663006646, + "grad_norm": 4.116868019104004, + "learning_rate": 6.894877057793346e-05, + "loss": 0.9571, + "num_input_tokens_seen": 71392000, + "step": 4437 + }, + { + "epoch": 0.31087411454639385, + "grad_norm": 4.810275077819824, + "learning_rate": 6.894177232924695e-05, + "loss": 1.113, + "num_input_tokens_seen": 71407448, + "step": 4438 + }, + { + "epoch": 0.31094416279212306, + "grad_norm": 4.026486873626709, + "learning_rate": 6.893477408056042e-05, + "loss": 0.9949, + "num_input_tokens_seen": 71423832, + "step": 4439 + }, + { + "epoch": 0.3110142110378523, + "grad_norm": 4.268560886383057, + "learning_rate": 6.89277758318739e-05, + "loss": 1.0746, + "num_input_tokens_seen": 71440216, + "step": 4440 + }, + { + "epoch": 0.3110842592835816, + "grad_norm": 3.3299612998962402, + "learning_rate": 6.89207775831874e-05, + "loss": 0.7408, + "num_input_tokens_seen": 71456160, + "step": 4441 + }, + { + "epoch": 0.31115430752931084, + "grad_norm": 3.678912401199341, + "learning_rate": 6.891377933450089e-05, + "loss": 0.9508, + "num_input_tokens_seen": 71472344, + "step": 4442 + }, + { + "epoch": 0.31122435577504004, + "grad_norm": 3.3206088542938232, + "learning_rate": 6.890678108581436e-05, + "loss": 0.9531, + "num_input_tokens_seen": 71488728, + "step": 4443 + }, + { + "epoch": 0.3112944040207693, + "grad_norm": 3.6073081493377686, + "learning_rate": 6.889978283712785e-05, + "loss": 1.18, + "num_input_tokens_seen": 71505112, + "step": 4444 + }, + { + "epoch": 0.31136445226649856, + "grad_norm": 4.998234748840332, + "learning_rate": 6.889278458844134e-05, + "loss": 1.0307, + "num_input_tokens_seen": 71521496, + "step": 4445 + }, + { + "epoch": 0.3114345005122278, + "grad_norm": 3.7966136932373047, + "learning_rate": 6.888578633975481e-05, + "loss": 1.05, + "num_input_tokens_seen": 71537880, + "step": 4446 + }, + { + "epoch": 0.311504548757957, + "grad_norm": 3.7041022777557373, + "learning_rate": 6.88787880910683e-05, + "loss": 1.0701, + "num_input_tokens_seen": 71554136, + "step": 4447 + }, + { + "epoch": 0.3115745970036863, + "grad_norm": 4.155350208282471, + "learning_rate": 6.887178984238179e-05, + "loss": 1.0144, + "num_input_tokens_seen": 71570520, + "step": 4448 + }, + { + "epoch": 0.31164464524941554, + "grad_norm": 3.608290195465088, + "learning_rate": 6.886479159369528e-05, + "loss": 1.0224, + "num_input_tokens_seen": 71586904, + "step": 4449 + }, + { + "epoch": 0.3117146934951448, + "grad_norm": 5.258309841156006, + "learning_rate": 6.885779334500875e-05, + "loss": 1.0929, + "num_input_tokens_seen": 71602280, + "step": 4450 + }, + { + "epoch": 0.311784741740874, + "grad_norm": 4.176782608032227, + "learning_rate": 6.885079509632224e-05, + "loss": 1.0301, + "num_input_tokens_seen": 71618248, + "step": 4451 + }, + { + "epoch": 0.31185478998660326, + "grad_norm": 3.219015121459961, + "learning_rate": 6.884379684763573e-05, + "loss": 0.8699, + "num_input_tokens_seen": 71634632, + "step": 4452 + }, + { + "epoch": 0.3119248382323325, + "grad_norm": 3.485370397567749, + "learning_rate": 6.883679859894921e-05, + "loss": 0.947, + "num_input_tokens_seen": 71651016, + "step": 4453 + }, + { + "epoch": 0.3119948864780618, + "grad_norm": 4.25452184677124, + "learning_rate": 6.88298003502627e-05, + "loss": 1.1649, + "num_input_tokens_seen": 71667400, + "step": 4454 + }, + { + "epoch": 0.312064934723791, + "grad_norm": 4.2082133293151855, + "learning_rate": 6.882280210157618e-05, + "loss": 1.0457, + "num_input_tokens_seen": 71682656, + "step": 4455 + }, + { + "epoch": 0.31213498296952025, + "grad_norm": 3.366642475128174, + "learning_rate": 6.881580385288967e-05, + "loss": 0.8594, + "num_input_tokens_seen": 71699040, + "step": 4456 + }, + { + "epoch": 0.3122050312152495, + "grad_norm": 3.795114278793335, + "learning_rate": 6.880880560420316e-05, + "loss": 1.0749, + "num_input_tokens_seen": 71715424, + "step": 4457 + }, + { + "epoch": 0.31227507946097877, + "grad_norm": 7.3179121017456055, + "learning_rate": 6.880180735551665e-05, + "loss": 1.2978, + "num_input_tokens_seen": 71731808, + "step": 4458 + }, + { + "epoch": 0.31234512770670797, + "grad_norm": 5.0151848793029785, + "learning_rate": 6.879480910683012e-05, + "loss": 1.1359, + "num_input_tokens_seen": 71746688, + "step": 4459 + }, + { + "epoch": 0.31241517595243723, + "grad_norm": 4.136596202850342, + "learning_rate": 6.87878108581436e-05, + "loss": 1.141, + "num_input_tokens_seen": 71763016, + "step": 4460 + }, + { + "epoch": 0.3124852241981665, + "grad_norm": 3.6476573944091797, + "learning_rate": 6.878081260945709e-05, + "loss": 0.9967, + "num_input_tokens_seen": 71779400, + "step": 4461 + }, + { + "epoch": 0.31255527244389575, + "grad_norm": 4.907565593719482, + "learning_rate": 6.877381436077059e-05, + "loss": 1.0391, + "num_input_tokens_seen": 71795784, + "step": 4462 + }, + { + "epoch": 0.31262532068962495, + "grad_norm": 3.82183575630188, + "learning_rate": 6.876681611208407e-05, + "loss": 1.2237, + "num_input_tokens_seen": 71812168, + "step": 4463 + }, + { + "epoch": 0.3126953689353542, + "grad_norm": 4.63422966003418, + "learning_rate": 6.875981786339755e-05, + "loss": 1.1271, + "num_input_tokens_seen": 71828296, + "step": 4464 + }, + { + "epoch": 0.31276541718108347, + "grad_norm": 4.02967643737793, + "learning_rate": 6.875281961471104e-05, + "loss": 1.105, + "num_input_tokens_seen": 71844096, + "step": 4465 + }, + { + "epoch": 0.31283546542681273, + "grad_norm": 3.477452516555786, + "learning_rate": 6.874582136602452e-05, + "loss": 1.0503, + "num_input_tokens_seen": 71860480, + "step": 4466 + }, + { + "epoch": 0.312905513672542, + "grad_norm": 4.4327168464660645, + "learning_rate": 6.873882311733799e-05, + "loss": 1.2305, + "num_input_tokens_seen": 71876208, + "step": 4467 + }, + { + "epoch": 0.3129755619182712, + "grad_norm": 3.8214218616485596, + "learning_rate": 6.87318248686515e-05, + "loss": 1.0271, + "num_input_tokens_seen": 71892592, + "step": 4468 + }, + { + "epoch": 0.31304561016400045, + "grad_norm": 3.4210402965545654, + "learning_rate": 6.872482661996498e-05, + "loss": 0.8505, + "num_input_tokens_seen": 71908976, + "step": 4469 + }, + { + "epoch": 0.3131156584097297, + "grad_norm": 3.567034959793091, + "learning_rate": 6.871782837127846e-05, + "loss": 0.7866, + "num_input_tokens_seen": 71925200, + "step": 4470 + }, + { + "epoch": 0.313185706655459, + "grad_norm": 4.694231986999512, + "learning_rate": 6.871083012259195e-05, + "loss": 0.9634, + "num_input_tokens_seen": 71941584, + "step": 4471 + }, + { + "epoch": 0.3132557549011882, + "grad_norm": 5.802227973937988, + "learning_rate": 6.870383187390544e-05, + "loss": 1.1923, + "num_input_tokens_seen": 71957968, + "step": 4472 + }, + { + "epoch": 0.31332580314691744, + "grad_norm": 4.238499641418457, + "learning_rate": 6.869683362521891e-05, + "loss": 1.3381, + "num_input_tokens_seen": 71973376, + "step": 4473 + }, + { + "epoch": 0.3133958513926467, + "grad_norm": 4.2250213623046875, + "learning_rate": 6.86898353765324e-05, + "loss": 1.0959, + "num_input_tokens_seen": 71988560, + "step": 4474 + }, + { + "epoch": 0.31346589963837596, + "grad_norm": 4.052889823913574, + "learning_rate": 6.868283712784589e-05, + "loss": 1.1711, + "num_input_tokens_seen": 72004232, + "step": 4475 + }, + { + "epoch": 0.31353594788410516, + "grad_norm": 3.694481134414673, + "learning_rate": 6.867583887915938e-05, + "loss": 1.187, + "num_input_tokens_seen": 72020616, + "step": 4476 + }, + { + "epoch": 0.3136059961298344, + "grad_norm": 4.2295074462890625, + "learning_rate": 6.866884063047285e-05, + "loss": 1.2454, + "num_input_tokens_seen": 72036912, + "step": 4477 + }, + { + "epoch": 0.3136760443755637, + "grad_norm": 3.9813766479492188, + "learning_rate": 6.866184238178634e-05, + "loss": 1.1627, + "num_input_tokens_seen": 72053296, + "step": 4478 + }, + { + "epoch": 0.31374609262129294, + "grad_norm": 4.473883152008057, + "learning_rate": 6.865484413309983e-05, + "loss": 1.0522, + "num_input_tokens_seen": 72069680, + "step": 4479 + }, + { + "epoch": 0.31381614086702214, + "grad_norm": 3.7663521766662598, + "learning_rate": 6.86478458844133e-05, + "loss": 0.937, + "num_input_tokens_seen": 72085840, + "step": 4480 + }, + { + "epoch": 0.3138861891127514, + "grad_norm": 3.9587883949279785, + "learning_rate": 6.864084763572679e-05, + "loss": 1.1194, + "num_input_tokens_seen": 72102224, + "step": 4481 + }, + { + "epoch": 0.31395623735848066, + "grad_norm": 3.953232526779175, + "learning_rate": 6.86338493870403e-05, + "loss": 0.9581, + "num_input_tokens_seen": 72118608, + "step": 4482 + }, + { + "epoch": 0.3140262856042099, + "grad_norm": 3.917574882507324, + "learning_rate": 6.862685113835377e-05, + "loss": 1.1883, + "num_input_tokens_seen": 72134504, + "step": 4483 + }, + { + "epoch": 0.3140963338499391, + "grad_norm": 3.756253242492676, + "learning_rate": 6.861985288966726e-05, + "loss": 1.1057, + "num_input_tokens_seen": 72150368, + "step": 4484 + }, + { + "epoch": 0.3141663820956684, + "grad_norm": 4.146200656890869, + "learning_rate": 6.861285464098075e-05, + "loss": 0.98, + "num_input_tokens_seen": 72166752, + "step": 4485 + }, + { + "epoch": 0.31423643034139764, + "grad_norm": 3.98949933052063, + "learning_rate": 6.860585639229422e-05, + "loss": 1.2088, + "num_input_tokens_seen": 72182696, + "step": 4486 + }, + { + "epoch": 0.3143064785871269, + "grad_norm": 3.99951434135437, + "learning_rate": 6.85988581436077e-05, + "loss": 1.218, + "num_input_tokens_seen": 72199080, + "step": 4487 + }, + { + "epoch": 0.3143765268328561, + "grad_norm": 4.351415157318115, + "learning_rate": 6.85918598949212e-05, + "loss": 1.0178, + "num_input_tokens_seen": 72215176, + "step": 4488 + }, + { + "epoch": 0.31444657507858537, + "grad_norm": 4.563141822814941, + "learning_rate": 6.858486164623469e-05, + "loss": 1.0002, + "num_input_tokens_seen": 72231560, + "step": 4489 + }, + { + "epoch": 0.3145166233243146, + "grad_norm": 4.523083686828613, + "learning_rate": 6.857786339754816e-05, + "loss": 1.1464, + "num_input_tokens_seen": 72246920, + "step": 4490 + }, + { + "epoch": 0.3145866715700439, + "grad_norm": 4.032657623291016, + "learning_rate": 6.857086514886165e-05, + "loss": 1.1774, + "num_input_tokens_seen": 72263304, + "step": 4491 + }, + { + "epoch": 0.3146567198157731, + "grad_norm": 4.755338191986084, + "learning_rate": 6.856386690017514e-05, + "loss": 1.0756, + "num_input_tokens_seen": 72279688, + "step": 4492 + }, + { + "epoch": 0.31472676806150235, + "grad_norm": 4.037180423736572, + "learning_rate": 6.855686865148862e-05, + "loss": 1.2973, + "num_input_tokens_seen": 72296072, + "step": 4493 + }, + { + "epoch": 0.3147968163072316, + "grad_norm": 3.308746099472046, + "learning_rate": 6.85498704028021e-05, + "loss": 0.9127, + "num_input_tokens_seen": 72312360, + "step": 4494 + }, + { + "epoch": 0.31486686455296087, + "grad_norm": 4.204549789428711, + "learning_rate": 6.854287215411559e-05, + "loss": 1.1138, + "num_input_tokens_seen": 72328744, + "step": 4495 + }, + { + "epoch": 0.31493691279869007, + "grad_norm": 4.142894744873047, + "learning_rate": 6.853587390542908e-05, + "loss": 1.0273, + "num_input_tokens_seen": 72344944, + "step": 4496 + }, + { + "epoch": 0.31500696104441933, + "grad_norm": 5.43609094619751, + "learning_rate": 6.852887565674256e-05, + "loss": 0.9369, + "num_input_tokens_seen": 72360672, + "step": 4497 + }, + { + "epoch": 0.3150770092901486, + "grad_norm": 4.20035982131958, + "learning_rate": 6.852187740805604e-05, + "loss": 1.0857, + "num_input_tokens_seen": 72376744, + "step": 4498 + }, + { + "epoch": 0.31514705753587785, + "grad_norm": 3.6777737140655518, + "learning_rate": 6.851487915936953e-05, + "loss": 1.0489, + "num_input_tokens_seen": 72393128, + "step": 4499 + }, + { + "epoch": 0.31521710578160705, + "grad_norm": 5.047235488891602, + "learning_rate": 6.850788091068301e-05, + "loss": 1.0644, + "num_input_tokens_seen": 72408016, + "step": 4500 + }, + { + "epoch": 0.3152871540273363, + "grad_norm": 4.095731258392334, + "learning_rate": 6.85008826619965e-05, + "loss": 1.0881, + "num_input_tokens_seen": 72424400, + "step": 4501 + }, + { + "epoch": 0.3153572022730656, + "grad_norm": 3.6437504291534424, + "learning_rate": 6.849388441331e-05, + "loss": 1.1428, + "num_input_tokens_seen": 72440368, + "step": 4502 + }, + { + "epoch": 0.31542725051879483, + "grad_norm": 5.345888614654541, + "learning_rate": 6.848688616462347e-05, + "loss": 1.0143, + "num_input_tokens_seen": 72456752, + "step": 4503 + }, + { + "epoch": 0.3154972987645241, + "grad_norm": 4.471817970275879, + "learning_rate": 6.847988791593695e-05, + "loss": 1.11, + "num_input_tokens_seen": 72472952, + "step": 4504 + }, + { + "epoch": 0.3155673470102533, + "grad_norm": 3.8012888431549072, + "learning_rate": 6.847288966725044e-05, + "loss": 1.1961, + "num_input_tokens_seen": 72489256, + "step": 4505 + }, + { + "epoch": 0.31563739525598256, + "grad_norm": 7.531235218048096, + "learning_rate": 6.846589141856393e-05, + "loss": 1.0254, + "num_input_tokens_seen": 72503752, + "step": 4506 + }, + { + "epoch": 0.3157074435017118, + "grad_norm": 4.075259208679199, + "learning_rate": 6.84588931698774e-05, + "loss": 1.1834, + "num_input_tokens_seen": 72520136, + "step": 4507 + }, + { + "epoch": 0.3157774917474411, + "grad_norm": 5.203637599945068, + "learning_rate": 6.84518949211909e-05, + "loss": 1.1198, + "num_input_tokens_seen": 72536520, + "step": 4508 + }, + { + "epoch": 0.3158475399931703, + "grad_norm": 5.733241081237793, + "learning_rate": 6.844489667250439e-05, + "loss": 1.1981, + "num_input_tokens_seen": 72551976, + "step": 4509 + }, + { + "epoch": 0.31591758823889954, + "grad_norm": 4.182814121246338, + "learning_rate": 6.843789842381787e-05, + "loss": 1.0879, + "num_input_tokens_seen": 72568360, + "step": 4510 + }, + { + "epoch": 0.3159876364846288, + "grad_norm": 5.769293785095215, + "learning_rate": 6.843090017513136e-05, + "loss": 0.988, + "num_input_tokens_seen": 72584744, + "step": 4511 + }, + { + "epoch": 0.31605768473035806, + "grad_norm": 5.052547454833984, + "learning_rate": 6.842390192644484e-05, + "loss": 0.952, + "num_input_tokens_seen": 72600608, + "step": 4512 + }, + { + "epoch": 0.31612773297608726, + "grad_norm": 3.7260072231292725, + "learning_rate": 6.841690367775832e-05, + "loss": 0.948, + "num_input_tokens_seen": 72616720, + "step": 4513 + }, + { + "epoch": 0.3161977812218165, + "grad_norm": 4.230448246002197, + "learning_rate": 6.840990542907181e-05, + "loss": 1.3362, + "num_input_tokens_seen": 72632896, + "step": 4514 + }, + { + "epoch": 0.3162678294675458, + "grad_norm": 3.7840049266815186, + "learning_rate": 6.84029071803853e-05, + "loss": 1.1432, + "num_input_tokens_seen": 72649280, + "step": 4515 + }, + { + "epoch": 0.31633787771327504, + "grad_norm": 3.6891443729400635, + "learning_rate": 6.839590893169878e-05, + "loss": 0.9276, + "num_input_tokens_seen": 72665664, + "step": 4516 + }, + { + "epoch": 0.31640792595900424, + "grad_norm": 5.132042407989502, + "learning_rate": 6.838891068301226e-05, + "loss": 0.9418, + "num_input_tokens_seen": 72682048, + "step": 4517 + }, + { + "epoch": 0.3164779742047335, + "grad_norm": 4.329607009887695, + "learning_rate": 6.838191243432575e-05, + "loss": 1.0247, + "num_input_tokens_seen": 72698136, + "step": 4518 + }, + { + "epoch": 0.31654802245046276, + "grad_norm": 4.269455432891846, + "learning_rate": 6.837491418563924e-05, + "loss": 1.1186, + "num_input_tokens_seen": 72714296, + "step": 4519 + }, + { + "epoch": 0.316618070696192, + "grad_norm": 3.5963287353515625, + "learning_rate": 6.836791593695271e-05, + "loss": 0.8834, + "num_input_tokens_seen": 72730680, + "step": 4520 + }, + { + "epoch": 0.3166881189419212, + "grad_norm": 3.9145658016204834, + "learning_rate": 6.83609176882662e-05, + "loss": 1.1385, + "num_input_tokens_seen": 72746296, + "step": 4521 + }, + { + "epoch": 0.3167581671876505, + "grad_norm": 4.266791820526123, + "learning_rate": 6.83539194395797e-05, + "loss": 1.0825, + "num_input_tokens_seen": 72762680, + "step": 4522 + }, + { + "epoch": 0.31682821543337975, + "grad_norm": 3.850743532180786, + "learning_rate": 6.834692119089318e-05, + "loss": 1.0558, + "num_input_tokens_seen": 72778816, + "step": 4523 + }, + { + "epoch": 0.316898263679109, + "grad_norm": 3.8117008209228516, + "learning_rate": 6.833992294220665e-05, + "loss": 0.9562, + "num_input_tokens_seen": 72794576, + "step": 4524 + }, + { + "epoch": 0.3169683119248382, + "grad_norm": 4.469017028808594, + "learning_rate": 6.833292469352014e-05, + "loss": 1.2533, + "num_input_tokens_seen": 72810960, + "step": 4525 + }, + { + "epoch": 0.31703836017056747, + "grad_norm": 3.538980007171631, + "learning_rate": 6.832592644483363e-05, + "loss": 0.9393, + "num_input_tokens_seen": 72826480, + "step": 4526 + }, + { + "epoch": 0.3171084084162967, + "grad_norm": 3.6429643630981445, + "learning_rate": 6.83189281961471e-05, + "loss": 1.0492, + "num_input_tokens_seen": 72842440, + "step": 4527 + }, + { + "epoch": 0.317178456662026, + "grad_norm": 3.876481056213379, + "learning_rate": 6.831192994746061e-05, + "loss": 1.0699, + "num_input_tokens_seen": 72858424, + "step": 4528 + }, + { + "epoch": 0.3172485049077552, + "grad_norm": 5.119854927062988, + "learning_rate": 6.83049316987741e-05, + "loss": 1.1704, + "num_input_tokens_seen": 72874808, + "step": 4529 + }, + { + "epoch": 0.31731855315348445, + "grad_norm": 3.908071994781494, + "learning_rate": 6.829793345008757e-05, + "loss": 1.0156, + "num_input_tokens_seen": 72891192, + "step": 4530 + }, + { + "epoch": 0.3173886013992137, + "grad_norm": 4.499825954437256, + "learning_rate": 6.829093520140105e-05, + "loss": 0.9863, + "num_input_tokens_seen": 72907576, + "step": 4531 + }, + { + "epoch": 0.31745864964494297, + "grad_norm": 4.060844421386719, + "learning_rate": 6.828393695271453e-05, + "loss": 1.0173, + "num_input_tokens_seen": 72923960, + "step": 4532 + }, + { + "epoch": 0.3175286978906722, + "grad_norm": 4.47066068649292, + "learning_rate": 6.827693870402802e-05, + "loss": 0.859, + "num_input_tokens_seen": 72939576, + "step": 4533 + }, + { + "epoch": 0.31759874613640143, + "grad_norm": 3.6252682209014893, + "learning_rate": 6.826994045534151e-05, + "loss": 0.996, + "num_input_tokens_seen": 72955136, + "step": 4534 + }, + { + "epoch": 0.3176687943821307, + "grad_norm": 4.25836181640625, + "learning_rate": 6.8262942206655e-05, + "loss": 1.0267, + "num_input_tokens_seen": 72971520, + "step": 4535 + }, + { + "epoch": 0.31773884262785995, + "grad_norm": 3.6240739822387695, + "learning_rate": 6.825594395796849e-05, + "loss": 1.0116, + "num_input_tokens_seen": 72987440, + "step": 4536 + }, + { + "epoch": 0.3178088908735892, + "grad_norm": 4.470614910125732, + "learning_rate": 6.824894570928196e-05, + "loss": 1.1302, + "num_input_tokens_seen": 73003824, + "step": 4537 + }, + { + "epoch": 0.3178789391193184, + "grad_norm": 3.5759263038635254, + "learning_rate": 6.824194746059545e-05, + "loss": 0.9902, + "num_input_tokens_seen": 73020208, + "step": 4538 + }, + { + "epoch": 0.3179489873650477, + "grad_norm": 4.424665451049805, + "learning_rate": 6.823494921190894e-05, + "loss": 1.0239, + "num_input_tokens_seen": 73036592, + "step": 4539 + }, + { + "epoch": 0.31801903561077693, + "grad_norm": 3.803205966949463, + "learning_rate": 6.822795096322242e-05, + "loss": 0.9315, + "num_input_tokens_seen": 73052976, + "step": 4540 + }, + { + "epoch": 0.3180890838565062, + "grad_norm": 4.25760555267334, + "learning_rate": 6.82209527145359e-05, + "loss": 1.0985, + "num_input_tokens_seen": 73069360, + "step": 4541 + }, + { + "epoch": 0.3181591321022354, + "grad_norm": 4.006928443908691, + "learning_rate": 6.82139544658494e-05, + "loss": 0.9056, + "num_input_tokens_seen": 73084624, + "step": 4542 + }, + { + "epoch": 0.31822918034796466, + "grad_norm": 3.56350040435791, + "learning_rate": 6.820695621716288e-05, + "loss": 0.8721, + "num_input_tokens_seen": 73100008, + "step": 4543 + }, + { + "epoch": 0.3182992285936939, + "grad_norm": 3.7276062965393066, + "learning_rate": 6.819995796847636e-05, + "loss": 1.1001, + "num_input_tokens_seen": 73116392, + "step": 4544 + }, + { + "epoch": 0.3183692768394232, + "grad_norm": 4.955738544464111, + "learning_rate": 6.819295971978985e-05, + "loss": 1.0459, + "num_input_tokens_seen": 73131920, + "step": 4545 + }, + { + "epoch": 0.3184393250851524, + "grad_norm": 3.5275161266326904, + "learning_rate": 6.818596147110333e-05, + "loss": 1.1006, + "num_input_tokens_seen": 73148304, + "step": 4546 + }, + { + "epoch": 0.31850937333088164, + "grad_norm": 6.4245924949646, + "learning_rate": 6.817896322241681e-05, + "loss": 1.2968, + "num_input_tokens_seen": 73164688, + "step": 4547 + }, + { + "epoch": 0.3185794215766109, + "grad_norm": 4.1172966957092285, + "learning_rate": 6.81719649737303e-05, + "loss": 1.0743, + "num_input_tokens_seen": 73181072, + "step": 4548 + }, + { + "epoch": 0.31864946982234016, + "grad_norm": 3.849090337753296, + "learning_rate": 6.81649667250438e-05, + "loss": 1.1064, + "num_input_tokens_seen": 73197456, + "step": 4549 + }, + { + "epoch": 0.31871951806806936, + "grad_norm": 6.241509437561035, + "learning_rate": 6.815796847635728e-05, + "loss": 1.0592, + "num_input_tokens_seen": 73213568, + "step": 4550 + }, + { + "epoch": 0.3187895663137986, + "grad_norm": 4.039997577667236, + "learning_rate": 6.815097022767075e-05, + "loss": 0.9789, + "num_input_tokens_seen": 73229648, + "step": 4551 + }, + { + "epoch": 0.3188596145595279, + "grad_norm": 3.757549285888672, + "learning_rate": 6.814397197898424e-05, + "loss": 1.1547, + "num_input_tokens_seen": 73245952, + "step": 4552 + }, + { + "epoch": 0.31892966280525714, + "grad_norm": 4.177220821380615, + "learning_rate": 6.813697373029773e-05, + "loss": 1.3134, + "num_input_tokens_seen": 73262336, + "step": 4553 + }, + { + "epoch": 0.31899971105098635, + "grad_norm": 3.659167766571045, + "learning_rate": 6.812997548161122e-05, + "loss": 0.9954, + "num_input_tokens_seen": 73278304, + "step": 4554 + }, + { + "epoch": 0.3190697592967156, + "grad_norm": 4.289649486541748, + "learning_rate": 6.81229772329247e-05, + "loss": 0.8452, + "num_input_tokens_seen": 73294320, + "step": 4555 + }, + { + "epoch": 0.31913980754244486, + "grad_norm": 4.452631950378418, + "learning_rate": 6.811597898423819e-05, + "loss": 1.0265, + "num_input_tokens_seen": 73310256, + "step": 4556 + }, + { + "epoch": 0.3192098557881741, + "grad_norm": 3.572444438934326, + "learning_rate": 6.810898073555167e-05, + "loss": 1.0247, + "num_input_tokens_seen": 73326640, + "step": 4557 + }, + { + "epoch": 0.31927990403390333, + "grad_norm": 4.059347629547119, + "learning_rate": 6.810198248686514e-05, + "loss": 1.0103, + "num_input_tokens_seen": 73342096, + "step": 4558 + }, + { + "epoch": 0.3193499522796326, + "grad_norm": 5.144520282745361, + "learning_rate": 6.809498423817863e-05, + "loss": 1.1181, + "num_input_tokens_seen": 73358480, + "step": 4559 + }, + { + "epoch": 0.31942000052536185, + "grad_norm": 4.210456848144531, + "learning_rate": 6.808798598949212e-05, + "loss": 1.1197, + "num_input_tokens_seen": 73374864, + "step": 4560 + }, + { + "epoch": 0.3194900487710911, + "grad_norm": 5.06007194519043, + "learning_rate": 6.808098774080561e-05, + "loss": 0.8933, + "num_input_tokens_seen": 73391248, + "step": 4561 + }, + { + "epoch": 0.3195600970168203, + "grad_norm": 4.032425403594971, + "learning_rate": 6.80739894921191e-05, + "loss": 0.9132, + "num_input_tokens_seen": 73406728, + "step": 4562 + }, + { + "epoch": 0.31963014526254957, + "grad_norm": 4.344507694244385, + "learning_rate": 6.806699124343259e-05, + "loss": 1.1248, + "num_input_tokens_seen": 73423112, + "step": 4563 + }, + { + "epoch": 0.31970019350827883, + "grad_norm": 3.7113993167877197, + "learning_rate": 6.805999299474606e-05, + "loss": 0.9122, + "num_input_tokens_seen": 73439496, + "step": 4564 + }, + { + "epoch": 0.3197702417540081, + "grad_norm": 4.160495281219482, + "learning_rate": 6.805299474605955e-05, + "loss": 1.0425, + "num_input_tokens_seen": 73455456, + "step": 4565 + }, + { + "epoch": 0.3198402899997373, + "grad_norm": 5.51431131362915, + "learning_rate": 6.804599649737304e-05, + "loss": 0.9416, + "num_input_tokens_seen": 73471840, + "step": 4566 + }, + { + "epoch": 0.31991033824546655, + "grad_norm": 4.145261287689209, + "learning_rate": 6.803899824868651e-05, + "loss": 0.947, + "num_input_tokens_seen": 73487688, + "step": 4567 + }, + { + "epoch": 0.3199803864911958, + "grad_norm": 3.917922019958496, + "learning_rate": 6.8032e-05, + "loss": 1.1859, + "num_input_tokens_seen": 73504072, + "step": 4568 + }, + { + "epoch": 0.32005043473692507, + "grad_norm": 3.8644864559173584, + "learning_rate": 6.802500175131349e-05, + "loss": 0.9176, + "num_input_tokens_seen": 73520344, + "step": 4569 + }, + { + "epoch": 0.32012048298265433, + "grad_norm": 4.043839931488037, + "learning_rate": 6.801800350262698e-05, + "loss": 1.0045, + "num_input_tokens_seen": 73536248, + "step": 4570 + }, + { + "epoch": 0.32019053122838353, + "grad_norm": 4.793722629547119, + "learning_rate": 6.801100525394045e-05, + "loss": 1.2245, + "num_input_tokens_seen": 73552512, + "step": 4571 + }, + { + "epoch": 0.3202605794741128, + "grad_norm": 3.632899761199951, + "learning_rate": 6.800400700525394e-05, + "loss": 0.9899, + "num_input_tokens_seen": 73568896, + "step": 4572 + }, + { + "epoch": 0.32033062771984205, + "grad_norm": 6.236395359039307, + "learning_rate": 6.799700875656743e-05, + "loss": 1.113, + "num_input_tokens_seen": 73585280, + "step": 4573 + }, + { + "epoch": 0.3204006759655713, + "grad_norm": 4.591775417327881, + "learning_rate": 6.799001050788092e-05, + "loss": 1.0019, + "num_input_tokens_seen": 73600328, + "step": 4574 + }, + { + "epoch": 0.3204707242113005, + "grad_norm": 3.9546539783477783, + "learning_rate": 6.79830122591944e-05, + "loss": 1.0444, + "num_input_tokens_seen": 73616568, + "step": 4575 + }, + { + "epoch": 0.3205407724570298, + "grad_norm": 4.425241470336914, + "learning_rate": 6.79760140105079e-05, + "loss": 1.0112, + "num_input_tokens_seen": 73632552, + "step": 4576 + }, + { + "epoch": 0.32061082070275904, + "grad_norm": 3.999953508377075, + "learning_rate": 6.796901576182137e-05, + "loss": 1.1854, + "num_input_tokens_seen": 73648672, + "step": 4577 + }, + { + "epoch": 0.3206808689484883, + "grad_norm": 3.6718766689300537, + "learning_rate": 6.796201751313485e-05, + "loss": 1.0379, + "num_input_tokens_seen": 73665056, + "step": 4578 + }, + { + "epoch": 0.3207509171942175, + "grad_norm": 4.37136173248291, + "learning_rate": 6.795501926444834e-05, + "loss": 0.9921, + "num_input_tokens_seen": 73679680, + "step": 4579 + }, + { + "epoch": 0.32082096543994676, + "grad_norm": 5.109454154968262, + "learning_rate": 6.794802101576182e-05, + "loss": 1.0901, + "num_input_tokens_seen": 73695768, + "step": 4580 + }, + { + "epoch": 0.320891013685676, + "grad_norm": 4.277298927307129, + "learning_rate": 6.794102276707531e-05, + "loss": 1.0651, + "num_input_tokens_seen": 73711840, + "step": 4581 + }, + { + "epoch": 0.3209610619314053, + "grad_norm": 4.598893165588379, + "learning_rate": 6.79340245183888e-05, + "loss": 1.1733, + "num_input_tokens_seen": 73728184, + "step": 4582 + }, + { + "epoch": 0.3210311101771345, + "grad_norm": 5.124484539031982, + "learning_rate": 6.792702626970229e-05, + "loss": 0.9399, + "num_input_tokens_seen": 73744568, + "step": 4583 + }, + { + "epoch": 0.32110115842286374, + "grad_norm": 4.426584243774414, + "learning_rate": 6.792002802101577e-05, + "loss": 0.8339, + "num_input_tokens_seen": 73760424, + "step": 4584 + }, + { + "epoch": 0.321171206668593, + "grad_norm": 3.5181384086608887, + "learning_rate": 6.791302977232924e-05, + "loss": 0.8025, + "num_input_tokens_seen": 73776808, + "step": 4585 + }, + { + "epoch": 0.32124125491432226, + "grad_norm": 6.614295482635498, + "learning_rate": 6.790603152364273e-05, + "loss": 1.1392, + "num_input_tokens_seen": 73793192, + "step": 4586 + }, + { + "epoch": 0.32131130316005146, + "grad_norm": 5.212308406829834, + "learning_rate": 6.789903327495622e-05, + "loss": 1.0909, + "num_input_tokens_seen": 73809576, + "step": 4587 + }, + { + "epoch": 0.3213813514057807, + "grad_norm": 4.7378106117248535, + "learning_rate": 6.78920350262697e-05, + "loss": 1.112, + "num_input_tokens_seen": 73825680, + "step": 4588 + }, + { + "epoch": 0.32145139965151, + "grad_norm": 5.0195136070251465, + "learning_rate": 6.78850367775832e-05, + "loss": 1.4437, + "num_input_tokens_seen": 73841200, + "step": 4589 + }, + { + "epoch": 0.32152144789723924, + "grad_norm": 6.186412811279297, + "learning_rate": 6.787803852889668e-05, + "loss": 1.0715, + "num_input_tokens_seen": 73857584, + "step": 4590 + }, + { + "epoch": 0.32159149614296845, + "grad_norm": 6.835412502288818, + "learning_rate": 6.787104028021016e-05, + "loss": 0.9454, + "num_input_tokens_seen": 73873624, + "step": 4591 + }, + { + "epoch": 0.3216615443886977, + "grad_norm": 4.3859333992004395, + "learning_rate": 6.786404203152365e-05, + "loss": 0.9344, + "num_input_tokens_seen": 73890008, + "step": 4592 + }, + { + "epoch": 0.32173159263442697, + "grad_norm": 3.8230555057525635, + "learning_rate": 6.785704378283714e-05, + "loss": 0.9475, + "num_input_tokens_seen": 73906392, + "step": 4593 + }, + { + "epoch": 0.3218016408801562, + "grad_norm": 4.458274841308594, + "learning_rate": 6.785004553415062e-05, + "loss": 1.1223, + "num_input_tokens_seen": 73922776, + "step": 4594 + }, + { + "epoch": 0.32187168912588543, + "grad_norm": 4.006426811218262, + "learning_rate": 6.78430472854641e-05, + "loss": 1.3019, + "num_input_tokens_seen": 73938896, + "step": 4595 + }, + { + "epoch": 0.3219417373716147, + "grad_norm": 4.637386322021484, + "learning_rate": 6.783604903677759e-05, + "loss": 1.0272, + "num_input_tokens_seen": 73955280, + "step": 4596 + }, + { + "epoch": 0.32201178561734395, + "grad_norm": 5.13168478012085, + "learning_rate": 6.782905078809108e-05, + "loss": 1.1046, + "num_input_tokens_seen": 73971480, + "step": 4597 + }, + { + "epoch": 0.3220818338630732, + "grad_norm": 3.8248770236968994, + "learning_rate": 6.782205253940455e-05, + "loss": 1.0467, + "num_input_tokens_seen": 73987712, + "step": 4598 + }, + { + "epoch": 0.3221518821088024, + "grad_norm": 5.167041778564453, + "learning_rate": 6.781505429071804e-05, + "loss": 1.005, + "num_input_tokens_seen": 74004096, + "step": 4599 + }, + { + "epoch": 0.32222193035453167, + "grad_norm": 3.779311180114746, + "learning_rate": 6.780805604203153e-05, + "loss": 0.9102, + "num_input_tokens_seen": 74020176, + "step": 4600 + }, + { + "epoch": 0.32222193035453167, + "eval_loss": 1.1318858861923218, + "eval_runtime": 0.2027, + "eval_samples_per_second": 4.933, + "eval_steps_per_second": 4.933, + "num_input_tokens_seen": 74020176, + "step": 4600 + }, + { + "epoch": 0.32229197860026093, + "grad_norm": 3.8468148708343506, + "learning_rate": 6.780105779334502e-05, + "loss": 0.9602, + "num_input_tokens_seen": 74035664, + "step": 4601 + }, + { + "epoch": 0.3223620268459902, + "grad_norm": 4.28491735458374, + "learning_rate": 6.779405954465849e-05, + "loss": 1.1125, + "num_input_tokens_seen": 74050408, + "step": 4602 + }, + { + "epoch": 0.3224320750917194, + "grad_norm": 4.872751712799072, + "learning_rate": 6.7787061295972e-05, + "loss": 0.9746, + "num_input_tokens_seen": 74066336, + "step": 4603 + }, + { + "epoch": 0.32250212333744865, + "grad_norm": 4.060647487640381, + "learning_rate": 6.778006304728547e-05, + "loss": 1.0575, + "num_input_tokens_seen": 74082720, + "step": 4604 + }, + { + "epoch": 0.3225721715831779, + "grad_norm": 3.607623815536499, + "learning_rate": 6.777306479859894e-05, + "loss": 0.9797, + "num_input_tokens_seen": 74099104, + "step": 4605 + }, + { + "epoch": 0.3226422198289072, + "grad_norm": 3.719801187515259, + "learning_rate": 6.776606654991243e-05, + "loss": 1.0249, + "num_input_tokens_seen": 74115488, + "step": 4606 + }, + { + "epoch": 0.32271226807463643, + "grad_norm": 5.072197914123535, + "learning_rate": 6.775906830122592e-05, + "loss": 1.1264, + "num_input_tokens_seen": 74131224, + "step": 4607 + }, + { + "epoch": 0.32278231632036564, + "grad_norm": 6.052949905395508, + "learning_rate": 6.775207005253941e-05, + "loss": 1.1319, + "num_input_tokens_seen": 74147608, + "step": 4608 + }, + { + "epoch": 0.3228523645660949, + "grad_norm": 6.214832782745361, + "learning_rate": 6.77450718038529e-05, + "loss": 0.9861, + "num_input_tokens_seen": 74162840, + "step": 4609 + }, + { + "epoch": 0.32292241281182416, + "grad_norm": 4.279264450073242, + "learning_rate": 6.773807355516639e-05, + "loss": 0.9855, + "num_input_tokens_seen": 74179224, + "step": 4610 + }, + { + "epoch": 0.3229924610575534, + "grad_norm": 3.8564460277557373, + "learning_rate": 6.773107530647986e-05, + "loss": 1.0555, + "num_input_tokens_seen": 74195608, + "step": 4611 + }, + { + "epoch": 0.3230625093032826, + "grad_norm": 4.747770309448242, + "learning_rate": 6.772407705779334e-05, + "loss": 1.0011, + "num_input_tokens_seen": 74211664, + "step": 4612 + }, + { + "epoch": 0.3231325575490119, + "grad_norm": 3.5425655841827393, + "learning_rate": 6.771707880910683e-05, + "loss": 1.135, + "num_input_tokens_seen": 74228048, + "step": 4613 + }, + { + "epoch": 0.32320260579474114, + "grad_norm": 3.919851303100586, + "learning_rate": 6.771008056042033e-05, + "loss": 0.9791, + "num_input_tokens_seen": 74243424, + "step": 4614 + }, + { + "epoch": 0.3232726540404704, + "grad_norm": 4.061427593231201, + "learning_rate": 6.77030823117338e-05, + "loss": 1.2477, + "num_input_tokens_seen": 74259696, + "step": 4615 + }, + { + "epoch": 0.3233427022861996, + "grad_norm": 5.14341926574707, + "learning_rate": 6.769608406304729e-05, + "loss": 0.9715, + "num_input_tokens_seen": 74274968, + "step": 4616 + }, + { + "epoch": 0.32341275053192886, + "grad_norm": 6.207670211791992, + "learning_rate": 6.768908581436078e-05, + "loss": 1.0955, + "num_input_tokens_seen": 74291352, + "step": 4617 + }, + { + "epoch": 0.3234827987776581, + "grad_norm": 5.948925971984863, + "learning_rate": 6.768208756567426e-05, + "loss": 1.1007, + "num_input_tokens_seen": 74307000, + "step": 4618 + }, + { + "epoch": 0.3235528470233874, + "grad_norm": 5.205277442932129, + "learning_rate": 6.767508931698774e-05, + "loss": 0.9458, + "num_input_tokens_seen": 74323384, + "step": 4619 + }, + { + "epoch": 0.3236228952691166, + "grad_norm": 3.8878557682037354, + "learning_rate": 6.766809106830123e-05, + "loss": 1.01, + "num_input_tokens_seen": 74339768, + "step": 4620 + }, + { + "epoch": 0.32369294351484584, + "grad_norm": 4.9194111824035645, + "learning_rate": 6.766109281961472e-05, + "loss": 1.1011, + "num_input_tokens_seen": 74355888, + "step": 4621 + }, + { + "epoch": 0.3237629917605751, + "grad_norm": 3.5212655067443848, + "learning_rate": 6.76540945709282e-05, + "loss": 1.0886, + "num_input_tokens_seen": 74372048, + "step": 4622 + }, + { + "epoch": 0.32383304000630436, + "grad_norm": 3.6212568283081055, + "learning_rate": 6.764709632224168e-05, + "loss": 1.0616, + "num_input_tokens_seen": 74388432, + "step": 4623 + }, + { + "epoch": 0.32390308825203357, + "grad_norm": 3.795515298843384, + "learning_rate": 6.764009807355517e-05, + "loss": 1.1594, + "num_input_tokens_seen": 74404584, + "step": 4624 + }, + { + "epoch": 0.3239731364977628, + "grad_norm": 4.537838935852051, + "learning_rate": 6.763309982486865e-05, + "loss": 1.1319, + "num_input_tokens_seen": 74420304, + "step": 4625 + }, + { + "epoch": 0.3240431847434921, + "grad_norm": 4.276764392852783, + "learning_rate": 6.762610157618214e-05, + "loss": 0.9162, + "num_input_tokens_seen": 74436688, + "step": 4626 + }, + { + "epoch": 0.32411323298922134, + "grad_norm": 3.9739227294921875, + "learning_rate": 6.761910332749563e-05, + "loss": 1.0002, + "num_input_tokens_seen": 74451824, + "step": 4627 + }, + { + "epoch": 0.32418328123495055, + "grad_norm": 4.176823616027832, + "learning_rate": 6.761210507880911e-05, + "loss": 1.2547, + "num_input_tokens_seen": 74467080, + "step": 4628 + }, + { + "epoch": 0.3242533294806798, + "grad_norm": 4.471405029296875, + "learning_rate": 6.760510683012259e-05, + "loss": 0.9694, + "num_input_tokens_seen": 74483464, + "step": 4629 + }, + { + "epoch": 0.32432337772640907, + "grad_norm": 3.95442271232605, + "learning_rate": 6.759810858143609e-05, + "loss": 1.1059, + "num_input_tokens_seen": 74499848, + "step": 4630 + }, + { + "epoch": 0.3243934259721383, + "grad_norm": 5.348501682281494, + "learning_rate": 6.759111033274957e-05, + "loss": 1.043, + "num_input_tokens_seen": 74516232, + "step": 4631 + }, + { + "epoch": 0.32446347421786753, + "grad_norm": 4.405150413513184, + "learning_rate": 6.758411208406304e-05, + "loss": 1.0732, + "num_input_tokens_seen": 74531120, + "step": 4632 + }, + { + "epoch": 0.3245335224635968, + "grad_norm": 3.633358955383301, + "learning_rate": 6.757711383537653e-05, + "loss": 0.9585, + "num_input_tokens_seen": 74547504, + "step": 4633 + }, + { + "epoch": 0.32460357070932605, + "grad_norm": 4.668785095214844, + "learning_rate": 6.757011558669003e-05, + "loss": 1.2355, + "num_input_tokens_seen": 74563888, + "step": 4634 + }, + { + "epoch": 0.3246736189550553, + "grad_norm": 5.222908020019531, + "learning_rate": 6.756311733800351e-05, + "loss": 1.094, + "num_input_tokens_seen": 74580224, + "step": 4635 + }, + { + "epoch": 0.3247436672007845, + "grad_norm": 3.812385082244873, + "learning_rate": 6.7556119089317e-05, + "loss": 1.1326, + "num_input_tokens_seen": 74596608, + "step": 4636 + }, + { + "epoch": 0.3248137154465138, + "grad_norm": 5.080833911895752, + "learning_rate": 6.754912084063048e-05, + "loss": 1.0665, + "num_input_tokens_seen": 74612456, + "step": 4637 + }, + { + "epoch": 0.32488376369224303, + "grad_norm": 5.309609413146973, + "learning_rate": 6.754212259194396e-05, + "loss": 1.0206, + "num_input_tokens_seen": 74627840, + "step": 4638 + }, + { + "epoch": 0.3249538119379723, + "grad_norm": 4.46236515045166, + "learning_rate": 6.753512434325743e-05, + "loss": 1.1093, + "num_input_tokens_seen": 74643800, + "step": 4639 + }, + { + "epoch": 0.32502386018370155, + "grad_norm": 9.981855392456055, + "learning_rate": 6.752812609457094e-05, + "loss": 1.2777, + "num_input_tokens_seen": 74660184, + "step": 4640 + }, + { + "epoch": 0.32509390842943076, + "grad_norm": 5.075852870941162, + "learning_rate": 6.752112784588443e-05, + "loss": 0.9977, + "num_input_tokens_seen": 74676568, + "step": 4641 + }, + { + "epoch": 0.32516395667516, + "grad_norm": 3.8985090255737305, + "learning_rate": 6.75141295971979e-05, + "loss": 1.1299, + "num_input_tokens_seen": 74692952, + "step": 4642 + }, + { + "epoch": 0.3252340049208893, + "grad_norm": 4.9769673347473145, + "learning_rate": 6.750713134851139e-05, + "loss": 1.3023, + "num_input_tokens_seen": 74709216, + "step": 4643 + }, + { + "epoch": 0.32530405316661853, + "grad_norm": 4.508238315582275, + "learning_rate": 6.750013309982488e-05, + "loss": 1.014, + "num_input_tokens_seen": 74724640, + "step": 4644 + }, + { + "epoch": 0.32537410141234774, + "grad_norm": 4.214225769042969, + "learning_rate": 6.749313485113835e-05, + "loss": 1.0864, + "num_input_tokens_seen": 74740696, + "step": 4645 + }, + { + "epoch": 0.325444149658077, + "grad_norm": 4.217604160308838, + "learning_rate": 6.748613660245184e-05, + "loss": 1.0521, + "num_input_tokens_seen": 74756520, + "step": 4646 + }, + { + "epoch": 0.32551419790380626, + "grad_norm": 3.5975253582000732, + "learning_rate": 6.747913835376533e-05, + "loss": 0.9642, + "num_input_tokens_seen": 74772904, + "step": 4647 + }, + { + "epoch": 0.3255842461495355, + "grad_norm": 3.5055267810821533, + "learning_rate": 6.747214010507882e-05, + "loss": 1.0975, + "num_input_tokens_seen": 74789288, + "step": 4648 + }, + { + "epoch": 0.3256542943952647, + "grad_norm": 3.8605833053588867, + "learning_rate": 6.746514185639229e-05, + "loss": 1.095, + "num_input_tokens_seen": 74804768, + "step": 4649 + }, + { + "epoch": 0.325724342640994, + "grad_norm": 9.446599006652832, + "learning_rate": 6.745814360770578e-05, + "loss": 1.1894, + "num_input_tokens_seen": 74821152, + "step": 4650 + }, + { + "epoch": 0.32579439088672324, + "grad_norm": 4.161158084869385, + "learning_rate": 6.745114535901927e-05, + "loss": 0.984, + "num_input_tokens_seen": 74836992, + "step": 4651 + }, + { + "epoch": 0.3258644391324525, + "grad_norm": 3.5690324306488037, + "learning_rate": 6.744414711033275e-05, + "loss": 1.0186, + "num_input_tokens_seen": 74852896, + "step": 4652 + }, + { + "epoch": 0.3259344873781817, + "grad_norm": 3.5873210430145264, + "learning_rate": 6.743714886164623e-05, + "loss": 1.0069, + "num_input_tokens_seen": 74868472, + "step": 4653 + }, + { + "epoch": 0.32600453562391096, + "grad_norm": 4.192559719085693, + "learning_rate": 6.743015061295972e-05, + "loss": 1.0646, + "num_input_tokens_seen": 74884856, + "step": 4654 + }, + { + "epoch": 0.3260745838696402, + "grad_norm": 4.633018493652344, + "learning_rate": 6.742315236427321e-05, + "loss": 1.1525, + "num_input_tokens_seen": 74900848, + "step": 4655 + }, + { + "epoch": 0.3261446321153695, + "grad_norm": 3.568934440612793, + "learning_rate": 6.741615411558669e-05, + "loss": 1.061, + "num_input_tokens_seen": 74917232, + "step": 4656 + }, + { + "epoch": 0.3262146803610987, + "grad_norm": 3.6099655628204346, + "learning_rate": 6.740915586690019e-05, + "loss": 1.0758, + "num_input_tokens_seen": 74933616, + "step": 4657 + }, + { + "epoch": 0.32628472860682795, + "grad_norm": 4.272975921630859, + "learning_rate": 6.740215761821366e-05, + "loss": 1.1901, + "num_input_tokens_seen": 74950000, + "step": 4658 + }, + { + "epoch": 0.3263547768525572, + "grad_norm": 4.2752251625061035, + "learning_rate": 6.739515936952714e-05, + "loss": 1.0835, + "num_input_tokens_seen": 74966032, + "step": 4659 + }, + { + "epoch": 0.32642482509828646, + "grad_norm": 5.06410551071167, + "learning_rate": 6.738816112084064e-05, + "loss": 1.2041, + "num_input_tokens_seen": 74981432, + "step": 4660 + }, + { + "epoch": 0.32649487334401567, + "grad_norm": 6.378856182098389, + "learning_rate": 6.738116287215413e-05, + "loss": 1.2996, + "num_input_tokens_seen": 74997440, + "step": 4661 + }, + { + "epoch": 0.3265649215897449, + "grad_norm": 5.427485466003418, + "learning_rate": 6.73741646234676e-05, + "loss": 1.2233, + "num_input_tokens_seen": 75013824, + "step": 4662 + }, + { + "epoch": 0.3266349698354742, + "grad_norm": 4.366839408874512, + "learning_rate": 6.736716637478109e-05, + "loss": 1.2077, + "num_input_tokens_seen": 75030208, + "step": 4663 + }, + { + "epoch": 0.32670501808120345, + "grad_norm": 5.765005588531494, + "learning_rate": 6.736016812609458e-05, + "loss": 1.0833, + "num_input_tokens_seen": 75046592, + "step": 4664 + }, + { + "epoch": 0.32677506632693265, + "grad_norm": 3.4886975288391113, + "learning_rate": 6.735316987740806e-05, + "loss": 0.7976, + "num_input_tokens_seen": 75062976, + "step": 4665 + }, + { + "epoch": 0.3268451145726619, + "grad_norm": 4.1105875968933105, + "learning_rate": 6.734617162872154e-05, + "loss": 1.011, + "num_input_tokens_seen": 75078024, + "step": 4666 + }, + { + "epoch": 0.32691516281839117, + "grad_norm": 3.8737053871154785, + "learning_rate": 6.733917338003503e-05, + "loss": 1.0544, + "num_input_tokens_seen": 75094408, + "step": 4667 + }, + { + "epoch": 0.32698521106412043, + "grad_norm": 4.077807426452637, + "learning_rate": 6.733217513134852e-05, + "loss": 1.1573, + "num_input_tokens_seen": 75110792, + "step": 4668 + }, + { + "epoch": 0.32705525930984963, + "grad_norm": 4.339305400848389, + "learning_rate": 6.7325176882662e-05, + "loss": 0.7132, + "num_input_tokens_seen": 75126240, + "step": 4669 + }, + { + "epoch": 0.3271253075555789, + "grad_norm": 4.241507053375244, + "learning_rate": 6.731817863397549e-05, + "loss": 1.1594, + "num_input_tokens_seen": 75142144, + "step": 4670 + }, + { + "epoch": 0.32719535580130815, + "grad_norm": 7.518558979034424, + "learning_rate": 6.731118038528897e-05, + "loss": 1.0168, + "num_input_tokens_seen": 75158528, + "step": 4671 + }, + { + "epoch": 0.3272654040470374, + "grad_norm": 4.342295169830322, + "learning_rate": 6.730418213660245e-05, + "loss": 1.2134, + "num_input_tokens_seen": 75174912, + "step": 4672 + }, + { + "epoch": 0.3273354522927666, + "grad_norm": 3.3599188327789307, + "learning_rate": 6.729718388791594e-05, + "loss": 0.9183, + "num_input_tokens_seen": 75190720, + "step": 4673 + }, + { + "epoch": 0.3274055005384959, + "grad_norm": 4.393617153167725, + "learning_rate": 6.729018563922943e-05, + "loss": 1.1215, + "num_input_tokens_seen": 75207104, + "step": 4674 + }, + { + "epoch": 0.32747554878422513, + "grad_norm": 3.948538064956665, + "learning_rate": 6.728318739054292e-05, + "loss": 0.9105, + "num_input_tokens_seen": 75222736, + "step": 4675 + }, + { + "epoch": 0.3275455970299544, + "grad_norm": 5.3323469161987305, + "learning_rate": 6.727618914185639e-05, + "loss": 0.9977, + "num_input_tokens_seen": 75238680, + "step": 4676 + }, + { + "epoch": 0.32761564527568365, + "grad_norm": 4.943187713623047, + "learning_rate": 6.726919089316988e-05, + "loss": 0.9327, + "num_input_tokens_seen": 75255064, + "step": 4677 + }, + { + "epoch": 0.32768569352141286, + "grad_norm": 4.083932399749756, + "learning_rate": 6.726219264448337e-05, + "loss": 1.2085, + "num_input_tokens_seen": 75271448, + "step": 4678 + }, + { + "epoch": 0.3277557417671421, + "grad_norm": 4.682622909545898, + "learning_rate": 6.725519439579684e-05, + "loss": 1.0105, + "num_input_tokens_seen": 75287752, + "step": 4679 + }, + { + "epoch": 0.3278257900128714, + "grad_norm": 4.544816493988037, + "learning_rate": 6.724819614711033e-05, + "loss": 1.0422, + "num_input_tokens_seen": 75304136, + "step": 4680 + }, + { + "epoch": 0.32789583825860064, + "grad_norm": 3.859891176223755, + "learning_rate": 6.724119789842383e-05, + "loss": 0.9317, + "num_input_tokens_seen": 75320520, + "step": 4681 + }, + { + "epoch": 0.32796588650432984, + "grad_norm": 5.739070415496826, + "learning_rate": 6.723419964973731e-05, + "loss": 1.1315, + "num_input_tokens_seen": 75336904, + "step": 4682 + }, + { + "epoch": 0.3280359347500591, + "grad_norm": 4.289483547210693, + "learning_rate": 6.722720140105078e-05, + "loss": 1.0576, + "num_input_tokens_seen": 75353288, + "step": 4683 + }, + { + "epoch": 0.32810598299578836, + "grad_norm": 4.03695011138916, + "learning_rate": 6.722020315236429e-05, + "loss": 1.0129, + "num_input_tokens_seen": 75369424, + "step": 4684 + }, + { + "epoch": 0.3281760312415176, + "grad_norm": 3.8941352367401123, + "learning_rate": 6.721320490367776e-05, + "loss": 1.015, + "num_input_tokens_seen": 75385760, + "step": 4685 + }, + { + "epoch": 0.3282460794872468, + "grad_norm": 4.345769882202148, + "learning_rate": 6.720620665499125e-05, + "loss": 0.9842, + "num_input_tokens_seen": 75401736, + "step": 4686 + }, + { + "epoch": 0.3283161277329761, + "grad_norm": 5.759182453155518, + "learning_rate": 6.719920840630474e-05, + "loss": 1.0937, + "num_input_tokens_seen": 75417928, + "step": 4687 + }, + { + "epoch": 0.32838617597870534, + "grad_norm": 4.947919845581055, + "learning_rate": 6.719221015761823e-05, + "loss": 1.0346, + "num_input_tokens_seen": 75433624, + "step": 4688 + }, + { + "epoch": 0.3284562242244346, + "grad_norm": 3.936934471130371, + "learning_rate": 6.71852119089317e-05, + "loss": 1.1684, + "num_input_tokens_seen": 75450008, + "step": 4689 + }, + { + "epoch": 0.3285262724701638, + "grad_norm": 3.7944555282592773, + "learning_rate": 6.717821366024519e-05, + "loss": 0.9825, + "num_input_tokens_seen": 75466392, + "step": 4690 + }, + { + "epoch": 0.32859632071589306, + "grad_norm": 3.8094451427459717, + "learning_rate": 6.717121541155868e-05, + "loss": 0.9309, + "num_input_tokens_seen": 75482776, + "step": 4691 + }, + { + "epoch": 0.3286663689616223, + "grad_norm": 4.426685333251953, + "learning_rate": 6.716421716287215e-05, + "loss": 0.9497, + "num_input_tokens_seen": 75497760, + "step": 4692 + }, + { + "epoch": 0.3287364172073516, + "grad_norm": 4.299224376678467, + "learning_rate": 6.715721891418564e-05, + "loss": 1.214, + "num_input_tokens_seen": 75513024, + "step": 4693 + }, + { + "epoch": 0.3288064654530808, + "grad_norm": 3.765477418899536, + "learning_rate": 6.715022066549913e-05, + "loss": 1.2114, + "num_input_tokens_seen": 75529304, + "step": 4694 + }, + { + "epoch": 0.32887651369881005, + "grad_norm": 3.991591453552246, + "learning_rate": 6.714322241681262e-05, + "loss": 0.8295, + "num_input_tokens_seen": 75545264, + "step": 4695 + }, + { + "epoch": 0.3289465619445393, + "grad_norm": 3.652726888656616, + "learning_rate": 6.71362241681261e-05, + "loss": 0.953, + "num_input_tokens_seen": 75561648, + "step": 4696 + }, + { + "epoch": 0.32901661019026857, + "grad_norm": 6.083689212799072, + "learning_rate": 6.712922591943958e-05, + "loss": 1.0838, + "num_input_tokens_seen": 75578032, + "step": 4697 + }, + { + "epoch": 0.32908665843599777, + "grad_norm": 4.732533931732178, + "learning_rate": 6.712222767075307e-05, + "loss": 0.9885, + "num_input_tokens_seen": 75593944, + "step": 4698 + }, + { + "epoch": 0.32915670668172703, + "grad_norm": 5.024901866912842, + "learning_rate": 6.711522942206655e-05, + "loss": 0.887, + "num_input_tokens_seen": 75610328, + "step": 4699 + }, + { + "epoch": 0.3292267549274563, + "grad_norm": 4.663429260253906, + "learning_rate": 6.710823117338004e-05, + "loss": 1.0955, + "num_input_tokens_seen": 75626712, + "step": 4700 + }, + { + "epoch": 0.32929680317318555, + "grad_norm": 4.396904945373535, + "learning_rate": 6.710123292469354e-05, + "loss": 1.2419, + "num_input_tokens_seen": 75643096, + "step": 4701 + }, + { + "epoch": 0.32936685141891475, + "grad_norm": 3.7963149547576904, + "learning_rate": 6.709423467600701e-05, + "loss": 1.1536, + "num_input_tokens_seen": 75658616, + "step": 4702 + }, + { + "epoch": 0.329436899664644, + "grad_norm": 4.154513835906982, + "learning_rate": 6.708723642732049e-05, + "loss": 1.0529, + "num_input_tokens_seen": 75675000, + "step": 4703 + }, + { + "epoch": 0.32950694791037327, + "grad_norm": 3.8939032554626465, + "learning_rate": 6.708023817863398e-05, + "loss": 1.115, + "num_input_tokens_seen": 75690728, + "step": 4704 + }, + { + "epoch": 0.32957699615610253, + "grad_norm": 4.7678375244140625, + "learning_rate": 6.707323992994746e-05, + "loss": 0.9747, + "num_input_tokens_seen": 75707080, + "step": 4705 + }, + { + "epoch": 0.32964704440183173, + "grad_norm": 6.56498384475708, + "learning_rate": 6.706624168126094e-05, + "loss": 1.058, + "num_input_tokens_seen": 75723464, + "step": 4706 + }, + { + "epoch": 0.329717092647561, + "grad_norm": 6.917506694793701, + "learning_rate": 6.705924343257444e-05, + "loss": 1.0576, + "num_input_tokens_seen": 75739848, + "step": 4707 + }, + { + "epoch": 0.32978714089329025, + "grad_norm": 3.9431846141815186, + "learning_rate": 6.705224518388793e-05, + "loss": 0.9693, + "num_input_tokens_seen": 75756232, + "step": 4708 + }, + { + "epoch": 0.3298571891390195, + "grad_norm": 4.838469505310059, + "learning_rate": 6.70452469352014e-05, + "loss": 1.2367, + "num_input_tokens_seen": 75772616, + "step": 4709 + }, + { + "epoch": 0.3299272373847488, + "grad_norm": 4.0371012687683105, + "learning_rate": 6.703824868651488e-05, + "loss": 1.0494, + "num_input_tokens_seen": 75789000, + "step": 4710 + }, + { + "epoch": 0.329997285630478, + "grad_norm": 3.491875410079956, + "learning_rate": 6.703125043782838e-05, + "loss": 0.8919, + "num_input_tokens_seen": 75805384, + "step": 4711 + }, + { + "epoch": 0.33006733387620724, + "grad_norm": 3.5304512977600098, + "learning_rate": 6.702425218914186e-05, + "loss": 0.8896, + "num_input_tokens_seen": 75821104, + "step": 4712 + }, + { + "epoch": 0.3301373821219365, + "grad_norm": 3.642528533935547, + "learning_rate": 6.701725394045535e-05, + "loss": 0.9843, + "num_input_tokens_seen": 75837424, + "step": 4713 + }, + { + "epoch": 0.33020743036766576, + "grad_norm": 6.536950588226318, + "learning_rate": 6.701025569176883e-05, + "loss": 0.9545, + "num_input_tokens_seen": 75853808, + "step": 4714 + }, + { + "epoch": 0.33027747861339496, + "grad_norm": 3.376460075378418, + "learning_rate": 6.700325744308232e-05, + "loss": 0.9607, + "num_input_tokens_seen": 75870192, + "step": 4715 + }, + { + "epoch": 0.3303475268591242, + "grad_norm": 4.988052845001221, + "learning_rate": 6.69962591943958e-05, + "loss": 1.1392, + "num_input_tokens_seen": 75886576, + "step": 4716 + }, + { + "epoch": 0.3304175751048535, + "grad_norm": 4.724236965179443, + "learning_rate": 6.698926094570929e-05, + "loss": 1.0015, + "num_input_tokens_seen": 75902960, + "step": 4717 + }, + { + "epoch": 0.33048762335058274, + "grad_norm": 4.877357006072998, + "learning_rate": 6.698226269702278e-05, + "loss": 0.9892, + "num_input_tokens_seen": 75919344, + "step": 4718 + }, + { + "epoch": 0.33055767159631194, + "grad_norm": 3.981224775314331, + "learning_rate": 6.697526444833625e-05, + "loss": 0.9356, + "num_input_tokens_seen": 75935728, + "step": 4719 + }, + { + "epoch": 0.3306277198420412, + "grad_norm": 5.456554889678955, + "learning_rate": 6.696826619964974e-05, + "loss": 0.8373, + "num_input_tokens_seen": 75951304, + "step": 4720 + }, + { + "epoch": 0.33069776808777046, + "grad_norm": 3.9885287284851074, + "learning_rate": 6.696126795096323e-05, + "loss": 1.2658, + "num_input_tokens_seen": 75967688, + "step": 4721 + }, + { + "epoch": 0.3307678163334997, + "grad_norm": 3.447371482849121, + "learning_rate": 6.695426970227672e-05, + "loss": 0.9301, + "num_input_tokens_seen": 75984072, + "step": 4722 + }, + { + "epoch": 0.3308378645792289, + "grad_norm": 4.405709743499756, + "learning_rate": 6.694727145359019e-05, + "loss": 1.2445, + "num_input_tokens_seen": 76000456, + "step": 4723 + }, + { + "epoch": 0.3309079128249582, + "grad_norm": 3.7595372200012207, + "learning_rate": 6.694027320490368e-05, + "loss": 1.1851, + "num_input_tokens_seen": 76016840, + "step": 4724 + }, + { + "epoch": 0.33097796107068744, + "grad_norm": 5.460091590881348, + "learning_rate": 6.693327495621717e-05, + "loss": 0.8514, + "num_input_tokens_seen": 76032344, + "step": 4725 + }, + { + "epoch": 0.3310480093164167, + "grad_norm": 7.111250400543213, + "learning_rate": 6.692627670753064e-05, + "loss": 1.0086, + "num_input_tokens_seen": 76048728, + "step": 4726 + }, + { + "epoch": 0.3311180575621459, + "grad_norm": 4.799232482910156, + "learning_rate": 6.691927845884415e-05, + "loss": 0.9995, + "num_input_tokens_seen": 76063832, + "step": 4727 + }, + { + "epoch": 0.33118810580787517, + "grad_norm": 4.045900344848633, + "learning_rate": 6.691228021015763e-05, + "loss": 1.162, + "num_input_tokens_seen": 76079792, + "step": 4728 + }, + { + "epoch": 0.3312581540536044, + "grad_norm": 3.9942305088043213, + "learning_rate": 6.690528196147111e-05, + "loss": 1.1444, + "num_input_tokens_seen": 76095992, + "step": 4729 + }, + { + "epoch": 0.3313282022993337, + "grad_norm": 4.173962116241455, + "learning_rate": 6.689828371278458e-05, + "loss": 0.928, + "num_input_tokens_seen": 76111760, + "step": 4730 + }, + { + "epoch": 0.3313982505450629, + "grad_norm": 8.357215881347656, + "learning_rate": 6.689128546409807e-05, + "loss": 1.1803, + "num_input_tokens_seen": 76127184, + "step": 4731 + }, + { + "epoch": 0.33146829879079215, + "grad_norm": 3.7359249591827393, + "learning_rate": 6.688428721541156e-05, + "loss": 1.0539, + "num_input_tokens_seen": 76143536, + "step": 4732 + }, + { + "epoch": 0.3315383470365214, + "grad_norm": 4.159603595733643, + "learning_rate": 6.687728896672505e-05, + "loss": 1.1565, + "num_input_tokens_seen": 76159640, + "step": 4733 + }, + { + "epoch": 0.33160839528225067, + "grad_norm": 4.893441200256348, + "learning_rate": 6.687029071803854e-05, + "loss": 1.191, + "num_input_tokens_seen": 76176024, + "step": 4734 + }, + { + "epoch": 0.33167844352797987, + "grad_norm": 4.4292426109313965, + "learning_rate": 6.686329246935203e-05, + "loss": 1.1852, + "num_input_tokens_seen": 76192408, + "step": 4735 + }, + { + "epoch": 0.33174849177370913, + "grad_norm": 3.612821102142334, + "learning_rate": 6.68562942206655e-05, + "loss": 1.0195, + "num_input_tokens_seen": 76208792, + "step": 4736 + }, + { + "epoch": 0.3318185400194384, + "grad_norm": 3.6046557426452637, + "learning_rate": 6.684929597197898e-05, + "loss": 1.1402, + "num_input_tokens_seen": 76225176, + "step": 4737 + }, + { + "epoch": 0.33188858826516765, + "grad_norm": 4.637216567993164, + "learning_rate": 6.684229772329248e-05, + "loss": 0.8202, + "num_input_tokens_seen": 76241560, + "step": 4738 + }, + { + "epoch": 0.33195863651089685, + "grad_norm": 4.83438777923584, + "learning_rate": 6.683529947460595e-05, + "loss": 0.9085, + "num_input_tokens_seen": 76257264, + "step": 4739 + }, + { + "epoch": 0.3320286847566261, + "grad_norm": 3.903982400894165, + "learning_rate": 6.682830122591944e-05, + "loss": 1.2306, + "num_input_tokens_seen": 76273608, + "step": 4740 + }, + { + "epoch": 0.3320987330023554, + "grad_norm": 6.24022102355957, + "learning_rate": 6.682130297723293e-05, + "loss": 0.9706, + "num_input_tokens_seen": 76289840, + "step": 4741 + }, + { + "epoch": 0.33216878124808463, + "grad_norm": 5.286207675933838, + "learning_rate": 6.681430472854642e-05, + "loss": 1.0803, + "num_input_tokens_seen": 76306088, + "step": 4742 + }, + { + "epoch": 0.3322388294938139, + "grad_norm": 5.145969867706299, + "learning_rate": 6.68073064798599e-05, + "loss": 1.2303, + "num_input_tokens_seen": 76322152, + "step": 4743 + }, + { + "epoch": 0.3323088777395431, + "grad_norm": 3.6806249618530273, + "learning_rate": 6.680030823117338e-05, + "loss": 1.0168, + "num_input_tokens_seen": 76338424, + "step": 4744 + }, + { + "epoch": 0.33237892598527236, + "grad_norm": 3.743912696838379, + "learning_rate": 6.679330998248687e-05, + "loss": 0.9507, + "num_input_tokens_seen": 76354808, + "step": 4745 + }, + { + "epoch": 0.3324489742310016, + "grad_norm": 5.072415828704834, + "learning_rate": 6.678631173380035e-05, + "loss": 1.014, + "num_input_tokens_seen": 76369696, + "step": 4746 + }, + { + "epoch": 0.3325190224767309, + "grad_norm": 3.366450548171997, + "learning_rate": 6.677931348511384e-05, + "loss": 0.9201, + "num_input_tokens_seen": 76385560, + "step": 4747 + }, + { + "epoch": 0.3325890707224601, + "grad_norm": 3.8318989276885986, + "learning_rate": 6.677231523642732e-05, + "loss": 1.0973, + "num_input_tokens_seen": 76401168, + "step": 4748 + }, + { + "epoch": 0.33265911896818934, + "grad_norm": 3.9670164585113525, + "learning_rate": 6.676531698774081e-05, + "loss": 0.9238, + "num_input_tokens_seen": 76417552, + "step": 4749 + }, + { + "epoch": 0.3327291672139186, + "grad_norm": 4.344585418701172, + "learning_rate": 6.675831873905429e-05, + "loss": 1.0099, + "num_input_tokens_seen": 76433936, + "step": 4750 + }, + { + "epoch": 0.33279921545964786, + "grad_norm": 7.547675132751465, + "learning_rate": 6.675132049036778e-05, + "loss": 1.1412, + "num_input_tokens_seen": 76450320, + "step": 4751 + }, + { + "epoch": 0.33286926370537706, + "grad_norm": 7.854677677154541, + "learning_rate": 6.674432224168127e-05, + "loss": 0.8778, + "num_input_tokens_seen": 76465696, + "step": 4752 + }, + { + "epoch": 0.3329393119511063, + "grad_norm": 4.030972480773926, + "learning_rate": 6.673732399299475e-05, + "loss": 1.1389, + "num_input_tokens_seen": 76482080, + "step": 4753 + }, + { + "epoch": 0.3330093601968356, + "grad_norm": 5.990024089813232, + "learning_rate": 6.673032574430824e-05, + "loss": 0.9469, + "num_input_tokens_seen": 76498464, + "step": 4754 + }, + { + "epoch": 0.33307940844256484, + "grad_norm": 3.8437137603759766, + "learning_rate": 6.672332749562173e-05, + "loss": 1.009, + "num_input_tokens_seen": 76514848, + "step": 4755 + }, + { + "epoch": 0.33314945668829404, + "grad_norm": 3.883882761001587, + "learning_rate": 6.67163292469352e-05, + "loss": 1.0267, + "num_input_tokens_seen": 76531232, + "step": 4756 + }, + { + "epoch": 0.3332195049340233, + "grad_norm": 4.205630779266357, + "learning_rate": 6.670933099824868e-05, + "loss": 1.0847, + "num_input_tokens_seen": 76547616, + "step": 4757 + }, + { + "epoch": 0.33328955317975256, + "grad_norm": 6.173430442810059, + "learning_rate": 6.670233274956217e-05, + "loss": 1.2014, + "num_input_tokens_seen": 76564000, + "step": 4758 + }, + { + "epoch": 0.3333596014254818, + "grad_norm": 3.464181661605835, + "learning_rate": 6.669533450087566e-05, + "loss": 0.8751, + "num_input_tokens_seen": 76579904, + "step": 4759 + }, + { + "epoch": 0.333429649671211, + "grad_norm": 3.3506994247436523, + "learning_rate": 6.668833625218915e-05, + "loss": 0.8281, + "num_input_tokens_seen": 76596288, + "step": 4760 + }, + { + "epoch": 0.3334996979169403, + "grad_norm": 7.188508987426758, + "learning_rate": 6.668133800350264e-05, + "loss": 1.1058, + "num_input_tokens_seen": 76611632, + "step": 4761 + }, + { + "epoch": 0.33356974616266954, + "grad_norm": 3.916689872741699, + "learning_rate": 6.667433975481612e-05, + "loss": 1.0815, + "num_input_tokens_seen": 76626840, + "step": 4762 + }, + { + "epoch": 0.3336397944083988, + "grad_norm": 3.4827966690063477, + "learning_rate": 6.66673415061296e-05, + "loss": 0.9103, + "num_input_tokens_seen": 76643024, + "step": 4763 + }, + { + "epoch": 0.333709842654128, + "grad_norm": 4.479428768157959, + "learning_rate": 6.666034325744307e-05, + "loss": 0.9238, + "num_input_tokens_seen": 76659408, + "step": 4764 + }, + { + "epoch": 0.33377989089985727, + "grad_norm": 6.008899211883545, + "learning_rate": 6.665334500875658e-05, + "loss": 1.2375, + "num_input_tokens_seen": 76675272, + "step": 4765 + }, + { + "epoch": 0.3338499391455865, + "grad_norm": 4.10992431640625, + "learning_rate": 6.664634676007005e-05, + "loss": 1.0539, + "num_input_tokens_seen": 76691000, + "step": 4766 + }, + { + "epoch": 0.3339199873913158, + "grad_norm": 3.953507423400879, + "learning_rate": 6.663934851138354e-05, + "loss": 1.1051, + "num_input_tokens_seen": 76707024, + "step": 4767 + }, + { + "epoch": 0.333990035637045, + "grad_norm": 4.237090587615967, + "learning_rate": 6.663235026269703e-05, + "loss": 1.1683, + "num_input_tokens_seen": 76723408, + "step": 4768 + }, + { + "epoch": 0.33406008388277425, + "grad_norm": 4.417295932769775, + "learning_rate": 6.662535201401052e-05, + "loss": 0.923, + "num_input_tokens_seen": 76739296, + "step": 4769 + }, + { + "epoch": 0.3341301321285035, + "grad_norm": 3.664970874786377, + "learning_rate": 6.661835376532399e-05, + "loss": 0.9556, + "num_input_tokens_seen": 76755432, + "step": 4770 + }, + { + "epoch": 0.33420018037423277, + "grad_norm": 3.702932834625244, + "learning_rate": 6.661135551663748e-05, + "loss": 0.9457, + "num_input_tokens_seen": 76770688, + "step": 4771 + }, + { + "epoch": 0.334270228619962, + "grad_norm": 3.741722822189331, + "learning_rate": 6.660435726795097e-05, + "loss": 1.0923, + "num_input_tokens_seen": 76787072, + "step": 4772 + }, + { + "epoch": 0.33434027686569123, + "grad_norm": 3.9605424404144287, + "learning_rate": 6.659735901926446e-05, + "loss": 1.0823, + "num_input_tokens_seen": 76803456, + "step": 4773 + }, + { + "epoch": 0.3344103251114205, + "grad_norm": 3.9401822090148926, + "learning_rate": 6.659036077057793e-05, + "loss": 0.996, + "num_input_tokens_seen": 76819840, + "step": 4774 + }, + { + "epoch": 0.33448037335714975, + "grad_norm": 3.8762905597686768, + "learning_rate": 6.658336252189142e-05, + "loss": 0.9796, + "num_input_tokens_seen": 76836224, + "step": 4775 + }, + { + "epoch": 0.33455042160287896, + "grad_norm": 4.117221832275391, + "learning_rate": 6.657636427320491e-05, + "loss": 1.2631, + "num_input_tokens_seen": 76852608, + "step": 4776 + }, + { + "epoch": 0.3346204698486082, + "grad_norm": 3.814997434616089, + "learning_rate": 6.656936602451839e-05, + "loss": 1.0891, + "num_input_tokens_seen": 76868400, + "step": 4777 + }, + { + "epoch": 0.3346905180943375, + "grad_norm": 3.6070499420166016, + "learning_rate": 6.656236777583187e-05, + "loss": 0.8537, + "num_input_tokens_seen": 76884784, + "step": 4778 + }, + { + "epoch": 0.33476056634006673, + "grad_norm": 6.291281700134277, + "learning_rate": 6.655536952714536e-05, + "loss": 1.2195, + "num_input_tokens_seen": 76901168, + "step": 4779 + }, + { + "epoch": 0.334830614585796, + "grad_norm": 7.043301105499268, + "learning_rate": 6.654837127845885e-05, + "loss": 1.1015, + "num_input_tokens_seen": 76917552, + "step": 4780 + }, + { + "epoch": 0.3349006628315252, + "grad_norm": 3.6702778339385986, + "learning_rate": 6.654137302977234e-05, + "loss": 1.0832, + "num_input_tokens_seen": 76933936, + "step": 4781 + }, + { + "epoch": 0.33497071107725446, + "grad_norm": 4.228512287139893, + "learning_rate": 6.653437478108583e-05, + "loss": 0.8781, + "num_input_tokens_seen": 76950320, + "step": 4782 + }, + { + "epoch": 0.3350407593229837, + "grad_norm": 3.9304075241088867, + "learning_rate": 6.65273765323993e-05, + "loss": 1.1066, + "num_input_tokens_seen": 76966704, + "step": 4783 + }, + { + "epoch": 0.335110807568713, + "grad_norm": 3.608708620071411, + "learning_rate": 6.652037828371278e-05, + "loss": 1.0409, + "num_input_tokens_seen": 76983016, + "step": 4784 + }, + { + "epoch": 0.3351808558144422, + "grad_norm": 4.402626037597656, + "learning_rate": 6.651338003502627e-05, + "loss": 1.1832, + "num_input_tokens_seen": 76999344, + "step": 4785 + }, + { + "epoch": 0.33525090406017144, + "grad_norm": 4.109679222106934, + "learning_rate": 6.650638178633976e-05, + "loss": 1.0003, + "num_input_tokens_seen": 77014960, + "step": 4786 + }, + { + "epoch": 0.3353209523059007, + "grad_norm": 3.893702507019043, + "learning_rate": 6.649938353765324e-05, + "loss": 1.1101, + "num_input_tokens_seen": 77031344, + "step": 4787 + }, + { + "epoch": 0.33539100055162996, + "grad_norm": 4.326907157897949, + "learning_rate": 6.649238528896673e-05, + "loss": 1.0554, + "num_input_tokens_seen": 77047264, + "step": 4788 + }, + { + "epoch": 0.33546104879735916, + "grad_norm": 4.946060657501221, + "learning_rate": 6.648538704028022e-05, + "loss": 1.0413, + "num_input_tokens_seen": 77063648, + "step": 4789 + }, + { + "epoch": 0.3355310970430884, + "grad_norm": 3.5379018783569336, + "learning_rate": 6.64783887915937e-05, + "loss": 1.0981, + "num_input_tokens_seen": 77080032, + "step": 4790 + }, + { + "epoch": 0.3356011452888177, + "grad_norm": 4.117929935455322, + "learning_rate": 6.647139054290717e-05, + "loss": 1.0624, + "num_input_tokens_seen": 77096416, + "step": 4791 + }, + { + "epoch": 0.33567119353454694, + "grad_norm": 4.293130397796631, + "learning_rate": 6.646439229422067e-05, + "loss": 1.1938, + "num_input_tokens_seen": 77112800, + "step": 4792 + }, + { + "epoch": 0.33574124178027615, + "grad_norm": 3.8246893882751465, + "learning_rate": 6.645739404553416e-05, + "loss": 1.0944, + "num_input_tokens_seen": 77128272, + "step": 4793 + }, + { + "epoch": 0.3358112900260054, + "grad_norm": 4.095324993133545, + "learning_rate": 6.645039579684764e-05, + "loss": 1.0024, + "num_input_tokens_seen": 77144008, + "step": 4794 + }, + { + "epoch": 0.33588133827173466, + "grad_norm": 3.7015397548675537, + "learning_rate": 6.644339754816113e-05, + "loss": 1.1318, + "num_input_tokens_seen": 77160392, + "step": 4795 + }, + { + "epoch": 0.3359513865174639, + "grad_norm": 3.7702248096466064, + "learning_rate": 6.643639929947461e-05, + "loss": 1.1815, + "num_input_tokens_seen": 77176776, + "step": 4796 + }, + { + "epoch": 0.3360214347631931, + "grad_norm": 6.468194961547852, + "learning_rate": 6.642940105078809e-05, + "loss": 1.1144, + "num_input_tokens_seen": 77192000, + "step": 4797 + }, + { + "epoch": 0.3360914830089224, + "grad_norm": 5.211976528167725, + "learning_rate": 6.642240280210158e-05, + "loss": 1.0252, + "num_input_tokens_seen": 77208040, + "step": 4798 + }, + { + "epoch": 0.33616153125465165, + "grad_norm": 4.3227763175964355, + "learning_rate": 6.641540455341507e-05, + "loss": 1.1343, + "num_input_tokens_seen": 77224016, + "step": 4799 + }, + { + "epoch": 0.3362315795003809, + "grad_norm": 3.6128039360046387, + "learning_rate": 6.640840630472856e-05, + "loss": 1.0222, + "num_input_tokens_seen": 77240400, + "step": 4800 + }, + { + "epoch": 0.3362315795003809, + "eval_loss": 1.1314613819122314, + "eval_runtime": 0.2053, + "eval_samples_per_second": 4.87, + "eval_steps_per_second": 4.87, + "num_input_tokens_seen": 77240400, + "step": 4800 + }, + { + "epoch": 0.3363016277461101, + "grad_norm": 3.851407766342163, + "learning_rate": 6.640140805604203e-05, + "loss": 1.0153, + "num_input_tokens_seen": 77255896, + "step": 4801 + }, + { + "epoch": 0.33637167599183937, + "grad_norm": 3.8287763595581055, + "learning_rate": 6.639440980735552e-05, + "loss": 1.02, + "num_input_tokens_seen": 77272280, + "step": 4802 + }, + { + "epoch": 0.33644172423756863, + "grad_norm": 4.373470306396484, + "learning_rate": 6.638741155866901e-05, + "loss": 1.0321, + "num_input_tokens_seen": 77288664, + "step": 4803 + }, + { + "epoch": 0.3365117724832979, + "grad_norm": 5.384084224700928, + "learning_rate": 6.638041330998248e-05, + "loss": 1.08, + "num_input_tokens_seen": 77304544, + "step": 4804 + }, + { + "epoch": 0.3365818207290271, + "grad_norm": 4.742502212524414, + "learning_rate": 6.637341506129597e-05, + "loss": 1.0856, + "num_input_tokens_seen": 77320928, + "step": 4805 + }, + { + "epoch": 0.33665186897475635, + "grad_norm": 3.5294950008392334, + "learning_rate": 6.636641681260946e-05, + "loss": 0.827, + "num_input_tokens_seen": 77337072, + "step": 4806 + }, + { + "epoch": 0.3367219172204856, + "grad_norm": 4.925806999206543, + "learning_rate": 6.635941856392295e-05, + "loss": 1.1351, + "num_input_tokens_seen": 77352312, + "step": 4807 + }, + { + "epoch": 0.33679196546621487, + "grad_norm": 4.373791694641113, + "learning_rate": 6.635242031523644e-05, + "loss": 1.055, + "num_input_tokens_seen": 77368696, + "step": 4808 + }, + { + "epoch": 0.3368620137119441, + "grad_norm": 4.921911239624023, + "learning_rate": 6.634542206654993e-05, + "loss": 1.0054, + "num_input_tokens_seen": 77384976, + "step": 4809 + }, + { + "epoch": 0.33693206195767333, + "grad_norm": 6.203757286071777, + "learning_rate": 6.63384238178634e-05, + "loss": 1.1801, + "num_input_tokens_seen": 77401360, + "step": 4810 + }, + { + "epoch": 0.3370021102034026, + "grad_norm": 3.675086498260498, + "learning_rate": 6.633142556917688e-05, + "loss": 1.1148, + "num_input_tokens_seen": 77417744, + "step": 4811 + }, + { + "epoch": 0.33707215844913185, + "grad_norm": 5.169121742248535, + "learning_rate": 6.632442732049036e-05, + "loss": 0.9541, + "num_input_tokens_seen": 77434128, + "step": 4812 + }, + { + "epoch": 0.3371422066948611, + "grad_norm": 4.036499977111816, + "learning_rate": 6.631742907180387e-05, + "loss": 0.9914, + "num_input_tokens_seen": 77449856, + "step": 4813 + }, + { + "epoch": 0.3372122549405903, + "grad_norm": 4.040637016296387, + "learning_rate": 6.631043082311734e-05, + "loss": 1.0704, + "num_input_tokens_seen": 77465536, + "step": 4814 + }, + { + "epoch": 0.3372823031863196, + "grad_norm": 5.76871395111084, + "learning_rate": 6.630343257443083e-05, + "loss": 1.0127, + "num_input_tokens_seen": 77481920, + "step": 4815 + }, + { + "epoch": 0.33735235143204884, + "grad_norm": 5.222348690032959, + "learning_rate": 6.629643432574432e-05, + "loss": 0.9411, + "num_input_tokens_seen": 77497464, + "step": 4816 + }, + { + "epoch": 0.3374223996777781, + "grad_norm": 4.099587440490723, + "learning_rate": 6.62894360770578e-05, + "loss": 1.011, + "num_input_tokens_seen": 77513848, + "step": 4817 + }, + { + "epoch": 0.3374924479235073, + "grad_norm": 4.034639835357666, + "learning_rate": 6.628243782837127e-05, + "loss": 1.0626, + "num_input_tokens_seen": 77530000, + "step": 4818 + }, + { + "epoch": 0.33756249616923656, + "grad_norm": 5.280242443084717, + "learning_rate": 6.627543957968477e-05, + "loss": 1.1305, + "num_input_tokens_seen": 77546384, + "step": 4819 + }, + { + "epoch": 0.3376325444149658, + "grad_norm": 4.851918697357178, + "learning_rate": 6.626844133099826e-05, + "loss": 1.0557, + "num_input_tokens_seen": 77561000, + "step": 4820 + }, + { + "epoch": 0.3377025926606951, + "grad_norm": 3.957601308822632, + "learning_rate": 6.626144308231173e-05, + "loss": 0.9625, + "num_input_tokens_seen": 77576512, + "step": 4821 + }, + { + "epoch": 0.3377726409064243, + "grad_norm": 3.9369540214538574, + "learning_rate": 6.625444483362522e-05, + "loss": 1.0611, + "num_input_tokens_seen": 77592896, + "step": 4822 + }, + { + "epoch": 0.33784268915215354, + "grad_norm": 4.397778511047363, + "learning_rate": 6.624744658493871e-05, + "loss": 1.0173, + "num_input_tokens_seen": 77609280, + "step": 4823 + }, + { + "epoch": 0.3379127373978828, + "grad_norm": 6.919220447540283, + "learning_rate": 6.624044833625219e-05, + "loss": 0.9992, + "num_input_tokens_seen": 77625664, + "step": 4824 + }, + { + "epoch": 0.33798278564361206, + "grad_norm": 5.501309871673584, + "learning_rate": 6.623345008756568e-05, + "loss": 1.0321, + "num_input_tokens_seen": 77642048, + "step": 4825 + }, + { + "epoch": 0.33805283388934126, + "grad_norm": 4.240433216094971, + "learning_rate": 6.622645183887916e-05, + "loss": 0.8771, + "num_input_tokens_seen": 77658432, + "step": 4826 + }, + { + "epoch": 0.3381228821350705, + "grad_norm": 3.9563584327697754, + "learning_rate": 6.621945359019265e-05, + "loss": 1.1921, + "num_input_tokens_seen": 77674816, + "step": 4827 + }, + { + "epoch": 0.3381929303807998, + "grad_norm": 6.861433982849121, + "learning_rate": 6.621245534150613e-05, + "loss": 0.9414, + "num_input_tokens_seen": 77689712, + "step": 4828 + }, + { + "epoch": 0.33826297862652904, + "grad_norm": 3.951972246170044, + "learning_rate": 6.620545709281962e-05, + "loss": 1.2363, + "num_input_tokens_seen": 77706096, + "step": 4829 + }, + { + "epoch": 0.33833302687225825, + "grad_norm": 4.419849395751953, + "learning_rate": 6.61984588441331e-05, + "loss": 1.1375, + "num_input_tokens_seen": 77721168, + "step": 4830 + }, + { + "epoch": 0.3384030751179875, + "grad_norm": 5.075031280517578, + "learning_rate": 6.619146059544658e-05, + "loss": 1.1363, + "num_input_tokens_seen": 77737552, + "step": 4831 + }, + { + "epoch": 0.33847312336371677, + "grad_norm": 4.216047763824463, + "learning_rate": 6.618446234676007e-05, + "loss": 1.0683, + "num_input_tokens_seen": 77753936, + "step": 4832 + }, + { + "epoch": 0.338543171609446, + "grad_norm": 4.175511360168457, + "learning_rate": 6.617746409807357e-05, + "loss": 1.1294, + "num_input_tokens_seen": 77769848, + "step": 4833 + }, + { + "epoch": 0.33861321985517523, + "grad_norm": 3.894831418991089, + "learning_rate": 6.617046584938705e-05, + "loss": 0.9527, + "num_input_tokens_seen": 77785792, + "step": 4834 + }, + { + "epoch": 0.3386832681009045, + "grad_norm": 4.06626033782959, + "learning_rate": 6.616346760070053e-05, + "loss": 1.039, + "num_input_tokens_seen": 77801728, + "step": 4835 + }, + { + "epoch": 0.33875331634663375, + "grad_norm": 6.5680341720581055, + "learning_rate": 6.615646935201402e-05, + "loss": 1.2627, + "num_input_tokens_seen": 77818112, + "step": 4836 + }, + { + "epoch": 0.338823364592363, + "grad_norm": 4.967332363128662, + "learning_rate": 6.61494711033275e-05, + "loss": 1.1455, + "num_input_tokens_seen": 77833464, + "step": 4837 + }, + { + "epoch": 0.3388934128380922, + "grad_norm": 4.244988918304443, + "learning_rate": 6.614247285464097e-05, + "loss": 1.0809, + "num_input_tokens_seen": 77849848, + "step": 4838 + }, + { + "epoch": 0.33896346108382147, + "grad_norm": 4.361011028289795, + "learning_rate": 6.613547460595447e-05, + "loss": 1.0217, + "num_input_tokens_seen": 77866232, + "step": 4839 + }, + { + "epoch": 0.33903350932955073, + "grad_norm": 6.348353385925293, + "learning_rate": 6.612847635726796e-05, + "loss": 1.1202, + "num_input_tokens_seen": 77882616, + "step": 4840 + }, + { + "epoch": 0.33910355757528, + "grad_norm": 3.8235714435577393, + "learning_rate": 6.612147810858144e-05, + "loss": 0.9018, + "num_input_tokens_seen": 77899000, + "step": 4841 + }, + { + "epoch": 0.3391736058210092, + "grad_norm": 4.069207191467285, + "learning_rate": 6.611447985989493e-05, + "loss": 1.1303, + "num_input_tokens_seen": 77915384, + "step": 4842 + }, + { + "epoch": 0.33924365406673845, + "grad_norm": 3.8036270141601562, + "learning_rate": 6.610748161120842e-05, + "loss": 1.1935, + "num_input_tokens_seen": 77931768, + "step": 4843 + }, + { + "epoch": 0.3393137023124677, + "grad_norm": 5.776700019836426, + "learning_rate": 6.610048336252189e-05, + "loss": 1.0031, + "num_input_tokens_seen": 77948152, + "step": 4844 + }, + { + "epoch": 0.339383750558197, + "grad_norm": 5.484714508056641, + "learning_rate": 6.609348511383538e-05, + "loss": 1.2233, + "num_input_tokens_seen": 77964536, + "step": 4845 + }, + { + "epoch": 0.3394537988039262, + "grad_norm": 4.595640659332275, + "learning_rate": 6.608648686514887e-05, + "loss": 1.0854, + "num_input_tokens_seen": 77980648, + "step": 4846 + }, + { + "epoch": 0.33952384704965544, + "grad_norm": 5.0377197265625, + "learning_rate": 6.607948861646236e-05, + "loss": 1.0513, + "num_input_tokens_seen": 77995624, + "step": 4847 + }, + { + "epoch": 0.3395938952953847, + "grad_norm": 3.796713352203369, + "learning_rate": 6.607249036777583e-05, + "loss": 0.8928, + "num_input_tokens_seen": 78012008, + "step": 4848 + }, + { + "epoch": 0.33966394354111396, + "grad_norm": 5.138030052185059, + "learning_rate": 6.606549211908932e-05, + "loss": 0.9565, + "num_input_tokens_seen": 78027960, + "step": 4849 + }, + { + "epoch": 0.3397339917868432, + "grad_norm": 5.852467060089111, + "learning_rate": 6.605849387040281e-05, + "loss": 1.1297, + "num_input_tokens_seen": 78044144, + "step": 4850 + }, + { + "epoch": 0.3398040400325724, + "grad_norm": 3.5677835941314697, + "learning_rate": 6.605149562171628e-05, + "loss": 1.0115, + "num_input_tokens_seen": 78059096, + "step": 4851 + }, + { + "epoch": 0.3398740882783017, + "grad_norm": 4.033452987670898, + "learning_rate": 6.604449737302977e-05, + "loss": 1.1311, + "num_input_tokens_seen": 78075480, + "step": 4852 + }, + { + "epoch": 0.33994413652403094, + "grad_norm": 5.06736421585083, + "learning_rate": 6.603749912434327e-05, + "loss": 1.0456, + "num_input_tokens_seen": 78091864, + "step": 4853 + }, + { + "epoch": 0.3400141847697602, + "grad_norm": 4.851357460021973, + "learning_rate": 6.603050087565675e-05, + "loss": 0.9985, + "num_input_tokens_seen": 78108248, + "step": 4854 + }, + { + "epoch": 0.3400842330154894, + "grad_norm": 4.659403324127197, + "learning_rate": 6.602350262697022e-05, + "loss": 1.0618, + "num_input_tokens_seen": 78123720, + "step": 4855 + }, + { + "epoch": 0.34015428126121866, + "grad_norm": 4.0248870849609375, + "learning_rate": 6.601650437828371e-05, + "loss": 1.0192, + "num_input_tokens_seen": 78140104, + "step": 4856 + }, + { + "epoch": 0.3402243295069479, + "grad_norm": 3.615807056427002, + "learning_rate": 6.60095061295972e-05, + "loss": 0.9875, + "num_input_tokens_seen": 78155768, + "step": 4857 + }, + { + "epoch": 0.3402943777526772, + "grad_norm": 4.032024383544922, + "learning_rate": 6.600250788091068e-05, + "loss": 0.8621, + "num_input_tokens_seen": 78171688, + "step": 4858 + }, + { + "epoch": 0.3403644259984064, + "grad_norm": 4.213406085968018, + "learning_rate": 6.599550963222418e-05, + "loss": 1.1186, + "num_input_tokens_seen": 78188072, + "step": 4859 + }, + { + "epoch": 0.34043447424413564, + "grad_norm": 4.343294620513916, + "learning_rate": 6.598851138353767e-05, + "loss": 0.9555, + "num_input_tokens_seen": 78203328, + "step": 4860 + }, + { + "epoch": 0.3405045224898649, + "grad_norm": 5.112723350524902, + "learning_rate": 6.598151313485114e-05, + "loss": 0.8201, + "num_input_tokens_seen": 78218480, + "step": 4861 + }, + { + "epoch": 0.34057457073559416, + "grad_norm": 5.0619215965271, + "learning_rate": 6.597451488616463e-05, + "loss": 1.1147, + "num_input_tokens_seen": 78234864, + "step": 4862 + }, + { + "epoch": 0.34064461898132337, + "grad_norm": 4.161584377288818, + "learning_rate": 6.596751663747812e-05, + "loss": 1.3292, + "num_input_tokens_seen": 78250664, + "step": 4863 + }, + { + "epoch": 0.3407146672270526, + "grad_norm": 4.402634143829346, + "learning_rate": 6.59605183887916e-05, + "loss": 1.2664, + "num_input_tokens_seen": 78266976, + "step": 4864 + }, + { + "epoch": 0.3407847154727819, + "grad_norm": 4.01839017868042, + "learning_rate": 6.595352014010508e-05, + "loss": 1.1515, + "num_input_tokens_seen": 78283360, + "step": 4865 + }, + { + "epoch": 0.34085476371851114, + "grad_norm": 3.6157965660095215, + "learning_rate": 6.594652189141857e-05, + "loss": 0.8962, + "num_input_tokens_seen": 78299744, + "step": 4866 + }, + { + "epoch": 0.34092481196424035, + "grad_norm": 4.221523761749268, + "learning_rate": 6.593952364273206e-05, + "loss": 1.3689, + "num_input_tokens_seen": 78314944, + "step": 4867 + }, + { + "epoch": 0.3409948602099696, + "grad_norm": 5.253129482269287, + "learning_rate": 6.593252539404554e-05, + "loss": 1.0223, + "num_input_tokens_seen": 78331168, + "step": 4868 + }, + { + "epoch": 0.34106490845569887, + "grad_norm": 4.839991569519043, + "learning_rate": 6.592552714535902e-05, + "loss": 1.1622, + "num_input_tokens_seen": 78347200, + "step": 4869 + }, + { + "epoch": 0.3411349567014281, + "grad_norm": 5.994297504425049, + "learning_rate": 6.591852889667251e-05, + "loss": 1.0511, + "num_input_tokens_seen": 78363584, + "step": 4870 + }, + { + "epoch": 0.34120500494715733, + "grad_norm": 4.886160850524902, + "learning_rate": 6.591153064798599e-05, + "loss": 1.0025, + "num_input_tokens_seen": 78379968, + "step": 4871 + }, + { + "epoch": 0.3412750531928866, + "grad_norm": 8.658349990844727, + "learning_rate": 6.590453239929948e-05, + "loss": 1.0145, + "num_input_tokens_seen": 78395368, + "step": 4872 + }, + { + "epoch": 0.34134510143861585, + "grad_norm": 5.1440935134887695, + "learning_rate": 6.589753415061298e-05, + "loss": 0.9584, + "num_input_tokens_seen": 78411752, + "step": 4873 + }, + { + "epoch": 0.3414151496843451, + "grad_norm": 4.83282995223999, + "learning_rate": 6.589053590192645e-05, + "loss": 1.1825, + "num_input_tokens_seen": 78428128, + "step": 4874 + }, + { + "epoch": 0.3414851979300743, + "grad_norm": 3.603290557861328, + "learning_rate": 6.588353765323993e-05, + "loss": 1.081, + "num_input_tokens_seen": 78444512, + "step": 4875 + }, + { + "epoch": 0.3415552461758036, + "grad_norm": 3.8035361766815186, + "learning_rate": 6.587653940455342e-05, + "loss": 1.1571, + "num_input_tokens_seen": 78460896, + "step": 4876 + }, + { + "epoch": 0.34162529442153283, + "grad_norm": 4.02992582321167, + "learning_rate": 6.58695411558669e-05, + "loss": 1.0974, + "num_input_tokens_seen": 78477280, + "step": 4877 + }, + { + "epoch": 0.3416953426672621, + "grad_norm": 4.898126125335693, + "learning_rate": 6.586254290718038e-05, + "loss": 1.0632, + "num_input_tokens_seen": 78493664, + "step": 4878 + }, + { + "epoch": 0.3417653909129913, + "grad_norm": 4.779463768005371, + "learning_rate": 6.585554465849388e-05, + "loss": 1.0473, + "num_input_tokens_seen": 78509280, + "step": 4879 + }, + { + "epoch": 0.34183543915872056, + "grad_norm": 3.7280569076538086, + "learning_rate": 6.584854640980737e-05, + "loss": 1.1184, + "num_input_tokens_seen": 78525664, + "step": 4880 + }, + { + "epoch": 0.3419054874044498, + "grad_norm": 4.691235542297363, + "learning_rate": 6.584154816112085e-05, + "loss": 0.9956, + "num_input_tokens_seen": 78542048, + "step": 4881 + }, + { + "epoch": 0.3419755356501791, + "grad_norm": 4.188792705535889, + "learning_rate": 6.583454991243432e-05, + "loss": 1.1065, + "num_input_tokens_seen": 78558432, + "step": 4882 + }, + { + "epoch": 0.34204558389590833, + "grad_norm": 3.7049522399902344, + "learning_rate": 6.582755166374781e-05, + "loss": 0.9286, + "num_input_tokens_seen": 78574816, + "step": 4883 + }, + { + "epoch": 0.34211563214163754, + "grad_norm": 5.808310508728027, + "learning_rate": 6.58205534150613e-05, + "loss": 1.0674, + "num_input_tokens_seen": 78590992, + "step": 4884 + }, + { + "epoch": 0.3421856803873668, + "grad_norm": 3.877638339996338, + "learning_rate": 6.581355516637479e-05, + "loss": 0.9908, + "num_input_tokens_seen": 78607368, + "step": 4885 + }, + { + "epoch": 0.34225572863309606, + "grad_norm": 3.7855000495910645, + "learning_rate": 6.580655691768828e-05, + "loss": 1.0697, + "num_input_tokens_seen": 78622712, + "step": 4886 + }, + { + "epoch": 0.3423257768788253, + "grad_norm": 3.9921584129333496, + "learning_rate": 6.579955866900176e-05, + "loss": 1.0196, + "num_input_tokens_seen": 78638840, + "step": 4887 + }, + { + "epoch": 0.3423958251245545, + "grad_norm": 4.037683486938477, + "learning_rate": 6.579256042031524e-05, + "loss": 0.9606, + "num_input_tokens_seen": 78655224, + "step": 4888 + }, + { + "epoch": 0.3424658733702838, + "grad_norm": 4.109930515289307, + "learning_rate": 6.578556217162873e-05, + "loss": 1.1189, + "num_input_tokens_seen": 78670984, + "step": 4889 + }, + { + "epoch": 0.34253592161601304, + "grad_norm": 5.201082229614258, + "learning_rate": 6.577856392294222e-05, + "loss": 1.0844, + "num_input_tokens_seen": 78686856, + "step": 4890 + }, + { + "epoch": 0.3426059698617423, + "grad_norm": 5.922754764556885, + "learning_rate": 6.577156567425569e-05, + "loss": 1.2428, + "num_input_tokens_seen": 78702688, + "step": 4891 + }, + { + "epoch": 0.3426760181074715, + "grad_norm": 4.052786350250244, + "learning_rate": 6.576456742556918e-05, + "loss": 1.0765, + "num_input_tokens_seen": 78719072, + "step": 4892 + }, + { + "epoch": 0.34274606635320076, + "grad_norm": 4.0263671875, + "learning_rate": 6.575756917688267e-05, + "loss": 1.2076, + "num_input_tokens_seen": 78735456, + "step": 4893 + }, + { + "epoch": 0.34281611459893, + "grad_norm": 3.773024082183838, + "learning_rate": 6.575057092819616e-05, + "loss": 1.1275, + "num_input_tokens_seen": 78751424, + "step": 4894 + }, + { + "epoch": 0.3428861628446593, + "grad_norm": 3.770413398742676, + "learning_rate": 6.574357267950963e-05, + "loss": 1.1331, + "num_input_tokens_seen": 78767808, + "step": 4895 + }, + { + "epoch": 0.3429562110903885, + "grad_norm": 6.26648473739624, + "learning_rate": 6.573657443082312e-05, + "loss": 1.1432, + "num_input_tokens_seen": 78783448, + "step": 4896 + }, + { + "epoch": 0.34302625933611774, + "grad_norm": 4.071943283081055, + "learning_rate": 6.572957618213661e-05, + "loss": 0.9008, + "num_input_tokens_seen": 78798976, + "step": 4897 + }, + { + "epoch": 0.343096307581847, + "grad_norm": 7.654726505279541, + "learning_rate": 6.572257793345008e-05, + "loss": 1.0902, + "num_input_tokens_seen": 78814664, + "step": 4898 + }, + { + "epoch": 0.34316635582757626, + "grad_norm": 5.928562641143799, + "learning_rate": 6.571557968476357e-05, + "loss": 1.0462, + "num_input_tokens_seen": 78830792, + "step": 4899 + }, + { + "epoch": 0.34323640407330547, + "grad_norm": 3.8699424266815186, + "learning_rate": 6.570858143607708e-05, + "loss": 1.0568, + "num_input_tokens_seen": 78847176, + "step": 4900 + }, + { + "epoch": 0.3433064523190347, + "grad_norm": 4.177735328674316, + "learning_rate": 6.570158318739055e-05, + "loss": 1.1381, + "num_input_tokens_seen": 78863496, + "step": 4901 + }, + { + "epoch": 0.343376500564764, + "grad_norm": 3.5755650997161865, + "learning_rate": 6.569458493870403e-05, + "loss": 1.0307, + "num_input_tokens_seen": 78879688, + "step": 4902 + }, + { + "epoch": 0.34344654881049325, + "grad_norm": 5.799609184265137, + "learning_rate": 6.568758669001751e-05, + "loss": 0.9005, + "num_input_tokens_seen": 78894744, + "step": 4903 + }, + { + "epoch": 0.34351659705622245, + "grad_norm": 3.7705209255218506, + "learning_rate": 6.5680588441331e-05, + "loss": 1.0557, + "num_input_tokens_seen": 78911112, + "step": 4904 + }, + { + "epoch": 0.3435866453019517, + "grad_norm": 4.713012218475342, + "learning_rate": 6.567359019264449e-05, + "loss": 1.1005, + "num_input_tokens_seen": 78927496, + "step": 4905 + }, + { + "epoch": 0.34365669354768097, + "grad_norm": 3.8360157012939453, + "learning_rate": 6.566659194395798e-05, + "loss": 1.1281, + "num_input_tokens_seen": 78942712, + "step": 4906 + }, + { + "epoch": 0.34372674179341023, + "grad_norm": 3.6071383953094482, + "learning_rate": 6.565959369527147e-05, + "loss": 0.974, + "num_input_tokens_seen": 78959016, + "step": 4907 + }, + { + "epoch": 0.34379679003913943, + "grad_norm": 4.876083850860596, + "learning_rate": 6.565259544658494e-05, + "loss": 1.1583, + "num_input_tokens_seen": 78975400, + "step": 4908 + }, + { + "epoch": 0.3438668382848687, + "grad_norm": 4.011876583099365, + "learning_rate": 6.564559719789842e-05, + "loss": 1.0749, + "num_input_tokens_seen": 78991784, + "step": 4909 + }, + { + "epoch": 0.34393688653059795, + "grad_norm": 3.74336576461792, + "learning_rate": 6.563859894921191e-05, + "loss": 1.0358, + "num_input_tokens_seen": 79008168, + "step": 4910 + }, + { + "epoch": 0.3440069347763272, + "grad_norm": 4.092207908630371, + "learning_rate": 6.56316007005254e-05, + "loss": 0.9901, + "num_input_tokens_seen": 79024200, + "step": 4911 + }, + { + "epoch": 0.3440769830220564, + "grad_norm": 3.771979331970215, + "learning_rate": 6.562460245183888e-05, + "loss": 0.9599, + "num_input_tokens_seen": 79040584, + "step": 4912 + }, + { + "epoch": 0.3441470312677857, + "grad_norm": 4.791725158691406, + "learning_rate": 6.561760420315237e-05, + "loss": 1.0563, + "num_input_tokens_seen": 79056552, + "step": 4913 + }, + { + "epoch": 0.34421707951351493, + "grad_norm": 4.9150519371032715, + "learning_rate": 6.561060595446586e-05, + "loss": 0.9623, + "num_input_tokens_seen": 79072536, + "step": 4914 + }, + { + "epoch": 0.3442871277592442, + "grad_norm": 4.550070285797119, + "learning_rate": 6.560360770577934e-05, + "loss": 1.07, + "num_input_tokens_seen": 79088376, + "step": 4915 + }, + { + "epoch": 0.34435717600497345, + "grad_norm": 4.497488975524902, + "learning_rate": 6.559660945709283e-05, + "loss": 1.0779, + "num_input_tokens_seen": 79104760, + "step": 4916 + }, + { + "epoch": 0.34442722425070266, + "grad_norm": 4.417470455169678, + "learning_rate": 6.558961120840631e-05, + "loss": 1.0471, + "num_input_tokens_seen": 79120296, + "step": 4917 + }, + { + "epoch": 0.3444972724964319, + "grad_norm": 4.967655658721924, + "learning_rate": 6.558261295971979e-05, + "loss": 0.9294, + "num_input_tokens_seen": 79135936, + "step": 4918 + }, + { + "epoch": 0.3445673207421612, + "grad_norm": 4.973440647125244, + "learning_rate": 6.557561471103328e-05, + "loss": 1.1045, + "num_input_tokens_seen": 79151632, + "step": 4919 + }, + { + "epoch": 0.34463736898789044, + "grad_norm": 5.2282609939575195, + "learning_rate": 6.556861646234677e-05, + "loss": 1.2508, + "num_input_tokens_seen": 79167112, + "step": 4920 + }, + { + "epoch": 0.34470741723361964, + "grad_norm": 4.118466854095459, + "learning_rate": 6.556161821366025e-05, + "loss": 1.0162, + "num_input_tokens_seen": 79183496, + "step": 4921 + }, + { + "epoch": 0.3447774654793489, + "grad_norm": 4.74249267578125, + "learning_rate": 6.555461996497373e-05, + "loss": 1.1383, + "num_input_tokens_seen": 79199240, + "step": 4922 + }, + { + "epoch": 0.34484751372507816, + "grad_norm": 4.21056604385376, + "learning_rate": 6.554762171628722e-05, + "loss": 1.2693, + "num_input_tokens_seen": 79215560, + "step": 4923 + }, + { + "epoch": 0.3449175619708074, + "grad_norm": 3.584332227706909, + "learning_rate": 6.55406234676007e-05, + "loss": 1.0466, + "num_input_tokens_seen": 79231944, + "step": 4924 + }, + { + "epoch": 0.3449876102165366, + "grad_norm": 7.287233829498291, + "learning_rate": 6.55336252189142e-05, + "loss": 1.0568, + "num_input_tokens_seen": 79248328, + "step": 4925 + }, + { + "epoch": 0.3450576584622659, + "grad_norm": 6.5669379234313965, + "learning_rate": 6.552662697022767e-05, + "loss": 0.8538, + "num_input_tokens_seen": 79264712, + "step": 4926 + }, + { + "epoch": 0.34512770670799514, + "grad_norm": 4.086475849151611, + "learning_rate": 6.551962872154117e-05, + "loss": 1.082, + "num_input_tokens_seen": 79281096, + "step": 4927 + }, + { + "epoch": 0.3451977549537244, + "grad_norm": 5.543658256530762, + "learning_rate": 6.551263047285465e-05, + "loss": 0.9835, + "num_input_tokens_seen": 79297120, + "step": 4928 + }, + { + "epoch": 0.3452678031994536, + "grad_norm": 6.474762439727783, + "learning_rate": 6.550563222416812e-05, + "loss": 1.2022, + "num_input_tokens_seen": 79313504, + "step": 4929 + }, + { + "epoch": 0.34533785144518286, + "grad_norm": 3.8226888179779053, + "learning_rate": 6.549863397548161e-05, + "loss": 0.9796, + "num_input_tokens_seen": 79329888, + "step": 4930 + }, + { + "epoch": 0.3454078996909121, + "grad_norm": 3.8926212787628174, + "learning_rate": 6.54916357267951e-05, + "loss": 1.0837, + "num_input_tokens_seen": 79346272, + "step": 4931 + }, + { + "epoch": 0.3454779479366414, + "grad_norm": 4.127487659454346, + "learning_rate": 6.548463747810859e-05, + "loss": 1.1942, + "num_input_tokens_seen": 79362656, + "step": 4932 + }, + { + "epoch": 0.3455479961823706, + "grad_norm": 6.770711421966553, + "learning_rate": 6.547763922942208e-05, + "loss": 0.9898, + "num_input_tokens_seen": 79378544, + "step": 4933 + }, + { + "epoch": 0.34561804442809985, + "grad_norm": 5.547317028045654, + "learning_rate": 6.547064098073557e-05, + "loss": 1.0748, + "num_input_tokens_seen": 79394896, + "step": 4934 + }, + { + "epoch": 0.3456880926738291, + "grad_norm": 4.469418048858643, + "learning_rate": 6.546364273204904e-05, + "loss": 1.1633, + "num_input_tokens_seen": 79410480, + "step": 4935 + }, + { + "epoch": 0.34575814091955837, + "grad_norm": 4.901472091674805, + "learning_rate": 6.545664448336252e-05, + "loss": 1.0252, + "num_input_tokens_seen": 79426864, + "step": 4936 + }, + { + "epoch": 0.34582818916528757, + "grad_norm": 3.60495662689209, + "learning_rate": 6.5449646234676e-05, + "loss": 1.007, + "num_input_tokens_seen": 79443248, + "step": 4937 + }, + { + "epoch": 0.34589823741101683, + "grad_norm": 4.513663291931152, + "learning_rate": 6.544264798598949e-05, + "loss": 1.2239, + "num_input_tokens_seen": 79459632, + "step": 4938 + }, + { + "epoch": 0.3459682856567461, + "grad_norm": 3.6959240436553955, + "learning_rate": 6.543564973730298e-05, + "loss": 1.0561, + "num_input_tokens_seen": 79475320, + "step": 4939 + }, + { + "epoch": 0.34603833390247535, + "grad_norm": 4.071475505828857, + "learning_rate": 6.542865148861647e-05, + "loss": 1.1963, + "num_input_tokens_seen": 79491704, + "step": 4940 + }, + { + "epoch": 0.34610838214820455, + "grad_norm": 3.665421962738037, + "learning_rate": 6.542165323992996e-05, + "loss": 0.9609, + "num_input_tokens_seen": 79508088, + "step": 4941 + }, + { + "epoch": 0.3461784303939338, + "grad_norm": 4.1782941818237305, + "learning_rate": 6.541465499124343e-05, + "loss": 0.85, + "num_input_tokens_seen": 79523936, + "step": 4942 + }, + { + "epoch": 0.34624847863966307, + "grad_norm": 4.728964328765869, + "learning_rate": 6.540765674255691e-05, + "loss": 1.0283, + "num_input_tokens_seen": 79539848, + "step": 4943 + }, + { + "epoch": 0.34631852688539233, + "grad_norm": 5.39119815826416, + "learning_rate": 6.540065849387041e-05, + "loss": 1.185, + "num_input_tokens_seen": 79555040, + "step": 4944 + }, + { + "epoch": 0.34638857513112153, + "grad_norm": 3.8394956588745117, + "learning_rate": 6.53936602451839e-05, + "loss": 0.7774, + "num_input_tokens_seen": 79570504, + "step": 4945 + }, + { + "epoch": 0.3464586233768508, + "grad_norm": 5.03010368347168, + "learning_rate": 6.538666199649737e-05, + "loss": 0.8746, + "num_input_tokens_seen": 79586888, + "step": 4946 + }, + { + "epoch": 0.34652867162258005, + "grad_norm": 3.984548807144165, + "learning_rate": 6.537966374781086e-05, + "loss": 1.0893, + "num_input_tokens_seen": 79603128, + "step": 4947 + }, + { + "epoch": 0.3465987198683093, + "grad_norm": 5.096433162689209, + "learning_rate": 6.537266549912435e-05, + "loss": 1.0547, + "num_input_tokens_seen": 79618624, + "step": 4948 + }, + { + "epoch": 0.3466687681140385, + "grad_norm": 3.6773791313171387, + "learning_rate": 6.536566725043783e-05, + "loss": 0.985, + "num_input_tokens_seen": 79635008, + "step": 4949 + }, + { + "epoch": 0.3467388163597678, + "grad_norm": 4.050341606140137, + "learning_rate": 6.535866900175132e-05, + "loss": 0.9229, + "num_input_tokens_seen": 79651392, + "step": 4950 + }, + { + "epoch": 0.34680886460549704, + "grad_norm": 3.8354263305664062, + "learning_rate": 6.53516707530648e-05, + "loss": 1.0264, + "num_input_tokens_seen": 79667040, + "step": 4951 + }, + { + "epoch": 0.3468789128512263, + "grad_norm": 4.2188873291015625, + "learning_rate": 6.534467250437829e-05, + "loss": 1.0297, + "num_input_tokens_seen": 79683152, + "step": 4952 + }, + { + "epoch": 0.34694896109695555, + "grad_norm": 4.75797700881958, + "learning_rate": 6.533767425569177e-05, + "loss": 1.2475, + "num_input_tokens_seen": 79699536, + "step": 4953 + }, + { + "epoch": 0.34701900934268476, + "grad_norm": 3.494459867477417, + "learning_rate": 6.533067600700527e-05, + "loss": 0.9534, + "num_input_tokens_seen": 79715920, + "step": 4954 + }, + { + "epoch": 0.347089057588414, + "grad_norm": 3.860872268676758, + "learning_rate": 6.532367775831874e-05, + "loss": 1.081, + "num_input_tokens_seen": 79731832, + "step": 4955 + }, + { + "epoch": 0.3471591058341433, + "grad_norm": 4.188973426818848, + "learning_rate": 6.531667950963222e-05, + "loss": 1.0814, + "num_input_tokens_seen": 79747592, + "step": 4956 + }, + { + "epoch": 0.34722915407987254, + "grad_norm": 5.598564624786377, + "learning_rate": 6.530968126094571e-05, + "loss": 1.0699, + "num_input_tokens_seen": 79763048, + "step": 4957 + }, + { + "epoch": 0.34729920232560174, + "grad_norm": 4.153980255126953, + "learning_rate": 6.53026830122592e-05, + "loss": 1.1726, + "num_input_tokens_seen": 79777928, + "step": 4958 + }, + { + "epoch": 0.347369250571331, + "grad_norm": 3.875469446182251, + "learning_rate": 6.529568476357269e-05, + "loss": 1.1449, + "num_input_tokens_seen": 79794312, + "step": 4959 + }, + { + "epoch": 0.34743929881706026, + "grad_norm": 5.391599655151367, + "learning_rate": 6.528868651488617e-05, + "loss": 1.1748, + "num_input_tokens_seen": 79810696, + "step": 4960 + }, + { + "epoch": 0.3475093470627895, + "grad_norm": 3.3462777137756348, + "learning_rate": 6.528168826619966e-05, + "loss": 0.8645, + "num_input_tokens_seen": 79826208, + "step": 4961 + }, + { + "epoch": 0.3475793953085187, + "grad_norm": 3.5444939136505127, + "learning_rate": 6.527469001751314e-05, + "loss": 1.0989, + "num_input_tokens_seen": 79842592, + "step": 4962 + }, + { + "epoch": 0.347649443554248, + "grad_norm": 4.541754722595215, + "learning_rate": 6.526769176882661e-05, + "loss": 0.9, + "num_input_tokens_seen": 79858976, + "step": 4963 + }, + { + "epoch": 0.34771949179997724, + "grad_norm": 3.728207588195801, + "learning_rate": 6.52606935201401e-05, + "loss": 1.0493, + "num_input_tokens_seen": 79874944, + "step": 4964 + }, + { + "epoch": 0.3477895400457065, + "grad_norm": 5.615260601043701, + "learning_rate": 6.525369527145359e-05, + "loss": 1.0588, + "num_input_tokens_seen": 79890968, + "step": 4965 + }, + { + "epoch": 0.3478595882914357, + "grad_norm": 4.863505840301514, + "learning_rate": 6.524669702276708e-05, + "loss": 0.9896, + "num_input_tokens_seen": 79907352, + "step": 4966 + }, + { + "epoch": 0.34792963653716497, + "grad_norm": 3.6932058334350586, + "learning_rate": 6.523969877408057e-05, + "loss": 0.9675, + "num_input_tokens_seen": 79923736, + "step": 4967 + }, + { + "epoch": 0.3479996847828942, + "grad_norm": 4.483904838562012, + "learning_rate": 6.523270052539406e-05, + "loss": 1.222, + "num_input_tokens_seen": 79939360, + "step": 4968 + }, + { + "epoch": 0.3480697330286235, + "grad_norm": 3.540771007537842, + "learning_rate": 6.522570227670753e-05, + "loss": 0.9759, + "num_input_tokens_seen": 79955744, + "step": 4969 + }, + { + "epoch": 0.3481397812743527, + "grad_norm": 3.980483293533325, + "learning_rate": 6.5218704028021e-05, + "loss": 1.1637, + "num_input_tokens_seen": 79971368, + "step": 4970 + }, + { + "epoch": 0.34820982952008195, + "grad_norm": 5.302091598510742, + "learning_rate": 6.521170577933451e-05, + "loss": 1.0568, + "num_input_tokens_seen": 79986688, + "step": 4971 + }, + { + "epoch": 0.3482798777658112, + "grad_norm": 4.176638603210449, + "learning_rate": 6.5204707530648e-05, + "loss": 1.1928, + "num_input_tokens_seen": 80003072, + "step": 4972 + }, + { + "epoch": 0.34834992601154047, + "grad_norm": 5.939540386199951, + "learning_rate": 6.519770928196147e-05, + "loss": 1.0465, + "num_input_tokens_seen": 80019344, + "step": 4973 + }, + { + "epoch": 0.34841997425726967, + "grad_norm": 4.681301593780518, + "learning_rate": 6.519071103327496e-05, + "loss": 1.1121, + "num_input_tokens_seen": 80034504, + "step": 4974 + }, + { + "epoch": 0.34849002250299893, + "grad_norm": 4.993075847625732, + "learning_rate": 6.518371278458845e-05, + "loss": 0.8792, + "num_input_tokens_seen": 80050488, + "step": 4975 + }, + { + "epoch": 0.3485600707487282, + "grad_norm": 3.87778377532959, + "learning_rate": 6.517671453590192e-05, + "loss": 0.9458, + "num_input_tokens_seen": 80066872, + "step": 4976 + }, + { + "epoch": 0.34863011899445745, + "grad_norm": 3.652738332748413, + "learning_rate": 6.516971628721541e-05, + "loss": 0.9912, + "num_input_tokens_seen": 80083232, + "step": 4977 + }, + { + "epoch": 0.34870016724018665, + "grad_norm": 3.9958438873291016, + "learning_rate": 6.51627180385289e-05, + "loss": 0.8653, + "num_input_tokens_seen": 80099616, + "step": 4978 + }, + { + "epoch": 0.3487702154859159, + "grad_norm": 4.190839767456055, + "learning_rate": 6.515571978984239e-05, + "loss": 1.2081, + "num_input_tokens_seen": 80116000, + "step": 4979 + }, + { + "epoch": 0.3488402637316452, + "grad_norm": 4.848324298858643, + "learning_rate": 6.514872154115586e-05, + "loss": 1.197, + "num_input_tokens_seen": 80132384, + "step": 4980 + }, + { + "epoch": 0.34891031197737443, + "grad_norm": 4.863750457763672, + "learning_rate": 6.514172329246937e-05, + "loss": 1.1181, + "num_input_tokens_seen": 80148768, + "step": 4981 + }, + { + "epoch": 0.34898036022310364, + "grad_norm": 4.555769443511963, + "learning_rate": 6.513472504378284e-05, + "loss": 0.9769, + "num_input_tokens_seen": 80164984, + "step": 4982 + }, + { + "epoch": 0.3490504084688329, + "grad_norm": 5.041413307189941, + "learning_rate": 6.512772679509632e-05, + "loss": 1.0183, + "num_input_tokens_seen": 80181336, + "step": 4983 + }, + { + "epoch": 0.34912045671456216, + "grad_norm": 4.58367395401001, + "learning_rate": 6.51207285464098e-05, + "loss": 1.232, + "num_input_tokens_seen": 80197720, + "step": 4984 + }, + { + "epoch": 0.3491905049602914, + "grad_norm": 3.9667036533355713, + "learning_rate": 6.51137302977233e-05, + "loss": 1.1363, + "num_input_tokens_seen": 80212776, + "step": 4985 + }, + { + "epoch": 0.3492605532060207, + "grad_norm": 3.474071979522705, + "learning_rate": 6.510673204903678e-05, + "loss": 0.8978, + "num_input_tokens_seen": 80229160, + "step": 4986 + }, + { + "epoch": 0.3493306014517499, + "grad_norm": 3.912496328353882, + "learning_rate": 6.509973380035027e-05, + "loss": 0.9695, + "num_input_tokens_seen": 80245544, + "step": 4987 + }, + { + "epoch": 0.34940064969747914, + "grad_norm": 3.760340690612793, + "learning_rate": 6.509273555166376e-05, + "loss": 0.97, + "num_input_tokens_seen": 80261400, + "step": 4988 + }, + { + "epoch": 0.3494706979432084, + "grad_norm": 4.982266426086426, + "learning_rate": 6.508573730297723e-05, + "loss": 1.008, + "num_input_tokens_seen": 80277784, + "step": 4989 + }, + { + "epoch": 0.34954074618893766, + "grad_norm": 4.6823530197143555, + "learning_rate": 6.507873905429071e-05, + "loss": 1.3118, + "num_input_tokens_seen": 80294168, + "step": 4990 + }, + { + "epoch": 0.34961079443466686, + "grad_norm": 3.768439769744873, + "learning_rate": 6.50717408056042e-05, + "loss": 0.91, + "num_input_tokens_seen": 80310552, + "step": 4991 + }, + { + "epoch": 0.3496808426803961, + "grad_norm": 3.5285451412200928, + "learning_rate": 6.50647425569177e-05, + "loss": 0.8937, + "num_input_tokens_seen": 80326464, + "step": 4992 + }, + { + "epoch": 0.3497508909261254, + "grad_norm": 3.875992774963379, + "learning_rate": 6.505774430823118e-05, + "loss": 0.9514, + "num_input_tokens_seen": 80342848, + "step": 4993 + }, + { + "epoch": 0.34982093917185464, + "grad_norm": 4.061910152435303, + "learning_rate": 6.505074605954466e-05, + "loss": 0.9607, + "num_input_tokens_seen": 80359232, + "step": 4994 + }, + { + "epoch": 0.34989098741758384, + "grad_norm": 4.456427097320557, + "learning_rate": 6.504374781085815e-05, + "loss": 1.1927, + "num_input_tokens_seen": 80375616, + "step": 4995 + }, + { + "epoch": 0.3499610356633131, + "grad_norm": 4.381276607513428, + "learning_rate": 6.503674956217163e-05, + "loss": 1.0291, + "num_input_tokens_seen": 80392000, + "step": 4996 + }, + { + "epoch": 0.35003108390904236, + "grad_norm": 6.789033889770508, + "learning_rate": 6.50297513134851e-05, + "loss": 0.9971, + "num_input_tokens_seen": 80407360, + "step": 4997 + }, + { + "epoch": 0.3501011321547716, + "grad_norm": 3.953124761581421, + "learning_rate": 6.50227530647986e-05, + "loss": 1.1585, + "num_input_tokens_seen": 80423744, + "step": 4998 + }, + { + "epoch": 0.3501711804005008, + "grad_norm": 3.498389482498169, + "learning_rate": 6.50157548161121e-05, + "loss": 0.9259, + "num_input_tokens_seen": 80440128, + "step": 4999 + }, + { + "epoch": 0.3502412286462301, + "grad_norm": 5.498814582824707, + "learning_rate": 6.500875656742557e-05, + "loss": 0.9867, + "num_input_tokens_seen": 80456512, + "step": 5000 + }, + { + "epoch": 0.3502412286462301, + "eval_loss": 1.1277527809143066, + "eval_runtime": 0.1909, + "eval_samples_per_second": 5.238, + "eval_steps_per_second": 5.238, + "num_input_tokens_seen": 80456512, + "step": 5000 + }, + { + "epoch": 0.35031127689195934, + "grad_norm": 3.440230131149292, + "learning_rate": 6.500175831873906e-05, + "loss": 0.8354, + "num_input_tokens_seen": 80472456, + "step": 5001 + }, + { + "epoch": 0.3503813251376886, + "grad_norm": 5.069565296173096, + "learning_rate": 6.499476007005255e-05, + "loss": 1.1994, + "num_input_tokens_seen": 80488840, + "step": 5002 + }, + { + "epoch": 0.3504513733834178, + "grad_norm": 4.53994607925415, + "learning_rate": 6.498776182136602e-05, + "loss": 1.0962, + "num_input_tokens_seen": 80504984, + "step": 5003 + }, + { + "epoch": 0.35052142162914707, + "grad_norm": 4.136146068572998, + "learning_rate": 6.498076357267951e-05, + "loss": 0.9885, + "num_input_tokens_seen": 80520448, + "step": 5004 + }, + { + "epoch": 0.3505914698748763, + "grad_norm": 5.609417915344238, + "learning_rate": 6.4973765323993e-05, + "loss": 1.0242, + "num_input_tokens_seen": 80536496, + "step": 5005 + }, + { + "epoch": 0.3506615181206056, + "grad_norm": 4.375439643859863, + "learning_rate": 6.496676707530649e-05, + "loss": 0.9937, + "num_input_tokens_seen": 80551592, + "step": 5006 + }, + { + "epoch": 0.3507315663663348, + "grad_norm": 3.5269775390625, + "learning_rate": 6.495976882661996e-05, + "loss": 0.9995, + "num_input_tokens_seen": 80567976, + "step": 5007 + }, + { + "epoch": 0.35080161461206405, + "grad_norm": 3.9541778564453125, + "learning_rate": 6.495277057793346e-05, + "loss": 1.1451, + "num_input_tokens_seen": 80584360, + "step": 5008 + }, + { + "epoch": 0.3508716628577933, + "grad_norm": 5.544612407684326, + "learning_rate": 6.494577232924694e-05, + "loss": 1.3493, + "num_input_tokens_seen": 80599856, + "step": 5009 + }, + { + "epoch": 0.35094171110352257, + "grad_norm": 4.189836502075195, + "learning_rate": 6.493877408056041e-05, + "loss": 1.2096, + "num_input_tokens_seen": 80615392, + "step": 5010 + }, + { + "epoch": 0.3510117593492518, + "grad_norm": 4.8789825439453125, + "learning_rate": 6.49317758318739e-05, + "loss": 1.0665, + "num_input_tokens_seen": 80631776, + "step": 5011 + }, + { + "epoch": 0.35108180759498103, + "grad_norm": 4.271617412567139, + "learning_rate": 6.49247775831874e-05, + "loss": 0.9655, + "num_input_tokens_seen": 80648160, + "step": 5012 + }, + { + "epoch": 0.3511518558407103, + "grad_norm": 4.656182765960693, + "learning_rate": 6.491777933450088e-05, + "loss": 0.9566, + "num_input_tokens_seen": 80664424, + "step": 5013 + }, + { + "epoch": 0.35122190408643955, + "grad_norm": 6.627303600311279, + "learning_rate": 6.491078108581437e-05, + "loss": 1.2156, + "num_input_tokens_seen": 80680128, + "step": 5014 + }, + { + "epoch": 0.35129195233216876, + "grad_norm": 3.6189517974853516, + "learning_rate": 6.490378283712786e-05, + "loss": 1.0828, + "num_input_tokens_seen": 80695848, + "step": 5015 + }, + { + "epoch": 0.351362000577898, + "grad_norm": 3.58449387550354, + "learning_rate": 6.489678458844133e-05, + "loss": 1.0578, + "num_input_tokens_seen": 80712232, + "step": 5016 + }, + { + "epoch": 0.3514320488236273, + "grad_norm": 4.014143466949463, + "learning_rate": 6.488978633975481e-05, + "loss": 1.1271, + "num_input_tokens_seen": 80726480, + "step": 5017 + }, + { + "epoch": 0.35150209706935653, + "grad_norm": 4.461588382720947, + "learning_rate": 6.488278809106831e-05, + "loss": 1.1175, + "num_input_tokens_seen": 80742776, + "step": 5018 + }, + { + "epoch": 0.35157214531508574, + "grad_norm": 4.534054279327393, + "learning_rate": 6.48757898423818e-05, + "loss": 1.1009, + "num_input_tokens_seen": 80758024, + "step": 5019 + }, + { + "epoch": 0.351642193560815, + "grad_norm": 3.502699613571167, + "learning_rate": 6.486879159369527e-05, + "loss": 1.0564, + "num_input_tokens_seen": 80774152, + "step": 5020 + }, + { + "epoch": 0.35171224180654426, + "grad_norm": 4.463150978088379, + "learning_rate": 6.486179334500876e-05, + "loss": 0.9945, + "num_input_tokens_seen": 80790528, + "step": 5021 + }, + { + "epoch": 0.3517822900522735, + "grad_norm": 4.1127543449401855, + "learning_rate": 6.485479509632225e-05, + "loss": 0.9813, + "num_input_tokens_seen": 80805400, + "step": 5022 + }, + { + "epoch": 0.3518523382980028, + "grad_norm": 3.6113109588623047, + "learning_rate": 6.484779684763572e-05, + "loss": 1.1071, + "num_input_tokens_seen": 80821584, + "step": 5023 + }, + { + "epoch": 0.351922386543732, + "grad_norm": 4.167325019836426, + "learning_rate": 6.484079859894921e-05, + "loss": 1.0636, + "num_input_tokens_seen": 80837968, + "step": 5024 + }, + { + "epoch": 0.35199243478946124, + "grad_norm": 3.9422924518585205, + "learning_rate": 6.48338003502627e-05, + "loss": 1.0665, + "num_input_tokens_seen": 80854352, + "step": 5025 + }, + { + "epoch": 0.3520624830351905, + "grad_norm": 4.867110729217529, + "learning_rate": 6.482680210157619e-05, + "loss": 0.9098, + "num_input_tokens_seen": 80870648, + "step": 5026 + }, + { + "epoch": 0.35213253128091976, + "grad_norm": 4.714593887329102, + "learning_rate": 6.481980385288967e-05, + "loss": 1.0256, + "num_input_tokens_seen": 80886704, + "step": 5027 + }, + { + "epoch": 0.35220257952664896, + "grad_norm": 3.8926947116851807, + "learning_rate": 6.481280560420315e-05, + "loss": 0.9577, + "num_input_tokens_seen": 80902184, + "step": 5028 + }, + { + "epoch": 0.3522726277723782, + "grad_norm": 4.510727405548096, + "learning_rate": 6.480580735551664e-05, + "loss": 1.1543, + "num_input_tokens_seen": 80917960, + "step": 5029 + }, + { + "epoch": 0.3523426760181075, + "grad_norm": 3.6175239086151123, + "learning_rate": 6.479880910683012e-05, + "loss": 1.0692, + "num_input_tokens_seen": 80934344, + "step": 5030 + }, + { + "epoch": 0.35241272426383674, + "grad_norm": 4.112790584564209, + "learning_rate": 6.47918108581436e-05, + "loss": 1.1518, + "num_input_tokens_seen": 80950336, + "step": 5031 + }, + { + "epoch": 0.35248277250956594, + "grad_norm": 4.372056007385254, + "learning_rate": 6.478481260945711e-05, + "loss": 1.0732, + "num_input_tokens_seen": 80966272, + "step": 5032 + }, + { + "epoch": 0.3525528207552952, + "grad_norm": 5.2401204109191895, + "learning_rate": 6.477781436077058e-05, + "loss": 1.0378, + "num_input_tokens_seen": 80981568, + "step": 5033 + }, + { + "epoch": 0.35262286900102446, + "grad_norm": 4.032891273498535, + "learning_rate": 6.477081611208406e-05, + "loss": 1.0788, + "num_input_tokens_seen": 80997384, + "step": 5034 + }, + { + "epoch": 0.3526929172467537, + "grad_norm": 5.448423385620117, + "learning_rate": 6.476381786339756e-05, + "loss": 1.2136, + "num_input_tokens_seen": 81013768, + "step": 5035 + }, + { + "epoch": 0.3527629654924829, + "grad_norm": 3.5669469833374023, + "learning_rate": 6.475681961471104e-05, + "loss": 1.0039, + "num_input_tokens_seen": 81030152, + "step": 5036 + }, + { + "epoch": 0.3528330137382122, + "grad_norm": 3.4767303466796875, + "learning_rate": 6.474982136602451e-05, + "loss": 0.9563, + "num_input_tokens_seen": 81046536, + "step": 5037 + }, + { + "epoch": 0.35290306198394145, + "grad_norm": 4.859378814697266, + "learning_rate": 6.474282311733801e-05, + "loss": 1.2855, + "num_input_tokens_seen": 81062528, + "step": 5038 + }, + { + "epoch": 0.3529731102296707, + "grad_norm": 5.003366470336914, + "learning_rate": 6.47358248686515e-05, + "loss": 1.1317, + "num_input_tokens_seen": 81078912, + "step": 5039 + }, + { + "epoch": 0.3530431584753999, + "grad_norm": 3.9362549781799316, + "learning_rate": 6.472882661996498e-05, + "loss": 1.2051, + "num_input_tokens_seen": 81095296, + "step": 5040 + }, + { + "epoch": 0.35311320672112917, + "grad_norm": 3.319826364517212, + "learning_rate": 6.472182837127847e-05, + "loss": 0.9632, + "num_input_tokens_seen": 81111640, + "step": 5041 + }, + { + "epoch": 0.35318325496685843, + "grad_norm": 3.5816714763641357, + "learning_rate": 6.471483012259195e-05, + "loss": 0.9576, + "num_input_tokens_seen": 81128024, + "step": 5042 + }, + { + "epoch": 0.3532533032125877, + "grad_norm": 4.352350234985352, + "learning_rate": 6.470783187390543e-05, + "loss": 1.1754, + "num_input_tokens_seen": 81143992, + "step": 5043 + }, + { + "epoch": 0.3533233514583169, + "grad_norm": 3.4122314453125, + "learning_rate": 6.470083362521892e-05, + "loss": 1.104, + "num_input_tokens_seen": 81160376, + "step": 5044 + }, + { + "epoch": 0.35339339970404615, + "grad_norm": 4.0952324867248535, + "learning_rate": 6.46938353765324e-05, + "loss": 0.9727, + "num_input_tokens_seen": 81175968, + "step": 5045 + }, + { + "epoch": 0.3534634479497754, + "grad_norm": 3.9099533557891846, + "learning_rate": 6.46868371278459e-05, + "loss": 1.0624, + "num_input_tokens_seen": 81192352, + "step": 5046 + }, + { + "epoch": 0.35353349619550467, + "grad_norm": 6.379274845123291, + "learning_rate": 6.467983887915937e-05, + "loss": 1.0069, + "num_input_tokens_seen": 81208648, + "step": 5047 + }, + { + "epoch": 0.3536035444412339, + "grad_norm": 3.9650473594665527, + "learning_rate": 6.467284063047286e-05, + "loss": 1.0727, + "num_input_tokens_seen": 81224472, + "step": 5048 + }, + { + "epoch": 0.35367359268696313, + "grad_norm": 3.7729573249816895, + "learning_rate": 6.466584238178635e-05, + "loss": 1.0097, + "num_input_tokens_seen": 81240232, + "step": 5049 + }, + { + "epoch": 0.3537436409326924, + "grad_norm": 4.012545585632324, + "learning_rate": 6.465884413309982e-05, + "loss": 1.0527, + "num_input_tokens_seen": 81256616, + "step": 5050 + }, + { + "epoch": 0.35381368917842165, + "grad_norm": 3.679382801055908, + "learning_rate": 6.465184588441331e-05, + "loss": 1.0033, + "num_input_tokens_seen": 81272888, + "step": 5051 + }, + { + "epoch": 0.35388373742415086, + "grad_norm": 3.897606134414673, + "learning_rate": 6.464484763572681e-05, + "loss": 0.9513, + "num_input_tokens_seen": 81289272, + "step": 5052 + }, + { + "epoch": 0.3539537856698801, + "grad_norm": 4.988255023956299, + "learning_rate": 6.463784938704029e-05, + "loss": 0.8484, + "num_input_tokens_seen": 81305656, + "step": 5053 + }, + { + "epoch": 0.3540238339156094, + "grad_norm": 4.226601600646973, + "learning_rate": 6.463085113835376e-05, + "loss": 1.0048, + "num_input_tokens_seen": 81320912, + "step": 5054 + }, + { + "epoch": 0.35409388216133864, + "grad_norm": 4.0905070304870605, + "learning_rate": 6.462385288966725e-05, + "loss": 1.2044, + "num_input_tokens_seen": 81337296, + "step": 5055 + }, + { + "epoch": 0.3541639304070679, + "grad_norm": 4.470916748046875, + "learning_rate": 6.461685464098074e-05, + "loss": 1.1198, + "num_input_tokens_seen": 81353680, + "step": 5056 + }, + { + "epoch": 0.3542339786527971, + "grad_norm": 3.8264098167419434, + "learning_rate": 6.460985639229421e-05, + "loss": 0.8444, + "num_input_tokens_seen": 81370064, + "step": 5057 + }, + { + "epoch": 0.35430402689852636, + "grad_norm": 5.07196569442749, + "learning_rate": 6.460285814360772e-05, + "loss": 0.9035, + "num_input_tokens_seen": 81386368, + "step": 5058 + }, + { + "epoch": 0.3543740751442556, + "grad_norm": 4.830010414123535, + "learning_rate": 6.45958598949212e-05, + "loss": 1.0685, + "num_input_tokens_seen": 81402752, + "step": 5059 + }, + { + "epoch": 0.3544441233899849, + "grad_norm": 3.5972540378570557, + "learning_rate": 6.458886164623468e-05, + "loss": 0.9466, + "num_input_tokens_seen": 81418856, + "step": 5060 + }, + { + "epoch": 0.3545141716357141, + "grad_norm": 4.840418815612793, + "learning_rate": 6.458186339754816e-05, + "loss": 1.0174, + "num_input_tokens_seen": 81434344, + "step": 5061 + }, + { + "epoch": 0.35458421988144334, + "grad_norm": 4.891697883605957, + "learning_rate": 6.457486514886166e-05, + "loss": 1.0537, + "num_input_tokens_seen": 81450280, + "step": 5062 + }, + { + "epoch": 0.3546542681271726, + "grad_norm": 3.7236123085021973, + "learning_rate": 6.456786690017513e-05, + "loss": 1.0524, + "num_input_tokens_seen": 81466664, + "step": 5063 + }, + { + "epoch": 0.35472431637290186, + "grad_norm": 3.6597838401794434, + "learning_rate": 6.456086865148862e-05, + "loss": 0.9648, + "num_input_tokens_seen": 81483048, + "step": 5064 + }, + { + "epoch": 0.35479436461863106, + "grad_norm": 4.048685073852539, + "learning_rate": 6.455387040280211e-05, + "loss": 1.0033, + "num_input_tokens_seen": 81499080, + "step": 5065 + }, + { + "epoch": 0.3548644128643603, + "grad_norm": 3.683549165725708, + "learning_rate": 6.45468721541156e-05, + "loss": 1.054, + "num_input_tokens_seen": 81515464, + "step": 5066 + }, + { + "epoch": 0.3549344611100896, + "grad_norm": 4.80827522277832, + "learning_rate": 6.453987390542907e-05, + "loss": 1.0664, + "num_input_tokens_seen": 81530672, + "step": 5067 + }, + { + "epoch": 0.35500450935581884, + "grad_norm": 3.6255602836608887, + "learning_rate": 6.453287565674256e-05, + "loss": 1.0027, + "num_input_tokens_seen": 81546976, + "step": 5068 + }, + { + "epoch": 0.35507455760154805, + "grad_norm": 3.430290460586548, + "learning_rate": 6.452587740805605e-05, + "loss": 1.1253, + "num_input_tokens_seen": 81562936, + "step": 5069 + }, + { + "epoch": 0.3551446058472773, + "grad_norm": 5.140942573547363, + "learning_rate": 6.451887915936953e-05, + "loss": 0.9522, + "num_input_tokens_seen": 81579120, + "step": 5070 + }, + { + "epoch": 0.35521465409300657, + "grad_norm": 4.5443115234375, + "learning_rate": 6.451188091068301e-05, + "loss": 1.2141, + "num_input_tokens_seen": 81595504, + "step": 5071 + }, + { + "epoch": 0.3552847023387358, + "grad_norm": 4.33146333694458, + "learning_rate": 6.45048826619965e-05, + "loss": 1.0189, + "num_input_tokens_seen": 81611024, + "step": 5072 + }, + { + "epoch": 0.35535475058446503, + "grad_norm": 4.212037563323975, + "learning_rate": 6.449788441330999e-05, + "loss": 1.2356, + "num_input_tokens_seen": 81627208, + "step": 5073 + }, + { + "epoch": 0.3554247988301943, + "grad_norm": 3.714611053466797, + "learning_rate": 6.449088616462347e-05, + "loss": 0.9699, + "num_input_tokens_seen": 81642744, + "step": 5074 + }, + { + "epoch": 0.35549484707592355, + "grad_norm": 3.985471487045288, + "learning_rate": 6.448388791593696e-05, + "loss": 1.1381, + "num_input_tokens_seen": 81659128, + "step": 5075 + }, + { + "epoch": 0.3555648953216528, + "grad_norm": 4.519073963165283, + "learning_rate": 6.447688966725044e-05, + "loss": 1.1515, + "num_input_tokens_seen": 81675512, + "step": 5076 + }, + { + "epoch": 0.355634943567382, + "grad_norm": 4.546297550201416, + "learning_rate": 6.446989141856392e-05, + "loss": 1.324, + "num_input_tokens_seen": 81691528, + "step": 5077 + }, + { + "epoch": 0.35570499181311127, + "grad_norm": 4.023989200592041, + "learning_rate": 6.446289316987741e-05, + "loss": 1.068, + "num_input_tokens_seen": 81707912, + "step": 5078 + }, + { + "epoch": 0.35577504005884053, + "grad_norm": 4.442357540130615, + "learning_rate": 6.445589492119091e-05, + "loss": 0.9021, + "num_input_tokens_seen": 81724296, + "step": 5079 + }, + { + "epoch": 0.3558450883045698, + "grad_norm": 3.63273286819458, + "learning_rate": 6.444889667250438e-05, + "loss": 0.919, + "num_input_tokens_seen": 81740112, + "step": 5080 + }, + { + "epoch": 0.355915136550299, + "grad_norm": 3.8844716548919678, + "learning_rate": 6.444189842381786e-05, + "loss": 1.1389, + "num_input_tokens_seen": 81756024, + "step": 5081 + }, + { + "epoch": 0.35598518479602825, + "grad_norm": 3.8603484630584717, + "learning_rate": 6.443490017513135e-05, + "loss": 0.8949, + "num_input_tokens_seen": 81772408, + "step": 5082 + }, + { + "epoch": 0.3560552330417575, + "grad_norm": 4.305675029754639, + "learning_rate": 6.442790192644484e-05, + "loss": 1.0133, + "num_input_tokens_seen": 81787992, + "step": 5083 + }, + { + "epoch": 0.3561252812874868, + "grad_norm": 5.944203853607178, + "learning_rate": 6.442090367775833e-05, + "loss": 1.0635, + "num_input_tokens_seen": 81804032, + "step": 5084 + }, + { + "epoch": 0.356195329533216, + "grad_norm": 5.269783020019531, + "learning_rate": 6.441390542907181e-05, + "loss": 1.0697, + "num_input_tokens_seen": 81820416, + "step": 5085 + }, + { + "epoch": 0.35626537777894524, + "grad_norm": 3.775933027267456, + "learning_rate": 6.44069071803853e-05, + "loss": 1.0638, + "num_input_tokens_seen": 81836712, + "step": 5086 + }, + { + "epoch": 0.3563354260246745, + "grad_norm": 4.133227825164795, + "learning_rate": 6.439990893169878e-05, + "loss": 0.9842, + "num_input_tokens_seen": 81853096, + "step": 5087 + }, + { + "epoch": 0.35640547427040375, + "grad_norm": 4.418367862701416, + "learning_rate": 6.439291068301225e-05, + "loss": 1.1836, + "num_input_tokens_seen": 81869480, + "step": 5088 + }, + { + "epoch": 0.356475522516133, + "grad_norm": 3.584392786026001, + "learning_rate": 6.438591243432575e-05, + "loss": 1.0805, + "num_input_tokens_seen": 81885864, + "step": 5089 + }, + { + "epoch": 0.3565455707618622, + "grad_norm": 4.216940402984619, + "learning_rate": 6.437891418563923e-05, + "loss": 0.8602, + "num_input_tokens_seen": 81902248, + "step": 5090 + }, + { + "epoch": 0.3566156190075915, + "grad_norm": 4.383372783660889, + "learning_rate": 6.437191593695272e-05, + "loss": 0.9763, + "num_input_tokens_seen": 81918464, + "step": 5091 + }, + { + "epoch": 0.35668566725332074, + "grad_norm": 4.06666374206543, + "learning_rate": 6.436491768826621e-05, + "loss": 0.9784, + "num_input_tokens_seen": 81934848, + "step": 5092 + }, + { + "epoch": 0.35675571549905, + "grad_norm": 5.485066890716553, + "learning_rate": 6.43579194395797e-05, + "loss": 0.9188, + "num_input_tokens_seen": 81950696, + "step": 5093 + }, + { + "epoch": 0.3568257637447792, + "grad_norm": 6.794841766357422, + "learning_rate": 6.435092119089317e-05, + "loss": 1.1765, + "num_input_tokens_seen": 81967080, + "step": 5094 + }, + { + "epoch": 0.35689581199050846, + "grad_norm": 3.531291961669922, + "learning_rate": 6.434392294220666e-05, + "loss": 0.9904, + "num_input_tokens_seen": 81983464, + "step": 5095 + }, + { + "epoch": 0.3569658602362377, + "grad_norm": 3.694018840789795, + "learning_rate": 6.433692469352015e-05, + "loss": 1.0384, + "num_input_tokens_seen": 81999848, + "step": 5096 + }, + { + "epoch": 0.357035908481967, + "grad_norm": 6.933582305908203, + "learning_rate": 6.432992644483362e-05, + "loss": 1.0262, + "num_input_tokens_seen": 82015304, + "step": 5097 + }, + { + "epoch": 0.3571059567276962, + "grad_norm": 5.904866695404053, + "learning_rate": 6.432292819614711e-05, + "loss": 1.0849, + "num_input_tokens_seen": 82031688, + "step": 5098 + }, + { + "epoch": 0.35717600497342544, + "grad_norm": 4.199756145477295, + "learning_rate": 6.43159299474606e-05, + "loss": 1.1007, + "num_input_tokens_seen": 82047336, + "step": 5099 + }, + { + "epoch": 0.3572460532191547, + "grad_norm": 3.703000783920288, + "learning_rate": 6.430893169877409e-05, + "loss": 0.8503, + "num_input_tokens_seen": 82063720, + "step": 5100 + }, + { + "epoch": 0.35731610146488396, + "grad_norm": 4.844930171966553, + "learning_rate": 6.430193345008756e-05, + "loss": 1.0255, + "num_input_tokens_seen": 82079632, + "step": 5101 + }, + { + "epoch": 0.35738614971061317, + "grad_norm": 3.870488166809082, + "learning_rate": 6.429493520140105e-05, + "loss": 1.0116, + "num_input_tokens_seen": 82094864, + "step": 5102 + }, + { + "epoch": 0.3574561979563424, + "grad_norm": 3.9125707149505615, + "learning_rate": 6.428793695271454e-05, + "loss": 0.9626, + "num_input_tokens_seen": 82111136, + "step": 5103 + }, + { + "epoch": 0.3575262462020717, + "grad_norm": 4.347132205963135, + "learning_rate": 6.428093870402803e-05, + "loss": 0.9538, + "num_input_tokens_seen": 82127064, + "step": 5104 + }, + { + "epoch": 0.35759629444780094, + "grad_norm": 3.739053964614868, + "learning_rate": 6.42739404553415e-05, + "loss": 1.201, + "num_input_tokens_seen": 82143448, + "step": 5105 + }, + { + "epoch": 0.35766634269353015, + "grad_norm": 4.781857967376709, + "learning_rate": 6.4266942206655e-05, + "loss": 1.0323, + "num_input_tokens_seen": 82159832, + "step": 5106 + }, + { + "epoch": 0.3577363909392594, + "grad_norm": 4.3711700439453125, + "learning_rate": 6.425994395796848e-05, + "loss": 1.2023, + "num_input_tokens_seen": 82175944, + "step": 5107 + }, + { + "epoch": 0.35780643918498867, + "grad_norm": 3.6916282176971436, + "learning_rate": 6.425294570928196e-05, + "loss": 0.787, + "num_input_tokens_seen": 82192304, + "step": 5108 + }, + { + "epoch": 0.3578764874307179, + "grad_norm": 4.418915271759033, + "learning_rate": 6.424594746059545e-05, + "loss": 1.0842, + "num_input_tokens_seen": 82208080, + "step": 5109 + }, + { + "epoch": 0.35794653567644713, + "grad_norm": 3.9138340950012207, + "learning_rate": 6.423894921190893e-05, + "loss": 1.0261, + "num_input_tokens_seen": 82224464, + "step": 5110 + }, + { + "epoch": 0.3580165839221764, + "grad_norm": 3.99479079246521, + "learning_rate": 6.423195096322242e-05, + "loss": 1.0562, + "num_input_tokens_seen": 82240664, + "step": 5111 + }, + { + "epoch": 0.35808663216790565, + "grad_norm": 4.260537147521973, + "learning_rate": 6.422495271453591e-05, + "loss": 1.1133, + "num_input_tokens_seen": 82257048, + "step": 5112 + }, + { + "epoch": 0.3581566804136349, + "grad_norm": 3.5181097984313965, + "learning_rate": 6.42179544658494e-05, + "loss": 0.98, + "num_input_tokens_seen": 82273432, + "step": 5113 + }, + { + "epoch": 0.3582267286593641, + "grad_norm": 5.96913480758667, + "learning_rate": 6.421095621716287e-05, + "loss": 0.8867, + "num_input_tokens_seen": 82289816, + "step": 5114 + }, + { + "epoch": 0.3582967769050934, + "grad_norm": 4.628411769866943, + "learning_rate": 6.420395796847635e-05, + "loss": 1.1363, + "num_input_tokens_seen": 82305784, + "step": 5115 + }, + { + "epoch": 0.35836682515082263, + "grad_norm": 3.5981955528259277, + "learning_rate": 6.419695971978985e-05, + "loss": 0.9182, + "num_input_tokens_seen": 82321384, + "step": 5116 + }, + { + "epoch": 0.3584368733965519, + "grad_norm": 4.410891056060791, + "learning_rate": 6.418996147110333e-05, + "loss": 1.1118, + "num_input_tokens_seen": 82336184, + "step": 5117 + }, + { + "epoch": 0.3585069216422811, + "grad_norm": 4.316674709320068, + "learning_rate": 6.418296322241682e-05, + "loss": 1.1604, + "num_input_tokens_seen": 82351520, + "step": 5118 + }, + { + "epoch": 0.35857696988801036, + "grad_norm": 5.662688255310059, + "learning_rate": 6.41759649737303e-05, + "loss": 1.1212, + "num_input_tokens_seen": 82367904, + "step": 5119 + }, + { + "epoch": 0.3586470181337396, + "grad_norm": 4.5336151123046875, + "learning_rate": 6.416896672504379e-05, + "loss": 1.0093, + "num_input_tokens_seen": 82384288, + "step": 5120 + }, + { + "epoch": 0.3587170663794689, + "grad_norm": 6.43854284286499, + "learning_rate": 6.416196847635727e-05, + "loss": 0.9434, + "num_input_tokens_seen": 82400120, + "step": 5121 + }, + { + "epoch": 0.3587871146251981, + "grad_norm": 3.519869089126587, + "learning_rate": 6.415497022767076e-05, + "loss": 0.9704, + "num_input_tokens_seen": 82416504, + "step": 5122 + }, + { + "epoch": 0.35885716287092734, + "grad_norm": 4.426568508148193, + "learning_rate": 6.414797197898425e-05, + "loss": 0.9778, + "num_input_tokens_seen": 82431936, + "step": 5123 + }, + { + "epoch": 0.3589272111166566, + "grad_norm": 10.392409324645996, + "learning_rate": 6.414097373029773e-05, + "loss": 1.0289, + "num_input_tokens_seen": 82447232, + "step": 5124 + }, + { + "epoch": 0.35899725936238586, + "grad_norm": 4.133431434631348, + "learning_rate": 6.413397548161121e-05, + "loss": 1.1998, + "num_input_tokens_seen": 82462648, + "step": 5125 + }, + { + "epoch": 0.3590673076081151, + "grad_norm": 5.43566370010376, + "learning_rate": 6.41269772329247e-05, + "loss": 0.9587, + "num_input_tokens_seen": 82478536, + "step": 5126 + }, + { + "epoch": 0.3591373558538443, + "grad_norm": 4.205079555511475, + "learning_rate": 6.411997898423819e-05, + "loss": 1.1152, + "num_input_tokens_seen": 82494224, + "step": 5127 + }, + { + "epoch": 0.3592074040995736, + "grad_norm": 4.165416240692139, + "learning_rate": 6.411298073555166e-05, + "loss": 1.3017, + "num_input_tokens_seen": 82510608, + "step": 5128 + }, + { + "epoch": 0.35927745234530284, + "grad_norm": 3.7855117321014404, + "learning_rate": 6.410598248686515e-05, + "loss": 0.8362, + "num_input_tokens_seen": 82526992, + "step": 5129 + }, + { + "epoch": 0.3593475005910321, + "grad_norm": 4.406207084655762, + "learning_rate": 6.409898423817864e-05, + "loss": 1.0353, + "num_input_tokens_seen": 82543376, + "step": 5130 + }, + { + "epoch": 0.3594175488367613, + "grad_norm": 4.228625774383545, + "learning_rate": 6.409198598949213e-05, + "loss": 0.9788, + "num_input_tokens_seen": 82559760, + "step": 5131 + }, + { + "epoch": 0.35948759708249056, + "grad_norm": 3.6679983139038086, + "learning_rate": 6.40849877408056e-05, + "loss": 1.072, + "num_input_tokens_seen": 82575552, + "step": 5132 + }, + { + "epoch": 0.3595576453282198, + "grad_norm": 4.011179447174072, + "learning_rate": 6.40779894921191e-05, + "loss": 1.0443, + "num_input_tokens_seen": 82591936, + "step": 5133 + }, + { + "epoch": 0.3596276935739491, + "grad_norm": 4.861363410949707, + "learning_rate": 6.407099124343258e-05, + "loss": 1.1077, + "num_input_tokens_seen": 82608320, + "step": 5134 + }, + { + "epoch": 0.3596977418196783, + "grad_norm": 4.128578186035156, + "learning_rate": 6.406399299474605e-05, + "loss": 1.1903, + "num_input_tokens_seen": 82624704, + "step": 5135 + }, + { + "epoch": 0.35976779006540754, + "grad_norm": 4.036421775817871, + "learning_rate": 6.405699474605954e-05, + "loss": 1.1624, + "num_input_tokens_seen": 82641088, + "step": 5136 + }, + { + "epoch": 0.3598378383111368, + "grad_norm": 4.536168098449707, + "learning_rate": 6.404999649737303e-05, + "loss": 0.9512, + "num_input_tokens_seen": 82657472, + "step": 5137 + }, + { + "epoch": 0.35990788655686606, + "grad_norm": 3.665916681289673, + "learning_rate": 6.404299824868652e-05, + "loss": 1.1718, + "num_input_tokens_seen": 82673856, + "step": 5138 + }, + { + "epoch": 0.35997793480259527, + "grad_norm": 3.798205852508545, + "learning_rate": 6.403600000000001e-05, + "loss": 1.0625, + "num_input_tokens_seen": 82690240, + "step": 5139 + }, + { + "epoch": 0.3600479830483245, + "grad_norm": 3.9616305828094482, + "learning_rate": 6.40290017513135e-05, + "loss": 1.1314, + "num_input_tokens_seen": 82706624, + "step": 5140 + }, + { + "epoch": 0.3601180312940538, + "grad_norm": 4.6059489250183105, + "learning_rate": 6.402200350262697e-05, + "loss": 0.9534, + "num_input_tokens_seen": 82723008, + "step": 5141 + }, + { + "epoch": 0.36018807953978305, + "grad_norm": 4.2935943603515625, + "learning_rate": 6.401500525394045e-05, + "loss": 0.9653, + "num_input_tokens_seen": 82739392, + "step": 5142 + }, + { + "epoch": 0.36025812778551225, + "grad_norm": 4.02174711227417, + "learning_rate": 6.400800700525395e-05, + "loss": 1.2037, + "num_input_tokens_seen": 82755600, + "step": 5143 + }, + { + "epoch": 0.3603281760312415, + "grad_norm": 4.0431599617004395, + "learning_rate": 6.400100875656744e-05, + "loss": 1.0548, + "num_input_tokens_seen": 82771592, + "step": 5144 + }, + { + "epoch": 0.36039822427697077, + "grad_norm": 3.6921310424804688, + "learning_rate": 6.399401050788091e-05, + "loss": 0.8992, + "num_input_tokens_seen": 82787728, + "step": 5145 + }, + { + "epoch": 0.36046827252270003, + "grad_norm": 4.27170991897583, + "learning_rate": 6.39870122591944e-05, + "loss": 1.0908, + "num_input_tokens_seen": 82803152, + "step": 5146 + }, + { + "epoch": 0.36053832076842923, + "grad_norm": 4.670827865600586, + "learning_rate": 6.398001401050789e-05, + "loss": 1.1134, + "num_input_tokens_seen": 82819536, + "step": 5147 + }, + { + "epoch": 0.3606083690141585, + "grad_norm": 3.6219654083251953, + "learning_rate": 6.397301576182136e-05, + "loss": 0.9576, + "num_input_tokens_seen": 82835920, + "step": 5148 + }, + { + "epoch": 0.36067841725988775, + "grad_norm": 3.53466796875, + "learning_rate": 6.396601751313485e-05, + "loss": 0.905, + "num_input_tokens_seen": 82852304, + "step": 5149 + }, + { + "epoch": 0.360748465505617, + "grad_norm": 4.027638912200928, + "learning_rate": 6.395901926444834e-05, + "loss": 1.0661, + "num_input_tokens_seen": 82867816, + "step": 5150 + }, + { + "epoch": 0.3608185137513462, + "grad_norm": 5.701491832733154, + "learning_rate": 6.395202101576183e-05, + "loss": 1.2476, + "num_input_tokens_seen": 82883480, + "step": 5151 + }, + { + "epoch": 0.3608885619970755, + "grad_norm": 4.156428337097168, + "learning_rate": 6.39450227670753e-05, + "loss": 1.1507, + "num_input_tokens_seen": 82899608, + "step": 5152 + }, + { + "epoch": 0.36095861024280473, + "grad_norm": 5.278023719787598, + "learning_rate": 6.39380245183888e-05, + "loss": 1.0583, + "num_input_tokens_seen": 82915656, + "step": 5153 + }, + { + "epoch": 0.361028658488534, + "grad_norm": 3.6892948150634766, + "learning_rate": 6.393102626970228e-05, + "loss": 1.0063, + "num_input_tokens_seen": 82931632, + "step": 5154 + }, + { + "epoch": 0.3610987067342632, + "grad_norm": 5.179676055908203, + "learning_rate": 6.392402802101576e-05, + "loss": 1.1701, + "num_input_tokens_seen": 82947344, + "step": 5155 + }, + { + "epoch": 0.36116875497999246, + "grad_norm": 4.948189735412598, + "learning_rate": 6.391702977232925e-05, + "loss": 1.056, + "num_input_tokens_seen": 82963720, + "step": 5156 + }, + { + "epoch": 0.3612388032257217, + "grad_norm": 4.465184688568115, + "learning_rate": 6.391003152364274e-05, + "loss": 1.225, + "num_input_tokens_seen": 82980048, + "step": 5157 + }, + { + "epoch": 0.361308851471451, + "grad_norm": 4.053642749786377, + "learning_rate": 6.390303327495622e-05, + "loss": 1.1481, + "num_input_tokens_seen": 82996432, + "step": 5158 + }, + { + "epoch": 0.36137889971718024, + "grad_norm": 8.422308921813965, + "learning_rate": 6.38960350262697e-05, + "loss": 1.241, + "num_input_tokens_seen": 83012560, + "step": 5159 + }, + { + "epoch": 0.36144894796290944, + "grad_norm": 3.4304730892181396, + "learning_rate": 6.38890367775832e-05, + "loss": 1.1008, + "num_input_tokens_seen": 83028680, + "step": 5160 + }, + { + "epoch": 0.3615189962086387, + "grad_norm": 9.87295913696289, + "learning_rate": 6.388203852889668e-05, + "loss": 1.0512, + "num_input_tokens_seen": 83045064, + "step": 5161 + }, + { + "epoch": 0.36158904445436796, + "grad_norm": 3.7000608444213867, + "learning_rate": 6.387504028021015e-05, + "loss": 1.0758, + "num_input_tokens_seen": 83061448, + "step": 5162 + }, + { + "epoch": 0.3616590927000972, + "grad_norm": 3.5490283966064453, + "learning_rate": 6.386804203152364e-05, + "loss": 0.9705, + "num_input_tokens_seen": 83077176, + "step": 5163 + }, + { + "epoch": 0.3617291409458264, + "grad_norm": 3.850770950317383, + "learning_rate": 6.386104378283714e-05, + "loss": 1.0371, + "num_input_tokens_seen": 83093560, + "step": 5164 + }, + { + "epoch": 0.3617991891915557, + "grad_norm": 5.09017276763916, + "learning_rate": 6.385404553415062e-05, + "loss": 1.0084, + "num_input_tokens_seen": 83109752, + "step": 5165 + }, + { + "epoch": 0.36186923743728494, + "grad_norm": 4.801665782928467, + "learning_rate": 6.38470472854641e-05, + "loss": 1.0909, + "num_input_tokens_seen": 83125048, + "step": 5166 + }, + { + "epoch": 0.3619392856830142, + "grad_norm": 3.954345941543579, + "learning_rate": 6.38400490367776e-05, + "loss": 0.9775, + "num_input_tokens_seen": 83140808, + "step": 5167 + }, + { + "epoch": 0.3620093339287434, + "grad_norm": 4.874080657958984, + "learning_rate": 6.383305078809107e-05, + "loss": 1.1408, + "num_input_tokens_seen": 83157176, + "step": 5168 + }, + { + "epoch": 0.36207938217447266, + "grad_norm": 4.3997111320495605, + "learning_rate": 6.382605253940454e-05, + "loss": 1.1489, + "num_input_tokens_seen": 83173560, + "step": 5169 + }, + { + "epoch": 0.3621494304202019, + "grad_norm": 4.431540489196777, + "learning_rate": 6.381905429071805e-05, + "loss": 1.1138, + "num_input_tokens_seen": 83189864, + "step": 5170 + }, + { + "epoch": 0.3622194786659312, + "grad_norm": 4.48107385635376, + "learning_rate": 6.381205604203153e-05, + "loss": 1.2451, + "num_input_tokens_seen": 83205560, + "step": 5171 + }, + { + "epoch": 0.3622895269116604, + "grad_norm": 4.369350910186768, + "learning_rate": 6.380505779334501e-05, + "loss": 1.0877, + "num_input_tokens_seen": 83221544, + "step": 5172 + }, + { + "epoch": 0.36235957515738965, + "grad_norm": 3.8510024547576904, + "learning_rate": 6.37980595446585e-05, + "loss": 0.8895, + "num_input_tokens_seen": 83237928, + "step": 5173 + }, + { + "epoch": 0.3624296234031189, + "grad_norm": 3.7452402114868164, + "learning_rate": 6.379106129597199e-05, + "loss": 1.1425, + "num_input_tokens_seen": 83254168, + "step": 5174 + }, + { + "epoch": 0.36249967164884817, + "grad_norm": 4.53076171875, + "learning_rate": 6.378406304728546e-05, + "loss": 1.1516, + "num_input_tokens_seen": 83269568, + "step": 5175 + }, + { + "epoch": 0.36256971989457737, + "grad_norm": 3.729602813720703, + "learning_rate": 6.377706479859895e-05, + "loss": 1.2105, + "num_input_tokens_seen": 83285952, + "step": 5176 + }, + { + "epoch": 0.36263976814030663, + "grad_norm": 4.085333824157715, + "learning_rate": 6.377006654991244e-05, + "loss": 1.0517, + "num_input_tokens_seen": 83302200, + "step": 5177 + }, + { + "epoch": 0.3627098163860359, + "grad_norm": 3.9202303886413574, + "learning_rate": 6.376306830122593e-05, + "loss": 1.0358, + "num_input_tokens_seen": 83318584, + "step": 5178 + }, + { + "epoch": 0.36277986463176515, + "grad_norm": 4.10648775100708, + "learning_rate": 6.37560700525394e-05, + "loss": 1.3052, + "num_input_tokens_seen": 83334288, + "step": 5179 + }, + { + "epoch": 0.36284991287749435, + "grad_norm": 3.975217580795288, + "learning_rate": 6.374907180385289e-05, + "loss": 1.1725, + "num_input_tokens_seen": 83350096, + "step": 5180 + }, + { + "epoch": 0.3629199611232236, + "grad_norm": 4.207096099853516, + "learning_rate": 6.374207355516638e-05, + "loss": 1.1396, + "num_input_tokens_seen": 83366480, + "step": 5181 + }, + { + "epoch": 0.36299000936895287, + "grad_norm": 3.9960830211639404, + "learning_rate": 6.373507530647986e-05, + "loss": 1.1971, + "num_input_tokens_seen": 83381832, + "step": 5182 + }, + { + "epoch": 0.36306005761468213, + "grad_norm": 4.142012596130371, + "learning_rate": 6.372807705779334e-05, + "loss": 1.0829, + "num_input_tokens_seen": 83398216, + "step": 5183 + }, + { + "epoch": 0.36313010586041133, + "grad_norm": 3.8692433834075928, + "learning_rate": 6.372107880910685e-05, + "loss": 1.0649, + "num_input_tokens_seen": 83414600, + "step": 5184 + }, + { + "epoch": 0.3632001541061406, + "grad_norm": 3.663544178009033, + "learning_rate": 6.371408056042032e-05, + "loss": 0.8924, + "num_input_tokens_seen": 83430984, + "step": 5185 + }, + { + "epoch": 0.36327020235186985, + "grad_norm": 4.056418418884277, + "learning_rate": 6.37070823117338e-05, + "loss": 0.9463, + "num_input_tokens_seen": 83447368, + "step": 5186 + }, + { + "epoch": 0.3633402505975991, + "grad_norm": 4.209747314453125, + "learning_rate": 6.37000840630473e-05, + "loss": 1.0641, + "num_input_tokens_seen": 83463752, + "step": 5187 + }, + { + "epoch": 0.3634102988433283, + "grad_norm": 4.93091344833374, + "learning_rate": 6.369308581436077e-05, + "loss": 1.2046, + "num_input_tokens_seen": 83479424, + "step": 5188 + }, + { + "epoch": 0.3634803470890576, + "grad_norm": 3.6523993015289307, + "learning_rate": 6.368608756567425e-05, + "loss": 0.8965, + "num_input_tokens_seen": 83495808, + "step": 5189 + }, + { + "epoch": 0.36355039533478684, + "grad_norm": 4.8949294090271, + "learning_rate": 6.367908931698775e-05, + "loss": 0.8928, + "num_input_tokens_seen": 83511448, + "step": 5190 + }, + { + "epoch": 0.3636204435805161, + "grad_norm": 5.856332778930664, + "learning_rate": 6.367209106830124e-05, + "loss": 0.9844, + "num_input_tokens_seen": 83526664, + "step": 5191 + }, + { + "epoch": 0.3636904918262453, + "grad_norm": 3.762014865875244, + "learning_rate": 6.366509281961471e-05, + "loss": 1.0865, + "num_input_tokens_seen": 83542792, + "step": 5192 + }, + { + "epoch": 0.36376054007197456, + "grad_norm": 4.075290203094482, + "learning_rate": 6.36580945709282e-05, + "loss": 1.0261, + "num_input_tokens_seen": 83558992, + "step": 5193 + }, + { + "epoch": 0.3638305883177038, + "grad_norm": 4.124780178070068, + "learning_rate": 6.365109632224169e-05, + "loss": 1.1021, + "num_input_tokens_seen": 83575376, + "step": 5194 + }, + { + "epoch": 0.3639006365634331, + "grad_norm": 6.1159210205078125, + "learning_rate": 6.364409807355517e-05, + "loss": 0.9209, + "num_input_tokens_seen": 83591400, + "step": 5195 + }, + { + "epoch": 0.36397068480916234, + "grad_norm": 3.8839027881622314, + "learning_rate": 6.363709982486865e-05, + "loss": 1.0866, + "num_input_tokens_seen": 83607784, + "step": 5196 + }, + { + "epoch": 0.36404073305489154, + "grad_norm": 4.260892391204834, + "learning_rate": 6.363010157618214e-05, + "loss": 1.0747, + "num_input_tokens_seen": 83623944, + "step": 5197 + }, + { + "epoch": 0.3641107813006208, + "grad_norm": 4.111022472381592, + "learning_rate": 6.362310332749563e-05, + "loss": 1.2594, + "num_input_tokens_seen": 83639408, + "step": 5198 + }, + { + "epoch": 0.36418082954635006, + "grad_norm": 3.567676305770874, + "learning_rate": 6.361610507880911e-05, + "loss": 1.0115, + "num_input_tokens_seen": 83655496, + "step": 5199 + }, + { + "epoch": 0.3642508777920793, + "grad_norm": 4.935754299163818, + "learning_rate": 6.36091068301226e-05, + "loss": 1.2028, + "num_input_tokens_seen": 83671016, + "step": 5200 + }, + { + "epoch": 0.3642508777920793, + "eval_loss": 1.129547119140625, + "eval_runtime": 0.1857, + "eval_samples_per_second": 5.386, + "eval_steps_per_second": 5.386, + "num_input_tokens_seen": 83671016, + "step": 5200 + }, + { + "epoch": 0.3643209260378085, + "grad_norm": 3.8546817302703857, + "learning_rate": 6.360210858143608e-05, + "loss": 0.9873, + "num_input_tokens_seen": 83685736, + "step": 5201 + }, + { + "epoch": 0.3643909742835378, + "grad_norm": 3.900425910949707, + "learning_rate": 6.359511033274956e-05, + "loss": 1.0005, + "num_input_tokens_seen": 83702120, + "step": 5202 + }, + { + "epoch": 0.36446102252926704, + "grad_norm": 4.270096302032471, + "learning_rate": 6.358811208406305e-05, + "loss": 0.9098, + "num_input_tokens_seen": 83718504, + "step": 5203 + }, + { + "epoch": 0.3645310707749963, + "grad_norm": 5.027628421783447, + "learning_rate": 6.358111383537655e-05, + "loss": 1.1363, + "num_input_tokens_seen": 83734888, + "step": 5204 + }, + { + "epoch": 0.3646011190207255, + "grad_norm": 4.843371868133545, + "learning_rate": 6.357411558669002e-05, + "loss": 0.9629, + "num_input_tokens_seen": 83749488, + "step": 5205 + }, + { + "epoch": 0.36467116726645477, + "grad_norm": 7.530435562133789, + "learning_rate": 6.35671173380035e-05, + "loss": 1.0575, + "num_input_tokens_seen": 83765872, + "step": 5206 + }, + { + "epoch": 0.364741215512184, + "grad_norm": 4.028171062469482, + "learning_rate": 6.356011908931699e-05, + "loss": 1.2011, + "num_input_tokens_seen": 83781936, + "step": 5207 + }, + { + "epoch": 0.3648112637579133, + "grad_norm": 6.744492053985596, + "learning_rate": 6.355312084063048e-05, + "loss": 1.0464, + "num_input_tokens_seen": 83797520, + "step": 5208 + }, + { + "epoch": 0.3648813120036425, + "grad_norm": 3.9689910411834717, + "learning_rate": 6.354612259194395e-05, + "loss": 1.0156, + "num_input_tokens_seen": 83813872, + "step": 5209 + }, + { + "epoch": 0.36495136024937175, + "grad_norm": 4.990142345428467, + "learning_rate": 6.353912434325745e-05, + "loss": 1.2019, + "num_input_tokens_seen": 83830256, + "step": 5210 + }, + { + "epoch": 0.365021408495101, + "grad_norm": 4.547253131866455, + "learning_rate": 6.353212609457094e-05, + "loss": 1.1825, + "num_input_tokens_seen": 83846640, + "step": 5211 + }, + { + "epoch": 0.36509145674083027, + "grad_norm": 4.108243465423584, + "learning_rate": 6.352512784588442e-05, + "loss": 1.1827, + "num_input_tokens_seen": 83863024, + "step": 5212 + }, + { + "epoch": 0.36516150498655947, + "grad_norm": 4.540827751159668, + "learning_rate": 6.351812959719789e-05, + "loss": 1.0034, + "num_input_tokens_seen": 83878976, + "step": 5213 + }, + { + "epoch": 0.36523155323228873, + "grad_norm": 5.3233842849731445, + "learning_rate": 6.35111313485114e-05, + "loss": 1.2247, + "num_input_tokens_seen": 83895360, + "step": 5214 + }, + { + "epoch": 0.365301601478018, + "grad_norm": 5.161661624908447, + "learning_rate": 6.350413309982487e-05, + "loss": 1.2067, + "num_input_tokens_seen": 83910064, + "step": 5215 + }, + { + "epoch": 0.36537164972374725, + "grad_norm": 4.908864498138428, + "learning_rate": 6.349713485113836e-05, + "loss": 1.1748, + "num_input_tokens_seen": 83926448, + "step": 5216 + }, + { + "epoch": 0.36544169796947645, + "grad_norm": 5.954193592071533, + "learning_rate": 6.349013660245185e-05, + "loss": 0.99, + "num_input_tokens_seen": 83942248, + "step": 5217 + }, + { + "epoch": 0.3655117462152057, + "grad_norm": 3.5276272296905518, + "learning_rate": 6.348313835376534e-05, + "loss": 0.9637, + "num_input_tokens_seen": 83958632, + "step": 5218 + }, + { + "epoch": 0.365581794460935, + "grad_norm": 3.736661195755005, + "learning_rate": 6.347614010507881e-05, + "loss": 1.059, + "num_input_tokens_seen": 83975016, + "step": 5219 + }, + { + "epoch": 0.36565184270666423, + "grad_norm": 5.434671401977539, + "learning_rate": 6.34691418563923e-05, + "loss": 1.0891, + "num_input_tokens_seen": 83990424, + "step": 5220 + }, + { + "epoch": 0.36572189095239344, + "grad_norm": 3.9301772117614746, + "learning_rate": 6.346214360770579e-05, + "loss": 1.0278, + "num_input_tokens_seen": 84006808, + "step": 5221 + }, + { + "epoch": 0.3657919391981227, + "grad_norm": 5.101827621459961, + "learning_rate": 6.345514535901926e-05, + "loss": 1.2129, + "num_input_tokens_seen": 84022624, + "step": 5222 + }, + { + "epoch": 0.36586198744385195, + "grad_norm": 4.042179584503174, + "learning_rate": 6.344814711033275e-05, + "loss": 1.2996, + "num_input_tokens_seen": 84038688, + "step": 5223 + }, + { + "epoch": 0.3659320356895812, + "grad_norm": 4.2309441566467285, + "learning_rate": 6.344114886164624e-05, + "loss": 1.1113, + "num_input_tokens_seen": 84055072, + "step": 5224 + }, + { + "epoch": 0.3660020839353104, + "grad_norm": 6.73452615737915, + "learning_rate": 6.343415061295973e-05, + "loss": 1.1523, + "num_input_tokens_seen": 84071456, + "step": 5225 + }, + { + "epoch": 0.3660721321810397, + "grad_norm": 3.684497833251953, + "learning_rate": 6.34271523642732e-05, + "loss": 1.0967, + "num_input_tokens_seen": 84087840, + "step": 5226 + }, + { + "epoch": 0.36614218042676894, + "grad_norm": 3.7974796295166016, + "learning_rate": 6.342015411558669e-05, + "loss": 1.0675, + "num_input_tokens_seen": 84103456, + "step": 5227 + }, + { + "epoch": 0.3662122286724982, + "grad_norm": 4.681473255157471, + "learning_rate": 6.341315586690018e-05, + "loss": 0.9202, + "num_input_tokens_seen": 84119840, + "step": 5228 + }, + { + "epoch": 0.36628227691822746, + "grad_norm": 4.197212219238281, + "learning_rate": 6.340615761821366e-05, + "loss": 0.9594, + "num_input_tokens_seen": 84136224, + "step": 5229 + }, + { + "epoch": 0.36635232516395666, + "grad_norm": 4.1414794921875, + "learning_rate": 6.339915936952716e-05, + "loss": 1.1421, + "num_input_tokens_seen": 84152552, + "step": 5230 + }, + { + "epoch": 0.3664223734096859, + "grad_norm": 4.138907432556152, + "learning_rate": 6.339216112084065e-05, + "loss": 1.0841, + "num_input_tokens_seen": 84168936, + "step": 5231 + }, + { + "epoch": 0.3664924216554152, + "grad_norm": 4.723425388336182, + "learning_rate": 6.338516287215412e-05, + "loss": 1.1302, + "num_input_tokens_seen": 84185320, + "step": 5232 + }, + { + "epoch": 0.36656246990114444, + "grad_norm": 4.167308330535889, + "learning_rate": 6.33781646234676e-05, + "loss": 1.2636, + "num_input_tokens_seen": 84201704, + "step": 5233 + }, + { + "epoch": 0.36663251814687364, + "grad_norm": 3.832829236984253, + "learning_rate": 6.337116637478109e-05, + "loss": 0.9078, + "num_input_tokens_seen": 84217216, + "step": 5234 + }, + { + "epoch": 0.3667025663926029, + "grad_norm": 6.1642842292785645, + "learning_rate": 6.336416812609457e-05, + "loss": 0.8102, + "num_input_tokens_seen": 84232896, + "step": 5235 + }, + { + "epoch": 0.36677261463833216, + "grad_norm": 3.948350429534912, + "learning_rate": 6.335716987740806e-05, + "loss": 1.1285, + "num_input_tokens_seen": 84248448, + "step": 5236 + }, + { + "epoch": 0.3668426628840614, + "grad_norm": 3.6216750144958496, + "learning_rate": 6.335017162872155e-05, + "loss": 1.022, + "num_input_tokens_seen": 84264832, + "step": 5237 + }, + { + "epoch": 0.3669127111297906, + "grad_norm": 5.787931442260742, + "learning_rate": 6.334317338003504e-05, + "loss": 1.1968, + "num_input_tokens_seen": 84281216, + "step": 5238 + }, + { + "epoch": 0.3669827593755199, + "grad_norm": 4.830391883850098, + "learning_rate": 6.333617513134851e-05, + "loss": 1.3014, + "num_input_tokens_seen": 84297352, + "step": 5239 + }, + { + "epoch": 0.36705280762124914, + "grad_norm": 3.839425563812256, + "learning_rate": 6.332917688266199e-05, + "loss": 1.031, + "num_input_tokens_seen": 84313608, + "step": 5240 + }, + { + "epoch": 0.3671228558669784, + "grad_norm": 3.963012456893921, + "learning_rate": 6.332217863397549e-05, + "loss": 1.0232, + "num_input_tokens_seen": 84329680, + "step": 5241 + }, + { + "epoch": 0.3671929041127076, + "grad_norm": 3.4596047401428223, + "learning_rate": 6.331518038528897e-05, + "loss": 1.0028, + "num_input_tokens_seen": 84346064, + "step": 5242 + }, + { + "epoch": 0.36726295235843687, + "grad_norm": 5.7928290367126465, + "learning_rate": 6.330818213660246e-05, + "loss": 1.2292, + "num_input_tokens_seen": 84361800, + "step": 5243 + }, + { + "epoch": 0.3673330006041661, + "grad_norm": 3.5012640953063965, + "learning_rate": 6.330118388791594e-05, + "loss": 1.0095, + "num_input_tokens_seen": 84378184, + "step": 5244 + }, + { + "epoch": 0.3674030488498954, + "grad_norm": 4.464978218078613, + "learning_rate": 6.329418563922943e-05, + "loss": 1.2258, + "num_input_tokens_seen": 84394568, + "step": 5245 + }, + { + "epoch": 0.3674730970956246, + "grad_norm": 3.4716012477874756, + "learning_rate": 6.328718739054291e-05, + "loss": 1.087, + "num_input_tokens_seen": 84410584, + "step": 5246 + }, + { + "epoch": 0.36754314534135385, + "grad_norm": 4.010568618774414, + "learning_rate": 6.32801891418564e-05, + "loss": 1.0823, + "num_input_tokens_seen": 84426968, + "step": 5247 + }, + { + "epoch": 0.3676131935870831, + "grad_norm": 3.763718605041504, + "learning_rate": 6.327319089316989e-05, + "loss": 1.079, + "num_input_tokens_seen": 84443352, + "step": 5248 + }, + { + "epoch": 0.36768324183281237, + "grad_norm": 5.381477355957031, + "learning_rate": 6.326619264448336e-05, + "loss": 1.0387, + "num_input_tokens_seen": 84459736, + "step": 5249 + }, + { + "epoch": 0.3677532900785416, + "grad_norm": 3.6646018028259277, + "learning_rate": 6.325919439579685e-05, + "loss": 1.026, + "num_input_tokens_seen": 84476120, + "step": 5250 + }, + { + "epoch": 0.36782333832427083, + "grad_norm": 4.005465507507324, + "learning_rate": 6.325219614711034e-05, + "loss": 1.0341, + "num_input_tokens_seen": 84492400, + "step": 5251 + }, + { + "epoch": 0.3678933865700001, + "grad_norm": 3.4287807941436768, + "learning_rate": 6.324519789842383e-05, + "loss": 0.9892, + "num_input_tokens_seen": 84508720, + "step": 5252 + }, + { + "epoch": 0.36796343481572935, + "grad_norm": 3.8715076446533203, + "learning_rate": 6.32381996497373e-05, + "loss": 1.3025, + "num_input_tokens_seen": 84524592, + "step": 5253 + }, + { + "epoch": 0.36803348306145856, + "grad_norm": 3.4789586067199707, + "learning_rate": 6.323120140105079e-05, + "loss": 0.9109, + "num_input_tokens_seen": 84540176, + "step": 5254 + }, + { + "epoch": 0.3681035313071878, + "grad_norm": 3.992988348007202, + "learning_rate": 6.322420315236428e-05, + "loss": 1.138, + "num_input_tokens_seen": 84556560, + "step": 5255 + }, + { + "epoch": 0.3681735795529171, + "grad_norm": 4.3957743644714355, + "learning_rate": 6.321720490367775e-05, + "loss": 1.2542, + "num_input_tokens_seen": 84572240, + "step": 5256 + }, + { + "epoch": 0.36824362779864633, + "grad_norm": 3.7909469604492188, + "learning_rate": 6.321020665499126e-05, + "loss": 0.9282, + "num_input_tokens_seen": 84587400, + "step": 5257 + }, + { + "epoch": 0.36831367604437554, + "grad_norm": 3.747345209121704, + "learning_rate": 6.320320840630474e-05, + "loss": 0.9673, + "num_input_tokens_seen": 84603240, + "step": 5258 + }, + { + "epoch": 0.3683837242901048, + "grad_norm": 3.6753249168395996, + "learning_rate": 6.319621015761822e-05, + "loss": 1.0435, + "num_input_tokens_seen": 84619624, + "step": 5259 + }, + { + "epoch": 0.36845377253583406, + "grad_norm": 3.6952924728393555, + "learning_rate": 6.31892119089317e-05, + "loss": 1.0577, + "num_input_tokens_seen": 84636008, + "step": 5260 + }, + { + "epoch": 0.3685238207815633, + "grad_norm": 4.606325149536133, + "learning_rate": 6.318221366024518e-05, + "loss": 1.0212, + "num_input_tokens_seen": 84652392, + "step": 5261 + }, + { + "epoch": 0.3685938690272925, + "grad_norm": 3.749755382537842, + "learning_rate": 6.317521541155867e-05, + "loss": 1.0378, + "num_input_tokens_seen": 84667832, + "step": 5262 + }, + { + "epoch": 0.3686639172730218, + "grad_norm": 3.7973029613494873, + "learning_rate": 6.316821716287216e-05, + "loss": 1.1695, + "num_input_tokens_seen": 84683904, + "step": 5263 + }, + { + "epoch": 0.36873396551875104, + "grad_norm": 4.264857769012451, + "learning_rate": 6.316121891418565e-05, + "loss": 1.0638, + "num_input_tokens_seen": 84700288, + "step": 5264 + }, + { + "epoch": 0.3688040137644803, + "grad_norm": 3.4577653408050537, + "learning_rate": 6.315422066549914e-05, + "loss": 1.0037, + "num_input_tokens_seen": 84716672, + "step": 5265 + }, + { + "epoch": 0.36887406201020956, + "grad_norm": 4.049471378326416, + "learning_rate": 6.314722241681261e-05, + "loss": 1.0595, + "num_input_tokens_seen": 84732976, + "step": 5266 + }, + { + "epoch": 0.36894411025593876, + "grad_norm": 4.293907165527344, + "learning_rate": 6.314022416812609e-05, + "loss": 1.1094, + "num_input_tokens_seen": 84747480, + "step": 5267 + }, + { + "epoch": 0.369014158501668, + "grad_norm": 7.115272045135498, + "learning_rate": 6.313322591943959e-05, + "loss": 0.9904, + "num_input_tokens_seen": 84763864, + "step": 5268 + }, + { + "epoch": 0.3690842067473973, + "grad_norm": 6.85962438583374, + "learning_rate": 6.312622767075306e-05, + "loss": 0.9934, + "num_input_tokens_seen": 84778648, + "step": 5269 + }, + { + "epoch": 0.36915425499312654, + "grad_norm": 4.24301290512085, + "learning_rate": 6.311922942206655e-05, + "loss": 1.0426, + "num_input_tokens_seen": 84794440, + "step": 5270 + }, + { + "epoch": 0.36922430323885574, + "grad_norm": 3.533189535140991, + "learning_rate": 6.311223117338004e-05, + "loss": 0.9863, + "num_input_tokens_seen": 84810824, + "step": 5271 + }, + { + "epoch": 0.369294351484585, + "grad_norm": 4.706559658050537, + "learning_rate": 6.310523292469353e-05, + "loss": 1.2352, + "num_input_tokens_seen": 84827208, + "step": 5272 + }, + { + "epoch": 0.36936439973031426, + "grad_norm": 3.492366075515747, + "learning_rate": 6.3098234676007e-05, + "loss": 0.9802, + "num_input_tokens_seen": 84842744, + "step": 5273 + }, + { + "epoch": 0.3694344479760435, + "grad_norm": 4.733495712280273, + "learning_rate": 6.30912364273205e-05, + "loss": 1.1914, + "num_input_tokens_seen": 84858432, + "step": 5274 + }, + { + "epoch": 0.3695044962217727, + "grad_norm": 3.6145412921905518, + "learning_rate": 6.308423817863398e-05, + "loss": 1.0411, + "num_input_tokens_seen": 84874496, + "step": 5275 + }, + { + "epoch": 0.369574544467502, + "grad_norm": 3.764568328857422, + "learning_rate": 6.307723992994746e-05, + "loss": 1.1604, + "num_input_tokens_seen": 84890880, + "step": 5276 + }, + { + "epoch": 0.36964459271323125, + "grad_norm": 5.0368428230285645, + "learning_rate": 6.307024168126095e-05, + "loss": 1.0006, + "num_input_tokens_seen": 84907264, + "step": 5277 + }, + { + "epoch": 0.3697146409589605, + "grad_norm": 3.9158520698547363, + "learning_rate": 6.306324343257443e-05, + "loss": 1.0409, + "num_input_tokens_seen": 84923648, + "step": 5278 + }, + { + "epoch": 0.3697846892046897, + "grad_norm": 4.663973808288574, + "learning_rate": 6.305624518388792e-05, + "loss": 0.9818, + "num_input_tokens_seen": 84939976, + "step": 5279 + }, + { + "epoch": 0.36985473745041897, + "grad_norm": 4.3741455078125, + "learning_rate": 6.30492469352014e-05, + "loss": 1.2902, + "num_input_tokens_seen": 84956184, + "step": 5280 + }, + { + "epoch": 0.36992478569614823, + "grad_norm": 5.071192264556885, + "learning_rate": 6.304224868651489e-05, + "loss": 1.0856, + "num_input_tokens_seen": 84972024, + "step": 5281 + }, + { + "epoch": 0.3699948339418775, + "grad_norm": 3.5479323863983154, + "learning_rate": 6.303525043782838e-05, + "loss": 1.0809, + "num_input_tokens_seen": 84988408, + "step": 5282 + }, + { + "epoch": 0.3700648821876067, + "grad_norm": 4.6933465003967285, + "learning_rate": 6.302825218914186e-05, + "loss": 1.1826, + "num_input_tokens_seen": 85004720, + "step": 5283 + }, + { + "epoch": 0.37013493043333595, + "grad_norm": 3.594067096710205, + "learning_rate": 6.302125394045535e-05, + "loss": 0.8992, + "num_input_tokens_seen": 85020456, + "step": 5284 + }, + { + "epoch": 0.3702049786790652, + "grad_norm": 3.972480535507202, + "learning_rate": 6.301425569176884e-05, + "loss": 1.09, + "num_input_tokens_seen": 85036840, + "step": 5285 + }, + { + "epoch": 0.37027502692479447, + "grad_norm": 4.674763202667236, + "learning_rate": 6.300725744308232e-05, + "loss": 1.029, + "num_input_tokens_seen": 85053224, + "step": 5286 + }, + { + "epoch": 0.3703450751705237, + "grad_norm": 4.716235160827637, + "learning_rate": 6.300025919439579e-05, + "loss": 0.9872, + "num_input_tokens_seen": 85068624, + "step": 5287 + }, + { + "epoch": 0.37041512341625293, + "grad_norm": 5.01246452331543, + "learning_rate": 6.299326094570928e-05, + "loss": 0.9608, + "num_input_tokens_seen": 85085008, + "step": 5288 + }, + { + "epoch": 0.3704851716619822, + "grad_norm": 5.020605087280273, + "learning_rate": 6.298626269702277e-05, + "loss": 0.9759, + "num_input_tokens_seen": 85101392, + "step": 5289 + }, + { + "epoch": 0.37055521990771145, + "grad_norm": 5.841190814971924, + "learning_rate": 6.297926444833626e-05, + "loss": 1.3302, + "num_input_tokens_seen": 85117776, + "step": 5290 + }, + { + "epoch": 0.37062526815344066, + "grad_norm": 4.592007637023926, + "learning_rate": 6.297226619964975e-05, + "loss": 0.9129, + "num_input_tokens_seen": 85134160, + "step": 5291 + }, + { + "epoch": 0.3706953163991699, + "grad_norm": 3.678398609161377, + "learning_rate": 6.296526795096323e-05, + "loss": 0.9809, + "num_input_tokens_seen": 85150544, + "step": 5292 + }, + { + "epoch": 0.3707653646448992, + "grad_norm": 3.9148921966552734, + "learning_rate": 6.295826970227671e-05, + "loss": 1.1459, + "num_input_tokens_seen": 85166208, + "step": 5293 + }, + { + "epoch": 0.37083541289062844, + "grad_norm": 3.83375883102417, + "learning_rate": 6.295127145359018e-05, + "loss": 1.1273, + "num_input_tokens_seen": 85182592, + "step": 5294 + }, + { + "epoch": 0.37090546113635764, + "grad_norm": 6.339621067047119, + "learning_rate": 6.294427320490369e-05, + "loss": 1.0995, + "num_input_tokens_seen": 85197512, + "step": 5295 + }, + { + "epoch": 0.3709755093820869, + "grad_norm": 3.931565046310425, + "learning_rate": 6.293727495621716e-05, + "loss": 0.9326, + "num_input_tokens_seen": 85213800, + "step": 5296 + }, + { + "epoch": 0.37104555762781616, + "grad_norm": 4.46995210647583, + "learning_rate": 6.293027670753065e-05, + "loss": 1.0782, + "num_input_tokens_seen": 85229528, + "step": 5297 + }, + { + "epoch": 0.3711156058735454, + "grad_norm": 4.4390363693237305, + "learning_rate": 6.292327845884414e-05, + "loss": 1.1976, + "num_input_tokens_seen": 85245912, + "step": 5298 + }, + { + "epoch": 0.3711856541192747, + "grad_norm": 4.089926719665527, + "learning_rate": 6.291628021015763e-05, + "loss": 1.037, + "num_input_tokens_seen": 85262296, + "step": 5299 + }, + { + "epoch": 0.3712557023650039, + "grad_norm": 4.190539360046387, + "learning_rate": 6.29092819614711e-05, + "loss": 1.1928, + "num_input_tokens_seen": 85278560, + "step": 5300 + }, + { + "epoch": 0.37132575061073314, + "grad_norm": 5.1102166175842285, + "learning_rate": 6.290228371278459e-05, + "loss": 0.8734, + "num_input_tokens_seen": 85294944, + "step": 5301 + }, + { + "epoch": 0.3713957988564624, + "grad_norm": 4.174960136413574, + "learning_rate": 6.289528546409808e-05, + "loss": 1.0425, + "num_input_tokens_seen": 85311328, + "step": 5302 + }, + { + "epoch": 0.37146584710219166, + "grad_norm": 3.8785698413848877, + "learning_rate": 6.288828721541157e-05, + "loss": 1.0008, + "num_input_tokens_seen": 85326784, + "step": 5303 + }, + { + "epoch": 0.37153589534792086, + "grad_norm": 3.728626251220703, + "learning_rate": 6.288128896672504e-05, + "loss": 1.1116, + "num_input_tokens_seen": 85343168, + "step": 5304 + }, + { + "epoch": 0.3716059435936501, + "grad_norm": 5.1877312660217285, + "learning_rate": 6.287429071803853e-05, + "loss": 1.0917, + "num_input_tokens_seen": 85359552, + "step": 5305 + }, + { + "epoch": 0.3716759918393794, + "grad_norm": 5.751648902893066, + "learning_rate": 6.286729246935202e-05, + "loss": 1.2662, + "num_input_tokens_seen": 85375136, + "step": 5306 + }, + { + "epoch": 0.37174604008510864, + "grad_norm": 3.7917258739471436, + "learning_rate": 6.28602942206655e-05, + "loss": 0.8499, + "num_input_tokens_seen": 85391520, + "step": 5307 + }, + { + "epoch": 0.37181608833083785, + "grad_norm": 4.268946647644043, + "learning_rate": 6.285329597197898e-05, + "loss": 1.0928, + "num_input_tokens_seen": 85406848, + "step": 5308 + }, + { + "epoch": 0.3718861365765671, + "grad_norm": 4.350981712341309, + "learning_rate": 6.284629772329247e-05, + "loss": 1.1725, + "num_input_tokens_seen": 85423232, + "step": 5309 + }, + { + "epoch": 0.37195618482229637, + "grad_norm": 3.8072032928466797, + "learning_rate": 6.283929947460596e-05, + "loss": 0.9999, + "num_input_tokens_seen": 85439616, + "step": 5310 + }, + { + "epoch": 0.3720262330680256, + "grad_norm": 4.0531697273254395, + "learning_rate": 6.283230122591945e-05, + "loss": 0.9389, + "num_input_tokens_seen": 85456000, + "step": 5311 + }, + { + "epoch": 0.37209628131375483, + "grad_norm": 5.18675422668457, + "learning_rate": 6.282530297723294e-05, + "loss": 1.0504, + "num_input_tokens_seen": 85472384, + "step": 5312 + }, + { + "epoch": 0.3721663295594841, + "grad_norm": 4.675386428833008, + "learning_rate": 6.281830472854641e-05, + "loss": 0.8373, + "num_input_tokens_seen": 85488544, + "step": 5313 + }, + { + "epoch": 0.37223637780521335, + "grad_norm": 6.522333145141602, + "learning_rate": 6.281130647985989e-05, + "loss": 0.9685, + "num_input_tokens_seen": 85504352, + "step": 5314 + }, + { + "epoch": 0.3723064260509426, + "grad_norm": 3.9266233444213867, + "learning_rate": 6.280430823117338e-05, + "loss": 1.0443, + "num_input_tokens_seen": 85520688, + "step": 5315 + }, + { + "epoch": 0.3723764742966718, + "grad_norm": 4.6428093910217285, + "learning_rate": 6.279730998248687e-05, + "loss": 0.9396, + "num_input_tokens_seen": 85537072, + "step": 5316 + }, + { + "epoch": 0.37244652254240107, + "grad_norm": 3.6043691635131836, + "learning_rate": 6.279031173380035e-05, + "loss": 0.903, + "num_input_tokens_seen": 85553456, + "step": 5317 + }, + { + "epoch": 0.37251657078813033, + "grad_norm": 3.4878151416778564, + "learning_rate": 6.278331348511384e-05, + "loss": 1.101, + "num_input_tokens_seen": 85569824, + "step": 5318 + }, + { + "epoch": 0.3725866190338596, + "grad_norm": 4.275106906890869, + "learning_rate": 6.277631523642733e-05, + "loss": 0.8912, + "num_input_tokens_seen": 85586208, + "step": 5319 + }, + { + "epoch": 0.3726566672795888, + "grad_norm": 7.615388870239258, + "learning_rate": 6.27693169877408e-05, + "loss": 1.0786, + "num_input_tokens_seen": 85600984, + "step": 5320 + }, + { + "epoch": 0.37272671552531805, + "grad_norm": 4.4750752449035645, + "learning_rate": 6.276231873905428e-05, + "loss": 1.1369, + "num_input_tokens_seen": 85617368, + "step": 5321 + }, + { + "epoch": 0.3727967637710473, + "grad_norm": 3.7900373935699463, + "learning_rate": 6.275532049036778e-05, + "loss": 1.0727, + "num_input_tokens_seen": 85633304, + "step": 5322 + }, + { + "epoch": 0.37286681201677657, + "grad_norm": 8.58016300201416, + "learning_rate": 6.274832224168127e-05, + "loss": 1.0942, + "num_input_tokens_seen": 85648592, + "step": 5323 + }, + { + "epoch": 0.3729368602625058, + "grad_norm": 3.847476005554199, + "learning_rate": 6.274132399299475e-05, + "loss": 1.2543, + "num_input_tokens_seen": 85664976, + "step": 5324 + }, + { + "epoch": 0.37300690850823504, + "grad_norm": 3.68683123588562, + "learning_rate": 6.273432574430824e-05, + "loss": 1.1331, + "num_input_tokens_seen": 85681360, + "step": 5325 + }, + { + "epoch": 0.3730769567539643, + "grad_norm": 4.07316255569458, + "learning_rate": 6.272732749562172e-05, + "loss": 1.1859, + "num_input_tokens_seen": 85697744, + "step": 5326 + }, + { + "epoch": 0.37314700499969355, + "grad_norm": 3.7817749977111816, + "learning_rate": 6.27203292469352e-05, + "loss": 1.128, + "num_input_tokens_seen": 85713680, + "step": 5327 + }, + { + "epoch": 0.37321705324542276, + "grad_norm": 3.8322465419769287, + "learning_rate": 6.271333099824869e-05, + "loss": 1.1804, + "num_input_tokens_seen": 85730064, + "step": 5328 + }, + { + "epoch": 0.373287101491152, + "grad_norm": 5.689653396606445, + "learning_rate": 6.270633274956218e-05, + "loss": 1.0848, + "num_input_tokens_seen": 85745904, + "step": 5329 + }, + { + "epoch": 0.3733571497368813, + "grad_norm": 5.568809509277344, + "learning_rate": 6.269933450087566e-05, + "loss": 0.9887, + "num_input_tokens_seen": 85762288, + "step": 5330 + }, + { + "epoch": 0.37342719798261054, + "grad_norm": 3.982375383377075, + "learning_rate": 6.269233625218914e-05, + "loss": 0.9975, + "num_input_tokens_seen": 85778672, + "step": 5331 + }, + { + "epoch": 0.3734972462283398, + "grad_norm": 3.430204391479492, + "learning_rate": 6.268533800350263e-05, + "loss": 1.0241, + "num_input_tokens_seen": 85795056, + "step": 5332 + }, + { + "epoch": 0.373567294474069, + "grad_norm": 3.465724229812622, + "learning_rate": 6.267833975481612e-05, + "loss": 0.9229, + "num_input_tokens_seen": 85811392, + "step": 5333 + }, + { + "epoch": 0.37363734271979826, + "grad_norm": 3.837188482284546, + "learning_rate": 6.267134150612959e-05, + "loss": 1.1354, + "num_input_tokens_seen": 85827016, + "step": 5334 + }, + { + "epoch": 0.3737073909655275, + "grad_norm": 7.360764980316162, + "learning_rate": 6.266434325744308e-05, + "loss": 1.0209, + "num_input_tokens_seen": 85842040, + "step": 5335 + }, + { + "epoch": 0.3737774392112568, + "grad_norm": 3.567553997039795, + "learning_rate": 6.265734500875657e-05, + "loss": 1.0502, + "num_input_tokens_seen": 85858424, + "step": 5336 + }, + { + "epoch": 0.373847487456986, + "grad_norm": 4.564986705780029, + "learning_rate": 6.265034676007006e-05, + "loss": 1.0178, + "num_input_tokens_seen": 85874808, + "step": 5337 + }, + { + "epoch": 0.37391753570271524, + "grad_norm": 3.4568405151367188, + "learning_rate": 6.264334851138355e-05, + "loss": 0.9245, + "num_input_tokens_seen": 85890672, + "step": 5338 + }, + { + "epoch": 0.3739875839484445, + "grad_norm": 3.723557233810425, + "learning_rate": 6.263635026269704e-05, + "loss": 1.0175, + "num_input_tokens_seen": 85906920, + "step": 5339 + }, + { + "epoch": 0.37405763219417376, + "grad_norm": 3.5800676345825195, + "learning_rate": 6.262935201401051e-05, + "loss": 0.9726, + "num_input_tokens_seen": 85923304, + "step": 5340 + }, + { + "epoch": 0.37412768043990297, + "grad_norm": 3.8996667861938477, + "learning_rate": 6.262235376532399e-05, + "loss": 1.2368, + "num_input_tokens_seen": 85938984, + "step": 5341 + }, + { + "epoch": 0.3741977286856322, + "grad_norm": 3.417182207107544, + "learning_rate": 6.261535551663747e-05, + "loss": 1.0959, + "num_input_tokens_seen": 85955368, + "step": 5342 + }, + { + "epoch": 0.3742677769313615, + "grad_norm": 4.214803695678711, + "learning_rate": 6.260835726795098e-05, + "loss": 1.1107, + "num_input_tokens_seen": 85971320, + "step": 5343 + }, + { + "epoch": 0.37433782517709074, + "grad_norm": 3.7782840728759766, + "learning_rate": 6.260135901926445e-05, + "loss": 0.9455, + "num_input_tokens_seen": 85987704, + "step": 5344 + }, + { + "epoch": 0.37440787342281995, + "grad_norm": 3.6186842918395996, + "learning_rate": 6.259436077057794e-05, + "loss": 1.0682, + "num_input_tokens_seen": 86004088, + "step": 5345 + }, + { + "epoch": 0.3744779216685492, + "grad_norm": 4.2028913497924805, + "learning_rate": 6.258736252189143e-05, + "loss": 1.2203, + "num_input_tokens_seen": 86020472, + "step": 5346 + }, + { + "epoch": 0.37454796991427847, + "grad_norm": 4.17422342300415, + "learning_rate": 6.25803642732049e-05, + "loss": 1.2483, + "num_input_tokens_seen": 86036856, + "step": 5347 + }, + { + "epoch": 0.3746180181600077, + "grad_norm": 3.3578243255615234, + "learning_rate": 6.257336602451838e-05, + "loss": 1.0315, + "num_input_tokens_seen": 86053224, + "step": 5348 + }, + { + "epoch": 0.37468806640573693, + "grad_norm": 4.105921268463135, + "learning_rate": 6.256636777583188e-05, + "loss": 1.0552, + "num_input_tokens_seen": 86069272, + "step": 5349 + }, + { + "epoch": 0.3747581146514662, + "grad_norm": 3.7420692443847656, + "learning_rate": 6.255936952714537e-05, + "loss": 1.0672, + "num_input_tokens_seen": 86085656, + "step": 5350 + }, + { + "epoch": 0.37482816289719545, + "grad_norm": 5.1573872566223145, + "learning_rate": 6.255237127845884e-05, + "loss": 1.376, + "num_input_tokens_seen": 86102040, + "step": 5351 + }, + { + "epoch": 0.3748982111429247, + "grad_norm": 3.9844436645507812, + "learning_rate": 6.254537302977233e-05, + "loss": 1.0042, + "num_input_tokens_seen": 86117976, + "step": 5352 + }, + { + "epoch": 0.3749682593886539, + "grad_norm": 3.6582653522491455, + "learning_rate": 6.253837478108582e-05, + "loss": 0.9786, + "num_input_tokens_seen": 86134360, + "step": 5353 + }, + { + "epoch": 0.3750383076343832, + "grad_norm": 4.814766883850098, + "learning_rate": 6.25313765323993e-05, + "loss": 1.2574, + "num_input_tokens_seen": 86150208, + "step": 5354 + }, + { + "epoch": 0.37510835588011243, + "grad_norm": 4.7514262199401855, + "learning_rate": 6.252437828371278e-05, + "loss": 1.071, + "num_input_tokens_seen": 86165672, + "step": 5355 + }, + { + "epoch": 0.3751784041258417, + "grad_norm": 3.9450578689575195, + "learning_rate": 6.251738003502627e-05, + "loss": 1.1295, + "num_input_tokens_seen": 86182056, + "step": 5356 + }, + { + "epoch": 0.3752484523715709, + "grad_norm": 3.5215647220611572, + "learning_rate": 6.251038178633976e-05, + "loss": 1.04, + "num_input_tokens_seen": 86198440, + "step": 5357 + }, + { + "epoch": 0.37531850061730015, + "grad_norm": 3.805070161819458, + "learning_rate": 6.250338353765324e-05, + "loss": 1.036, + "num_input_tokens_seen": 86214824, + "step": 5358 + }, + { + "epoch": 0.3753885488630294, + "grad_norm": 4.033730983734131, + "learning_rate": 6.249638528896673e-05, + "loss": 1.092, + "num_input_tokens_seen": 86231208, + "step": 5359 + }, + { + "epoch": 0.3754585971087587, + "grad_norm": 3.8157355785369873, + "learning_rate": 6.248938704028021e-05, + "loss": 1.0032, + "num_input_tokens_seen": 86247392, + "step": 5360 + }, + { + "epoch": 0.3755286453544879, + "grad_norm": 4.832013130187988, + "learning_rate": 6.248238879159369e-05, + "loss": 1.0711, + "num_input_tokens_seen": 86263776, + "step": 5361 + }, + { + "epoch": 0.37559869360021714, + "grad_norm": 3.753471612930298, + "learning_rate": 6.247539054290718e-05, + "loss": 1.0532, + "num_input_tokens_seen": 86279912, + "step": 5362 + }, + { + "epoch": 0.3756687418459464, + "grad_norm": 8.569518089294434, + "learning_rate": 6.246839229422068e-05, + "loss": 1.1073, + "num_input_tokens_seen": 86296296, + "step": 5363 + }, + { + "epoch": 0.37573879009167566, + "grad_norm": 4.399802207946777, + "learning_rate": 6.246139404553416e-05, + "loss": 1.1484, + "num_input_tokens_seen": 86312680, + "step": 5364 + }, + { + "epoch": 0.37580883833740486, + "grad_norm": 4.230834484100342, + "learning_rate": 6.245439579684764e-05, + "loss": 1.0905, + "num_input_tokens_seen": 86329064, + "step": 5365 + }, + { + "epoch": 0.3758788865831341, + "grad_norm": 4.750765800476074, + "learning_rate": 6.244739754816113e-05, + "loss": 1.2126, + "num_input_tokens_seen": 86345448, + "step": 5366 + }, + { + "epoch": 0.3759489348288634, + "grad_norm": 6.567142963409424, + "learning_rate": 6.244039929947461e-05, + "loss": 1.314, + "num_input_tokens_seen": 86361272, + "step": 5367 + }, + { + "epoch": 0.37601898307459264, + "grad_norm": 3.9668781757354736, + "learning_rate": 6.243340105078808e-05, + "loss": 1.0427, + "num_input_tokens_seen": 86377448, + "step": 5368 + }, + { + "epoch": 0.3760890313203219, + "grad_norm": 4.619864463806152, + "learning_rate": 6.242640280210158e-05, + "loss": 1.0687, + "num_input_tokens_seen": 86393600, + "step": 5369 + }, + { + "epoch": 0.3761590795660511, + "grad_norm": 6.837228298187256, + "learning_rate": 6.241940455341507e-05, + "loss": 0.9225, + "num_input_tokens_seen": 86409896, + "step": 5370 + }, + { + "epoch": 0.37622912781178036, + "grad_norm": 4.634070873260498, + "learning_rate": 6.241240630472855e-05, + "loss": 1.0147, + "num_input_tokens_seen": 86426280, + "step": 5371 + }, + { + "epoch": 0.3762991760575096, + "grad_norm": 3.944580554962158, + "learning_rate": 6.240540805604204e-05, + "loss": 1.144, + "num_input_tokens_seen": 86442640, + "step": 5372 + }, + { + "epoch": 0.3763692243032389, + "grad_norm": 7.016427516937256, + "learning_rate": 6.239840980735553e-05, + "loss": 1.0016, + "num_input_tokens_seen": 86459024, + "step": 5373 + }, + { + "epoch": 0.3764392725489681, + "grad_norm": 3.9997384548187256, + "learning_rate": 6.2391411558669e-05, + "loss": 0.9382, + "num_input_tokens_seen": 86475408, + "step": 5374 + }, + { + "epoch": 0.37650932079469734, + "grad_norm": 4.016181945800781, + "learning_rate": 6.238441330998249e-05, + "loss": 1.1728, + "num_input_tokens_seen": 86491680, + "step": 5375 + }, + { + "epoch": 0.3765793690404266, + "grad_norm": 4.19748592376709, + "learning_rate": 6.237741506129598e-05, + "loss": 1.161, + "num_input_tokens_seen": 86507768, + "step": 5376 + }, + { + "epoch": 0.37664941728615586, + "grad_norm": 4.579540252685547, + "learning_rate": 6.237041681260947e-05, + "loss": 1.0014, + "num_input_tokens_seen": 86524040, + "step": 5377 + }, + { + "epoch": 0.37671946553188507, + "grad_norm": 3.784952402114868, + "learning_rate": 6.236341856392294e-05, + "loss": 1.0435, + "num_input_tokens_seen": 86540424, + "step": 5378 + }, + { + "epoch": 0.3767895137776143, + "grad_norm": 5.813356876373291, + "learning_rate": 6.235642031523643e-05, + "loss": 0.9772, + "num_input_tokens_seen": 86556360, + "step": 5379 + }, + { + "epoch": 0.3768595620233436, + "grad_norm": 4.314088344573975, + "learning_rate": 6.234942206654992e-05, + "loss": 1.2318, + "num_input_tokens_seen": 86572744, + "step": 5380 + }, + { + "epoch": 0.37692961026907285, + "grad_norm": 3.898298740386963, + "learning_rate": 6.23424238178634e-05, + "loss": 1.1217, + "num_input_tokens_seen": 86588888, + "step": 5381 + }, + { + "epoch": 0.37699965851480205, + "grad_norm": 3.514692544937134, + "learning_rate": 6.233542556917688e-05, + "loss": 0.9526, + "num_input_tokens_seen": 86605272, + "step": 5382 + }, + { + "epoch": 0.3770697067605313, + "grad_norm": 3.7073886394500732, + "learning_rate": 6.232842732049038e-05, + "loss": 1.1199, + "num_input_tokens_seen": 86621656, + "step": 5383 + }, + { + "epoch": 0.37713975500626057, + "grad_norm": 3.9826815128326416, + "learning_rate": 6.232142907180386e-05, + "loss": 1.1417, + "num_input_tokens_seen": 86638040, + "step": 5384 + }, + { + "epoch": 0.37720980325198983, + "grad_norm": 3.6563196182250977, + "learning_rate": 6.231443082311733e-05, + "loss": 0.888, + "num_input_tokens_seen": 86654424, + "step": 5385 + }, + { + "epoch": 0.37727985149771903, + "grad_norm": 3.5995571613311768, + "learning_rate": 6.230743257443082e-05, + "loss": 1.0457, + "num_input_tokens_seen": 86670328, + "step": 5386 + }, + { + "epoch": 0.3773498997434483, + "grad_norm": 4.254338264465332, + "learning_rate": 6.230043432574431e-05, + "loss": 0.94, + "num_input_tokens_seen": 86685960, + "step": 5387 + }, + { + "epoch": 0.37741994798917755, + "grad_norm": 3.689716100692749, + "learning_rate": 6.229343607705779e-05, + "loss": 0.81, + "num_input_tokens_seen": 86702008, + "step": 5388 + }, + { + "epoch": 0.3774899962349068, + "grad_norm": 3.4042210578918457, + "learning_rate": 6.228643782837129e-05, + "loss": 1.0077, + "num_input_tokens_seen": 86718392, + "step": 5389 + }, + { + "epoch": 0.377560044480636, + "grad_norm": 4.607806205749512, + "learning_rate": 6.227943957968478e-05, + "loss": 1.2891, + "num_input_tokens_seen": 86734624, + "step": 5390 + }, + { + "epoch": 0.3776300927263653, + "grad_norm": 3.951362133026123, + "learning_rate": 6.227244133099825e-05, + "loss": 1.0501, + "num_input_tokens_seen": 86749816, + "step": 5391 + }, + { + "epoch": 0.37770014097209453, + "grad_norm": 3.535480260848999, + "learning_rate": 6.226544308231174e-05, + "loss": 0.8942, + "num_input_tokens_seen": 86765800, + "step": 5392 + }, + { + "epoch": 0.3777701892178238, + "grad_norm": 5.398930549621582, + "learning_rate": 6.225844483362523e-05, + "loss": 1.1322, + "num_input_tokens_seen": 86782184, + "step": 5393 + }, + { + "epoch": 0.377840237463553, + "grad_norm": 4.456240177154541, + "learning_rate": 6.22514465849387e-05, + "loss": 1.1725, + "num_input_tokens_seen": 86798568, + "step": 5394 + }, + { + "epoch": 0.37791028570928226, + "grad_norm": 3.8764703273773193, + "learning_rate": 6.224444833625219e-05, + "loss": 1.0041, + "num_input_tokens_seen": 86814824, + "step": 5395 + }, + { + "epoch": 0.3779803339550115, + "grad_norm": 3.8746144771575928, + "learning_rate": 6.223745008756568e-05, + "loss": 1.066, + "num_input_tokens_seen": 86831208, + "step": 5396 + }, + { + "epoch": 0.3780503822007408, + "grad_norm": 4.3454742431640625, + "learning_rate": 6.223045183887917e-05, + "loss": 1.1164, + "num_input_tokens_seen": 86846872, + "step": 5397 + }, + { + "epoch": 0.37812043044647, + "grad_norm": 5.006749153137207, + "learning_rate": 6.222345359019265e-05, + "loss": 0.8317, + "num_input_tokens_seen": 86863256, + "step": 5398 + }, + { + "epoch": 0.37819047869219924, + "grad_norm": 3.7388808727264404, + "learning_rate": 6.221645534150613e-05, + "loss": 1.1562, + "num_input_tokens_seen": 86879640, + "step": 5399 + }, + { + "epoch": 0.3782605269379285, + "grad_norm": 4.515074253082275, + "learning_rate": 6.220945709281962e-05, + "loss": 1.0428, + "num_input_tokens_seen": 86896024, + "step": 5400 + }, + { + "epoch": 0.3782605269379285, + "eval_loss": 1.1279726028442383, + "eval_runtime": 0.2024, + "eval_samples_per_second": 4.94, + "eval_steps_per_second": 4.94, + "num_input_tokens_seen": 86896024, + "step": 5400 + }, + { + "epoch": 0.37833057518365776, + "grad_norm": 3.5468356609344482, + "learning_rate": 6.22024588441331e-05, + "loss": 0.9858, + "num_input_tokens_seen": 86912032, + "step": 5401 + }, + { + "epoch": 0.378400623429387, + "grad_norm": 4.281546115875244, + "learning_rate": 6.219546059544659e-05, + "loss": 1.2335, + "num_input_tokens_seen": 86928080, + "step": 5402 + }, + { + "epoch": 0.3784706716751162, + "grad_norm": 4.247570037841797, + "learning_rate": 6.218846234676009e-05, + "loss": 1.0889, + "num_input_tokens_seen": 86944424, + "step": 5403 + }, + { + "epoch": 0.3785407199208455, + "grad_norm": 3.78439998626709, + "learning_rate": 6.218146409807356e-05, + "loss": 1.0476, + "num_input_tokens_seen": 86960808, + "step": 5404 + }, + { + "epoch": 0.37861076816657474, + "grad_norm": 4.174613952636719, + "learning_rate": 6.217446584938704e-05, + "loss": 1.2858, + "num_input_tokens_seen": 86976472, + "step": 5405 + }, + { + "epoch": 0.378680816412304, + "grad_norm": 4.759533882141113, + "learning_rate": 6.216746760070053e-05, + "loss": 0.9813, + "num_input_tokens_seen": 86992856, + "step": 5406 + }, + { + "epoch": 0.3787508646580332, + "grad_norm": 5.2616801261901855, + "learning_rate": 6.216046935201402e-05, + "loss": 1.1752, + "num_input_tokens_seen": 87007936, + "step": 5407 + }, + { + "epoch": 0.37882091290376246, + "grad_norm": 4.626899719238281, + "learning_rate": 6.215347110332749e-05, + "loss": 1.0348, + "num_input_tokens_seen": 87023888, + "step": 5408 + }, + { + "epoch": 0.3788909611494917, + "grad_norm": 3.7142221927642822, + "learning_rate": 6.214647285464099e-05, + "loss": 1.0051, + "num_input_tokens_seen": 87040272, + "step": 5409 + }, + { + "epoch": 0.378961009395221, + "grad_norm": 6.228342056274414, + "learning_rate": 6.213947460595448e-05, + "loss": 1.0807, + "num_input_tokens_seen": 87056656, + "step": 5410 + }, + { + "epoch": 0.3790310576409502, + "grad_norm": 3.7979259490966797, + "learning_rate": 6.213247635726796e-05, + "loss": 1.0051, + "num_input_tokens_seen": 87073040, + "step": 5411 + }, + { + "epoch": 0.37910110588667945, + "grad_norm": 3.903106927871704, + "learning_rate": 6.212547810858143e-05, + "loss": 1.0546, + "num_input_tokens_seen": 87089344, + "step": 5412 + }, + { + "epoch": 0.3791711541324087, + "grad_norm": 3.966651201248169, + "learning_rate": 6.211847985989492e-05, + "loss": 1.0678, + "num_input_tokens_seen": 87105144, + "step": 5413 + }, + { + "epoch": 0.37924120237813796, + "grad_norm": 4.070274829864502, + "learning_rate": 6.211148161120841e-05, + "loss": 1.1021, + "num_input_tokens_seen": 87121528, + "step": 5414 + }, + { + "epoch": 0.37931125062386717, + "grad_norm": 3.516997814178467, + "learning_rate": 6.21044833625219e-05, + "loss": 1.0112, + "num_input_tokens_seen": 87137752, + "step": 5415 + }, + { + "epoch": 0.37938129886959643, + "grad_norm": 4.28290319442749, + "learning_rate": 6.209748511383539e-05, + "loss": 1.2252, + "num_input_tokens_seen": 87154136, + "step": 5416 + }, + { + "epoch": 0.3794513471153257, + "grad_norm": 4.765808582305908, + "learning_rate": 6.209048686514887e-05, + "loss": 1.0135, + "num_input_tokens_seen": 87170520, + "step": 5417 + }, + { + "epoch": 0.37952139536105495, + "grad_norm": 3.8507494926452637, + "learning_rate": 6.208348861646235e-05, + "loss": 1.0304, + "num_input_tokens_seen": 87186904, + "step": 5418 + }, + { + "epoch": 0.37959144360678415, + "grad_norm": 7.46950626373291, + "learning_rate": 6.207649036777584e-05, + "loss": 1.1376, + "num_input_tokens_seen": 87203288, + "step": 5419 + }, + { + "epoch": 0.3796614918525134, + "grad_norm": 5.770944595336914, + "learning_rate": 6.206949211908933e-05, + "loss": 1.23, + "num_input_tokens_seen": 87219552, + "step": 5420 + }, + { + "epoch": 0.37973154009824267, + "grad_norm": 3.752936363220215, + "learning_rate": 6.20624938704028e-05, + "loss": 0.8285, + "num_input_tokens_seen": 87235736, + "step": 5421 + }, + { + "epoch": 0.37980158834397193, + "grad_norm": 3.8336403369903564, + "learning_rate": 6.205549562171629e-05, + "loss": 0.8416, + "num_input_tokens_seen": 87252120, + "step": 5422 + }, + { + "epoch": 0.37987163658970113, + "grad_norm": 7.380855083465576, + "learning_rate": 6.204849737302978e-05, + "loss": 1.1149, + "num_input_tokens_seen": 87268504, + "step": 5423 + }, + { + "epoch": 0.3799416848354304, + "grad_norm": 4.780874729156494, + "learning_rate": 6.204149912434327e-05, + "loss": 0.9103, + "num_input_tokens_seen": 87284888, + "step": 5424 + }, + { + "epoch": 0.38001173308115965, + "grad_norm": 4.691160202026367, + "learning_rate": 6.203450087565674e-05, + "loss": 1.1994, + "num_input_tokens_seen": 87301272, + "step": 5425 + }, + { + "epoch": 0.3800817813268889, + "grad_norm": 3.592348098754883, + "learning_rate": 6.202750262697023e-05, + "loss": 1.017, + "num_input_tokens_seen": 87317288, + "step": 5426 + }, + { + "epoch": 0.3801518295726181, + "grad_norm": 4.750811576843262, + "learning_rate": 6.202050437828372e-05, + "loss": 1.2781, + "num_input_tokens_seen": 87332488, + "step": 5427 + }, + { + "epoch": 0.3802218778183474, + "grad_norm": 4.564239501953125, + "learning_rate": 6.20135061295972e-05, + "loss": 1.133, + "num_input_tokens_seen": 87348264, + "step": 5428 + }, + { + "epoch": 0.38029192606407664, + "grad_norm": 4.697380065917969, + "learning_rate": 6.200650788091068e-05, + "loss": 1.0091, + "num_input_tokens_seen": 87363920, + "step": 5429 + }, + { + "epoch": 0.3803619743098059, + "grad_norm": 4.026552677154541, + "learning_rate": 6.199950963222419e-05, + "loss": 1.2177, + "num_input_tokens_seen": 87379920, + "step": 5430 + }, + { + "epoch": 0.3804320225555351, + "grad_norm": 5.023289203643799, + "learning_rate": 6.199251138353766e-05, + "loss": 0.9822, + "num_input_tokens_seen": 87395848, + "step": 5431 + }, + { + "epoch": 0.38050207080126436, + "grad_norm": 3.6005523204803467, + "learning_rate": 6.198551313485114e-05, + "loss": 1.0709, + "num_input_tokens_seen": 87411632, + "step": 5432 + }, + { + "epoch": 0.3805721190469936, + "grad_norm": 4.094357967376709, + "learning_rate": 6.197851488616462e-05, + "loss": 1.1254, + "num_input_tokens_seen": 87426912, + "step": 5433 + }, + { + "epoch": 0.3806421672927229, + "grad_norm": 4.452909469604492, + "learning_rate": 6.197151663747811e-05, + "loss": 1.248, + "num_input_tokens_seen": 87443296, + "step": 5434 + }, + { + "epoch": 0.3807122155384521, + "grad_norm": 3.975532054901123, + "learning_rate": 6.19645183887916e-05, + "loss": 1.0786, + "num_input_tokens_seen": 87459680, + "step": 5435 + }, + { + "epoch": 0.38078226378418134, + "grad_norm": 4.745920181274414, + "learning_rate": 6.195752014010509e-05, + "loss": 1.2534, + "num_input_tokens_seen": 87476064, + "step": 5436 + }, + { + "epoch": 0.3808523120299106, + "grad_norm": 3.8793790340423584, + "learning_rate": 6.195052189141858e-05, + "loss": 1.1197, + "num_input_tokens_seen": 87492448, + "step": 5437 + }, + { + "epoch": 0.38092236027563986, + "grad_norm": 4.695518493652344, + "learning_rate": 6.194352364273205e-05, + "loss": 0.93, + "num_input_tokens_seen": 87508832, + "step": 5438 + }, + { + "epoch": 0.3809924085213691, + "grad_norm": 3.5820047855377197, + "learning_rate": 6.193652539404553e-05, + "loss": 1.0007, + "num_input_tokens_seen": 87524728, + "step": 5439 + }, + { + "epoch": 0.3810624567670983, + "grad_norm": 5.76292610168457, + "learning_rate": 6.192952714535902e-05, + "loss": 1.1919, + "num_input_tokens_seen": 87540752, + "step": 5440 + }, + { + "epoch": 0.3811325050128276, + "grad_norm": 4.334653377532959, + "learning_rate": 6.19225288966725e-05, + "loss": 0.9847, + "num_input_tokens_seen": 87556384, + "step": 5441 + }, + { + "epoch": 0.38120255325855684, + "grad_norm": 3.7438180446624756, + "learning_rate": 6.1915530647986e-05, + "loss": 1.0083, + "num_input_tokens_seen": 87572320, + "step": 5442 + }, + { + "epoch": 0.3812726015042861, + "grad_norm": 4.082560062408447, + "learning_rate": 6.190853239929948e-05, + "loss": 0.8908, + "num_input_tokens_seen": 87588704, + "step": 5443 + }, + { + "epoch": 0.3813426497500153, + "grad_norm": 3.9324755668640137, + "learning_rate": 6.190153415061297e-05, + "loss": 1.0991, + "num_input_tokens_seen": 87605088, + "step": 5444 + }, + { + "epoch": 0.38141269799574457, + "grad_norm": 5.329967498779297, + "learning_rate": 6.189453590192645e-05, + "loss": 1.1817, + "num_input_tokens_seen": 87621472, + "step": 5445 + }, + { + "epoch": 0.3814827462414738, + "grad_norm": 3.627267837524414, + "learning_rate": 6.188753765323993e-05, + "loss": 0.9407, + "num_input_tokens_seen": 87637856, + "step": 5446 + }, + { + "epoch": 0.3815527944872031, + "grad_norm": 3.6728835105895996, + "learning_rate": 6.188053940455342e-05, + "loss": 0.8623, + "num_input_tokens_seen": 87653720, + "step": 5447 + }, + { + "epoch": 0.3816228427329323, + "grad_norm": 3.556185245513916, + "learning_rate": 6.18735411558669e-05, + "loss": 0.9531, + "num_input_tokens_seen": 87670104, + "step": 5448 + }, + { + "epoch": 0.38169289097866155, + "grad_norm": 4.075231552124023, + "learning_rate": 6.186654290718039e-05, + "loss": 1.0284, + "num_input_tokens_seen": 87686488, + "step": 5449 + }, + { + "epoch": 0.3817629392243908, + "grad_norm": 3.981752395629883, + "learning_rate": 6.185954465849388e-05, + "loss": 1.0822, + "num_input_tokens_seen": 87702872, + "step": 5450 + }, + { + "epoch": 0.38183298747012007, + "grad_norm": 4.75683069229126, + "learning_rate": 6.185254640980736e-05, + "loss": 0.9611, + "num_input_tokens_seen": 87718912, + "step": 5451 + }, + { + "epoch": 0.38190303571584927, + "grad_norm": 6.081716060638428, + "learning_rate": 6.184554816112084e-05, + "loss": 1.092, + "num_input_tokens_seen": 87735160, + "step": 5452 + }, + { + "epoch": 0.38197308396157853, + "grad_norm": 6.651247978210449, + "learning_rate": 6.183854991243433e-05, + "loss": 1.0397, + "num_input_tokens_seen": 87749232, + "step": 5453 + }, + { + "epoch": 0.3820431322073078, + "grad_norm": 4.12028694152832, + "learning_rate": 6.183155166374782e-05, + "loss": 1.094, + "num_input_tokens_seen": 87765328, + "step": 5454 + }, + { + "epoch": 0.38211318045303705, + "grad_norm": 6.3344645500183105, + "learning_rate": 6.18245534150613e-05, + "loss": 1.0275, + "num_input_tokens_seen": 87781712, + "step": 5455 + }, + { + "epoch": 0.38218322869876625, + "grad_norm": 3.745476007461548, + "learning_rate": 6.181755516637478e-05, + "loss": 0.9485, + "num_input_tokens_seen": 87798032, + "step": 5456 + }, + { + "epoch": 0.3822532769444955, + "grad_norm": 3.515174388885498, + "learning_rate": 6.181055691768828e-05, + "loss": 1.1138, + "num_input_tokens_seen": 87814416, + "step": 5457 + }, + { + "epoch": 0.38232332519022477, + "grad_norm": 4.101998329162598, + "learning_rate": 6.180355866900176e-05, + "loss": 0.9787, + "num_input_tokens_seen": 87830504, + "step": 5458 + }, + { + "epoch": 0.38239337343595403, + "grad_norm": 4.045940399169922, + "learning_rate": 6.179656042031523e-05, + "loss": 1.1278, + "num_input_tokens_seen": 87846264, + "step": 5459 + }, + { + "epoch": 0.38246342168168324, + "grad_norm": 8.09753131866455, + "learning_rate": 6.178956217162872e-05, + "loss": 1.131, + "num_input_tokens_seen": 87861856, + "step": 5460 + }, + { + "epoch": 0.3825334699274125, + "grad_norm": 5.395979404449463, + "learning_rate": 6.178256392294221e-05, + "loss": 1.0364, + "num_input_tokens_seen": 87878024, + "step": 5461 + }, + { + "epoch": 0.38260351817314175, + "grad_norm": 3.452855110168457, + "learning_rate": 6.17755656742557e-05, + "loss": 0.8875, + "num_input_tokens_seen": 87894408, + "step": 5462 + }, + { + "epoch": 0.382673566418871, + "grad_norm": 3.9877512454986572, + "learning_rate": 6.176856742556919e-05, + "loss": 1.0349, + "num_input_tokens_seen": 87910400, + "step": 5463 + }, + { + "epoch": 0.3827436146646002, + "grad_norm": 3.9095492362976074, + "learning_rate": 6.176156917688268e-05, + "loss": 1.063, + "num_input_tokens_seen": 87926040, + "step": 5464 + }, + { + "epoch": 0.3828136629103295, + "grad_norm": 4.558162212371826, + "learning_rate": 6.175457092819615e-05, + "loss": 1.1126, + "num_input_tokens_seen": 87942424, + "step": 5465 + }, + { + "epoch": 0.38288371115605874, + "grad_norm": 3.786123275756836, + "learning_rate": 6.174757267950963e-05, + "loss": 1.0414, + "num_input_tokens_seen": 87958808, + "step": 5466 + }, + { + "epoch": 0.382953759401788, + "grad_norm": 4.0291056632995605, + "learning_rate": 6.174057443082311e-05, + "loss": 1.0603, + "num_input_tokens_seen": 87975192, + "step": 5467 + }, + { + "epoch": 0.3830238076475172, + "grad_norm": 3.698666572570801, + "learning_rate": 6.17335761821366e-05, + "loss": 0.9187, + "num_input_tokens_seen": 87991504, + "step": 5468 + }, + { + "epoch": 0.38309385589324646, + "grad_norm": 3.7802882194519043, + "learning_rate": 6.172657793345009e-05, + "loss": 0.9568, + "num_input_tokens_seen": 88007888, + "step": 5469 + }, + { + "epoch": 0.3831639041389757, + "grad_norm": 4.754447937011719, + "learning_rate": 6.171957968476358e-05, + "loss": 1.1788, + "num_input_tokens_seen": 88023832, + "step": 5470 + }, + { + "epoch": 0.383233952384705, + "grad_norm": 3.502560615539551, + "learning_rate": 6.171258143607707e-05, + "loss": 0.9993, + "num_input_tokens_seen": 88040216, + "step": 5471 + }, + { + "epoch": 0.38330400063043424, + "grad_norm": 4.379989147186279, + "learning_rate": 6.170558318739054e-05, + "loss": 1.0609, + "num_input_tokens_seen": 88055768, + "step": 5472 + }, + { + "epoch": 0.38337404887616344, + "grad_norm": 3.3798177242279053, + "learning_rate": 6.169858493870403e-05, + "loss": 0.6884, + "num_input_tokens_seen": 88072152, + "step": 5473 + }, + { + "epoch": 0.3834440971218927, + "grad_norm": 4.265483856201172, + "learning_rate": 6.169158669001752e-05, + "loss": 1.0405, + "num_input_tokens_seen": 88087816, + "step": 5474 + }, + { + "epoch": 0.38351414536762196, + "grad_norm": 4.468397617340088, + "learning_rate": 6.168458844133101e-05, + "loss": 0.885, + "num_input_tokens_seen": 88103160, + "step": 5475 + }, + { + "epoch": 0.3835841936133512, + "grad_norm": 3.888359546661377, + "learning_rate": 6.167759019264448e-05, + "loss": 1.1768, + "num_input_tokens_seen": 88119544, + "step": 5476 + }, + { + "epoch": 0.3836542418590804, + "grad_norm": 3.7953927516937256, + "learning_rate": 6.167059194395797e-05, + "loss": 1.1585, + "num_input_tokens_seen": 88135928, + "step": 5477 + }, + { + "epoch": 0.3837242901048097, + "grad_norm": 3.7742021083831787, + "learning_rate": 6.166359369527146e-05, + "loss": 0.9201, + "num_input_tokens_seen": 88151928, + "step": 5478 + }, + { + "epoch": 0.38379433835053894, + "grad_norm": 3.811535120010376, + "learning_rate": 6.165659544658494e-05, + "loss": 0.9211, + "num_input_tokens_seen": 88168312, + "step": 5479 + }, + { + "epoch": 0.3838643865962682, + "grad_norm": 5.1758646965026855, + "learning_rate": 6.164959719789842e-05, + "loss": 1.2087, + "num_input_tokens_seen": 88184064, + "step": 5480 + }, + { + "epoch": 0.3839344348419974, + "grad_norm": 4.529813289642334, + "learning_rate": 6.164259894921191e-05, + "loss": 0.9142, + "num_input_tokens_seen": 88200216, + "step": 5481 + }, + { + "epoch": 0.38400448308772667, + "grad_norm": 4.426999568939209, + "learning_rate": 6.16356007005254e-05, + "loss": 1.0046, + "num_input_tokens_seen": 88215568, + "step": 5482 + }, + { + "epoch": 0.3840745313334559, + "grad_norm": 4.73276948928833, + "learning_rate": 6.162860245183888e-05, + "loss": 1.0082, + "num_input_tokens_seen": 88231952, + "step": 5483 + }, + { + "epoch": 0.3841445795791852, + "grad_norm": 3.6280384063720703, + "learning_rate": 6.162160420315238e-05, + "loss": 0.9015, + "num_input_tokens_seen": 88247728, + "step": 5484 + }, + { + "epoch": 0.3842146278249144, + "grad_norm": 3.6947717666625977, + "learning_rate": 6.161460595446585e-05, + "loss": 0.9671, + "num_input_tokens_seen": 88263472, + "step": 5485 + }, + { + "epoch": 0.38428467607064365, + "grad_norm": 3.683591842651367, + "learning_rate": 6.160760770577933e-05, + "loss": 1.1844, + "num_input_tokens_seen": 88279856, + "step": 5486 + }, + { + "epoch": 0.3843547243163729, + "grad_norm": 6.020013332366943, + "learning_rate": 6.160060945709282e-05, + "loss": 1.0372, + "num_input_tokens_seen": 88295864, + "step": 5487 + }, + { + "epoch": 0.38442477256210217, + "grad_norm": 8.429437637329102, + "learning_rate": 6.15936112084063e-05, + "loss": 1.143, + "num_input_tokens_seen": 88311752, + "step": 5488 + }, + { + "epoch": 0.3844948208078314, + "grad_norm": 3.679159164428711, + "learning_rate": 6.15866129597198e-05, + "loss": 0.9212, + "num_input_tokens_seen": 88327784, + "step": 5489 + }, + { + "epoch": 0.38456486905356063, + "grad_norm": 4.131216526031494, + "learning_rate": 6.157961471103328e-05, + "loss": 1.0983, + "num_input_tokens_seen": 88343480, + "step": 5490 + }, + { + "epoch": 0.3846349172992899, + "grad_norm": 4.294956684112549, + "learning_rate": 6.157261646234677e-05, + "loss": 1.224, + "num_input_tokens_seen": 88359864, + "step": 5491 + }, + { + "epoch": 0.38470496554501915, + "grad_norm": 4.683321952819824, + "learning_rate": 6.156561821366025e-05, + "loss": 1.0377, + "num_input_tokens_seen": 88375184, + "step": 5492 + }, + { + "epoch": 0.38477501379074835, + "grad_norm": 3.932366371154785, + "learning_rate": 6.155861996497372e-05, + "loss": 1.1341, + "num_input_tokens_seen": 88391568, + "step": 5493 + }, + { + "epoch": 0.3848450620364776, + "grad_norm": 4.191849231719971, + "learning_rate": 6.155162171628721e-05, + "loss": 1.2247, + "num_input_tokens_seen": 88407680, + "step": 5494 + }, + { + "epoch": 0.3849151102822069, + "grad_norm": 3.983915090560913, + "learning_rate": 6.154462346760071e-05, + "loss": 1.0115, + "num_input_tokens_seen": 88422888, + "step": 5495 + }, + { + "epoch": 0.38498515852793613, + "grad_norm": 4.163250923156738, + "learning_rate": 6.153762521891419e-05, + "loss": 1.057, + "num_input_tokens_seen": 88439272, + "step": 5496 + }, + { + "epoch": 0.38505520677366534, + "grad_norm": 6.113068580627441, + "learning_rate": 6.153062697022768e-05, + "loss": 1.0971, + "num_input_tokens_seen": 88455656, + "step": 5497 + }, + { + "epoch": 0.3851252550193946, + "grad_norm": 5.32371187210083, + "learning_rate": 6.152362872154117e-05, + "loss": 1.1886, + "num_input_tokens_seen": 88472040, + "step": 5498 + }, + { + "epoch": 0.38519530326512386, + "grad_norm": 6.110095500946045, + "learning_rate": 6.151663047285464e-05, + "loss": 0.9587, + "num_input_tokens_seen": 88487720, + "step": 5499 + }, + { + "epoch": 0.3852653515108531, + "grad_norm": 3.9656851291656494, + "learning_rate": 6.150963222416813e-05, + "loss": 0.9635, + "num_input_tokens_seen": 88504104, + "step": 5500 + }, + { + "epoch": 0.3853353997565823, + "grad_norm": 4.601620197296143, + "learning_rate": 6.150263397548162e-05, + "loss": 1.2542, + "num_input_tokens_seen": 88520160, + "step": 5501 + }, + { + "epoch": 0.3854054480023116, + "grad_norm": 4.273797988891602, + "learning_rate": 6.14956357267951e-05, + "loss": 1.3405, + "num_input_tokens_seen": 88535832, + "step": 5502 + }, + { + "epoch": 0.38547549624804084, + "grad_norm": 4.023514747619629, + "learning_rate": 6.148863747810858e-05, + "loss": 1.1248, + "num_input_tokens_seen": 88552000, + "step": 5503 + }, + { + "epoch": 0.3855455444937701, + "grad_norm": 3.7229719161987305, + "learning_rate": 6.148163922942207e-05, + "loss": 1.1117, + "num_input_tokens_seen": 88567600, + "step": 5504 + }, + { + "epoch": 0.38561559273949936, + "grad_norm": 4.696394920349121, + "learning_rate": 6.147464098073556e-05, + "loss": 0.8945, + "num_input_tokens_seen": 88583224, + "step": 5505 + }, + { + "epoch": 0.38568564098522856, + "grad_norm": 5.354174613952637, + "learning_rate": 6.146764273204903e-05, + "loss": 1.079, + "num_input_tokens_seen": 88599608, + "step": 5506 + }, + { + "epoch": 0.3857556892309578, + "grad_norm": 4.717334747314453, + "learning_rate": 6.146064448336252e-05, + "loss": 1.1293, + "num_input_tokens_seen": 88615048, + "step": 5507 + }, + { + "epoch": 0.3858257374766871, + "grad_norm": 5.373983383178711, + "learning_rate": 6.145364623467601e-05, + "loss": 0.9625, + "num_input_tokens_seen": 88630888, + "step": 5508 + }, + { + "epoch": 0.38589578572241634, + "grad_norm": 4.338916301727295, + "learning_rate": 6.14466479859895e-05, + "loss": 1.0884, + "num_input_tokens_seen": 88647072, + "step": 5509 + }, + { + "epoch": 0.38596583396814554, + "grad_norm": 3.898721694946289, + "learning_rate": 6.143964973730297e-05, + "loss": 1.095, + "num_input_tokens_seen": 88663128, + "step": 5510 + }, + { + "epoch": 0.3860358822138748, + "grad_norm": 4.614948749542236, + "learning_rate": 6.143265148861648e-05, + "loss": 1.0729, + "num_input_tokens_seen": 88679512, + "step": 5511 + }, + { + "epoch": 0.38610593045960406, + "grad_norm": 5.1157732009887695, + "learning_rate": 6.142565323992995e-05, + "loss": 1.0776, + "num_input_tokens_seen": 88695664, + "step": 5512 + }, + { + "epoch": 0.3861759787053333, + "grad_norm": 4.29611873626709, + "learning_rate": 6.141865499124343e-05, + "loss": 1.0838, + "num_input_tokens_seen": 88711560, + "step": 5513 + }, + { + "epoch": 0.3862460269510625, + "grad_norm": 3.9464735984802246, + "learning_rate": 6.141165674255692e-05, + "loss": 1.1907, + "num_input_tokens_seen": 88727464, + "step": 5514 + }, + { + "epoch": 0.3863160751967918, + "grad_norm": 3.8381590843200684, + "learning_rate": 6.140465849387042e-05, + "loss": 1.1416, + "num_input_tokens_seen": 88743848, + "step": 5515 + }, + { + "epoch": 0.38638612344252105, + "grad_norm": 3.573434829711914, + "learning_rate": 6.139766024518389e-05, + "loss": 0.9316, + "num_input_tokens_seen": 88759312, + "step": 5516 + }, + { + "epoch": 0.3864561716882503, + "grad_norm": 4.257131576538086, + "learning_rate": 6.139066199649738e-05, + "loss": 0.9534, + "num_input_tokens_seen": 88775112, + "step": 5517 + }, + { + "epoch": 0.3865262199339795, + "grad_norm": 4.2985310554504395, + "learning_rate": 6.138366374781087e-05, + "loss": 0.9387, + "num_input_tokens_seen": 88791496, + "step": 5518 + }, + { + "epoch": 0.38659626817970877, + "grad_norm": 3.7012977600097656, + "learning_rate": 6.137666549912434e-05, + "loss": 1.0104, + "num_input_tokens_seen": 88807880, + "step": 5519 + }, + { + "epoch": 0.38666631642543803, + "grad_norm": 5.4860453605651855, + "learning_rate": 6.136966725043782e-05, + "loss": 1.1978, + "num_input_tokens_seen": 88823392, + "step": 5520 + }, + { + "epoch": 0.3867363646711673, + "grad_norm": 4.165813446044922, + "learning_rate": 6.136266900175132e-05, + "loss": 1.0184, + "num_input_tokens_seen": 88839352, + "step": 5521 + }, + { + "epoch": 0.3868064129168965, + "grad_norm": 3.6253862380981445, + "learning_rate": 6.135567075306481e-05, + "loss": 0.9544, + "num_input_tokens_seen": 88855736, + "step": 5522 + }, + { + "epoch": 0.38687646116262575, + "grad_norm": 3.834057331085205, + "learning_rate": 6.134867250437829e-05, + "loss": 1.1863, + "num_input_tokens_seen": 88871952, + "step": 5523 + }, + { + "epoch": 0.386946509408355, + "grad_norm": 4.534783363342285, + "learning_rate": 6.134167425569177e-05, + "loss": 0.85, + "num_input_tokens_seen": 88888336, + "step": 5524 + }, + { + "epoch": 0.38701655765408427, + "grad_norm": 5.4073381423950195, + "learning_rate": 6.133467600700526e-05, + "loss": 0.9257, + "num_input_tokens_seen": 88904256, + "step": 5525 + }, + { + "epoch": 0.3870866058998135, + "grad_norm": 3.819841146469116, + "learning_rate": 6.132767775831874e-05, + "loss": 1.0911, + "num_input_tokens_seen": 88920640, + "step": 5526 + }, + { + "epoch": 0.38715665414554273, + "grad_norm": 3.814857244491577, + "learning_rate": 6.132067950963223e-05, + "loss": 1.2414, + "num_input_tokens_seen": 88937024, + "step": 5527 + }, + { + "epoch": 0.387226702391272, + "grad_norm": 3.682535171508789, + "learning_rate": 6.131368126094571e-05, + "loss": 0.9158, + "num_input_tokens_seen": 88952712, + "step": 5528 + }, + { + "epoch": 0.38729675063700125, + "grad_norm": 3.5657262802124023, + "learning_rate": 6.13066830122592e-05, + "loss": 0.9336, + "num_input_tokens_seen": 88969096, + "step": 5529 + }, + { + "epoch": 0.38736679888273046, + "grad_norm": 3.851977825164795, + "learning_rate": 6.129968476357268e-05, + "loss": 1.0546, + "num_input_tokens_seen": 88985480, + "step": 5530 + }, + { + "epoch": 0.3874368471284597, + "grad_norm": 4.079189777374268, + "learning_rate": 6.129268651488617e-05, + "loss": 0.856, + "num_input_tokens_seen": 89001104, + "step": 5531 + }, + { + "epoch": 0.387506895374189, + "grad_norm": 4.388980865478516, + "learning_rate": 6.128568826619966e-05, + "loss": 1.0785, + "num_input_tokens_seen": 89017232, + "step": 5532 + }, + { + "epoch": 0.38757694361991823, + "grad_norm": 3.6747231483459473, + "learning_rate": 6.127869001751313e-05, + "loss": 1.1171, + "num_input_tokens_seen": 89033576, + "step": 5533 + }, + { + "epoch": 0.38764699186564744, + "grad_norm": 4.62367057800293, + "learning_rate": 6.127169176882662e-05, + "loss": 1.138, + "num_input_tokens_seen": 89049224, + "step": 5534 + }, + { + "epoch": 0.3877170401113767, + "grad_norm": 3.8601040840148926, + "learning_rate": 6.126469352014011e-05, + "loss": 1.0254, + "num_input_tokens_seen": 89064968, + "step": 5535 + }, + { + "epoch": 0.38778708835710596, + "grad_norm": 5.132208347320557, + "learning_rate": 6.12576952714536e-05, + "loss": 1.0121, + "num_input_tokens_seen": 89081352, + "step": 5536 + }, + { + "epoch": 0.3878571366028352, + "grad_norm": 3.9259984493255615, + "learning_rate": 6.125069702276707e-05, + "loss": 0.9146, + "num_input_tokens_seen": 89097696, + "step": 5537 + }, + { + "epoch": 0.3879271848485644, + "grad_norm": 3.9004077911376953, + "learning_rate": 6.124369877408057e-05, + "loss": 1.0059, + "num_input_tokens_seen": 89114080, + "step": 5538 + }, + { + "epoch": 0.3879972330942937, + "grad_norm": 4.657776355743408, + "learning_rate": 6.123670052539405e-05, + "loss": 1.0612, + "num_input_tokens_seen": 89129584, + "step": 5539 + }, + { + "epoch": 0.38806728134002294, + "grad_norm": 3.4758501052856445, + "learning_rate": 6.122970227670752e-05, + "loss": 1.0179, + "num_input_tokens_seen": 89145968, + "step": 5540 + }, + { + "epoch": 0.3881373295857522, + "grad_norm": 3.949275255203247, + "learning_rate": 6.122270402802101e-05, + "loss": 1.0336, + "num_input_tokens_seen": 89161904, + "step": 5541 + }, + { + "epoch": 0.38820737783148146, + "grad_norm": 5.620425224304199, + "learning_rate": 6.121570577933451e-05, + "loss": 0.9776, + "num_input_tokens_seen": 89178032, + "step": 5542 + }, + { + "epoch": 0.38827742607721066, + "grad_norm": 5.1215643882751465, + "learning_rate": 6.120870753064799e-05, + "loss": 1.0577, + "num_input_tokens_seen": 89193568, + "step": 5543 + }, + { + "epoch": 0.3883474743229399, + "grad_norm": 3.994556427001953, + "learning_rate": 6.120170928196148e-05, + "loss": 1.0631, + "num_input_tokens_seen": 89209952, + "step": 5544 + }, + { + "epoch": 0.3884175225686692, + "grad_norm": 6.86944055557251, + "learning_rate": 6.119471103327497e-05, + "loss": 1.208, + "num_input_tokens_seen": 89226336, + "step": 5545 + }, + { + "epoch": 0.38848757081439844, + "grad_norm": 3.72501540184021, + "learning_rate": 6.118771278458844e-05, + "loss": 1.0198, + "num_input_tokens_seen": 89242720, + "step": 5546 + }, + { + "epoch": 0.38855761906012765, + "grad_norm": 3.6887834072113037, + "learning_rate": 6.118071453590193e-05, + "loss": 1.0964, + "num_input_tokens_seen": 89258536, + "step": 5547 + }, + { + "epoch": 0.3886276673058569, + "grad_norm": 5.15130615234375, + "learning_rate": 6.117371628721542e-05, + "loss": 1.0193, + "num_input_tokens_seen": 89274920, + "step": 5548 + }, + { + "epoch": 0.38869771555158616, + "grad_norm": 3.7503981590270996, + "learning_rate": 6.116671803852891e-05, + "loss": 0.9457, + "num_input_tokens_seen": 89291304, + "step": 5549 + }, + { + "epoch": 0.3887677637973154, + "grad_norm": 4.851298809051514, + "learning_rate": 6.115971978984238e-05, + "loss": 1.132, + "num_input_tokens_seen": 89307080, + "step": 5550 + }, + { + "epoch": 0.38883781204304463, + "grad_norm": 3.72981858253479, + "learning_rate": 6.115272154115587e-05, + "loss": 1.0371, + "num_input_tokens_seen": 89323464, + "step": 5551 + }, + { + "epoch": 0.3889078602887739, + "grad_norm": 4.1301140785217285, + "learning_rate": 6.114572329246936e-05, + "loss": 0.9746, + "num_input_tokens_seen": 89339696, + "step": 5552 + }, + { + "epoch": 0.38897790853450315, + "grad_norm": 4.225720405578613, + "learning_rate": 6.113872504378283e-05, + "loss": 1.124, + "num_input_tokens_seen": 89356080, + "step": 5553 + }, + { + "epoch": 0.3890479567802324, + "grad_norm": 3.7197327613830566, + "learning_rate": 6.113172679509632e-05, + "loss": 1.0739, + "num_input_tokens_seen": 89372464, + "step": 5554 + }, + { + "epoch": 0.3891180050259616, + "grad_norm": 4.626903057098389, + "learning_rate": 6.112472854640981e-05, + "loss": 0.9896, + "num_input_tokens_seen": 89388848, + "step": 5555 + }, + { + "epoch": 0.38918805327169087, + "grad_norm": 4.229621410369873, + "learning_rate": 6.111773029772329e-05, + "loss": 0.9925, + "num_input_tokens_seen": 89405112, + "step": 5556 + }, + { + "epoch": 0.38925810151742013, + "grad_norm": 3.5502984523773193, + "learning_rate": 6.111073204903678e-05, + "loss": 0.9966, + "num_input_tokens_seen": 89421496, + "step": 5557 + }, + { + "epoch": 0.3893281497631494, + "grad_norm": 4.251241207122803, + "learning_rate": 6.110373380035026e-05, + "loss": 1.0249, + "num_input_tokens_seen": 89437880, + "step": 5558 + }, + { + "epoch": 0.3893981980088786, + "grad_norm": 5.076200485229492, + "learning_rate": 6.109673555166375e-05, + "loss": 1.0592, + "num_input_tokens_seen": 89454264, + "step": 5559 + }, + { + "epoch": 0.38946824625460785, + "grad_norm": 4.018000602722168, + "learning_rate": 6.108973730297723e-05, + "loss": 1.197, + "num_input_tokens_seen": 89470648, + "step": 5560 + }, + { + "epoch": 0.3895382945003371, + "grad_norm": 4.3367180824279785, + "learning_rate": 6.108273905429072e-05, + "loss": 1.0778, + "num_input_tokens_seen": 89487032, + "step": 5561 + }, + { + "epoch": 0.38960834274606637, + "grad_norm": 6.027153015136719, + "learning_rate": 6.10757408056042e-05, + "loss": 0.7955, + "num_input_tokens_seen": 89502064, + "step": 5562 + }, + { + "epoch": 0.3896783909917956, + "grad_norm": 3.499268054962158, + "learning_rate": 6.10687425569177e-05, + "loss": 0.935, + "num_input_tokens_seen": 89518448, + "step": 5563 + }, + { + "epoch": 0.38974843923752484, + "grad_norm": 3.3691868782043457, + "learning_rate": 6.106174430823117e-05, + "loss": 0.9249, + "num_input_tokens_seen": 89534640, + "step": 5564 + }, + { + "epoch": 0.3898184874832541, + "grad_norm": 3.4140114784240723, + "learning_rate": 6.105474605954467e-05, + "loss": 0.9594, + "num_input_tokens_seen": 89551024, + "step": 5565 + }, + { + "epoch": 0.38988853572898335, + "grad_norm": 4.049834728240967, + "learning_rate": 6.104774781085815e-05, + "loss": 1.1637, + "num_input_tokens_seen": 89567408, + "step": 5566 + }, + { + "epoch": 0.38995858397471256, + "grad_norm": 4.825027942657471, + "learning_rate": 6.104074956217162e-05, + "loss": 1.0514, + "num_input_tokens_seen": 89583792, + "step": 5567 + }, + { + "epoch": 0.3900286322204418, + "grad_norm": 5.281174659729004, + "learning_rate": 6.103375131348512e-05, + "loss": 1.0534, + "num_input_tokens_seen": 89600176, + "step": 5568 + }, + { + "epoch": 0.3900986804661711, + "grad_norm": 3.567270278930664, + "learning_rate": 6.1026753064798605e-05, + "loss": 0.9217, + "num_input_tokens_seen": 89616560, + "step": 5569 + }, + { + "epoch": 0.39016872871190034, + "grad_norm": 5.449852466583252, + "learning_rate": 6.1019754816112086e-05, + "loss": 1.0362, + "num_input_tokens_seen": 89631968, + "step": 5570 + }, + { + "epoch": 0.39023877695762954, + "grad_norm": 4.016347885131836, + "learning_rate": 6.1012756567425575e-05, + "loss": 0.9634, + "num_input_tokens_seen": 89646712, + "step": 5571 + }, + { + "epoch": 0.3903088252033588, + "grad_norm": 3.8826510906219482, + "learning_rate": 6.100575831873906e-05, + "loss": 1.1645, + "num_input_tokens_seen": 89662776, + "step": 5572 + }, + { + "epoch": 0.39037887344908806, + "grad_norm": 3.80755615234375, + "learning_rate": 6.099876007005254e-05, + "loss": 1.0404, + "num_input_tokens_seen": 89679096, + "step": 5573 + }, + { + "epoch": 0.3904489216948173, + "grad_norm": 3.7274065017700195, + "learning_rate": 6.099176182136602e-05, + "loss": 1.1665, + "num_input_tokens_seen": 89695048, + "step": 5574 + }, + { + "epoch": 0.3905189699405466, + "grad_norm": 4.335930824279785, + "learning_rate": 6.09847635726795e-05, + "loss": 1.0662, + "num_input_tokens_seen": 89711432, + "step": 5575 + }, + { + "epoch": 0.3905890181862758, + "grad_norm": 3.8839964866638184, + "learning_rate": 6.0977765323993e-05, + "loss": 1.0635, + "num_input_tokens_seen": 89727712, + "step": 5576 + }, + { + "epoch": 0.39065906643200504, + "grad_norm": 4.8028035163879395, + "learning_rate": 6.097076707530648e-05, + "loss": 1.0906, + "num_input_tokens_seen": 89744096, + "step": 5577 + }, + { + "epoch": 0.3907291146777343, + "grad_norm": 4.042201519012451, + "learning_rate": 6.096376882661997e-05, + "loss": 0.8609, + "num_input_tokens_seen": 89758688, + "step": 5578 + }, + { + "epoch": 0.39079916292346356, + "grad_norm": 4.1316986083984375, + "learning_rate": 6.095677057793345e-05, + "loss": 0.9509, + "num_input_tokens_seen": 89774720, + "step": 5579 + }, + { + "epoch": 0.39086921116919277, + "grad_norm": 5.164004802703857, + "learning_rate": 6.094977232924693e-05, + "loss": 1.1927, + "num_input_tokens_seen": 89788480, + "step": 5580 + }, + { + "epoch": 0.390939259414922, + "grad_norm": 4.125234127044678, + "learning_rate": 6.094277408056043e-05, + "loss": 0.9237, + "num_input_tokens_seen": 89803000, + "step": 5581 + }, + { + "epoch": 0.3910093076606513, + "grad_norm": 4.798699855804443, + "learning_rate": 6.093577583187392e-05, + "loss": 1.1069, + "num_input_tokens_seen": 89818288, + "step": 5582 + }, + { + "epoch": 0.39107935590638054, + "grad_norm": 4.383975028991699, + "learning_rate": 6.0928777583187404e-05, + "loss": 0.9997, + "num_input_tokens_seen": 89833616, + "step": 5583 + }, + { + "epoch": 0.39114940415210975, + "grad_norm": 4.20830774307251, + "learning_rate": 6.092177933450087e-05, + "loss": 0.8794, + "num_input_tokens_seen": 89849200, + "step": 5584 + }, + { + "epoch": 0.391219452397839, + "grad_norm": 4.470288276672363, + "learning_rate": 6.091478108581437e-05, + "loss": 1.0974, + "num_input_tokens_seen": 89865184, + "step": 5585 + }, + { + "epoch": 0.39128950064356827, + "grad_norm": 4.8457112312316895, + "learning_rate": 6.090778283712785e-05, + "loss": 1.1091, + "num_input_tokens_seen": 89881184, + "step": 5586 + }, + { + "epoch": 0.3913595488892975, + "grad_norm": 4.112722873687744, + "learning_rate": 6.090078458844133e-05, + "loss": 1.0189, + "num_input_tokens_seen": 89897568, + "step": 5587 + }, + { + "epoch": 0.39142959713502673, + "grad_norm": 4.070732116699219, + "learning_rate": 6.089378633975482e-05, + "loss": 1.0694, + "num_input_tokens_seen": 89913952, + "step": 5588 + }, + { + "epoch": 0.391499645380756, + "grad_norm": 4.092299461364746, + "learning_rate": 6.08867880910683e-05, + "loss": 1.0922, + "num_input_tokens_seen": 89929216, + "step": 5589 + }, + { + "epoch": 0.39156969362648525, + "grad_norm": 3.8092305660247803, + "learning_rate": 6.08797898423818e-05, + "loss": 1.0041, + "num_input_tokens_seen": 89945376, + "step": 5590 + }, + { + "epoch": 0.3916397418722145, + "grad_norm": 5.461154937744141, + "learning_rate": 6.0872791593695265e-05, + "loss": 1.1393, + "num_input_tokens_seen": 89960296, + "step": 5591 + }, + { + "epoch": 0.3917097901179437, + "grad_norm": 5.2103190422058105, + "learning_rate": 6.086579334500877e-05, + "loss": 1.079, + "num_input_tokens_seen": 89975864, + "step": 5592 + }, + { + "epoch": 0.39177983836367297, + "grad_norm": 3.6308488845825195, + "learning_rate": 6.085879509632225e-05, + "loss": 1.0344, + "num_input_tokens_seen": 89991880, + "step": 5593 + }, + { + "epoch": 0.39184988660940223, + "grad_norm": 4.632900714874268, + "learning_rate": 6.0851796847635724e-05, + "loss": 0.8255, + "num_input_tokens_seen": 90006560, + "step": 5594 + }, + { + "epoch": 0.3919199348551315, + "grad_norm": 3.8614165782928467, + "learning_rate": 6.084479859894921e-05, + "loss": 1.1599, + "num_input_tokens_seen": 90022136, + "step": 5595 + }, + { + "epoch": 0.3919899831008607, + "grad_norm": 3.768287420272827, + "learning_rate": 6.0837800350262695e-05, + "loss": 1.0694, + "num_input_tokens_seen": 90038520, + "step": 5596 + }, + { + "epoch": 0.39206003134658995, + "grad_norm": 3.355902671813965, + "learning_rate": 6.083080210157618e-05, + "loss": 0.944, + "num_input_tokens_seen": 90054592, + "step": 5597 + }, + { + "epoch": 0.3921300795923192, + "grad_norm": 3.2001609802246094, + "learning_rate": 6.082380385288967e-05, + "loss": 0.8642, + "num_input_tokens_seen": 90070976, + "step": 5598 + }, + { + "epoch": 0.3922001278380485, + "grad_norm": 3.74692440032959, + "learning_rate": 6.081680560420317e-05, + "loss": 0.9807, + "num_input_tokens_seen": 90087360, + "step": 5599 + }, + { + "epoch": 0.3922701760837777, + "grad_norm": 5.602208614349365, + "learning_rate": 6.080980735551665e-05, + "loss": 1.2451, + "num_input_tokens_seen": 90103744, + "step": 5600 + }, + { + "epoch": 0.3922701760837777, + "eval_loss": 1.127113699913025, + "eval_runtime": 0.2033, + "eval_samples_per_second": 4.919, + "eval_steps_per_second": 4.919, + "num_input_tokens_seen": 90103744, + "step": 5600 + }, + { + "epoch": 0.39234022432950694, + "grad_norm": 3.727559804916382, + "learning_rate": 6.080280910683012e-05, + "loss": 1.0777, + "num_input_tokens_seen": 90119400, + "step": 5601 + }, + { + "epoch": 0.3924102725752362, + "grad_norm": 4.577515125274658, + "learning_rate": 6.0795810858143606e-05, + "loss": 1.0032, + "num_input_tokens_seen": 90135168, + "step": 5602 + }, + { + "epoch": 0.39248032082096546, + "grad_norm": 5.225588798522949, + "learning_rate": 6.07888126094571e-05, + "loss": 1.0964, + "num_input_tokens_seen": 90151480, + "step": 5603 + }, + { + "epoch": 0.39255036906669466, + "grad_norm": 3.6131844520568848, + "learning_rate": 6.078181436077057e-05, + "loss": 0.9255, + "num_input_tokens_seen": 90167864, + "step": 5604 + }, + { + "epoch": 0.3926204173124239, + "grad_norm": 4.127248287200928, + "learning_rate": 6.0774816112084065e-05, + "loss": 1.1939, + "num_input_tokens_seen": 90184248, + "step": 5605 + }, + { + "epoch": 0.3926904655581532, + "grad_norm": 4.599911689758301, + "learning_rate": 6.076781786339756e-05, + "loss": 1.2819, + "num_input_tokens_seen": 90199816, + "step": 5606 + }, + { + "epoch": 0.39276051380388244, + "grad_norm": 3.7179722785949707, + "learning_rate": 6.076081961471104e-05, + "loss": 0.8559, + "num_input_tokens_seen": 90215640, + "step": 5607 + }, + { + "epoch": 0.39283056204961164, + "grad_norm": 3.872941493988037, + "learning_rate": 6.0753821366024524e-05, + "loss": 0.9986, + "num_input_tokens_seen": 90232024, + "step": 5608 + }, + { + "epoch": 0.3929006102953409, + "grad_norm": 3.7326548099517822, + "learning_rate": 6.074682311733801e-05, + "loss": 1.0295, + "num_input_tokens_seen": 90247536, + "step": 5609 + }, + { + "epoch": 0.39297065854107016, + "grad_norm": 4.05418062210083, + "learning_rate": 6.0739824868651494e-05, + "loss": 1.2199, + "num_input_tokens_seen": 90263920, + "step": 5610 + }, + { + "epoch": 0.3930407067867994, + "grad_norm": 5.326319694519043, + "learning_rate": 6.073282661996497e-05, + "loss": 1.0705, + "num_input_tokens_seen": 90280240, + "step": 5611 + }, + { + "epoch": 0.3931107550325287, + "grad_norm": 4.132864952087402, + "learning_rate": 6.072582837127846e-05, + "loss": 1.2377, + "num_input_tokens_seen": 90296128, + "step": 5612 + }, + { + "epoch": 0.3931808032782579, + "grad_norm": 3.7307562828063965, + "learning_rate": 6.0718830122591953e-05, + "loss": 0.9765, + "num_input_tokens_seen": 90312512, + "step": 5613 + }, + { + "epoch": 0.39325085152398714, + "grad_norm": 6.35123872756958, + "learning_rate": 6.0711831873905435e-05, + "loss": 0.9049, + "num_input_tokens_seen": 90328896, + "step": 5614 + }, + { + "epoch": 0.3933208997697164, + "grad_norm": 5.536827564239502, + "learning_rate": 6.070483362521892e-05, + "loss": 0.952, + "num_input_tokens_seen": 90344648, + "step": 5615 + }, + { + "epoch": 0.39339094801544566, + "grad_norm": 3.6394944190979004, + "learning_rate": 6.0697835376532406e-05, + "loss": 0.9446, + "num_input_tokens_seen": 90361032, + "step": 5616 + }, + { + "epoch": 0.39346099626117487, + "grad_norm": 3.4719443321228027, + "learning_rate": 6.069083712784589e-05, + "loss": 0.9859, + "num_input_tokens_seen": 90377192, + "step": 5617 + }, + { + "epoch": 0.3935310445069041, + "grad_norm": 6.195781230926514, + "learning_rate": 6.068383887915936e-05, + "loss": 0.9482, + "num_input_tokens_seen": 90393576, + "step": 5618 + }, + { + "epoch": 0.3936010927526334, + "grad_norm": 3.5569331645965576, + "learning_rate": 6.0676840630472865e-05, + "loss": 1.0244, + "num_input_tokens_seen": 90409960, + "step": 5619 + }, + { + "epoch": 0.39367114099836265, + "grad_norm": 5.535704135894775, + "learning_rate": 6.0669842381786346e-05, + "loss": 1.3058, + "num_input_tokens_seen": 90426216, + "step": 5620 + }, + { + "epoch": 0.39374118924409185, + "grad_norm": 3.81278395652771, + "learning_rate": 6.0662844133099815e-05, + "loss": 0.9343, + "num_input_tokens_seen": 90442320, + "step": 5621 + }, + { + "epoch": 0.3938112374898211, + "grad_norm": 4.093146324157715, + "learning_rate": 6.065584588441331e-05, + "loss": 1.0698, + "num_input_tokens_seen": 90458704, + "step": 5622 + }, + { + "epoch": 0.39388128573555037, + "grad_norm": 6.061161518096924, + "learning_rate": 6.06488476357268e-05, + "loss": 0.9046, + "num_input_tokens_seen": 90474408, + "step": 5623 + }, + { + "epoch": 0.39395133398127963, + "grad_norm": 3.763059616088867, + "learning_rate": 6.064184938704028e-05, + "loss": 1.002, + "num_input_tokens_seen": 90490592, + "step": 5624 + }, + { + "epoch": 0.39402138222700883, + "grad_norm": 5.287941932678223, + "learning_rate": 6.063485113835376e-05, + "loss": 1.0667, + "num_input_tokens_seen": 90506568, + "step": 5625 + }, + { + "epoch": 0.3940914304727381, + "grad_norm": 3.5837693214416504, + "learning_rate": 6.062785288966726e-05, + "loss": 1.1136, + "num_input_tokens_seen": 90522952, + "step": 5626 + }, + { + "epoch": 0.39416147871846735, + "grad_norm": 3.5523111820220947, + "learning_rate": 6.062085464098074e-05, + "loss": 1.0703, + "num_input_tokens_seen": 90539336, + "step": 5627 + }, + { + "epoch": 0.3942315269641966, + "grad_norm": 4.3880934715271, + "learning_rate": 6.061385639229421e-05, + "loss": 0.993, + "num_input_tokens_seen": 90555720, + "step": 5628 + }, + { + "epoch": 0.3943015752099258, + "grad_norm": 4.26425313949585, + "learning_rate": 6.060685814360772e-05, + "loss": 1.08, + "num_input_tokens_seen": 90572104, + "step": 5629 + }, + { + "epoch": 0.3943716234556551, + "grad_norm": 3.8837990760803223, + "learning_rate": 6.05998598949212e-05, + "loss": 0.9414, + "num_input_tokens_seen": 90588488, + "step": 5630 + }, + { + "epoch": 0.39444167170138433, + "grad_norm": 3.704282522201538, + "learning_rate": 6.059286164623468e-05, + "loss": 1.0037, + "num_input_tokens_seen": 90604872, + "step": 5631 + }, + { + "epoch": 0.3945117199471136, + "grad_norm": 5.933957099914551, + "learning_rate": 6.058586339754816e-05, + "loss": 1.0753, + "num_input_tokens_seen": 90621256, + "step": 5632 + }, + { + "epoch": 0.3945817681928428, + "grad_norm": 4.185206413269043, + "learning_rate": 6.057886514886165e-05, + "loss": 1.0044, + "num_input_tokens_seen": 90637640, + "step": 5633 + }, + { + "epoch": 0.39465181643857206, + "grad_norm": 3.97603440284729, + "learning_rate": 6.057186690017513e-05, + "loss": 1.2243, + "num_input_tokens_seen": 90654024, + "step": 5634 + }, + { + "epoch": 0.3947218646843013, + "grad_norm": 3.394630193710327, + "learning_rate": 6.056486865148863e-05, + "loss": 0.9702, + "num_input_tokens_seen": 90670008, + "step": 5635 + }, + { + "epoch": 0.3947919129300306, + "grad_norm": 3.810899019241333, + "learning_rate": 6.055787040280211e-05, + "loss": 0.9998, + "num_input_tokens_seen": 90686392, + "step": 5636 + }, + { + "epoch": 0.3948619611757598, + "grad_norm": 4.237402439117432, + "learning_rate": 6.055087215411559e-05, + "loss": 1.1013, + "num_input_tokens_seen": 90702776, + "step": 5637 + }, + { + "epoch": 0.39493200942148904, + "grad_norm": 5.481308937072754, + "learning_rate": 6.054387390542907e-05, + "loss": 1.0064, + "num_input_tokens_seen": 90718312, + "step": 5638 + }, + { + "epoch": 0.3950020576672183, + "grad_norm": 3.582808017730713, + "learning_rate": 6.053687565674256e-05, + "loss": 0.9305, + "num_input_tokens_seen": 90733856, + "step": 5639 + }, + { + "epoch": 0.39507210591294756, + "grad_norm": 3.9277966022491455, + "learning_rate": 6.0529877408056044e-05, + "loss": 1.0986, + "num_input_tokens_seen": 90750240, + "step": 5640 + }, + { + "epoch": 0.39514215415867676, + "grad_norm": 4.61000394821167, + "learning_rate": 6.0522879159369526e-05, + "loss": 1.1705, + "num_input_tokens_seen": 90766352, + "step": 5641 + }, + { + "epoch": 0.395212202404406, + "grad_norm": 4.445149898529053, + "learning_rate": 6.051588091068301e-05, + "loss": 1.0809, + "num_input_tokens_seen": 90782736, + "step": 5642 + }, + { + "epoch": 0.3952822506501353, + "grad_norm": 4.652968406677246, + "learning_rate": 6.0508882661996516e-05, + "loss": 0.9761, + "num_input_tokens_seen": 90799120, + "step": 5643 + }, + { + "epoch": 0.39535229889586454, + "grad_norm": 4.172330856323242, + "learning_rate": 6.0501884413309985e-05, + "loss": 1.0637, + "num_input_tokens_seen": 90815504, + "step": 5644 + }, + { + "epoch": 0.3954223471415938, + "grad_norm": 3.647385358810425, + "learning_rate": 6.0494886164623466e-05, + "loss": 0.9284, + "num_input_tokens_seen": 90831888, + "step": 5645 + }, + { + "epoch": 0.395492395387323, + "grad_norm": 3.9353525638580322, + "learning_rate": 6.0487887915936955e-05, + "loss": 1.1498, + "num_input_tokens_seen": 90848048, + "step": 5646 + }, + { + "epoch": 0.39556244363305226, + "grad_norm": 4.216567039489746, + "learning_rate": 6.048088966725044e-05, + "loss": 1.1247, + "num_input_tokens_seen": 90863576, + "step": 5647 + }, + { + "epoch": 0.3956324918787815, + "grad_norm": 5.031260013580322, + "learning_rate": 6.047389141856392e-05, + "loss": 1.1314, + "num_input_tokens_seen": 90879960, + "step": 5648 + }, + { + "epoch": 0.3957025401245108, + "grad_norm": 4.927192211151123, + "learning_rate": 6.0466893169877414e-05, + "loss": 1.0977, + "num_input_tokens_seen": 90896344, + "step": 5649 + }, + { + "epoch": 0.39577258837024, + "grad_norm": 4.589445114135742, + "learning_rate": 6.045989492119091e-05, + "loss": 1.0233, + "num_input_tokens_seen": 90912728, + "step": 5650 + }, + { + "epoch": 0.39584263661596925, + "grad_norm": 3.5707035064697266, + "learning_rate": 6.045289667250438e-05, + "loss": 0.9732, + "num_input_tokens_seen": 90929112, + "step": 5651 + }, + { + "epoch": 0.3959126848616985, + "grad_norm": 3.637237787246704, + "learning_rate": 6.044589842381787e-05, + "loss": 1.2063, + "num_input_tokens_seen": 90945376, + "step": 5652 + }, + { + "epoch": 0.39598273310742776, + "grad_norm": 4.068975448608398, + "learning_rate": 6.043890017513136e-05, + "loss": 1.0301, + "num_input_tokens_seen": 90961448, + "step": 5653 + }, + { + "epoch": 0.39605278135315697, + "grad_norm": 3.8378570079803467, + "learning_rate": 6.0431901926444837e-05, + "loss": 0.9195, + "num_input_tokens_seen": 90977832, + "step": 5654 + }, + { + "epoch": 0.39612282959888623, + "grad_norm": 4.5788092613220215, + "learning_rate": 6.042490367775832e-05, + "loss": 1.1601, + "num_input_tokens_seen": 90993296, + "step": 5655 + }, + { + "epoch": 0.3961928778446155, + "grad_norm": 3.7392847537994385, + "learning_rate": 6.041790542907181e-05, + "loss": 1.0081, + "num_input_tokens_seen": 91009680, + "step": 5656 + }, + { + "epoch": 0.39626292609034475, + "grad_norm": 5.830812931060791, + "learning_rate": 6.04109071803853e-05, + "loss": 1.0544, + "num_input_tokens_seen": 91025400, + "step": 5657 + }, + { + "epoch": 0.39633297433607395, + "grad_norm": 3.7372663021087646, + "learning_rate": 6.040390893169877e-05, + "loss": 1.1403, + "num_input_tokens_seen": 91041784, + "step": 5658 + }, + { + "epoch": 0.3964030225818032, + "grad_norm": 3.756762981414795, + "learning_rate": 6.0396910683012266e-05, + "loss": 1.0175, + "num_input_tokens_seen": 91058168, + "step": 5659 + }, + { + "epoch": 0.39647307082753247, + "grad_norm": 3.659280776977539, + "learning_rate": 6.0389912434325755e-05, + "loss": 1.0396, + "num_input_tokens_seen": 91074288, + "step": 5660 + }, + { + "epoch": 0.39654311907326173, + "grad_norm": 4.339829921722412, + "learning_rate": 6.038291418563923e-05, + "loss": 1.2101, + "num_input_tokens_seen": 91090024, + "step": 5661 + }, + { + "epoch": 0.39661316731899093, + "grad_norm": 4.062867641448975, + "learning_rate": 6.037591593695272e-05, + "loss": 0.9874, + "num_input_tokens_seen": 91106408, + "step": 5662 + }, + { + "epoch": 0.3966832155647202, + "grad_norm": 4.45166015625, + "learning_rate": 6.0368917688266214e-05, + "loss": 0.9504, + "num_input_tokens_seen": 91122648, + "step": 5663 + }, + { + "epoch": 0.39675326381044945, + "grad_norm": 3.4350759983062744, + "learning_rate": 6.036191943957968e-05, + "loss": 0.8876, + "num_input_tokens_seen": 91138200, + "step": 5664 + }, + { + "epoch": 0.3968233120561787, + "grad_norm": 3.5637154579162598, + "learning_rate": 6.0354921190893164e-05, + "loss": 1.0616, + "num_input_tokens_seen": 91154584, + "step": 5665 + }, + { + "epoch": 0.3968933603019079, + "grad_norm": 3.8793985843658447, + "learning_rate": 6.034792294220666e-05, + "loss": 1.0933, + "num_input_tokens_seen": 91170968, + "step": 5666 + }, + { + "epoch": 0.3969634085476372, + "grad_norm": 4.1613545417785645, + "learning_rate": 6.0340924693520154e-05, + "loss": 1.0895, + "num_input_tokens_seen": 91185856, + "step": 5667 + }, + { + "epoch": 0.39703345679336643, + "grad_norm": 8.646449089050293, + "learning_rate": 6.033392644483362e-05, + "loss": 1.1391, + "num_input_tokens_seen": 91201088, + "step": 5668 + }, + { + "epoch": 0.3971035050390957, + "grad_norm": 4.862243175506592, + "learning_rate": 6.032692819614711e-05, + "loss": 1.1871, + "num_input_tokens_seen": 91217472, + "step": 5669 + }, + { + "epoch": 0.3971735532848249, + "grad_norm": 4.013809680938721, + "learning_rate": 6.0319929947460607e-05, + "loss": 1.0307, + "num_input_tokens_seen": 91233760, + "step": 5670 + }, + { + "epoch": 0.39724360153055416, + "grad_norm": 4.664083480834961, + "learning_rate": 6.0312931698774075e-05, + "loss": 1.1507, + "num_input_tokens_seen": 91250144, + "step": 5671 + }, + { + "epoch": 0.3973136497762834, + "grad_norm": 4.330606937408447, + "learning_rate": 6.030593345008756e-05, + "loss": 1.1741, + "num_input_tokens_seen": 91264592, + "step": 5672 + }, + { + "epoch": 0.3973836980220127, + "grad_norm": 4.158743381500244, + "learning_rate": 6.0298935201401066e-05, + "loss": 0.8608, + "num_input_tokens_seen": 91280520, + "step": 5673 + }, + { + "epoch": 0.3974537462677419, + "grad_norm": 3.800955057144165, + "learning_rate": 6.029193695271455e-05, + "loss": 0.9451, + "num_input_tokens_seen": 91296472, + "step": 5674 + }, + { + "epoch": 0.39752379451347114, + "grad_norm": 4.307434558868408, + "learning_rate": 6.028493870402803e-05, + "loss": 1.0936, + "num_input_tokens_seen": 91312856, + "step": 5675 + }, + { + "epoch": 0.3975938427592004, + "grad_norm": 4.052398204803467, + "learning_rate": 6.027794045534151e-05, + "loss": 1.006, + "num_input_tokens_seen": 91329216, + "step": 5676 + }, + { + "epoch": 0.39766389100492966, + "grad_norm": 4.665764331817627, + "learning_rate": 6.0270942206655e-05, + "loss": 1.1573, + "num_input_tokens_seen": 91345184, + "step": 5677 + }, + { + "epoch": 0.3977339392506589, + "grad_norm": 4.070000648498535, + "learning_rate": 6.0263943957968475e-05, + "loss": 1.1296, + "num_input_tokens_seen": 91361568, + "step": 5678 + }, + { + "epoch": 0.3978039874963881, + "grad_norm": 4.304214954376221, + "learning_rate": 6.025694570928198e-05, + "loss": 0.8688, + "num_input_tokens_seen": 91376656, + "step": 5679 + }, + { + "epoch": 0.3978740357421174, + "grad_norm": 4.051540374755859, + "learning_rate": 6.024994746059546e-05, + "loss": 1.0862, + "num_input_tokens_seen": 91393040, + "step": 5680 + }, + { + "epoch": 0.39794408398784664, + "grad_norm": 3.986542224884033, + "learning_rate": 6.024294921190894e-05, + "loss": 1.1477, + "num_input_tokens_seen": 91408208, + "step": 5681 + }, + { + "epoch": 0.3980141322335759, + "grad_norm": 4.302114963531494, + "learning_rate": 6.023595096322242e-05, + "loss": 0.8569, + "num_input_tokens_seen": 91424592, + "step": 5682 + }, + { + "epoch": 0.3980841804793051, + "grad_norm": 6.019785404205322, + "learning_rate": 6.0228952714535904e-05, + "loss": 1.2166, + "num_input_tokens_seen": 91440976, + "step": 5683 + }, + { + "epoch": 0.39815422872503436, + "grad_norm": 3.667469024658203, + "learning_rate": 6.022195446584939e-05, + "loss": 1.0587, + "num_input_tokens_seen": 91457360, + "step": 5684 + }, + { + "epoch": 0.3982242769707636, + "grad_norm": 4.30043363571167, + "learning_rate": 6.0214956217162874e-05, + "loss": 1.0923, + "num_input_tokens_seen": 91473744, + "step": 5685 + }, + { + "epoch": 0.3982943252164929, + "grad_norm": 3.7446558475494385, + "learning_rate": 6.0207957968476356e-05, + "loss": 1.1302, + "num_input_tokens_seen": 91489544, + "step": 5686 + }, + { + "epoch": 0.3983643734622221, + "grad_norm": 3.722567081451416, + "learning_rate": 6.020095971978985e-05, + "loss": 0.9913, + "num_input_tokens_seen": 91505584, + "step": 5687 + }, + { + "epoch": 0.39843442170795135, + "grad_norm": 4.311237812042236, + "learning_rate": 6.019396147110332e-05, + "loss": 1.0352, + "num_input_tokens_seen": 91520656, + "step": 5688 + }, + { + "epoch": 0.3985044699536806, + "grad_norm": 5.650984764099121, + "learning_rate": 6.018696322241683e-05, + "loss": 1.2826, + "num_input_tokens_seen": 91537040, + "step": 5689 + }, + { + "epoch": 0.39857451819940987, + "grad_norm": 4.106716632843018, + "learning_rate": 6.017996497373031e-05, + "loss": 1.0234, + "num_input_tokens_seen": 91553424, + "step": 5690 + }, + { + "epoch": 0.39864456644513907, + "grad_norm": 3.893007516860962, + "learning_rate": 6.0172966725043786e-05, + "loss": 1.1175, + "num_input_tokens_seen": 91569184, + "step": 5691 + }, + { + "epoch": 0.39871461469086833, + "grad_norm": 3.6435177326202393, + "learning_rate": 6.016596847635727e-05, + "loss": 0.9446, + "num_input_tokens_seen": 91584832, + "step": 5692 + }, + { + "epoch": 0.3987846629365976, + "grad_norm": 3.639324188232422, + "learning_rate": 6.015897022767075e-05, + "loss": 1.0866, + "num_input_tokens_seen": 91601216, + "step": 5693 + }, + { + "epoch": 0.39885471118232685, + "grad_norm": 3.680997848510742, + "learning_rate": 6.0151971978984245e-05, + "loss": 1.1352, + "num_input_tokens_seen": 91617600, + "step": 5694 + }, + { + "epoch": 0.39892475942805605, + "grad_norm": 5.37217903137207, + "learning_rate": 6.0144973730297726e-05, + "loss": 1.1507, + "num_input_tokens_seen": 91633760, + "step": 5695 + }, + { + "epoch": 0.3989948076737853, + "grad_norm": 3.6297101974487305, + "learning_rate": 6.013797548161122e-05, + "loss": 0.8412, + "num_input_tokens_seen": 91649536, + "step": 5696 + }, + { + "epoch": 0.39906485591951457, + "grad_norm": 8.193422317504883, + "learning_rate": 6.0130977232924704e-05, + "loss": 1.1533, + "num_input_tokens_seen": 91665920, + "step": 5697 + }, + { + "epoch": 0.39913490416524383, + "grad_norm": 3.6126644611358643, + "learning_rate": 6.0123978984238185e-05, + "loss": 0.8261, + "num_input_tokens_seen": 91682272, + "step": 5698 + }, + { + "epoch": 0.39920495241097304, + "grad_norm": 4.277047634124756, + "learning_rate": 6.011698073555167e-05, + "loss": 1.0904, + "num_input_tokens_seen": 91698656, + "step": 5699 + }, + { + "epoch": 0.3992750006567023, + "grad_norm": 4.661556720733643, + "learning_rate": 6.0109982486865156e-05, + "loss": 1.1948, + "num_input_tokens_seen": 91715040, + "step": 5700 + }, + { + "epoch": 0.39934504890243155, + "grad_norm": 4.143563270568848, + "learning_rate": 6.010298423817864e-05, + "loss": 1.0421, + "num_input_tokens_seen": 91731424, + "step": 5701 + }, + { + "epoch": 0.3994150971481608, + "grad_norm": 5.750835418701172, + "learning_rate": 6.009598598949212e-05, + "loss": 1.0594, + "num_input_tokens_seen": 91747808, + "step": 5702 + }, + { + "epoch": 0.39948514539389, + "grad_norm": 4.048924446105957, + "learning_rate": 6.0088987740805615e-05, + "loss": 1.0488, + "num_input_tokens_seen": 91764192, + "step": 5703 + }, + { + "epoch": 0.3995551936396193, + "grad_norm": 3.7284796237945557, + "learning_rate": 6.00819894921191e-05, + "loss": 0.9723, + "num_input_tokens_seen": 91780576, + "step": 5704 + }, + { + "epoch": 0.39962524188534854, + "grad_norm": 4.041873931884766, + "learning_rate": 6.007499124343258e-05, + "loss": 0.9354, + "num_input_tokens_seen": 91796960, + "step": 5705 + }, + { + "epoch": 0.3996952901310778, + "grad_norm": 4.37992000579834, + "learning_rate": 6.006799299474607e-05, + "loss": 1.3032, + "num_input_tokens_seen": 91813344, + "step": 5706 + }, + { + "epoch": 0.399765338376807, + "grad_norm": 4.49924373626709, + "learning_rate": 6.006099474605955e-05, + "loss": 0.9709, + "num_input_tokens_seen": 91829728, + "step": 5707 + }, + { + "epoch": 0.39983538662253626, + "grad_norm": 4.609983444213867, + "learning_rate": 6.005399649737303e-05, + "loss": 1.0591, + "num_input_tokens_seen": 91844968, + "step": 5708 + }, + { + "epoch": 0.3999054348682655, + "grad_norm": 4.1750006675720215, + "learning_rate": 6.004699824868651e-05, + "loss": 1.1413, + "num_input_tokens_seen": 91860464, + "step": 5709 + }, + { + "epoch": 0.3999754831139948, + "grad_norm": 4.009062767028809, + "learning_rate": 6.0039999999999994e-05, + "loss": 1.0244, + "num_input_tokens_seen": 91876848, + "step": 5710 + }, + { + "epoch": 0.400045531359724, + "grad_norm": 4.0535078048706055, + "learning_rate": 6.003300175131349e-05, + "loss": 1.0851, + "num_input_tokens_seen": 91892192, + "step": 5711 + }, + { + "epoch": 0.40011557960545324, + "grad_norm": 4.592657089233398, + "learning_rate": 6.002600350262697e-05, + "loss": 0.8823, + "num_input_tokens_seen": 91907040, + "step": 5712 + }, + { + "epoch": 0.4001856278511825, + "grad_norm": 3.87369966506958, + "learning_rate": 6.001900525394046e-05, + "loss": 1.153, + "num_input_tokens_seen": 91922712, + "step": 5713 + }, + { + "epoch": 0.40025567609691176, + "grad_norm": 3.93766713142395, + "learning_rate": 6.001200700525394e-05, + "loss": 1.0225, + "num_input_tokens_seen": 91939096, + "step": 5714 + }, + { + "epoch": 0.400325724342641, + "grad_norm": 3.823153018951416, + "learning_rate": 6.0005008756567424e-05, + "loss": 0.9229, + "num_input_tokens_seen": 91955016, + "step": 5715 + }, + { + "epoch": 0.4003957725883702, + "grad_norm": 3.5592081546783447, + "learning_rate": 5.999801050788092e-05, + "loss": 0.9163, + "num_input_tokens_seen": 91971400, + "step": 5716 + }, + { + "epoch": 0.4004658208340995, + "grad_norm": 3.8749887943267822, + "learning_rate": 5.9991012259194414e-05, + "loss": 1.0194, + "num_input_tokens_seen": 91987784, + "step": 5717 + }, + { + "epoch": 0.40053586907982874, + "grad_norm": 4.749402046203613, + "learning_rate": 5.998401401050788e-05, + "loss": 0.9442, + "num_input_tokens_seen": 92003592, + "step": 5718 + }, + { + "epoch": 0.400605917325558, + "grad_norm": 4.114437580108643, + "learning_rate": 5.9977015761821365e-05, + "loss": 1.0576, + "num_input_tokens_seen": 92019640, + "step": 5719 + }, + { + "epoch": 0.4006759655712872, + "grad_norm": 3.3297617435455322, + "learning_rate": 5.997001751313486e-05, + "loss": 0.8776, + "num_input_tokens_seen": 92035544, + "step": 5720 + }, + { + "epoch": 0.40074601381701647, + "grad_norm": 4.204908847808838, + "learning_rate": 5.996301926444834e-05, + "loss": 1.141, + "num_input_tokens_seen": 92051840, + "step": 5721 + }, + { + "epoch": 0.4008160620627457, + "grad_norm": 4.198369979858398, + "learning_rate": 5.9956021015761824e-05, + "loss": 1.2653, + "num_input_tokens_seen": 92068224, + "step": 5722 + }, + { + "epoch": 0.400886110308475, + "grad_norm": 4.46641206741333, + "learning_rate": 5.994902276707531e-05, + "loss": 1.0866, + "num_input_tokens_seen": 92083656, + "step": 5723 + }, + { + "epoch": 0.4009561585542042, + "grad_norm": 4.2217535972595215, + "learning_rate": 5.994202451838881e-05, + "loss": 1.066, + "num_input_tokens_seen": 92100040, + "step": 5724 + }, + { + "epoch": 0.40102620679993345, + "grad_norm": 4.484360218048096, + "learning_rate": 5.993502626970229e-05, + "loss": 1.1031, + "num_input_tokens_seen": 92115592, + "step": 5725 + }, + { + "epoch": 0.4010962550456627, + "grad_norm": 4.69040060043335, + "learning_rate": 5.992802802101576e-05, + "loss": 1.1487, + "num_input_tokens_seen": 92131280, + "step": 5726 + }, + { + "epoch": 0.40116630329139197, + "grad_norm": 3.8119077682495117, + "learning_rate": 5.992102977232926e-05, + "loss": 1.1336, + "num_input_tokens_seen": 92147664, + "step": 5727 + }, + { + "epoch": 0.40123635153712117, + "grad_norm": 4.186896800994873, + "learning_rate": 5.9914031523642735e-05, + "loss": 0.9449, + "num_input_tokens_seen": 92164048, + "step": 5728 + }, + { + "epoch": 0.40130639978285043, + "grad_norm": 4.658702850341797, + "learning_rate": 5.9907033274956217e-05, + "loss": 1.1733, + "num_input_tokens_seen": 92180432, + "step": 5729 + }, + { + "epoch": 0.4013764480285797, + "grad_norm": 3.8305857181549072, + "learning_rate": 5.9900035026269705e-05, + "loss": 1.0041, + "num_input_tokens_seen": 92196816, + "step": 5730 + }, + { + "epoch": 0.40144649627430895, + "grad_norm": 4.063295364379883, + "learning_rate": 5.989303677758319e-05, + "loss": 1.1743, + "num_input_tokens_seen": 92212928, + "step": 5731 + }, + { + "epoch": 0.40151654452003815, + "grad_norm": 6.850064277648926, + "learning_rate": 5.988603852889667e-05, + "loss": 1.0967, + "num_input_tokens_seen": 92229312, + "step": 5732 + }, + { + "epoch": 0.4015865927657674, + "grad_norm": 5.065973281860352, + "learning_rate": 5.9879040280210164e-05, + "loss": 1.0513, + "num_input_tokens_seen": 92245696, + "step": 5733 + }, + { + "epoch": 0.4016566410114967, + "grad_norm": 4.02882719039917, + "learning_rate": 5.987204203152366e-05, + "loss": 0.934, + "num_input_tokens_seen": 92261936, + "step": 5734 + }, + { + "epoch": 0.40172668925722593, + "grad_norm": 3.9505250453948975, + "learning_rate": 5.986504378283714e-05, + "loss": 0.9871, + "num_input_tokens_seen": 92278320, + "step": 5735 + }, + { + "epoch": 0.40179673750295514, + "grad_norm": 4.949488162994385, + "learning_rate": 5.985804553415061e-05, + "loss": 0.9759, + "num_input_tokens_seen": 92294664, + "step": 5736 + }, + { + "epoch": 0.4018667857486844, + "grad_norm": 3.614008903503418, + "learning_rate": 5.98510472854641e-05, + "loss": 1.006, + "num_input_tokens_seen": 92311048, + "step": 5737 + }, + { + "epoch": 0.40193683399441366, + "grad_norm": 3.739224433898926, + "learning_rate": 5.984404903677758e-05, + "loss": 0.8997, + "num_input_tokens_seen": 92327432, + "step": 5738 + }, + { + "epoch": 0.4020068822401429, + "grad_norm": 3.6126298904418945, + "learning_rate": 5.983705078809106e-05, + "loss": 1.0062, + "num_input_tokens_seen": 92343816, + "step": 5739 + }, + { + "epoch": 0.4020769304858721, + "grad_norm": 4.304609298706055, + "learning_rate": 5.983005253940456e-05, + "loss": 1.0691, + "num_input_tokens_seen": 92358872, + "step": 5740 + }, + { + "epoch": 0.4021469787316014, + "grad_norm": 4.121729850769043, + "learning_rate": 5.982305429071805e-05, + "loss": 1.1047, + "num_input_tokens_seen": 92374960, + "step": 5741 + }, + { + "epoch": 0.40221702697733064, + "grad_norm": 4.137178421020508, + "learning_rate": 5.9816056042031534e-05, + "loss": 1.0809, + "num_input_tokens_seen": 92391344, + "step": 5742 + }, + { + "epoch": 0.4022870752230599, + "grad_norm": 8.537243843078613, + "learning_rate": 5.9809057793345016e-05, + "loss": 1.1571, + "num_input_tokens_seen": 92406096, + "step": 5743 + }, + { + "epoch": 0.4023571234687891, + "grad_norm": 4.613489627838135, + "learning_rate": 5.9802059544658505e-05, + "loss": 1.126, + "num_input_tokens_seen": 92422480, + "step": 5744 + }, + { + "epoch": 0.40242717171451836, + "grad_norm": 4.812812805175781, + "learning_rate": 5.9795061295971987e-05, + "loss": 1.2987, + "num_input_tokens_seen": 92437472, + "step": 5745 + }, + { + "epoch": 0.4024972199602476, + "grad_norm": 4.785153865814209, + "learning_rate": 5.978806304728546e-05, + "loss": 0.9517, + "num_input_tokens_seen": 92452560, + "step": 5746 + }, + { + "epoch": 0.4025672682059769, + "grad_norm": 4.450865268707275, + "learning_rate": 5.978106479859895e-05, + "loss": 0.9144, + "num_input_tokens_seen": 92468312, + "step": 5747 + }, + { + "epoch": 0.40263731645170614, + "grad_norm": 4.854867935180664, + "learning_rate": 5.9774066549912446e-05, + "loss": 1.1128, + "num_input_tokens_seen": 92484616, + "step": 5748 + }, + { + "epoch": 0.40270736469743534, + "grad_norm": 4.00141716003418, + "learning_rate": 5.976706830122593e-05, + "loss": 1.1298, + "num_input_tokens_seen": 92500568, + "step": 5749 + }, + { + "epoch": 0.4027774129431646, + "grad_norm": 3.819101333618164, + "learning_rate": 5.976007005253941e-05, + "loss": 0.9113, + "num_input_tokens_seen": 92515952, + "step": 5750 + }, + { + "epoch": 0.40284746118889386, + "grad_norm": 3.7954423427581787, + "learning_rate": 5.97530718038529e-05, + "loss": 1.1873, + "num_input_tokens_seen": 92532336, + "step": 5751 + }, + { + "epoch": 0.4029175094346231, + "grad_norm": 4.081971645355225, + "learning_rate": 5.974607355516638e-05, + "loss": 1.1075, + "num_input_tokens_seen": 92548720, + "step": 5752 + }, + { + "epoch": 0.4029875576803523, + "grad_norm": 3.834063768386841, + "learning_rate": 5.9739075306479855e-05, + "loss": 0.9963, + "num_input_tokens_seen": 92564648, + "step": 5753 + }, + { + "epoch": 0.4030576059260816, + "grad_norm": 3.6766366958618164, + "learning_rate": 5.973207705779336e-05, + "loss": 1.1064, + "num_input_tokens_seen": 92581032, + "step": 5754 + }, + { + "epoch": 0.40312765417181085, + "grad_norm": 4.423589706420898, + "learning_rate": 5.972507880910684e-05, + "loss": 1.0285, + "num_input_tokens_seen": 92596520, + "step": 5755 + }, + { + "epoch": 0.4031977024175401, + "grad_norm": 4.0820207595825195, + "learning_rate": 5.971808056042031e-05, + "loss": 1.0288, + "num_input_tokens_seen": 92612448, + "step": 5756 + }, + { + "epoch": 0.4032677506632693, + "grad_norm": 6.634023189544678, + "learning_rate": 5.97110823117338e-05, + "loss": 1.1607, + "num_input_tokens_seen": 92627736, + "step": 5757 + }, + { + "epoch": 0.40333779890899857, + "grad_norm": 3.517611026763916, + "learning_rate": 5.970408406304729e-05, + "loss": 0.9292, + "num_input_tokens_seen": 92643216, + "step": 5758 + }, + { + "epoch": 0.4034078471547278, + "grad_norm": 3.7658562660217285, + "learning_rate": 5.969708581436077e-05, + "loss": 1.052, + "num_input_tokens_seen": 92659248, + "step": 5759 + }, + { + "epoch": 0.4034778954004571, + "grad_norm": 4.128793716430664, + "learning_rate": 5.9690087565674254e-05, + "loss": 1.1067, + "num_input_tokens_seen": 92675632, + "step": 5760 + }, + { + "epoch": 0.4035479436461863, + "grad_norm": 4.200130939483643, + "learning_rate": 5.968308931698775e-05, + "loss": 0.9264, + "num_input_tokens_seen": 92691408, + "step": 5761 + }, + { + "epoch": 0.40361799189191555, + "grad_norm": 4.131740093231201, + "learning_rate": 5.967609106830123e-05, + "loss": 1.0687, + "num_input_tokens_seen": 92706568, + "step": 5762 + }, + { + "epoch": 0.4036880401376448, + "grad_norm": 4.074241638183594, + "learning_rate": 5.96690928196147e-05, + "loss": 1.0122, + "num_input_tokens_seen": 92722952, + "step": 5763 + }, + { + "epoch": 0.40375808838337407, + "grad_norm": 4.351722717285156, + "learning_rate": 5.9662094570928195e-05, + "loss": 0.8518, + "num_input_tokens_seen": 92739248, + "step": 5764 + }, + { + "epoch": 0.4038281366291033, + "grad_norm": 4.0495734214782715, + "learning_rate": 5.965509632224169e-05, + "loss": 1.1392, + "num_input_tokens_seen": 92755040, + "step": 5765 + }, + { + "epoch": 0.40389818487483253, + "grad_norm": 3.881098747253418, + "learning_rate": 5.964809807355517e-05, + "loss": 0.9899, + "num_input_tokens_seen": 92771424, + "step": 5766 + }, + { + "epoch": 0.4039682331205618, + "grad_norm": 3.964268922805786, + "learning_rate": 5.9641099824868654e-05, + "loss": 1.1636, + "num_input_tokens_seen": 92787808, + "step": 5767 + }, + { + "epoch": 0.40403828136629105, + "grad_norm": 4.126365661621094, + "learning_rate": 5.963410157618214e-05, + "loss": 1.273, + "num_input_tokens_seen": 92804192, + "step": 5768 + }, + { + "epoch": 0.40410832961202026, + "grad_norm": 4.142693519592285, + "learning_rate": 5.9627103327495625e-05, + "loss": 1.2161, + "num_input_tokens_seen": 92819920, + "step": 5769 + }, + { + "epoch": 0.4041783778577495, + "grad_norm": 4.906876087188721, + "learning_rate": 5.962010507880912e-05, + "loss": 0.9985, + "num_input_tokens_seen": 92836304, + "step": 5770 + }, + { + "epoch": 0.4042484261034788, + "grad_norm": 4.597287654876709, + "learning_rate": 5.96131068301226e-05, + "loss": 1.13, + "num_input_tokens_seen": 92852144, + "step": 5771 + }, + { + "epoch": 0.40431847434920803, + "grad_norm": 3.525669813156128, + "learning_rate": 5.9606108581436084e-05, + "loss": 1.0209, + "num_input_tokens_seen": 92867944, + "step": 5772 + }, + { + "epoch": 0.40438852259493724, + "grad_norm": 4.908353328704834, + "learning_rate": 5.9599110332749565e-05, + "loss": 1.2495, + "num_input_tokens_seen": 92884328, + "step": 5773 + }, + { + "epoch": 0.4044585708406665, + "grad_norm": 4.31436824798584, + "learning_rate": 5.959211208406305e-05, + "loss": 1.0821, + "num_input_tokens_seen": 92900152, + "step": 5774 + }, + { + "epoch": 0.40452861908639576, + "grad_norm": 3.652494430541992, + "learning_rate": 5.9585113835376536e-05, + "loss": 1.0086, + "num_input_tokens_seen": 92916416, + "step": 5775 + }, + { + "epoch": 0.404598667332125, + "grad_norm": 3.9569268226623535, + "learning_rate": 5.957811558669002e-05, + "loss": 1.0284, + "num_input_tokens_seen": 92932088, + "step": 5776 + }, + { + "epoch": 0.4046687155778542, + "grad_norm": 4.301011562347412, + "learning_rate": 5.95711173380035e-05, + "loss": 1.0994, + "num_input_tokens_seen": 92948472, + "step": 5777 + }, + { + "epoch": 0.4047387638235835, + "grad_norm": 4.0318474769592285, + "learning_rate": 5.9564119089316995e-05, + "loss": 1.0636, + "num_input_tokens_seen": 92964856, + "step": 5778 + }, + { + "epoch": 0.40480881206931274, + "grad_norm": 4.05795955657959, + "learning_rate": 5.955712084063048e-05, + "loss": 0.9947, + "num_input_tokens_seen": 92980040, + "step": 5779 + }, + { + "epoch": 0.404878860315042, + "grad_norm": 5.828601837158203, + "learning_rate": 5.9550122591943945e-05, + "loss": 1.1361, + "num_input_tokens_seen": 92996424, + "step": 5780 + }, + { + "epoch": 0.4049489085607712, + "grad_norm": 3.6617836952209473, + "learning_rate": 5.954312434325745e-05, + "loss": 0.9282, + "num_input_tokens_seen": 93012400, + "step": 5781 + }, + { + "epoch": 0.40501895680650046, + "grad_norm": 3.8668923377990723, + "learning_rate": 5.953612609457093e-05, + "loss": 1.2368, + "num_input_tokens_seen": 93028176, + "step": 5782 + }, + { + "epoch": 0.4050890050522297, + "grad_norm": 6.206475257873535, + "learning_rate": 5.952912784588441e-05, + "loss": 1.133, + "num_input_tokens_seen": 93044560, + "step": 5783 + }, + { + "epoch": 0.405159053297959, + "grad_norm": 3.6768481731414795, + "learning_rate": 5.952212959719789e-05, + "loss": 1.0141, + "num_input_tokens_seen": 93060944, + "step": 5784 + }, + { + "epoch": 0.40522910154368824, + "grad_norm": 3.8317768573760986, + "learning_rate": 5.95151313485114e-05, + "loss": 1.1, + "num_input_tokens_seen": 93077328, + "step": 5785 + }, + { + "epoch": 0.40529914978941745, + "grad_norm": 5.8504252433776855, + "learning_rate": 5.950813309982487e-05, + "loss": 1.1581, + "num_input_tokens_seen": 93092016, + "step": 5786 + }, + { + "epoch": 0.4053691980351467, + "grad_norm": 3.6718640327453613, + "learning_rate": 5.9501134851138365e-05, + "loss": 0.9614, + "num_input_tokens_seen": 93108400, + "step": 5787 + }, + { + "epoch": 0.40543924628087596, + "grad_norm": 4.16236686706543, + "learning_rate": 5.949413660245185e-05, + "loss": 1.1426, + "num_input_tokens_seen": 93124784, + "step": 5788 + }, + { + "epoch": 0.4055092945266052, + "grad_norm": 4.1938958168029785, + "learning_rate": 5.948713835376533e-05, + "loss": 1.241, + "num_input_tokens_seen": 93141168, + "step": 5789 + }, + { + "epoch": 0.40557934277233443, + "grad_norm": 3.7515947818756104, + "learning_rate": 5.948014010507881e-05, + "loss": 1.0706, + "num_input_tokens_seen": 93157552, + "step": 5790 + }, + { + "epoch": 0.4056493910180637, + "grad_norm": 3.710805654525757, + "learning_rate": 5.94731418563923e-05, + "loss": 0.9877, + "num_input_tokens_seen": 93173936, + "step": 5791 + }, + { + "epoch": 0.40571943926379295, + "grad_norm": 3.4491820335388184, + "learning_rate": 5.9466143607705794e-05, + "loss": 1.0282, + "num_input_tokens_seen": 93190320, + "step": 5792 + }, + { + "epoch": 0.4057894875095222, + "grad_norm": 3.8709781169891357, + "learning_rate": 5.945914535901926e-05, + "loss": 0.8852, + "num_input_tokens_seen": 93206648, + "step": 5793 + }, + { + "epoch": 0.4058595357552514, + "grad_norm": 5.104569435119629, + "learning_rate": 5.945214711033276e-05, + "loss": 0.9954, + "num_input_tokens_seen": 93223032, + "step": 5794 + }, + { + "epoch": 0.40592958400098067, + "grad_norm": 4.294493675231934, + "learning_rate": 5.944514886164625e-05, + "loss": 1.0673, + "num_input_tokens_seen": 93239344, + "step": 5795 + }, + { + "epoch": 0.40599963224670993, + "grad_norm": 4.654513835906982, + "learning_rate": 5.943815061295972e-05, + "loss": 1.0239, + "num_input_tokens_seen": 93255104, + "step": 5796 + }, + { + "epoch": 0.4060696804924392, + "grad_norm": 4.339935779571533, + "learning_rate": 5.9431152364273204e-05, + "loss": 0.982, + "num_input_tokens_seen": 93270448, + "step": 5797 + }, + { + "epoch": 0.4061397287381684, + "grad_norm": 3.6498191356658936, + "learning_rate": 5.942415411558669e-05, + "loss": 0.8815, + "num_input_tokens_seen": 93286672, + "step": 5798 + }, + { + "epoch": 0.40620977698389765, + "grad_norm": 3.9488580226898193, + "learning_rate": 5.9417155866900174e-05, + "loss": 1.2938, + "num_input_tokens_seen": 93302256, + "step": 5799 + }, + { + "epoch": 0.4062798252296269, + "grad_norm": 3.9446182250976562, + "learning_rate": 5.9410157618213656e-05, + "loss": 0.8862, + "num_input_tokens_seen": 93318640, + "step": 5800 + }, + { + "epoch": 0.4062798252296269, + "eval_loss": 1.1256848573684692, + "eval_runtime": 0.2106, + "eval_samples_per_second": 4.748, + "eval_steps_per_second": 4.748, + "num_input_tokens_seen": 93318640, + "step": 5800 + }, + { + "epoch": 0.40634987347535617, + "grad_norm": 3.986703872680664, + "learning_rate": 5.940315936952715e-05, + "loss": 1.1783, + "num_input_tokens_seen": 93335024, + "step": 5801 + }, + { + "epoch": 0.4064199217210854, + "grad_norm": 3.56948184967041, + "learning_rate": 5.9396161120840647e-05, + "loss": 1.035, + "num_input_tokens_seen": 93351408, + "step": 5802 + }, + { + "epoch": 0.40648996996681463, + "grad_norm": 3.9857194423675537, + "learning_rate": 5.9389162872154115e-05, + "loss": 1.0596, + "num_input_tokens_seen": 93367728, + "step": 5803 + }, + { + "epoch": 0.4065600182125439, + "grad_norm": 5.165848731994629, + "learning_rate": 5.93821646234676e-05, + "loss": 0.9764, + "num_input_tokens_seen": 93384112, + "step": 5804 + }, + { + "epoch": 0.40663006645827315, + "grad_norm": 3.742520809173584, + "learning_rate": 5.9375166374781085e-05, + "loss": 1.0802, + "num_input_tokens_seen": 93400072, + "step": 5805 + }, + { + "epoch": 0.40670011470400236, + "grad_norm": 4.13803768157959, + "learning_rate": 5.936816812609457e-05, + "loss": 1.1845, + "num_input_tokens_seen": 93416016, + "step": 5806 + }, + { + "epoch": 0.4067701629497316, + "grad_norm": 4.530385494232178, + "learning_rate": 5.936116987740805e-05, + "loss": 1.1034, + "num_input_tokens_seen": 93432400, + "step": 5807 + }, + { + "epoch": 0.4068402111954609, + "grad_norm": 4.162608623504639, + "learning_rate": 5.935417162872156e-05, + "loss": 1.0239, + "num_input_tokens_seen": 93448360, + "step": 5808 + }, + { + "epoch": 0.40691025944119014, + "grad_norm": 3.5075366497039795, + "learning_rate": 5.934717338003504e-05, + "loss": 0.8568, + "num_input_tokens_seen": 93464744, + "step": 5809 + }, + { + "epoch": 0.40698030768691934, + "grad_norm": 4.963081359863281, + "learning_rate": 5.934017513134851e-05, + "loss": 1.028, + "num_input_tokens_seen": 93480448, + "step": 5810 + }, + { + "epoch": 0.4070503559326486, + "grad_norm": 3.83306622505188, + "learning_rate": 5.9333176882662e-05, + "loss": 1.1558, + "num_input_tokens_seen": 93496832, + "step": 5811 + }, + { + "epoch": 0.40712040417837786, + "grad_norm": 3.878345489501953, + "learning_rate": 5.932617863397549e-05, + "loss": 0.9784, + "num_input_tokens_seen": 93513216, + "step": 5812 + }, + { + "epoch": 0.4071904524241071, + "grad_norm": 4.1416192054748535, + "learning_rate": 5.931918038528897e-05, + "loss": 1.0167, + "num_input_tokens_seen": 93529504, + "step": 5813 + }, + { + "epoch": 0.4072605006698363, + "grad_norm": 5.459712028503418, + "learning_rate": 5.9312182136602455e-05, + "loss": 1.0816, + "num_input_tokens_seen": 93545888, + "step": 5814 + }, + { + "epoch": 0.4073305489155656, + "grad_norm": 5.8356852531433105, + "learning_rate": 5.930518388791595e-05, + "loss": 1.1543, + "num_input_tokens_seen": 93562272, + "step": 5815 + }, + { + "epoch": 0.40740059716129484, + "grad_norm": 6.23671817779541, + "learning_rate": 5.929818563922943e-05, + "loss": 1.1929, + "num_input_tokens_seen": 93578656, + "step": 5816 + }, + { + "epoch": 0.4074706454070241, + "grad_norm": 3.6252057552337646, + "learning_rate": 5.9291187390542914e-05, + "loss": 1.1491, + "num_input_tokens_seen": 93594816, + "step": 5817 + }, + { + "epoch": 0.40754069365275336, + "grad_norm": 4.245891571044922, + "learning_rate": 5.9284189141856396e-05, + "loss": 1.1624, + "num_input_tokens_seen": 93611200, + "step": 5818 + }, + { + "epoch": 0.40761074189848256, + "grad_norm": 4.052443504333496, + "learning_rate": 5.9277190893169885e-05, + "loss": 0.9608, + "num_input_tokens_seen": 93627080, + "step": 5819 + }, + { + "epoch": 0.4076807901442118, + "grad_norm": 4.427778720855713, + "learning_rate": 5.927019264448336e-05, + "loss": 0.9268, + "num_input_tokens_seen": 93643464, + "step": 5820 + }, + { + "epoch": 0.4077508383899411, + "grad_norm": 4.1961541175842285, + "learning_rate": 5.926319439579685e-05, + "loss": 1.0374, + "num_input_tokens_seen": 93659088, + "step": 5821 + }, + { + "epoch": 0.40782088663567034, + "grad_norm": 4.001824378967285, + "learning_rate": 5.9256196147110344e-05, + "loss": 1.1674, + "num_input_tokens_seen": 93674928, + "step": 5822 + }, + { + "epoch": 0.40789093488139955, + "grad_norm": 4.010315895080566, + "learning_rate": 5.924919789842381e-05, + "loss": 0.9092, + "num_input_tokens_seen": 93690840, + "step": 5823 + }, + { + "epoch": 0.4079609831271288, + "grad_norm": 3.771390438079834, + "learning_rate": 5.9242199649737294e-05, + "loss": 1.1374, + "num_input_tokens_seen": 93707224, + "step": 5824 + }, + { + "epoch": 0.40803103137285807, + "grad_norm": 3.920438051223755, + "learning_rate": 5.92352014010508e-05, + "loss": 0.9206, + "num_input_tokens_seen": 93722536, + "step": 5825 + }, + { + "epoch": 0.4081010796185873, + "grad_norm": 4.679770469665527, + "learning_rate": 5.922820315236428e-05, + "loss": 1.0143, + "num_input_tokens_seen": 93738736, + "step": 5826 + }, + { + "epoch": 0.40817112786431653, + "grad_norm": 4.082173824310303, + "learning_rate": 5.922120490367776e-05, + "loss": 0.9896, + "num_input_tokens_seen": 93755120, + "step": 5827 + }, + { + "epoch": 0.4082411761100458, + "grad_norm": 4.076204776763916, + "learning_rate": 5.921420665499124e-05, + "loss": 0.9632, + "num_input_tokens_seen": 93771504, + "step": 5828 + }, + { + "epoch": 0.40831122435577505, + "grad_norm": 4.721165180206299, + "learning_rate": 5.920720840630474e-05, + "loss": 1.237, + "num_input_tokens_seen": 93787888, + "step": 5829 + }, + { + "epoch": 0.4083812726015043, + "grad_norm": 4.313892841339111, + "learning_rate": 5.9200210157618205e-05, + "loss": 1.1815, + "num_input_tokens_seen": 93804272, + "step": 5830 + }, + { + "epoch": 0.4084513208472335, + "grad_norm": 4.506958961486816, + "learning_rate": 5.91932119089317e-05, + "loss": 0.8849, + "num_input_tokens_seen": 93820416, + "step": 5831 + }, + { + "epoch": 0.40852136909296277, + "grad_norm": 4.6436991691589355, + "learning_rate": 5.9186213660245196e-05, + "loss": 0.8856, + "num_input_tokens_seen": 93836648, + "step": 5832 + }, + { + "epoch": 0.40859141733869203, + "grad_norm": 3.9535446166992188, + "learning_rate": 5.917921541155868e-05, + "loss": 1.1285, + "num_input_tokens_seen": 93853032, + "step": 5833 + }, + { + "epoch": 0.4086614655844213, + "grad_norm": 6.974640846252441, + "learning_rate": 5.917221716287216e-05, + "loss": 1.0669, + "num_input_tokens_seen": 93868640, + "step": 5834 + }, + { + "epoch": 0.4087315138301505, + "grad_norm": 4.3199262619018555, + "learning_rate": 5.916521891418565e-05, + "loss": 1.1921, + "num_input_tokens_seen": 93885024, + "step": 5835 + }, + { + "epoch": 0.40880156207587975, + "grad_norm": 3.892812967300415, + "learning_rate": 5.915822066549913e-05, + "loss": 1.0978, + "num_input_tokens_seen": 93901408, + "step": 5836 + }, + { + "epoch": 0.408871610321609, + "grad_norm": 4.434093952178955, + "learning_rate": 5.915122241681261e-05, + "loss": 1.1484, + "num_input_tokens_seen": 93917632, + "step": 5837 + }, + { + "epoch": 0.4089416585673383, + "grad_norm": 3.960766553878784, + "learning_rate": 5.914422416812611e-05, + "loss": 0.994, + "num_input_tokens_seen": 93934016, + "step": 5838 + }, + { + "epoch": 0.4090117068130675, + "grad_norm": 6.450897693634033, + "learning_rate": 5.913722591943959e-05, + "loss": 1.1364, + "num_input_tokens_seen": 93950328, + "step": 5839 + }, + { + "epoch": 0.40908175505879674, + "grad_norm": 4.3362956047058105, + "learning_rate": 5.913022767075307e-05, + "loss": 1.0023, + "num_input_tokens_seen": 93966712, + "step": 5840 + }, + { + "epoch": 0.409151803304526, + "grad_norm": 4.250185966491699, + "learning_rate": 5.912322942206655e-05, + "loss": 0.9535, + "num_input_tokens_seen": 93982840, + "step": 5841 + }, + { + "epoch": 0.40922185155025526, + "grad_norm": 5.399633407592773, + "learning_rate": 5.911623117338004e-05, + "loss": 1.0504, + "num_input_tokens_seen": 93998192, + "step": 5842 + }, + { + "epoch": 0.40929189979598446, + "grad_norm": 3.926515579223633, + "learning_rate": 5.910923292469352e-05, + "loss": 1.1711, + "num_input_tokens_seen": 94014296, + "step": 5843 + }, + { + "epoch": 0.4093619480417137, + "grad_norm": 3.9847402572631836, + "learning_rate": 5.9102234676007005e-05, + "loss": 1.0845, + "num_input_tokens_seen": 94030520, + "step": 5844 + }, + { + "epoch": 0.409431996287443, + "grad_norm": 4.106837272644043, + "learning_rate": 5.9095236427320486e-05, + "loss": 1.1529, + "num_input_tokens_seen": 94046512, + "step": 5845 + }, + { + "epoch": 0.40950204453317224, + "grad_norm": 5.044075965881348, + "learning_rate": 5.908823817863398e-05, + "loss": 0.8541, + "num_input_tokens_seen": 94061536, + "step": 5846 + }, + { + "epoch": 0.40957209277890144, + "grad_norm": 7.444840908050537, + "learning_rate": 5.908123992994745e-05, + "loss": 0.9811, + "num_input_tokens_seen": 94077920, + "step": 5847 + }, + { + "epoch": 0.4096421410246307, + "grad_norm": 4.676487445831299, + "learning_rate": 5.907424168126095e-05, + "loss": 0.9323, + "num_input_tokens_seen": 94094040, + "step": 5848 + }, + { + "epoch": 0.40971218927035996, + "grad_norm": 3.945162057876587, + "learning_rate": 5.9067243432574434e-05, + "loss": 1.0101, + "num_input_tokens_seen": 94110424, + "step": 5849 + }, + { + "epoch": 0.4097822375160892, + "grad_norm": 3.9881343841552734, + "learning_rate": 5.9060245183887916e-05, + "loss": 1.2385, + "num_input_tokens_seen": 94126808, + "step": 5850 + }, + { + "epoch": 0.4098522857618185, + "grad_norm": 3.5962657928466797, + "learning_rate": 5.90532469352014e-05, + "loss": 0.988, + "num_input_tokens_seen": 94143192, + "step": 5851 + }, + { + "epoch": 0.4099223340075477, + "grad_norm": 5.755387783050537, + "learning_rate": 5.904624868651491e-05, + "loss": 1.1936, + "num_input_tokens_seen": 94158728, + "step": 5852 + }, + { + "epoch": 0.40999238225327694, + "grad_norm": 4.999855995178223, + "learning_rate": 5.9039250437828375e-05, + "loss": 1.1235, + "num_input_tokens_seen": 94174736, + "step": 5853 + }, + { + "epoch": 0.4100624304990062, + "grad_norm": 5.452749729156494, + "learning_rate": 5.903225218914186e-05, + "loss": 1.1477, + "num_input_tokens_seen": 94191120, + "step": 5854 + }, + { + "epoch": 0.41013247874473546, + "grad_norm": 4.211399078369141, + "learning_rate": 5.902525394045535e-05, + "loss": 0.9808, + "num_input_tokens_seen": 94207504, + "step": 5855 + }, + { + "epoch": 0.41020252699046467, + "grad_norm": 3.3774921894073486, + "learning_rate": 5.9018255691768834e-05, + "loss": 1.0297, + "num_input_tokens_seen": 94223888, + "step": 5856 + }, + { + "epoch": 0.4102725752361939, + "grad_norm": 6.850539207458496, + "learning_rate": 5.9011257443082316e-05, + "loss": 1.0313, + "num_input_tokens_seen": 94240120, + "step": 5857 + }, + { + "epoch": 0.4103426234819232, + "grad_norm": 3.2469308376312256, + "learning_rate": 5.9004259194395804e-05, + "loss": 0.9631, + "num_input_tokens_seen": 94256488, + "step": 5858 + }, + { + "epoch": 0.41041267172765244, + "grad_norm": 5.0675201416015625, + "learning_rate": 5.89972609457093e-05, + "loss": 0.8961, + "num_input_tokens_seen": 94272048, + "step": 5859 + }, + { + "epoch": 0.41048271997338165, + "grad_norm": 3.8971400260925293, + "learning_rate": 5.899026269702277e-05, + "loss": 1.2323, + "num_input_tokens_seen": 94287984, + "step": 5860 + }, + { + "epoch": 0.4105527682191109, + "grad_norm": 3.8453164100646973, + "learning_rate": 5.898326444833625e-05, + "loss": 1.1039, + "num_input_tokens_seen": 94303976, + "step": 5861 + }, + { + "epoch": 0.41062281646484017, + "grad_norm": 3.7564680576324463, + "learning_rate": 5.897626619964975e-05, + "loss": 1.0977, + "num_input_tokens_seen": 94319552, + "step": 5862 + }, + { + "epoch": 0.4106928647105694, + "grad_norm": 4.541357517242432, + "learning_rate": 5.896926795096323e-05, + "loss": 1.2374, + "num_input_tokens_seen": 94335936, + "step": 5863 + }, + { + "epoch": 0.41076291295629863, + "grad_norm": 4.841330051422119, + "learning_rate": 5.896226970227671e-05, + "loss": 0.8555, + "num_input_tokens_seen": 94351784, + "step": 5864 + }, + { + "epoch": 0.4108329612020279, + "grad_norm": 3.9247653484344482, + "learning_rate": 5.89552714535902e-05, + "loss": 1.0169, + "num_input_tokens_seen": 94368080, + "step": 5865 + }, + { + "epoch": 0.41090300944775715, + "grad_norm": 4.763803958892822, + "learning_rate": 5.894827320490368e-05, + "loss": 0.9812, + "num_input_tokens_seen": 94384056, + "step": 5866 + }, + { + "epoch": 0.4109730576934864, + "grad_norm": 5.141749858856201, + "learning_rate": 5.894127495621716e-05, + "loss": 1.0001, + "num_input_tokens_seen": 94400440, + "step": 5867 + }, + { + "epoch": 0.4110431059392156, + "grad_norm": 3.360656261444092, + "learning_rate": 5.8934276707530656e-05, + "loss": 0.988, + "num_input_tokens_seen": 94416824, + "step": 5868 + }, + { + "epoch": 0.4111131541849449, + "grad_norm": 4.226006507873535, + "learning_rate": 5.892727845884415e-05, + "loss": 1.0762, + "num_input_tokens_seen": 94433208, + "step": 5869 + }, + { + "epoch": 0.41118320243067413, + "grad_norm": 6.924234390258789, + "learning_rate": 5.892028021015762e-05, + "loss": 1.1498, + "num_input_tokens_seen": 94448848, + "step": 5870 + }, + { + "epoch": 0.4112532506764034, + "grad_norm": 3.643950939178467, + "learning_rate": 5.89132819614711e-05, + "loss": 0.9977, + "num_input_tokens_seen": 94464720, + "step": 5871 + }, + { + "epoch": 0.4113232989221326, + "grad_norm": 3.3077268600463867, + "learning_rate": 5.890628371278459e-05, + "loss": 0.9421, + "num_input_tokens_seen": 94481104, + "step": 5872 + }, + { + "epoch": 0.41139334716786186, + "grad_norm": 3.9556264877319336, + "learning_rate": 5.889928546409807e-05, + "loss": 0.9313, + "num_input_tokens_seen": 94496944, + "step": 5873 + }, + { + "epoch": 0.4114633954135911, + "grad_norm": 7.0952606201171875, + "learning_rate": 5.8892287215411554e-05, + "loss": 1.3029, + "num_input_tokens_seen": 94511336, + "step": 5874 + }, + { + "epoch": 0.4115334436593204, + "grad_norm": 3.946803569793701, + "learning_rate": 5.888528896672505e-05, + "loss": 0.9865, + "num_input_tokens_seen": 94526904, + "step": 5875 + }, + { + "epoch": 0.4116034919050496, + "grad_norm": 4.165796756744385, + "learning_rate": 5.8878290718038545e-05, + "loss": 1.2249, + "num_input_tokens_seen": 94543024, + "step": 5876 + }, + { + "epoch": 0.41167354015077884, + "grad_norm": 3.9492764472961426, + "learning_rate": 5.887129246935201e-05, + "loss": 1.0304, + "num_input_tokens_seen": 94558744, + "step": 5877 + }, + { + "epoch": 0.4117435883965081, + "grad_norm": 4.144934177398682, + "learning_rate": 5.8864294220665495e-05, + "loss": 1.0468, + "num_input_tokens_seen": 94574288, + "step": 5878 + }, + { + "epoch": 0.41181363664223736, + "grad_norm": 3.7620224952697754, + "learning_rate": 5.8857295971979e-05, + "loss": 0.929, + "num_input_tokens_seen": 94590672, + "step": 5879 + }, + { + "epoch": 0.41188368488796656, + "grad_norm": 4.788266181945801, + "learning_rate": 5.885029772329247e-05, + "loss": 1.0106, + "num_input_tokens_seen": 94606792, + "step": 5880 + }, + { + "epoch": 0.4119537331336958, + "grad_norm": 4.383455276489258, + "learning_rate": 5.8843299474605954e-05, + "loss": 0.9611, + "num_input_tokens_seen": 94623176, + "step": 5881 + }, + { + "epoch": 0.4120237813794251, + "grad_norm": 7.097373962402344, + "learning_rate": 5.883630122591944e-05, + "loss": 1.1559, + "num_input_tokens_seen": 94638952, + "step": 5882 + }, + { + "epoch": 0.41209382962515434, + "grad_norm": 5.4228901863098145, + "learning_rate": 5.882930297723294e-05, + "loss": 1.2569, + "num_input_tokens_seen": 94655224, + "step": 5883 + }, + { + "epoch": 0.41216387787088354, + "grad_norm": 3.792999267578125, + "learning_rate": 5.882230472854642e-05, + "loss": 0.8853, + "num_input_tokens_seen": 94671608, + "step": 5884 + }, + { + "epoch": 0.4122339261166128, + "grad_norm": 5.401544094085693, + "learning_rate": 5.88153064798599e-05, + "loss": 1.0763, + "num_input_tokens_seen": 94687048, + "step": 5885 + }, + { + "epoch": 0.41230397436234206, + "grad_norm": 3.2229812145233154, + "learning_rate": 5.880830823117339e-05, + "loss": 0.9375, + "num_input_tokens_seen": 94703432, + "step": 5886 + }, + { + "epoch": 0.4123740226080713, + "grad_norm": 4.5977277755737305, + "learning_rate": 5.8801309982486865e-05, + "loss": 1.1245, + "num_input_tokens_seen": 94719152, + "step": 5887 + }, + { + "epoch": 0.4124440708538006, + "grad_norm": 3.436765670776367, + "learning_rate": 5.879431173380035e-05, + "loss": 0.8353, + "num_input_tokens_seen": 94735536, + "step": 5888 + }, + { + "epoch": 0.4125141190995298, + "grad_norm": 3.4720351696014404, + "learning_rate": 5.878731348511385e-05, + "loss": 1.0329, + "num_input_tokens_seen": 94751840, + "step": 5889 + }, + { + "epoch": 0.41258416734525905, + "grad_norm": 3.621783971786499, + "learning_rate": 5.878031523642732e-05, + "loss": 1.0355, + "num_input_tokens_seen": 94768224, + "step": 5890 + }, + { + "epoch": 0.4126542155909883, + "grad_norm": 4.453585624694824, + "learning_rate": 5.87733169877408e-05, + "loss": 1.1171, + "num_input_tokens_seen": 94783672, + "step": 5891 + }, + { + "epoch": 0.41272426383671756, + "grad_norm": 4.191892147064209, + "learning_rate": 5.8766318739054294e-05, + "loss": 0.9505, + "num_input_tokens_seen": 94800056, + "step": 5892 + }, + { + "epoch": 0.41279431208244677, + "grad_norm": 3.5963308811187744, + "learning_rate": 5.875932049036778e-05, + "loss": 1.0223, + "num_input_tokens_seen": 94816440, + "step": 5893 + }, + { + "epoch": 0.412864360328176, + "grad_norm": 3.904618978500366, + "learning_rate": 5.8752322241681265e-05, + "loss": 1.0205, + "num_input_tokens_seen": 94832824, + "step": 5894 + }, + { + "epoch": 0.4129344085739053, + "grad_norm": 3.650961399078369, + "learning_rate": 5.8745323992994747e-05, + "loss": 0.9613, + "num_input_tokens_seen": 94849208, + "step": 5895 + }, + { + "epoch": 0.41300445681963455, + "grad_norm": 4.438238143920898, + "learning_rate": 5.873832574430824e-05, + "loss": 0.9865, + "num_input_tokens_seen": 94865592, + "step": 5896 + }, + { + "epoch": 0.41307450506536375, + "grad_norm": 3.6906070709228516, + "learning_rate": 5.873132749562171e-05, + "loss": 1.1228, + "num_input_tokens_seen": 94881976, + "step": 5897 + }, + { + "epoch": 0.413144553311093, + "grad_norm": 5.111722469329834, + "learning_rate": 5.872432924693519e-05, + "loss": 1.2678, + "num_input_tokens_seen": 94898360, + "step": 5898 + }, + { + "epoch": 0.41321460155682227, + "grad_norm": 3.7190804481506348, + "learning_rate": 5.871733099824869e-05, + "loss": 1.152, + "num_input_tokens_seen": 94914512, + "step": 5899 + }, + { + "epoch": 0.41328464980255153, + "grad_norm": 3.779918670654297, + "learning_rate": 5.871033274956218e-05, + "loss": 0.905, + "num_input_tokens_seen": 94930816, + "step": 5900 + }, + { + "epoch": 0.41335469804828073, + "grad_norm": 3.5921852588653564, + "learning_rate": 5.8703334500875665e-05, + "loss": 0.9119, + "num_input_tokens_seen": 94947200, + "step": 5901 + }, + { + "epoch": 0.41342474629401, + "grad_norm": 4.024271011352539, + "learning_rate": 5.8696336252189146e-05, + "loss": 1.048, + "num_input_tokens_seen": 94963584, + "step": 5902 + }, + { + "epoch": 0.41349479453973925, + "grad_norm": 4.798417091369629, + "learning_rate": 5.8689338003502635e-05, + "loss": 1.1399, + "num_input_tokens_seen": 94979968, + "step": 5903 + }, + { + "epoch": 0.4135648427854685, + "grad_norm": 3.5821495056152344, + "learning_rate": 5.868233975481612e-05, + "loss": 1.015, + "num_input_tokens_seen": 94996320, + "step": 5904 + }, + { + "epoch": 0.4136348910311977, + "grad_norm": 3.998082399368286, + "learning_rate": 5.867534150612959e-05, + "loss": 0.9569, + "num_input_tokens_seen": 95012704, + "step": 5905 + }, + { + "epoch": 0.413704939276927, + "grad_norm": 3.6389498710632324, + "learning_rate": 5.8668343257443094e-05, + "loss": 1.1562, + "num_input_tokens_seen": 95029088, + "step": 5906 + }, + { + "epoch": 0.41377498752265623, + "grad_norm": 4.137228012084961, + "learning_rate": 5.8661345008756576e-05, + "loss": 1.1683, + "num_input_tokens_seen": 95045472, + "step": 5907 + }, + { + "epoch": 0.4138450357683855, + "grad_norm": 4.181145668029785, + "learning_rate": 5.865434676007006e-05, + "loss": 0.993, + "num_input_tokens_seen": 95061856, + "step": 5908 + }, + { + "epoch": 0.4139150840141147, + "grad_norm": 3.759474754333496, + "learning_rate": 5.864734851138354e-05, + "loss": 1.0144, + "num_input_tokens_seen": 95077904, + "step": 5909 + }, + { + "epoch": 0.41398513225984396, + "grad_norm": 4.474549293518066, + "learning_rate": 5.864035026269703e-05, + "loss": 1.1008, + "num_input_tokens_seen": 95094288, + "step": 5910 + }, + { + "epoch": 0.4140551805055732, + "grad_norm": 7.399059295654297, + "learning_rate": 5.863335201401051e-05, + "loss": 1.2234, + "num_input_tokens_seen": 95107872, + "step": 5911 + }, + { + "epoch": 0.4141252287513025, + "grad_norm": 4.018132209777832, + "learning_rate": 5.862635376532399e-05, + "loss": 0.9736, + "num_input_tokens_seen": 95124256, + "step": 5912 + }, + { + "epoch": 0.4141952769970317, + "grad_norm": 3.825305223464966, + "learning_rate": 5.861935551663749e-05, + "loss": 0.9833, + "num_input_tokens_seen": 95139784, + "step": 5913 + }, + { + "epoch": 0.41426532524276094, + "grad_norm": 3.7942214012145996, + "learning_rate": 5.861235726795097e-05, + "loss": 1.0999, + "num_input_tokens_seen": 95155136, + "step": 5914 + }, + { + "epoch": 0.4143353734884902, + "grad_norm": 3.9006733894348145, + "learning_rate": 5.860535901926444e-05, + "loss": 1.0608, + "num_input_tokens_seen": 95171520, + "step": 5915 + }, + { + "epoch": 0.41440542173421946, + "grad_norm": 3.688754081726074, + "learning_rate": 5.859836077057794e-05, + "loss": 1.1338, + "num_input_tokens_seen": 95187752, + "step": 5916 + }, + { + "epoch": 0.41447546997994866, + "grad_norm": 3.954989433288574, + "learning_rate": 5.859136252189142e-05, + "loss": 0.8965, + "num_input_tokens_seen": 95203864, + "step": 5917 + }, + { + "epoch": 0.4145455182256779, + "grad_norm": 3.733185052871704, + "learning_rate": 5.85843642732049e-05, + "loss": 0.9872, + "num_input_tokens_seen": 95220248, + "step": 5918 + }, + { + "epoch": 0.4146155664714072, + "grad_norm": 4.39019775390625, + "learning_rate": 5.8577366024518385e-05, + "loss": 1.0452, + "num_input_tokens_seen": 95236632, + "step": 5919 + }, + { + "epoch": 0.41468561471713644, + "grad_norm": 3.716066837310791, + "learning_rate": 5.857036777583188e-05, + "loss": 0.9231, + "num_input_tokens_seen": 95251888, + "step": 5920 + }, + { + "epoch": 0.4147556629628657, + "grad_norm": 3.7525405883789062, + "learning_rate": 5.856336952714536e-05, + "loss": 1.1147, + "num_input_tokens_seen": 95268272, + "step": 5921 + }, + { + "epoch": 0.4148257112085949, + "grad_norm": 3.605818033218384, + "learning_rate": 5.855637127845886e-05, + "loss": 0.9874, + "num_input_tokens_seen": 95283880, + "step": 5922 + }, + { + "epoch": 0.41489575945432416, + "grad_norm": 3.878814458847046, + "learning_rate": 5.854937302977234e-05, + "loss": 1.1556, + "num_input_tokens_seen": 95300264, + "step": 5923 + }, + { + "epoch": 0.4149658077000534, + "grad_norm": 4.94001579284668, + "learning_rate": 5.854237478108582e-05, + "loss": 1.0014, + "num_input_tokens_seen": 95316648, + "step": 5924 + }, + { + "epoch": 0.4150358559457827, + "grad_norm": 4.213568210601807, + "learning_rate": 5.85353765323993e-05, + "loss": 1.2315, + "num_input_tokens_seen": 95332880, + "step": 5925 + }, + { + "epoch": 0.4151059041915119, + "grad_norm": 4.2475996017456055, + "learning_rate": 5.8528378283712784e-05, + "loss": 1.0717, + "num_input_tokens_seen": 95349176, + "step": 5926 + }, + { + "epoch": 0.41517595243724115, + "grad_norm": 4.237911224365234, + "learning_rate": 5.852138003502627e-05, + "loss": 0.93, + "num_input_tokens_seen": 95365560, + "step": 5927 + }, + { + "epoch": 0.4152460006829704, + "grad_norm": 3.695140838623047, + "learning_rate": 5.8514381786339755e-05, + "loss": 0.9561, + "num_input_tokens_seen": 95381280, + "step": 5928 + }, + { + "epoch": 0.41531604892869967, + "grad_norm": 4.457770347595215, + "learning_rate": 5.850738353765325e-05, + "loss": 1.0541, + "num_input_tokens_seen": 95397664, + "step": 5929 + }, + { + "epoch": 0.41538609717442887, + "grad_norm": 4.237982273101807, + "learning_rate": 5.850038528896673e-05, + "loss": 1.0123, + "num_input_tokens_seen": 95414048, + "step": 5930 + }, + { + "epoch": 0.41545614542015813, + "grad_norm": 3.5690579414367676, + "learning_rate": 5.8493387040280214e-05, + "loss": 0.9788, + "num_input_tokens_seen": 95430432, + "step": 5931 + }, + { + "epoch": 0.4155261936658874, + "grad_norm": 4.665618419647217, + "learning_rate": 5.8486388791593696e-05, + "loss": 1.0138, + "num_input_tokens_seen": 95446816, + "step": 5932 + }, + { + "epoch": 0.41559624191161665, + "grad_norm": 3.76755952835083, + "learning_rate": 5.8479390542907184e-05, + "loss": 0.9853, + "num_input_tokens_seen": 95463200, + "step": 5933 + }, + { + "epoch": 0.41566629015734585, + "grad_norm": 4.855432033538818, + "learning_rate": 5.8472392294220666e-05, + "loss": 1.1747, + "num_input_tokens_seen": 95478704, + "step": 5934 + }, + { + "epoch": 0.4157363384030751, + "grad_norm": 4.860055446624756, + "learning_rate": 5.846539404553415e-05, + "loss": 0.9747, + "num_input_tokens_seen": 95495088, + "step": 5935 + }, + { + "epoch": 0.41580638664880437, + "grad_norm": 4.268356800079346, + "learning_rate": 5.845839579684763e-05, + "loss": 1.1564, + "num_input_tokens_seen": 95511472, + "step": 5936 + }, + { + "epoch": 0.41587643489453363, + "grad_norm": 7.423181533813477, + "learning_rate": 5.8451397548161125e-05, + "loss": 1.0369, + "num_input_tokens_seen": 95527008, + "step": 5937 + }, + { + "epoch": 0.41594648314026283, + "grad_norm": 5.740126609802246, + "learning_rate": 5.844439929947461e-05, + "loss": 1.1188, + "num_input_tokens_seen": 95542536, + "step": 5938 + }, + { + "epoch": 0.4160165313859921, + "grad_norm": 5.135944366455078, + "learning_rate": 5.8437401050788096e-05, + "loss": 1.1815, + "num_input_tokens_seen": 95558688, + "step": 5939 + }, + { + "epoch": 0.41608657963172135, + "grad_norm": 3.879530191421509, + "learning_rate": 5.843040280210158e-05, + "loss": 1.162, + "num_input_tokens_seen": 95575064, + "step": 5940 + }, + { + "epoch": 0.4161566278774506, + "grad_norm": 4.096410274505615, + "learning_rate": 5.842340455341506e-05, + "loss": 1.0883, + "num_input_tokens_seen": 95591272, + "step": 5941 + }, + { + "epoch": 0.4162266761231798, + "grad_norm": 4.095829486846924, + "learning_rate": 5.841640630472854e-05, + "loss": 1.0516, + "num_input_tokens_seen": 95607656, + "step": 5942 + }, + { + "epoch": 0.4162967243689091, + "grad_norm": 4.076023101806641, + "learning_rate": 5.840940805604205e-05, + "loss": 1.0361, + "num_input_tokens_seen": 95623384, + "step": 5943 + }, + { + "epoch": 0.41636677261463834, + "grad_norm": 4.08365535736084, + "learning_rate": 5.840240980735553e-05, + "loss": 1.1102, + "num_input_tokens_seen": 95639768, + "step": 5944 + }, + { + "epoch": 0.4164368208603676, + "grad_norm": 4.182791233062744, + "learning_rate": 5.8395411558669e-05, + "loss": 1.0212, + "num_input_tokens_seen": 95655928, + "step": 5945 + }, + { + "epoch": 0.4165068691060968, + "grad_norm": 4.3107428550720215, + "learning_rate": 5.8388413309982495e-05, + "loss": 0.8607, + "num_input_tokens_seen": 95672312, + "step": 5946 + }, + { + "epoch": 0.41657691735182606, + "grad_norm": 3.7357101440429688, + "learning_rate": 5.838141506129598e-05, + "loss": 1.0563, + "num_input_tokens_seen": 95688696, + "step": 5947 + }, + { + "epoch": 0.4166469655975553, + "grad_norm": 3.9959046840667725, + "learning_rate": 5.837441681260946e-05, + "loss": 1.1043, + "num_input_tokens_seen": 95704304, + "step": 5948 + }, + { + "epoch": 0.4167170138432846, + "grad_norm": 4.395400524139404, + "learning_rate": 5.836741856392295e-05, + "loss": 1.1087, + "num_input_tokens_seen": 95720688, + "step": 5949 + }, + { + "epoch": 0.4167870620890138, + "grad_norm": 4.998651027679443, + "learning_rate": 5.836042031523644e-05, + "loss": 1.1903, + "num_input_tokens_seen": 95737072, + "step": 5950 + }, + { + "epoch": 0.41685711033474304, + "grad_norm": 4.060539245605469, + "learning_rate": 5.8353422066549925e-05, + "loss": 1.033, + "num_input_tokens_seen": 95753456, + "step": 5951 + }, + { + "epoch": 0.4169271585804723, + "grad_norm": 5.286706924438477, + "learning_rate": 5.834642381786339e-05, + "loss": 1.0968, + "num_input_tokens_seen": 95769840, + "step": 5952 + }, + { + "epoch": 0.41699720682620156, + "grad_norm": 4.501932144165039, + "learning_rate": 5.833942556917689e-05, + "loss": 1.2626, + "num_input_tokens_seen": 95786224, + "step": 5953 + }, + { + "epoch": 0.41706725507193076, + "grad_norm": 5.144174575805664, + "learning_rate": 5.833242732049038e-05, + "loss": 0.8148, + "num_input_tokens_seen": 95802608, + "step": 5954 + }, + { + "epoch": 0.41713730331766, + "grad_norm": 3.6604678630828857, + "learning_rate": 5.832542907180385e-05, + "loss": 0.9467, + "num_input_tokens_seen": 95818992, + "step": 5955 + }, + { + "epoch": 0.4172073515633893, + "grad_norm": 5.387998104095459, + "learning_rate": 5.831843082311734e-05, + "loss": 1.181, + "num_input_tokens_seen": 95835376, + "step": 5956 + }, + { + "epoch": 0.41727739980911854, + "grad_norm": 4.576782703399658, + "learning_rate": 5.831143257443082e-05, + "loss": 1.0577, + "num_input_tokens_seen": 95851760, + "step": 5957 + }, + { + "epoch": 0.4173474480548478, + "grad_norm": 5.737542629241943, + "learning_rate": 5.8304434325744304e-05, + "loss": 1.039, + "num_input_tokens_seen": 95868144, + "step": 5958 + }, + { + "epoch": 0.417417496300577, + "grad_norm": 3.515028238296509, + "learning_rate": 5.8297436077057786e-05, + "loss": 0.9503, + "num_input_tokens_seen": 95884528, + "step": 5959 + }, + { + "epoch": 0.41748754454630627, + "grad_norm": 3.9339003562927246, + "learning_rate": 5.8290437828371295e-05, + "loss": 0.928, + "num_input_tokens_seen": 95900688, + "step": 5960 + }, + { + "epoch": 0.4175575927920355, + "grad_norm": 3.896474838256836, + "learning_rate": 5.828343957968477e-05, + "loss": 1.1249, + "num_input_tokens_seen": 95917072, + "step": 5961 + }, + { + "epoch": 0.4176276410377648, + "grad_norm": 5.101248264312744, + "learning_rate": 5.8276441330998245e-05, + "loss": 1.0097, + "num_input_tokens_seen": 95931976, + "step": 5962 + }, + { + "epoch": 0.417697689283494, + "grad_norm": 7.063873291015625, + "learning_rate": 5.8269443082311734e-05, + "loss": 1.1686, + "num_input_tokens_seen": 95948360, + "step": 5963 + }, + { + "epoch": 0.41776773752922325, + "grad_norm": 3.564887762069702, + "learning_rate": 5.826244483362523e-05, + "loss": 1.1375, + "num_input_tokens_seen": 95964744, + "step": 5964 + }, + { + "epoch": 0.4178377857749525, + "grad_norm": 3.821101427078247, + "learning_rate": 5.82554465849387e-05, + "loss": 1.252, + "num_input_tokens_seen": 95980760, + "step": 5965 + }, + { + "epoch": 0.41790783402068177, + "grad_norm": 3.609252691268921, + "learning_rate": 5.824844833625219e-05, + "loss": 1.0633, + "num_input_tokens_seen": 95997144, + "step": 5966 + }, + { + "epoch": 0.41797788226641097, + "grad_norm": 4.1750874519348145, + "learning_rate": 5.824145008756569e-05, + "loss": 1.119, + "num_input_tokens_seen": 96012872, + "step": 5967 + }, + { + "epoch": 0.41804793051214023, + "grad_norm": 3.776747465133667, + "learning_rate": 5.823445183887917e-05, + "loss": 0.9792, + "num_input_tokens_seen": 96029168, + "step": 5968 + }, + { + "epoch": 0.4181179787578695, + "grad_norm": 8.143741607666016, + "learning_rate": 5.822745359019264e-05, + "loss": 1.0686, + "num_input_tokens_seen": 96045376, + "step": 5969 + }, + { + "epoch": 0.41818802700359875, + "grad_norm": 4.336330890655518, + "learning_rate": 5.822045534150614e-05, + "loss": 1.0597, + "num_input_tokens_seen": 96060904, + "step": 5970 + }, + { + "epoch": 0.41825807524932795, + "grad_norm": 3.731605052947998, + "learning_rate": 5.821345709281962e-05, + "loss": 1.1302, + "num_input_tokens_seen": 96076824, + "step": 5971 + }, + { + "epoch": 0.4183281234950572, + "grad_norm": 3.8380699157714844, + "learning_rate": 5.82064588441331e-05, + "loss": 1.1711, + "num_input_tokens_seen": 96092616, + "step": 5972 + }, + { + "epoch": 0.4183981717407865, + "grad_norm": 3.9088358879089355, + "learning_rate": 5.8199460595446586e-05, + "loss": 1.1622, + "num_input_tokens_seen": 96109000, + "step": 5973 + }, + { + "epoch": 0.41846821998651573, + "grad_norm": 4.0047783851623535, + "learning_rate": 5.819246234676008e-05, + "loss": 1.0949, + "num_input_tokens_seen": 96125344, + "step": 5974 + }, + { + "epoch": 0.41853826823224494, + "grad_norm": 3.936495542526245, + "learning_rate": 5.818546409807356e-05, + "loss": 1.1566, + "num_input_tokens_seen": 96141536, + "step": 5975 + }, + { + "epoch": 0.4186083164779742, + "grad_norm": 3.8510451316833496, + "learning_rate": 5.8178465849387045e-05, + "loss": 1.0454, + "num_input_tokens_seen": 96157920, + "step": 5976 + }, + { + "epoch": 0.41867836472370346, + "grad_norm": 3.5825259685516357, + "learning_rate": 5.817146760070053e-05, + "loss": 1.0491, + "num_input_tokens_seen": 96173848, + "step": 5977 + }, + { + "epoch": 0.4187484129694327, + "grad_norm": 4.0553717613220215, + "learning_rate": 5.8164469352014015e-05, + "loss": 1.0669, + "num_input_tokens_seen": 96190232, + "step": 5978 + }, + { + "epoch": 0.4188184612151619, + "grad_norm": 4.085362434387207, + "learning_rate": 5.815747110332749e-05, + "loss": 1.2832, + "num_input_tokens_seen": 96205896, + "step": 5979 + }, + { + "epoch": 0.4188885094608912, + "grad_norm": 6.552733421325684, + "learning_rate": 5.815047285464098e-05, + "loss": 1.1183, + "num_input_tokens_seen": 96221784, + "step": 5980 + }, + { + "epoch": 0.41895855770662044, + "grad_norm": 4.052005290985107, + "learning_rate": 5.8143474605954474e-05, + "loss": 1.0309, + "num_input_tokens_seen": 96237472, + "step": 5981 + }, + { + "epoch": 0.4190286059523497, + "grad_norm": 3.9679994583129883, + "learning_rate": 5.813647635726794e-05, + "loss": 1.0298, + "num_input_tokens_seen": 96253616, + "step": 5982 + }, + { + "epoch": 0.4190986541980789, + "grad_norm": 4.879584312438965, + "learning_rate": 5.812947810858144e-05, + "loss": 0.9334, + "num_input_tokens_seen": 96270000, + "step": 5983 + }, + { + "epoch": 0.41916870244380816, + "grad_norm": 4.894060134887695, + "learning_rate": 5.8122479859894926e-05, + "loss": 0.9829, + "num_input_tokens_seen": 96286384, + "step": 5984 + }, + { + "epoch": 0.4192387506895374, + "grad_norm": 3.9925336837768555, + "learning_rate": 5.811548161120841e-05, + "loss": 1.0285, + "num_input_tokens_seen": 96302200, + "step": 5985 + }, + { + "epoch": 0.4193087989352667, + "grad_norm": 4.043905258178711, + "learning_rate": 5.810848336252189e-05, + "loss": 1.0217, + "num_input_tokens_seen": 96318584, + "step": 5986 + }, + { + "epoch": 0.4193788471809959, + "grad_norm": 4.216322422027588, + "learning_rate": 5.8101485113835385e-05, + "loss": 0.9483, + "num_input_tokens_seen": 96334720, + "step": 5987 + }, + { + "epoch": 0.41944889542672514, + "grad_norm": 3.772749900817871, + "learning_rate": 5.809448686514887e-05, + "loss": 1.272, + "num_input_tokens_seen": 96351104, + "step": 5988 + }, + { + "epoch": 0.4195189436724544, + "grad_norm": 3.6716036796569824, + "learning_rate": 5.8087488616462335e-05, + "loss": 1.1796, + "num_input_tokens_seen": 96367488, + "step": 5989 + }, + { + "epoch": 0.41958899191818366, + "grad_norm": 3.9748408794403076, + "learning_rate": 5.8080490367775844e-05, + "loss": 1.0994, + "num_input_tokens_seen": 96383872, + "step": 5990 + }, + { + "epoch": 0.4196590401639129, + "grad_norm": 5.4619269371032715, + "learning_rate": 5.8073492119089326e-05, + "loss": 1.0553, + "num_input_tokens_seen": 96400256, + "step": 5991 + }, + { + "epoch": 0.4197290884096421, + "grad_norm": 3.4772391319274902, + "learning_rate": 5.806649387040281e-05, + "loss": 1.0543, + "num_input_tokens_seen": 96416640, + "step": 5992 + }, + { + "epoch": 0.4197991366553714, + "grad_norm": 4.003359794616699, + "learning_rate": 5.805949562171629e-05, + "loss": 0.9306, + "num_input_tokens_seen": 96432872, + "step": 5993 + }, + { + "epoch": 0.41986918490110064, + "grad_norm": 3.433760166168213, + "learning_rate": 5.805249737302979e-05, + "loss": 1.0197, + "num_input_tokens_seen": 96449256, + "step": 5994 + }, + { + "epoch": 0.4199392331468299, + "grad_norm": 4.519425868988037, + "learning_rate": 5.804549912434326e-05, + "loss": 1.0303, + "num_input_tokens_seen": 96465456, + "step": 5995 + }, + { + "epoch": 0.4200092813925591, + "grad_norm": 3.8798038959503174, + "learning_rate": 5.803850087565674e-05, + "loss": 1.1426, + "num_input_tokens_seen": 96481840, + "step": 5996 + }, + { + "epoch": 0.42007932963828837, + "grad_norm": 7.4741058349609375, + "learning_rate": 5.803150262697024e-05, + "loss": 1.0759, + "num_input_tokens_seen": 96497160, + "step": 5997 + }, + { + "epoch": 0.4201493778840176, + "grad_norm": 3.6269989013671875, + "learning_rate": 5.802450437828372e-05, + "loss": 1.0676, + "num_input_tokens_seen": 96512672, + "step": 5998 + }, + { + "epoch": 0.4202194261297469, + "grad_norm": 3.6369056701660156, + "learning_rate": 5.80175061295972e-05, + "loss": 0.9351, + "num_input_tokens_seen": 96529056, + "step": 5999 + }, + { + "epoch": 0.4202894743754761, + "grad_norm": 6.0609564781188965, + "learning_rate": 5.801050788091069e-05, + "loss": 1.3902, + "num_input_tokens_seen": 96545320, + "step": 6000 + }, + { + "epoch": 0.4202894743754761, + "eval_loss": 1.1260257959365845, + "eval_runtime": 0.1972, + "eval_samples_per_second": 5.072, + "eval_steps_per_second": 5.072, + "num_input_tokens_seen": 96545320, + "step": 6000 + }, + { + "epoch": 0.42035952262120535, + "grad_norm": 3.939091682434082, + "learning_rate": 5.800350963222417e-05, + "loss": 1.2935, + "num_input_tokens_seen": 96561704, + "step": 6001 + }, + { + "epoch": 0.4204295708669346, + "grad_norm": 4.907895565032959, + "learning_rate": 5.799651138353765e-05, + "loss": 0.9923, + "num_input_tokens_seen": 96578088, + "step": 6002 + }, + { + "epoch": 0.42049961911266387, + "grad_norm": 4.598423480987549, + "learning_rate": 5.7989513134851135e-05, + "loss": 0.9852, + "num_input_tokens_seen": 96594392, + "step": 6003 + }, + { + "epoch": 0.4205696673583931, + "grad_norm": 4.8221540451049805, + "learning_rate": 5.7982514886164644e-05, + "loss": 1.2558, + "num_input_tokens_seen": 96609688, + "step": 6004 + }, + { + "epoch": 0.42063971560412233, + "grad_norm": 6.331230163574219, + "learning_rate": 5.797551663747811e-05, + "loss": 0.943, + "num_input_tokens_seen": 96625480, + "step": 6005 + }, + { + "epoch": 0.4207097638498516, + "grad_norm": 4.262217044830322, + "learning_rate": 5.7968518388791594e-05, + "loss": 1.1607, + "num_input_tokens_seen": 96641040, + "step": 6006 + }, + { + "epoch": 0.42077981209558085, + "grad_norm": 4.552499294281006, + "learning_rate": 5.796152014010508e-05, + "loss": 0.9532, + "num_input_tokens_seen": 96657424, + "step": 6007 + }, + { + "epoch": 0.42084986034131006, + "grad_norm": 3.414970874786377, + "learning_rate": 5.7954521891418564e-05, + "loss": 0.8991, + "num_input_tokens_seen": 96673792, + "step": 6008 + }, + { + "epoch": 0.4209199085870393, + "grad_norm": 3.70623517036438, + "learning_rate": 5.7947523642732046e-05, + "loss": 1.167, + "num_input_tokens_seen": 96690176, + "step": 6009 + }, + { + "epoch": 0.4209899568327686, + "grad_norm": 4.370288848876953, + "learning_rate": 5.794052539404554e-05, + "loss": 1.0194, + "num_input_tokens_seen": 96706272, + "step": 6010 + }, + { + "epoch": 0.42106000507849783, + "grad_norm": 3.4775140285491943, + "learning_rate": 5.793352714535904e-05, + "loss": 0.9383, + "num_input_tokens_seen": 96722312, + "step": 6011 + }, + { + "epoch": 0.42113005332422704, + "grad_norm": 3.9860763549804688, + "learning_rate": 5.7926528896672505e-05, + "loss": 1.0999, + "num_input_tokens_seen": 96737040, + "step": 6012 + }, + { + "epoch": 0.4212001015699563, + "grad_norm": 5.0287933349609375, + "learning_rate": 5.791953064798599e-05, + "loss": 0.9892, + "num_input_tokens_seen": 96753424, + "step": 6013 + }, + { + "epoch": 0.42127014981568556, + "grad_norm": 3.821143627166748, + "learning_rate": 5.791253239929949e-05, + "loss": 1.0388, + "num_input_tokens_seen": 96769552, + "step": 6014 + }, + { + "epoch": 0.4213401980614148, + "grad_norm": 4.180905818939209, + "learning_rate": 5.7905534150612964e-05, + "loss": 1.1118, + "num_input_tokens_seen": 96785936, + "step": 6015 + }, + { + "epoch": 0.421410246307144, + "grad_norm": 4.334224224090576, + "learning_rate": 5.7898535901926446e-05, + "loss": 1.3425, + "num_input_tokens_seen": 96802320, + "step": 6016 + }, + { + "epoch": 0.4214802945528733, + "grad_norm": 4.317337989807129, + "learning_rate": 5.7891537653239934e-05, + "loss": 1.0218, + "num_input_tokens_seen": 96818360, + "step": 6017 + }, + { + "epoch": 0.42155034279860254, + "grad_norm": 3.789919376373291, + "learning_rate": 5.788453940455343e-05, + "loss": 1.0815, + "num_input_tokens_seen": 96833928, + "step": 6018 + }, + { + "epoch": 0.4216203910443318, + "grad_norm": 4.240170001983643, + "learning_rate": 5.78775411558669e-05, + "loss": 1.0818, + "num_input_tokens_seen": 96850312, + "step": 6019 + }, + { + "epoch": 0.421690439290061, + "grad_norm": 5.163384914398193, + "learning_rate": 5.7870542907180393e-05, + "loss": 1.061, + "num_input_tokens_seen": 96866696, + "step": 6020 + }, + { + "epoch": 0.42176048753579026, + "grad_norm": 3.653265953063965, + "learning_rate": 5.786354465849388e-05, + "loss": 0.8955, + "num_input_tokens_seen": 96883080, + "step": 6021 + }, + { + "epoch": 0.4218305357815195, + "grad_norm": 3.4269649982452393, + "learning_rate": 5.785654640980736e-05, + "loss": 1.009, + "num_input_tokens_seen": 96899176, + "step": 6022 + }, + { + "epoch": 0.4219005840272488, + "grad_norm": 5.8838276863098145, + "learning_rate": 5.784954816112084e-05, + "loss": 1.0385, + "num_input_tokens_seen": 96914576, + "step": 6023 + }, + { + "epoch": 0.42197063227297804, + "grad_norm": 4.201550006866455, + "learning_rate": 5.784254991243433e-05, + "loss": 1.0398, + "num_input_tokens_seen": 96930808, + "step": 6024 + }, + { + "epoch": 0.42204068051870725, + "grad_norm": 3.961399793624878, + "learning_rate": 5.783555166374781e-05, + "loss": 1.0815, + "num_input_tokens_seen": 96947192, + "step": 6025 + }, + { + "epoch": 0.4221107287644365, + "grad_norm": 4.811456680297852, + "learning_rate": 5.782855341506129e-05, + "loss": 1.0576, + "num_input_tokens_seen": 96961896, + "step": 6026 + }, + { + "epoch": 0.42218077701016576, + "grad_norm": 3.6154356002807617, + "learning_rate": 5.7821555166374787e-05, + "loss": 0.9678, + "num_input_tokens_seen": 96977656, + "step": 6027 + }, + { + "epoch": 0.422250825255895, + "grad_norm": 3.787724256515503, + "learning_rate": 5.7814556917688275e-05, + "loss": 1.0813, + "num_input_tokens_seen": 96993936, + "step": 6028 + }, + { + "epoch": 0.4223208735016242, + "grad_norm": 4.215615272521973, + "learning_rate": 5.780755866900175e-05, + "loss": 1.2758, + "num_input_tokens_seen": 97010320, + "step": 6029 + }, + { + "epoch": 0.4223909217473535, + "grad_norm": 3.9257047176361084, + "learning_rate": 5.780056042031524e-05, + "loss": 0.9753, + "num_input_tokens_seen": 97026704, + "step": 6030 + }, + { + "epoch": 0.42246096999308275, + "grad_norm": 3.5415945053100586, + "learning_rate": 5.7793562171628734e-05, + "loss": 1.0718, + "num_input_tokens_seen": 97043088, + "step": 6031 + }, + { + "epoch": 0.422531018238812, + "grad_norm": 4.213465213775635, + "learning_rate": 5.77865639229422e-05, + "loss": 1.0011, + "num_input_tokens_seen": 97059472, + "step": 6032 + }, + { + "epoch": 0.4226010664845412, + "grad_norm": 3.8070178031921387, + "learning_rate": 5.7779565674255684e-05, + "loss": 0.881, + "num_input_tokens_seen": 97074712, + "step": 6033 + }, + { + "epoch": 0.42267111473027047, + "grad_norm": 3.8083109855651855, + "learning_rate": 5.777256742556918e-05, + "loss": 1.0003, + "num_input_tokens_seen": 97091096, + "step": 6034 + }, + { + "epoch": 0.42274116297599973, + "grad_norm": 3.491002082824707, + "learning_rate": 5.7765569176882675e-05, + "loss": 1.0276, + "num_input_tokens_seen": 97107304, + "step": 6035 + }, + { + "epoch": 0.422811211221729, + "grad_norm": 4.1060919761657715, + "learning_rate": 5.775857092819616e-05, + "loss": 0.979, + "num_input_tokens_seen": 97123688, + "step": 6036 + }, + { + "epoch": 0.4228812594674582, + "grad_norm": 3.8975484371185303, + "learning_rate": 5.775157267950964e-05, + "loss": 1.0906, + "num_input_tokens_seen": 97140008, + "step": 6037 + }, + { + "epoch": 0.42295130771318745, + "grad_norm": 4.4457197189331055, + "learning_rate": 5.774457443082313e-05, + "loss": 1.0763, + "num_input_tokens_seen": 97156392, + "step": 6038 + }, + { + "epoch": 0.4230213559589167, + "grad_norm": 3.5186471939086914, + "learning_rate": 5.77375761821366e-05, + "loss": 1.0242, + "num_input_tokens_seen": 97172776, + "step": 6039 + }, + { + "epoch": 0.42309140420464597, + "grad_norm": 3.729041814804077, + "learning_rate": 5.7730577933450084e-05, + "loss": 1.0272, + "num_input_tokens_seen": 97189160, + "step": 6040 + }, + { + "epoch": 0.4231614524503752, + "grad_norm": 4.501081466674805, + "learning_rate": 5.7723579684763586e-05, + "loss": 0.9879, + "num_input_tokens_seen": 97205544, + "step": 6041 + }, + { + "epoch": 0.42323150069610443, + "grad_norm": 5.922353744506836, + "learning_rate": 5.771658143607707e-05, + "loss": 1.1519, + "num_input_tokens_seen": 97221928, + "step": 6042 + }, + { + "epoch": 0.4233015489418337, + "grad_norm": 3.649948835372925, + "learning_rate": 5.770958318739055e-05, + "loss": 0.9467, + "num_input_tokens_seen": 97238048, + "step": 6043 + }, + { + "epoch": 0.42337159718756295, + "grad_norm": 4.660130977630615, + "learning_rate": 5.770258493870403e-05, + "loss": 1.0903, + "num_input_tokens_seen": 97254272, + "step": 6044 + }, + { + "epoch": 0.42344164543329216, + "grad_norm": 4.064535140991211, + "learning_rate": 5.769558669001752e-05, + "loss": 1.2646, + "num_input_tokens_seen": 97270656, + "step": 6045 + }, + { + "epoch": 0.4235116936790214, + "grad_norm": 3.931034803390503, + "learning_rate": 5.7688588441331e-05, + "loss": 0.9511, + "num_input_tokens_seen": 97287040, + "step": 6046 + }, + { + "epoch": 0.4235817419247507, + "grad_norm": 3.920013427734375, + "learning_rate": 5.7681590192644484e-05, + "loss": 1.1886, + "num_input_tokens_seen": 97302784, + "step": 6047 + }, + { + "epoch": 0.42365179017047994, + "grad_norm": 3.356661319732666, + "learning_rate": 5.767459194395798e-05, + "loss": 0.9283, + "num_input_tokens_seen": 97319168, + "step": 6048 + }, + { + "epoch": 0.42372183841620914, + "grad_norm": 4.33698034286499, + "learning_rate": 5.766759369527145e-05, + "loss": 1.0689, + "num_input_tokens_seen": 97335552, + "step": 6049 + }, + { + "epoch": 0.4237918866619384, + "grad_norm": 6.201281547546387, + "learning_rate": 5.766059544658493e-05, + "loss": 0.9756, + "num_input_tokens_seen": 97350720, + "step": 6050 + }, + { + "epoch": 0.42386193490766766, + "grad_norm": 4.005791664123535, + "learning_rate": 5.7653597197898425e-05, + "loss": 0.9559, + "num_input_tokens_seen": 97367104, + "step": 6051 + }, + { + "epoch": 0.4239319831533969, + "grad_norm": 4.238742828369141, + "learning_rate": 5.764659894921191e-05, + "loss": 1.0348, + "num_input_tokens_seen": 97383488, + "step": 6052 + }, + { + "epoch": 0.4240020313991261, + "grad_norm": 4.139926433563232, + "learning_rate": 5.7639600700525395e-05, + "loss": 1.0571, + "num_input_tokens_seen": 97398864, + "step": 6053 + }, + { + "epoch": 0.4240720796448554, + "grad_norm": 3.538890838623047, + "learning_rate": 5.763260245183888e-05, + "loss": 0.9162, + "num_input_tokens_seen": 97414416, + "step": 6054 + }, + { + "epoch": 0.42414212789058464, + "grad_norm": 3.888108253479004, + "learning_rate": 5.762560420315237e-05, + "loss": 1.0937, + "num_input_tokens_seen": 97429616, + "step": 6055 + }, + { + "epoch": 0.4242121761363139, + "grad_norm": 4.287962436676025, + "learning_rate": 5.7618605954465854e-05, + "loss": 0.9786, + "num_input_tokens_seen": 97444784, + "step": 6056 + }, + { + "epoch": 0.4242822243820431, + "grad_norm": 3.5160460472106934, + "learning_rate": 5.761160770577935e-05, + "loss": 1.0405, + "num_input_tokens_seen": 97461104, + "step": 6057 + }, + { + "epoch": 0.42435227262777236, + "grad_norm": 4.076432704925537, + "learning_rate": 5.760460945709283e-05, + "loss": 1.1768, + "num_input_tokens_seen": 97477488, + "step": 6058 + }, + { + "epoch": 0.4244223208735016, + "grad_norm": 3.4506590366363525, + "learning_rate": 5.759761120840631e-05, + "loss": 0.9435, + "num_input_tokens_seen": 97493872, + "step": 6059 + }, + { + "epoch": 0.4244923691192309, + "grad_norm": 4.196661472320557, + "learning_rate": 5.7590612959719795e-05, + "loss": 1.0714, + "num_input_tokens_seen": 97509088, + "step": 6060 + }, + { + "epoch": 0.42456241736496014, + "grad_norm": 4.412662506103516, + "learning_rate": 5.758361471103328e-05, + "loss": 1.1809, + "num_input_tokens_seen": 97525472, + "step": 6061 + }, + { + "epoch": 0.42463246561068935, + "grad_norm": 3.4199881553649902, + "learning_rate": 5.7576616462346765e-05, + "loss": 1.0078, + "num_input_tokens_seen": 97541856, + "step": 6062 + }, + { + "epoch": 0.4247025138564186, + "grad_norm": 4.215256214141846, + "learning_rate": 5.756961821366025e-05, + "loss": 0.9772, + "num_input_tokens_seen": 97558240, + "step": 6063 + }, + { + "epoch": 0.42477256210214787, + "grad_norm": 4.764070510864258, + "learning_rate": 5.756261996497374e-05, + "loss": 1.1994, + "num_input_tokens_seen": 97574624, + "step": 6064 + }, + { + "epoch": 0.4248426103478771, + "grad_norm": 3.8896613121032715, + "learning_rate": 5.7555621716287224e-05, + "loss": 1.1135, + "num_input_tokens_seen": 97591008, + "step": 6065 + }, + { + "epoch": 0.42491265859360633, + "grad_norm": 5.101664066314697, + "learning_rate": 5.7548623467600706e-05, + "loss": 1.0647, + "num_input_tokens_seen": 97607392, + "step": 6066 + }, + { + "epoch": 0.4249827068393356, + "grad_norm": 4.464064121246338, + "learning_rate": 5.754162521891419e-05, + "loss": 1.0992, + "num_input_tokens_seen": 97623776, + "step": 6067 + }, + { + "epoch": 0.42505275508506485, + "grad_norm": 3.9882681369781494, + "learning_rate": 5.7534626970227676e-05, + "loss": 1.0344, + "num_input_tokens_seen": 97639296, + "step": 6068 + }, + { + "epoch": 0.4251228033307941, + "grad_norm": 5.5437331199646, + "learning_rate": 5.752762872154116e-05, + "loss": 1.0782, + "num_input_tokens_seen": 97655592, + "step": 6069 + }, + { + "epoch": 0.4251928515765233, + "grad_norm": 4.157887935638428, + "learning_rate": 5.752063047285464e-05, + "loss": 1.2531, + "num_input_tokens_seen": 97671976, + "step": 6070 + }, + { + "epoch": 0.42526289982225257, + "grad_norm": 4.455500602722168, + "learning_rate": 5.751363222416812e-05, + "loss": 1.0738, + "num_input_tokens_seen": 97688360, + "step": 6071 + }, + { + "epoch": 0.42533294806798183, + "grad_norm": 5.3056254386901855, + "learning_rate": 5.750663397548162e-05, + "loss": 1.2483, + "num_input_tokens_seen": 97704008, + "step": 6072 + }, + { + "epoch": 0.4254029963137111, + "grad_norm": 3.5183193683624268, + "learning_rate": 5.74996357267951e-05, + "loss": 0.9862, + "num_input_tokens_seen": 97720392, + "step": 6073 + }, + { + "epoch": 0.4254730445594403, + "grad_norm": 4.44768762588501, + "learning_rate": 5.749263747810859e-05, + "loss": 1.2951, + "num_input_tokens_seen": 97736584, + "step": 6074 + }, + { + "epoch": 0.42554309280516955, + "grad_norm": 3.6957905292510986, + "learning_rate": 5.748563922942207e-05, + "loss": 1.2134, + "num_input_tokens_seen": 97752968, + "step": 6075 + }, + { + "epoch": 0.4256131410508988, + "grad_norm": 3.6841094493865967, + "learning_rate": 5.747864098073555e-05, + "loss": 0.9744, + "num_input_tokens_seen": 97769352, + "step": 6076 + }, + { + "epoch": 0.4256831892966281, + "grad_norm": 6.541488170623779, + "learning_rate": 5.747164273204903e-05, + "loss": 1.1638, + "num_input_tokens_seen": 97785736, + "step": 6077 + }, + { + "epoch": 0.4257532375423573, + "grad_norm": 4.056735515594482, + "learning_rate": 5.7464644483362515e-05, + "loss": 0.9758, + "num_input_tokens_seen": 97801624, + "step": 6078 + }, + { + "epoch": 0.42582328578808654, + "grad_norm": 3.5294058322906494, + "learning_rate": 5.745764623467601e-05, + "loss": 0.9682, + "num_input_tokens_seen": 97817544, + "step": 6079 + }, + { + "epoch": 0.4258933340338158, + "grad_norm": 3.851330280303955, + "learning_rate": 5.745064798598949e-05, + "loss": 1.0858, + "num_input_tokens_seen": 97833600, + "step": 6080 + }, + { + "epoch": 0.42596338227954506, + "grad_norm": 3.6939046382904053, + "learning_rate": 5.744364973730299e-05, + "loss": 0.9469, + "num_input_tokens_seen": 97849984, + "step": 6081 + }, + { + "epoch": 0.42603343052527426, + "grad_norm": 3.7894139289855957, + "learning_rate": 5.743665148861647e-05, + "loss": 1.1953, + "num_input_tokens_seen": 97866368, + "step": 6082 + }, + { + "epoch": 0.4261034787710035, + "grad_norm": 3.377105712890625, + "learning_rate": 5.742965323992995e-05, + "loss": 1.0573, + "num_input_tokens_seen": 97882752, + "step": 6083 + }, + { + "epoch": 0.4261735270167328, + "grad_norm": 4.0349440574646, + "learning_rate": 5.742265499124344e-05, + "loss": 1.1328, + "num_input_tokens_seen": 97899136, + "step": 6084 + }, + { + "epoch": 0.42624357526246204, + "grad_norm": 3.9353208541870117, + "learning_rate": 5.7415656742556935e-05, + "loss": 0.9787, + "num_input_tokens_seen": 97915520, + "step": 6085 + }, + { + "epoch": 0.42631362350819124, + "grad_norm": 3.6593427658081055, + "learning_rate": 5.740865849387042e-05, + "loss": 1.0417, + "num_input_tokens_seen": 97931904, + "step": 6086 + }, + { + "epoch": 0.4263836717539205, + "grad_norm": 3.543994665145874, + "learning_rate": 5.7401660245183885e-05, + "loss": 0.9268, + "num_input_tokens_seen": 97948288, + "step": 6087 + }, + { + "epoch": 0.42645371999964976, + "grad_norm": 3.925420045852661, + "learning_rate": 5.739466199649738e-05, + "loss": 1.1635, + "num_input_tokens_seen": 97964672, + "step": 6088 + }, + { + "epoch": 0.426523768245379, + "grad_norm": 4.6036224365234375, + "learning_rate": 5.738766374781086e-05, + "loss": 1.1229, + "num_input_tokens_seen": 97981056, + "step": 6089 + }, + { + "epoch": 0.4265938164911082, + "grad_norm": 6.555153846740723, + "learning_rate": 5.7380665499124344e-05, + "loss": 1.1401, + "num_input_tokens_seen": 97997440, + "step": 6090 + }, + { + "epoch": 0.4266638647368375, + "grad_norm": 3.7414231300354004, + "learning_rate": 5.737366725043783e-05, + "loss": 1.0223, + "num_input_tokens_seen": 98013264, + "step": 6091 + }, + { + "epoch": 0.42673391298256674, + "grad_norm": 4.380615234375, + "learning_rate": 5.7366669001751314e-05, + "loss": 1.1524, + "num_input_tokens_seen": 98029176, + "step": 6092 + }, + { + "epoch": 0.426803961228296, + "grad_norm": 4.624136924743652, + "learning_rate": 5.7359670753064796e-05, + "loss": 1.1277, + "num_input_tokens_seen": 98044384, + "step": 6093 + }, + { + "epoch": 0.42687400947402526, + "grad_norm": 4.984564781188965, + "learning_rate": 5.735267250437828e-05, + "loss": 1.1115, + "num_input_tokens_seen": 98060768, + "step": 6094 + }, + { + "epoch": 0.42694405771975447, + "grad_norm": 5.481975078582764, + "learning_rate": 5.734567425569178e-05, + "loss": 1.167, + "num_input_tokens_seen": 98077152, + "step": 6095 + }, + { + "epoch": 0.4270141059654837, + "grad_norm": 3.3822808265686035, + "learning_rate": 5.733867600700526e-05, + "loss": 0.9442, + "num_input_tokens_seen": 98093224, + "step": 6096 + }, + { + "epoch": 0.427084154211213, + "grad_norm": 3.8090853691101074, + "learning_rate": 5.733167775831874e-05, + "loss": 1.0478, + "num_input_tokens_seen": 98109608, + "step": 6097 + }, + { + "epoch": 0.42715420245694224, + "grad_norm": 4.279370307922363, + "learning_rate": 5.7324679509632226e-05, + "loss": 0.918, + "num_input_tokens_seen": 98125992, + "step": 6098 + }, + { + "epoch": 0.42722425070267145, + "grad_norm": 5.998210430145264, + "learning_rate": 5.731768126094571e-05, + "loss": 0.9409, + "num_input_tokens_seen": 98142376, + "step": 6099 + }, + { + "epoch": 0.4272942989484007, + "grad_norm": 4.388184070587158, + "learning_rate": 5.731068301225919e-05, + "loss": 1.0364, + "num_input_tokens_seen": 98158760, + "step": 6100 + }, + { + "epoch": 0.42736434719412997, + "grad_norm": 4.937825679779053, + "learning_rate": 5.7303684763572685e-05, + "loss": 1.1185, + "num_input_tokens_seen": 98175144, + "step": 6101 + }, + { + "epoch": 0.4274343954398592, + "grad_norm": 3.800776720046997, + "learning_rate": 5.729668651488618e-05, + "loss": 1.1608, + "num_input_tokens_seen": 98191184, + "step": 6102 + }, + { + "epoch": 0.42750444368558843, + "grad_norm": 3.857093334197998, + "learning_rate": 5.728968826619966e-05, + "loss": 0.8588, + "num_input_tokens_seen": 98207568, + "step": 6103 + }, + { + "epoch": 0.4275744919313177, + "grad_norm": 3.562218189239502, + "learning_rate": 5.728269001751313e-05, + "loss": 1.1002, + "num_input_tokens_seen": 98223952, + "step": 6104 + }, + { + "epoch": 0.42764454017704695, + "grad_norm": 3.826802968978882, + "learning_rate": 5.7275691768826626e-05, + "loss": 0.7401, + "num_input_tokens_seen": 98239576, + "step": 6105 + }, + { + "epoch": 0.4277145884227762, + "grad_norm": 4.127960205078125, + "learning_rate": 5.7268693520140114e-05, + "loss": 1.0163, + "num_input_tokens_seen": 98255960, + "step": 6106 + }, + { + "epoch": 0.4277846366685054, + "grad_norm": 4.270632743835449, + "learning_rate": 5.726169527145359e-05, + "loss": 1.2359, + "num_input_tokens_seen": 98272080, + "step": 6107 + }, + { + "epoch": 0.4278546849142347, + "grad_norm": 4.543783187866211, + "learning_rate": 5.725469702276708e-05, + "loss": 1.1117, + "num_input_tokens_seen": 98288464, + "step": 6108 + }, + { + "epoch": 0.42792473315996393, + "grad_norm": 3.993234634399414, + "learning_rate": 5.724769877408057e-05, + "loss": 1.0059, + "num_input_tokens_seen": 98304424, + "step": 6109 + }, + { + "epoch": 0.4279947814056932, + "grad_norm": 4.11693000793457, + "learning_rate": 5.7240700525394055e-05, + "loss": 1.0718, + "num_input_tokens_seen": 98320808, + "step": 6110 + }, + { + "epoch": 0.4280648296514224, + "grad_norm": 4.000871658325195, + "learning_rate": 5.723370227670754e-05, + "loss": 0.9777, + "num_input_tokens_seen": 98337192, + "step": 6111 + }, + { + "epoch": 0.42813487789715166, + "grad_norm": 3.642763614654541, + "learning_rate": 5.7226704028021025e-05, + "loss": 0.9108, + "num_input_tokens_seen": 98353320, + "step": 6112 + }, + { + "epoch": 0.4282049261428809, + "grad_norm": 4.22330379486084, + "learning_rate": 5.721970577933451e-05, + "loss": 1.0968, + "num_input_tokens_seen": 98369704, + "step": 6113 + }, + { + "epoch": 0.4282749743886102, + "grad_norm": 3.7961175441741943, + "learning_rate": 5.721270753064798e-05, + "loss": 0.8756, + "num_input_tokens_seen": 98385544, + "step": 6114 + }, + { + "epoch": 0.4283450226343394, + "grad_norm": 3.771034002304077, + "learning_rate": 5.720570928196147e-05, + "loss": 1.1139, + "num_input_tokens_seen": 98401928, + "step": 6115 + }, + { + "epoch": 0.42841507088006864, + "grad_norm": 3.8084332942962646, + "learning_rate": 5.719871103327495e-05, + "loss": 1.1042, + "num_input_tokens_seen": 98418136, + "step": 6116 + }, + { + "epoch": 0.4284851191257979, + "grad_norm": 3.890608549118042, + "learning_rate": 5.7191712784588434e-05, + "loss": 0.9865, + "num_input_tokens_seen": 98433656, + "step": 6117 + }, + { + "epoch": 0.42855516737152716, + "grad_norm": 6.781351089477539, + "learning_rate": 5.718471453590193e-05, + "loss": 0.8032, + "num_input_tokens_seen": 98448776, + "step": 6118 + }, + { + "epoch": 0.42862521561725636, + "grad_norm": 3.941107749938965, + "learning_rate": 5.717771628721542e-05, + "loss": 1.104, + "num_input_tokens_seen": 98465160, + "step": 6119 + }, + { + "epoch": 0.4286952638629856, + "grad_norm": 4.457616329193115, + "learning_rate": 5.71707180385289e-05, + "loss": 1.1159, + "num_input_tokens_seen": 98481184, + "step": 6120 + }, + { + "epoch": 0.4287653121087149, + "grad_norm": 3.889111042022705, + "learning_rate": 5.7163719789842375e-05, + "loss": 1.0685, + "num_input_tokens_seen": 98497568, + "step": 6121 + }, + { + "epoch": 0.42883536035444414, + "grad_norm": 3.7574422359466553, + "learning_rate": 5.715672154115588e-05, + "loss": 0.9091, + "num_input_tokens_seen": 98513920, + "step": 6122 + }, + { + "epoch": 0.42890540860017334, + "grad_norm": 3.578437089920044, + "learning_rate": 5.714972329246936e-05, + "loss": 0.9449, + "num_input_tokens_seen": 98529664, + "step": 6123 + }, + { + "epoch": 0.4289754568459026, + "grad_norm": 5.0676398277282715, + "learning_rate": 5.714272504378283e-05, + "loss": 1.0768, + "num_input_tokens_seen": 98544936, + "step": 6124 + }, + { + "epoch": 0.42904550509163186, + "grad_norm": 4.475335121154785, + "learning_rate": 5.713572679509632e-05, + "loss": 0.9347, + "num_input_tokens_seen": 98560520, + "step": 6125 + }, + { + "epoch": 0.4291155533373611, + "grad_norm": 6.345788955688477, + "learning_rate": 5.712872854640982e-05, + "loss": 1.1897, + "num_input_tokens_seen": 98576320, + "step": 6126 + }, + { + "epoch": 0.4291856015830903, + "grad_norm": 3.775374174118042, + "learning_rate": 5.71217302977233e-05, + "loss": 0.9803, + "num_input_tokens_seen": 98592704, + "step": 6127 + }, + { + "epoch": 0.4292556498288196, + "grad_norm": 4.224292278289795, + "learning_rate": 5.711473204903678e-05, + "loss": 1.2253, + "num_input_tokens_seen": 98607664, + "step": 6128 + }, + { + "epoch": 0.42932569807454884, + "grad_norm": 4.470034122467041, + "learning_rate": 5.710773380035027e-05, + "loss": 0.9915, + "num_input_tokens_seen": 98624048, + "step": 6129 + }, + { + "epoch": 0.4293957463202781, + "grad_norm": 6.22687292098999, + "learning_rate": 5.710073555166375e-05, + "loss": 1.2048, + "num_input_tokens_seen": 98640432, + "step": 6130 + }, + { + "epoch": 0.42946579456600736, + "grad_norm": 3.9434430599212646, + "learning_rate": 5.709373730297722e-05, + "loss": 1.0306, + "num_input_tokens_seen": 98656672, + "step": 6131 + }, + { + "epoch": 0.42953584281173657, + "grad_norm": 3.7640228271484375, + "learning_rate": 5.7086739054290716e-05, + "loss": 1.061, + "num_input_tokens_seen": 98673056, + "step": 6132 + }, + { + "epoch": 0.4296058910574658, + "grad_norm": 5.742674827575684, + "learning_rate": 5.707974080560421e-05, + "loss": 1.0773, + "num_input_tokens_seen": 98688400, + "step": 6133 + }, + { + "epoch": 0.4296759393031951, + "grad_norm": 4.938521862030029, + "learning_rate": 5.707274255691769e-05, + "loss": 0.9877, + "num_input_tokens_seen": 98703304, + "step": 6134 + }, + { + "epoch": 0.42974598754892435, + "grad_norm": 3.7322773933410645, + "learning_rate": 5.7065744308231175e-05, + "loss": 0.9787, + "num_input_tokens_seen": 98717536, + "step": 6135 + }, + { + "epoch": 0.42981603579465355, + "grad_norm": 3.741265296936035, + "learning_rate": 5.7058746059544663e-05, + "loss": 1.1105, + "num_input_tokens_seen": 98733632, + "step": 6136 + }, + { + "epoch": 0.4298860840403828, + "grad_norm": 3.9021074771881104, + "learning_rate": 5.7051747810858145e-05, + "loss": 0.9721, + "num_input_tokens_seen": 98749088, + "step": 6137 + }, + { + "epoch": 0.42995613228611207, + "grad_norm": 4.327329635620117, + "learning_rate": 5.704474956217163e-05, + "loss": 1.0862, + "num_input_tokens_seen": 98765328, + "step": 6138 + }, + { + "epoch": 0.43002618053184133, + "grad_norm": 4.335643768310547, + "learning_rate": 5.703775131348512e-05, + "loss": 1.2386, + "num_input_tokens_seen": 98780744, + "step": 6139 + }, + { + "epoch": 0.43009622877757053, + "grad_norm": 4.66419792175293, + "learning_rate": 5.7030753064798604e-05, + "loss": 1.0696, + "num_input_tokens_seen": 98797128, + "step": 6140 + }, + { + "epoch": 0.4301662770232998, + "grad_norm": 4.208861351013184, + "learning_rate": 5.702375481611207e-05, + "loss": 0.9743, + "num_input_tokens_seen": 98812776, + "step": 6141 + }, + { + "epoch": 0.43023632526902905, + "grad_norm": 4.90700626373291, + "learning_rate": 5.701675656742557e-05, + "loss": 0.9744, + "num_input_tokens_seen": 98829160, + "step": 6142 + }, + { + "epoch": 0.4303063735147583, + "grad_norm": 3.942166805267334, + "learning_rate": 5.7009758318739056e-05, + "loss": 1.0032, + "num_input_tokens_seen": 98845544, + "step": 6143 + }, + { + "epoch": 0.4303764217604875, + "grad_norm": 4.919578552246094, + "learning_rate": 5.700276007005254e-05, + "loss": 1.0218, + "num_input_tokens_seen": 98861928, + "step": 6144 + }, + { + "epoch": 0.4304464700062168, + "grad_norm": 3.6429073810577393, + "learning_rate": 5.699576182136602e-05, + "loss": 1.1052, + "num_input_tokens_seen": 98878288, + "step": 6145 + }, + { + "epoch": 0.43051651825194603, + "grad_norm": 4.227152347564697, + "learning_rate": 5.6988763572679515e-05, + "loss": 1.059, + "num_input_tokens_seen": 98893816, + "step": 6146 + }, + { + "epoch": 0.4305865664976753, + "grad_norm": 4.016188144683838, + "learning_rate": 5.6981765323993e-05, + "loss": 0.9898, + "num_input_tokens_seen": 98909968, + "step": 6147 + }, + { + "epoch": 0.4306566147434045, + "grad_norm": 4.0402069091796875, + "learning_rate": 5.697476707530648e-05, + "loss": 1.1366, + "num_input_tokens_seen": 98926352, + "step": 6148 + }, + { + "epoch": 0.43072666298913376, + "grad_norm": 5.771969318389893, + "learning_rate": 5.6967768826619974e-05, + "loss": 0.9641, + "num_input_tokens_seen": 98941512, + "step": 6149 + }, + { + "epoch": 0.430796711234863, + "grad_norm": 4.444697856903076, + "learning_rate": 5.6960770577933456e-05, + "loss": 1.1114, + "num_input_tokens_seen": 98957624, + "step": 6150 + }, + { + "epoch": 0.4308667594805923, + "grad_norm": 3.386268377304077, + "learning_rate": 5.695377232924694e-05, + "loss": 0.9552, + "num_input_tokens_seen": 98974008, + "step": 6151 + }, + { + "epoch": 0.4309368077263215, + "grad_norm": 3.950138807296753, + "learning_rate": 5.694677408056042e-05, + "loss": 1.0048, + "num_input_tokens_seen": 98990392, + "step": 6152 + }, + { + "epoch": 0.43100685597205074, + "grad_norm": 3.7290585041046143, + "learning_rate": 5.693977583187392e-05, + "loss": 1.103, + "num_input_tokens_seen": 99006776, + "step": 6153 + }, + { + "epoch": 0.43107690421778, + "grad_norm": 3.3678364753723145, + "learning_rate": 5.693277758318739e-05, + "loss": 0.7396, + "num_input_tokens_seen": 99022912, + "step": 6154 + }, + { + "epoch": 0.43114695246350926, + "grad_norm": 5.882314682006836, + "learning_rate": 5.6925779334500886e-05, + "loss": 1.0949, + "num_input_tokens_seen": 99038208, + "step": 6155 + }, + { + "epoch": 0.43121700070923846, + "grad_norm": 4.231525421142578, + "learning_rate": 5.691878108581437e-05, + "loss": 1.0437, + "num_input_tokens_seen": 99053496, + "step": 6156 + }, + { + "epoch": 0.4312870489549677, + "grad_norm": 4.864506721496582, + "learning_rate": 5.691178283712785e-05, + "loss": 1.0978, + "num_input_tokens_seen": 99069600, + "step": 6157 + }, + { + "epoch": 0.431357097200697, + "grad_norm": 6.483276844024658, + "learning_rate": 5.690478458844133e-05, + "loss": 0.9262, + "num_input_tokens_seen": 99085456, + "step": 6158 + }, + { + "epoch": 0.43142714544642624, + "grad_norm": 3.830292224884033, + "learning_rate": 5.689778633975482e-05, + "loss": 1.1837, + "num_input_tokens_seen": 99101840, + "step": 6159 + }, + { + "epoch": 0.43149719369215545, + "grad_norm": 4.078514099121094, + "learning_rate": 5.68907880910683e-05, + "loss": 0.9916, + "num_input_tokens_seen": 99118224, + "step": 6160 + }, + { + "epoch": 0.4315672419378847, + "grad_norm": 4.1833648681640625, + "learning_rate": 5.688378984238178e-05, + "loss": 1.2243, + "num_input_tokens_seen": 99134608, + "step": 6161 + }, + { + "epoch": 0.43163729018361396, + "grad_norm": 4.761826515197754, + "learning_rate": 5.6876791593695265e-05, + "loss": 1.1017, + "num_input_tokens_seen": 99150992, + "step": 6162 + }, + { + "epoch": 0.4317073384293432, + "grad_norm": 4.992908954620361, + "learning_rate": 5.686979334500877e-05, + "loss": 0.9658, + "num_input_tokens_seen": 99167320, + "step": 6163 + }, + { + "epoch": 0.4317773866750725, + "grad_norm": 3.8283936977386475, + "learning_rate": 5.686279509632224e-05, + "loss": 1.0521, + "num_input_tokens_seen": 99183512, + "step": 6164 + }, + { + "epoch": 0.4318474349208017, + "grad_norm": 3.4508893489837646, + "learning_rate": 5.685579684763573e-05, + "loss": 0.9632, + "num_input_tokens_seen": 99199896, + "step": 6165 + }, + { + "epoch": 0.43191748316653095, + "grad_norm": 4.4798431396484375, + "learning_rate": 5.684879859894921e-05, + "loss": 0.8848, + "num_input_tokens_seen": 99216280, + "step": 6166 + }, + { + "epoch": 0.4319875314122602, + "grad_norm": 5.227555751800537, + "learning_rate": 5.6841800350262694e-05, + "loss": 0.9621, + "num_input_tokens_seen": 99230656, + "step": 6167 + }, + { + "epoch": 0.43205757965798947, + "grad_norm": 5.358756065368652, + "learning_rate": 5.6834802101576176e-05, + "loss": 1.0361, + "num_input_tokens_seen": 99246864, + "step": 6168 + }, + { + "epoch": 0.43212762790371867, + "grad_norm": 4.224287986755371, + "learning_rate": 5.682780385288967e-05, + "loss": 1.0804, + "num_input_tokens_seen": 99263248, + "step": 6169 + }, + { + "epoch": 0.43219767614944793, + "grad_norm": 5.748126983642578, + "learning_rate": 5.682080560420317e-05, + "loss": 1.0353, + "num_input_tokens_seen": 99279632, + "step": 6170 + }, + { + "epoch": 0.4322677243951772, + "grad_norm": 4.036735534667969, + "learning_rate": 5.6813807355516635e-05, + "loss": 1.0776, + "num_input_tokens_seen": 99296016, + "step": 6171 + }, + { + "epoch": 0.43233777264090645, + "grad_norm": 4.133121013641357, + "learning_rate": 5.680680910683013e-05, + "loss": 1.0796, + "num_input_tokens_seen": 99312400, + "step": 6172 + }, + { + "epoch": 0.43240782088663565, + "grad_norm": 4.70187520980835, + "learning_rate": 5.679981085814362e-05, + "loss": 1.1069, + "num_input_tokens_seen": 99328504, + "step": 6173 + }, + { + "epoch": 0.4324778691323649, + "grad_norm": 3.515967845916748, + "learning_rate": 5.6792812609457094e-05, + "loss": 1.0462, + "num_input_tokens_seen": 99344424, + "step": 6174 + }, + { + "epoch": 0.43254791737809417, + "grad_norm": 5.408679962158203, + "learning_rate": 5.6785814360770576e-05, + "loss": 1.0638, + "num_input_tokens_seen": 99360056, + "step": 6175 + }, + { + "epoch": 0.43261796562382343, + "grad_norm": 3.7438695430755615, + "learning_rate": 5.677881611208408e-05, + "loss": 1.0115, + "num_input_tokens_seen": 99375920, + "step": 6176 + }, + { + "epoch": 0.43268801386955263, + "grad_norm": 4.337923526763916, + "learning_rate": 5.677181786339756e-05, + "loss": 1.1368, + "num_input_tokens_seen": 99392040, + "step": 6177 + }, + { + "epoch": 0.4327580621152819, + "grad_norm": 6.04982852935791, + "learning_rate": 5.676481961471103e-05, + "loss": 1.1253, + "num_input_tokens_seen": 99407592, + "step": 6178 + }, + { + "epoch": 0.43282811036101115, + "grad_norm": 3.9268686771392822, + "learning_rate": 5.6757821366024524e-05, + "loss": 1.1957, + "num_input_tokens_seen": 99423976, + "step": 6179 + }, + { + "epoch": 0.4328981586067404, + "grad_norm": 4.466431140899658, + "learning_rate": 5.675082311733801e-05, + "loss": 0.8999, + "num_input_tokens_seen": 99440360, + "step": 6180 + }, + { + "epoch": 0.4329682068524696, + "grad_norm": 4.166913032531738, + "learning_rate": 5.674382486865149e-05, + "loss": 1.0157, + "num_input_tokens_seen": 99456744, + "step": 6181 + }, + { + "epoch": 0.4330382550981989, + "grad_norm": 3.525611400604248, + "learning_rate": 5.6736826619964976e-05, + "loss": 1.1182, + "num_input_tokens_seen": 99473080, + "step": 6182 + }, + { + "epoch": 0.43310830334392814, + "grad_norm": 6.099409103393555, + "learning_rate": 5.672982837127847e-05, + "loss": 0.9801, + "num_input_tokens_seen": 99489088, + "step": 6183 + }, + { + "epoch": 0.4331783515896574, + "grad_norm": 3.5886685848236084, + "learning_rate": 5.672283012259194e-05, + "loss": 1.0214, + "num_input_tokens_seen": 99505248, + "step": 6184 + }, + { + "epoch": 0.4332483998353866, + "grad_norm": 3.5279197692871094, + "learning_rate": 5.671583187390542e-05, + "loss": 0.9724, + "num_input_tokens_seen": 99521632, + "step": 6185 + }, + { + "epoch": 0.43331844808111586, + "grad_norm": 4.606603622436523, + "learning_rate": 5.670883362521892e-05, + "loss": 0.8623, + "num_input_tokens_seen": 99537336, + "step": 6186 + }, + { + "epoch": 0.4333884963268451, + "grad_norm": 3.5966908931732178, + "learning_rate": 5.6701835376532405e-05, + "loss": 0.9549, + "num_input_tokens_seen": 99553720, + "step": 6187 + }, + { + "epoch": 0.4334585445725744, + "grad_norm": 3.981893301010132, + "learning_rate": 5.669483712784588e-05, + "loss": 0.923, + "num_input_tokens_seen": 99569384, + "step": 6188 + }, + { + "epoch": 0.4335285928183036, + "grad_norm": 4.06168270111084, + "learning_rate": 5.668783887915937e-05, + "loss": 1.12, + "num_input_tokens_seen": 99585768, + "step": 6189 + }, + { + "epoch": 0.43359864106403284, + "grad_norm": 4.476738929748535, + "learning_rate": 5.6680840630472864e-05, + "loss": 1.0997, + "num_input_tokens_seen": 99602152, + "step": 6190 + }, + { + "epoch": 0.4336686893097621, + "grad_norm": 7.592894554138184, + "learning_rate": 5.667384238178633e-05, + "loss": 1.1001, + "num_input_tokens_seen": 99618536, + "step": 6191 + }, + { + "epoch": 0.43373873755549136, + "grad_norm": 3.4367337226867676, + "learning_rate": 5.666684413309983e-05, + "loss": 0.8883, + "num_input_tokens_seen": 99634920, + "step": 6192 + }, + { + "epoch": 0.43380878580122056, + "grad_norm": 3.8736166954040527, + "learning_rate": 5.665984588441332e-05, + "loss": 1.1062, + "num_input_tokens_seen": 99651304, + "step": 6193 + }, + { + "epoch": 0.4338788340469498, + "grad_norm": 4.0018463134765625, + "learning_rate": 5.6652847635726805e-05, + "loss": 1.15, + "num_input_tokens_seen": 99667688, + "step": 6194 + }, + { + "epoch": 0.4339488822926791, + "grad_norm": 4.243009090423584, + "learning_rate": 5.664584938704029e-05, + "loss": 1.0738, + "num_input_tokens_seen": 99684072, + "step": 6195 + }, + { + "epoch": 0.43401893053840834, + "grad_norm": 5.533624172210693, + "learning_rate": 5.663885113835377e-05, + "loss": 0.9901, + "num_input_tokens_seen": 99698440, + "step": 6196 + }, + { + "epoch": 0.4340889787841376, + "grad_norm": 3.9158618450164795, + "learning_rate": 5.663185288966726e-05, + "loss": 0.9884, + "num_input_tokens_seen": 99714824, + "step": 6197 + }, + { + "epoch": 0.4341590270298668, + "grad_norm": 6.666274070739746, + "learning_rate": 5.662485464098073e-05, + "loss": 1.3152, + "num_input_tokens_seen": 99731208, + "step": 6198 + }, + { + "epoch": 0.43422907527559607, + "grad_norm": 4.02492618560791, + "learning_rate": 5.6617856392294235e-05, + "loss": 1.014, + "num_input_tokens_seen": 99747592, + "step": 6199 + }, + { + "epoch": 0.4342991235213253, + "grad_norm": 3.4257941246032715, + "learning_rate": 5.6610858143607716e-05, + "loss": 0.9081, + "num_input_tokens_seen": 99763656, + "step": 6200 + }, + { + "epoch": 0.4342991235213253, + "eval_loss": 1.126607060432434, + "eval_runtime": 0.1953, + "eval_samples_per_second": 5.119, + "eval_steps_per_second": 5.119, + "num_input_tokens_seen": 99763656, + "step": 6200 + }, + { + "epoch": 0.4343691717670546, + "grad_norm": 3.7283267974853516, + "learning_rate": 5.66038598949212e-05, + "loss": 0.8829, + "num_input_tokens_seen": 99779416, + "step": 6201 + }, + { + "epoch": 0.4344392200127838, + "grad_norm": 5.657198429107666, + "learning_rate": 5.659686164623468e-05, + "loss": 1.1111, + "num_input_tokens_seen": 99795376, + "step": 6202 + }, + { + "epoch": 0.43450926825851305, + "grad_norm": 4.102888107299805, + "learning_rate": 5.658986339754817e-05, + "loss": 1.1068, + "num_input_tokens_seen": 99811760, + "step": 6203 + }, + { + "epoch": 0.4345793165042423, + "grad_norm": 5.449219226837158, + "learning_rate": 5.658286514886165e-05, + "loss": 0.8439, + "num_input_tokens_seen": 99827264, + "step": 6204 + }, + { + "epoch": 0.43464936474997157, + "grad_norm": 4.1982197761535645, + "learning_rate": 5.657586690017513e-05, + "loss": 1.077, + "num_input_tokens_seen": 99843648, + "step": 6205 + }, + { + "epoch": 0.43471941299570077, + "grad_norm": 4.246870517730713, + "learning_rate": 5.6568868651488614e-05, + "loss": 1.0653, + "num_input_tokens_seen": 99859800, + "step": 6206 + }, + { + "epoch": 0.43478946124143003, + "grad_norm": 7.575351238250732, + "learning_rate": 5.656187040280211e-05, + "loss": 1.231, + "num_input_tokens_seen": 99875240, + "step": 6207 + }, + { + "epoch": 0.4348595094871593, + "grad_norm": 4.253138065338135, + "learning_rate": 5.655487215411558e-05, + "loss": 1.1178, + "num_input_tokens_seen": 99891624, + "step": 6208 + }, + { + "epoch": 0.43492955773288855, + "grad_norm": 3.5073490142822266, + "learning_rate": 5.654787390542908e-05, + "loss": 1.0251, + "num_input_tokens_seen": 99908008, + "step": 6209 + }, + { + "epoch": 0.43499960597861775, + "grad_norm": 3.669361114501953, + "learning_rate": 5.654087565674256e-05, + "loss": 1.0233, + "num_input_tokens_seen": 99924392, + "step": 6210 + }, + { + "epoch": 0.435069654224347, + "grad_norm": 4.25203800201416, + "learning_rate": 5.6533877408056043e-05, + "loss": 0.972, + "num_input_tokens_seen": 99940776, + "step": 6211 + }, + { + "epoch": 0.4351397024700763, + "grad_norm": 3.7570602893829346, + "learning_rate": 5.6526879159369525e-05, + "loss": 1.086, + "num_input_tokens_seen": 99956488, + "step": 6212 + }, + { + "epoch": 0.43520975071580553, + "grad_norm": 3.47245454788208, + "learning_rate": 5.651988091068301e-05, + "loss": 0.8526, + "num_input_tokens_seen": 99972792, + "step": 6213 + }, + { + "epoch": 0.43527979896153474, + "grad_norm": 4.902298927307129, + "learning_rate": 5.65128826619965e-05, + "loss": 1.1583, + "num_input_tokens_seen": 99988144, + "step": 6214 + }, + { + "epoch": 0.435349847207264, + "grad_norm": 3.796644926071167, + "learning_rate": 5.6505884413309984e-05, + "loss": 1.0739, + "num_input_tokens_seen": 100004528, + "step": 6215 + }, + { + "epoch": 0.43541989545299326, + "grad_norm": 4.4678425788879395, + "learning_rate": 5.649888616462348e-05, + "loss": 1.0843, + "num_input_tokens_seen": 100020448, + "step": 6216 + }, + { + "epoch": 0.4354899436987225, + "grad_norm": 5.181003570556641, + "learning_rate": 5.649188791593696e-05, + "loss": 0.9784, + "num_input_tokens_seen": 100036832, + "step": 6217 + }, + { + "epoch": 0.4355599919444517, + "grad_norm": 3.4864094257354736, + "learning_rate": 5.648488966725044e-05, + "loss": 1.1016, + "num_input_tokens_seen": 100052656, + "step": 6218 + }, + { + "epoch": 0.435630040190181, + "grad_norm": 3.6500463485717773, + "learning_rate": 5.647789141856393e-05, + "loss": 0.9406, + "num_input_tokens_seen": 100069040, + "step": 6219 + }, + { + "epoch": 0.43570008843591024, + "grad_norm": 4.463146686553955, + "learning_rate": 5.647089316987743e-05, + "loss": 1.0597, + "num_input_tokens_seen": 100085248, + "step": 6220 + }, + { + "epoch": 0.4357701366816395, + "grad_norm": 4.013953685760498, + "learning_rate": 5.6463894921190895e-05, + "loss": 0.9562, + "num_input_tokens_seen": 100100600, + "step": 6221 + }, + { + "epoch": 0.4358401849273687, + "grad_norm": 4.549919128417969, + "learning_rate": 5.645689667250438e-05, + "loss": 0.9911, + "num_input_tokens_seen": 100116368, + "step": 6222 + }, + { + "epoch": 0.43591023317309796, + "grad_norm": 4.227685451507568, + "learning_rate": 5.644989842381787e-05, + "loss": 1.0951, + "num_input_tokens_seen": 100132752, + "step": 6223 + }, + { + "epoch": 0.4359802814188272, + "grad_norm": 3.699406147003174, + "learning_rate": 5.6442900175131354e-05, + "loss": 0.9597, + "num_input_tokens_seen": 100149048, + "step": 6224 + }, + { + "epoch": 0.4360503296645565, + "grad_norm": 4.6700944900512695, + "learning_rate": 5.6435901926444836e-05, + "loss": 1.1303, + "num_input_tokens_seen": 100165432, + "step": 6225 + }, + { + "epoch": 0.4361203779102857, + "grad_norm": 3.524517774581909, + "learning_rate": 5.6428903677758325e-05, + "loss": 1.002, + "num_input_tokens_seen": 100181672, + "step": 6226 + }, + { + "epoch": 0.43619042615601494, + "grad_norm": 3.6368181705474854, + "learning_rate": 5.642190542907181e-05, + "loss": 0.9204, + "num_input_tokens_seen": 100198056, + "step": 6227 + }, + { + "epoch": 0.4362604744017442, + "grad_norm": 4.629672050476074, + "learning_rate": 5.641490718038529e-05, + "loss": 1.0703, + "num_input_tokens_seen": 100214440, + "step": 6228 + }, + { + "epoch": 0.43633052264747346, + "grad_norm": 4.120620250701904, + "learning_rate": 5.640790893169877e-05, + "loss": 0.9761, + "num_input_tokens_seen": 100230824, + "step": 6229 + }, + { + "epoch": 0.43640057089320267, + "grad_norm": 3.8496460914611816, + "learning_rate": 5.640091068301227e-05, + "loss": 1.0973, + "num_input_tokens_seen": 100245920, + "step": 6230 + }, + { + "epoch": 0.4364706191389319, + "grad_norm": 3.8419101238250732, + "learning_rate": 5.639391243432575e-05, + "loss": 1.1615, + "num_input_tokens_seen": 100262184, + "step": 6231 + }, + { + "epoch": 0.4365406673846612, + "grad_norm": 4.283138751983643, + "learning_rate": 5.638691418563923e-05, + "loss": 0.999, + "num_input_tokens_seen": 100278568, + "step": 6232 + }, + { + "epoch": 0.43661071563039044, + "grad_norm": 3.7390506267547607, + "learning_rate": 5.637991593695272e-05, + "loss": 0.9675, + "num_input_tokens_seen": 100294952, + "step": 6233 + }, + { + "epoch": 0.4366807638761197, + "grad_norm": 3.894780158996582, + "learning_rate": 5.63729176882662e-05, + "loss": 0.996, + "num_input_tokens_seen": 100310536, + "step": 6234 + }, + { + "epoch": 0.4367508121218489, + "grad_norm": 5.446288108825684, + "learning_rate": 5.636591943957968e-05, + "loss": 0.973, + "num_input_tokens_seen": 100326336, + "step": 6235 + }, + { + "epoch": 0.43682086036757817, + "grad_norm": 5.487906455993652, + "learning_rate": 5.635892119089318e-05, + "loss": 0.9008, + "num_input_tokens_seen": 100342720, + "step": 6236 + }, + { + "epoch": 0.4368909086133074, + "grad_norm": 4.296425819396973, + "learning_rate": 5.635192294220667e-05, + "loss": 1.0326, + "num_input_tokens_seen": 100358912, + "step": 6237 + }, + { + "epoch": 0.4369609568590367, + "grad_norm": 4.339141845703125, + "learning_rate": 5.634492469352014e-05, + "loss": 1.1232, + "num_input_tokens_seen": 100374952, + "step": 6238 + }, + { + "epoch": 0.4370310051047659, + "grad_norm": 4.520789623260498, + "learning_rate": 5.633792644483362e-05, + "loss": 1.1701, + "num_input_tokens_seen": 100390712, + "step": 6239 + }, + { + "epoch": 0.43710105335049515, + "grad_norm": 3.7790653705596924, + "learning_rate": 5.633092819614712e-05, + "loss": 0.9953, + "num_input_tokens_seen": 100406936, + "step": 6240 + }, + { + "epoch": 0.4371711015962244, + "grad_norm": 3.7649457454681396, + "learning_rate": 5.63239299474606e-05, + "loss": 1.0837, + "num_input_tokens_seen": 100423320, + "step": 6241 + }, + { + "epoch": 0.43724114984195367, + "grad_norm": 5.27927827835083, + "learning_rate": 5.631693169877408e-05, + "loss": 1.1157, + "num_input_tokens_seen": 100439704, + "step": 6242 + }, + { + "epoch": 0.4373111980876829, + "grad_norm": 4.266254901885986, + "learning_rate": 5.630993345008757e-05, + "loss": 1.1607, + "num_input_tokens_seen": 100455008, + "step": 6243 + }, + { + "epoch": 0.43738124633341213, + "grad_norm": 4.195004940032959, + "learning_rate": 5.6302935201401065e-05, + "loss": 1.0163, + "num_input_tokens_seen": 100471392, + "step": 6244 + }, + { + "epoch": 0.4374512945791414, + "grad_norm": 4.85727596282959, + "learning_rate": 5.629593695271455e-05, + "loss": 1.0097, + "num_input_tokens_seen": 100486832, + "step": 6245 + }, + { + "epoch": 0.43752134282487065, + "grad_norm": 3.865466594696045, + "learning_rate": 5.628893870402803e-05, + "loss": 1.1528, + "num_input_tokens_seen": 100502496, + "step": 6246 + }, + { + "epoch": 0.43759139107059986, + "grad_norm": 4.107895851135254, + "learning_rate": 5.628194045534152e-05, + "loss": 1.0452, + "num_input_tokens_seen": 100518816, + "step": 6247 + }, + { + "epoch": 0.4376614393163291, + "grad_norm": 5.402096271514893, + "learning_rate": 5.627494220665499e-05, + "loss": 1.0368, + "num_input_tokens_seen": 100535200, + "step": 6248 + }, + { + "epoch": 0.4377314875620584, + "grad_norm": 4.255467414855957, + "learning_rate": 5.6267943957968474e-05, + "loss": 1.268, + "num_input_tokens_seen": 100551456, + "step": 6249 + }, + { + "epoch": 0.43780153580778763, + "grad_norm": 3.4338836669921875, + "learning_rate": 5.626094570928196e-05, + "loss": 1.0149, + "num_input_tokens_seen": 100567840, + "step": 6250 + }, + { + "epoch": 0.43787158405351684, + "grad_norm": 4.445374488830566, + "learning_rate": 5.6253947460595445e-05, + "loss": 1.2527, + "num_input_tokens_seen": 100583848, + "step": 6251 + }, + { + "epoch": 0.4379416322992461, + "grad_norm": 4.0756072998046875, + "learning_rate": 5.6246949211908927e-05, + "loss": 0.8848, + "num_input_tokens_seen": 100600000, + "step": 6252 + }, + { + "epoch": 0.43801168054497536, + "grad_norm": 3.910945177078247, + "learning_rate": 5.623995096322242e-05, + "loss": 0.9661, + "num_input_tokens_seen": 100615424, + "step": 6253 + }, + { + "epoch": 0.4380817287907046, + "grad_norm": 3.878586769104004, + "learning_rate": 5.623295271453591e-05, + "loss": 1.0631, + "num_input_tokens_seen": 100631808, + "step": 6254 + }, + { + "epoch": 0.4381517770364338, + "grad_norm": 4.295658111572266, + "learning_rate": 5.622595446584939e-05, + "loss": 1.2368, + "num_input_tokens_seen": 100647344, + "step": 6255 + }, + { + "epoch": 0.4382218252821631, + "grad_norm": 3.88688063621521, + "learning_rate": 5.621895621716287e-05, + "loss": 1.0371, + "num_input_tokens_seen": 100663464, + "step": 6256 + }, + { + "epoch": 0.43829187352789234, + "grad_norm": 3.6060731410980225, + "learning_rate": 5.621195796847637e-05, + "loss": 1.1189, + "num_input_tokens_seen": 100679848, + "step": 6257 + }, + { + "epoch": 0.4383619217736216, + "grad_norm": 4.274289608001709, + "learning_rate": 5.620495971978984e-05, + "loss": 0.9809, + "num_input_tokens_seen": 100695760, + "step": 6258 + }, + { + "epoch": 0.4384319700193508, + "grad_norm": 4.854022979736328, + "learning_rate": 5.619796147110332e-05, + "loss": 0.8043, + "num_input_tokens_seen": 100711144, + "step": 6259 + }, + { + "epoch": 0.43850201826508006, + "grad_norm": 3.9589812755584717, + "learning_rate": 5.6190963222416815e-05, + "loss": 1.202, + "num_input_tokens_seen": 100727088, + "step": 6260 + }, + { + "epoch": 0.4385720665108093, + "grad_norm": 5.07575798034668, + "learning_rate": 5.618396497373031e-05, + "loss": 1.0708, + "num_input_tokens_seen": 100743232, + "step": 6261 + }, + { + "epoch": 0.4386421147565386, + "grad_norm": 3.557736396789551, + "learning_rate": 5.617696672504379e-05, + "loss": 1.085, + "num_input_tokens_seen": 100759616, + "step": 6262 + }, + { + "epoch": 0.4387121630022678, + "grad_norm": 4.200889587402344, + "learning_rate": 5.6169968476357274e-05, + "loss": 0.9893, + "num_input_tokens_seen": 100775176, + "step": 6263 + }, + { + "epoch": 0.43878221124799704, + "grad_norm": 4.214064121246338, + "learning_rate": 5.616297022767076e-05, + "loss": 1.2024, + "num_input_tokens_seen": 100791232, + "step": 6264 + }, + { + "epoch": 0.4388522594937263, + "grad_norm": 5.372243881225586, + "learning_rate": 5.6155971978984244e-05, + "loss": 1.0551, + "num_input_tokens_seen": 100807496, + "step": 6265 + }, + { + "epoch": 0.43892230773945556, + "grad_norm": 4.754215717315674, + "learning_rate": 5.614897373029771e-05, + "loss": 1.1895, + "num_input_tokens_seen": 100823880, + "step": 6266 + }, + { + "epoch": 0.4389923559851848, + "grad_norm": 3.3892760276794434, + "learning_rate": 5.614197548161121e-05, + "loss": 1.0086, + "num_input_tokens_seen": 100839928, + "step": 6267 + }, + { + "epoch": 0.439062404230914, + "grad_norm": 4.554326057434082, + "learning_rate": 5.61349772329247e-05, + "loss": 0.993, + "num_input_tokens_seen": 100856312, + "step": 6268 + }, + { + "epoch": 0.4391324524766433, + "grad_norm": 4.118383407592773, + "learning_rate": 5.6127978984238185e-05, + "loss": 0.7906, + "num_input_tokens_seen": 100872696, + "step": 6269 + }, + { + "epoch": 0.43920250072237255, + "grad_norm": 4.403461456298828, + "learning_rate": 5.612098073555167e-05, + "loss": 1.1391, + "num_input_tokens_seen": 100888808, + "step": 6270 + }, + { + "epoch": 0.4392725489681018, + "grad_norm": 3.841547966003418, + "learning_rate": 5.6113982486865156e-05, + "loss": 1.0572, + "num_input_tokens_seen": 100905192, + "step": 6271 + }, + { + "epoch": 0.439342597213831, + "grad_norm": 4.147423267364502, + "learning_rate": 5.610698423817864e-05, + "loss": 1.1632, + "num_input_tokens_seen": 100920552, + "step": 6272 + }, + { + "epoch": 0.43941264545956027, + "grad_norm": 4.717578887939453, + "learning_rate": 5.609998598949212e-05, + "loss": 1.1097, + "num_input_tokens_seen": 100936656, + "step": 6273 + }, + { + "epoch": 0.43948269370528953, + "grad_norm": 5.503146171569824, + "learning_rate": 5.6092987740805615e-05, + "loss": 1.0768, + "num_input_tokens_seen": 100952952, + "step": 6274 + }, + { + "epoch": 0.4395527419510188, + "grad_norm": 3.6871585845947266, + "learning_rate": 5.6085989492119096e-05, + "loss": 0.9386, + "num_input_tokens_seen": 100969152, + "step": 6275 + }, + { + "epoch": 0.439622790196748, + "grad_norm": 3.751429796218872, + "learning_rate": 5.6078991243432565e-05, + "loss": 1.0478, + "num_input_tokens_seen": 100985520, + "step": 6276 + }, + { + "epoch": 0.43969283844247725, + "grad_norm": 4.053867340087891, + "learning_rate": 5.607199299474606e-05, + "loss": 1.1341, + "num_input_tokens_seen": 101001336, + "step": 6277 + }, + { + "epoch": 0.4397628866882065, + "grad_norm": 3.786154270172119, + "learning_rate": 5.606499474605955e-05, + "loss": 1.077, + "num_input_tokens_seen": 101017240, + "step": 6278 + }, + { + "epoch": 0.43983293493393577, + "grad_norm": 3.516772747039795, + "learning_rate": 5.605799649737303e-05, + "loss": 0.9854, + "num_input_tokens_seen": 101033264, + "step": 6279 + }, + { + "epoch": 0.439902983179665, + "grad_norm": 4.568872928619385, + "learning_rate": 5.605099824868651e-05, + "loss": 1.1444, + "num_input_tokens_seen": 101049648, + "step": 6280 + }, + { + "epoch": 0.43997303142539423, + "grad_norm": 4.430622577667236, + "learning_rate": 5.604400000000001e-05, + "loss": 1.0675, + "num_input_tokens_seen": 101066032, + "step": 6281 + }, + { + "epoch": 0.4400430796711235, + "grad_norm": 5.061071872711182, + "learning_rate": 5.603700175131349e-05, + "loss": 1.0891, + "num_input_tokens_seen": 101082416, + "step": 6282 + }, + { + "epoch": 0.44011312791685275, + "grad_norm": 3.696657180786133, + "learning_rate": 5.603000350262696e-05, + "loss": 0.9781, + "num_input_tokens_seen": 101098800, + "step": 6283 + }, + { + "epoch": 0.44018317616258196, + "grad_norm": 4.1430840492248535, + "learning_rate": 5.6023005253940467e-05, + "loss": 1.1299, + "num_input_tokens_seen": 101114672, + "step": 6284 + }, + { + "epoch": 0.4402532244083112, + "grad_norm": 4.862906455993652, + "learning_rate": 5.601600700525395e-05, + "loss": 0.8972, + "num_input_tokens_seen": 101130632, + "step": 6285 + }, + { + "epoch": 0.4403232726540405, + "grad_norm": 4.017249584197998, + "learning_rate": 5.600900875656743e-05, + "loss": 0.9911, + "num_input_tokens_seen": 101145920, + "step": 6286 + }, + { + "epoch": 0.44039332089976974, + "grad_norm": 4.797904014587402, + "learning_rate": 5.600201050788091e-05, + "loss": 1.106, + "num_input_tokens_seen": 101162296, + "step": 6287 + }, + { + "epoch": 0.44046336914549894, + "grad_norm": 3.685084342956543, + "learning_rate": 5.59950122591944e-05, + "loss": 1.0434, + "num_input_tokens_seen": 101178048, + "step": 6288 + }, + { + "epoch": 0.4405334173912282, + "grad_norm": 4.259701728820801, + "learning_rate": 5.598801401050788e-05, + "loss": 1.0203, + "num_input_tokens_seen": 101194080, + "step": 6289 + }, + { + "epoch": 0.44060346563695746, + "grad_norm": 3.961292266845703, + "learning_rate": 5.598101576182138e-05, + "loss": 0.9682, + "num_input_tokens_seen": 101209000, + "step": 6290 + }, + { + "epoch": 0.4406735138826867, + "grad_norm": 3.863640308380127, + "learning_rate": 5.597401751313486e-05, + "loss": 1.0203, + "num_input_tokens_seen": 101224184, + "step": 6291 + }, + { + "epoch": 0.4407435621284159, + "grad_norm": 6.002960681915283, + "learning_rate": 5.596701926444834e-05, + "loss": 1.3855, + "num_input_tokens_seen": 101240568, + "step": 6292 + }, + { + "epoch": 0.4408136103741452, + "grad_norm": 3.870892286300659, + "learning_rate": 5.596002101576182e-05, + "loss": 1.0754, + "num_input_tokens_seen": 101256952, + "step": 6293 + }, + { + "epoch": 0.44088365861987444, + "grad_norm": 3.654907703399658, + "learning_rate": 5.5953022767075305e-05, + "loss": 1.1244, + "num_input_tokens_seen": 101273192, + "step": 6294 + }, + { + "epoch": 0.4409537068656037, + "grad_norm": 3.2243661880493164, + "learning_rate": 5.5946024518388794e-05, + "loss": 0.9084, + "num_input_tokens_seen": 101289488, + "step": 6295 + }, + { + "epoch": 0.4410237551113329, + "grad_norm": 3.947880983352661, + "learning_rate": 5.5939026269702275e-05, + "loss": 1.1628, + "num_input_tokens_seen": 101305064, + "step": 6296 + }, + { + "epoch": 0.44109380335706216, + "grad_norm": 3.546065092086792, + "learning_rate": 5.593202802101576e-05, + "loss": 0.9669, + "num_input_tokens_seen": 101321448, + "step": 6297 + }, + { + "epoch": 0.4411638516027914, + "grad_norm": 4.489794731140137, + "learning_rate": 5.592502977232925e-05, + "loss": 1.1958, + "num_input_tokens_seen": 101337832, + "step": 6298 + }, + { + "epoch": 0.4412338998485207, + "grad_norm": 3.9517438411712646, + "learning_rate": 5.5918031523642734e-05, + "loss": 1.1256, + "num_input_tokens_seen": 101354216, + "step": 6299 + }, + { + "epoch": 0.4413039480942499, + "grad_norm": 4.599244594573975, + "learning_rate": 5.591103327495622e-05, + "loss": 0.9712, + "num_input_tokens_seen": 101370088, + "step": 6300 + }, + { + "epoch": 0.44137399633997915, + "grad_norm": 3.753528356552124, + "learning_rate": 5.5904035026269705e-05, + "loss": 0.9998, + "num_input_tokens_seen": 101385432, + "step": 6301 + }, + { + "epoch": 0.4414440445857084, + "grad_norm": 4.569333553314209, + "learning_rate": 5.589703677758319e-05, + "loss": 1.1676, + "num_input_tokens_seen": 101401816, + "step": 6302 + }, + { + "epoch": 0.44151409283143767, + "grad_norm": 4.010447978973389, + "learning_rate": 5.589003852889667e-05, + "loss": 1.0655, + "num_input_tokens_seen": 101417272, + "step": 6303 + }, + { + "epoch": 0.4415841410771669, + "grad_norm": 5.169422626495361, + "learning_rate": 5.588304028021015e-05, + "loss": 1.3555, + "num_input_tokens_seen": 101433656, + "step": 6304 + }, + { + "epoch": 0.44165418932289613, + "grad_norm": 4.6301069259643555, + "learning_rate": 5.587604203152366e-05, + "loss": 0.929, + "num_input_tokens_seen": 101450040, + "step": 6305 + }, + { + "epoch": 0.4417242375686254, + "grad_norm": 4.814012050628662, + "learning_rate": 5.586904378283713e-05, + "loss": 1.102, + "num_input_tokens_seen": 101466320, + "step": 6306 + }, + { + "epoch": 0.44179428581435465, + "grad_norm": 4.340104579925537, + "learning_rate": 5.586204553415062e-05, + "loss": 1.1179, + "num_input_tokens_seen": 101482592, + "step": 6307 + }, + { + "epoch": 0.4418643340600839, + "grad_norm": 3.807495355606079, + "learning_rate": 5.5855047285464105e-05, + "loss": 1.1407, + "num_input_tokens_seen": 101498920, + "step": 6308 + }, + { + "epoch": 0.4419343823058131, + "grad_norm": 3.97273325920105, + "learning_rate": 5.5848049036777586e-05, + "loss": 1.1377, + "num_input_tokens_seen": 101515304, + "step": 6309 + }, + { + "epoch": 0.44200443055154237, + "grad_norm": 6.926362037658691, + "learning_rate": 5.584105078809107e-05, + "loss": 0.9045, + "num_input_tokens_seen": 101531688, + "step": 6310 + }, + { + "epoch": 0.44207447879727163, + "grad_norm": 4.482272624969482, + "learning_rate": 5.583405253940457e-05, + "loss": 1.1431, + "num_input_tokens_seen": 101547912, + "step": 6311 + }, + { + "epoch": 0.4421445270430009, + "grad_norm": 3.726999044418335, + "learning_rate": 5.582705429071805e-05, + "loss": 1.0609, + "num_input_tokens_seen": 101563640, + "step": 6312 + }, + { + "epoch": 0.4422145752887301, + "grad_norm": 4.305807113647461, + "learning_rate": 5.582005604203152e-05, + "loss": 1.0612, + "num_input_tokens_seen": 101580024, + "step": 6313 + }, + { + "epoch": 0.44228462353445935, + "grad_norm": 5.402091979980469, + "learning_rate": 5.5813057793345016e-05, + "loss": 0.9018, + "num_input_tokens_seen": 101596408, + "step": 6314 + }, + { + "epoch": 0.4423546717801886, + "grad_norm": 3.658170700073242, + "learning_rate": 5.5806059544658504e-05, + "loss": 1.1726, + "num_input_tokens_seen": 101612792, + "step": 6315 + }, + { + "epoch": 0.4424247200259179, + "grad_norm": 3.91109561920166, + "learning_rate": 5.579906129597198e-05, + "loss": 0.9991, + "num_input_tokens_seen": 101628408, + "step": 6316 + }, + { + "epoch": 0.4424947682716471, + "grad_norm": 3.9523725509643555, + "learning_rate": 5.579206304728547e-05, + "loss": 1.1404, + "num_input_tokens_seen": 101644616, + "step": 6317 + }, + { + "epoch": 0.44256481651737634, + "grad_norm": 4.591569423675537, + "learning_rate": 5.578506479859895e-05, + "loss": 0.9778, + "num_input_tokens_seen": 101660536, + "step": 6318 + }, + { + "epoch": 0.4426348647631056, + "grad_norm": 3.7487003803253174, + "learning_rate": 5.577806654991243e-05, + "loss": 1.1305, + "num_input_tokens_seen": 101676920, + "step": 6319 + }, + { + "epoch": 0.44270491300883485, + "grad_norm": 4.111825942993164, + "learning_rate": 5.5771068301225913e-05, + "loss": 1.177, + "num_input_tokens_seen": 101692904, + "step": 6320 + }, + { + "epoch": 0.44277496125456406, + "grad_norm": 3.7022197246551514, + "learning_rate": 5.576407005253941e-05, + "loss": 1.0351, + "num_input_tokens_seen": 101709288, + "step": 6321 + }, + { + "epoch": 0.4428450095002933, + "grad_norm": 5.004938125610352, + "learning_rate": 5.57570718038529e-05, + "loss": 1.1042, + "num_input_tokens_seen": 101725176, + "step": 6322 + }, + { + "epoch": 0.4429150577460226, + "grad_norm": 3.728410005569458, + "learning_rate": 5.575007355516637e-05, + "loss": 0.9879, + "num_input_tokens_seen": 101741160, + "step": 6323 + }, + { + "epoch": 0.44298510599175184, + "grad_norm": 4.526604175567627, + "learning_rate": 5.574307530647986e-05, + "loss": 1.1465, + "num_input_tokens_seen": 101756848, + "step": 6324 + }, + { + "epoch": 0.44305515423748104, + "grad_norm": 3.4281585216522217, + "learning_rate": 5.5736077057793356e-05, + "loss": 1.0865, + "num_input_tokens_seen": 101773232, + "step": 6325 + }, + { + "epoch": 0.4431252024832103, + "grad_norm": 5.678319931030273, + "learning_rate": 5.5729078809106825e-05, + "loss": 1.0443, + "num_input_tokens_seen": 101789064, + "step": 6326 + }, + { + "epoch": 0.44319525072893956, + "grad_norm": 4.231290817260742, + "learning_rate": 5.572208056042032e-05, + "loss": 1.0336, + "num_input_tokens_seen": 101805360, + "step": 6327 + }, + { + "epoch": 0.4432652989746688, + "grad_norm": 3.9336435794830322, + "learning_rate": 5.5715082311733815e-05, + "loss": 1.0592, + "num_input_tokens_seen": 101821744, + "step": 6328 + }, + { + "epoch": 0.443335347220398, + "grad_norm": 3.6775193214416504, + "learning_rate": 5.57080840630473e-05, + "loss": 1.0551, + "num_input_tokens_seen": 101838128, + "step": 6329 + }, + { + "epoch": 0.4434053954661273, + "grad_norm": 3.7788445949554443, + "learning_rate": 5.5701085814360766e-05, + "loss": 0.9174, + "num_input_tokens_seen": 101854216, + "step": 6330 + }, + { + "epoch": 0.44347544371185654, + "grad_norm": 5.49542236328125, + "learning_rate": 5.569408756567426e-05, + "loss": 1.0043, + "num_input_tokens_seen": 101870600, + "step": 6331 + }, + { + "epoch": 0.4435454919575858, + "grad_norm": 4.486842155456543, + "learning_rate": 5.568708931698775e-05, + "loss": 1.2571, + "num_input_tokens_seen": 101886984, + "step": 6332 + }, + { + "epoch": 0.443615540203315, + "grad_norm": 4.949841499328613, + "learning_rate": 5.5680091068301225e-05, + "loss": 0.9723, + "num_input_tokens_seen": 101903368, + "step": 6333 + }, + { + "epoch": 0.44368558844904427, + "grad_norm": 3.6375255584716797, + "learning_rate": 5.567309281961471e-05, + "loss": 1.0938, + "num_input_tokens_seen": 101919568, + "step": 6334 + }, + { + "epoch": 0.4437556366947735, + "grad_norm": 4.649466037750244, + "learning_rate": 5.566609457092821e-05, + "loss": 1.0182, + "num_input_tokens_seen": 101935952, + "step": 6335 + }, + { + "epoch": 0.4438256849405028, + "grad_norm": 3.971482276916504, + "learning_rate": 5.565909632224169e-05, + "loss": 1.2222, + "num_input_tokens_seen": 101952336, + "step": 6336 + }, + { + "epoch": 0.44389573318623204, + "grad_norm": 4.605628967285156, + "learning_rate": 5.565209807355517e-05, + "loss": 1.1634, + "num_input_tokens_seen": 101967752, + "step": 6337 + }, + { + "epoch": 0.44396578143196125, + "grad_norm": 6.878963947296143, + "learning_rate": 5.564509982486866e-05, + "loss": 0.831, + "num_input_tokens_seen": 101982648, + "step": 6338 + }, + { + "epoch": 0.4440358296776905, + "grad_norm": 4.339694976806641, + "learning_rate": 5.563810157618214e-05, + "loss": 1.206, + "num_input_tokens_seen": 101998912, + "step": 6339 + }, + { + "epoch": 0.44410587792341977, + "grad_norm": 3.5509302616119385, + "learning_rate": 5.563110332749562e-05, + "loss": 0.969, + "num_input_tokens_seen": 102015296, + "step": 6340 + }, + { + "epoch": 0.444175926169149, + "grad_norm": 6.927268981933594, + "learning_rate": 5.5624105078809106e-05, + "loss": 1.0158, + "num_input_tokens_seen": 102031136, + "step": 6341 + }, + { + "epoch": 0.44424597441487823, + "grad_norm": 4.596194267272949, + "learning_rate": 5.56171068301226e-05, + "loss": 1.0602, + "num_input_tokens_seen": 102047016, + "step": 6342 + }, + { + "epoch": 0.4443160226606075, + "grad_norm": 3.8641550540924072, + "learning_rate": 5.561010858143607e-05, + "loss": 1.007, + "num_input_tokens_seen": 102062344, + "step": 6343 + }, + { + "epoch": 0.44438607090633675, + "grad_norm": 5.471240997314453, + "learning_rate": 5.5603110332749565e-05, + "loss": 0.9372, + "num_input_tokens_seen": 102078392, + "step": 6344 + }, + { + "epoch": 0.444456119152066, + "grad_norm": 3.971010208129883, + "learning_rate": 5.5596112084063054e-05, + "loss": 1.1335, + "num_input_tokens_seen": 102092872, + "step": 6345 + }, + { + "epoch": 0.4445261673977952, + "grad_norm": 4.037472724914551, + "learning_rate": 5.5589113835376536e-05, + "loss": 0.959, + "num_input_tokens_seen": 102109256, + "step": 6346 + }, + { + "epoch": 0.4445962156435245, + "grad_norm": 3.834984302520752, + "learning_rate": 5.558211558669002e-05, + "loss": 1.129, + "num_input_tokens_seen": 102125424, + "step": 6347 + }, + { + "epoch": 0.44466626388925373, + "grad_norm": 5.131717205047607, + "learning_rate": 5.55751173380035e-05, + "loss": 1.0966, + "num_input_tokens_seen": 102141808, + "step": 6348 + }, + { + "epoch": 0.444736312134983, + "grad_norm": 4.5308837890625, + "learning_rate": 5.5568119089316995e-05, + "loss": 1.1752, + "num_input_tokens_seen": 102158192, + "step": 6349 + }, + { + "epoch": 0.4448063603807122, + "grad_norm": 5.088570594787598, + "learning_rate": 5.556112084063046e-05, + "loss": 1.0215, + "num_input_tokens_seen": 102174576, + "step": 6350 + }, + { + "epoch": 0.44487640862644146, + "grad_norm": 3.588543176651001, + "learning_rate": 5.555412259194397e-05, + "loss": 1.0452, + "num_input_tokens_seen": 102190928, + "step": 6351 + }, + { + "epoch": 0.4449464568721707, + "grad_norm": 3.5698747634887695, + "learning_rate": 5.5547124343257454e-05, + "loss": 1.0472, + "num_input_tokens_seen": 102207072, + "step": 6352 + }, + { + "epoch": 0.4450165051179, + "grad_norm": 3.4842190742492676, + "learning_rate": 5.5540126094570935e-05, + "loss": 0.9213, + "num_input_tokens_seen": 102222664, + "step": 6353 + }, + { + "epoch": 0.4450865533636292, + "grad_norm": 3.742471218109131, + "learning_rate": 5.553312784588442e-05, + "loss": 1.1803, + "num_input_tokens_seen": 102239048, + "step": 6354 + }, + { + "epoch": 0.44515660160935844, + "grad_norm": 4.108808994293213, + "learning_rate": 5.552612959719792e-05, + "loss": 1.2055, + "num_input_tokens_seen": 102255432, + "step": 6355 + }, + { + "epoch": 0.4452266498550877, + "grad_norm": 5.498636722564697, + "learning_rate": 5.551913134851139e-05, + "loss": 1.0159, + "num_input_tokens_seen": 102270760, + "step": 6356 + }, + { + "epoch": 0.44529669810081696, + "grad_norm": 3.639392137527466, + "learning_rate": 5.551213309982487e-05, + "loss": 0.9927, + "num_input_tokens_seen": 102285560, + "step": 6357 + }, + { + "epoch": 0.44536674634654616, + "grad_norm": 4.534916400909424, + "learning_rate": 5.5505134851138365e-05, + "loss": 1.0912, + "num_input_tokens_seen": 102301456, + "step": 6358 + }, + { + "epoch": 0.4454367945922754, + "grad_norm": 3.961845636367798, + "learning_rate": 5.5498136602451847e-05, + "loss": 0.8966, + "num_input_tokens_seen": 102317840, + "step": 6359 + }, + { + "epoch": 0.4455068428380047, + "grad_norm": 3.712111473083496, + "learning_rate": 5.549113835376533e-05, + "loss": 1.0297, + "num_input_tokens_seen": 102334104, + "step": 6360 + }, + { + "epoch": 0.44557689108373394, + "grad_norm": 4.441688537597656, + "learning_rate": 5.548414010507882e-05, + "loss": 1.0116, + "num_input_tokens_seen": 102350488, + "step": 6361 + }, + { + "epoch": 0.44564693932946314, + "grad_norm": 5.881339073181152, + "learning_rate": 5.54771418563923e-05, + "loss": 0.8988, + "num_input_tokens_seen": 102366872, + "step": 6362 + }, + { + "epoch": 0.4457169875751924, + "grad_norm": 3.9736666679382324, + "learning_rate": 5.547014360770578e-05, + "loss": 1.1402, + "num_input_tokens_seen": 102383256, + "step": 6363 + }, + { + "epoch": 0.44578703582092166, + "grad_norm": 4.064074516296387, + "learning_rate": 5.546314535901926e-05, + "loss": 1.0196, + "num_input_tokens_seen": 102399024, + "step": 6364 + }, + { + "epoch": 0.4458570840666509, + "grad_norm": 4.238128662109375, + "learning_rate": 5.5456147110332765e-05, + "loss": 0.8919, + "num_input_tokens_seen": 102415408, + "step": 6365 + }, + { + "epoch": 0.4459271323123801, + "grad_norm": 3.8058905601501465, + "learning_rate": 5.544914886164624e-05, + "loss": 1.0117, + "num_input_tokens_seen": 102430616, + "step": 6366 + }, + { + "epoch": 0.4459971805581094, + "grad_norm": 5.049830436706543, + "learning_rate": 5.544215061295972e-05, + "loss": 1.0986, + "num_input_tokens_seen": 102446448, + "step": 6367 + }, + { + "epoch": 0.44606722880383864, + "grad_norm": 4.0549116134643555, + "learning_rate": 5.543515236427321e-05, + "loss": 1.1462, + "num_input_tokens_seen": 102462832, + "step": 6368 + }, + { + "epoch": 0.4461372770495679, + "grad_norm": 4.005105495452881, + "learning_rate": 5.542815411558669e-05, + "loss": 0.8846, + "num_input_tokens_seen": 102478688, + "step": 6369 + }, + { + "epoch": 0.44620732529529716, + "grad_norm": 4.298024654388428, + "learning_rate": 5.5421155866900174e-05, + "loss": 1.0249, + "num_input_tokens_seen": 102495072, + "step": 6370 + }, + { + "epoch": 0.44627737354102637, + "grad_norm": 4.816470623016357, + "learning_rate": 5.541415761821367e-05, + "loss": 1.0492, + "num_input_tokens_seen": 102511456, + "step": 6371 + }, + { + "epoch": 0.4463474217867556, + "grad_norm": 3.89819598197937, + "learning_rate": 5.5407159369527164e-05, + "loss": 0.9172, + "num_input_tokens_seen": 102527840, + "step": 6372 + }, + { + "epoch": 0.4464174700324849, + "grad_norm": 3.883650541305542, + "learning_rate": 5.540016112084063e-05, + "loss": 0.9475, + "num_input_tokens_seen": 102543008, + "step": 6373 + }, + { + "epoch": 0.44648751827821415, + "grad_norm": 3.8635551929473877, + "learning_rate": 5.5393162872154114e-05, + "loss": 1.2026, + "num_input_tokens_seen": 102558600, + "step": 6374 + }, + { + "epoch": 0.44655756652394335, + "grad_norm": 4.465150356292725, + "learning_rate": 5.538616462346761e-05, + "loss": 1.0031, + "num_input_tokens_seen": 102574984, + "step": 6375 + }, + { + "epoch": 0.4466276147696726, + "grad_norm": 3.8807246685028076, + "learning_rate": 5.537916637478109e-05, + "loss": 1.2355, + "num_input_tokens_seen": 102591368, + "step": 6376 + }, + { + "epoch": 0.44669766301540187, + "grad_norm": 4.526896953582764, + "learning_rate": 5.5372168126094573e-05, + "loss": 1.4136, + "num_input_tokens_seen": 102607520, + "step": 6377 + }, + { + "epoch": 0.44676771126113113, + "grad_norm": 4.905179023742676, + "learning_rate": 5.536516987740806e-05, + "loss": 1.1221, + "num_input_tokens_seen": 102622928, + "step": 6378 + }, + { + "epoch": 0.44683775950686033, + "grad_norm": 3.658268690109253, + "learning_rate": 5.535817162872156e-05, + "loss": 0.9257, + "num_input_tokens_seen": 102639312, + "step": 6379 + }, + { + "epoch": 0.4469078077525896, + "grad_norm": 4.125054359436035, + "learning_rate": 5.5351173380035026e-05, + "loss": 1.0569, + "num_input_tokens_seen": 102655104, + "step": 6380 + }, + { + "epoch": 0.44697785599831885, + "grad_norm": 3.6429343223571777, + "learning_rate": 5.534417513134852e-05, + "loss": 0.9446, + "num_input_tokens_seen": 102671488, + "step": 6381 + }, + { + "epoch": 0.4470479042440481, + "grad_norm": 4.249630928039551, + "learning_rate": 5.533717688266201e-05, + "loss": 1.248, + "num_input_tokens_seen": 102687872, + "step": 6382 + }, + { + "epoch": 0.4471179524897773, + "grad_norm": 4.6079277992248535, + "learning_rate": 5.5330178633975485e-05, + "loss": 1.1774, + "num_input_tokens_seen": 102704256, + "step": 6383 + }, + { + "epoch": 0.4471880007355066, + "grad_norm": 4.406512260437012, + "learning_rate": 5.5323180385288966e-05, + "loss": 1.0111, + "num_input_tokens_seen": 102720640, + "step": 6384 + }, + { + "epoch": 0.44725804898123583, + "grad_norm": 5.140946865081787, + "learning_rate": 5.5316182136602455e-05, + "loss": 1.0323, + "num_input_tokens_seen": 102735608, + "step": 6385 + }, + { + "epoch": 0.4473280972269651, + "grad_norm": 5.1392903327941895, + "learning_rate": 5.530918388791594e-05, + "loss": 1.1535, + "num_input_tokens_seen": 102751992, + "step": 6386 + }, + { + "epoch": 0.4473981454726943, + "grad_norm": 5.466961860656738, + "learning_rate": 5.530218563922942e-05, + "loss": 1.2679, + "num_input_tokens_seen": 102768376, + "step": 6387 + }, + { + "epoch": 0.44746819371842356, + "grad_norm": 3.7714486122131348, + "learning_rate": 5.5295187390542914e-05, + "loss": 0.9847, + "num_input_tokens_seen": 102784328, + "step": 6388 + }, + { + "epoch": 0.4475382419641528, + "grad_norm": 3.665609836578369, + "learning_rate": 5.52881891418564e-05, + "loss": 1.0716, + "num_input_tokens_seen": 102800032, + "step": 6389 + }, + { + "epoch": 0.4476082902098821, + "grad_norm": 6.100143909454346, + "learning_rate": 5.528119089316988e-05, + "loss": 0.8844, + "num_input_tokens_seen": 102816416, + "step": 6390 + }, + { + "epoch": 0.4476783384556113, + "grad_norm": 3.8393003940582275, + "learning_rate": 5.527419264448336e-05, + "loss": 1.1134, + "num_input_tokens_seen": 102832800, + "step": 6391 + }, + { + "epoch": 0.44774838670134054, + "grad_norm": 3.862710952758789, + "learning_rate": 5.526719439579686e-05, + "loss": 1.0571, + "num_input_tokens_seen": 102849128, + "step": 6392 + }, + { + "epoch": 0.4478184349470698, + "grad_norm": 4.032309055328369, + "learning_rate": 5.526019614711033e-05, + "loss": 1.0123, + "num_input_tokens_seen": 102865512, + "step": 6393 + }, + { + "epoch": 0.44788848319279906, + "grad_norm": 4.250918388366699, + "learning_rate": 5.525319789842381e-05, + "loss": 0.8542, + "num_input_tokens_seen": 102881896, + "step": 6394 + }, + { + "epoch": 0.44795853143852826, + "grad_norm": 3.8701565265655518, + "learning_rate": 5.524619964973731e-05, + "loss": 1.0307, + "num_input_tokens_seen": 102898192, + "step": 6395 + }, + { + "epoch": 0.4480285796842575, + "grad_norm": 4.2415571212768555, + "learning_rate": 5.52392014010508e-05, + "loss": 1.0586, + "num_input_tokens_seen": 102914192, + "step": 6396 + }, + { + "epoch": 0.4480986279299868, + "grad_norm": 3.937345027923584, + "learning_rate": 5.5232203152364284e-05, + "loss": 1.175, + "num_input_tokens_seen": 102929976, + "step": 6397 + }, + { + "epoch": 0.44816867617571604, + "grad_norm": 4.339337348937988, + "learning_rate": 5.5225204903677766e-05, + "loss": 1.2494, + "num_input_tokens_seen": 102946360, + "step": 6398 + }, + { + "epoch": 0.44823872442144524, + "grad_norm": 4.744752883911133, + "learning_rate": 5.5218206654991255e-05, + "loss": 1.1717, + "num_input_tokens_seen": 102962744, + "step": 6399 + }, + { + "epoch": 0.4483087726671745, + "grad_norm": 3.6590077877044678, + "learning_rate": 5.521120840630473e-05, + "loss": 1.014, + "num_input_tokens_seen": 102978456, + "step": 6400 + }, + { + "epoch": 0.4483087726671745, + "eval_loss": 1.1259907484054565, + "eval_runtime": 0.159, + "eval_samples_per_second": 6.289, + "eval_steps_per_second": 6.289, + "num_input_tokens_seen": 102978456, + "step": 6400 + }, + { + "epoch": 0.44837882091290376, + "grad_norm": 4.073358535766602, + "learning_rate": 5.5204210157618205e-05, + "loss": 1.1584, + "num_input_tokens_seen": 102994280, + "step": 6401 + }, + { + "epoch": 0.448448869158633, + "grad_norm": 6.3949480056762695, + "learning_rate": 5.51972119089317e-05, + "loss": 1.0338, + "num_input_tokens_seen": 103010664, + "step": 6402 + }, + { + "epoch": 0.4485189174043622, + "grad_norm": 4.09867525100708, + "learning_rate": 5.5190213660245195e-05, + "loss": 1.175, + "num_input_tokens_seen": 103027048, + "step": 6403 + }, + { + "epoch": 0.4485889656500915, + "grad_norm": 3.672407865524292, + "learning_rate": 5.518321541155868e-05, + "loss": 1.1726, + "num_input_tokens_seen": 103043432, + "step": 6404 + }, + { + "epoch": 0.44865901389582075, + "grad_norm": 3.5733370780944824, + "learning_rate": 5.517621716287216e-05, + "loss": 0.9414, + "num_input_tokens_seen": 103059816, + "step": 6405 + }, + { + "epoch": 0.44872906214155, + "grad_norm": 5.21142578125, + "learning_rate": 5.516921891418565e-05, + "loss": 1.1563, + "num_input_tokens_seen": 103076200, + "step": 6406 + }, + { + "epoch": 0.44879911038727927, + "grad_norm": 3.4936230182647705, + "learning_rate": 5.516222066549913e-05, + "loss": 0.9876, + "num_input_tokens_seen": 103092040, + "step": 6407 + }, + { + "epoch": 0.44886915863300847, + "grad_norm": 4.558346271514893, + "learning_rate": 5.515522241681261e-05, + "loss": 0.9841, + "num_input_tokens_seen": 103108424, + "step": 6408 + }, + { + "epoch": 0.44893920687873773, + "grad_norm": 5.485194206237793, + "learning_rate": 5.514822416812611e-05, + "loss": 1.0012, + "num_input_tokens_seen": 103123544, + "step": 6409 + }, + { + "epoch": 0.449009255124467, + "grad_norm": 4.365593433380127, + "learning_rate": 5.5141225919439575e-05, + "loss": 0.9441, + "num_input_tokens_seen": 103139928, + "step": 6410 + }, + { + "epoch": 0.44907930337019625, + "grad_norm": 6.034286022186279, + "learning_rate": 5.513422767075306e-05, + "loss": 1.1408, + "num_input_tokens_seen": 103154960, + "step": 6411 + }, + { + "epoch": 0.44914935161592545, + "grad_norm": 3.88476300239563, + "learning_rate": 5.512722942206655e-05, + "loss": 0.8513, + "num_input_tokens_seen": 103169984, + "step": 6412 + }, + { + "epoch": 0.4492193998616547, + "grad_norm": 3.760528087615967, + "learning_rate": 5.512023117338004e-05, + "loss": 1.07, + "num_input_tokens_seen": 103186296, + "step": 6413 + }, + { + "epoch": 0.44928944810738397, + "grad_norm": 3.779690980911255, + "learning_rate": 5.511323292469352e-05, + "loss": 0.9531, + "num_input_tokens_seen": 103202680, + "step": 6414 + }, + { + "epoch": 0.44935949635311323, + "grad_norm": 3.6536929607391357, + "learning_rate": 5.5106234676007004e-05, + "loss": 1.0908, + "num_input_tokens_seen": 103218656, + "step": 6415 + }, + { + "epoch": 0.44942954459884243, + "grad_norm": 3.9258713722229004, + "learning_rate": 5.50992364273205e-05, + "loss": 1.0175, + "num_input_tokens_seen": 103234872, + "step": 6416 + }, + { + "epoch": 0.4494995928445717, + "grad_norm": 4.860123634338379, + "learning_rate": 5.509223817863397e-05, + "loss": 1.1094, + "num_input_tokens_seen": 103251000, + "step": 6417 + }, + { + "epoch": 0.44956964109030095, + "grad_norm": 4.924446105957031, + "learning_rate": 5.508523992994745e-05, + "loss": 1.1265, + "num_input_tokens_seen": 103267296, + "step": 6418 + }, + { + "epoch": 0.4496396893360302, + "grad_norm": 4.334608554840088, + "learning_rate": 5.507824168126096e-05, + "loss": 0.9163, + "num_input_tokens_seen": 103283440, + "step": 6419 + }, + { + "epoch": 0.4497097375817594, + "grad_norm": 4.686522483825684, + "learning_rate": 5.507124343257444e-05, + "loss": 1.0374, + "num_input_tokens_seen": 103299760, + "step": 6420 + }, + { + "epoch": 0.4497797858274887, + "grad_norm": 4.797657012939453, + "learning_rate": 5.506424518388792e-05, + "loss": 1.1277, + "num_input_tokens_seen": 103316144, + "step": 6421 + }, + { + "epoch": 0.44984983407321794, + "grad_norm": 3.443018674850464, + "learning_rate": 5.5057246935201404e-05, + "loss": 0.943, + "num_input_tokens_seen": 103331712, + "step": 6422 + }, + { + "epoch": 0.4499198823189472, + "grad_norm": 4.118574619293213, + "learning_rate": 5.505024868651489e-05, + "loss": 1.0539, + "num_input_tokens_seen": 103348096, + "step": 6423 + }, + { + "epoch": 0.4499899305646764, + "grad_norm": 4.0539937019348145, + "learning_rate": 5.5043250437828375e-05, + "loss": 0.8437, + "num_input_tokens_seen": 103364480, + "step": 6424 + }, + { + "epoch": 0.45005997881040566, + "grad_norm": 4.269721031188965, + "learning_rate": 5.503625218914187e-05, + "loss": 1.0896, + "num_input_tokens_seen": 103380120, + "step": 6425 + }, + { + "epoch": 0.4501300270561349, + "grad_norm": 4.6834516525268555, + "learning_rate": 5.502925394045535e-05, + "loss": 1.1162, + "num_input_tokens_seen": 103396504, + "step": 6426 + }, + { + "epoch": 0.4502000753018642, + "grad_norm": 4.42267370223999, + "learning_rate": 5.5022255691768834e-05, + "loss": 1.0416, + "num_input_tokens_seen": 103412632, + "step": 6427 + }, + { + "epoch": 0.4502701235475934, + "grad_norm": 4.8119797706604, + "learning_rate": 5.5015257443082315e-05, + "loss": 1.2585, + "num_input_tokens_seen": 103428128, + "step": 6428 + }, + { + "epoch": 0.45034017179332264, + "grad_norm": 4.170595169067383, + "learning_rate": 5.50082591943958e-05, + "loss": 0.9985, + "num_input_tokens_seen": 103444088, + "step": 6429 + }, + { + "epoch": 0.4504102200390519, + "grad_norm": 3.7060906887054443, + "learning_rate": 5.5001260945709286e-05, + "loss": 1.0852, + "num_input_tokens_seen": 103460456, + "step": 6430 + }, + { + "epoch": 0.45048026828478116, + "grad_norm": 4.4231977462768555, + "learning_rate": 5.499426269702277e-05, + "loss": 1.043, + "num_input_tokens_seen": 103476840, + "step": 6431 + }, + { + "epoch": 0.45055031653051036, + "grad_norm": 4.086833477020264, + "learning_rate": 5.498726444833625e-05, + "loss": 1.2797, + "num_input_tokens_seen": 103492808, + "step": 6432 + }, + { + "epoch": 0.4506203647762396, + "grad_norm": 3.912932872772217, + "learning_rate": 5.4980266199649745e-05, + "loss": 0.9846, + "num_input_tokens_seen": 103508672, + "step": 6433 + }, + { + "epoch": 0.4506904130219689, + "grad_norm": 3.6088106632232666, + "learning_rate": 5.4973267950963227e-05, + "loss": 1.0097, + "num_input_tokens_seen": 103525056, + "step": 6434 + }, + { + "epoch": 0.45076046126769814, + "grad_norm": 4.725728511810303, + "learning_rate": 5.4966269702276715e-05, + "loss": 1.1345, + "num_input_tokens_seen": 103541440, + "step": 6435 + }, + { + "epoch": 0.45083050951342735, + "grad_norm": 6.745354175567627, + "learning_rate": 5.49592714535902e-05, + "loss": 0.9549, + "num_input_tokens_seen": 103556264, + "step": 6436 + }, + { + "epoch": 0.4509005577591566, + "grad_norm": 4.462937355041504, + "learning_rate": 5.495227320490368e-05, + "loss": 1.0289, + "num_input_tokens_seen": 103571576, + "step": 6437 + }, + { + "epoch": 0.45097060600488587, + "grad_norm": 4.77189826965332, + "learning_rate": 5.494527495621716e-05, + "loss": 1.2534, + "num_input_tokens_seen": 103587360, + "step": 6438 + }, + { + "epoch": 0.4510406542506151, + "grad_norm": 5.734838962554932, + "learning_rate": 5.493827670753064e-05, + "loss": 1.0698, + "num_input_tokens_seen": 103603744, + "step": 6439 + }, + { + "epoch": 0.4511107024963444, + "grad_norm": 4.221588134765625, + "learning_rate": 5.493127845884414e-05, + "loss": 1.1712, + "num_input_tokens_seen": 103620128, + "step": 6440 + }, + { + "epoch": 0.4511807507420736, + "grad_norm": 3.894184112548828, + "learning_rate": 5.492428021015762e-05, + "loss": 1.1204, + "num_input_tokens_seen": 103636512, + "step": 6441 + }, + { + "epoch": 0.45125079898780285, + "grad_norm": 4.208652496337891, + "learning_rate": 5.4917281961471115e-05, + "loss": 1.1809, + "num_input_tokens_seen": 103652824, + "step": 6442 + }, + { + "epoch": 0.4513208472335321, + "grad_norm": 3.4426159858703613, + "learning_rate": 5.49102837127846e-05, + "loss": 1.0433, + "num_input_tokens_seen": 103669000, + "step": 6443 + }, + { + "epoch": 0.45139089547926137, + "grad_norm": 3.628229856491089, + "learning_rate": 5.490328546409808e-05, + "loss": 0.918, + "num_input_tokens_seen": 103684792, + "step": 6444 + }, + { + "epoch": 0.45146094372499057, + "grad_norm": 6.008549690246582, + "learning_rate": 5.489628721541156e-05, + "loss": 1.2477, + "num_input_tokens_seen": 103701176, + "step": 6445 + }, + { + "epoch": 0.45153099197071983, + "grad_norm": 4.023336887359619, + "learning_rate": 5.488928896672506e-05, + "loss": 1.2522, + "num_input_tokens_seen": 103716488, + "step": 6446 + }, + { + "epoch": 0.4516010402164491, + "grad_norm": 3.6931705474853516, + "learning_rate": 5.488229071803853e-05, + "loss": 0.9123, + "num_input_tokens_seen": 103732560, + "step": 6447 + }, + { + "epoch": 0.45167108846217835, + "grad_norm": 3.7356324195861816, + "learning_rate": 5.487529246935201e-05, + "loss": 1.0913, + "num_input_tokens_seen": 103748512, + "step": 6448 + }, + { + "epoch": 0.45174113670790755, + "grad_norm": 5.176403045654297, + "learning_rate": 5.486829422066551e-05, + "loss": 1.0758, + "num_input_tokens_seen": 103764896, + "step": 6449 + }, + { + "epoch": 0.4518111849536368, + "grad_norm": 4.492616176605225, + "learning_rate": 5.486129597197899e-05, + "loss": 1.2558, + "num_input_tokens_seen": 103781280, + "step": 6450 + }, + { + "epoch": 0.4518812331993661, + "grad_norm": 4.058090686798096, + "learning_rate": 5.485429772329247e-05, + "loss": 1.1313, + "num_input_tokens_seen": 103797664, + "step": 6451 + }, + { + "epoch": 0.45195128144509533, + "grad_norm": 3.6828136444091797, + "learning_rate": 5.484729947460596e-05, + "loss": 1.0972, + "num_input_tokens_seen": 103813912, + "step": 6452 + }, + { + "epoch": 0.45202132969082454, + "grad_norm": 3.4555649757385254, + "learning_rate": 5.484030122591944e-05, + "loss": 1.0052, + "num_input_tokens_seen": 103830296, + "step": 6453 + }, + { + "epoch": 0.4520913779365538, + "grad_norm": 4.12479305267334, + "learning_rate": 5.4833302977232924e-05, + "loss": 0.919, + "num_input_tokens_seen": 103846272, + "step": 6454 + }, + { + "epoch": 0.45216142618228305, + "grad_norm": 4.1249237060546875, + "learning_rate": 5.4826304728546406e-05, + "loss": 0.9013, + "num_input_tokens_seen": 103862408, + "step": 6455 + }, + { + "epoch": 0.4522314744280123, + "grad_norm": 4.026651859283447, + "learning_rate": 5.481930647985989e-05, + "loss": 1.0143, + "num_input_tokens_seen": 103878592, + "step": 6456 + }, + { + "epoch": 0.4523015226737415, + "grad_norm": 4.157918453216553, + "learning_rate": 5.481230823117338e-05, + "loss": 1.0132, + "num_input_tokens_seen": 103894512, + "step": 6457 + }, + { + "epoch": 0.4523715709194708, + "grad_norm": 3.4476771354675293, + "learning_rate": 5.4805309982486865e-05, + "loss": 0.941, + "num_input_tokens_seen": 103910728, + "step": 6458 + }, + { + "epoch": 0.45244161916520004, + "grad_norm": 5.755035877227783, + "learning_rate": 5.479831173380035e-05, + "loss": 1.1735, + "num_input_tokens_seen": 103927112, + "step": 6459 + }, + { + "epoch": 0.4525116674109293, + "grad_norm": 3.774343967437744, + "learning_rate": 5.4791313485113835e-05, + "loss": 1.1775, + "num_input_tokens_seen": 103943496, + "step": 6460 + }, + { + "epoch": 0.4525817156566585, + "grad_norm": 3.8584420680999756, + "learning_rate": 5.478431523642732e-05, + "loss": 1.0433, + "num_input_tokens_seen": 103959880, + "step": 6461 + }, + { + "epoch": 0.45265176390238776, + "grad_norm": 3.545832872390747, + "learning_rate": 5.477731698774081e-05, + "loss": 1.0117, + "num_input_tokens_seen": 103976264, + "step": 6462 + }, + { + "epoch": 0.452721812148117, + "grad_norm": 4.018779277801514, + "learning_rate": 5.477031873905431e-05, + "loss": 1.0711, + "num_input_tokens_seen": 103991720, + "step": 6463 + }, + { + "epoch": 0.4527918603938463, + "grad_norm": 3.966514825820923, + "learning_rate": 5.476332049036779e-05, + "loss": 1.1632, + "num_input_tokens_seen": 104007488, + "step": 6464 + }, + { + "epoch": 0.4528619086395755, + "grad_norm": 3.8280792236328125, + "learning_rate": 5.475632224168126e-05, + "loss": 0.9702, + "num_input_tokens_seen": 104023096, + "step": 6465 + }, + { + "epoch": 0.45293195688530474, + "grad_norm": 6.540561676025391, + "learning_rate": 5.474932399299475e-05, + "loss": 1.2517, + "num_input_tokens_seen": 104038808, + "step": 6466 + }, + { + "epoch": 0.453002005131034, + "grad_norm": 4.703604221343994, + "learning_rate": 5.4742325744308235e-05, + "loss": 1.1562, + "num_input_tokens_seen": 104053944, + "step": 6467 + }, + { + "epoch": 0.45307205337676326, + "grad_norm": 3.950582504272461, + "learning_rate": 5.473532749562171e-05, + "loss": 0.9822, + "num_input_tokens_seen": 104070304, + "step": 6468 + }, + { + "epoch": 0.45314210162249247, + "grad_norm": 5.277374744415283, + "learning_rate": 5.4728329246935205e-05, + "loss": 1.1024, + "num_input_tokens_seen": 104086088, + "step": 6469 + }, + { + "epoch": 0.4532121498682217, + "grad_norm": 4.449152946472168, + "learning_rate": 5.47213309982487e-05, + "loss": 1.2031, + "num_input_tokens_seen": 104102472, + "step": 6470 + }, + { + "epoch": 0.453282198113951, + "grad_norm": 3.780017852783203, + "learning_rate": 5.471433274956218e-05, + "loss": 1.0398, + "num_input_tokens_seen": 104117552, + "step": 6471 + }, + { + "epoch": 0.45335224635968024, + "grad_norm": 3.502319574356079, + "learning_rate": 5.470733450087565e-05, + "loss": 1.0531, + "num_input_tokens_seen": 104133936, + "step": 6472 + }, + { + "epoch": 0.45342229460540945, + "grad_norm": 4.8112311363220215, + "learning_rate": 5.470033625218915e-05, + "loss": 1.3107, + "num_input_tokens_seen": 104150320, + "step": 6473 + }, + { + "epoch": 0.4534923428511387, + "grad_norm": 3.640571355819702, + "learning_rate": 5.4693338003502635e-05, + "loss": 0.8591, + "num_input_tokens_seen": 104165544, + "step": 6474 + }, + { + "epoch": 0.45356239109686797, + "grad_norm": 3.796278953552246, + "learning_rate": 5.468633975481611e-05, + "loss": 0.9476, + "num_input_tokens_seen": 104181928, + "step": 6475 + }, + { + "epoch": 0.4536324393425972, + "grad_norm": 4.749582767486572, + "learning_rate": 5.46793415061296e-05, + "loss": 1.0821, + "num_input_tokens_seen": 104197168, + "step": 6476 + }, + { + "epoch": 0.4537024875883265, + "grad_norm": 4.0797271728515625, + "learning_rate": 5.467234325744308e-05, + "loss": 0.9838, + "num_input_tokens_seen": 104213000, + "step": 6477 + }, + { + "epoch": 0.4537725358340557, + "grad_norm": 4.250664710998535, + "learning_rate": 5.466534500875656e-05, + "loss": 1.1786, + "num_input_tokens_seen": 104229136, + "step": 6478 + }, + { + "epoch": 0.45384258407978495, + "grad_norm": 3.8380579948425293, + "learning_rate": 5.465834676007006e-05, + "loss": 1.0362, + "num_input_tokens_seen": 104245520, + "step": 6479 + }, + { + "epoch": 0.4539126323255142, + "grad_norm": 3.7329459190368652, + "learning_rate": 5.4651348511383546e-05, + "loss": 1.0818, + "num_input_tokens_seen": 104261904, + "step": 6480 + }, + { + "epoch": 0.45398268057124347, + "grad_norm": 4.495264530181885, + "learning_rate": 5.464435026269703e-05, + "loss": 1.1216, + "num_input_tokens_seen": 104278288, + "step": 6481 + }, + { + "epoch": 0.4540527288169727, + "grad_norm": 3.7195420265197754, + "learning_rate": 5.46373520140105e-05, + "loss": 1.1697, + "num_input_tokens_seen": 104294192, + "step": 6482 + }, + { + "epoch": 0.45412277706270193, + "grad_norm": 5.255592346191406, + "learning_rate": 5.463035376532399e-05, + "loss": 1.2848, + "num_input_tokens_seen": 104308704, + "step": 6483 + }, + { + "epoch": 0.4541928253084312, + "grad_norm": 4.61810302734375, + "learning_rate": 5.462335551663749e-05, + "loss": 1.0923, + "num_input_tokens_seen": 104325088, + "step": 6484 + }, + { + "epoch": 0.45426287355416045, + "grad_norm": 7.175589561462402, + "learning_rate": 5.4616357267950955e-05, + "loss": 1.1434, + "num_input_tokens_seen": 104341224, + "step": 6485 + }, + { + "epoch": 0.45433292179988966, + "grad_norm": 3.756762742996216, + "learning_rate": 5.460935901926445e-05, + "loss": 0.9934, + "num_input_tokens_seen": 104356392, + "step": 6486 + }, + { + "epoch": 0.4544029700456189, + "grad_norm": 3.979435920715332, + "learning_rate": 5.4602360770577946e-05, + "loss": 0.777, + "num_input_tokens_seen": 104372776, + "step": 6487 + }, + { + "epoch": 0.4544730182913482, + "grad_norm": 3.761296272277832, + "learning_rate": 5.459536252189143e-05, + "loss": 1.113, + "num_input_tokens_seen": 104389160, + "step": 6488 + }, + { + "epoch": 0.45454306653707743, + "grad_norm": 5.02775239944458, + "learning_rate": 5.458836427320491e-05, + "loss": 0.9833, + "num_input_tokens_seen": 104404768, + "step": 6489 + }, + { + "epoch": 0.45461311478280664, + "grad_norm": 4.161303997039795, + "learning_rate": 5.45813660245184e-05, + "loss": 0.9746, + "num_input_tokens_seen": 104420152, + "step": 6490 + }, + { + "epoch": 0.4546831630285359, + "grad_norm": 3.7053780555725098, + "learning_rate": 5.457436777583188e-05, + "loss": 0.7889, + "num_input_tokens_seen": 104435512, + "step": 6491 + }, + { + "epoch": 0.45475321127426516, + "grad_norm": 4.103651523590088, + "learning_rate": 5.456736952714535e-05, + "loss": 1.0379, + "num_input_tokens_seen": 104451896, + "step": 6492 + }, + { + "epoch": 0.4548232595199944, + "grad_norm": 4.212504863739014, + "learning_rate": 5.456037127845884e-05, + "loss": 1.0417, + "num_input_tokens_seen": 104468280, + "step": 6493 + }, + { + "epoch": 0.4548933077657236, + "grad_norm": 6.549145221710205, + "learning_rate": 5.455337302977234e-05, + "loss": 1.0627, + "num_input_tokens_seen": 104484520, + "step": 6494 + }, + { + "epoch": 0.4549633560114529, + "grad_norm": 3.777740478515625, + "learning_rate": 5.454637478108582e-05, + "loss": 1.2379, + "num_input_tokens_seen": 104500536, + "step": 6495 + }, + { + "epoch": 0.45503340425718214, + "grad_norm": 3.827119827270508, + "learning_rate": 5.45393765323993e-05, + "loss": 0.9852, + "num_input_tokens_seen": 104516712, + "step": 6496 + }, + { + "epoch": 0.4551034525029114, + "grad_norm": 4.231398105621338, + "learning_rate": 5.453237828371279e-05, + "loss": 1.0009, + "num_input_tokens_seen": 104532792, + "step": 6497 + }, + { + "epoch": 0.4551735007486406, + "grad_norm": 5.237041473388672, + "learning_rate": 5.452538003502627e-05, + "loss": 1.0285, + "num_input_tokens_seen": 104549136, + "step": 6498 + }, + { + "epoch": 0.45524354899436986, + "grad_norm": 4.457448482513428, + "learning_rate": 5.451838178633975e-05, + "loss": 1.2198, + "num_input_tokens_seen": 104565320, + "step": 6499 + }, + { + "epoch": 0.4553135972400991, + "grad_norm": 3.7427215576171875, + "learning_rate": 5.451138353765325e-05, + "loss": 1.1668, + "num_input_tokens_seen": 104580744, + "step": 6500 + }, + { + "epoch": 0.4553836454858284, + "grad_norm": 4.094877243041992, + "learning_rate": 5.450438528896673e-05, + "loss": 1.1735, + "num_input_tokens_seen": 104596576, + "step": 6501 + }, + { + "epoch": 0.4554536937315576, + "grad_norm": 4.290172576904297, + "learning_rate": 5.4497387040280214e-05, + "loss": 1.2692, + "num_input_tokens_seen": 104612960, + "step": 6502 + }, + { + "epoch": 0.45552374197728684, + "grad_norm": 3.5206210613250732, + "learning_rate": 5.4490388791593695e-05, + "loss": 0.9754, + "num_input_tokens_seen": 104627664, + "step": 6503 + }, + { + "epoch": 0.4555937902230161, + "grad_norm": 3.7847232818603516, + "learning_rate": 5.4483390542907184e-05, + "loss": 1.1653, + "num_input_tokens_seen": 104644048, + "step": 6504 + }, + { + "epoch": 0.45566383846874536, + "grad_norm": 4.193985939025879, + "learning_rate": 5.4476392294220666e-05, + "loss": 1.1462, + "num_input_tokens_seen": 104659664, + "step": 6505 + }, + { + "epoch": 0.45573388671447457, + "grad_norm": 3.819429874420166, + "learning_rate": 5.446939404553415e-05, + "loss": 1.0596, + "num_input_tokens_seen": 104676048, + "step": 6506 + }, + { + "epoch": 0.4558039349602038, + "grad_norm": 3.7949306964874268, + "learning_rate": 5.446239579684764e-05, + "loss": 1.016, + "num_input_tokens_seen": 104692432, + "step": 6507 + }, + { + "epoch": 0.4558739832059331, + "grad_norm": 3.880740165710449, + "learning_rate": 5.4455397548161125e-05, + "loss": 1.2764, + "num_input_tokens_seen": 104708616, + "step": 6508 + }, + { + "epoch": 0.45594403145166235, + "grad_norm": 5.389898777008057, + "learning_rate": 5.444839929947459e-05, + "loss": 1.1917, + "num_input_tokens_seen": 104724984, + "step": 6509 + }, + { + "epoch": 0.4560140796973916, + "grad_norm": 5.053036689758301, + "learning_rate": 5.44414010507881e-05, + "loss": 1.1091, + "num_input_tokens_seen": 104741368, + "step": 6510 + }, + { + "epoch": 0.4560841279431208, + "grad_norm": 4.7108330726623535, + "learning_rate": 5.4434402802101584e-05, + "loss": 1.1051, + "num_input_tokens_seen": 104757752, + "step": 6511 + }, + { + "epoch": 0.45615417618885007, + "grad_norm": 3.8108251094818115, + "learning_rate": 5.4427404553415066e-05, + "loss": 0.9524, + "num_input_tokens_seen": 104774136, + "step": 6512 + }, + { + "epoch": 0.45622422443457933, + "grad_norm": 3.8631815910339355, + "learning_rate": 5.442040630472854e-05, + "loss": 1.1041, + "num_input_tokens_seen": 104790288, + "step": 6513 + }, + { + "epoch": 0.4562942726803086, + "grad_norm": 3.745565176010132, + "learning_rate": 5.441340805604205e-05, + "loss": 1.086, + "num_input_tokens_seen": 104806672, + "step": 6514 + }, + { + "epoch": 0.4563643209260378, + "grad_norm": 3.6682205200195312, + "learning_rate": 5.440640980735552e-05, + "loss": 1.0626, + "num_input_tokens_seen": 104822824, + "step": 6515 + }, + { + "epoch": 0.45643436917176705, + "grad_norm": 4.422383785247803, + "learning_rate": 5.439941155866901e-05, + "loss": 1.059, + "num_input_tokens_seen": 104839208, + "step": 6516 + }, + { + "epoch": 0.4565044174174963, + "grad_norm": 5.5291242599487305, + "learning_rate": 5.4392413309982495e-05, + "loss": 0.9887, + "num_input_tokens_seen": 104855296, + "step": 6517 + }, + { + "epoch": 0.45657446566322557, + "grad_norm": 7.490913391113281, + "learning_rate": 5.438541506129598e-05, + "loss": 1.2157, + "num_input_tokens_seen": 104871440, + "step": 6518 + }, + { + "epoch": 0.4566445139089548, + "grad_norm": 5.1885528564453125, + "learning_rate": 5.437841681260946e-05, + "loss": 1.1799, + "num_input_tokens_seen": 104887824, + "step": 6519 + }, + { + "epoch": 0.45671456215468403, + "grad_norm": 4.4618096351623535, + "learning_rate": 5.437141856392295e-05, + "loss": 1.34, + "num_input_tokens_seen": 104904208, + "step": 6520 + }, + { + "epoch": 0.4567846104004133, + "grad_norm": 3.8809101581573486, + "learning_rate": 5.436442031523643e-05, + "loss": 1.0538, + "num_input_tokens_seen": 104920592, + "step": 6521 + }, + { + "epoch": 0.45685465864614255, + "grad_norm": 3.429588794708252, + "learning_rate": 5.435742206654991e-05, + "loss": 0.975, + "num_input_tokens_seen": 104936976, + "step": 6522 + }, + { + "epoch": 0.45692470689187176, + "grad_norm": 3.714005947113037, + "learning_rate": 5.435042381786339e-05, + "loss": 0.9339, + "num_input_tokens_seen": 104953136, + "step": 6523 + }, + { + "epoch": 0.456994755137601, + "grad_norm": 4.082497596740723, + "learning_rate": 5.4343425569176895e-05, + "loss": 1.1525, + "num_input_tokens_seen": 104969520, + "step": 6524 + }, + { + "epoch": 0.4570648033833303, + "grad_norm": 5.983520030975342, + "learning_rate": 5.433642732049037e-05, + "loss": 1.1925, + "num_input_tokens_seen": 104984096, + "step": 6525 + }, + { + "epoch": 0.45713485162905954, + "grad_norm": 4.282527446746826, + "learning_rate": 5.432942907180385e-05, + "loss": 1.0791, + "num_input_tokens_seen": 105000120, + "step": 6526 + }, + { + "epoch": 0.45720489987478874, + "grad_norm": 4.0138726234436035, + "learning_rate": 5.432243082311734e-05, + "loss": 1.212, + "num_input_tokens_seen": 105016504, + "step": 6527 + }, + { + "epoch": 0.457274948120518, + "grad_norm": 7.047135829925537, + "learning_rate": 5.431543257443082e-05, + "loss": 1.1071, + "num_input_tokens_seen": 105031360, + "step": 6528 + }, + { + "epoch": 0.45734499636624726, + "grad_norm": 4.966803550720215, + "learning_rate": 5.4308434325744304e-05, + "loss": 1.084, + "num_input_tokens_seen": 105047744, + "step": 6529 + }, + { + "epoch": 0.4574150446119765, + "grad_norm": 3.5288639068603516, + "learning_rate": 5.43014360770578e-05, + "loss": 1.0007, + "num_input_tokens_seen": 105064128, + "step": 6530 + }, + { + "epoch": 0.4574850928577057, + "grad_norm": 5.45017147064209, + "learning_rate": 5.4294437828371295e-05, + "loss": 1.0761, + "num_input_tokens_seen": 105080512, + "step": 6531 + }, + { + "epoch": 0.457555141103435, + "grad_norm": 4.392576694488525, + "learning_rate": 5.428743957968476e-05, + "loss": 1.1192, + "num_input_tokens_seen": 105096896, + "step": 6532 + }, + { + "epoch": 0.45762518934916424, + "grad_norm": 3.4714255332946777, + "learning_rate": 5.428044133099826e-05, + "loss": 0.94, + "num_input_tokens_seen": 105113280, + "step": 6533 + }, + { + "epoch": 0.4576952375948935, + "grad_norm": 4.999673366546631, + "learning_rate": 5.427344308231175e-05, + "loss": 0.8589, + "num_input_tokens_seen": 105128640, + "step": 6534 + }, + { + "epoch": 0.4577652858406227, + "grad_norm": 3.9281249046325684, + "learning_rate": 5.426644483362522e-05, + "loss": 1.2802, + "num_input_tokens_seen": 105144176, + "step": 6535 + }, + { + "epoch": 0.45783533408635196, + "grad_norm": 4.223507881164551, + "learning_rate": 5.42594465849387e-05, + "loss": 0.9767, + "num_input_tokens_seen": 105160560, + "step": 6536 + }, + { + "epoch": 0.4579053823320812, + "grad_norm": 3.7774858474731445, + "learning_rate": 5.425244833625219e-05, + "loss": 0.8401, + "num_input_tokens_seen": 105176696, + "step": 6537 + }, + { + "epoch": 0.4579754305778105, + "grad_norm": 3.666398048400879, + "learning_rate": 5.424545008756569e-05, + "loss": 1.0199, + "num_input_tokens_seen": 105192376, + "step": 6538 + }, + { + "epoch": 0.4580454788235397, + "grad_norm": 4.442626476287842, + "learning_rate": 5.4238451838879156e-05, + "loss": 1.2166, + "num_input_tokens_seen": 105208192, + "step": 6539 + }, + { + "epoch": 0.45811552706926895, + "grad_norm": 3.8626255989074707, + "learning_rate": 5.423145359019265e-05, + "loss": 1.2679, + "num_input_tokens_seen": 105224576, + "step": 6540 + }, + { + "epoch": 0.4581855753149982, + "grad_norm": 3.713498830795288, + "learning_rate": 5.422445534150614e-05, + "loss": 1.0785, + "num_input_tokens_seen": 105240960, + "step": 6541 + }, + { + "epoch": 0.45825562356072747, + "grad_norm": 5.06941032409668, + "learning_rate": 5.4217457092819615e-05, + "loss": 1.1006, + "num_input_tokens_seen": 105257344, + "step": 6542 + }, + { + "epoch": 0.4583256718064567, + "grad_norm": 3.6487746238708496, + "learning_rate": 5.4210458844133103e-05, + "loss": 0.9838, + "num_input_tokens_seen": 105273336, + "step": 6543 + }, + { + "epoch": 0.45839572005218593, + "grad_norm": 3.70211124420166, + "learning_rate": 5.42034605954466e-05, + "loss": 1.0334, + "num_input_tokens_seen": 105289720, + "step": 6544 + }, + { + "epoch": 0.4584657682979152, + "grad_norm": 5.169928073883057, + "learning_rate": 5.419646234676007e-05, + "loss": 0.9696, + "num_input_tokens_seen": 105306104, + "step": 6545 + }, + { + "epoch": 0.45853581654364445, + "grad_norm": 4.101007461547852, + "learning_rate": 5.418946409807355e-05, + "loss": 1.1545, + "num_input_tokens_seen": 105322488, + "step": 6546 + }, + { + "epoch": 0.4586058647893737, + "grad_norm": 4.077839374542236, + "learning_rate": 5.4182465849387044e-05, + "loss": 0.9885, + "num_input_tokens_seen": 105338872, + "step": 6547 + }, + { + "epoch": 0.4586759130351029, + "grad_norm": 4.46600341796875, + "learning_rate": 5.417546760070053e-05, + "loss": 1.0451, + "num_input_tokens_seen": 105355256, + "step": 6548 + }, + { + "epoch": 0.45874596128083217, + "grad_norm": 3.765453577041626, + "learning_rate": 5.416846935201401e-05, + "loss": 0.9365, + "num_input_tokens_seen": 105370928, + "step": 6549 + }, + { + "epoch": 0.45881600952656143, + "grad_norm": 3.913649559020996, + "learning_rate": 5.4161471103327496e-05, + "loss": 1.0764, + "num_input_tokens_seen": 105387312, + "step": 6550 + }, + { + "epoch": 0.4588860577722907, + "grad_norm": 5.323554992675781, + "learning_rate": 5.415447285464099e-05, + "loss": 1.0202, + "num_input_tokens_seen": 105403408, + "step": 6551 + }, + { + "epoch": 0.4589561060180199, + "grad_norm": 3.8482306003570557, + "learning_rate": 5.414747460595446e-05, + "loss": 1.011, + "num_input_tokens_seen": 105419792, + "step": 6552 + }, + { + "epoch": 0.45902615426374915, + "grad_norm": 4.369050025939941, + "learning_rate": 5.414047635726794e-05, + "loss": 0.9605, + "num_input_tokens_seen": 105435240, + "step": 6553 + }, + { + "epoch": 0.4590962025094784, + "grad_norm": 3.4255287647247314, + "learning_rate": 5.413347810858145e-05, + "loss": 0.9871, + "num_input_tokens_seen": 105451568, + "step": 6554 + }, + { + "epoch": 0.45916625075520767, + "grad_norm": 4.246303081512451, + "learning_rate": 5.412647985989493e-05, + "loss": 1.0404, + "num_input_tokens_seen": 105467952, + "step": 6555 + }, + { + "epoch": 0.4592362990009369, + "grad_norm": 3.785661220550537, + "learning_rate": 5.4119481611208414e-05, + "loss": 0.9251, + "num_input_tokens_seen": 105484336, + "step": 6556 + }, + { + "epoch": 0.45930634724666614, + "grad_norm": 3.661653757095337, + "learning_rate": 5.4112483362521896e-05, + "loss": 0.9834, + "num_input_tokens_seen": 105500720, + "step": 6557 + }, + { + "epoch": 0.4593763954923954, + "grad_norm": 4.362829685211182, + "learning_rate": 5.4105485113835385e-05, + "loss": 1.084, + "num_input_tokens_seen": 105516488, + "step": 6558 + }, + { + "epoch": 0.45944644373812465, + "grad_norm": 3.867062568664551, + "learning_rate": 5.409848686514885e-05, + "loss": 1.048, + "num_input_tokens_seen": 105532736, + "step": 6559 + }, + { + "epoch": 0.45951649198385386, + "grad_norm": 3.8351078033447266, + "learning_rate": 5.409148861646236e-05, + "loss": 1.0162, + "num_input_tokens_seen": 105549120, + "step": 6560 + }, + { + "epoch": 0.4595865402295831, + "grad_norm": 4.525234699249268, + "learning_rate": 5.4084490367775844e-05, + "loss": 1.0798, + "num_input_tokens_seen": 105564776, + "step": 6561 + }, + { + "epoch": 0.4596565884753124, + "grad_norm": 3.8182532787323, + "learning_rate": 5.4077492119089326e-05, + "loss": 1.021, + "num_input_tokens_seen": 105581080, + "step": 6562 + }, + { + "epoch": 0.45972663672104164, + "grad_norm": 3.82145619392395, + "learning_rate": 5.407049387040281e-05, + "loss": 1.0482, + "num_input_tokens_seen": 105596496, + "step": 6563 + }, + { + "epoch": 0.45979668496677084, + "grad_norm": 4.378223419189453, + "learning_rate": 5.406349562171629e-05, + "loss": 1.0795, + "num_input_tokens_seen": 105612200, + "step": 6564 + }, + { + "epoch": 0.4598667332125001, + "grad_norm": 4.628854274749756, + "learning_rate": 5.405649737302978e-05, + "loss": 1.1581, + "num_input_tokens_seen": 105628584, + "step": 6565 + }, + { + "epoch": 0.45993678145822936, + "grad_norm": 5.091843128204346, + "learning_rate": 5.404949912434326e-05, + "loss": 1.203, + "num_input_tokens_seen": 105644968, + "step": 6566 + }, + { + "epoch": 0.4600068297039586, + "grad_norm": 4.4174580574035645, + "learning_rate": 5.404250087565674e-05, + "loss": 1.1291, + "num_input_tokens_seen": 105661352, + "step": 6567 + }, + { + "epoch": 0.4600768779496878, + "grad_norm": 4.136083602905273, + "learning_rate": 5.403550262697024e-05, + "loss": 1.183, + "num_input_tokens_seen": 105677200, + "step": 6568 + }, + { + "epoch": 0.4601469261954171, + "grad_norm": 4.388592720031738, + "learning_rate": 5.4028504378283705e-05, + "loss": 1.2882, + "num_input_tokens_seen": 105693584, + "step": 6569 + }, + { + "epoch": 0.46021697444114634, + "grad_norm": 5.642048358917236, + "learning_rate": 5.402150612959721e-05, + "loss": 1.1693, + "num_input_tokens_seen": 105709688, + "step": 6570 + }, + { + "epoch": 0.4602870226868756, + "grad_norm": 4.107602596282959, + "learning_rate": 5.401450788091069e-05, + "loss": 1.2681, + "num_input_tokens_seen": 105725608, + "step": 6571 + }, + { + "epoch": 0.4603570709326048, + "grad_norm": 6.477549076080322, + "learning_rate": 5.400750963222417e-05, + "loss": 1.0574, + "num_input_tokens_seen": 105741992, + "step": 6572 + }, + { + "epoch": 0.46042711917833407, + "grad_norm": 4.7183380126953125, + "learning_rate": 5.400051138353765e-05, + "loss": 1.087, + "num_input_tokens_seen": 105758376, + "step": 6573 + }, + { + "epoch": 0.4604971674240633, + "grad_norm": 4.523158073425293, + "learning_rate": 5.3993513134851135e-05, + "loss": 1.0595, + "num_input_tokens_seen": 105774760, + "step": 6574 + }, + { + "epoch": 0.4605672156697926, + "grad_norm": 3.631554126739502, + "learning_rate": 5.398651488616463e-05, + "loss": 1.0407, + "num_input_tokens_seen": 105791144, + "step": 6575 + }, + { + "epoch": 0.4606372639155218, + "grad_norm": 8.034467697143555, + "learning_rate": 5.397951663747811e-05, + "loss": 1.0565, + "num_input_tokens_seen": 105807528, + "step": 6576 + }, + { + "epoch": 0.46070731216125105, + "grad_norm": 3.780055522918701, + "learning_rate": 5.397251838879161e-05, + "loss": 1.0622, + "num_input_tokens_seen": 105823592, + "step": 6577 + }, + { + "epoch": 0.4607773604069803, + "grad_norm": 3.975475549697876, + "learning_rate": 5.396552014010509e-05, + "loss": 1.0651, + "num_input_tokens_seen": 105839976, + "step": 6578 + }, + { + "epoch": 0.46084740865270957, + "grad_norm": 3.4668362140655518, + "learning_rate": 5.395852189141857e-05, + "loss": 0.9884, + "num_input_tokens_seen": 105856360, + "step": 6579 + }, + { + "epoch": 0.4609174568984388, + "grad_norm": 3.7928245067596436, + "learning_rate": 5.395152364273205e-05, + "loss": 1.0635, + "num_input_tokens_seen": 105872744, + "step": 6580 + }, + { + "epoch": 0.46098750514416803, + "grad_norm": 3.8289833068847656, + "learning_rate": 5.3944525394045555e-05, + "loss": 0.8981, + "num_input_tokens_seen": 105888528, + "step": 6581 + }, + { + "epoch": 0.4610575533898973, + "grad_norm": 6.435444355010986, + "learning_rate": 5.393752714535902e-05, + "loss": 1.0163, + "num_input_tokens_seen": 105903592, + "step": 6582 + }, + { + "epoch": 0.46112760163562655, + "grad_norm": 4.274429798126221, + "learning_rate": 5.3930528896672505e-05, + "loss": 1.3321, + "num_input_tokens_seen": 105919904, + "step": 6583 + }, + { + "epoch": 0.4611976498813558, + "grad_norm": 3.619840145111084, + "learning_rate": 5.3923530647986e-05, + "loss": 0.8481, + "num_input_tokens_seen": 105936288, + "step": 6584 + }, + { + "epoch": 0.461267698127085, + "grad_norm": 3.643489122390747, + "learning_rate": 5.391653239929948e-05, + "loss": 1.2037, + "num_input_tokens_seen": 105952624, + "step": 6585 + }, + { + "epoch": 0.4613377463728143, + "grad_norm": 3.5494256019592285, + "learning_rate": 5.3909534150612964e-05, + "loss": 1.0352, + "num_input_tokens_seen": 105968568, + "step": 6586 + }, + { + "epoch": 0.46140779461854353, + "grad_norm": 5.754514694213867, + "learning_rate": 5.390253590192645e-05, + "loss": 0.8431, + "num_input_tokens_seen": 105984952, + "step": 6587 + }, + { + "epoch": 0.4614778428642728, + "grad_norm": 3.9911015033721924, + "learning_rate": 5.3895537653239934e-05, + "loss": 0.9876, + "num_input_tokens_seen": 106001336, + "step": 6588 + }, + { + "epoch": 0.461547891110002, + "grad_norm": 4.1558756828308105, + "learning_rate": 5.3888539404553416e-05, + "loss": 1.0537, + "num_input_tokens_seen": 106016736, + "step": 6589 + }, + { + "epoch": 0.46161793935573125, + "grad_norm": 4.300850868225098, + "learning_rate": 5.38815411558669e-05, + "loss": 1.0303, + "num_input_tokens_seen": 106033120, + "step": 6590 + }, + { + "epoch": 0.4616879876014605, + "grad_norm": 6.03284215927124, + "learning_rate": 5.387454290718038e-05, + "loss": 1.0919, + "num_input_tokens_seen": 106049504, + "step": 6591 + }, + { + "epoch": 0.4617580358471898, + "grad_norm": 4.091002941131592, + "learning_rate": 5.3867544658493875e-05, + "loss": 1.205, + "num_input_tokens_seen": 106065632, + "step": 6592 + }, + { + "epoch": 0.461828084092919, + "grad_norm": 3.7395520210266113, + "learning_rate": 5.386054640980736e-05, + "loss": 0.9516, + "num_input_tokens_seen": 106081632, + "step": 6593 + }, + { + "epoch": 0.46189813233864824, + "grad_norm": 4.021444797515869, + "learning_rate": 5.3853548161120845e-05, + "loss": 1.0859, + "num_input_tokens_seen": 106097376, + "step": 6594 + }, + { + "epoch": 0.4619681805843775, + "grad_norm": 5.202040672302246, + "learning_rate": 5.384654991243433e-05, + "loss": 0.935, + "num_input_tokens_seen": 106113096, + "step": 6595 + }, + { + "epoch": 0.46203822883010676, + "grad_norm": 8.020401000976562, + "learning_rate": 5.383955166374781e-05, + "loss": 1.0943, + "num_input_tokens_seen": 106129480, + "step": 6596 + }, + { + "epoch": 0.46210827707583596, + "grad_norm": 4.892960548400879, + "learning_rate": 5.3832553415061304e-05, + "loss": 1.0347, + "num_input_tokens_seen": 106145864, + "step": 6597 + }, + { + "epoch": 0.4621783253215652, + "grad_norm": 3.963135004043579, + "learning_rate": 5.38255551663748e-05, + "loss": 1.0275, + "num_input_tokens_seen": 106162248, + "step": 6598 + }, + { + "epoch": 0.4622483735672945, + "grad_norm": 5.362968444824219, + "learning_rate": 5.381855691768827e-05, + "loss": 0.9192, + "num_input_tokens_seen": 106177616, + "step": 6599 + }, + { + "epoch": 0.46231842181302374, + "grad_norm": 5.272266864776611, + "learning_rate": 5.381155866900175e-05, + "loss": 1.3188, + "num_input_tokens_seen": 106194000, + "step": 6600 + }, + { + "epoch": 0.46231842181302374, + "eval_loss": 1.1238571405410767, + "eval_runtime": 0.1703, + "eval_samples_per_second": 5.872, + "eval_steps_per_second": 5.872, + "num_input_tokens_seen": 106194000, + "step": 6600 + }, + { + "epoch": 0.46238847005875294, + "grad_norm": 6.129757881164551, + "learning_rate": 5.3804560420315245e-05, + "loss": 0.9134, + "num_input_tokens_seen": 106210384, + "step": 6601 + }, + { + "epoch": 0.4624585183044822, + "grad_norm": 4.237639904022217, + "learning_rate": 5.379756217162873e-05, + "loss": 1.013, + "num_input_tokens_seen": 106226240, + "step": 6602 + }, + { + "epoch": 0.46252856655021146, + "grad_norm": 3.4758036136627197, + "learning_rate": 5.37905639229422e-05, + "loss": 0.9542, + "num_input_tokens_seen": 106242624, + "step": 6603 + }, + { + "epoch": 0.4625986147959407, + "grad_norm": 4.031625270843506, + "learning_rate": 5.37835656742557e-05, + "loss": 0.9543, + "num_input_tokens_seen": 106259008, + "step": 6604 + }, + { + "epoch": 0.4626686630416699, + "grad_norm": 3.9605302810668945, + "learning_rate": 5.377656742556919e-05, + "loss": 1.1185, + "num_input_tokens_seen": 106274960, + "step": 6605 + }, + { + "epoch": 0.4627387112873992, + "grad_norm": 3.5777320861816406, + "learning_rate": 5.3769569176882675e-05, + "loss": 1.1453, + "num_input_tokens_seen": 106291344, + "step": 6606 + }, + { + "epoch": 0.46280875953312844, + "grad_norm": 3.553462505340576, + "learning_rate": 5.376257092819614e-05, + "loss": 0.9831, + "num_input_tokens_seen": 106306184, + "step": 6607 + }, + { + "epoch": 0.4628788077788577, + "grad_norm": 3.745340347290039, + "learning_rate": 5.3755572679509645e-05, + "loss": 0.8471, + "num_input_tokens_seen": 106320360, + "step": 6608 + }, + { + "epoch": 0.4629488560245869, + "grad_norm": 3.7483649253845215, + "learning_rate": 5.374857443082312e-05, + "loss": 1.0482, + "num_input_tokens_seen": 106336664, + "step": 6609 + }, + { + "epoch": 0.46301890427031617, + "grad_norm": 3.675184726715088, + "learning_rate": 5.37415761821366e-05, + "loss": 1.0679, + "num_input_tokens_seen": 106353048, + "step": 6610 + }, + { + "epoch": 0.4630889525160454, + "grad_norm": 4.733851432800293, + "learning_rate": 5.373457793345009e-05, + "loss": 1.0269, + "num_input_tokens_seen": 106369432, + "step": 6611 + }, + { + "epoch": 0.4631590007617747, + "grad_norm": 3.9618589878082275, + "learning_rate": 5.372757968476357e-05, + "loss": 1.0667, + "num_input_tokens_seen": 106385592, + "step": 6612 + }, + { + "epoch": 0.46322904900750395, + "grad_norm": 3.95268177986145, + "learning_rate": 5.3720581436077054e-05, + "loss": 1.1044, + "num_input_tokens_seen": 106401976, + "step": 6613 + }, + { + "epoch": 0.46329909725323315, + "grad_norm": 4.600008010864258, + "learning_rate": 5.371358318739055e-05, + "loss": 1.0004, + "num_input_tokens_seen": 106418360, + "step": 6614 + }, + { + "epoch": 0.4633691454989624, + "grad_norm": 3.6651558876037598, + "learning_rate": 5.370658493870404e-05, + "loss": 0.9977, + "num_input_tokens_seen": 106434488, + "step": 6615 + }, + { + "epoch": 0.46343919374469167, + "grad_norm": 4.116913318634033, + "learning_rate": 5.369958669001752e-05, + "loss": 1.0409, + "num_input_tokens_seen": 106450872, + "step": 6616 + }, + { + "epoch": 0.46350924199042093, + "grad_norm": 4.44846773147583, + "learning_rate": 5.3692588441330995e-05, + "loss": 0.8553, + "num_input_tokens_seen": 106467256, + "step": 6617 + }, + { + "epoch": 0.46357929023615013, + "grad_norm": 5.590776443481445, + "learning_rate": 5.3685590192644483e-05, + "loss": 0.9049, + "num_input_tokens_seen": 106483640, + "step": 6618 + }, + { + "epoch": 0.4636493384818794, + "grad_norm": 5.505274772644043, + "learning_rate": 5.3678591943957965e-05, + "loss": 1.2003, + "num_input_tokens_seen": 106500024, + "step": 6619 + }, + { + "epoch": 0.46371938672760865, + "grad_norm": 3.8726046085357666, + "learning_rate": 5.367159369527145e-05, + "loss": 0.953, + "num_input_tokens_seen": 106516408, + "step": 6620 + }, + { + "epoch": 0.4637894349733379, + "grad_norm": 3.9251434803009033, + "learning_rate": 5.366459544658494e-05, + "loss": 1.1335, + "num_input_tokens_seen": 106531504, + "step": 6621 + }, + { + "epoch": 0.4638594832190671, + "grad_norm": 3.9294116497039795, + "learning_rate": 5.365759719789844e-05, + "loss": 0.9266, + "num_input_tokens_seen": 106547888, + "step": 6622 + }, + { + "epoch": 0.4639295314647964, + "grad_norm": 4.324211120605469, + "learning_rate": 5.365059894921192e-05, + "loss": 1.0962, + "num_input_tokens_seen": 106564272, + "step": 6623 + }, + { + "epoch": 0.46399957971052563, + "grad_norm": 3.5331010818481445, + "learning_rate": 5.364360070052539e-05, + "loss": 0.9043, + "num_input_tokens_seen": 106580608, + "step": 6624 + }, + { + "epoch": 0.4640696279562549, + "grad_norm": 3.642073392868042, + "learning_rate": 5.363660245183889e-05, + "loss": 1.0047, + "num_input_tokens_seen": 106596760, + "step": 6625 + }, + { + "epoch": 0.4641396762019841, + "grad_norm": 4.356872081756592, + "learning_rate": 5.362960420315237e-05, + "loss": 1.0147, + "num_input_tokens_seen": 106613144, + "step": 6626 + }, + { + "epoch": 0.46420972444771336, + "grad_norm": 3.66884446144104, + "learning_rate": 5.362260595446584e-05, + "loss": 0.9679, + "num_input_tokens_seen": 106629528, + "step": 6627 + }, + { + "epoch": 0.4642797726934426, + "grad_norm": 4.089823246002197, + "learning_rate": 5.3615607705779335e-05, + "loss": 0.9901, + "num_input_tokens_seen": 106645864, + "step": 6628 + }, + { + "epoch": 0.4643498209391719, + "grad_norm": 4.411832332611084, + "learning_rate": 5.360860945709283e-05, + "loss": 1.0351, + "num_input_tokens_seen": 106662248, + "step": 6629 + }, + { + "epoch": 0.4644198691849011, + "grad_norm": 3.7563977241516113, + "learning_rate": 5.360161120840631e-05, + "loss": 1.0096, + "num_input_tokens_seen": 106678632, + "step": 6630 + }, + { + "epoch": 0.46448991743063034, + "grad_norm": 3.6493430137634277, + "learning_rate": 5.3594612959719794e-05, + "loss": 0.966, + "num_input_tokens_seen": 106694552, + "step": 6631 + }, + { + "epoch": 0.4645599656763596, + "grad_norm": 3.6459546089172363, + "learning_rate": 5.358761471103328e-05, + "loss": 1.0288, + "num_input_tokens_seen": 106710544, + "step": 6632 + }, + { + "epoch": 0.46463001392208886, + "grad_norm": 4.07296085357666, + "learning_rate": 5.3580616462346765e-05, + "loss": 1.0409, + "num_input_tokens_seen": 106726928, + "step": 6633 + }, + { + "epoch": 0.46470006216781806, + "grad_norm": 3.623961925506592, + "learning_rate": 5.357361821366024e-05, + "loss": 0.8566, + "num_input_tokens_seen": 106742048, + "step": 6634 + }, + { + "epoch": 0.4647701104135473, + "grad_norm": 3.8658370971679688, + "learning_rate": 5.356661996497374e-05, + "loss": 1.1116, + "num_input_tokens_seen": 106758432, + "step": 6635 + }, + { + "epoch": 0.4648401586592766, + "grad_norm": 7.479616641998291, + "learning_rate": 5.3559621716287224e-05, + "loss": 1.0146, + "num_input_tokens_seen": 106774816, + "step": 6636 + }, + { + "epoch": 0.46491020690500584, + "grad_norm": 5.282004356384277, + "learning_rate": 5.3552623467600706e-05, + "loss": 1.0662, + "num_input_tokens_seen": 106791200, + "step": 6637 + }, + { + "epoch": 0.46498025515073504, + "grad_norm": 5.323639392852783, + "learning_rate": 5.354562521891419e-05, + "loss": 1.159, + "num_input_tokens_seen": 106806400, + "step": 6638 + }, + { + "epoch": 0.4650503033964643, + "grad_norm": 3.709852933883667, + "learning_rate": 5.3538626970227676e-05, + "loss": 1.0024, + "num_input_tokens_seen": 106822224, + "step": 6639 + }, + { + "epoch": 0.46512035164219356, + "grad_norm": 3.583138942718506, + "learning_rate": 5.353162872154116e-05, + "loss": 1.1467, + "num_input_tokens_seen": 106838608, + "step": 6640 + }, + { + "epoch": 0.4651903998879228, + "grad_norm": 4.027291297912598, + "learning_rate": 5.352463047285464e-05, + "loss": 1.1139, + "num_input_tokens_seen": 106854992, + "step": 6641 + }, + { + "epoch": 0.465260448133652, + "grad_norm": 3.9708850383758545, + "learning_rate": 5.3517632224168135e-05, + "loss": 1.0602, + "num_input_tokens_seen": 106871376, + "step": 6642 + }, + { + "epoch": 0.4653304963793813, + "grad_norm": 5.148803234100342, + "learning_rate": 5.351063397548162e-05, + "loss": 1.0852, + "num_input_tokens_seen": 106887760, + "step": 6643 + }, + { + "epoch": 0.46540054462511055, + "grad_norm": 4.076368808746338, + "learning_rate": 5.3503635726795085e-05, + "loss": 1.1273, + "num_input_tokens_seen": 106904112, + "step": 6644 + }, + { + "epoch": 0.4654705928708398, + "grad_norm": 4.920746803283691, + "learning_rate": 5.349663747810858e-05, + "loss": 1.091, + "num_input_tokens_seen": 106919960, + "step": 6645 + }, + { + "epoch": 0.465540641116569, + "grad_norm": 3.8127434253692627, + "learning_rate": 5.3489639229422076e-05, + "loss": 0.9896, + "num_input_tokens_seen": 106935928, + "step": 6646 + }, + { + "epoch": 0.46561068936229827, + "grad_norm": 3.9216270446777344, + "learning_rate": 5.348264098073556e-05, + "loss": 1.0585, + "num_input_tokens_seen": 106952168, + "step": 6647 + }, + { + "epoch": 0.46568073760802753, + "grad_norm": 3.5133566856384277, + "learning_rate": 5.347564273204903e-05, + "loss": 0.8579, + "num_input_tokens_seen": 106968080, + "step": 6648 + }, + { + "epoch": 0.4657507858537568, + "grad_norm": 3.634164333343506, + "learning_rate": 5.346864448336253e-05, + "loss": 1.0907, + "num_input_tokens_seen": 106984464, + "step": 6649 + }, + { + "epoch": 0.46582083409948605, + "grad_norm": 3.7191765308380127, + "learning_rate": 5.346164623467601e-05, + "loss": 1.0374, + "num_input_tokens_seen": 107000848, + "step": 6650 + }, + { + "epoch": 0.46589088234521525, + "grad_norm": 3.767498254776001, + "learning_rate": 5.345464798598948e-05, + "loss": 1.0061, + "num_input_tokens_seen": 107017232, + "step": 6651 + }, + { + "epoch": 0.4659609305909445, + "grad_norm": 3.8340818881988525, + "learning_rate": 5.344764973730299e-05, + "loss": 1.075, + "num_input_tokens_seen": 107033616, + "step": 6652 + }, + { + "epoch": 0.46603097883667377, + "grad_norm": 9.20552921295166, + "learning_rate": 5.344065148861647e-05, + "loss": 1.1089, + "num_input_tokens_seen": 107049120, + "step": 6653 + }, + { + "epoch": 0.46610102708240303, + "grad_norm": 4.367069721221924, + "learning_rate": 5.343365323992995e-05, + "loss": 1.2988, + "num_input_tokens_seen": 107063432, + "step": 6654 + }, + { + "epoch": 0.46617107532813223, + "grad_norm": 3.6735596656799316, + "learning_rate": 5.342665499124343e-05, + "loss": 0.9416, + "num_input_tokens_seen": 107079712, + "step": 6655 + }, + { + "epoch": 0.4662411235738615, + "grad_norm": 4.066924095153809, + "learning_rate": 5.341965674255692e-05, + "loss": 1.1776, + "num_input_tokens_seen": 107096096, + "step": 6656 + }, + { + "epoch": 0.46631117181959075, + "grad_norm": 3.7454941272735596, + "learning_rate": 5.34126584938704e-05, + "loss": 1.0325, + "num_input_tokens_seen": 107112480, + "step": 6657 + }, + { + "epoch": 0.46638122006532, + "grad_norm": 3.738274574279785, + "learning_rate": 5.3405660245183885e-05, + "loss": 1.0846, + "num_input_tokens_seen": 107128864, + "step": 6658 + }, + { + "epoch": 0.4664512683110492, + "grad_norm": 8.665736198425293, + "learning_rate": 5.339866199649738e-05, + "loss": 1.1532, + "num_input_tokens_seen": 107144536, + "step": 6659 + }, + { + "epoch": 0.4665213165567785, + "grad_norm": 3.8733510971069336, + "learning_rate": 5.339166374781086e-05, + "loss": 0.7974, + "num_input_tokens_seen": 107160664, + "step": 6660 + }, + { + "epoch": 0.46659136480250774, + "grad_norm": 4.000319957733154, + "learning_rate": 5.3384665499124344e-05, + "loss": 1.0197, + "num_input_tokens_seen": 107177048, + "step": 6661 + }, + { + "epoch": 0.466661413048237, + "grad_norm": 3.8049557209014893, + "learning_rate": 5.337766725043783e-05, + "loss": 1.1256, + "num_input_tokens_seen": 107192496, + "step": 6662 + }, + { + "epoch": 0.4667314612939662, + "grad_norm": 4.009215354919434, + "learning_rate": 5.3370669001751314e-05, + "loss": 1.1667, + "num_input_tokens_seen": 107207912, + "step": 6663 + }, + { + "epoch": 0.46680150953969546, + "grad_norm": 6.3007378578186035, + "learning_rate": 5.3363670753064796e-05, + "loss": 1.3413, + "num_input_tokens_seen": 107223504, + "step": 6664 + }, + { + "epoch": 0.4668715577854247, + "grad_norm": 3.5798394680023193, + "learning_rate": 5.335667250437828e-05, + "loss": 1.0625, + "num_input_tokens_seen": 107239824, + "step": 6665 + }, + { + "epoch": 0.466941606031154, + "grad_norm": 4.701604843139648, + "learning_rate": 5.334967425569179e-05, + "loss": 1.1597, + "num_input_tokens_seen": 107256208, + "step": 6666 + }, + { + "epoch": 0.4670116542768832, + "grad_norm": 3.462380886077881, + "learning_rate": 5.3342676007005255e-05, + "loss": 0.976, + "num_input_tokens_seen": 107272592, + "step": 6667 + }, + { + "epoch": 0.46708170252261244, + "grad_norm": 5.546586513519287, + "learning_rate": 5.333567775831875e-05, + "loss": 1.3386, + "num_input_tokens_seen": 107287520, + "step": 6668 + }, + { + "epoch": 0.4671517507683417, + "grad_norm": 4.677948474884033, + "learning_rate": 5.332867950963223e-05, + "loss": 1.1263, + "num_input_tokens_seen": 107303608, + "step": 6669 + }, + { + "epoch": 0.46722179901407096, + "grad_norm": 3.95694899559021, + "learning_rate": 5.3321681260945714e-05, + "loss": 1.1536, + "num_input_tokens_seen": 107319992, + "step": 6670 + }, + { + "epoch": 0.46729184725980016, + "grad_norm": 4.037060737609863, + "learning_rate": 5.331468301225919e-05, + "loss": 1.0444, + "num_input_tokens_seen": 107336376, + "step": 6671 + }, + { + "epoch": 0.4673618955055294, + "grad_norm": 3.5486528873443604, + "learning_rate": 5.3307684763572684e-05, + "loss": 0.9887, + "num_input_tokens_seen": 107352760, + "step": 6672 + }, + { + "epoch": 0.4674319437512587, + "grad_norm": 3.868568181991577, + "learning_rate": 5.330068651488618e-05, + "loss": 0.8593, + "num_input_tokens_seen": 107369144, + "step": 6673 + }, + { + "epoch": 0.46750199199698794, + "grad_norm": 7.702548980712891, + "learning_rate": 5.329368826619965e-05, + "loss": 1.188, + "num_input_tokens_seen": 107385528, + "step": 6674 + }, + { + "epoch": 0.46757204024271715, + "grad_norm": 4.390200614929199, + "learning_rate": 5.328669001751314e-05, + "loss": 1.1342, + "num_input_tokens_seen": 107401304, + "step": 6675 + }, + { + "epoch": 0.4676420884884464, + "grad_norm": 3.7440412044525146, + "learning_rate": 5.327969176882663e-05, + "loss": 0.8969, + "num_input_tokens_seen": 107417688, + "step": 6676 + }, + { + "epoch": 0.46771213673417567, + "grad_norm": 4.894672870635986, + "learning_rate": 5.327269352014011e-05, + "loss": 1.0542, + "num_input_tokens_seen": 107433816, + "step": 6677 + }, + { + "epoch": 0.4677821849799049, + "grad_norm": 4.762908458709717, + "learning_rate": 5.326569527145359e-05, + "loss": 1.1926, + "num_input_tokens_seen": 107450200, + "step": 6678 + }, + { + "epoch": 0.46785223322563413, + "grad_norm": 4.3587870597839355, + "learning_rate": 5.325869702276708e-05, + "loss": 1.2127, + "num_input_tokens_seen": 107466584, + "step": 6679 + }, + { + "epoch": 0.4679222814713634, + "grad_norm": 4.166892051696777, + "learning_rate": 5.325169877408056e-05, + "loss": 1.0259, + "num_input_tokens_seen": 107482968, + "step": 6680 + }, + { + "epoch": 0.46799232971709265, + "grad_norm": 4.266642093658447, + "learning_rate": 5.324470052539404e-05, + "loss": 0.9715, + "num_input_tokens_seen": 107498192, + "step": 6681 + }, + { + "epoch": 0.4680623779628219, + "grad_norm": 3.3419625759124756, + "learning_rate": 5.3237702276707536e-05, + "loss": 1.0262, + "num_input_tokens_seen": 107514576, + "step": 6682 + }, + { + "epoch": 0.46813242620855117, + "grad_norm": 3.903163433074951, + "learning_rate": 5.3230704028021025e-05, + "loss": 1.0785, + "num_input_tokens_seen": 107530536, + "step": 6683 + }, + { + "epoch": 0.46820247445428037, + "grad_norm": 5.467947959899902, + "learning_rate": 5.32237057793345e-05, + "loss": 1.246, + "num_input_tokens_seen": 107546248, + "step": 6684 + }, + { + "epoch": 0.46827252270000963, + "grad_norm": 3.9213547706604004, + "learning_rate": 5.321670753064799e-05, + "loss": 1.1432, + "num_input_tokens_seen": 107561992, + "step": 6685 + }, + { + "epoch": 0.4683425709457389, + "grad_norm": 5.265954971313477, + "learning_rate": 5.320970928196147e-05, + "loss": 0.9934, + "num_input_tokens_seen": 107578376, + "step": 6686 + }, + { + "epoch": 0.46841261919146815, + "grad_norm": 3.9765655994415283, + "learning_rate": 5.320271103327495e-05, + "loss": 1.0219, + "num_input_tokens_seen": 107594680, + "step": 6687 + }, + { + "epoch": 0.46848266743719735, + "grad_norm": 4.261830806732178, + "learning_rate": 5.3195712784588434e-05, + "loss": 1.0328, + "num_input_tokens_seen": 107611064, + "step": 6688 + }, + { + "epoch": 0.4685527156829266, + "grad_norm": 7.026014804840088, + "learning_rate": 5.318871453590194e-05, + "loss": 0.9985, + "num_input_tokens_seen": 107626112, + "step": 6689 + }, + { + "epoch": 0.46862276392865587, + "grad_norm": 4.726694107055664, + "learning_rate": 5.3181716287215425e-05, + "loss": 1.0744, + "num_input_tokens_seen": 107642496, + "step": 6690 + }, + { + "epoch": 0.46869281217438513, + "grad_norm": 3.6380646228790283, + "learning_rate": 5.317471803852889e-05, + "loss": 1.1708, + "num_input_tokens_seen": 107658880, + "step": 6691 + }, + { + "epoch": 0.46876286042011434, + "grad_norm": 3.5807487964630127, + "learning_rate": 5.316771978984239e-05, + "loss": 1.1403, + "num_input_tokens_seen": 107675256, + "step": 6692 + }, + { + "epoch": 0.4688329086658436, + "grad_norm": 3.9915847778320312, + "learning_rate": 5.316072154115588e-05, + "loss": 1.0826, + "num_input_tokens_seen": 107691016, + "step": 6693 + }, + { + "epoch": 0.46890295691157285, + "grad_norm": 4.012253284454346, + "learning_rate": 5.3153723292469345e-05, + "loss": 1.0194, + "num_input_tokens_seen": 107707064, + "step": 6694 + }, + { + "epoch": 0.4689730051573021, + "grad_norm": 3.9562582969665527, + "learning_rate": 5.314672504378284e-05, + "loss": 1.0017, + "num_input_tokens_seen": 107723152, + "step": 6695 + }, + { + "epoch": 0.4690430534030313, + "grad_norm": 4.575549125671387, + "learning_rate": 5.3139726795096336e-05, + "loss": 1.0722, + "num_input_tokens_seen": 107739536, + "step": 6696 + }, + { + "epoch": 0.4691131016487606, + "grad_norm": 3.8225462436676025, + "learning_rate": 5.313272854640982e-05, + "loss": 1.0149, + "num_input_tokens_seen": 107755920, + "step": 6697 + }, + { + "epoch": 0.46918314989448984, + "grad_norm": 3.951275587081909, + "learning_rate": 5.31257302977233e-05, + "loss": 0.9675, + "num_input_tokens_seen": 107772296, + "step": 6698 + }, + { + "epoch": 0.4692531981402191, + "grad_norm": 3.5939061641693115, + "learning_rate": 5.311873204903678e-05, + "loss": 0.9912, + "num_input_tokens_seen": 107788480, + "step": 6699 + }, + { + "epoch": 0.4693232463859483, + "grad_norm": 7.109866619110107, + "learning_rate": 5.311173380035027e-05, + "loss": 0.8939, + "num_input_tokens_seen": 107804768, + "step": 6700 + }, + { + "epoch": 0.46939329463167756, + "grad_norm": 3.6135330200195312, + "learning_rate": 5.3104735551663745e-05, + "loss": 1.1388, + "num_input_tokens_seen": 107820632, + "step": 6701 + }, + { + "epoch": 0.4694633428774068, + "grad_norm": 4.7758331298828125, + "learning_rate": 5.3097737302977234e-05, + "loss": 0.9504, + "num_input_tokens_seen": 107837016, + "step": 6702 + }, + { + "epoch": 0.4695333911231361, + "grad_norm": 3.7631545066833496, + "learning_rate": 5.309073905429073e-05, + "loss": 1.021, + "num_input_tokens_seen": 107853400, + "step": 6703 + }, + { + "epoch": 0.4696034393688653, + "grad_norm": 5.737015247344971, + "learning_rate": 5.30837408056042e-05, + "loss": 1.053, + "num_input_tokens_seen": 107869784, + "step": 6704 + }, + { + "epoch": 0.46967348761459454, + "grad_norm": 3.845569610595703, + "learning_rate": 5.307674255691769e-05, + "loss": 1.0225, + "num_input_tokens_seen": 107885760, + "step": 6705 + }, + { + "epoch": 0.4697435358603238, + "grad_norm": 7.402350902557373, + "learning_rate": 5.306974430823118e-05, + "loss": 1.0404, + "num_input_tokens_seen": 107902144, + "step": 6706 + }, + { + "epoch": 0.46981358410605306, + "grad_norm": 4.036012649536133, + "learning_rate": 5.306274605954466e-05, + "loss": 1.0646, + "num_input_tokens_seen": 107918528, + "step": 6707 + }, + { + "epoch": 0.46988363235178227, + "grad_norm": 5.720461845397949, + "learning_rate": 5.3055747810858145e-05, + "loss": 0.9158, + "num_input_tokens_seen": 107934912, + "step": 6708 + }, + { + "epoch": 0.4699536805975115, + "grad_norm": 4.842574119567871, + "learning_rate": 5.304874956217163e-05, + "loss": 1.0039, + "num_input_tokens_seen": 107950800, + "step": 6709 + }, + { + "epoch": 0.4700237288432408, + "grad_norm": 3.787020444869995, + "learning_rate": 5.304175131348512e-05, + "loss": 0.8436, + "num_input_tokens_seen": 107967184, + "step": 6710 + }, + { + "epoch": 0.47009377708897004, + "grad_norm": 4.2691192626953125, + "learning_rate": 5.303475306479859e-05, + "loss": 1.0335, + "num_input_tokens_seen": 107983568, + "step": 6711 + }, + { + "epoch": 0.47016382533469925, + "grad_norm": 5.233339786529541, + "learning_rate": 5.3027754816112086e-05, + "loss": 1.0575, + "num_input_tokens_seen": 107999952, + "step": 6712 + }, + { + "epoch": 0.4702338735804285, + "grad_norm": 3.421193838119507, + "learning_rate": 5.302075656742558e-05, + "loss": 1.022, + "num_input_tokens_seen": 108016336, + "step": 6713 + }, + { + "epoch": 0.47030392182615777, + "grad_norm": 4.561410427093506, + "learning_rate": 5.301375831873906e-05, + "loss": 1.173, + "num_input_tokens_seen": 108032720, + "step": 6714 + }, + { + "epoch": 0.470373970071887, + "grad_norm": 4.749919891357422, + "learning_rate": 5.3006760070052545e-05, + "loss": 1.05, + "num_input_tokens_seen": 108049104, + "step": 6715 + }, + { + "epoch": 0.47044401831761623, + "grad_norm": 4.774212837219238, + "learning_rate": 5.299976182136603e-05, + "loss": 1.0433, + "num_input_tokens_seen": 108065488, + "step": 6716 + }, + { + "epoch": 0.4705140665633455, + "grad_norm": 3.6954824924468994, + "learning_rate": 5.2992763572679515e-05, + "loss": 0.9831, + "num_input_tokens_seen": 108081224, + "step": 6717 + }, + { + "epoch": 0.47058411480907475, + "grad_norm": 5.202620983123779, + "learning_rate": 5.2985765323993e-05, + "loss": 1.0583, + "num_input_tokens_seen": 108097608, + "step": 6718 + }, + { + "epoch": 0.470654163054804, + "grad_norm": 3.7043261528015137, + "learning_rate": 5.297876707530649e-05, + "loss": 0.9753, + "num_input_tokens_seen": 108113992, + "step": 6719 + }, + { + "epoch": 0.47072421130053327, + "grad_norm": 4.06228494644165, + "learning_rate": 5.2971768826619974e-05, + "loss": 1.0117, + "num_input_tokens_seen": 108130376, + "step": 6720 + }, + { + "epoch": 0.4707942595462625, + "grad_norm": 3.4427239894866943, + "learning_rate": 5.2964770577933456e-05, + "loss": 0.9207, + "num_input_tokens_seen": 108146760, + "step": 6721 + }, + { + "epoch": 0.47086430779199173, + "grad_norm": 6.617749214172363, + "learning_rate": 5.295777232924694e-05, + "loss": 1.1472, + "num_input_tokens_seen": 108163144, + "step": 6722 + }, + { + "epoch": 0.470934356037721, + "grad_norm": 3.744797706604004, + "learning_rate": 5.2950774080560426e-05, + "loss": 1.0143, + "num_input_tokens_seen": 108179528, + "step": 6723 + }, + { + "epoch": 0.47100440428345025, + "grad_norm": 5.034976005554199, + "learning_rate": 5.294377583187391e-05, + "loss": 1.0061, + "num_input_tokens_seen": 108195248, + "step": 6724 + }, + { + "epoch": 0.47107445252917945, + "grad_norm": 3.9690632820129395, + "learning_rate": 5.293677758318739e-05, + "loss": 1.2634, + "num_input_tokens_seen": 108210920, + "step": 6725 + }, + { + "epoch": 0.4711445007749087, + "grad_norm": 3.351450204849243, + "learning_rate": 5.292977933450087e-05, + "loss": 0.876, + "num_input_tokens_seen": 108227304, + "step": 6726 + }, + { + "epoch": 0.471214549020638, + "grad_norm": 3.7437920570373535, + "learning_rate": 5.292278108581437e-05, + "loss": 0.9549, + "num_input_tokens_seen": 108243688, + "step": 6727 + }, + { + "epoch": 0.47128459726636723, + "grad_norm": 6.022392272949219, + "learning_rate": 5.291578283712785e-05, + "loss": 1.0568, + "num_input_tokens_seen": 108260072, + "step": 6728 + }, + { + "epoch": 0.47135464551209644, + "grad_norm": 4.407289505004883, + "learning_rate": 5.290878458844134e-05, + "loss": 1.0511, + "num_input_tokens_seen": 108276456, + "step": 6729 + }, + { + "epoch": 0.4714246937578257, + "grad_norm": 3.9509878158569336, + "learning_rate": 5.290178633975482e-05, + "loss": 1.1648, + "num_input_tokens_seen": 108291632, + "step": 6730 + }, + { + "epoch": 0.47149474200355496, + "grad_norm": 4.2412285804748535, + "learning_rate": 5.28947880910683e-05, + "loss": 1.1903, + "num_input_tokens_seen": 108308016, + "step": 6731 + }, + { + "epoch": 0.4715647902492842, + "grad_norm": 4.234686374664307, + "learning_rate": 5.288778984238178e-05, + "loss": 1.1111, + "num_input_tokens_seen": 108323984, + "step": 6732 + }, + { + "epoch": 0.4716348384950134, + "grad_norm": 4.565019130706787, + "learning_rate": 5.288079159369529e-05, + "loss": 1.096, + "num_input_tokens_seen": 108340368, + "step": 6733 + }, + { + "epoch": 0.4717048867407427, + "grad_norm": 4.805628299713135, + "learning_rate": 5.287379334500876e-05, + "loss": 1.1239, + "num_input_tokens_seen": 108356752, + "step": 6734 + }, + { + "epoch": 0.47177493498647194, + "grad_norm": 3.9647700786590576, + "learning_rate": 5.286679509632224e-05, + "loss": 1.1555, + "num_input_tokens_seen": 108372216, + "step": 6735 + }, + { + "epoch": 0.4718449832322012, + "grad_norm": 3.811239004135132, + "learning_rate": 5.285979684763574e-05, + "loss": 0.9169, + "num_input_tokens_seen": 108387696, + "step": 6736 + }, + { + "epoch": 0.4719150314779304, + "grad_norm": 4.559319496154785, + "learning_rate": 5.285279859894922e-05, + "loss": 0.918, + "num_input_tokens_seen": 108403944, + "step": 6737 + }, + { + "epoch": 0.47198507972365966, + "grad_norm": 4.727875232696533, + "learning_rate": 5.2845800350262694e-05, + "loss": 1.1424, + "num_input_tokens_seen": 108420328, + "step": 6738 + }, + { + "epoch": 0.4720551279693889, + "grad_norm": 3.8609120845794678, + "learning_rate": 5.283880210157619e-05, + "loss": 1.0053, + "num_input_tokens_seen": 108436712, + "step": 6739 + }, + { + "epoch": 0.4721251762151182, + "grad_norm": 3.804370164871216, + "learning_rate": 5.2831803852889685e-05, + "loss": 1.0733, + "num_input_tokens_seen": 108453040, + "step": 6740 + }, + { + "epoch": 0.4721952244608474, + "grad_norm": 3.939620018005371, + "learning_rate": 5.282480560420315e-05, + "loss": 1.0229, + "num_input_tokens_seen": 108468880, + "step": 6741 + }, + { + "epoch": 0.47226527270657664, + "grad_norm": 4.376893043518066, + "learning_rate": 5.2817807355516635e-05, + "loss": 1.131, + "num_input_tokens_seen": 108485248, + "step": 6742 + }, + { + "epoch": 0.4723353209523059, + "grad_norm": 5.025060653686523, + "learning_rate": 5.281080910683014e-05, + "loss": 1.1299, + "num_input_tokens_seen": 108501040, + "step": 6743 + }, + { + "epoch": 0.47240536919803516, + "grad_norm": 3.524656057357788, + "learning_rate": 5.280381085814361e-05, + "loss": 0.9653, + "num_input_tokens_seen": 108516624, + "step": 6744 + }, + { + "epoch": 0.47247541744376437, + "grad_norm": 3.8542211055755615, + "learning_rate": 5.2796812609457094e-05, + "loss": 1.0025, + "num_input_tokens_seen": 108533008, + "step": 6745 + }, + { + "epoch": 0.4725454656894936, + "grad_norm": 3.8751041889190674, + "learning_rate": 5.278981436077058e-05, + "loss": 1.1803, + "num_input_tokens_seen": 108549112, + "step": 6746 + }, + { + "epoch": 0.4726155139352229, + "grad_norm": 4.343238353729248, + "learning_rate": 5.2782816112084064e-05, + "loss": 0.9759, + "num_input_tokens_seen": 108564328, + "step": 6747 + }, + { + "epoch": 0.47268556218095215, + "grad_norm": 3.695493698120117, + "learning_rate": 5.2775817863397546e-05, + "loss": 0.8834, + "num_input_tokens_seen": 108580112, + "step": 6748 + }, + { + "epoch": 0.47275561042668135, + "grad_norm": 3.8947877883911133, + "learning_rate": 5.276881961471104e-05, + "loss": 1.0522, + "num_input_tokens_seen": 108596136, + "step": 6749 + }, + { + "epoch": 0.4728256586724106, + "grad_norm": 4.2317633628845215, + "learning_rate": 5.276182136602453e-05, + "loss": 0.9472, + "num_input_tokens_seen": 108612520, + "step": 6750 + }, + { + "epoch": 0.47289570691813987, + "grad_norm": 3.608283281326294, + "learning_rate": 5.2754823117338005e-05, + "loss": 1.0748, + "num_input_tokens_seen": 108628904, + "step": 6751 + }, + { + "epoch": 0.47296575516386913, + "grad_norm": 4.512143611907959, + "learning_rate": 5.274782486865149e-05, + "loss": 1.0156, + "num_input_tokens_seen": 108644248, + "step": 6752 + }, + { + "epoch": 0.4730358034095984, + "grad_norm": 3.81160044670105, + "learning_rate": 5.2740826619964976e-05, + "loss": 1.0496, + "num_input_tokens_seen": 108660488, + "step": 6753 + }, + { + "epoch": 0.4731058516553276, + "grad_norm": 3.760336399078369, + "learning_rate": 5.273382837127846e-05, + "loss": 1.0335, + "num_input_tokens_seen": 108676872, + "step": 6754 + }, + { + "epoch": 0.47317589990105685, + "grad_norm": 3.969651222229004, + "learning_rate": 5.272683012259194e-05, + "loss": 1.2213, + "num_input_tokens_seen": 108693256, + "step": 6755 + }, + { + "epoch": 0.4732459481467861, + "grad_norm": 4.55695915222168, + "learning_rate": 5.2719831873905435e-05, + "loss": 0.9125, + "num_input_tokens_seen": 108709576, + "step": 6756 + }, + { + "epoch": 0.47331599639251537, + "grad_norm": 4.36952018737793, + "learning_rate": 5.271283362521893e-05, + "loss": 1.0403, + "num_input_tokens_seen": 108725520, + "step": 6757 + }, + { + "epoch": 0.4733860446382446, + "grad_norm": 4.689207553863525, + "learning_rate": 5.27058353765324e-05, + "loss": 1.0875, + "num_input_tokens_seen": 108741744, + "step": 6758 + }, + { + "epoch": 0.47345609288397383, + "grad_norm": 3.5912058353424072, + "learning_rate": 5.269883712784588e-05, + "loss": 1.1125, + "num_input_tokens_seen": 108757952, + "step": 6759 + }, + { + "epoch": 0.4735261411297031, + "grad_norm": 4.725868225097656, + "learning_rate": 5.269183887915938e-05, + "loss": 0.9312, + "num_input_tokens_seen": 108774088, + "step": 6760 + }, + { + "epoch": 0.47359618937543235, + "grad_norm": 4.213376045227051, + "learning_rate": 5.268484063047285e-05, + "loss": 1.0752, + "num_input_tokens_seen": 108790472, + "step": 6761 + }, + { + "epoch": 0.47366623762116156, + "grad_norm": 4.116434574127197, + "learning_rate": 5.267784238178633e-05, + "loss": 1.0481, + "num_input_tokens_seen": 108806776, + "step": 6762 + }, + { + "epoch": 0.4737362858668908, + "grad_norm": 3.8367996215820312, + "learning_rate": 5.267084413309983e-05, + "loss": 1.0882, + "num_input_tokens_seen": 108822416, + "step": 6763 + }, + { + "epoch": 0.4738063341126201, + "grad_norm": 3.609545946121216, + "learning_rate": 5.266384588441332e-05, + "loss": 0.929, + "num_input_tokens_seen": 108838208, + "step": 6764 + }, + { + "epoch": 0.47387638235834934, + "grad_norm": 4.108180522918701, + "learning_rate": 5.2656847635726805e-05, + "loss": 0.9622, + "num_input_tokens_seen": 108854592, + "step": 6765 + }, + { + "epoch": 0.47394643060407854, + "grad_norm": 4.884720325469971, + "learning_rate": 5.2649849387040287e-05, + "loss": 1.1246, + "num_input_tokens_seen": 108870976, + "step": 6766 + }, + { + "epoch": 0.4740164788498078, + "grad_norm": 4.856875896453857, + "learning_rate": 5.2642851138353775e-05, + "loss": 1.2403, + "num_input_tokens_seen": 108885688, + "step": 6767 + }, + { + "epoch": 0.47408652709553706, + "grad_norm": 3.5622432231903076, + "learning_rate": 5.263585288966725e-05, + "loss": 0.9572, + "num_input_tokens_seen": 108902072, + "step": 6768 + }, + { + "epoch": 0.4741565753412663, + "grad_norm": 5.305510997772217, + "learning_rate": 5.262885464098073e-05, + "loss": 1.229, + "num_input_tokens_seen": 108917848, + "step": 6769 + }, + { + "epoch": 0.4742266235869955, + "grad_norm": 3.729074478149414, + "learning_rate": 5.2621856392294234e-05, + "loss": 0.9361, + "num_input_tokens_seen": 108934232, + "step": 6770 + }, + { + "epoch": 0.4742966718327248, + "grad_norm": 4.5915937423706055, + "learning_rate": 5.26148581436077e-05, + "loss": 1.0442, + "num_input_tokens_seen": 108949696, + "step": 6771 + }, + { + "epoch": 0.47436672007845404, + "grad_norm": 3.977216958999634, + "learning_rate": 5.26078598949212e-05, + "loss": 1.2395, + "num_input_tokens_seen": 108965848, + "step": 6772 + }, + { + "epoch": 0.4744367683241833, + "grad_norm": 4.012653827667236, + "learning_rate": 5.260086164623468e-05, + "loss": 0.9013, + "num_input_tokens_seen": 108982232, + "step": 6773 + }, + { + "epoch": 0.4745068165699125, + "grad_norm": 4.10910701751709, + "learning_rate": 5.259386339754817e-05, + "loss": 0.9896, + "num_input_tokens_seen": 108997800, + "step": 6774 + }, + { + "epoch": 0.47457686481564176, + "grad_norm": 5.1765336990356445, + "learning_rate": 5.258686514886165e-05, + "loss": 1.1068, + "num_input_tokens_seen": 109013664, + "step": 6775 + }, + { + "epoch": 0.474646913061371, + "grad_norm": 5.6664228439331055, + "learning_rate": 5.257986690017513e-05, + "loss": 1.345, + "num_input_tokens_seen": 109029208, + "step": 6776 + }, + { + "epoch": 0.4747169613071003, + "grad_norm": 6.2354817390441895, + "learning_rate": 5.257286865148863e-05, + "loss": 1.1819, + "num_input_tokens_seen": 109044528, + "step": 6777 + }, + { + "epoch": 0.4747870095528295, + "grad_norm": 3.8308510780334473, + "learning_rate": 5.2565870402802095e-05, + "loss": 0.9639, + "num_input_tokens_seen": 109060912, + "step": 6778 + }, + { + "epoch": 0.47485705779855875, + "grad_norm": 4.019093990325928, + "learning_rate": 5.255887215411558e-05, + "loss": 0.9385, + "num_input_tokens_seen": 109077296, + "step": 6779 + }, + { + "epoch": 0.474927106044288, + "grad_norm": 6.938348293304443, + "learning_rate": 5.255187390542907e-05, + "loss": 0.9974, + "num_input_tokens_seen": 109093680, + "step": 6780 + }, + { + "epoch": 0.47499715429001726, + "grad_norm": 4.200627326965332, + "learning_rate": 5.254487565674257e-05, + "loss": 1.0353, + "num_input_tokens_seen": 109110008, + "step": 6781 + }, + { + "epoch": 0.47506720253574647, + "grad_norm": 4.06279993057251, + "learning_rate": 5.253787740805605e-05, + "loss": 0.9581, + "num_input_tokens_seen": 109126392, + "step": 6782 + }, + { + "epoch": 0.47513725078147573, + "grad_norm": 4.124377250671387, + "learning_rate": 5.2530879159369525e-05, + "loss": 1.2065, + "num_input_tokens_seen": 109142680, + "step": 6783 + }, + { + "epoch": 0.475207299027205, + "grad_norm": 4.182784557342529, + "learning_rate": 5.252388091068302e-05, + "loss": 1.073, + "num_input_tokens_seen": 109158768, + "step": 6784 + }, + { + "epoch": 0.47527734727293425, + "grad_norm": 4.513407230377197, + "learning_rate": 5.25168826619965e-05, + "loss": 1.0392, + "num_input_tokens_seen": 109175152, + "step": 6785 + }, + { + "epoch": 0.4753473955186635, + "grad_norm": 3.251490354537964, + "learning_rate": 5.250988441330997e-05, + "loss": 0.8611, + "num_input_tokens_seen": 109191056, + "step": 6786 + }, + { + "epoch": 0.4754174437643927, + "grad_norm": 8.621055603027344, + "learning_rate": 5.250288616462348e-05, + "loss": 1.0699, + "num_input_tokens_seen": 109206888, + "step": 6787 + }, + { + "epoch": 0.47548749201012197, + "grad_norm": 4.264245986938477, + "learning_rate": 5.249588791593696e-05, + "loss": 0.9409, + "num_input_tokens_seen": 109223272, + "step": 6788 + }, + { + "epoch": 0.47555754025585123, + "grad_norm": 3.6648037433624268, + "learning_rate": 5.248888966725044e-05, + "loss": 1.0575, + "num_input_tokens_seen": 109239544, + "step": 6789 + }, + { + "epoch": 0.4756275885015805, + "grad_norm": 4.528952598571777, + "learning_rate": 5.2481891418563925e-05, + "loss": 1.1251, + "num_input_tokens_seen": 109255816, + "step": 6790 + }, + { + "epoch": 0.4756976367473097, + "grad_norm": 4.45644998550415, + "learning_rate": 5.247489316987741e-05, + "loss": 1.095, + "num_input_tokens_seen": 109272200, + "step": 6791 + }, + { + "epoch": 0.47576768499303895, + "grad_norm": 3.8969879150390625, + "learning_rate": 5.2467894921190895e-05, + "loss": 0.9836, + "num_input_tokens_seen": 109288520, + "step": 6792 + }, + { + "epoch": 0.4758377332387682, + "grad_norm": 3.627748727798462, + "learning_rate": 5.246089667250438e-05, + "loss": 1.1656, + "num_input_tokens_seen": 109304624, + "step": 6793 + }, + { + "epoch": 0.47590778148449747, + "grad_norm": 4.493330478668213, + "learning_rate": 5.245389842381787e-05, + "loss": 1.2352, + "num_input_tokens_seen": 109319976, + "step": 6794 + }, + { + "epoch": 0.4759778297302267, + "grad_norm": 3.5947048664093018, + "learning_rate": 5.2446900175131354e-05, + "loss": 1.035, + "num_input_tokens_seen": 109336360, + "step": 6795 + }, + { + "epoch": 0.47604787797595594, + "grad_norm": 4.194823741912842, + "learning_rate": 5.2439901926444836e-05, + "loss": 0.9851, + "num_input_tokens_seen": 109351624, + "step": 6796 + }, + { + "epoch": 0.4761179262216852, + "grad_norm": 3.9734160900115967, + "learning_rate": 5.2432903677758324e-05, + "loss": 1.2019, + "num_input_tokens_seen": 109367072, + "step": 6797 + }, + { + "epoch": 0.47618797446741445, + "grad_norm": 4.142136096954346, + "learning_rate": 5.2425905429071806e-05, + "loss": 1.1178, + "num_input_tokens_seen": 109383408, + "step": 6798 + }, + { + "epoch": 0.47625802271314366, + "grad_norm": 4.315369129180908, + "learning_rate": 5.241890718038529e-05, + "loss": 1.2254, + "num_input_tokens_seen": 109398616, + "step": 6799 + }, + { + "epoch": 0.4763280709588729, + "grad_norm": 4.77875280380249, + "learning_rate": 5.241190893169877e-05, + "loss": 1.1018, + "num_input_tokens_seen": 109414592, + "step": 6800 + }, + { + "epoch": 0.4763280709588729, + "eval_loss": 1.1252864599227905, + "eval_runtime": 0.1585, + "eval_samples_per_second": 6.311, + "eval_steps_per_second": 6.311, + "num_input_tokens_seen": 109414592, + "step": 6800 + }, + { + "epoch": 0.4763981192046022, + "grad_norm": 4.054019927978516, + "learning_rate": 5.2404910683012265e-05, + "loss": 1.1978, + "num_input_tokens_seen": 109430896, + "step": 6801 + }, + { + "epoch": 0.47646816745033144, + "grad_norm": 4.0688276290893555, + "learning_rate": 5.239791243432575e-05, + "loss": 1.015, + "num_input_tokens_seen": 109447008, + "step": 6802 + }, + { + "epoch": 0.47653821569606064, + "grad_norm": 4.081553936004639, + "learning_rate": 5.239091418563924e-05, + "loss": 1.2566, + "num_input_tokens_seen": 109463392, + "step": 6803 + }, + { + "epoch": 0.4766082639417899, + "grad_norm": 4.719587326049805, + "learning_rate": 5.2383915936952724e-05, + "loss": 1.0577, + "num_input_tokens_seen": 109478768, + "step": 6804 + }, + { + "epoch": 0.47667831218751916, + "grad_norm": 3.7197132110595703, + "learning_rate": 5.2376917688266206e-05, + "loss": 1.0442, + "num_input_tokens_seen": 109494808, + "step": 6805 + }, + { + "epoch": 0.4767483604332484, + "grad_norm": 5.000951290130615, + "learning_rate": 5.236991943957968e-05, + "loss": 1.0497, + "num_input_tokens_seen": 109511192, + "step": 6806 + }, + { + "epoch": 0.4768184086789776, + "grad_norm": 3.9910333156585693, + "learning_rate": 5.236292119089316e-05, + "loss": 1.2905, + "num_input_tokens_seen": 109527576, + "step": 6807 + }, + { + "epoch": 0.4768884569247069, + "grad_norm": 4.522314548492432, + "learning_rate": 5.235592294220666e-05, + "loss": 0.9959, + "num_input_tokens_seen": 109543960, + "step": 6808 + }, + { + "epoch": 0.47695850517043614, + "grad_norm": 3.7235898971557617, + "learning_rate": 5.234892469352014e-05, + "loss": 0.9931, + "num_input_tokens_seen": 109560344, + "step": 6809 + }, + { + "epoch": 0.4770285534161654, + "grad_norm": 3.643763303756714, + "learning_rate": 5.2341926444833635e-05, + "loss": 0.9588, + "num_input_tokens_seen": 109576728, + "step": 6810 + }, + { + "epoch": 0.4770986016618946, + "grad_norm": 5.52113151550293, + "learning_rate": 5.233492819614712e-05, + "loss": 1.2022, + "num_input_tokens_seen": 109592584, + "step": 6811 + }, + { + "epoch": 0.47716864990762387, + "grad_norm": 4.9974188804626465, + "learning_rate": 5.23279299474606e-05, + "loss": 1.1755, + "num_input_tokens_seen": 109608960, + "step": 6812 + }, + { + "epoch": 0.4772386981533531, + "grad_norm": 5.266491889953613, + "learning_rate": 5.232093169877408e-05, + "loss": 1.1099, + "num_input_tokens_seen": 109625104, + "step": 6813 + }, + { + "epoch": 0.4773087463990824, + "grad_norm": 3.9919018745422363, + "learning_rate": 5.231393345008757e-05, + "loss": 1.0423, + "num_input_tokens_seen": 109641488, + "step": 6814 + }, + { + "epoch": 0.4773787946448116, + "grad_norm": 5.361277103424072, + "learning_rate": 5.230693520140105e-05, + "loss": 1.215, + "num_input_tokens_seen": 109657872, + "step": 6815 + }, + { + "epoch": 0.47744884289054085, + "grad_norm": 4.024937629699707, + "learning_rate": 5.229993695271453e-05, + "loss": 1.2601, + "num_input_tokens_seen": 109674256, + "step": 6816 + }, + { + "epoch": 0.4775188911362701, + "grad_norm": 3.7742490768432617, + "learning_rate": 5.229293870402803e-05, + "loss": 1.0789, + "num_input_tokens_seen": 109690576, + "step": 6817 + }, + { + "epoch": 0.47758893938199937, + "grad_norm": 3.622018814086914, + "learning_rate": 5.228594045534151e-05, + "loss": 0.8893, + "num_input_tokens_seen": 109706592, + "step": 6818 + }, + { + "epoch": 0.47765898762772857, + "grad_norm": 4.550981044769287, + "learning_rate": 5.227894220665499e-05, + "loss": 1.31, + "num_input_tokens_seen": 109722976, + "step": 6819 + }, + { + "epoch": 0.47772903587345783, + "grad_norm": 3.8553786277770996, + "learning_rate": 5.227194395796848e-05, + "loss": 0.9512, + "num_input_tokens_seen": 109738920, + "step": 6820 + }, + { + "epoch": 0.4777990841191871, + "grad_norm": 3.7159841060638428, + "learning_rate": 5.226494570928196e-05, + "loss": 0.9445, + "num_input_tokens_seen": 109755304, + "step": 6821 + }, + { + "epoch": 0.47786913236491635, + "grad_norm": 5.884495258331299, + "learning_rate": 5.2257947460595444e-05, + "loss": 0.9789, + "num_input_tokens_seen": 109771576, + "step": 6822 + }, + { + "epoch": 0.4779391806106456, + "grad_norm": 3.7047083377838135, + "learning_rate": 5.2250949211908926e-05, + "loss": 1.0297, + "num_input_tokens_seen": 109787872, + "step": 6823 + }, + { + "epoch": 0.4780092288563748, + "grad_norm": 3.485847234725952, + "learning_rate": 5.224395096322241e-05, + "loss": 0.9269, + "num_input_tokens_seen": 109803384, + "step": 6824 + }, + { + "epoch": 0.47807927710210407, + "grad_norm": 3.9222450256347656, + "learning_rate": 5.223695271453592e-05, + "loss": 1.0749, + "num_input_tokens_seen": 109818704, + "step": 6825 + }, + { + "epoch": 0.47814932534783333, + "grad_norm": 4.232855796813965, + "learning_rate": 5.2229954465849385e-05, + "loss": 1.1773, + "num_input_tokens_seen": 109835088, + "step": 6826 + }, + { + "epoch": 0.4782193735935626, + "grad_norm": 3.5413403511047363, + "learning_rate": 5.222295621716288e-05, + "loss": 0.9407, + "num_input_tokens_seen": 109851472, + "step": 6827 + }, + { + "epoch": 0.4782894218392918, + "grad_norm": 4.55118989944458, + "learning_rate": 5.2215957968476356e-05, + "loss": 1.1199, + "num_input_tokens_seen": 109867856, + "step": 6828 + }, + { + "epoch": 0.47835947008502105, + "grad_norm": 3.691756010055542, + "learning_rate": 5.220895971978984e-05, + "loss": 0.9721, + "num_input_tokens_seen": 109884240, + "step": 6829 + }, + { + "epoch": 0.4784295183307503, + "grad_norm": 3.588829755783081, + "learning_rate": 5.220196147110333e-05, + "loss": 1.0665, + "num_input_tokens_seen": 109900624, + "step": 6830 + }, + { + "epoch": 0.4784995665764796, + "grad_norm": 4.766005516052246, + "learning_rate": 5.219496322241683e-05, + "loss": 1.0467, + "num_input_tokens_seen": 109917008, + "step": 6831 + }, + { + "epoch": 0.4785696148222088, + "grad_norm": 3.7234201431274414, + "learning_rate": 5.218796497373031e-05, + "loss": 1.0377, + "num_input_tokens_seen": 109933392, + "step": 6832 + }, + { + "epoch": 0.47863966306793804, + "grad_norm": 3.434387683868408, + "learning_rate": 5.218096672504378e-05, + "loss": 0.874, + "num_input_tokens_seen": 109949776, + "step": 6833 + }, + { + "epoch": 0.4787097113136673, + "grad_norm": 3.7484259605407715, + "learning_rate": 5.2173968476357274e-05, + "loss": 0.9365, + "num_input_tokens_seen": 109966016, + "step": 6834 + }, + { + "epoch": 0.47877975955939656, + "grad_norm": 5.821316719055176, + "learning_rate": 5.216697022767076e-05, + "loss": 0.9894, + "num_input_tokens_seen": 109981168, + "step": 6835 + }, + { + "epoch": 0.47884980780512576, + "grad_norm": 5.2646484375, + "learning_rate": 5.215997197898424e-05, + "loss": 1.0894, + "num_input_tokens_seen": 109996648, + "step": 6836 + }, + { + "epoch": 0.478919856050855, + "grad_norm": 5.125279426574707, + "learning_rate": 5.2152973730297726e-05, + "loss": 0.9451, + "num_input_tokens_seen": 110013032, + "step": 6837 + }, + { + "epoch": 0.4789899042965843, + "grad_norm": 4.917844295501709, + "learning_rate": 5.214597548161121e-05, + "loss": 1.1573, + "num_input_tokens_seen": 110029040, + "step": 6838 + }, + { + "epoch": 0.47905995254231354, + "grad_norm": 3.6937522888183594, + "learning_rate": 5.21389772329247e-05, + "loss": 1.0922, + "num_input_tokens_seen": 110045032, + "step": 6839 + }, + { + "epoch": 0.47913000078804274, + "grad_norm": 4.9768757820129395, + "learning_rate": 5.213197898423817e-05, + "loss": 1.3347, + "num_input_tokens_seen": 110061416, + "step": 6840 + }, + { + "epoch": 0.479200049033772, + "grad_norm": 5.775148391723633, + "learning_rate": 5.212498073555167e-05, + "loss": 1.1443, + "num_input_tokens_seen": 110077800, + "step": 6841 + }, + { + "epoch": 0.47927009727950126, + "grad_norm": 4.3342766761779785, + "learning_rate": 5.2117982486865155e-05, + "loss": 1.0604, + "num_input_tokens_seen": 110093848, + "step": 6842 + }, + { + "epoch": 0.4793401455252305, + "grad_norm": 3.6098031997680664, + "learning_rate": 5.211098423817863e-05, + "loss": 1.0893, + "num_input_tokens_seen": 110110232, + "step": 6843 + }, + { + "epoch": 0.4794101937709597, + "grad_norm": 3.7780818939208984, + "learning_rate": 5.210398598949212e-05, + "loss": 0.9852, + "num_input_tokens_seen": 110126584, + "step": 6844 + }, + { + "epoch": 0.479480242016689, + "grad_norm": 3.732302188873291, + "learning_rate": 5.2096987740805614e-05, + "loss": 0.9158, + "num_input_tokens_seen": 110142968, + "step": 6845 + }, + { + "epoch": 0.47955029026241824, + "grad_norm": 4.920741558074951, + "learning_rate": 5.208998949211908e-05, + "loss": 0.9931, + "num_input_tokens_seen": 110159352, + "step": 6846 + }, + { + "epoch": 0.4796203385081475, + "grad_norm": 3.847682476043701, + "learning_rate": 5.208299124343258e-05, + "loss": 1.1485, + "num_input_tokens_seen": 110175736, + "step": 6847 + }, + { + "epoch": 0.4796903867538767, + "grad_norm": 3.8941121101379395, + "learning_rate": 5.207599299474607e-05, + "loss": 1.0896, + "num_input_tokens_seen": 110192040, + "step": 6848 + }, + { + "epoch": 0.47976043499960597, + "grad_norm": 4.254310131072998, + "learning_rate": 5.2068994746059555e-05, + "loss": 1.1701, + "num_input_tokens_seen": 110208304, + "step": 6849 + }, + { + "epoch": 0.4798304832453352, + "grad_norm": 3.85739803314209, + "learning_rate": 5.206199649737302e-05, + "loss": 0.9785, + "num_input_tokens_seen": 110224688, + "step": 6850 + }, + { + "epoch": 0.4799005314910645, + "grad_norm": 4.137633323669434, + "learning_rate": 5.205499824868651e-05, + "loss": 1.2111, + "num_input_tokens_seen": 110240160, + "step": 6851 + }, + { + "epoch": 0.4799705797367937, + "grad_norm": 3.827974557876587, + "learning_rate": 5.204800000000001e-05, + "loss": 0.9639, + "num_input_tokens_seen": 110255952, + "step": 6852 + }, + { + "epoch": 0.48004062798252295, + "grad_norm": 4.506080627441406, + "learning_rate": 5.2041001751313475e-05, + "loss": 1.0435, + "num_input_tokens_seen": 110272336, + "step": 6853 + }, + { + "epoch": 0.4801106762282522, + "grad_norm": 3.4824750423431396, + "learning_rate": 5.203400350262697e-05, + "loss": 0.8792, + "num_input_tokens_seen": 110288720, + "step": 6854 + }, + { + "epoch": 0.48018072447398147, + "grad_norm": 3.319546937942505, + "learning_rate": 5.2027005253940466e-05, + "loss": 0.9861, + "num_input_tokens_seen": 110304984, + "step": 6855 + }, + { + "epoch": 0.48025077271971073, + "grad_norm": 5.543242454528809, + "learning_rate": 5.202000700525395e-05, + "loss": 1.0694, + "num_input_tokens_seen": 110320488, + "step": 6856 + }, + { + "epoch": 0.48032082096543993, + "grad_norm": 6.7765069007873535, + "learning_rate": 5.201300875656743e-05, + "loss": 1.0751, + "num_input_tokens_seen": 110336872, + "step": 6857 + }, + { + "epoch": 0.4803908692111692, + "grad_norm": 3.5764353275299072, + "learning_rate": 5.200601050788092e-05, + "loss": 1.0798, + "num_input_tokens_seen": 110353160, + "step": 6858 + }, + { + "epoch": 0.48046091745689845, + "grad_norm": 4.938530921936035, + "learning_rate": 5.19990122591944e-05, + "loss": 0.9155, + "num_input_tokens_seen": 110369544, + "step": 6859 + }, + { + "epoch": 0.4805309657026277, + "grad_norm": 3.5447168350219727, + "learning_rate": 5.1992014010507875e-05, + "loss": 0.9738, + "num_input_tokens_seen": 110385928, + "step": 6860 + }, + { + "epoch": 0.4806010139483569, + "grad_norm": 4.1170220375061035, + "learning_rate": 5.1985015761821364e-05, + "loss": 1.156, + "num_input_tokens_seen": 110402224, + "step": 6861 + }, + { + "epoch": 0.4806710621940862, + "grad_norm": 3.6147382259368896, + "learning_rate": 5.197801751313486e-05, + "loss": 1.0212, + "num_input_tokens_seen": 110418608, + "step": 6862 + }, + { + "epoch": 0.48074111043981543, + "grad_norm": 3.745072841644287, + "learning_rate": 5.197101926444834e-05, + "loss": 1.1518, + "num_input_tokens_seen": 110434792, + "step": 6863 + }, + { + "epoch": 0.4808111586855447, + "grad_norm": 4.3973517417907715, + "learning_rate": 5.196402101576182e-05, + "loss": 1.0627, + "num_input_tokens_seen": 110450376, + "step": 6864 + }, + { + "epoch": 0.4808812069312739, + "grad_norm": 4.029878616333008, + "learning_rate": 5.195702276707531e-05, + "loss": 1.051, + "num_input_tokens_seen": 110466760, + "step": 6865 + }, + { + "epoch": 0.48095125517700316, + "grad_norm": 3.5051989555358887, + "learning_rate": 5.195002451838879e-05, + "loss": 1.1163, + "num_input_tokens_seen": 110483144, + "step": 6866 + }, + { + "epoch": 0.4810213034227324, + "grad_norm": 3.8468475341796875, + "learning_rate": 5.1943026269702275e-05, + "loss": 1.0515, + "num_input_tokens_seen": 110499528, + "step": 6867 + }, + { + "epoch": 0.4810913516684617, + "grad_norm": 3.4679362773895264, + "learning_rate": 5.193602802101577e-05, + "loss": 1.0516, + "num_input_tokens_seen": 110515448, + "step": 6868 + }, + { + "epoch": 0.4811613999141909, + "grad_norm": 3.540043830871582, + "learning_rate": 5.192902977232925e-05, + "loss": 1.0163, + "num_input_tokens_seen": 110531832, + "step": 6869 + }, + { + "epoch": 0.48123144815992014, + "grad_norm": 4.2961835861206055, + "learning_rate": 5.192203152364272e-05, + "loss": 0.9839, + "num_input_tokens_seen": 110548216, + "step": 6870 + }, + { + "epoch": 0.4813014964056494, + "grad_norm": 4.718245029449463, + "learning_rate": 5.191503327495623e-05, + "loss": 1.0214, + "num_input_tokens_seen": 110564600, + "step": 6871 + }, + { + "epoch": 0.48137154465137866, + "grad_norm": 4.846748352050781, + "learning_rate": 5.190803502626971e-05, + "loss": 1.1448, + "num_input_tokens_seen": 110579952, + "step": 6872 + }, + { + "epoch": 0.48144159289710786, + "grad_norm": 3.5760273933410645, + "learning_rate": 5.1901036777583186e-05, + "loss": 1.0028, + "num_input_tokens_seen": 110595984, + "step": 6873 + }, + { + "epoch": 0.4815116411428371, + "grad_norm": 6.386372089385986, + "learning_rate": 5.189403852889667e-05, + "loss": 1.1695, + "num_input_tokens_seen": 110612368, + "step": 6874 + }, + { + "epoch": 0.4815816893885664, + "grad_norm": 5.007279872894287, + "learning_rate": 5.188704028021018e-05, + "loss": 1.0406, + "num_input_tokens_seen": 110628752, + "step": 6875 + }, + { + "epoch": 0.48165173763429564, + "grad_norm": 4.01614236831665, + "learning_rate": 5.1880042031523645e-05, + "loss": 1.075, + "num_input_tokens_seen": 110645136, + "step": 6876 + }, + { + "epoch": 0.48172178588002484, + "grad_norm": 4.7416486740112305, + "learning_rate": 5.187304378283713e-05, + "loss": 1.3402, + "num_input_tokens_seen": 110661400, + "step": 6877 + }, + { + "epoch": 0.4817918341257541, + "grad_norm": 4.886537551879883, + "learning_rate": 5.186604553415062e-05, + "loss": 0.8621, + "num_input_tokens_seen": 110677784, + "step": 6878 + }, + { + "epoch": 0.48186188237148336, + "grad_norm": 4.033387660980225, + "learning_rate": 5.1859047285464104e-05, + "loss": 1.3515, + "num_input_tokens_seen": 110694168, + "step": 6879 + }, + { + "epoch": 0.4819319306172126, + "grad_norm": 3.7201569080352783, + "learning_rate": 5.1852049036777586e-05, + "loss": 1.163, + "num_input_tokens_seen": 110710552, + "step": 6880 + }, + { + "epoch": 0.4820019788629418, + "grad_norm": 3.73651123046875, + "learning_rate": 5.1845050788091075e-05, + "loss": 1.0389, + "num_input_tokens_seen": 110726440, + "step": 6881 + }, + { + "epoch": 0.4820720271086711, + "grad_norm": 4.395266532897949, + "learning_rate": 5.1838052539404556e-05, + "loss": 0.9636, + "num_input_tokens_seen": 110742200, + "step": 6882 + }, + { + "epoch": 0.48214207535440035, + "grad_norm": 3.70263409614563, + "learning_rate": 5.183105429071804e-05, + "loss": 1.0229, + "num_input_tokens_seen": 110758584, + "step": 6883 + }, + { + "epoch": 0.4822121236001296, + "grad_norm": 4.863175868988037, + "learning_rate": 5.1824056042031534e-05, + "loss": 1.1663, + "num_input_tokens_seen": 110774040, + "step": 6884 + }, + { + "epoch": 0.4822821718458588, + "grad_norm": 3.668220043182373, + "learning_rate": 5.181705779334502e-05, + "loss": 1.2351, + "num_input_tokens_seen": 110790352, + "step": 6885 + }, + { + "epoch": 0.48235222009158807, + "grad_norm": 4.210755825042725, + "learning_rate": 5.18100595446585e-05, + "loss": 1.0517, + "num_input_tokens_seen": 110805912, + "step": 6886 + }, + { + "epoch": 0.48242226833731733, + "grad_norm": 3.62275767326355, + "learning_rate": 5.180306129597198e-05, + "loss": 1.0383, + "num_input_tokens_seen": 110822296, + "step": 6887 + }, + { + "epoch": 0.4824923165830466, + "grad_norm": 3.498563051223755, + "learning_rate": 5.179606304728547e-05, + "loss": 0.9063, + "num_input_tokens_seen": 110838680, + "step": 6888 + }, + { + "epoch": 0.4825623648287758, + "grad_norm": 6.4097161293029785, + "learning_rate": 5.178906479859895e-05, + "loss": 0.9482, + "num_input_tokens_seen": 110855064, + "step": 6889 + }, + { + "epoch": 0.48263241307450505, + "grad_norm": 4.8159565925598145, + "learning_rate": 5.178206654991243e-05, + "loss": 1.2248, + "num_input_tokens_seen": 110871328, + "step": 6890 + }, + { + "epoch": 0.4827024613202343, + "grad_norm": 3.976828098297119, + "learning_rate": 5.177506830122593e-05, + "loss": 1.1521, + "num_input_tokens_seen": 110886600, + "step": 6891 + }, + { + "epoch": 0.48277250956596357, + "grad_norm": 3.6857738494873047, + "learning_rate": 5.176807005253942e-05, + "loss": 1.1149, + "num_input_tokens_seen": 110902984, + "step": 6892 + }, + { + "epoch": 0.48284255781169283, + "grad_norm": 4.129028797149658, + "learning_rate": 5.176107180385289e-05, + "loss": 1.1404, + "num_input_tokens_seen": 110918808, + "step": 6893 + }, + { + "epoch": 0.48291260605742203, + "grad_norm": 4.203270435333252, + "learning_rate": 5.175407355516637e-05, + "loss": 1.1844, + "num_input_tokens_seen": 110935192, + "step": 6894 + }, + { + "epoch": 0.4829826543031513, + "grad_norm": 3.7045552730560303, + "learning_rate": 5.1747075306479874e-05, + "loss": 0.9193, + "num_input_tokens_seen": 110951168, + "step": 6895 + }, + { + "epoch": 0.48305270254888055, + "grad_norm": 4.2172112464904785, + "learning_rate": 5.174007705779334e-05, + "loss": 0.8905, + "num_input_tokens_seen": 110967552, + "step": 6896 + }, + { + "epoch": 0.4831227507946098, + "grad_norm": 3.395329236984253, + "learning_rate": 5.1733078809106824e-05, + "loss": 0.9696, + "num_input_tokens_seen": 110983736, + "step": 6897 + }, + { + "epoch": 0.483192799040339, + "grad_norm": 6.649857044219971, + "learning_rate": 5.172608056042032e-05, + "loss": 1.2299, + "num_input_tokens_seen": 111000120, + "step": 6898 + }, + { + "epoch": 0.4832628472860683, + "grad_norm": 5.114965438842773, + "learning_rate": 5.1719082311733815e-05, + "loss": 1.196, + "num_input_tokens_seen": 111016504, + "step": 6899 + }, + { + "epoch": 0.48333289553179754, + "grad_norm": 4.1728410720825195, + "learning_rate": 5.171208406304728e-05, + "loss": 1.1445, + "num_input_tokens_seen": 111032232, + "step": 6900 + }, + { + "epoch": 0.4834029437775268, + "grad_norm": 3.674546241760254, + "learning_rate": 5.170508581436078e-05, + "loss": 1.0889, + "num_input_tokens_seen": 111047576, + "step": 6901 + }, + { + "epoch": 0.483472992023256, + "grad_norm": 3.4895896911621094, + "learning_rate": 5.169808756567427e-05, + "loss": 0.9618, + "num_input_tokens_seen": 111063792, + "step": 6902 + }, + { + "epoch": 0.48354304026898526, + "grad_norm": 8.447297096252441, + "learning_rate": 5.169108931698774e-05, + "loss": 0.911, + "num_input_tokens_seen": 111079136, + "step": 6903 + }, + { + "epoch": 0.4836130885147145, + "grad_norm": 4.854581356048584, + "learning_rate": 5.1684091068301224e-05, + "loss": 0.9725, + "num_input_tokens_seen": 111093808, + "step": 6904 + }, + { + "epoch": 0.4836831367604438, + "grad_norm": 3.4015259742736816, + "learning_rate": 5.167709281961471e-05, + "loss": 0.9395, + "num_input_tokens_seen": 111110192, + "step": 6905 + }, + { + "epoch": 0.483753185006173, + "grad_norm": 3.979801654815674, + "learning_rate": 5.1670094570928195e-05, + "loss": 1.181, + "num_input_tokens_seen": 111126576, + "step": 6906 + }, + { + "epoch": 0.48382323325190224, + "grad_norm": 3.655245542526245, + "learning_rate": 5.166309632224169e-05, + "loss": 0.9631, + "num_input_tokens_seen": 111142960, + "step": 6907 + }, + { + "epoch": 0.4838932814976315, + "grad_norm": 3.820819616317749, + "learning_rate": 5.165609807355517e-05, + "loss": 1.0845, + "num_input_tokens_seen": 111159344, + "step": 6908 + }, + { + "epoch": 0.48396332974336076, + "grad_norm": 3.6869490146636963, + "learning_rate": 5.164909982486866e-05, + "loss": 0.7909, + "num_input_tokens_seen": 111175608, + "step": 6909 + }, + { + "epoch": 0.48403337798908996, + "grad_norm": 3.644277334213257, + "learning_rate": 5.1642101576182135e-05, + "loss": 1.0442, + "num_input_tokens_seen": 111191992, + "step": 6910 + }, + { + "epoch": 0.4841034262348192, + "grad_norm": 3.794215202331543, + "learning_rate": 5.1635103327495624e-05, + "loss": 1.1105, + "num_input_tokens_seen": 111207248, + "step": 6911 + }, + { + "epoch": 0.4841734744805485, + "grad_norm": 4.5081987380981445, + "learning_rate": 5.162810507880912e-05, + "loss": 1.3952, + "num_input_tokens_seen": 111223632, + "step": 6912 + }, + { + "epoch": 0.48424352272627774, + "grad_norm": 3.632924795150757, + "learning_rate": 5.162110683012259e-05, + "loss": 1.0862, + "num_input_tokens_seen": 111240016, + "step": 6913 + }, + { + "epoch": 0.48431357097200695, + "grad_norm": 3.522996425628662, + "learning_rate": 5.161410858143607e-05, + "loss": 0.9521, + "num_input_tokens_seen": 111255840, + "step": 6914 + }, + { + "epoch": 0.4843836192177362, + "grad_norm": 4.495186805725098, + "learning_rate": 5.1607110332749565e-05, + "loss": 1.0066, + "num_input_tokens_seen": 111272224, + "step": 6915 + }, + { + "epoch": 0.48445366746346546, + "grad_norm": 3.6315512657165527, + "learning_rate": 5.160011208406306e-05, + "loss": 1.0991, + "num_input_tokens_seen": 111287920, + "step": 6916 + }, + { + "epoch": 0.4845237157091947, + "grad_norm": 3.4649548530578613, + "learning_rate": 5.159311383537654e-05, + "loss": 1.024, + "num_input_tokens_seen": 111304304, + "step": 6917 + }, + { + "epoch": 0.48459376395492393, + "grad_norm": 4.057675838470459, + "learning_rate": 5.158611558669002e-05, + "loss": 1.0403, + "num_input_tokens_seen": 111320688, + "step": 6918 + }, + { + "epoch": 0.4846638122006532, + "grad_norm": 4.989962100982666, + "learning_rate": 5.157911733800351e-05, + "loss": 1.0446, + "num_input_tokens_seen": 111337072, + "step": 6919 + }, + { + "epoch": 0.48473386044638245, + "grad_norm": 4.090515613555908, + "learning_rate": 5.157211908931698e-05, + "loss": 0.9324, + "num_input_tokens_seen": 111353456, + "step": 6920 + }, + { + "epoch": 0.4848039086921117, + "grad_norm": 4.017073154449463, + "learning_rate": 5.156512084063046e-05, + "loss": 1.0938, + "num_input_tokens_seen": 111369840, + "step": 6921 + }, + { + "epoch": 0.4848739569378409, + "grad_norm": 4.227852821350098, + "learning_rate": 5.155812259194397e-05, + "loss": 1.0553, + "num_input_tokens_seen": 111386096, + "step": 6922 + }, + { + "epoch": 0.48494400518357017, + "grad_norm": 5.356720447540283, + "learning_rate": 5.155112434325745e-05, + "loss": 1.1807, + "num_input_tokens_seen": 111402192, + "step": 6923 + }, + { + "epoch": 0.48501405342929943, + "grad_norm": 3.714996814727783, + "learning_rate": 5.1544126094570935e-05, + "loss": 0.9851, + "num_input_tokens_seen": 111418120, + "step": 6924 + }, + { + "epoch": 0.4850841016750287, + "grad_norm": 3.814669609069824, + "learning_rate": 5.153712784588442e-05, + "loss": 1.1195, + "num_input_tokens_seen": 111434408, + "step": 6925 + }, + { + "epoch": 0.48515414992075795, + "grad_norm": 4.38773250579834, + "learning_rate": 5.1530129597197905e-05, + "loss": 0.9939, + "num_input_tokens_seen": 111450384, + "step": 6926 + }, + { + "epoch": 0.48522419816648715, + "grad_norm": 5.492570877075195, + "learning_rate": 5.152313134851139e-05, + "loss": 1.0629, + "num_input_tokens_seen": 111466768, + "step": 6927 + }, + { + "epoch": 0.4852942464122164, + "grad_norm": 4.867751598358154, + "learning_rate": 5.151613309982487e-05, + "loss": 1.0787, + "num_input_tokens_seen": 111481680, + "step": 6928 + }, + { + "epoch": 0.48536429465794567, + "grad_norm": 3.6009931564331055, + "learning_rate": 5.1509134851138364e-05, + "loss": 1.1068, + "num_input_tokens_seen": 111497784, + "step": 6929 + }, + { + "epoch": 0.48543434290367493, + "grad_norm": 3.451188564300537, + "learning_rate": 5.1502136602451846e-05, + "loss": 0.9131, + "num_input_tokens_seen": 111513856, + "step": 6930 + }, + { + "epoch": 0.48550439114940414, + "grad_norm": 4.886107444763184, + "learning_rate": 5.149513835376533e-05, + "loss": 0.9234, + "num_input_tokens_seen": 111530240, + "step": 6931 + }, + { + "epoch": 0.4855744393951334, + "grad_norm": 4.033775806427002, + "learning_rate": 5.148814010507881e-05, + "loss": 1.0094, + "num_input_tokens_seen": 111546160, + "step": 6932 + }, + { + "epoch": 0.48564448764086265, + "grad_norm": 4.718981742858887, + "learning_rate": 5.14811418563923e-05, + "loss": 0.9965, + "num_input_tokens_seen": 111562432, + "step": 6933 + }, + { + "epoch": 0.4857145358865919, + "grad_norm": 3.7174808979034424, + "learning_rate": 5.147414360770578e-05, + "loss": 1.1065, + "num_input_tokens_seen": 111578816, + "step": 6934 + }, + { + "epoch": 0.4857845841323211, + "grad_norm": 4.0880208015441895, + "learning_rate": 5.146714535901926e-05, + "loss": 1.0742, + "num_input_tokens_seen": 111593928, + "step": 6935 + }, + { + "epoch": 0.4858546323780504, + "grad_norm": 3.3873400688171387, + "learning_rate": 5.146014711033276e-05, + "loss": 0.9752, + "num_input_tokens_seen": 111610312, + "step": 6936 + }, + { + "epoch": 0.48592468062377964, + "grad_norm": 3.6071503162384033, + "learning_rate": 5.145314886164624e-05, + "loss": 0.9917, + "num_input_tokens_seen": 111626696, + "step": 6937 + }, + { + "epoch": 0.4859947288695089, + "grad_norm": 3.502610445022583, + "learning_rate": 5.1446150612959735e-05, + "loss": 0.8912, + "num_input_tokens_seen": 111643080, + "step": 6938 + }, + { + "epoch": 0.4860647771152381, + "grad_norm": 3.5743067264556885, + "learning_rate": 5.1439152364273216e-05, + "loss": 1.1493, + "num_input_tokens_seen": 111659048, + "step": 6939 + }, + { + "epoch": 0.48613482536096736, + "grad_norm": 3.9423654079437256, + "learning_rate": 5.14321541155867e-05, + "loss": 1.1328, + "num_input_tokens_seen": 111675432, + "step": 6940 + }, + { + "epoch": 0.4862048736066966, + "grad_norm": 4.670028209686279, + "learning_rate": 5.142515586690017e-05, + "loss": 0.9023, + "num_input_tokens_seen": 111691816, + "step": 6941 + }, + { + "epoch": 0.4862749218524259, + "grad_norm": 3.8914809226989746, + "learning_rate": 5.1418157618213655e-05, + "loss": 1.0373, + "num_input_tokens_seen": 111708200, + "step": 6942 + }, + { + "epoch": 0.4863449700981551, + "grad_norm": 3.864323139190674, + "learning_rate": 5.141115936952715e-05, + "loss": 0.9064, + "num_input_tokens_seen": 111724488, + "step": 6943 + }, + { + "epoch": 0.48641501834388434, + "grad_norm": 3.700681447982788, + "learning_rate": 5.140416112084063e-05, + "loss": 1.092, + "num_input_tokens_seen": 111740368, + "step": 6944 + }, + { + "epoch": 0.4864850665896136, + "grad_norm": 3.7225606441497803, + "learning_rate": 5.139716287215413e-05, + "loss": 0.971, + "num_input_tokens_seen": 111755936, + "step": 6945 + }, + { + "epoch": 0.48655511483534286, + "grad_norm": 4.638529300689697, + "learning_rate": 5.139016462346761e-05, + "loss": 0.9367, + "num_input_tokens_seen": 111772152, + "step": 6946 + }, + { + "epoch": 0.48662516308107207, + "grad_norm": 5.287013053894043, + "learning_rate": 5.138316637478109e-05, + "loss": 0.9463, + "num_input_tokens_seen": 111787144, + "step": 6947 + }, + { + "epoch": 0.4866952113268013, + "grad_norm": 3.991861343383789, + "learning_rate": 5.137616812609457e-05, + "loss": 0.8265, + "num_input_tokens_seen": 111803528, + "step": 6948 + }, + { + "epoch": 0.4867652595725306, + "grad_norm": 4.166889190673828, + "learning_rate": 5.136916987740806e-05, + "loss": 1.1667, + "num_input_tokens_seen": 111819376, + "step": 6949 + }, + { + "epoch": 0.48683530781825984, + "grad_norm": 4.159299373626709, + "learning_rate": 5.1362171628721543e-05, + "loss": 1.1422, + "num_input_tokens_seen": 111835760, + "step": 6950 + }, + { + "epoch": 0.48690535606398905, + "grad_norm": 5.612180709838867, + "learning_rate": 5.1355173380035025e-05, + "loss": 0.9889, + "num_input_tokens_seen": 111851744, + "step": 6951 + }, + { + "epoch": 0.4869754043097183, + "grad_norm": 5.82523775100708, + "learning_rate": 5.134817513134852e-05, + "loss": 0.9702, + "num_input_tokens_seen": 111868128, + "step": 6952 + }, + { + "epoch": 0.48704545255544757, + "grad_norm": 3.5110416412353516, + "learning_rate": 5.1341176882662e-05, + "loss": 0.9538, + "num_input_tokens_seen": 111884512, + "step": 6953 + }, + { + "epoch": 0.4871155008011768, + "grad_norm": 4.108850479125977, + "learning_rate": 5.1334178633975484e-05, + "loss": 1.0866, + "num_input_tokens_seen": 111899544, + "step": 6954 + }, + { + "epoch": 0.48718554904690603, + "grad_norm": 3.9000258445739746, + "learning_rate": 5.132718038528897e-05, + "loss": 0.9456, + "num_input_tokens_seen": 111915928, + "step": 6955 + }, + { + "epoch": 0.4872555972926353, + "grad_norm": 4.503340244293213, + "learning_rate": 5.1320182136602455e-05, + "loss": 1.004, + "num_input_tokens_seen": 111932288, + "step": 6956 + }, + { + "epoch": 0.48732564553836455, + "grad_norm": 4.052606582641602, + "learning_rate": 5.1313183887915936e-05, + "loss": 1.1662, + "num_input_tokens_seen": 111948672, + "step": 6957 + }, + { + "epoch": 0.4873956937840938, + "grad_norm": 3.4959487915039062, + "learning_rate": 5.130618563922942e-05, + "loss": 1.0242, + "num_input_tokens_seen": 111964272, + "step": 6958 + }, + { + "epoch": 0.48746574202982307, + "grad_norm": 4.654433250427246, + "learning_rate": 5.12991873905429e-05, + "loss": 1.1307, + "num_input_tokens_seen": 111980656, + "step": 6959 + }, + { + "epoch": 0.48753579027555227, + "grad_norm": 4.125091552734375, + "learning_rate": 5.1292189141856395e-05, + "loss": 1.1137, + "num_input_tokens_seen": 111996584, + "step": 6960 + }, + { + "epoch": 0.48760583852128153, + "grad_norm": 4.574272155761719, + "learning_rate": 5.128519089316988e-05, + "loss": 0.9312, + "num_input_tokens_seen": 112011528, + "step": 6961 + }, + { + "epoch": 0.4876758867670108, + "grad_norm": 4.110400676727295, + "learning_rate": 5.127819264448337e-05, + "loss": 0.9669, + "num_input_tokens_seen": 112026256, + "step": 6962 + }, + { + "epoch": 0.48774593501274005, + "grad_norm": 3.4572913646698, + "learning_rate": 5.127119439579685e-05, + "loss": 0.9956, + "num_input_tokens_seen": 112042288, + "step": 6963 + }, + { + "epoch": 0.48781598325846925, + "grad_norm": 4.498427391052246, + "learning_rate": 5.126419614711033e-05, + "loss": 1.1088, + "num_input_tokens_seen": 112058672, + "step": 6964 + }, + { + "epoch": 0.4878860315041985, + "grad_norm": 5.4692301750183105, + "learning_rate": 5.1257197898423825e-05, + "loss": 0.9582, + "num_input_tokens_seen": 112073536, + "step": 6965 + }, + { + "epoch": 0.4879560797499278, + "grad_norm": 3.8990654945373535, + "learning_rate": 5.125019964973732e-05, + "loss": 1.2729, + "num_input_tokens_seen": 112089344, + "step": 6966 + }, + { + "epoch": 0.48802612799565703, + "grad_norm": 3.5601627826690674, + "learning_rate": 5.12432014010508e-05, + "loss": 1.0519, + "num_input_tokens_seen": 112105296, + "step": 6967 + }, + { + "epoch": 0.48809617624138624, + "grad_norm": 3.91282057762146, + "learning_rate": 5.123620315236427e-05, + "loss": 1.2046, + "num_input_tokens_seen": 112120600, + "step": 6968 + }, + { + "epoch": 0.4881662244871155, + "grad_norm": 5.9246602058410645, + "learning_rate": 5.1229204903677766e-05, + "loss": 1.0982, + "num_input_tokens_seen": 112136472, + "step": 6969 + }, + { + "epoch": 0.48823627273284476, + "grad_norm": 8.849782943725586, + "learning_rate": 5.122220665499125e-05, + "loss": 1.0671, + "num_input_tokens_seen": 112152792, + "step": 6970 + }, + { + "epoch": 0.488306320978574, + "grad_norm": 4.184106349945068, + "learning_rate": 5.121520840630473e-05, + "loss": 1.2696, + "num_input_tokens_seen": 112169176, + "step": 6971 + }, + { + "epoch": 0.4883763692243032, + "grad_norm": 4.250857830047607, + "learning_rate": 5.120821015761822e-05, + "loss": 0.9724, + "num_input_tokens_seen": 112184784, + "step": 6972 + }, + { + "epoch": 0.4884464174700325, + "grad_norm": 4.522305011749268, + "learning_rate": 5.12012119089317e-05, + "loss": 0.8251, + "num_input_tokens_seen": 112200960, + "step": 6973 + }, + { + "epoch": 0.48851646571576174, + "grad_norm": 3.5135490894317627, + "learning_rate": 5.1194213660245195e-05, + "loss": 1.0713, + "num_input_tokens_seen": 112217080, + "step": 6974 + }, + { + "epoch": 0.488586513961491, + "grad_norm": 5.541810989379883, + "learning_rate": 5.118721541155866e-05, + "loss": 1.1885, + "num_input_tokens_seen": 112233464, + "step": 6975 + }, + { + "epoch": 0.4886565622072202, + "grad_norm": 3.7535064220428467, + "learning_rate": 5.1180217162872165e-05, + "loss": 1.0558, + "num_input_tokens_seen": 112249848, + "step": 6976 + }, + { + "epoch": 0.48872661045294946, + "grad_norm": 4.454082012176514, + "learning_rate": 5.117321891418565e-05, + "loss": 1.0892, + "num_input_tokens_seen": 112265560, + "step": 6977 + }, + { + "epoch": 0.4887966586986787, + "grad_norm": 3.770138740539551, + "learning_rate": 5.116622066549912e-05, + "loss": 1.082, + "num_input_tokens_seen": 112281944, + "step": 6978 + }, + { + "epoch": 0.488866706944408, + "grad_norm": 5.923669815063477, + "learning_rate": 5.115922241681261e-05, + "loss": 1.0662, + "num_input_tokens_seen": 112298264, + "step": 6979 + }, + { + "epoch": 0.4889367551901372, + "grad_norm": 3.9768123626708984, + "learning_rate": 5.115222416812609e-05, + "loss": 1.1816, + "num_input_tokens_seen": 112314608, + "step": 6980 + }, + { + "epoch": 0.48900680343586644, + "grad_norm": 5.525039196014404, + "learning_rate": 5.1145225919439575e-05, + "loss": 1.0287, + "num_input_tokens_seen": 112330400, + "step": 6981 + }, + { + "epoch": 0.4890768516815957, + "grad_norm": 3.8725640773773193, + "learning_rate": 5.113822767075307e-05, + "loss": 0.9666, + "num_input_tokens_seen": 112345384, + "step": 6982 + }, + { + "epoch": 0.48914689992732496, + "grad_norm": 4.746465682983398, + "learning_rate": 5.1131229422066565e-05, + "loss": 1.1162, + "num_input_tokens_seen": 112361768, + "step": 6983 + }, + { + "epoch": 0.48921694817305417, + "grad_norm": 3.774049997329712, + "learning_rate": 5.112423117338005e-05, + "loss": 1.0898, + "num_input_tokens_seen": 112377432, + "step": 6984 + }, + { + "epoch": 0.4892869964187834, + "grad_norm": 3.686307191848755, + "learning_rate": 5.1117232924693515e-05, + "loss": 1.0459, + "num_input_tokens_seen": 112393672, + "step": 6985 + }, + { + "epoch": 0.4893570446645127, + "grad_norm": 4.177459239959717, + "learning_rate": 5.1110234676007004e-05, + "loss": 1.0504, + "num_input_tokens_seen": 112409600, + "step": 6986 + }, + { + "epoch": 0.48942709291024195, + "grad_norm": 3.8517558574676514, + "learning_rate": 5.1103236427320486e-05, + "loss": 1.0947, + "num_input_tokens_seen": 112425880, + "step": 6987 + }, + { + "epoch": 0.48949714115597115, + "grad_norm": 3.3155159950256348, + "learning_rate": 5.109623817863397e-05, + "loss": 0.9201, + "num_input_tokens_seen": 112442264, + "step": 6988 + }, + { + "epoch": 0.4895671894017004, + "grad_norm": 4.027132987976074, + "learning_rate": 5.108923992994746e-05, + "loss": 1.0882, + "num_input_tokens_seen": 112458504, + "step": 6989 + }, + { + "epoch": 0.48963723764742967, + "grad_norm": 3.622421979904175, + "learning_rate": 5.108224168126096e-05, + "loss": 0.9098, + "num_input_tokens_seen": 112474888, + "step": 6990 + }, + { + "epoch": 0.48970728589315893, + "grad_norm": 4.16541051864624, + "learning_rate": 5.107524343257444e-05, + "loss": 0.967, + "num_input_tokens_seen": 112491272, + "step": 6991 + }, + { + "epoch": 0.48977733413888813, + "grad_norm": 4.473822593688965, + "learning_rate": 5.106824518388792e-05, + "loss": 1.026, + "num_input_tokens_seen": 112506632, + "step": 6992 + }, + { + "epoch": 0.4898473823846174, + "grad_norm": 5.10452127456665, + "learning_rate": 5.106124693520141e-05, + "loss": 1.003, + "num_input_tokens_seen": 112521696, + "step": 6993 + }, + { + "epoch": 0.48991743063034665, + "grad_norm": 4.185652732849121, + "learning_rate": 5.105424868651489e-05, + "loss": 0.9275, + "num_input_tokens_seen": 112537432, + "step": 6994 + }, + { + "epoch": 0.4899874788760759, + "grad_norm": 4.864262580871582, + "learning_rate": 5.104725043782837e-05, + "loss": 1.033, + "num_input_tokens_seen": 112553816, + "step": 6995 + }, + { + "epoch": 0.49005752712180517, + "grad_norm": 3.859199047088623, + "learning_rate": 5.1040252189141856e-05, + "loss": 1.115, + "num_input_tokens_seen": 112570200, + "step": 6996 + }, + { + "epoch": 0.4901275753675344, + "grad_norm": 3.49395751953125, + "learning_rate": 5.103325394045535e-05, + "loss": 0.9285, + "num_input_tokens_seen": 112586584, + "step": 6997 + }, + { + "epoch": 0.49019762361326363, + "grad_norm": 4.164735317230225, + "learning_rate": 5.102625569176883e-05, + "loss": 0.8565, + "num_input_tokens_seen": 112602568, + "step": 6998 + }, + { + "epoch": 0.4902676718589929, + "grad_norm": 6.273041725158691, + "learning_rate": 5.1019257443082315e-05, + "loss": 1.0252, + "num_input_tokens_seen": 112618952, + "step": 6999 + }, + { + "epoch": 0.49033772010472215, + "grad_norm": 3.8460848331451416, + "learning_rate": 5.1012259194395804e-05, + "loss": 0.9854, + "num_input_tokens_seen": 112635336, + "step": 7000 + }, + { + "epoch": 0.49033772010472215, + "eval_loss": 1.1226829290390015, + "eval_runtime": 0.157, + "eval_samples_per_second": 6.371, + "eval_steps_per_second": 6.371, + "num_input_tokens_seen": 112635336, + "step": 7000 + }, + { + "epoch": 0.49040776835045136, + "grad_norm": 4.498118877410889, + "learning_rate": 5.1005260945709285e-05, + "loss": 1.2191, + "num_input_tokens_seen": 112651248, + "step": 7001 + }, + { + "epoch": 0.4904778165961806, + "grad_norm": 3.9830660820007324, + "learning_rate": 5.099826269702276e-05, + "loss": 0.9127, + "num_input_tokens_seen": 112667632, + "step": 7002 + }, + { + "epoch": 0.4905478648419099, + "grad_norm": 3.439422130584717, + "learning_rate": 5.099126444833626e-05, + "loss": 0.9423, + "num_input_tokens_seen": 112684016, + "step": 7003 + }, + { + "epoch": 0.49061791308763913, + "grad_norm": 3.6636171340942383, + "learning_rate": 5.0984266199649744e-05, + "loss": 1.0124, + "num_input_tokens_seen": 112700360, + "step": 7004 + }, + { + "epoch": 0.49068796133336834, + "grad_norm": 4.3784589767456055, + "learning_rate": 5.097726795096321e-05, + "loss": 1.1782, + "num_input_tokens_seen": 112715624, + "step": 7005 + }, + { + "epoch": 0.4907580095790976, + "grad_norm": 3.716031789779663, + "learning_rate": 5.097026970227671e-05, + "loss": 1.0312, + "num_input_tokens_seen": 112732008, + "step": 7006 + }, + { + "epoch": 0.49082805782482686, + "grad_norm": 4.289496898651123, + "learning_rate": 5.09632714535902e-05, + "loss": 1.103, + "num_input_tokens_seen": 112748392, + "step": 7007 + }, + { + "epoch": 0.4908981060705561, + "grad_norm": 4.012343883514404, + "learning_rate": 5.095627320490368e-05, + "loss": 1.0623, + "num_input_tokens_seen": 112764432, + "step": 7008 + }, + { + "epoch": 0.4909681543162853, + "grad_norm": 3.7599732875823975, + "learning_rate": 5.094927495621716e-05, + "loss": 1.0666, + "num_input_tokens_seen": 112780544, + "step": 7009 + }, + { + "epoch": 0.4910382025620146, + "grad_norm": 3.398778200149536, + "learning_rate": 5.0942276707530656e-05, + "loss": 1.0517, + "num_input_tokens_seen": 112796928, + "step": 7010 + }, + { + "epoch": 0.49110825080774384, + "grad_norm": 7.299741268157959, + "learning_rate": 5.093527845884414e-05, + "loss": 1.0963, + "num_input_tokens_seen": 112812576, + "step": 7011 + }, + { + "epoch": 0.4911782990534731, + "grad_norm": 4.2506866455078125, + "learning_rate": 5.0928280210157606e-05, + "loss": 1.11, + "num_input_tokens_seen": 112828184, + "step": 7012 + }, + { + "epoch": 0.4912483472992023, + "grad_norm": 4.033505916595459, + "learning_rate": 5.09212819614711e-05, + "loss": 1.1648, + "num_input_tokens_seen": 112844000, + "step": 7013 + }, + { + "epoch": 0.49131839554493156, + "grad_norm": 3.9474592208862305, + "learning_rate": 5.0914283712784596e-05, + "loss": 1.081, + "num_input_tokens_seen": 112860384, + "step": 7014 + }, + { + "epoch": 0.4913884437906608, + "grad_norm": 5.549149036407471, + "learning_rate": 5.090728546409808e-05, + "loss": 0.8816, + "num_input_tokens_seen": 112875512, + "step": 7015 + }, + { + "epoch": 0.4914584920363901, + "grad_norm": 5.086400985717773, + "learning_rate": 5.090028721541156e-05, + "loss": 1.2013, + "num_input_tokens_seen": 112890840, + "step": 7016 + }, + { + "epoch": 0.4915285402821193, + "grad_norm": 3.8358511924743652, + "learning_rate": 5.089328896672505e-05, + "loss": 0.9351, + "num_input_tokens_seen": 112907048, + "step": 7017 + }, + { + "epoch": 0.49159858852784855, + "grad_norm": 3.902388095855713, + "learning_rate": 5.088629071803853e-05, + "loss": 0.9513, + "num_input_tokens_seen": 112923152, + "step": 7018 + }, + { + "epoch": 0.4916686367735778, + "grad_norm": 4.3525166511535645, + "learning_rate": 5.0879292469352026e-05, + "loss": 0.9481, + "num_input_tokens_seen": 112939536, + "step": 7019 + }, + { + "epoch": 0.49173868501930706, + "grad_norm": 4.519508361816406, + "learning_rate": 5.087229422066551e-05, + "loss": 0.9127, + "num_input_tokens_seen": 112955920, + "step": 7020 + }, + { + "epoch": 0.49180873326503627, + "grad_norm": 4.366591930389404, + "learning_rate": 5.086529597197899e-05, + "loss": 1.1246, + "num_input_tokens_seen": 112971824, + "step": 7021 + }, + { + "epoch": 0.49187878151076553, + "grad_norm": 3.9145777225494385, + "learning_rate": 5.085829772329247e-05, + "loss": 1.085, + "num_input_tokens_seen": 112988208, + "step": 7022 + }, + { + "epoch": 0.4919488297564948, + "grad_norm": 3.9565582275390625, + "learning_rate": 5.085129947460595e-05, + "loss": 1.0299, + "num_input_tokens_seen": 113004592, + "step": 7023 + }, + { + "epoch": 0.49201887800222405, + "grad_norm": 4.051690101623535, + "learning_rate": 5.084430122591944e-05, + "loss": 1.0569, + "num_input_tokens_seen": 113020552, + "step": 7024 + }, + { + "epoch": 0.49208892624795325, + "grad_norm": 4.020756244659424, + "learning_rate": 5.0837302977232923e-05, + "loss": 0.9238, + "num_input_tokens_seen": 113036840, + "step": 7025 + }, + { + "epoch": 0.4921589744936825, + "grad_norm": 4.177811622619629, + "learning_rate": 5.0830304728546405e-05, + "loss": 1.147, + "num_input_tokens_seen": 113051816, + "step": 7026 + }, + { + "epoch": 0.49222902273941177, + "grad_norm": 3.997945785522461, + "learning_rate": 5.08233064798599e-05, + "loss": 0.9788, + "num_input_tokens_seen": 113067968, + "step": 7027 + }, + { + "epoch": 0.49229907098514103, + "grad_norm": 4.968790531158447, + "learning_rate": 5.081630823117338e-05, + "loss": 1.1171, + "num_input_tokens_seen": 113084352, + "step": 7028 + }, + { + "epoch": 0.4923691192308703, + "grad_norm": 3.9024367332458496, + "learning_rate": 5.0809309982486864e-05, + "loss": 0.9673, + "num_input_tokens_seen": 113100672, + "step": 7029 + }, + { + "epoch": 0.4924391674765995, + "grad_norm": 4.58246374130249, + "learning_rate": 5.080231173380036e-05, + "loss": 1.0995, + "num_input_tokens_seen": 113115856, + "step": 7030 + }, + { + "epoch": 0.49250921572232875, + "grad_norm": 4.944141387939453, + "learning_rate": 5.0795313485113835e-05, + "loss": 1.0369, + "num_input_tokens_seen": 113132240, + "step": 7031 + }, + { + "epoch": 0.492579263968058, + "grad_norm": 5.382607460021973, + "learning_rate": 5.0788315236427316e-05, + "loss": 1.1601, + "num_input_tokens_seen": 113148624, + "step": 7032 + }, + { + "epoch": 0.49264931221378727, + "grad_norm": 4.664593696594238, + "learning_rate": 5.07813169877408e-05, + "loss": 1.2741, + "num_input_tokens_seen": 113164608, + "step": 7033 + }, + { + "epoch": 0.4927193604595165, + "grad_norm": 3.8908638954162598, + "learning_rate": 5.077431873905431e-05, + "loss": 1.0828, + "num_input_tokens_seen": 113180712, + "step": 7034 + }, + { + "epoch": 0.49278940870524574, + "grad_norm": 3.93803334236145, + "learning_rate": 5.0767320490367775e-05, + "loss": 1.0555, + "num_input_tokens_seen": 113196328, + "step": 7035 + }, + { + "epoch": 0.492859456950975, + "grad_norm": 5.349659442901611, + "learning_rate": 5.076032224168127e-05, + "loss": 1.0171, + "num_input_tokens_seen": 113212712, + "step": 7036 + }, + { + "epoch": 0.49292950519670425, + "grad_norm": 5.471059322357178, + "learning_rate": 5.075332399299475e-05, + "loss": 1.0165, + "num_input_tokens_seen": 113229096, + "step": 7037 + }, + { + "epoch": 0.49299955344243346, + "grad_norm": 5.430948734283447, + "learning_rate": 5.0746325744308234e-05, + "loss": 1.1832, + "num_input_tokens_seen": 113244752, + "step": 7038 + }, + { + "epoch": 0.4930696016881627, + "grad_norm": 3.4861812591552734, + "learning_rate": 5.0739327495621716e-05, + "loss": 0.9185, + "num_input_tokens_seen": 113261136, + "step": 7039 + }, + { + "epoch": 0.493139649933892, + "grad_norm": 3.9184775352478027, + "learning_rate": 5.0732329246935205e-05, + "loss": 1.0508, + "num_input_tokens_seen": 113277520, + "step": 7040 + }, + { + "epoch": 0.49320969817962124, + "grad_norm": 3.6723365783691406, + "learning_rate": 5.072533099824869e-05, + "loss": 1.0803, + "num_input_tokens_seen": 113293208, + "step": 7041 + }, + { + "epoch": 0.49327974642535044, + "grad_norm": 4.273809432983398, + "learning_rate": 5.071833274956217e-05, + "loss": 1.2621, + "num_input_tokens_seen": 113309592, + "step": 7042 + }, + { + "epoch": 0.4933497946710797, + "grad_norm": 3.448326349258423, + "learning_rate": 5.0711334500875664e-05, + "loss": 1.0201, + "num_input_tokens_seen": 113325976, + "step": 7043 + }, + { + "epoch": 0.49341984291680896, + "grad_norm": 4.427138805389404, + "learning_rate": 5.070433625218915e-05, + "loss": 1.1113, + "num_input_tokens_seen": 113341896, + "step": 7044 + }, + { + "epoch": 0.4934898911625382, + "grad_norm": 3.837282180786133, + "learning_rate": 5.069733800350263e-05, + "loss": 1.0454, + "num_input_tokens_seen": 113358280, + "step": 7045 + }, + { + "epoch": 0.4935599394082674, + "grad_norm": 4.842933654785156, + "learning_rate": 5.0690339754816116e-05, + "loss": 1.3572, + "num_input_tokens_seen": 113374664, + "step": 7046 + }, + { + "epoch": 0.4936299876539967, + "grad_norm": 5.275210857391357, + "learning_rate": 5.06833415061296e-05, + "loss": 1.0208, + "num_input_tokens_seen": 113391048, + "step": 7047 + }, + { + "epoch": 0.49370003589972594, + "grad_norm": 4.120177268981934, + "learning_rate": 5.067634325744308e-05, + "loss": 1.1957, + "num_input_tokens_seen": 113407432, + "step": 7048 + }, + { + "epoch": 0.4937700841454552, + "grad_norm": 4.254641056060791, + "learning_rate": 5.066934500875656e-05, + "loss": 1.1381, + "num_input_tokens_seen": 113423392, + "step": 7049 + }, + { + "epoch": 0.4938401323911844, + "grad_norm": 4.007355690002441, + "learning_rate": 5.066234676007006e-05, + "loss": 0.9828, + "num_input_tokens_seen": 113439776, + "step": 7050 + }, + { + "epoch": 0.49391018063691366, + "grad_norm": 3.8774940967559814, + "learning_rate": 5.065534851138355e-05, + "loss": 0.8772, + "num_input_tokens_seen": 113456160, + "step": 7051 + }, + { + "epoch": 0.4939802288826429, + "grad_norm": 3.6799323558807373, + "learning_rate": 5.064835026269702e-05, + "loss": 1.0428, + "num_input_tokens_seen": 113472408, + "step": 7052 + }, + { + "epoch": 0.4940502771283722, + "grad_norm": 3.662111282348633, + "learning_rate": 5.064135201401051e-05, + "loss": 1.0102, + "num_input_tokens_seen": 113488792, + "step": 7053 + }, + { + "epoch": 0.4941203253741014, + "grad_norm": 4.95071268081665, + "learning_rate": 5.0634353765324004e-05, + "loss": 0.9709, + "num_input_tokens_seen": 113505176, + "step": 7054 + }, + { + "epoch": 0.49419037361983065, + "grad_norm": 4.512982368469238, + "learning_rate": 5.062735551663747e-05, + "loss": 1.0955, + "num_input_tokens_seen": 113520640, + "step": 7055 + }, + { + "epoch": 0.4942604218655599, + "grad_norm": 3.6250205039978027, + "learning_rate": 5.0620357267950955e-05, + "loss": 0.9812, + "num_input_tokens_seen": 113535616, + "step": 7056 + }, + { + "epoch": 0.49433047011128917, + "grad_norm": 3.856593132019043, + "learning_rate": 5.0613359019264463e-05, + "loss": 0.9007, + "num_input_tokens_seen": 113552000, + "step": 7057 + }, + { + "epoch": 0.49440051835701837, + "grad_norm": 3.655444860458374, + "learning_rate": 5.0606360770577945e-05, + "loss": 1.1066, + "num_input_tokens_seen": 113568200, + "step": 7058 + }, + { + "epoch": 0.49447056660274763, + "grad_norm": 4.795759677886963, + "learning_rate": 5.0599362521891414e-05, + "loss": 0.9739, + "num_input_tokens_seen": 113584584, + "step": 7059 + }, + { + "epoch": 0.4945406148484769, + "grad_norm": 4.4534783363342285, + "learning_rate": 5.059236427320491e-05, + "loss": 1.0207, + "num_input_tokens_seen": 113600968, + "step": 7060 + }, + { + "epoch": 0.49461066309420615, + "grad_norm": 4.681578159332275, + "learning_rate": 5.05853660245184e-05, + "loss": 1.3153, + "num_input_tokens_seen": 113617352, + "step": 7061 + }, + { + "epoch": 0.49468071133993535, + "grad_norm": 3.6734678745269775, + "learning_rate": 5.057836777583187e-05, + "loss": 1.0005, + "num_input_tokens_seen": 113633736, + "step": 7062 + }, + { + "epoch": 0.4947507595856646, + "grad_norm": 3.7208728790283203, + "learning_rate": 5.057136952714536e-05, + "loss": 1.0641, + "num_input_tokens_seen": 113649224, + "step": 7063 + }, + { + "epoch": 0.49482080783139387, + "grad_norm": 4.036924362182617, + "learning_rate": 5.0564371278458856e-05, + "loss": 1.0842, + "num_input_tokens_seen": 113665552, + "step": 7064 + }, + { + "epoch": 0.49489085607712313, + "grad_norm": 6.462393760681152, + "learning_rate": 5.055737302977234e-05, + "loss": 1.0341, + "num_input_tokens_seen": 113681192, + "step": 7065 + }, + { + "epoch": 0.4949609043228524, + "grad_norm": 4.203556537628174, + "learning_rate": 5.055037478108582e-05, + "loss": 1.3181, + "num_input_tokens_seen": 113697168, + "step": 7066 + }, + { + "epoch": 0.4950309525685816, + "grad_norm": 3.798896551132202, + "learning_rate": 5.05433765323993e-05, + "loss": 1.1016, + "num_input_tokens_seen": 113713504, + "step": 7067 + }, + { + "epoch": 0.49510100081431085, + "grad_norm": 4.175333499908447, + "learning_rate": 5.053637828371279e-05, + "loss": 1.0572, + "num_input_tokens_seen": 113729872, + "step": 7068 + }, + { + "epoch": 0.4951710490600401, + "grad_norm": 3.563164234161377, + "learning_rate": 5.0529380035026266e-05, + "loss": 0.849, + "num_input_tokens_seen": 113745736, + "step": 7069 + }, + { + "epoch": 0.4952410973057694, + "grad_norm": 3.605379104614258, + "learning_rate": 5.0522381786339754e-05, + "loss": 1.0291, + "num_input_tokens_seen": 113760816, + "step": 7070 + }, + { + "epoch": 0.4953111455514986, + "grad_norm": 3.849106550216675, + "learning_rate": 5.051538353765325e-05, + "loss": 0.8297, + "num_input_tokens_seen": 113776600, + "step": 7071 + }, + { + "epoch": 0.49538119379722784, + "grad_norm": 4.046478748321533, + "learning_rate": 5.050838528896672e-05, + "loss": 0.9624, + "num_input_tokens_seen": 113792984, + "step": 7072 + }, + { + "epoch": 0.4954512420429571, + "grad_norm": 4.66940450668335, + "learning_rate": 5.050138704028021e-05, + "loss": 1.1873, + "num_input_tokens_seen": 113809072, + "step": 7073 + }, + { + "epoch": 0.49552129028868636, + "grad_norm": 5.5129075050354, + "learning_rate": 5.049438879159371e-05, + "loss": 1.0224, + "num_input_tokens_seen": 113825456, + "step": 7074 + }, + { + "epoch": 0.49559133853441556, + "grad_norm": 4.045241355895996, + "learning_rate": 5.048739054290719e-05, + "loss": 1.0762, + "num_input_tokens_seen": 113841840, + "step": 7075 + }, + { + "epoch": 0.4956613867801448, + "grad_norm": 3.9198641777038574, + "learning_rate": 5.0480392294220665e-05, + "loss": 1.1765, + "num_input_tokens_seen": 113857624, + "step": 7076 + }, + { + "epoch": 0.4957314350258741, + "grad_norm": 3.836678981781006, + "learning_rate": 5.047339404553415e-05, + "loss": 1.1377, + "num_input_tokens_seen": 113874008, + "step": 7077 + }, + { + "epoch": 0.49580148327160334, + "grad_norm": 3.593061923980713, + "learning_rate": 5.046639579684764e-05, + "loss": 1.0115, + "num_input_tokens_seen": 113889992, + "step": 7078 + }, + { + "epoch": 0.49587153151733254, + "grad_norm": 5.477400302886963, + "learning_rate": 5.045939754816111e-05, + "loss": 1.1446, + "num_input_tokens_seen": 113906376, + "step": 7079 + }, + { + "epoch": 0.4959415797630618, + "grad_norm": 5.204897880554199, + "learning_rate": 5.045239929947462e-05, + "loss": 1.2356, + "num_input_tokens_seen": 113921792, + "step": 7080 + }, + { + "epoch": 0.49601162800879106, + "grad_norm": 6.132393836975098, + "learning_rate": 5.04454010507881e-05, + "loss": 1.0126, + "num_input_tokens_seen": 113937720, + "step": 7081 + }, + { + "epoch": 0.4960816762545203, + "grad_norm": 3.651715040206909, + "learning_rate": 5.043840280210158e-05, + "loss": 0.948, + "num_input_tokens_seen": 113953016, + "step": 7082 + }, + { + "epoch": 0.4961517245002495, + "grad_norm": 4.28763484954834, + "learning_rate": 5.0431404553415065e-05, + "loss": 0.9985, + "num_input_tokens_seen": 113969400, + "step": 7083 + }, + { + "epoch": 0.4962217727459788, + "grad_norm": 7.6505208015441895, + "learning_rate": 5.0424406304728554e-05, + "loss": 1.2593, + "num_input_tokens_seen": 113985784, + "step": 7084 + }, + { + "epoch": 0.49629182099170804, + "grad_norm": 4.85219144821167, + "learning_rate": 5.0417408056042036e-05, + "loss": 0.9904, + "num_input_tokens_seen": 114001592, + "step": 7085 + }, + { + "epoch": 0.4963618692374373, + "grad_norm": 3.414391040802002, + "learning_rate": 5.041040980735552e-05, + "loss": 1.088, + "num_input_tokens_seen": 114017976, + "step": 7086 + }, + { + "epoch": 0.4964319174831665, + "grad_norm": 4.361126899719238, + "learning_rate": 5.040341155866901e-05, + "loss": 1.0559, + "num_input_tokens_seen": 114034360, + "step": 7087 + }, + { + "epoch": 0.49650196572889577, + "grad_norm": 3.459439754486084, + "learning_rate": 5.0396413309982495e-05, + "loss": 0.8409, + "num_input_tokens_seen": 114050744, + "step": 7088 + }, + { + "epoch": 0.496572013974625, + "grad_norm": 4.241810321807861, + "learning_rate": 5.0389415061295976e-05, + "loss": 1.1019, + "num_input_tokens_seen": 114066776, + "step": 7089 + }, + { + "epoch": 0.4966420622203543, + "grad_norm": 4.012382984161377, + "learning_rate": 5.0382416812609465e-05, + "loss": 1.1166, + "num_input_tokens_seen": 114082888, + "step": 7090 + }, + { + "epoch": 0.4967121104660835, + "grad_norm": 3.8776516914367676, + "learning_rate": 5.037541856392295e-05, + "loss": 1.0333, + "num_input_tokens_seen": 114099072, + "step": 7091 + }, + { + "epoch": 0.49678215871181275, + "grad_norm": 4.0513014793396, + "learning_rate": 5.036842031523643e-05, + "loss": 0.9747, + "num_input_tokens_seen": 114115296, + "step": 7092 + }, + { + "epoch": 0.496852206957542, + "grad_norm": 3.7338500022888184, + "learning_rate": 5.036142206654991e-05, + "loss": 1.1254, + "num_input_tokens_seen": 114131680, + "step": 7093 + }, + { + "epoch": 0.49692225520327127, + "grad_norm": 5.892488956451416, + "learning_rate": 5.035442381786339e-05, + "loss": 1.0316, + "num_input_tokens_seen": 114146560, + "step": 7094 + }, + { + "epoch": 0.49699230344900047, + "grad_norm": 5.1975507736206055, + "learning_rate": 5.034742556917689e-05, + "loss": 1.2128, + "num_input_tokens_seen": 114162944, + "step": 7095 + }, + { + "epoch": 0.49706235169472973, + "grad_norm": 4.196847438812256, + "learning_rate": 5.034042732049037e-05, + "loss": 1.0168, + "num_input_tokens_seen": 114178040, + "step": 7096 + }, + { + "epoch": 0.497132399940459, + "grad_norm": 4.342573642730713, + "learning_rate": 5.0333429071803865e-05, + "loss": 0.9601, + "num_input_tokens_seen": 114194424, + "step": 7097 + }, + { + "epoch": 0.49720244818618825, + "grad_norm": 4.113316059112549, + "learning_rate": 5.032643082311734e-05, + "loss": 1.0902, + "num_input_tokens_seen": 114210808, + "step": 7098 + }, + { + "epoch": 0.4972724964319175, + "grad_norm": 4.835622787475586, + "learning_rate": 5.031943257443082e-05, + "loss": 1.1025, + "num_input_tokens_seen": 114225248, + "step": 7099 + }, + { + "epoch": 0.4973425446776467, + "grad_norm": 4.603962421417236, + "learning_rate": 5.031243432574432e-05, + "loss": 1.1335, + "num_input_tokens_seen": 114241632, + "step": 7100 + }, + { + "epoch": 0.497412592923376, + "grad_norm": 4.17899227142334, + "learning_rate": 5.030543607705781e-05, + "loss": 1.0766, + "num_input_tokens_seen": 114256688, + "step": 7101 + }, + { + "epoch": 0.49748264116910523, + "grad_norm": 3.890780448913574, + "learning_rate": 5.029843782837128e-05, + "loss": 1.089, + "num_input_tokens_seen": 114273072, + "step": 7102 + }, + { + "epoch": 0.4975526894148345, + "grad_norm": 4.290158748626709, + "learning_rate": 5.029143957968476e-05, + "loss": 0.9706, + "num_input_tokens_seen": 114288936, + "step": 7103 + }, + { + "epoch": 0.4976227376605637, + "grad_norm": 5.222672462463379, + "learning_rate": 5.028444133099826e-05, + "loss": 1.0974, + "num_input_tokens_seen": 114305320, + "step": 7104 + }, + { + "epoch": 0.49769278590629296, + "grad_norm": 3.383232355117798, + "learning_rate": 5.027744308231174e-05, + "loss": 1.0279, + "num_input_tokens_seen": 114321704, + "step": 7105 + }, + { + "epoch": 0.4977628341520222, + "grad_norm": 3.8526852130889893, + "learning_rate": 5.027044483362522e-05, + "loss": 1.0064, + "num_input_tokens_seen": 114337920, + "step": 7106 + }, + { + "epoch": 0.4978328823977515, + "grad_norm": 3.699127674102783, + "learning_rate": 5.026344658493871e-05, + "loss": 1.1549, + "num_input_tokens_seen": 114354304, + "step": 7107 + }, + { + "epoch": 0.4979029306434807, + "grad_norm": 3.3088033199310303, + "learning_rate": 5.025644833625219e-05, + "loss": 0.9093, + "num_input_tokens_seen": 114370688, + "step": 7108 + }, + { + "epoch": 0.49797297888920994, + "grad_norm": 4.435497760772705, + "learning_rate": 5.0249450087565674e-05, + "loss": 1.0656, + "num_input_tokens_seen": 114386664, + "step": 7109 + }, + { + "epoch": 0.4980430271349392, + "grad_norm": 4.2929840087890625, + "learning_rate": 5.0242451838879155e-05, + "loss": 0.9749, + "num_input_tokens_seen": 114401816, + "step": 7110 + }, + { + "epoch": 0.49811307538066846, + "grad_norm": 4.264016628265381, + "learning_rate": 5.023545359019266e-05, + "loss": 1.008, + "num_input_tokens_seen": 114418200, + "step": 7111 + }, + { + "epoch": 0.49818312362639766, + "grad_norm": 3.5081541538238525, + "learning_rate": 5.022845534150613e-05, + "loss": 0.8752, + "num_input_tokens_seen": 114434424, + "step": 7112 + }, + { + "epoch": 0.4982531718721269, + "grad_norm": 5.671893119812012, + "learning_rate": 5.0221457092819614e-05, + "loss": 1.0707, + "num_input_tokens_seen": 114449560, + "step": 7113 + }, + { + "epoch": 0.4983232201178562, + "grad_norm": 4.350570201873779, + "learning_rate": 5.02144588441331e-05, + "loss": 1.105, + "num_input_tokens_seen": 114465504, + "step": 7114 + }, + { + "epoch": 0.49839326836358544, + "grad_norm": 3.650238513946533, + "learning_rate": 5.0207460595446585e-05, + "loss": 1.1202, + "num_input_tokens_seen": 114481760, + "step": 7115 + }, + { + "epoch": 0.49846331660931464, + "grad_norm": 4.211227893829346, + "learning_rate": 5.020046234676007e-05, + "loss": 1.228, + "num_input_tokens_seen": 114496696, + "step": 7116 + }, + { + "epoch": 0.4985333648550439, + "grad_norm": 3.561427354812622, + "learning_rate": 5.019346409807356e-05, + "loss": 0.8941, + "num_input_tokens_seen": 114512616, + "step": 7117 + }, + { + "epoch": 0.49860341310077316, + "grad_norm": 4.558845520019531, + "learning_rate": 5.018646584938706e-05, + "loss": 1.1921, + "num_input_tokens_seen": 114527600, + "step": 7118 + }, + { + "epoch": 0.4986734613465024, + "grad_norm": 3.419285297393799, + "learning_rate": 5.0179467600700526e-05, + "loss": 1.044, + "num_input_tokens_seen": 114543984, + "step": 7119 + }, + { + "epoch": 0.4987435095922316, + "grad_norm": 3.844834566116333, + "learning_rate": 5.017246935201401e-05, + "loss": 1.2369, + "num_input_tokens_seen": 114559728, + "step": 7120 + }, + { + "epoch": 0.4988135578379609, + "grad_norm": 4.457134246826172, + "learning_rate": 5.0165471103327496e-05, + "loss": 1.061, + "num_input_tokens_seen": 114575688, + "step": 7121 + }, + { + "epoch": 0.49888360608369015, + "grad_norm": 4.241283893585205, + "learning_rate": 5.015847285464098e-05, + "loss": 1.1434, + "num_input_tokens_seen": 114590680, + "step": 7122 + }, + { + "epoch": 0.4989536543294194, + "grad_norm": 3.7781248092651367, + "learning_rate": 5.015147460595446e-05, + "loss": 0.9872, + "num_input_tokens_seen": 114606960, + "step": 7123 + }, + { + "epoch": 0.4990237025751486, + "grad_norm": 5.492437839508057, + "learning_rate": 5.0144476357267955e-05, + "loss": 1.0772, + "num_input_tokens_seen": 114623344, + "step": 7124 + }, + { + "epoch": 0.49909375082087787, + "grad_norm": 5.001891613006592, + "learning_rate": 5.013747810858145e-05, + "loss": 1.1999, + "num_input_tokens_seen": 114639728, + "step": 7125 + }, + { + "epoch": 0.49916379906660713, + "grad_norm": 3.78376841545105, + "learning_rate": 5.013047985989493e-05, + "loss": 1.1275, + "num_input_tokens_seen": 114655984, + "step": 7126 + }, + { + "epoch": 0.4992338473123364, + "grad_norm": 5.250494956970215, + "learning_rate": 5.0123481611208414e-05, + "loss": 1.1063, + "num_input_tokens_seen": 114670912, + "step": 7127 + }, + { + "epoch": 0.4993038955580656, + "grad_norm": 3.8290820121765137, + "learning_rate": 5.01164833625219e-05, + "loss": 0.9789, + "num_input_tokens_seen": 114687240, + "step": 7128 + }, + { + "epoch": 0.49937394380379485, + "grad_norm": 5.523165225982666, + "learning_rate": 5.010948511383538e-05, + "loss": 0.977, + "num_input_tokens_seen": 114703616, + "step": 7129 + }, + { + "epoch": 0.4994439920495241, + "grad_norm": 3.838224172592163, + "learning_rate": 5.010248686514886e-05, + "loss": 0.9461, + "num_input_tokens_seen": 114720000, + "step": 7130 + }, + { + "epoch": 0.49951404029525337, + "grad_norm": 3.751004457473755, + "learning_rate": 5.009548861646235e-05, + "loss": 1.0345, + "num_input_tokens_seen": 114735624, + "step": 7131 + }, + { + "epoch": 0.49958408854098263, + "grad_norm": 4.485782146453857, + "learning_rate": 5.0088490367775843e-05, + "loss": 1.2378, + "num_input_tokens_seen": 114751040, + "step": 7132 + }, + { + "epoch": 0.49965413678671183, + "grad_norm": 4.896092891693115, + "learning_rate": 5.0081492119089325e-05, + "loss": 1.1213, + "num_input_tokens_seen": 114766096, + "step": 7133 + }, + { + "epoch": 0.4997241850324411, + "grad_norm": 4.27908182144165, + "learning_rate": 5.007449387040281e-05, + "loss": 0.9422, + "num_input_tokens_seen": 114782480, + "step": 7134 + }, + { + "epoch": 0.49979423327817035, + "grad_norm": 5.309985160827637, + "learning_rate": 5.0067495621716296e-05, + "loss": 1.1707, + "num_input_tokens_seen": 114798864, + "step": 7135 + }, + { + "epoch": 0.4998642815238996, + "grad_norm": 3.838355302810669, + "learning_rate": 5.006049737302978e-05, + "loss": 1.243, + "num_input_tokens_seen": 114814680, + "step": 7136 + }, + { + "epoch": 0.4999343297696288, + "grad_norm": 3.9620189666748047, + "learning_rate": 5.005349912434325e-05, + "loss": 1.025, + "num_input_tokens_seen": 114831064, + "step": 7137 + }, + { + "epoch": 0.5000043780153581, + "grad_norm": 3.4240174293518066, + "learning_rate": 5.004650087565674e-05, + "loss": 1.0205, + "num_input_tokens_seen": 114847448, + "step": 7138 + }, + { + "epoch": 0.5000744262610873, + "grad_norm": 3.603026866912842, + "learning_rate": 5.003950262697022e-05, + "loss": 1.0196, + "num_input_tokens_seen": 114863832, + "step": 7139 + }, + { + "epoch": 0.5001444745068165, + "grad_norm": 4.349592208862305, + "learning_rate": 5.0032504378283705e-05, + "loss": 1.1059, + "num_input_tokens_seen": 114879200, + "step": 7140 + }, + { + "epoch": 0.5002145227525459, + "grad_norm": 5.716104984283447, + "learning_rate": 5.00255061295972e-05, + "loss": 1.0465, + "num_input_tokens_seen": 114894880, + "step": 7141 + }, + { + "epoch": 0.5002845709982751, + "grad_norm": 3.857797384262085, + "learning_rate": 5.0018507880910695e-05, + "loss": 1.0438, + "num_input_tokens_seen": 114911264, + "step": 7142 + }, + { + "epoch": 0.5003546192440043, + "grad_norm": 3.7292556762695312, + "learning_rate": 5.001150963222417e-05, + "loss": 1.0803, + "num_input_tokens_seen": 114926792, + "step": 7143 + }, + { + "epoch": 0.5004246674897336, + "grad_norm": 4.02719783782959, + "learning_rate": 5.000451138353765e-05, + "loss": 0.9635, + "num_input_tokens_seen": 114942944, + "step": 7144 + }, + { + "epoch": 0.5004947157354628, + "grad_norm": 5.39168119430542, + "learning_rate": 4.9997513134851134e-05, + "loss": 1.0898, + "num_input_tokens_seen": 114958800, + "step": 7145 + }, + { + "epoch": 0.5005647639811921, + "grad_norm": 4.773622512817383, + "learning_rate": 4.999051488616463e-05, + "loss": 1.1112, + "num_input_tokens_seen": 114974664, + "step": 7146 + }, + { + "epoch": 0.5006348122269213, + "grad_norm": 3.635557174682617, + "learning_rate": 4.998351663747812e-05, + "loss": 0.9355, + "num_input_tokens_seen": 114990728, + "step": 7147 + }, + { + "epoch": 0.5007048604726505, + "grad_norm": 4.165726661682129, + "learning_rate": 4.997651838879159e-05, + "loss": 1.1044, + "num_input_tokens_seen": 115006552, + "step": 7148 + }, + { + "epoch": 0.5007749087183798, + "grad_norm": 4.2835001945495605, + "learning_rate": 4.996952014010508e-05, + "loss": 0.937, + "num_input_tokens_seen": 115022936, + "step": 7149 + }, + { + "epoch": 0.500844956964109, + "grad_norm": 3.7588231563568115, + "learning_rate": 4.9962521891418564e-05, + "loss": 1.0578, + "num_input_tokens_seen": 115038832, + "step": 7150 + }, + { + "epoch": 0.5009150052098382, + "grad_norm": 4.017446041107178, + "learning_rate": 4.995552364273205e-05, + "loss": 1.0177, + "num_input_tokens_seen": 115055216, + "step": 7151 + }, + { + "epoch": 0.5009850534555675, + "grad_norm": 4.145601749420166, + "learning_rate": 4.994852539404554e-05, + "loss": 1.1925, + "num_input_tokens_seen": 115071600, + "step": 7152 + }, + { + "epoch": 0.5010551017012967, + "grad_norm": 4.027134895324707, + "learning_rate": 4.994152714535902e-05, + "loss": 1.0115, + "num_input_tokens_seen": 115087504, + "step": 7153 + }, + { + "epoch": 0.5011251499470261, + "grad_norm": 4.185591697692871, + "learning_rate": 4.993452889667251e-05, + "loss": 1.012, + "num_input_tokens_seen": 115102936, + "step": 7154 + }, + { + "epoch": 0.5011951981927553, + "grad_norm": 3.262739658355713, + "learning_rate": 4.9927530647985986e-05, + "loss": 0.9424, + "num_input_tokens_seen": 115119320, + "step": 7155 + }, + { + "epoch": 0.5012652464384845, + "grad_norm": 3.514493465423584, + "learning_rate": 4.992053239929948e-05, + "loss": 0.9523, + "num_input_tokens_seen": 115135704, + "step": 7156 + }, + { + "epoch": 0.5013352946842138, + "grad_norm": 3.2577719688415527, + "learning_rate": 4.991353415061297e-05, + "loss": 0.9603, + "num_input_tokens_seen": 115152088, + "step": 7157 + }, + { + "epoch": 0.501405342929943, + "grad_norm": 4.475879669189453, + "learning_rate": 4.9906535901926445e-05, + "loss": 1.1393, + "num_input_tokens_seen": 115168472, + "step": 7158 + }, + { + "epoch": 0.5014753911756722, + "grad_norm": 4.558653354644775, + "learning_rate": 4.9899537653239934e-05, + "loss": 0.9628, + "num_input_tokens_seen": 115184136, + "step": 7159 + }, + { + "epoch": 0.5015454394214015, + "grad_norm": 4.034858703613281, + "learning_rate": 4.9892539404553416e-05, + "loss": 1.2669, + "num_input_tokens_seen": 115200520, + "step": 7160 + }, + { + "epoch": 0.5016154876671307, + "grad_norm": 4.190174579620361, + "learning_rate": 4.9885541155866904e-05, + "loss": 1.0478, + "num_input_tokens_seen": 115216904, + "step": 7161 + }, + { + "epoch": 0.50168553591286, + "grad_norm": 4.808748245239258, + "learning_rate": 4.987854290718039e-05, + "loss": 1.2661, + "num_input_tokens_seen": 115232864, + "step": 7162 + }, + { + "epoch": 0.5017555841585892, + "grad_norm": 3.7075023651123047, + "learning_rate": 4.9871544658493875e-05, + "loss": 1.1328, + "num_input_tokens_seen": 115248544, + "step": 7163 + }, + { + "epoch": 0.5018256324043184, + "grad_norm": 3.6593689918518066, + "learning_rate": 4.986454640980736e-05, + "loss": 1.0653, + "num_input_tokens_seen": 115264616, + "step": 7164 + }, + { + "epoch": 0.5018956806500477, + "grad_norm": 3.959949493408203, + "learning_rate": 4.985754816112084e-05, + "loss": 1.0708, + "num_input_tokens_seen": 115281000, + "step": 7165 + }, + { + "epoch": 0.501965728895777, + "grad_norm": 3.6724140644073486, + "learning_rate": 4.9850549912434334e-05, + "loss": 1.1306, + "num_input_tokens_seen": 115297384, + "step": 7166 + }, + { + "epoch": 0.5020357771415063, + "grad_norm": 3.9350247383117676, + "learning_rate": 4.984355166374781e-05, + "loss": 1.1785, + "num_input_tokens_seen": 115312760, + "step": 7167 + }, + { + "epoch": 0.5021058253872355, + "grad_norm": 3.8056607246398926, + "learning_rate": 4.98365534150613e-05, + "loss": 0.8915, + "num_input_tokens_seen": 115328336, + "step": 7168 + }, + { + "epoch": 0.5021758736329647, + "grad_norm": 3.995048761367798, + "learning_rate": 4.9829555166374786e-05, + "loss": 1.0687, + "num_input_tokens_seen": 115344016, + "step": 7169 + }, + { + "epoch": 0.502245921878694, + "grad_norm": 4.534327983856201, + "learning_rate": 4.982255691768827e-05, + "loss": 0.9757, + "num_input_tokens_seen": 115359976, + "step": 7170 + }, + { + "epoch": 0.5023159701244232, + "grad_norm": 5.29775333404541, + "learning_rate": 4.9815558669001756e-05, + "loss": 0.9172, + "num_input_tokens_seen": 115375480, + "step": 7171 + }, + { + "epoch": 0.5023860183701524, + "grad_norm": 3.8773534297943115, + "learning_rate": 4.980856042031524e-05, + "loss": 1.0997, + "num_input_tokens_seen": 115391632, + "step": 7172 + }, + { + "epoch": 0.5024560666158817, + "grad_norm": 4.249567985534668, + "learning_rate": 4.9801562171628727e-05, + "loss": 1.1145, + "num_input_tokens_seen": 115408016, + "step": 7173 + }, + { + "epoch": 0.5025261148616109, + "grad_norm": 4.293243408203125, + "learning_rate": 4.9794563922942215e-05, + "loss": 1.4234, + "num_input_tokens_seen": 115423808, + "step": 7174 + }, + { + "epoch": 0.5025961631073402, + "grad_norm": 4.535524845123291, + "learning_rate": 4.978756567425569e-05, + "loss": 0.9403, + "num_input_tokens_seen": 115440192, + "step": 7175 + }, + { + "epoch": 0.5026662113530694, + "grad_norm": 4.3390631675720215, + "learning_rate": 4.9780567425569186e-05, + "loss": 1.3728, + "num_input_tokens_seen": 115455416, + "step": 7176 + }, + { + "epoch": 0.5027362595987986, + "grad_norm": 3.630815267562866, + "learning_rate": 4.977356917688266e-05, + "loss": 1.0493, + "num_input_tokens_seen": 115471800, + "step": 7177 + }, + { + "epoch": 0.502806307844528, + "grad_norm": 3.9146728515625, + "learning_rate": 4.976657092819615e-05, + "loss": 1.0058, + "num_input_tokens_seen": 115488184, + "step": 7178 + }, + { + "epoch": 0.5028763560902572, + "grad_norm": 4.978190898895264, + "learning_rate": 4.975957267950964e-05, + "loss": 1.0014, + "num_input_tokens_seen": 115503792, + "step": 7179 + }, + { + "epoch": 0.5029464043359864, + "grad_norm": 3.8975963592529297, + "learning_rate": 4.975257443082312e-05, + "loss": 1.1317, + "num_input_tokens_seen": 115519576, + "step": 7180 + }, + { + "epoch": 0.5030164525817157, + "grad_norm": 4.439699649810791, + "learning_rate": 4.974557618213661e-05, + "loss": 1.0718, + "num_input_tokens_seen": 115535512, + "step": 7181 + }, + { + "epoch": 0.5030865008274449, + "grad_norm": 4.4080610275268555, + "learning_rate": 4.973857793345009e-05, + "loss": 1.3639, + "num_input_tokens_seen": 115551880, + "step": 7182 + }, + { + "epoch": 0.5031565490731742, + "grad_norm": 3.8968825340270996, + "learning_rate": 4.973157968476358e-05, + "loss": 1.1686, + "num_input_tokens_seen": 115568136, + "step": 7183 + }, + { + "epoch": 0.5032265973189034, + "grad_norm": 4.030379295349121, + "learning_rate": 4.972458143607707e-05, + "loss": 0.9829, + "num_input_tokens_seen": 115583928, + "step": 7184 + }, + { + "epoch": 0.5032966455646326, + "grad_norm": 4.46726131439209, + "learning_rate": 4.971758318739054e-05, + "loss": 1.1789, + "num_input_tokens_seen": 115600016, + "step": 7185 + }, + { + "epoch": 0.5033666938103619, + "grad_norm": 4.490327835083008, + "learning_rate": 4.971058493870404e-05, + "loss": 0.968, + "num_input_tokens_seen": 115616400, + "step": 7186 + }, + { + "epoch": 0.5034367420560911, + "grad_norm": 5.678159713745117, + "learning_rate": 4.970358669001751e-05, + "loss": 1.2517, + "num_input_tokens_seen": 115632424, + "step": 7187 + }, + { + "epoch": 0.5035067903018203, + "grad_norm": 4.695899963378906, + "learning_rate": 4.9696588441331e-05, + "loss": 1.2681, + "num_input_tokens_seen": 115648808, + "step": 7188 + }, + { + "epoch": 0.5035768385475496, + "grad_norm": 3.6823155879974365, + "learning_rate": 4.968959019264449e-05, + "loss": 1.0642, + "num_input_tokens_seen": 115665192, + "step": 7189 + }, + { + "epoch": 0.5036468867932788, + "grad_norm": 5.105508804321289, + "learning_rate": 4.968259194395797e-05, + "loss": 1.1646, + "num_input_tokens_seen": 115681472, + "step": 7190 + }, + { + "epoch": 0.5037169350390082, + "grad_norm": 4.0591607093811035, + "learning_rate": 4.967559369527146e-05, + "loss": 1.1231, + "num_input_tokens_seen": 115697248, + "step": 7191 + }, + { + "epoch": 0.5037869832847374, + "grad_norm": 4.097674369812012, + "learning_rate": 4.966859544658494e-05, + "loss": 0.9121, + "num_input_tokens_seen": 115713632, + "step": 7192 + }, + { + "epoch": 0.5038570315304666, + "grad_norm": 3.711235523223877, + "learning_rate": 4.966159719789843e-05, + "loss": 0.9402, + "num_input_tokens_seen": 115730016, + "step": 7193 + }, + { + "epoch": 0.5039270797761959, + "grad_norm": 3.9073588848114014, + "learning_rate": 4.9654598949211906e-05, + "loss": 1.0806, + "num_input_tokens_seen": 115745872, + "step": 7194 + }, + { + "epoch": 0.5039971280219251, + "grad_norm": 3.230870008468628, + "learning_rate": 4.9647600700525394e-05, + "loss": 0.8976, + "num_input_tokens_seen": 115762256, + "step": 7195 + }, + { + "epoch": 0.5040671762676543, + "grad_norm": 4.253819942474365, + "learning_rate": 4.964060245183889e-05, + "loss": 1.1024, + "num_input_tokens_seen": 115778640, + "step": 7196 + }, + { + "epoch": 0.5041372245133836, + "grad_norm": 3.6932590007781982, + "learning_rate": 4.9633604203152365e-05, + "loss": 1.1522, + "num_input_tokens_seen": 115795024, + "step": 7197 + }, + { + "epoch": 0.5042072727591128, + "grad_norm": 4.178073883056641, + "learning_rate": 4.962660595446585e-05, + "loss": 1.1629, + "num_input_tokens_seen": 115811408, + "step": 7198 + }, + { + "epoch": 0.5042773210048421, + "grad_norm": 3.4744091033935547, + "learning_rate": 4.9619607705779335e-05, + "loss": 0.9078, + "num_input_tokens_seen": 115827792, + "step": 7199 + }, + { + "epoch": 0.5043473692505713, + "grad_norm": 5.810272216796875, + "learning_rate": 4.9612609457092824e-05, + "loss": 1.0187, + "num_input_tokens_seen": 115843352, + "step": 7200 + }, + { + "epoch": 0.5043473692505713, + "eval_loss": 1.1233556270599365, + "eval_runtime": 0.159, + "eval_samples_per_second": 6.288, + "eval_steps_per_second": 6.288, + "num_input_tokens_seen": 115843352, + "step": 7200 + }, + { + "epoch": 0.5044174174963005, + "grad_norm": 4.0738205909729, + "learning_rate": 4.960561120840631e-05, + "loss": 1.2362, + "num_input_tokens_seen": 115859216, + "step": 7201 + }, + { + "epoch": 0.5044874657420299, + "grad_norm": 4.0072102546691895, + "learning_rate": 4.9598612959719794e-05, + "loss": 1.1624, + "num_input_tokens_seen": 115875600, + "step": 7202 + }, + { + "epoch": 0.504557513987759, + "grad_norm": 5.232552528381348, + "learning_rate": 4.959161471103328e-05, + "loss": 0.9148, + "num_input_tokens_seen": 115890976, + "step": 7203 + }, + { + "epoch": 0.5046275622334884, + "grad_norm": 4.6930623054504395, + "learning_rate": 4.958461646234676e-05, + "loss": 1.0314, + "num_input_tokens_seen": 115907360, + "step": 7204 + }, + { + "epoch": 0.5046976104792176, + "grad_norm": 5.217222690582275, + "learning_rate": 4.9577618213660246e-05, + "loss": 1.1316, + "num_input_tokens_seen": 115923320, + "step": 7205 + }, + { + "epoch": 0.5047676587249468, + "grad_norm": 3.999408006668091, + "learning_rate": 4.957061996497374e-05, + "loss": 0.9769, + "num_input_tokens_seen": 115939704, + "step": 7206 + }, + { + "epoch": 0.5048377069706761, + "grad_norm": 4.267052173614502, + "learning_rate": 4.956362171628722e-05, + "loss": 1.006, + "num_input_tokens_seen": 115955592, + "step": 7207 + }, + { + "epoch": 0.5049077552164053, + "grad_norm": 4.446041584014893, + "learning_rate": 4.9556623467600705e-05, + "loss": 1.2351, + "num_input_tokens_seen": 115971976, + "step": 7208 + }, + { + "epoch": 0.5049778034621345, + "grad_norm": 3.8210396766662598, + "learning_rate": 4.954962521891419e-05, + "loss": 1.1836, + "num_input_tokens_seen": 115987528, + "step": 7209 + }, + { + "epoch": 0.5050478517078638, + "grad_norm": 5.992397785186768, + "learning_rate": 4.9542626970227676e-05, + "loss": 1.0428, + "num_input_tokens_seen": 116003912, + "step": 7210 + }, + { + "epoch": 0.505117899953593, + "grad_norm": 3.934375524520874, + "learning_rate": 4.9535628721541164e-05, + "loss": 0.9858, + "num_input_tokens_seen": 116020296, + "step": 7211 + }, + { + "epoch": 0.5051879481993223, + "grad_norm": 3.936866521835327, + "learning_rate": 4.9528630472854646e-05, + "loss": 1.1011, + "num_input_tokens_seen": 116036600, + "step": 7212 + }, + { + "epoch": 0.5052579964450515, + "grad_norm": 4.908316135406494, + "learning_rate": 4.9521632224168135e-05, + "loss": 0.8953, + "num_input_tokens_seen": 116052984, + "step": 7213 + }, + { + "epoch": 0.5053280446907807, + "grad_norm": 4.035202503204346, + "learning_rate": 4.951463397548161e-05, + "loss": 1.1475, + "num_input_tokens_seen": 116068768, + "step": 7214 + }, + { + "epoch": 0.5053980929365101, + "grad_norm": 3.7488014698028564, + "learning_rate": 4.95076357267951e-05, + "loss": 0.9506, + "num_input_tokens_seen": 116085152, + "step": 7215 + }, + { + "epoch": 0.5054681411822393, + "grad_norm": 5.226819038391113, + "learning_rate": 4.9500637478108594e-05, + "loss": 0.9878, + "num_input_tokens_seen": 116100176, + "step": 7216 + }, + { + "epoch": 0.5055381894279685, + "grad_norm": 4.0122857093811035, + "learning_rate": 4.949363922942207e-05, + "loss": 1.1275, + "num_input_tokens_seen": 116116560, + "step": 7217 + }, + { + "epoch": 0.5056082376736978, + "grad_norm": 4.160411834716797, + "learning_rate": 4.948664098073556e-05, + "loss": 0.9622, + "num_input_tokens_seen": 116132464, + "step": 7218 + }, + { + "epoch": 0.505678285919427, + "grad_norm": 4.860180377960205, + "learning_rate": 4.947964273204904e-05, + "loss": 0.9896, + "num_input_tokens_seen": 116148848, + "step": 7219 + }, + { + "epoch": 0.5057483341651563, + "grad_norm": 4.549893856048584, + "learning_rate": 4.947264448336253e-05, + "loss": 1.0893, + "num_input_tokens_seen": 116165128, + "step": 7220 + }, + { + "epoch": 0.5058183824108855, + "grad_norm": 3.4614131450653076, + "learning_rate": 4.9465646234676e-05, + "loss": 0.7666, + "num_input_tokens_seen": 116181152, + "step": 7221 + }, + { + "epoch": 0.5058884306566147, + "grad_norm": 4.7237043380737305, + "learning_rate": 4.94586479859895e-05, + "loss": 0.949, + "num_input_tokens_seen": 116197032, + "step": 7222 + }, + { + "epoch": 0.505958478902344, + "grad_norm": 4.4195098876953125, + "learning_rate": 4.945164973730299e-05, + "loss": 0.985, + "num_input_tokens_seen": 116212752, + "step": 7223 + }, + { + "epoch": 0.5060285271480732, + "grad_norm": 3.6815669536590576, + "learning_rate": 4.944465148861646e-05, + "loss": 0.9769, + "num_input_tokens_seen": 116229024, + "step": 7224 + }, + { + "epoch": 0.5060985753938024, + "grad_norm": 3.776644229888916, + "learning_rate": 4.943765323992995e-05, + "loss": 0.9953, + "num_input_tokens_seen": 116245408, + "step": 7225 + }, + { + "epoch": 0.5061686236395317, + "grad_norm": 4.3324761390686035, + "learning_rate": 4.943065499124343e-05, + "loss": 1.058, + "num_input_tokens_seen": 116261192, + "step": 7226 + }, + { + "epoch": 0.506238671885261, + "grad_norm": 3.499302387237549, + "learning_rate": 4.942365674255692e-05, + "loss": 1.1726, + "num_input_tokens_seen": 116277576, + "step": 7227 + }, + { + "epoch": 0.5063087201309903, + "grad_norm": 3.5195088386535645, + "learning_rate": 4.941665849387041e-05, + "loss": 0.8901, + "num_input_tokens_seen": 116293960, + "step": 7228 + }, + { + "epoch": 0.5063787683767195, + "grad_norm": 4.266250133514404, + "learning_rate": 4.940966024518389e-05, + "loss": 1.0067, + "num_input_tokens_seen": 116310344, + "step": 7229 + }, + { + "epoch": 0.5064488166224487, + "grad_norm": 4.53155517578125, + "learning_rate": 4.940266199649738e-05, + "loss": 1.1207, + "num_input_tokens_seen": 116326536, + "step": 7230 + }, + { + "epoch": 0.506518864868178, + "grad_norm": 5.224839210510254, + "learning_rate": 4.9395663747810855e-05, + "loss": 1.1932, + "num_input_tokens_seen": 116342792, + "step": 7231 + }, + { + "epoch": 0.5065889131139072, + "grad_norm": 4.072076797485352, + "learning_rate": 4.938866549912435e-05, + "loss": 1.1041, + "num_input_tokens_seen": 116359144, + "step": 7232 + }, + { + "epoch": 0.5066589613596364, + "grad_norm": 4.286440372467041, + "learning_rate": 4.938166725043784e-05, + "loss": 1.1439, + "num_input_tokens_seen": 116375128, + "step": 7233 + }, + { + "epoch": 0.5067290096053657, + "grad_norm": 3.684030055999756, + "learning_rate": 4.9374669001751314e-05, + "loss": 0.8753, + "num_input_tokens_seen": 116391448, + "step": 7234 + }, + { + "epoch": 0.5067990578510949, + "grad_norm": 3.721698045730591, + "learning_rate": 4.93676707530648e-05, + "loss": 1.0196, + "num_input_tokens_seen": 116407832, + "step": 7235 + }, + { + "epoch": 0.5068691060968242, + "grad_norm": 3.5029869079589844, + "learning_rate": 4.9360672504378284e-05, + "loss": 1.0157, + "num_input_tokens_seen": 116424216, + "step": 7236 + }, + { + "epoch": 0.5069391543425534, + "grad_norm": 3.960109233856201, + "learning_rate": 4.935367425569177e-05, + "loss": 1.2421, + "num_input_tokens_seen": 116440600, + "step": 7237 + }, + { + "epoch": 0.5070092025882826, + "grad_norm": 3.7146995067596436, + "learning_rate": 4.934667600700526e-05, + "loss": 1.0466, + "num_input_tokens_seen": 116455776, + "step": 7238 + }, + { + "epoch": 0.507079250834012, + "grad_norm": 4.000344753265381, + "learning_rate": 4.933967775831874e-05, + "loss": 1.0173, + "num_input_tokens_seen": 116472160, + "step": 7239 + }, + { + "epoch": 0.5071492990797412, + "grad_norm": 4.015896320343018, + "learning_rate": 4.933267950963223e-05, + "loss": 1.0865, + "num_input_tokens_seen": 116488440, + "step": 7240 + }, + { + "epoch": 0.5072193473254705, + "grad_norm": 4.240390777587891, + "learning_rate": 4.932568126094571e-05, + "loss": 0.9658, + "num_input_tokens_seen": 116504112, + "step": 7241 + }, + { + "epoch": 0.5072893955711997, + "grad_norm": 4.051314353942871, + "learning_rate": 4.93186830122592e-05, + "loss": 1.1068, + "num_input_tokens_seen": 116519976, + "step": 7242 + }, + { + "epoch": 0.5073594438169289, + "grad_norm": 4.370121955871582, + "learning_rate": 4.931168476357269e-05, + "loss": 1.1409, + "num_input_tokens_seen": 116536360, + "step": 7243 + }, + { + "epoch": 0.5074294920626582, + "grad_norm": 3.7158761024475098, + "learning_rate": 4.9304686514886166e-05, + "loss": 1.0234, + "num_input_tokens_seen": 116552744, + "step": 7244 + }, + { + "epoch": 0.5074995403083874, + "grad_norm": 3.6040024757385254, + "learning_rate": 4.9297688266199654e-05, + "loss": 0.8008, + "num_input_tokens_seen": 116568088, + "step": 7245 + }, + { + "epoch": 0.5075695885541166, + "grad_norm": 5.175736904144287, + "learning_rate": 4.9290690017513136e-05, + "loss": 1.09, + "num_input_tokens_seen": 116584152, + "step": 7246 + }, + { + "epoch": 0.5076396367998459, + "grad_norm": 4.735289573669434, + "learning_rate": 4.9283691768826625e-05, + "loss": 1.093, + "num_input_tokens_seen": 116599848, + "step": 7247 + }, + { + "epoch": 0.5077096850455751, + "grad_norm": 5.659826278686523, + "learning_rate": 4.9276693520140107e-05, + "loss": 1.1248, + "num_input_tokens_seen": 116615216, + "step": 7248 + }, + { + "epoch": 0.5077797332913044, + "grad_norm": 4.524930000305176, + "learning_rate": 4.9269695271453595e-05, + "loss": 1.0244, + "num_input_tokens_seen": 116631328, + "step": 7249 + }, + { + "epoch": 0.5078497815370336, + "grad_norm": 3.6031768321990967, + "learning_rate": 4.9262697022767084e-05, + "loss": 1.0042, + "num_input_tokens_seen": 116647016, + "step": 7250 + }, + { + "epoch": 0.5079198297827628, + "grad_norm": 3.953381299972534, + "learning_rate": 4.925569877408056e-05, + "loss": 0.9817, + "num_input_tokens_seen": 116663400, + "step": 7251 + }, + { + "epoch": 0.5079898780284922, + "grad_norm": 7.162896633148193, + "learning_rate": 4.9248700525394054e-05, + "loss": 1.2361, + "num_input_tokens_seen": 116679456, + "step": 7252 + }, + { + "epoch": 0.5080599262742214, + "grad_norm": 3.416929006576538, + "learning_rate": 4.924170227670753e-05, + "loss": 1.0067, + "num_input_tokens_seen": 116695840, + "step": 7253 + }, + { + "epoch": 0.5081299745199506, + "grad_norm": 3.542628049850464, + "learning_rate": 4.923470402802102e-05, + "loss": 0.8451, + "num_input_tokens_seen": 116712224, + "step": 7254 + }, + { + "epoch": 0.5082000227656799, + "grad_norm": 5.850252151489258, + "learning_rate": 4.9227705779334506e-05, + "loss": 1.0186, + "num_input_tokens_seen": 116728608, + "step": 7255 + }, + { + "epoch": 0.5082700710114091, + "grad_norm": 4.921962261199951, + "learning_rate": 4.922070753064799e-05, + "loss": 1.0944, + "num_input_tokens_seen": 116744992, + "step": 7256 + }, + { + "epoch": 0.5083401192571384, + "grad_norm": 5.621464252471924, + "learning_rate": 4.921370928196148e-05, + "loss": 1.0273, + "num_input_tokens_seen": 116761376, + "step": 7257 + }, + { + "epoch": 0.5084101675028676, + "grad_norm": 3.8369336128234863, + "learning_rate": 4.920671103327496e-05, + "loss": 1.0421, + "num_input_tokens_seen": 116777656, + "step": 7258 + }, + { + "epoch": 0.5084802157485968, + "grad_norm": 4.033676624298096, + "learning_rate": 4.919971278458845e-05, + "loss": 1.0248, + "num_input_tokens_seen": 116793432, + "step": 7259 + }, + { + "epoch": 0.5085502639943261, + "grad_norm": 5.80480432510376, + "learning_rate": 4.9192714535901936e-05, + "loss": 0.9104, + "num_input_tokens_seen": 116809816, + "step": 7260 + }, + { + "epoch": 0.5086203122400553, + "grad_norm": 4.646456241607666, + "learning_rate": 4.918571628721541e-05, + "loss": 1.1542, + "num_input_tokens_seen": 116826024, + "step": 7261 + }, + { + "epoch": 0.5086903604857845, + "grad_norm": 5.681286811828613, + "learning_rate": 4.9178718038528906e-05, + "loss": 1.0206, + "num_input_tokens_seen": 116841176, + "step": 7262 + }, + { + "epoch": 0.5087604087315138, + "grad_norm": 4.391019821166992, + "learning_rate": 4.917171978984238e-05, + "loss": 0.9668, + "num_input_tokens_seen": 116857560, + "step": 7263 + }, + { + "epoch": 0.508830456977243, + "grad_norm": 3.69963002204895, + "learning_rate": 4.916472154115587e-05, + "loss": 1.1773, + "num_input_tokens_seen": 116873872, + "step": 7264 + }, + { + "epoch": 0.5089005052229724, + "grad_norm": 4.748251914978027, + "learning_rate": 4.915772329246936e-05, + "loss": 1.1401, + "num_input_tokens_seen": 116890256, + "step": 7265 + }, + { + "epoch": 0.5089705534687016, + "grad_norm": 4.31103515625, + "learning_rate": 4.915072504378284e-05, + "loss": 1.0757, + "num_input_tokens_seen": 116906640, + "step": 7266 + }, + { + "epoch": 0.5090406017144308, + "grad_norm": 4.616788864135742, + "learning_rate": 4.914372679509633e-05, + "loss": 1.0779, + "num_input_tokens_seen": 116923024, + "step": 7267 + }, + { + "epoch": 0.5091106499601601, + "grad_norm": 5.435046672821045, + "learning_rate": 4.913672854640981e-05, + "loss": 1.1371, + "num_input_tokens_seen": 116939264, + "step": 7268 + }, + { + "epoch": 0.5091806982058893, + "grad_norm": 3.534294843673706, + "learning_rate": 4.91297302977233e-05, + "loss": 0.9129, + "num_input_tokens_seen": 116955648, + "step": 7269 + }, + { + "epoch": 0.5092507464516186, + "grad_norm": 4.072021961212158, + "learning_rate": 4.912273204903679e-05, + "loss": 1.0451, + "num_input_tokens_seen": 116970936, + "step": 7270 + }, + { + "epoch": 0.5093207946973478, + "grad_norm": 3.853341817855835, + "learning_rate": 4.911573380035026e-05, + "loss": 1.0749, + "num_input_tokens_seen": 116987064, + "step": 7271 + }, + { + "epoch": 0.509390842943077, + "grad_norm": 5.853321552276611, + "learning_rate": 4.910873555166375e-05, + "loss": 1.0245, + "num_input_tokens_seen": 117002752, + "step": 7272 + }, + { + "epoch": 0.5094608911888063, + "grad_norm": 3.7305798530578613, + "learning_rate": 4.910173730297723e-05, + "loss": 1.0011, + "num_input_tokens_seen": 117019136, + "step": 7273 + }, + { + "epoch": 0.5095309394345355, + "grad_norm": 3.9224064350128174, + "learning_rate": 4.909473905429072e-05, + "loss": 1.0884, + "num_input_tokens_seen": 117035504, + "step": 7274 + }, + { + "epoch": 0.5096009876802647, + "grad_norm": 3.633242130279541, + "learning_rate": 4.9087740805604204e-05, + "loss": 1.0027, + "num_input_tokens_seen": 117051888, + "step": 7275 + }, + { + "epoch": 0.509671035925994, + "grad_norm": 3.7328341007232666, + "learning_rate": 4.908074255691769e-05, + "loss": 1.1986, + "num_input_tokens_seen": 117068272, + "step": 7276 + }, + { + "epoch": 0.5097410841717233, + "grad_norm": 3.492896556854248, + "learning_rate": 4.907374430823118e-05, + "loss": 0.8862, + "num_input_tokens_seen": 117084656, + "step": 7277 + }, + { + "epoch": 0.5098111324174526, + "grad_norm": 3.8437516689300537, + "learning_rate": 4.906674605954466e-05, + "loss": 1.1177, + "num_input_tokens_seen": 117100432, + "step": 7278 + }, + { + "epoch": 0.5098811806631818, + "grad_norm": 3.537297487258911, + "learning_rate": 4.905974781085815e-05, + "loss": 1.1694, + "num_input_tokens_seen": 117116544, + "step": 7279 + }, + { + "epoch": 0.509951228908911, + "grad_norm": 3.6758127212524414, + "learning_rate": 4.9052749562171626e-05, + "loss": 0.8982, + "num_input_tokens_seen": 117132376, + "step": 7280 + }, + { + "epoch": 0.5100212771546403, + "grad_norm": 3.7280797958374023, + "learning_rate": 4.9045751313485115e-05, + "loss": 1.013, + "num_input_tokens_seen": 117148760, + "step": 7281 + }, + { + "epoch": 0.5100913254003695, + "grad_norm": 4.579721927642822, + "learning_rate": 4.9038753064798603e-05, + "loss": 1.1065, + "num_input_tokens_seen": 117165144, + "step": 7282 + }, + { + "epoch": 0.5101613736460987, + "grad_norm": 4.146833896636963, + "learning_rate": 4.9031754816112085e-05, + "loss": 1.1848, + "num_input_tokens_seen": 117180472, + "step": 7283 + }, + { + "epoch": 0.510231421891828, + "grad_norm": 3.7897355556488037, + "learning_rate": 4.9024756567425574e-05, + "loss": 1.0082, + "num_input_tokens_seen": 117196856, + "step": 7284 + }, + { + "epoch": 0.5103014701375572, + "grad_norm": 3.821641206741333, + "learning_rate": 4.9017758318739056e-05, + "loss": 1.0827, + "num_input_tokens_seen": 117213240, + "step": 7285 + }, + { + "epoch": 0.5103715183832865, + "grad_norm": 4.439133644104004, + "learning_rate": 4.9010760070052544e-05, + "loss": 1.0325, + "num_input_tokens_seen": 117229240, + "step": 7286 + }, + { + "epoch": 0.5104415666290157, + "grad_norm": 4.781843185424805, + "learning_rate": 4.900376182136603e-05, + "loss": 1.0871, + "num_input_tokens_seen": 117245624, + "step": 7287 + }, + { + "epoch": 0.510511614874745, + "grad_norm": 3.928457736968994, + "learning_rate": 4.8996763572679515e-05, + "loss": 1.0049, + "num_input_tokens_seen": 117261496, + "step": 7288 + }, + { + "epoch": 0.5105816631204743, + "grad_norm": 3.8278815746307373, + "learning_rate": 4.8989765323993e-05, + "loss": 0.8442, + "num_input_tokens_seen": 117277448, + "step": 7289 + }, + { + "epoch": 0.5106517113662035, + "grad_norm": 3.8238277435302734, + "learning_rate": 4.898276707530648e-05, + "loss": 1.1188, + "num_input_tokens_seen": 117293832, + "step": 7290 + }, + { + "epoch": 0.5107217596119327, + "grad_norm": 3.835528612136841, + "learning_rate": 4.897576882661997e-05, + "loss": 0.9828, + "num_input_tokens_seen": 117310216, + "step": 7291 + }, + { + "epoch": 0.510791807857662, + "grad_norm": 3.516911029815674, + "learning_rate": 4.8968770577933455e-05, + "loss": 1.0539, + "num_input_tokens_seen": 117326600, + "step": 7292 + }, + { + "epoch": 0.5108618561033912, + "grad_norm": 4.772302150726318, + "learning_rate": 4.896177232924694e-05, + "loss": 1.1043, + "num_input_tokens_seen": 117342984, + "step": 7293 + }, + { + "epoch": 0.5109319043491205, + "grad_norm": 3.933194875717163, + "learning_rate": 4.8954774080560426e-05, + "loss": 1.1596, + "num_input_tokens_seen": 117359024, + "step": 7294 + }, + { + "epoch": 0.5110019525948497, + "grad_norm": 5.703490734100342, + "learning_rate": 4.894777583187391e-05, + "loss": 1.0191, + "num_input_tokens_seen": 117375184, + "step": 7295 + }, + { + "epoch": 0.5110720008405789, + "grad_norm": 5.454290866851807, + "learning_rate": 4.8940777583187396e-05, + "loss": 1.1642, + "num_input_tokens_seen": 117391480, + "step": 7296 + }, + { + "epoch": 0.5111420490863082, + "grad_norm": 4.424625396728516, + "learning_rate": 4.8933779334500885e-05, + "loss": 1.0596, + "num_input_tokens_seen": 117407864, + "step": 7297 + }, + { + "epoch": 0.5112120973320374, + "grad_norm": 3.7400286197662354, + "learning_rate": 4.892678108581436e-05, + "loss": 0.9681, + "num_input_tokens_seen": 117424248, + "step": 7298 + }, + { + "epoch": 0.5112821455777666, + "grad_norm": 4.604375839233398, + "learning_rate": 4.8919782837127855e-05, + "loss": 1.1543, + "num_input_tokens_seen": 117440632, + "step": 7299 + }, + { + "epoch": 0.511352193823496, + "grad_norm": 4.090991497039795, + "learning_rate": 4.891278458844133e-05, + "loss": 0.9973, + "num_input_tokens_seen": 117457016, + "step": 7300 + }, + { + "epoch": 0.5114222420692252, + "grad_norm": 4.32021427154541, + "learning_rate": 4.890578633975482e-05, + "loss": 1.1599, + "num_input_tokens_seen": 117473400, + "step": 7301 + }, + { + "epoch": 0.5114922903149545, + "grad_norm": 5.156586170196533, + "learning_rate": 4.88987880910683e-05, + "loss": 1.0992, + "num_input_tokens_seen": 117489520, + "step": 7302 + }, + { + "epoch": 0.5115623385606837, + "grad_norm": 3.4068989753723145, + "learning_rate": 4.889178984238179e-05, + "loss": 0.7612, + "num_input_tokens_seen": 117505904, + "step": 7303 + }, + { + "epoch": 0.5116323868064129, + "grad_norm": 4.153528690338135, + "learning_rate": 4.888479159369528e-05, + "loss": 1.0344, + "num_input_tokens_seen": 117522056, + "step": 7304 + }, + { + "epoch": 0.5117024350521422, + "grad_norm": 4.036544322967529, + "learning_rate": 4.887779334500876e-05, + "loss": 0.9494, + "num_input_tokens_seen": 117538392, + "step": 7305 + }, + { + "epoch": 0.5117724832978714, + "grad_norm": 3.522869110107422, + "learning_rate": 4.887079509632225e-05, + "loss": 1.0601, + "num_input_tokens_seen": 117554776, + "step": 7306 + }, + { + "epoch": 0.5118425315436007, + "grad_norm": 5.0436530113220215, + "learning_rate": 4.886379684763572e-05, + "loss": 1.3193, + "num_input_tokens_seen": 117571160, + "step": 7307 + }, + { + "epoch": 0.5119125797893299, + "grad_norm": 3.9105610847473145, + "learning_rate": 4.885679859894921e-05, + "loss": 1.087, + "num_input_tokens_seen": 117587544, + "step": 7308 + }, + { + "epoch": 0.5119826280350591, + "grad_norm": 3.6909308433532715, + "learning_rate": 4.884980035026271e-05, + "loss": 1.2365, + "num_input_tokens_seen": 117603512, + "step": 7309 + }, + { + "epoch": 0.5120526762807884, + "grad_norm": 4.3869404792785645, + "learning_rate": 4.884280210157618e-05, + "loss": 1.0075, + "num_input_tokens_seen": 117619896, + "step": 7310 + }, + { + "epoch": 0.5121227245265176, + "grad_norm": 4.207435607910156, + "learning_rate": 4.883580385288967e-05, + "loss": 1.2587, + "num_input_tokens_seen": 117636280, + "step": 7311 + }, + { + "epoch": 0.5121927727722468, + "grad_norm": 3.8486809730529785, + "learning_rate": 4.882880560420315e-05, + "loss": 0.9452, + "num_input_tokens_seen": 117652664, + "step": 7312 + }, + { + "epoch": 0.5122628210179762, + "grad_norm": 4.338857173919678, + "learning_rate": 4.882180735551664e-05, + "loss": 1.0159, + "num_input_tokens_seen": 117669048, + "step": 7313 + }, + { + "epoch": 0.5123328692637054, + "grad_norm": 4.424511432647705, + "learning_rate": 4.881480910683013e-05, + "loss": 1.1418, + "num_input_tokens_seen": 117684464, + "step": 7314 + }, + { + "epoch": 0.5124029175094347, + "grad_norm": 3.560134172439575, + "learning_rate": 4.880781085814361e-05, + "loss": 0.8744, + "num_input_tokens_seen": 117700848, + "step": 7315 + }, + { + "epoch": 0.5124729657551639, + "grad_norm": 3.687147378921509, + "learning_rate": 4.88008126094571e-05, + "loss": 1.0157, + "num_input_tokens_seen": 117717232, + "step": 7316 + }, + { + "epoch": 0.5125430140008931, + "grad_norm": 3.469698429107666, + "learning_rate": 4.8793814360770575e-05, + "loss": 1.0037, + "num_input_tokens_seen": 117733616, + "step": 7317 + }, + { + "epoch": 0.5126130622466224, + "grad_norm": 4.417529582977295, + "learning_rate": 4.8786816112084064e-05, + "loss": 1.0446, + "num_input_tokens_seen": 117749912, + "step": 7318 + }, + { + "epoch": 0.5126831104923516, + "grad_norm": 4.738529205322266, + "learning_rate": 4.877981786339756e-05, + "loss": 1.1557, + "num_input_tokens_seen": 117766088, + "step": 7319 + }, + { + "epoch": 0.5127531587380808, + "grad_norm": 3.9368979930877686, + "learning_rate": 4.8772819614711034e-05, + "loss": 0.9631, + "num_input_tokens_seen": 117781896, + "step": 7320 + }, + { + "epoch": 0.5128232069838101, + "grad_norm": 4.166690826416016, + "learning_rate": 4.876582136602452e-05, + "loss": 1.1633, + "num_input_tokens_seen": 117798280, + "step": 7321 + }, + { + "epoch": 0.5128932552295393, + "grad_norm": 5.667482376098633, + "learning_rate": 4.8758823117338005e-05, + "loss": 1.0399, + "num_input_tokens_seen": 117812376, + "step": 7322 + }, + { + "epoch": 0.5129633034752686, + "grad_norm": 4.0888776779174805, + "learning_rate": 4.875182486865149e-05, + "loss": 1.1515, + "num_input_tokens_seen": 117828760, + "step": 7323 + }, + { + "epoch": 0.5130333517209978, + "grad_norm": 3.7693824768066406, + "learning_rate": 4.874482661996498e-05, + "loss": 1.003, + "num_input_tokens_seen": 117844312, + "step": 7324 + }, + { + "epoch": 0.513103399966727, + "grad_norm": 4.041450500488281, + "learning_rate": 4.8737828371278464e-05, + "loss": 1.0271, + "num_input_tokens_seen": 117860456, + "step": 7325 + }, + { + "epoch": 0.5131734482124564, + "grad_norm": 3.697910785675049, + "learning_rate": 4.873083012259195e-05, + "loss": 1.0592, + "num_input_tokens_seen": 117876840, + "step": 7326 + }, + { + "epoch": 0.5132434964581856, + "grad_norm": 5.30276346206665, + "learning_rate": 4.872383187390543e-05, + "loss": 1.0356, + "num_input_tokens_seen": 117891376, + "step": 7327 + }, + { + "epoch": 0.5133135447039148, + "grad_norm": 4.165642261505127, + "learning_rate": 4.8716833625218916e-05, + "loss": 1.095, + "num_input_tokens_seen": 117907080, + "step": 7328 + }, + { + "epoch": 0.5133835929496441, + "grad_norm": 3.727694272994995, + "learning_rate": 4.87098353765324e-05, + "loss": 1.0567, + "num_input_tokens_seen": 117923464, + "step": 7329 + }, + { + "epoch": 0.5134536411953733, + "grad_norm": 3.4425323009490967, + "learning_rate": 4.8702837127845886e-05, + "loss": 1.002, + "num_input_tokens_seen": 117939848, + "step": 7330 + }, + { + "epoch": 0.5135236894411026, + "grad_norm": 5.108044147491455, + "learning_rate": 4.8695838879159375e-05, + "loss": 1.3818, + "num_input_tokens_seen": 117956232, + "step": 7331 + }, + { + "epoch": 0.5135937376868318, + "grad_norm": 3.598155975341797, + "learning_rate": 4.868884063047286e-05, + "loss": 0.8586, + "num_input_tokens_seen": 117972616, + "step": 7332 + }, + { + "epoch": 0.513663785932561, + "grad_norm": 5.529993057250977, + "learning_rate": 4.8681842381786345e-05, + "loss": 1.0782, + "num_input_tokens_seen": 117988616, + "step": 7333 + }, + { + "epoch": 0.5137338341782903, + "grad_norm": 3.80139422416687, + "learning_rate": 4.867484413309982e-05, + "loss": 0.9837, + "num_input_tokens_seen": 118004720, + "step": 7334 + }, + { + "epoch": 0.5138038824240195, + "grad_norm": 3.5874598026275635, + "learning_rate": 4.8667845884413316e-05, + "loss": 0.965, + "num_input_tokens_seen": 118021008, + "step": 7335 + }, + { + "epoch": 0.5138739306697487, + "grad_norm": 4.101084232330322, + "learning_rate": 4.8660847635726804e-05, + "loss": 1.1482, + "num_input_tokens_seen": 118036976, + "step": 7336 + }, + { + "epoch": 0.513943978915478, + "grad_norm": 4.202828407287598, + "learning_rate": 4.865384938704028e-05, + "loss": 0.9336, + "num_input_tokens_seen": 118052632, + "step": 7337 + }, + { + "epoch": 0.5140140271612073, + "grad_norm": 3.6700570583343506, + "learning_rate": 4.864685113835377e-05, + "loss": 1.0664, + "num_input_tokens_seen": 118069016, + "step": 7338 + }, + { + "epoch": 0.5140840754069366, + "grad_norm": 5.313467502593994, + "learning_rate": 4.863985288966725e-05, + "loss": 1.1133, + "num_input_tokens_seen": 118085400, + "step": 7339 + }, + { + "epoch": 0.5141541236526658, + "grad_norm": 3.78403639793396, + "learning_rate": 4.863285464098074e-05, + "loss": 1.1592, + "num_input_tokens_seen": 118101784, + "step": 7340 + }, + { + "epoch": 0.514224171898395, + "grad_norm": 5.284808158874512, + "learning_rate": 4.862585639229423e-05, + "loss": 1.0568, + "num_input_tokens_seen": 118116584, + "step": 7341 + }, + { + "epoch": 0.5142942201441243, + "grad_norm": 3.6075503826141357, + "learning_rate": 4.861885814360771e-05, + "loss": 1.0451, + "num_input_tokens_seen": 118132968, + "step": 7342 + }, + { + "epoch": 0.5143642683898535, + "grad_norm": 3.514549493789673, + "learning_rate": 4.86118598949212e-05, + "loss": 0.8536, + "num_input_tokens_seen": 118148576, + "step": 7343 + }, + { + "epoch": 0.5144343166355828, + "grad_norm": 4.394196510314941, + "learning_rate": 4.860486164623467e-05, + "loss": 0.9135, + "num_input_tokens_seen": 118164168, + "step": 7344 + }, + { + "epoch": 0.514504364881312, + "grad_norm": 4.1175737380981445, + "learning_rate": 4.859786339754817e-05, + "loss": 1.1392, + "num_input_tokens_seen": 118180552, + "step": 7345 + }, + { + "epoch": 0.5145744131270412, + "grad_norm": 4.6415581703186035, + "learning_rate": 4.8590865148861656e-05, + "loss": 1.1932, + "num_input_tokens_seen": 118196936, + "step": 7346 + }, + { + "epoch": 0.5146444613727705, + "grad_norm": 4.681972980499268, + "learning_rate": 4.858386690017513e-05, + "loss": 1.0245, + "num_input_tokens_seen": 118213320, + "step": 7347 + }, + { + "epoch": 0.5147145096184997, + "grad_norm": 6.289412021636963, + "learning_rate": 4.857686865148862e-05, + "loss": 1.0609, + "num_input_tokens_seen": 118228648, + "step": 7348 + }, + { + "epoch": 0.5147845578642289, + "grad_norm": 4.726010799407959, + "learning_rate": 4.85698704028021e-05, + "loss": 1.1635, + "num_input_tokens_seen": 118244552, + "step": 7349 + }, + { + "epoch": 0.5148546061099583, + "grad_norm": 3.7650058269500732, + "learning_rate": 4.856287215411559e-05, + "loss": 0.8969, + "num_input_tokens_seen": 118260416, + "step": 7350 + }, + { + "epoch": 0.5149246543556875, + "grad_norm": 5.283148765563965, + "learning_rate": 4.855587390542907e-05, + "loss": 1.1027, + "num_input_tokens_seen": 118276800, + "step": 7351 + }, + { + "epoch": 0.5149947026014168, + "grad_norm": 4.060834884643555, + "learning_rate": 4.854887565674256e-05, + "loss": 0.9387, + "num_input_tokens_seen": 118293128, + "step": 7352 + }, + { + "epoch": 0.515064750847146, + "grad_norm": 3.5670249462127686, + "learning_rate": 4.854187740805605e-05, + "loss": 0.9658, + "num_input_tokens_seen": 118309512, + "step": 7353 + }, + { + "epoch": 0.5151347990928752, + "grad_norm": 4.051480770111084, + "learning_rate": 4.8534879159369524e-05, + "loss": 1.1616, + "num_input_tokens_seen": 118325640, + "step": 7354 + }, + { + "epoch": 0.5152048473386045, + "grad_norm": 4.1467461585998535, + "learning_rate": 4.852788091068302e-05, + "loss": 1.0501, + "num_input_tokens_seen": 118342024, + "step": 7355 + }, + { + "epoch": 0.5152748955843337, + "grad_norm": 4.226484775543213, + "learning_rate": 4.8520882661996495e-05, + "loss": 1.3255, + "num_input_tokens_seen": 118358408, + "step": 7356 + }, + { + "epoch": 0.5153449438300629, + "grad_norm": 3.81172513961792, + "learning_rate": 4.8513884413309983e-05, + "loss": 0.9909, + "num_input_tokens_seen": 118373000, + "step": 7357 + }, + { + "epoch": 0.5154149920757922, + "grad_norm": 5.178154945373535, + "learning_rate": 4.850688616462347e-05, + "loss": 1.1296, + "num_input_tokens_seen": 118389208, + "step": 7358 + }, + { + "epoch": 0.5154850403215214, + "grad_norm": 3.971707582473755, + "learning_rate": 4.8499887915936954e-05, + "loss": 1.0011, + "num_input_tokens_seen": 118405592, + "step": 7359 + }, + { + "epoch": 0.5155550885672507, + "grad_norm": 3.4853761196136475, + "learning_rate": 4.849288966725044e-05, + "loss": 0.9744, + "num_input_tokens_seen": 118421976, + "step": 7360 + }, + { + "epoch": 0.51562513681298, + "grad_norm": 3.309765577316284, + "learning_rate": 4.8485891418563924e-05, + "loss": 0.9776, + "num_input_tokens_seen": 118438232, + "step": 7361 + }, + { + "epoch": 0.5156951850587091, + "grad_norm": 5.875191688537598, + "learning_rate": 4.847889316987741e-05, + "loss": 0.9788, + "num_input_tokens_seen": 118453368, + "step": 7362 + }, + { + "epoch": 0.5157652333044385, + "grad_norm": 3.633922815322876, + "learning_rate": 4.84718949211909e-05, + "loss": 0.9949, + "num_input_tokens_seen": 118469752, + "step": 7363 + }, + { + "epoch": 0.5158352815501677, + "grad_norm": 3.7608840465545654, + "learning_rate": 4.8464896672504376e-05, + "loss": 0.9275, + "num_input_tokens_seen": 118485760, + "step": 7364 + }, + { + "epoch": 0.5159053297958969, + "grad_norm": 4.625830173492432, + "learning_rate": 4.845789842381787e-05, + "loss": 1.0056, + "num_input_tokens_seen": 118501944, + "step": 7365 + }, + { + "epoch": 0.5159753780416262, + "grad_norm": 5.04378604888916, + "learning_rate": 4.845090017513135e-05, + "loss": 1.1256, + "num_input_tokens_seen": 118518328, + "step": 7366 + }, + { + "epoch": 0.5160454262873554, + "grad_norm": 3.658432960510254, + "learning_rate": 4.8443901926444835e-05, + "loss": 1.0818, + "num_input_tokens_seen": 118534528, + "step": 7367 + }, + { + "epoch": 0.5161154745330847, + "grad_norm": 4.299334526062012, + "learning_rate": 4.8436903677758324e-05, + "loss": 1.0318, + "num_input_tokens_seen": 118550912, + "step": 7368 + }, + { + "epoch": 0.5161855227788139, + "grad_norm": 5.608176231384277, + "learning_rate": 4.8429905429071806e-05, + "loss": 1.0708, + "num_input_tokens_seen": 118566464, + "step": 7369 + }, + { + "epoch": 0.5162555710245431, + "grad_norm": 4.76702356338501, + "learning_rate": 4.8422907180385294e-05, + "loss": 1.035, + "num_input_tokens_seen": 118582848, + "step": 7370 + }, + { + "epoch": 0.5163256192702724, + "grad_norm": 4.01913595199585, + "learning_rate": 4.8415908931698776e-05, + "loss": 1.2714, + "num_input_tokens_seen": 118599232, + "step": 7371 + }, + { + "epoch": 0.5163956675160016, + "grad_norm": 4.569849967956543, + "learning_rate": 4.8408910683012265e-05, + "loss": 0.9726, + "num_input_tokens_seen": 118615616, + "step": 7372 + }, + { + "epoch": 0.5164657157617308, + "grad_norm": 4.713064670562744, + "learning_rate": 4.8401912434325753e-05, + "loss": 0.9028, + "num_input_tokens_seen": 118632000, + "step": 7373 + }, + { + "epoch": 0.5165357640074602, + "grad_norm": 4.8917317390441895, + "learning_rate": 4.839491418563923e-05, + "loss": 0.9373, + "num_input_tokens_seen": 118646960, + "step": 7374 + }, + { + "epoch": 0.5166058122531894, + "grad_norm": 4.5929460525512695, + "learning_rate": 4.8387915936952724e-05, + "loss": 1.0066, + "num_input_tokens_seen": 118663344, + "step": 7375 + }, + { + "epoch": 0.5166758604989187, + "grad_norm": 4.328620433807373, + "learning_rate": 4.83809176882662e-05, + "loss": 0.9591, + "num_input_tokens_seen": 118679728, + "step": 7376 + }, + { + "epoch": 0.5167459087446479, + "grad_norm": 4.046355724334717, + "learning_rate": 4.837391943957969e-05, + "loss": 1.1514, + "num_input_tokens_seen": 118694912, + "step": 7377 + }, + { + "epoch": 0.5168159569903771, + "grad_norm": 3.612791061401367, + "learning_rate": 4.836692119089317e-05, + "loss": 1.0936, + "num_input_tokens_seen": 118711296, + "step": 7378 + }, + { + "epoch": 0.5168860052361064, + "grad_norm": 3.2623980045318604, + "learning_rate": 4.835992294220666e-05, + "loss": 0.9072, + "num_input_tokens_seen": 118727680, + "step": 7379 + }, + { + "epoch": 0.5169560534818356, + "grad_norm": 3.7941362857818604, + "learning_rate": 4.8352924693520147e-05, + "loss": 0.8598, + "num_input_tokens_seen": 118744064, + "step": 7380 + }, + { + "epoch": 0.5170261017275649, + "grad_norm": 3.7529819011688232, + "learning_rate": 4.834592644483363e-05, + "loss": 0.9265, + "num_input_tokens_seen": 118759616, + "step": 7381 + }, + { + "epoch": 0.5170961499732941, + "grad_norm": 3.552791118621826, + "learning_rate": 4.833892819614712e-05, + "loss": 1.1577, + "num_input_tokens_seen": 118776000, + "step": 7382 + }, + { + "epoch": 0.5171661982190233, + "grad_norm": 3.719827651977539, + "learning_rate": 4.833192994746059e-05, + "loss": 0.852, + "num_input_tokens_seen": 118792360, + "step": 7383 + }, + { + "epoch": 0.5172362464647526, + "grad_norm": 6.634743690490723, + "learning_rate": 4.832493169877408e-05, + "loss": 1.3676, + "num_input_tokens_seen": 118808112, + "step": 7384 + }, + { + "epoch": 0.5173062947104818, + "grad_norm": 4.388521194458008, + "learning_rate": 4.8317933450087576e-05, + "loss": 1.2442, + "num_input_tokens_seen": 118824384, + "step": 7385 + }, + { + "epoch": 0.517376342956211, + "grad_norm": 4.161240577697754, + "learning_rate": 4.831093520140105e-05, + "loss": 1.0573, + "num_input_tokens_seen": 118840280, + "step": 7386 + }, + { + "epoch": 0.5174463912019404, + "grad_norm": 4.272933006286621, + "learning_rate": 4.830393695271454e-05, + "loss": 1.1301, + "num_input_tokens_seen": 118855176, + "step": 7387 + }, + { + "epoch": 0.5175164394476696, + "grad_norm": 4.388091087341309, + "learning_rate": 4.829693870402802e-05, + "loss": 1.0048, + "num_input_tokens_seen": 118869528, + "step": 7388 + }, + { + "epoch": 0.5175864876933989, + "grad_norm": 3.623302698135376, + "learning_rate": 4.828994045534151e-05, + "loss": 1.0096, + "num_input_tokens_seen": 118885272, + "step": 7389 + }, + { + "epoch": 0.5176565359391281, + "grad_norm": 6.347753047943115, + "learning_rate": 4.8282942206655e-05, + "loss": 0.9923, + "num_input_tokens_seen": 118901384, + "step": 7390 + }, + { + "epoch": 0.5177265841848573, + "grad_norm": 5.049704551696777, + "learning_rate": 4.827594395796848e-05, + "loss": 1.1742, + "num_input_tokens_seen": 118916392, + "step": 7391 + }, + { + "epoch": 0.5177966324305866, + "grad_norm": 4.022581100463867, + "learning_rate": 4.826894570928197e-05, + "loss": 0.8257, + "num_input_tokens_seen": 118932000, + "step": 7392 + }, + { + "epoch": 0.5178666806763158, + "grad_norm": 4.277614593505859, + "learning_rate": 4.8261947460595444e-05, + "loss": 1.1972, + "num_input_tokens_seen": 118947752, + "step": 7393 + }, + { + "epoch": 0.517936728922045, + "grad_norm": 3.5294747352600098, + "learning_rate": 4.825494921190893e-05, + "loss": 1.0023, + "num_input_tokens_seen": 118963312, + "step": 7394 + }, + { + "epoch": 0.5180067771677743, + "grad_norm": 4.206392765045166, + "learning_rate": 4.824795096322243e-05, + "loss": 1.1942, + "num_input_tokens_seen": 118979696, + "step": 7395 + }, + { + "epoch": 0.5180768254135035, + "grad_norm": 5.907151222229004, + "learning_rate": 4.82409527145359e-05, + "loss": 1.1046, + "num_input_tokens_seen": 118996080, + "step": 7396 + }, + { + "epoch": 0.5181468736592328, + "grad_norm": 3.4217193126678467, + "learning_rate": 4.823395446584939e-05, + "loss": 0.9469, + "num_input_tokens_seen": 119012064, + "step": 7397 + }, + { + "epoch": 0.518216921904962, + "grad_norm": 3.4076719284057617, + "learning_rate": 4.822695621716287e-05, + "loss": 0.9599, + "num_input_tokens_seen": 119028448, + "step": 7398 + }, + { + "epoch": 0.5182869701506913, + "grad_norm": 3.6392581462860107, + "learning_rate": 4.821995796847636e-05, + "loss": 1.0481, + "num_input_tokens_seen": 119044832, + "step": 7399 + }, + { + "epoch": 0.5183570183964206, + "grad_norm": 4.3391947746276855, + "learning_rate": 4.821295971978985e-05, + "loss": 1.0365, + "num_input_tokens_seen": 119061216, + "step": 7400 + }, + { + "epoch": 0.5183570183964206, + "eval_loss": 1.1227985620498657, + "eval_runtime": 0.1572, + "eval_samples_per_second": 6.362, + "eval_steps_per_second": 6.362, + "num_input_tokens_seen": 119061216, + "step": 7400 + }, + { + "epoch": 0.5184270666421498, + "grad_norm": 4.152843952178955, + "learning_rate": 4.820596147110333e-05, + "loss": 1.0123, + "num_input_tokens_seen": 119077600, + "step": 7401 + }, + { + "epoch": 0.518497114887879, + "grad_norm": 4.2847371101379395, + "learning_rate": 4.819896322241682e-05, + "loss": 1.0532, + "num_input_tokens_seen": 119093984, + "step": 7402 + }, + { + "epoch": 0.5185671631336083, + "grad_norm": 3.9743080139160156, + "learning_rate": 4.8191964973730296e-05, + "loss": 1.0229, + "num_input_tokens_seen": 119109984, + "step": 7403 + }, + { + "epoch": 0.5186372113793375, + "grad_norm": 3.8699886798858643, + "learning_rate": 4.8184966725043785e-05, + "loss": 1.0008, + "num_input_tokens_seen": 119126040, + "step": 7404 + }, + { + "epoch": 0.5187072596250668, + "grad_norm": 4.810290336608887, + "learning_rate": 4.8177968476357266e-05, + "loss": 1.1959, + "num_input_tokens_seen": 119141512, + "step": 7405 + }, + { + "epoch": 0.518777307870796, + "grad_norm": 5.068106174468994, + "learning_rate": 4.8170970227670755e-05, + "loss": 0.9215, + "num_input_tokens_seen": 119156112, + "step": 7406 + }, + { + "epoch": 0.5188473561165252, + "grad_norm": 5.044201374053955, + "learning_rate": 4.8163971978984244e-05, + "loss": 1.2169, + "num_input_tokens_seen": 119172496, + "step": 7407 + }, + { + "epoch": 0.5189174043622545, + "grad_norm": 5.986364364624023, + "learning_rate": 4.8156973730297725e-05, + "loss": 1.1855, + "num_input_tokens_seen": 119188880, + "step": 7408 + }, + { + "epoch": 0.5189874526079837, + "grad_norm": 4.029194355010986, + "learning_rate": 4.8149975481611214e-05, + "loss": 0.9322, + "num_input_tokens_seen": 119205264, + "step": 7409 + }, + { + "epoch": 0.519057500853713, + "grad_norm": 6.960198879241943, + "learning_rate": 4.814297723292469e-05, + "loss": 0.9012, + "num_input_tokens_seen": 119220560, + "step": 7410 + }, + { + "epoch": 0.5191275490994423, + "grad_norm": 3.8074264526367188, + "learning_rate": 4.8135978984238184e-05, + "loss": 1.0376, + "num_input_tokens_seen": 119236856, + "step": 7411 + }, + { + "epoch": 0.5191975973451715, + "grad_norm": 4.192657947540283, + "learning_rate": 4.812898073555167e-05, + "loss": 1.2593, + "num_input_tokens_seen": 119253240, + "step": 7412 + }, + { + "epoch": 0.5192676455909008, + "grad_norm": 4.494477272033691, + "learning_rate": 4.812198248686515e-05, + "loss": 1.1815, + "num_input_tokens_seen": 119269264, + "step": 7413 + }, + { + "epoch": 0.51933769383663, + "grad_norm": 3.808358907699585, + "learning_rate": 4.8114984238178637e-05, + "loss": 1.0039, + "num_input_tokens_seen": 119285384, + "step": 7414 + }, + { + "epoch": 0.5194077420823592, + "grad_norm": 4.097054481506348, + "learning_rate": 4.810798598949212e-05, + "loss": 1.2953, + "num_input_tokens_seen": 119301768, + "step": 7415 + }, + { + "epoch": 0.5194777903280885, + "grad_norm": 4.17371129989624, + "learning_rate": 4.810098774080561e-05, + "loss": 1.3124, + "num_input_tokens_seen": 119318152, + "step": 7416 + }, + { + "epoch": 0.5195478385738177, + "grad_norm": 3.7520711421966553, + "learning_rate": 4.8093989492119096e-05, + "loss": 0.9264, + "num_input_tokens_seen": 119334536, + "step": 7417 + }, + { + "epoch": 0.519617886819547, + "grad_norm": 3.5394372940063477, + "learning_rate": 4.808699124343258e-05, + "loss": 1.1861, + "num_input_tokens_seen": 119350888, + "step": 7418 + }, + { + "epoch": 0.5196879350652762, + "grad_norm": 3.490539312362671, + "learning_rate": 4.8079992994746066e-05, + "loss": 0.9393, + "num_input_tokens_seen": 119367272, + "step": 7419 + }, + { + "epoch": 0.5197579833110054, + "grad_norm": 3.7124273777008057, + "learning_rate": 4.807299474605954e-05, + "loss": 0.9956, + "num_input_tokens_seen": 119383656, + "step": 7420 + }, + { + "epoch": 0.5198280315567347, + "grad_norm": 4.070093631744385, + "learning_rate": 4.8065996497373036e-05, + "loss": 1.0084, + "num_input_tokens_seen": 119399440, + "step": 7421 + }, + { + "epoch": 0.5198980798024639, + "grad_norm": 7.456560134887695, + "learning_rate": 4.8058998248686525e-05, + "loss": 0.9437, + "num_input_tokens_seen": 119415232, + "step": 7422 + }, + { + "epoch": 0.5199681280481931, + "grad_norm": 3.812809705734253, + "learning_rate": 4.8052e-05, + "loss": 0.9875, + "num_input_tokens_seen": 119431032, + "step": 7423 + }, + { + "epoch": 0.5200381762939225, + "grad_norm": 4.641679763793945, + "learning_rate": 4.804500175131349e-05, + "loss": 0.8798, + "num_input_tokens_seen": 119447416, + "step": 7424 + }, + { + "epoch": 0.5201082245396517, + "grad_norm": 3.7425315380096436, + "learning_rate": 4.803800350262697e-05, + "loss": 0.922, + "num_input_tokens_seen": 119463400, + "step": 7425 + }, + { + "epoch": 0.520178272785381, + "grad_norm": 3.5548949241638184, + "learning_rate": 4.803100525394046e-05, + "loss": 0.8365, + "num_input_tokens_seen": 119479784, + "step": 7426 + }, + { + "epoch": 0.5202483210311102, + "grad_norm": 3.335888385772705, + "learning_rate": 4.802400700525395e-05, + "loss": 0.9923, + "num_input_tokens_seen": 119495928, + "step": 7427 + }, + { + "epoch": 0.5203183692768394, + "grad_norm": 3.8208446502685547, + "learning_rate": 4.801700875656743e-05, + "loss": 0.9784, + "num_input_tokens_seen": 119511408, + "step": 7428 + }, + { + "epoch": 0.5203884175225687, + "grad_norm": 4.35474967956543, + "learning_rate": 4.801001050788092e-05, + "loss": 1.1477, + "num_input_tokens_seen": 119526656, + "step": 7429 + }, + { + "epoch": 0.5204584657682979, + "grad_norm": 4.081501007080078, + "learning_rate": 4.800301225919439e-05, + "loss": 1.0881, + "num_input_tokens_seen": 119543040, + "step": 7430 + }, + { + "epoch": 0.5205285140140271, + "grad_norm": 4.1007866859436035, + "learning_rate": 4.799601401050789e-05, + "loss": 0.9335, + "num_input_tokens_seen": 119559040, + "step": 7431 + }, + { + "epoch": 0.5205985622597564, + "grad_norm": 3.955095052719116, + "learning_rate": 4.7989015761821363e-05, + "loss": 1.0092, + "num_input_tokens_seen": 119574832, + "step": 7432 + }, + { + "epoch": 0.5206686105054856, + "grad_norm": 5.475005626678467, + "learning_rate": 4.798201751313485e-05, + "loss": 1.1933, + "num_input_tokens_seen": 119590688, + "step": 7433 + }, + { + "epoch": 0.520738658751215, + "grad_norm": 4.125513553619385, + "learning_rate": 4.797501926444834e-05, + "loss": 0.9113, + "num_input_tokens_seen": 119606592, + "step": 7434 + }, + { + "epoch": 0.5208087069969441, + "grad_norm": 3.608366012573242, + "learning_rate": 4.796802101576182e-05, + "loss": 1.0557, + "num_input_tokens_seen": 119622976, + "step": 7435 + }, + { + "epoch": 0.5208787552426734, + "grad_norm": 5.827488899230957, + "learning_rate": 4.796102276707531e-05, + "loss": 1.044, + "num_input_tokens_seen": 119639360, + "step": 7436 + }, + { + "epoch": 0.5209488034884027, + "grad_norm": 4.732996463775635, + "learning_rate": 4.795402451838879e-05, + "loss": 0.7811, + "num_input_tokens_seen": 119654448, + "step": 7437 + }, + { + "epoch": 0.5210188517341319, + "grad_norm": 3.9436864852905273, + "learning_rate": 4.794702626970228e-05, + "loss": 0.9169, + "num_input_tokens_seen": 119670448, + "step": 7438 + }, + { + "epoch": 0.5210888999798611, + "grad_norm": 3.787825345993042, + "learning_rate": 4.794002802101577e-05, + "loss": 1.1469, + "num_input_tokens_seen": 119686832, + "step": 7439 + }, + { + "epoch": 0.5211589482255904, + "grad_norm": 3.7498011589050293, + "learning_rate": 4.7933029772329245e-05, + "loss": 0.944, + "num_input_tokens_seen": 119702776, + "step": 7440 + }, + { + "epoch": 0.5212289964713196, + "grad_norm": 4.7387285232543945, + "learning_rate": 4.792603152364274e-05, + "loss": 1.0428, + "num_input_tokens_seen": 119719160, + "step": 7441 + }, + { + "epoch": 0.5212990447170489, + "grad_norm": 4.916442394256592, + "learning_rate": 4.7919033274956215e-05, + "loss": 1.0036, + "num_input_tokens_seen": 119735208, + "step": 7442 + }, + { + "epoch": 0.5213690929627781, + "grad_norm": 7.069667339324951, + "learning_rate": 4.7912035026269704e-05, + "loss": 1.0664, + "num_input_tokens_seen": 119750720, + "step": 7443 + }, + { + "epoch": 0.5214391412085073, + "grad_norm": 4.178729057312012, + "learning_rate": 4.790503677758319e-05, + "loss": 0.946, + "num_input_tokens_seen": 119767104, + "step": 7444 + }, + { + "epoch": 0.5215091894542366, + "grad_norm": 4.8164801597595215, + "learning_rate": 4.7898038528896674e-05, + "loss": 1.2099, + "num_input_tokens_seen": 119782680, + "step": 7445 + }, + { + "epoch": 0.5215792376999658, + "grad_norm": 4.30715799331665, + "learning_rate": 4.789104028021016e-05, + "loss": 1.0225, + "num_input_tokens_seen": 119798392, + "step": 7446 + }, + { + "epoch": 0.5216492859456952, + "grad_norm": 4.270611763000488, + "learning_rate": 4.7884042031523645e-05, + "loss": 1.3242, + "num_input_tokens_seen": 119814776, + "step": 7447 + }, + { + "epoch": 0.5217193341914244, + "grad_norm": 3.3384652137756348, + "learning_rate": 4.7877043782837133e-05, + "loss": 0.9353, + "num_input_tokens_seen": 119831160, + "step": 7448 + }, + { + "epoch": 0.5217893824371536, + "grad_norm": 3.343555450439453, + "learning_rate": 4.787004553415062e-05, + "loss": 1.0108, + "num_input_tokens_seen": 119847544, + "step": 7449 + }, + { + "epoch": 0.5218594306828829, + "grad_norm": 4.26312255859375, + "learning_rate": 4.78630472854641e-05, + "loss": 1.1169, + "num_input_tokens_seen": 119863928, + "step": 7450 + }, + { + "epoch": 0.5219294789286121, + "grad_norm": 3.9083523750305176, + "learning_rate": 4.785604903677759e-05, + "loss": 1.01, + "num_input_tokens_seen": 119880312, + "step": 7451 + }, + { + "epoch": 0.5219995271743413, + "grad_norm": 4.212127208709717, + "learning_rate": 4.784905078809107e-05, + "loss": 1.1562, + "num_input_tokens_seen": 119895464, + "step": 7452 + }, + { + "epoch": 0.5220695754200706, + "grad_norm": 6.6233038902282715, + "learning_rate": 4.7842052539404556e-05, + "loss": 1.0112, + "num_input_tokens_seen": 119911848, + "step": 7453 + }, + { + "epoch": 0.5221396236657998, + "grad_norm": 3.770444631576538, + "learning_rate": 4.7835054290718045e-05, + "loss": 0.9608, + "num_input_tokens_seen": 119928232, + "step": 7454 + }, + { + "epoch": 0.5222096719115291, + "grad_norm": 3.3227531909942627, + "learning_rate": 4.7828056042031527e-05, + "loss": 0.9593, + "num_input_tokens_seen": 119944296, + "step": 7455 + }, + { + "epoch": 0.5222797201572583, + "grad_norm": 4.297872066497803, + "learning_rate": 4.7821057793345015e-05, + "loss": 1.1381, + "num_input_tokens_seen": 119960680, + "step": 7456 + }, + { + "epoch": 0.5223497684029875, + "grad_norm": 3.6508405208587646, + "learning_rate": 4.78140595446585e-05, + "loss": 0.9298, + "num_input_tokens_seen": 119977064, + "step": 7457 + }, + { + "epoch": 0.5224198166487168, + "grad_norm": 3.5275723934173584, + "learning_rate": 4.7807061295971986e-05, + "loss": 1.0049, + "num_input_tokens_seen": 119993448, + "step": 7458 + }, + { + "epoch": 0.522489864894446, + "grad_norm": 3.983844757080078, + "learning_rate": 4.780006304728546e-05, + "loss": 1.0362, + "num_input_tokens_seen": 120009832, + "step": 7459 + }, + { + "epoch": 0.5225599131401752, + "grad_norm": 3.6752452850341797, + "learning_rate": 4.779306479859895e-05, + "loss": 1.0864, + "num_input_tokens_seen": 120026216, + "step": 7460 + }, + { + "epoch": 0.5226299613859046, + "grad_norm": 3.3501710891723633, + "learning_rate": 4.7786066549912445e-05, + "loss": 0.8639, + "num_input_tokens_seen": 120042456, + "step": 7461 + }, + { + "epoch": 0.5227000096316338, + "grad_norm": 3.448544979095459, + "learning_rate": 4.777906830122592e-05, + "loss": 0.953, + "num_input_tokens_seen": 120058392, + "step": 7462 + }, + { + "epoch": 0.5227700578773631, + "grad_norm": 3.7892913818359375, + "learning_rate": 4.777207005253941e-05, + "loss": 0.9391, + "num_input_tokens_seen": 120074776, + "step": 7463 + }, + { + "epoch": 0.5228401061230923, + "grad_norm": 6.972007751464844, + "learning_rate": 4.776507180385289e-05, + "loss": 0.944, + "num_input_tokens_seen": 120089776, + "step": 7464 + }, + { + "epoch": 0.5229101543688215, + "grad_norm": 3.6271073818206787, + "learning_rate": 4.775807355516638e-05, + "loss": 1.1703, + "num_input_tokens_seen": 120106160, + "step": 7465 + }, + { + "epoch": 0.5229802026145508, + "grad_norm": 4.735227108001709, + "learning_rate": 4.775107530647987e-05, + "loss": 1.2245, + "num_input_tokens_seen": 120122144, + "step": 7466 + }, + { + "epoch": 0.52305025086028, + "grad_norm": 4.848446846008301, + "learning_rate": 4.774407705779335e-05, + "loss": 1.0117, + "num_input_tokens_seen": 120138528, + "step": 7467 + }, + { + "epoch": 0.5231202991060092, + "grad_norm": 3.5538604259490967, + "learning_rate": 4.773707880910684e-05, + "loss": 1.0153, + "num_input_tokens_seen": 120154912, + "step": 7468 + }, + { + "epoch": 0.5231903473517385, + "grad_norm": 4.481129169464111, + "learning_rate": 4.773008056042031e-05, + "loss": 1.1685, + "num_input_tokens_seen": 120171296, + "step": 7469 + }, + { + "epoch": 0.5232603955974677, + "grad_norm": 4.581019401550293, + "learning_rate": 4.77230823117338e-05, + "loss": 1.0292, + "num_input_tokens_seen": 120187680, + "step": 7470 + }, + { + "epoch": 0.523330443843197, + "grad_norm": 5.147364139556885, + "learning_rate": 4.7716084063047297e-05, + "loss": 0.9574, + "num_input_tokens_seen": 120204008, + "step": 7471 + }, + { + "epoch": 0.5234004920889263, + "grad_norm": 3.716172218322754, + "learning_rate": 4.770908581436077e-05, + "loss": 0.9937, + "num_input_tokens_seen": 120220392, + "step": 7472 + }, + { + "epoch": 0.5234705403346555, + "grad_norm": 5.15359354019165, + "learning_rate": 4.770208756567426e-05, + "loss": 1.0412, + "num_input_tokens_seen": 120236504, + "step": 7473 + }, + { + "epoch": 0.5235405885803848, + "grad_norm": 3.9450008869171143, + "learning_rate": 4.769508931698774e-05, + "loss": 1.2084, + "num_input_tokens_seen": 120252568, + "step": 7474 + }, + { + "epoch": 0.523610636826114, + "grad_norm": 7.154159069061279, + "learning_rate": 4.768809106830123e-05, + "loss": 1.0837, + "num_input_tokens_seen": 120268952, + "step": 7475 + }, + { + "epoch": 0.5236806850718432, + "grad_norm": 3.838291883468628, + "learning_rate": 4.768109281961472e-05, + "loss": 0.9896, + "num_input_tokens_seen": 120285336, + "step": 7476 + }, + { + "epoch": 0.5237507333175725, + "grad_norm": 4.197968482971191, + "learning_rate": 4.76740945709282e-05, + "loss": 1.0913, + "num_input_tokens_seen": 120301720, + "step": 7477 + }, + { + "epoch": 0.5238207815633017, + "grad_norm": 4.732188701629639, + "learning_rate": 4.766709632224169e-05, + "loss": 1.2257, + "num_input_tokens_seen": 120316968, + "step": 7478 + }, + { + "epoch": 0.523890829809031, + "grad_norm": 4.164662837982178, + "learning_rate": 4.7660098073555165e-05, + "loss": 1.0551, + "num_input_tokens_seen": 120333352, + "step": 7479 + }, + { + "epoch": 0.5239608780547602, + "grad_norm": 3.6521334648132324, + "learning_rate": 4.765309982486865e-05, + "loss": 0.9889, + "num_input_tokens_seen": 120349736, + "step": 7480 + }, + { + "epoch": 0.5240309263004894, + "grad_norm": 3.6896276473999023, + "learning_rate": 4.764610157618215e-05, + "loss": 0.8846, + "num_input_tokens_seen": 120366120, + "step": 7481 + }, + { + "epoch": 0.5241009745462187, + "grad_norm": 4.352004051208496, + "learning_rate": 4.7639103327495624e-05, + "loss": 1.0483, + "num_input_tokens_seen": 120382360, + "step": 7482 + }, + { + "epoch": 0.5241710227919479, + "grad_norm": 6.433780670166016, + "learning_rate": 4.763210507880911e-05, + "loss": 1.1227, + "num_input_tokens_seen": 120397904, + "step": 7483 + }, + { + "epoch": 0.5242410710376773, + "grad_norm": 5.6717329025268555, + "learning_rate": 4.7625106830122594e-05, + "loss": 1.0687, + "num_input_tokens_seen": 120414248, + "step": 7484 + }, + { + "epoch": 0.5243111192834065, + "grad_norm": 3.4725184440612793, + "learning_rate": 4.761810858143608e-05, + "loss": 0.8943, + "num_input_tokens_seen": 120430632, + "step": 7485 + }, + { + "epoch": 0.5243811675291357, + "grad_norm": 3.803506374359131, + "learning_rate": 4.761111033274956e-05, + "loss": 1.0, + "num_input_tokens_seen": 120447016, + "step": 7486 + }, + { + "epoch": 0.524451215774865, + "grad_norm": 5.165005683898926, + "learning_rate": 4.760411208406305e-05, + "loss": 1.0664, + "num_input_tokens_seen": 120463400, + "step": 7487 + }, + { + "epoch": 0.5245212640205942, + "grad_norm": 6.133605480194092, + "learning_rate": 4.759711383537654e-05, + "loss": 1.0436, + "num_input_tokens_seen": 120478432, + "step": 7488 + }, + { + "epoch": 0.5245913122663234, + "grad_norm": 4.061281681060791, + "learning_rate": 4.7590115586690017e-05, + "loss": 1.1429, + "num_input_tokens_seen": 120494816, + "step": 7489 + }, + { + "epoch": 0.5246613605120527, + "grad_norm": 3.2192203998565674, + "learning_rate": 4.7583117338003505e-05, + "loss": 0.9059, + "num_input_tokens_seen": 120511200, + "step": 7490 + }, + { + "epoch": 0.5247314087577819, + "grad_norm": 3.718182325363159, + "learning_rate": 4.757611908931699e-05, + "loss": 0.9153, + "num_input_tokens_seen": 120527584, + "step": 7491 + }, + { + "epoch": 0.5248014570035112, + "grad_norm": 3.742267370223999, + "learning_rate": 4.7569120840630476e-05, + "loss": 0.9785, + "num_input_tokens_seen": 120543968, + "step": 7492 + }, + { + "epoch": 0.5248715052492404, + "grad_norm": 5.5869951248168945, + "learning_rate": 4.7562122591943964e-05, + "loss": 1.009, + "num_input_tokens_seen": 120559120, + "step": 7493 + }, + { + "epoch": 0.5249415534949696, + "grad_norm": 3.9366302490234375, + "learning_rate": 4.7555124343257446e-05, + "loss": 1.1197, + "num_input_tokens_seen": 120575504, + "step": 7494 + }, + { + "epoch": 0.5250116017406989, + "grad_norm": 5.102993488311768, + "learning_rate": 4.7548126094570935e-05, + "loss": 1.0065, + "num_input_tokens_seen": 120591592, + "step": 7495 + }, + { + "epoch": 0.5250816499864281, + "grad_norm": 3.392009735107422, + "learning_rate": 4.754112784588441e-05, + "loss": 0.9957, + "num_input_tokens_seen": 120607976, + "step": 7496 + }, + { + "epoch": 0.5251516982321573, + "grad_norm": 5.089282512664795, + "learning_rate": 4.7534129597197905e-05, + "loss": 0.9714, + "num_input_tokens_seen": 120624360, + "step": 7497 + }, + { + "epoch": 0.5252217464778867, + "grad_norm": 4.311940670013428, + "learning_rate": 4.7527131348511394e-05, + "loss": 0.9564, + "num_input_tokens_seen": 120640744, + "step": 7498 + }, + { + "epoch": 0.5252917947236159, + "grad_norm": 3.8907923698425293, + "learning_rate": 4.752013309982487e-05, + "loss": 1.1852, + "num_input_tokens_seen": 120656992, + "step": 7499 + }, + { + "epoch": 0.5253618429693452, + "grad_norm": 3.856172800064087, + "learning_rate": 4.751313485113836e-05, + "loss": 1.2247, + "num_input_tokens_seen": 120673376, + "step": 7500 + }, + { + "epoch": 0.5254318912150744, + "grad_norm": 3.661641836166382, + "learning_rate": 4.750613660245184e-05, + "loss": 1.0106, + "num_input_tokens_seen": 120689760, + "step": 7501 + }, + { + "epoch": 0.5255019394608036, + "grad_norm": 3.8533976078033447, + "learning_rate": 4.749913835376533e-05, + "loss": 0.9485, + "num_input_tokens_seen": 120706144, + "step": 7502 + }, + { + "epoch": 0.5255719877065329, + "grad_norm": 4.209764003753662, + "learning_rate": 4.7492140105078816e-05, + "loss": 1.0623, + "num_input_tokens_seen": 120722528, + "step": 7503 + }, + { + "epoch": 0.5256420359522621, + "grad_norm": 4.190296649932861, + "learning_rate": 4.74851418563923e-05, + "loss": 1.0135, + "num_input_tokens_seen": 120738912, + "step": 7504 + }, + { + "epoch": 0.5257120841979913, + "grad_norm": 3.971188545227051, + "learning_rate": 4.747814360770579e-05, + "loss": 0.8885, + "num_input_tokens_seen": 120754464, + "step": 7505 + }, + { + "epoch": 0.5257821324437206, + "grad_norm": 3.8005099296569824, + "learning_rate": 4.747114535901926e-05, + "loss": 0.8758, + "num_input_tokens_seen": 120770848, + "step": 7506 + }, + { + "epoch": 0.5258521806894498, + "grad_norm": 4.144433498382568, + "learning_rate": 4.746414711033276e-05, + "loss": 1.2437, + "num_input_tokens_seen": 120787232, + "step": 7507 + }, + { + "epoch": 0.5259222289351791, + "grad_norm": 4.963449001312256, + "learning_rate": 4.7457148861646246e-05, + "loss": 1.2806, + "num_input_tokens_seen": 120802792, + "step": 7508 + }, + { + "epoch": 0.5259922771809084, + "grad_norm": 6.420035362243652, + "learning_rate": 4.745015061295972e-05, + "loss": 1.1617, + "num_input_tokens_seen": 120819176, + "step": 7509 + }, + { + "epoch": 0.5260623254266376, + "grad_norm": 3.493263006210327, + "learning_rate": 4.744315236427321e-05, + "loss": 0.9335, + "num_input_tokens_seen": 120835560, + "step": 7510 + }, + { + "epoch": 0.5261323736723669, + "grad_norm": 5.055440425872803, + "learning_rate": 4.743615411558669e-05, + "loss": 0.9479, + "num_input_tokens_seen": 120851752, + "step": 7511 + }, + { + "epoch": 0.5262024219180961, + "grad_norm": 3.632171869277954, + "learning_rate": 4.742915586690018e-05, + "loss": 1.0991, + "num_input_tokens_seen": 120867568, + "step": 7512 + }, + { + "epoch": 0.5262724701638254, + "grad_norm": 4.197231769561768, + "learning_rate": 4.742215761821366e-05, + "loss": 1.1136, + "num_input_tokens_seen": 120883952, + "step": 7513 + }, + { + "epoch": 0.5263425184095546, + "grad_norm": 5.476736068725586, + "learning_rate": 4.741515936952715e-05, + "loss": 0.9664, + "num_input_tokens_seen": 120899584, + "step": 7514 + }, + { + "epoch": 0.5264125666552838, + "grad_norm": 3.98996901512146, + "learning_rate": 4.740816112084064e-05, + "loss": 1.0942, + "num_input_tokens_seen": 120915968, + "step": 7515 + }, + { + "epoch": 0.5264826149010131, + "grad_norm": 3.881070375442505, + "learning_rate": 4.7401162872154114e-05, + "loss": 1.3225, + "num_input_tokens_seen": 120932352, + "step": 7516 + }, + { + "epoch": 0.5265526631467423, + "grad_norm": 4.582496643066406, + "learning_rate": 4.739416462346761e-05, + "loss": 1.1441, + "num_input_tokens_seen": 120948736, + "step": 7517 + }, + { + "epoch": 0.5266227113924715, + "grad_norm": 3.413455009460449, + "learning_rate": 4.7387166374781084e-05, + "loss": 0.8994, + "num_input_tokens_seen": 120964928, + "step": 7518 + }, + { + "epoch": 0.5266927596382008, + "grad_norm": 3.806334972381592, + "learning_rate": 4.738016812609457e-05, + "loss": 1.1016, + "num_input_tokens_seen": 120981312, + "step": 7519 + }, + { + "epoch": 0.52676280788393, + "grad_norm": 4.136782169342041, + "learning_rate": 4.737316987740806e-05, + "loss": 1.3558, + "num_input_tokens_seen": 120997696, + "step": 7520 + }, + { + "epoch": 0.5268328561296594, + "grad_norm": 4.850030899047852, + "learning_rate": 4.736617162872154e-05, + "loss": 0.9764, + "num_input_tokens_seen": 121013624, + "step": 7521 + }, + { + "epoch": 0.5269029043753886, + "grad_norm": 3.9870705604553223, + "learning_rate": 4.735917338003503e-05, + "loss": 1.122, + "num_input_tokens_seen": 121030008, + "step": 7522 + }, + { + "epoch": 0.5269729526211178, + "grad_norm": 3.778407573699951, + "learning_rate": 4.7352175131348513e-05, + "loss": 1.0158, + "num_input_tokens_seen": 121046168, + "step": 7523 + }, + { + "epoch": 0.5270430008668471, + "grad_norm": 4.619656085968018, + "learning_rate": 4.7345176882662e-05, + "loss": 1.0316, + "num_input_tokens_seen": 121061544, + "step": 7524 + }, + { + "epoch": 0.5271130491125763, + "grad_norm": 3.794745683670044, + "learning_rate": 4.733817863397549e-05, + "loss": 0.9998, + "num_input_tokens_seen": 121077928, + "step": 7525 + }, + { + "epoch": 0.5271830973583055, + "grad_norm": 5.526957988739014, + "learning_rate": 4.7331180385288966e-05, + "loss": 0.9366, + "num_input_tokens_seen": 121094312, + "step": 7526 + }, + { + "epoch": 0.5272531456040348, + "grad_norm": 4.695877552032471, + "learning_rate": 4.732418213660246e-05, + "loss": 0.9802, + "num_input_tokens_seen": 121110696, + "step": 7527 + }, + { + "epoch": 0.527323193849764, + "grad_norm": 4.297723293304443, + "learning_rate": 4.7317183887915936e-05, + "loss": 0.9783, + "num_input_tokens_seen": 121126832, + "step": 7528 + }, + { + "epoch": 0.5273932420954933, + "grad_norm": 3.9494822025299072, + "learning_rate": 4.7310185639229425e-05, + "loss": 1.0655, + "num_input_tokens_seen": 121143192, + "step": 7529 + }, + { + "epoch": 0.5274632903412225, + "grad_norm": 3.572096347808838, + "learning_rate": 4.730318739054291e-05, + "loss": 0.9569, + "num_input_tokens_seen": 121159552, + "step": 7530 + }, + { + "epoch": 0.5275333385869517, + "grad_norm": 4.53004789352417, + "learning_rate": 4.7296189141856395e-05, + "loss": 0.9207, + "num_input_tokens_seen": 121175744, + "step": 7531 + }, + { + "epoch": 0.527603386832681, + "grad_norm": 4.6228203773498535, + "learning_rate": 4.7289190893169884e-05, + "loss": 1.0224, + "num_input_tokens_seen": 121190304, + "step": 7532 + }, + { + "epoch": 0.5276734350784102, + "grad_norm": 3.7169201374053955, + "learning_rate": 4.7282192644483366e-05, + "loss": 1.0556, + "num_input_tokens_seen": 121206688, + "step": 7533 + }, + { + "epoch": 0.5277434833241395, + "grad_norm": 4.113621234893799, + "learning_rate": 4.7275194395796854e-05, + "loss": 0.8171, + "num_input_tokens_seen": 121223072, + "step": 7534 + }, + { + "epoch": 0.5278135315698688, + "grad_norm": 3.70991849899292, + "learning_rate": 4.726819614711034e-05, + "loss": 1.0469, + "num_input_tokens_seen": 121239456, + "step": 7535 + }, + { + "epoch": 0.527883579815598, + "grad_norm": 4.051577091217041, + "learning_rate": 4.726119789842382e-05, + "loss": 1.192, + "num_input_tokens_seen": 121255840, + "step": 7536 + }, + { + "epoch": 0.5279536280613273, + "grad_norm": 5.249835968017578, + "learning_rate": 4.725419964973731e-05, + "loss": 1.1282, + "num_input_tokens_seen": 121271664, + "step": 7537 + }, + { + "epoch": 0.5280236763070565, + "grad_norm": 3.727388381958008, + "learning_rate": 4.724720140105079e-05, + "loss": 1.1146, + "num_input_tokens_seen": 121288048, + "step": 7538 + }, + { + "epoch": 0.5280937245527857, + "grad_norm": 3.91412615776062, + "learning_rate": 4.724020315236428e-05, + "loss": 1.002, + "num_input_tokens_seen": 121304432, + "step": 7539 + }, + { + "epoch": 0.528163772798515, + "grad_norm": 4.474177360534668, + "learning_rate": 4.723320490367776e-05, + "loss": 1.1398, + "num_input_tokens_seen": 121320816, + "step": 7540 + }, + { + "epoch": 0.5282338210442442, + "grad_norm": 3.860063314437866, + "learning_rate": 4.722620665499125e-05, + "loss": 0.9238, + "num_input_tokens_seen": 121337200, + "step": 7541 + }, + { + "epoch": 0.5283038692899734, + "grad_norm": 3.4258196353912354, + "learning_rate": 4.7219208406304736e-05, + "loss": 0.8809, + "num_input_tokens_seen": 121353584, + "step": 7542 + }, + { + "epoch": 0.5283739175357027, + "grad_norm": 3.8684587478637695, + "learning_rate": 4.721221015761822e-05, + "loss": 1.1276, + "num_input_tokens_seen": 121369968, + "step": 7543 + }, + { + "epoch": 0.5284439657814319, + "grad_norm": 3.713254451751709, + "learning_rate": 4.7205211908931706e-05, + "loss": 0.9036, + "num_input_tokens_seen": 121386352, + "step": 7544 + }, + { + "epoch": 0.5285140140271612, + "grad_norm": 4.29981803894043, + "learning_rate": 4.719821366024518e-05, + "loss": 0.9864, + "num_input_tokens_seen": 121402632, + "step": 7545 + }, + { + "epoch": 0.5285840622728905, + "grad_norm": 3.9872469902038574, + "learning_rate": 4.719121541155867e-05, + "loss": 1.1266, + "num_input_tokens_seen": 121419016, + "step": 7546 + }, + { + "epoch": 0.5286541105186197, + "grad_norm": 5.582516193389893, + "learning_rate": 4.7184217162872165e-05, + "loss": 1.0141, + "num_input_tokens_seen": 121434168, + "step": 7547 + }, + { + "epoch": 0.528724158764349, + "grad_norm": 3.623431921005249, + "learning_rate": 4.717721891418564e-05, + "loss": 1.021, + "num_input_tokens_seen": 121450552, + "step": 7548 + }, + { + "epoch": 0.5287942070100782, + "grad_norm": 4.299187660217285, + "learning_rate": 4.717022066549913e-05, + "loss": 0.9786, + "num_input_tokens_seen": 121466488, + "step": 7549 + }, + { + "epoch": 0.5288642552558075, + "grad_norm": 3.768704652786255, + "learning_rate": 4.716322241681261e-05, + "loss": 1.3006, + "num_input_tokens_seen": 121482872, + "step": 7550 + }, + { + "epoch": 0.5289343035015367, + "grad_norm": 4.494194507598877, + "learning_rate": 4.71562241681261e-05, + "loss": 1.2086, + "num_input_tokens_seen": 121499256, + "step": 7551 + }, + { + "epoch": 0.5290043517472659, + "grad_norm": 3.676561117172241, + "learning_rate": 4.714922591943959e-05, + "loss": 0.9644, + "num_input_tokens_seen": 121515272, + "step": 7552 + }, + { + "epoch": 0.5290743999929952, + "grad_norm": 5.0016961097717285, + "learning_rate": 4.714222767075307e-05, + "loss": 0.8883, + "num_input_tokens_seen": 121531544, + "step": 7553 + }, + { + "epoch": 0.5291444482387244, + "grad_norm": 3.8496031761169434, + "learning_rate": 4.713522942206656e-05, + "loss": 1.1284, + "num_input_tokens_seen": 121547928, + "step": 7554 + }, + { + "epoch": 0.5292144964844536, + "grad_norm": 5.569375514984131, + "learning_rate": 4.712823117338003e-05, + "loss": 1.227, + "num_input_tokens_seen": 121564312, + "step": 7555 + }, + { + "epoch": 0.5292845447301829, + "grad_norm": 3.6076838970184326, + "learning_rate": 4.712123292469352e-05, + "loss": 1.0773, + "num_input_tokens_seen": 121580688, + "step": 7556 + }, + { + "epoch": 0.5293545929759121, + "grad_norm": 3.5435140132904053, + "learning_rate": 4.711423467600701e-05, + "loss": 0.9505, + "num_input_tokens_seen": 121596496, + "step": 7557 + }, + { + "epoch": 0.5294246412216415, + "grad_norm": 3.233835458755493, + "learning_rate": 4.710723642732049e-05, + "loss": 0.9657, + "num_input_tokens_seen": 121612880, + "step": 7558 + }, + { + "epoch": 0.5294946894673707, + "grad_norm": 5.496852874755859, + "learning_rate": 4.710023817863398e-05, + "loss": 1.0778, + "num_input_tokens_seen": 121629264, + "step": 7559 + }, + { + "epoch": 0.5295647377130999, + "grad_norm": 5.445659637451172, + "learning_rate": 4.709323992994746e-05, + "loss": 1.0318, + "num_input_tokens_seen": 121645128, + "step": 7560 + }, + { + "epoch": 0.5296347859588292, + "grad_norm": 3.8428354263305664, + "learning_rate": 4.708624168126095e-05, + "loss": 1.0299, + "num_input_tokens_seen": 121661504, + "step": 7561 + }, + { + "epoch": 0.5297048342045584, + "grad_norm": 3.609997272491455, + "learning_rate": 4.707924343257444e-05, + "loss": 1.0052, + "num_input_tokens_seen": 121677888, + "step": 7562 + }, + { + "epoch": 0.5297748824502876, + "grad_norm": 4.154750823974609, + "learning_rate": 4.707224518388792e-05, + "loss": 1.0849, + "num_input_tokens_seen": 121693328, + "step": 7563 + }, + { + "epoch": 0.5298449306960169, + "grad_norm": 4.595134735107422, + "learning_rate": 4.706524693520141e-05, + "loss": 0.9786, + "num_input_tokens_seen": 121709512, + "step": 7564 + }, + { + "epoch": 0.5299149789417461, + "grad_norm": 3.6196346282958984, + "learning_rate": 4.7058248686514885e-05, + "loss": 1.0239, + "num_input_tokens_seen": 121725272, + "step": 7565 + }, + { + "epoch": 0.5299850271874754, + "grad_norm": 3.8893964290618896, + "learning_rate": 4.7051250437828374e-05, + "loss": 0.9623, + "num_input_tokens_seen": 121741656, + "step": 7566 + }, + { + "epoch": 0.5300550754332046, + "grad_norm": 3.8492813110351562, + "learning_rate": 4.7044252189141856e-05, + "loss": 1.0309, + "num_input_tokens_seen": 121758040, + "step": 7567 + }, + { + "epoch": 0.5301251236789338, + "grad_norm": 3.920822858810425, + "learning_rate": 4.7037253940455344e-05, + "loss": 1.0483, + "num_input_tokens_seen": 121773736, + "step": 7568 + }, + { + "epoch": 0.5301951719246631, + "grad_norm": 5.269485950469971, + "learning_rate": 4.703025569176883e-05, + "loss": 1.0504, + "num_input_tokens_seen": 121790120, + "step": 7569 + }, + { + "epoch": 0.5302652201703923, + "grad_norm": 4.920991897583008, + "learning_rate": 4.7023257443082315e-05, + "loss": 0.8157, + "num_input_tokens_seen": 121806504, + "step": 7570 + }, + { + "epoch": 0.5303352684161216, + "grad_norm": 3.8457534313201904, + "learning_rate": 4.70162591943958e-05, + "loss": 1.0428, + "num_input_tokens_seen": 121822256, + "step": 7571 + }, + { + "epoch": 0.5304053166618509, + "grad_norm": 3.80702543258667, + "learning_rate": 4.700926094570928e-05, + "loss": 1.1466, + "num_input_tokens_seen": 121838640, + "step": 7572 + }, + { + "epoch": 0.5304753649075801, + "grad_norm": 3.8683180809020996, + "learning_rate": 4.7002262697022774e-05, + "loss": 0.8946, + "num_input_tokens_seen": 121854704, + "step": 7573 + }, + { + "epoch": 0.5305454131533094, + "grad_norm": 6.011785507202148, + "learning_rate": 4.699526444833626e-05, + "loss": 1.0269, + "num_input_tokens_seen": 121871088, + "step": 7574 + }, + { + "epoch": 0.5306154613990386, + "grad_norm": 4.970396995544434, + "learning_rate": 4.698826619964974e-05, + "loss": 0.9285, + "num_input_tokens_seen": 121887240, + "step": 7575 + }, + { + "epoch": 0.5306855096447678, + "grad_norm": 4.267600059509277, + "learning_rate": 4.6981267950963226e-05, + "loss": 1.2008, + "num_input_tokens_seen": 121903624, + "step": 7576 + }, + { + "epoch": 0.5307555578904971, + "grad_norm": 4.618432998657227, + "learning_rate": 4.697426970227671e-05, + "loss": 1.0197, + "num_input_tokens_seen": 121919656, + "step": 7577 + }, + { + "epoch": 0.5308256061362263, + "grad_norm": 3.775972366333008, + "learning_rate": 4.6967271453590196e-05, + "loss": 0.9733, + "num_input_tokens_seen": 121935992, + "step": 7578 + }, + { + "epoch": 0.5308956543819555, + "grad_norm": 4.6172356605529785, + "learning_rate": 4.6960273204903685e-05, + "loss": 1.1194, + "num_input_tokens_seen": 121952376, + "step": 7579 + }, + { + "epoch": 0.5309657026276848, + "grad_norm": 4.867498874664307, + "learning_rate": 4.695327495621717e-05, + "loss": 1.1188, + "num_input_tokens_seen": 121968760, + "step": 7580 + }, + { + "epoch": 0.531035750873414, + "grad_norm": 4.13311767578125, + "learning_rate": 4.6946276707530655e-05, + "loss": 1.0245, + "num_input_tokens_seen": 121984304, + "step": 7581 + }, + { + "epoch": 0.5311057991191434, + "grad_norm": 3.957585573196411, + "learning_rate": 4.693927845884413e-05, + "loss": 0.8689, + "num_input_tokens_seen": 122000656, + "step": 7582 + }, + { + "epoch": 0.5311758473648726, + "grad_norm": 4.368579387664795, + "learning_rate": 4.693228021015762e-05, + "loss": 0.996, + "num_input_tokens_seen": 122016576, + "step": 7583 + }, + { + "epoch": 0.5312458956106018, + "grad_norm": 3.653543710708618, + "learning_rate": 4.6925281961471114e-05, + "loss": 1.034, + "num_input_tokens_seen": 122032784, + "step": 7584 + }, + { + "epoch": 0.5313159438563311, + "grad_norm": 3.5882821083068848, + "learning_rate": 4.691828371278459e-05, + "loss": 0.8917, + "num_input_tokens_seen": 122049120, + "step": 7585 + }, + { + "epoch": 0.5313859921020603, + "grad_norm": 3.252802848815918, + "learning_rate": 4.691128546409808e-05, + "loss": 0.9816, + "num_input_tokens_seen": 122065504, + "step": 7586 + }, + { + "epoch": 0.5314560403477896, + "grad_norm": 3.5019781589508057, + "learning_rate": 4.690428721541156e-05, + "loss": 0.912, + "num_input_tokens_seen": 122080800, + "step": 7587 + }, + { + "epoch": 0.5315260885935188, + "grad_norm": 3.470921277999878, + "learning_rate": 4.689728896672505e-05, + "loss": 1.0494, + "num_input_tokens_seen": 122097184, + "step": 7588 + }, + { + "epoch": 0.531596136839248, + "grad_norm": 3.5450100898742676, + "learning_rate": 4.689029071803854e-05, + "loss": 0.9291, + "num_input_tokens_seen": 122113568, + "step": 7589 + }, + { + "epoch": 0.5316661850849773, + "grad_norm": 4.1188578605651855, + "learning_rate": 4.688329246935202e-05, + "loss": 1.0393, + "num_input_tokens_seen": 122129800, + "step": 7590 + }, + { + "epoch": 0.5317362333307065, + "grad_norm": 4.097812175750732, + "learning_rate": 4.687629422066551e-05, + "loss": 1.0745, + "num_input_tokens_seen": 122145784, + "step": 7591 + }, + { + "epoch": 0.5318062815764357, + "grad_norm": 3.929668664932251, + "learning_rate": 4.686929597197898e-05, + "loss": 1.0934, + "num_input_tokens_seen": 122162168, + "step": 7592 + }, + { + "epoch": 0.531876329822165, + "grad_norm": 5.706707954406738, + "learning_rate": 4.686229772329247e-05, + "loss": 1.007, + "num_input_tokens_seen": 122178552, + "step": 7593 + }, + { + "epoch": 0.5319463780678942, + "grad_norm": 4.066921234130859, + "learning_rate": 4.685529947460595e-05, + "loss": 0.9495, + "num_input_tokens_seen": 122194936, + "step": 7594 + }, + { + "epoch": 0.5320164263136236, + "grad_norm": 3.8690404891967773, + "learning_rate": 4.684830122591944e-05, + "loss": 1.2008, + "num_input_tokens_seen": 122211184, + "step": 7595 + }, + { + "epoch": 0.5320864745593528, + "grad_norm": 5.118635654449463, + "learning_rate": 4.684130297723293e-05, + "loss": 0.9476, + "num_input_tokens_seen": 122227152, + "step": 7596 + }, + { + "epoch": 0.532156522805082, + "grad_norm": 3.9654159545898438, + "learning_rate": 4.683430472854641e-05, + "loss": 1.1422, + "num_input_tokens_seen": 122242952, + "step": 7597 + }, + { + "epoch": 0.5322265710508113, + "grad_norm": 5.006682872772217, + "learning_rate": 4.68273064798599e-05, + "loss": 1.2078, + "num_input_tokens_seen": 122259104, + "step": 7598 + }, + { + "epoch": 0.5322966192965405, + "grad_norm": 4.747622013092041, + "learning_rate": 4.6820308231173375e-05, + "loss": 1.3191, + "num_input_tokens_seen": 122274320, + "step": 7599 + }, + { + "epoch": 0.5323666675422697, + "grad_norm": 4.334118843078613, + "learning_rate": 4.681330998248687e-05, + "loss": 0.9368, + "num_input_tokens_seen": 122290376, + "step": 7600 + }, + { + "epoch": 0.5323666675422697, + "eval_loss": 1.120592474937439, + "eval_runtime": 0.4292, + "eval_samples_per_second": 2.33, + "eval_steps_per_second": 2.33, + "num_input_tokens_seen": 122290376, + "step": 7600 + }, + { + "epoch": 0.532436715787999, + "grad_norm": 3.4493050575256348, + "learning_rate": 4.680631173380036e-05, + "loss": 0.9147, + "num_input_tokens_seen": 122306760, + "step": 7601 + }, + { + "epoch": 0.5325067640337282, + "grad_norm": 3.933213949203491, + "learning_rate": 4.6799313485113834e-05, + "loss": 1.2942, + "num_input_tokens_seen": 122322784, + "step": 7602 + }, + { + "epoch": 0.5325768122794575, + "grad_norm": 3.874788284301758, + "learning_rate": 4.679231523642732e-05, + "loss": 1.2438, + "num_input_tokens_seen": 122339168, + "step": 7603 + }, + { + "epoch": 0.5326468605251867, + "grad_norm": 4.443728446960449, + "learning_rate": 4.6785316987740805e-05, + "loss": 1.138, + "num_input_tokens_seen": 122355552, + "step": 7604 + }, + { + "epoch": 0.5327169087709159, + "grad_norm": 3.7982730865478516, + "learning_rate": 4.677831873905429e-05, + "loss": 1.105, + "num_input_tokens_seen": 122371936, + "step": 7605 + }, + { + "epoch": 0.5327869570166452, + "grad_norm": 3.4133870601654053, + "learning_rate": 4.677132049036778e-05, + "loss": 1.0804, + "num_input_tokens_seen": 122388192, + "step": 7606 + }, + { + "epoch": 0.5328570052623745, + "grad_norm": 5.178568363189697, + "learning_rate": 4.6764322241681264e-05, + "loss": 0.9356, + "num_input_tokens_seen": 122404576, + "step": 7607 + }, + { + "epoch": 0.5329270535081037, + "grad_norm": 3.9049737453460693, + "learning_rate": 4.675732399299475e-05, + "loss": 0.9998, + "num_input_tokens_seen": 122420960, + "step": 7608 + }, + { + "epoch": 0.532997101753833, + "grad_norm": 3.8209729194641113, + "learning_rate": 4.675032574430823e-05, + "loss": 0.9225, + "num_input_tokens_seen": 122436808, + "step": 7609 + }, + { + "epoch": 0.5330671499995622, + "grad_norm": 5.2931389808654785, + "learning_rate": 4.674332749562172e-05, + "loss": 1.225, + "num_input_tokens_seen": 122453192, + "step": 7610 + }, + { + "epoch": 0.5331371982452915, + "grad_norm": 3.608839273452759, + "learning_rate": 4.673632924693521e-05, + "loss": 1.0053, + "num_input_tokens_seen": 122469576, + "step": 7611 + }, + { + "epoch": 0.5332072464910207, + "grad_norm": 3.75544810295105, + "learning_rate": 4.6729330998248686e-05, + "loss": 1.0932, + "num_input_tokens_seen": 122485960, + "step": 7612 + }, + { + "epoch": 0.5332772947367499, + "grad_norm": 4.498108863830566, + "learning_rate": 4.6722332749562175e-05, + "loss": 1.1613, + "num_input_tokens_seen": 122502344, + "step": 7613 + }, + { + "epoch": 0.5333473429824792, + "grad_norm": 7.5673909187316895, + "learning_rate": 4.671533450087566e-05, + "loss": 1.1696, + "num_input_tokens_seen": 122518728, + "step": 7614 + }, + { + "epoch": 0.5334173912282084, + "grad_norm": 6.213915824890137, + "learning_rate": 4.6708336252189145e-05, + "loss": 0.9305, + "num_input_tokens_seen": 122534992, + "step": 7615 + }, + { + "epoch": 0.5334874394739376, + "grad_norm": 3.739473342895508, + "learning_rate": 4.6701338003502634e-05, + "loss": 1.0961, + "num_input_tokens_seen": 122551376, + "step": 7616 + }, + { + "epoch": 0.5335574877196669, + "grad_norm": 4.1125617027282715, + "learning_rate": 4.6694339754816116e-05, + "loss": 1.0056, + "num_input_tokens_seen": 122567760, + "step": 7617 + }, + { + "epoch": 0.5336275359653961, + "grad_norm": 3.5337769985198975, + "learning_rate": 4.6687341506129604e-05, + "loss": 0.9145, + "num_input_tokens_seen": 122583992, + "step": 7618 + }, + { + "epoch": 0.5336975842111255, + "grad_norm": 3.6304452419281006, + "learning_rate": 4.668034325744308e-05, + "loss": 1.0398, + "num_input_tokens_seen": 122600376, + "step": 7619 + }, + { + "epoch": 0.5337676324568547, + "grad_norm": 4.323266983032227, + "learning_rate": 4.6673345008756575e-05, + "loss": 1.1597, + "num_input_tokens_seen": 122616760, + "step": 7620 + }, + { + "epoch": 0.5338376807025839, + "grad_norm": 3.598428249359131, + "learning_rate": 4.666634676007005e-05, + "loss": 1.0098, + "num_input_tokens_seen": 122633144, + "step": 7621 + }, + { + "epoch": 0.5339077289483132, + "grad_norm": 4.279503345489502, + "learning_rate": 4.665934851138354e-05, + "loss": 1.1214, + "num_input_tokens_seen": 122649528, + "step": 7622 + }, + { + "epoch": 0.5339777771940424, + "grad_norm": 5.056297302246094, + "learning_rate": 4.665235026269703e-05, + "loss": 1.0469, + "num_input_tokens_seen": 122665912, + "step": 7623 + }, + { + "epoch": 0.5340478254397717, + "grad_norm": 7.37079381942749, + "learning_rate": 4.664535201401051e-05, + "loss": 1.168, + "num_input_tokens_seen": 122681968, + "step": 7624 + }, + { + "epoch": 0.5341178736855009, + "grad_norm": 4.478328227996826, + "learning_rate": 4.6638353765324e-05, + "loss": 1.2127, + "num_input_tokens_seen": 122698352, + "step": 7625 + }, + { + "epoch": 0.5341879219312301, + "grad_norm": 5.164111614227295, + "learning_rate": 4.663135551663748e-05, + "loss": 1.0905, + "num_input_tokens_seen": 122714736, + "step": 7626 + }, + { + "epoch": 0.5342579701769594, + "grad_norm": 6.489926815032959, + "learning_rate": 4.662435726795097e-05, + "loss": 0.95, + "num_input_tokens_seen": 122731120, + "step": 7627 + }, + { + "epoch": 0.5343280184226886, + "grad_norm": 5.209092140197754, + "learning_rate": 4.6617359019264456e-05, + "loss": 1.117, + "num_input_tokens_seen": 122747504, + "step": 7628 + }, + { + "epoch": 0.5343980666684178, + "grad_norm": 4.007065773010254, + "learning_rate": 4.661036077057793e-05, + "loss": 0.9271, + "num_input_tokens_seen": 122763864, + "step": 7629 + }, + { + "epoch": 0.5344681149141471, + "grad_norm": 4.077016830444336, + "learning_rate": 4.660336252189143e-05, + "loss": 1.1052, + "num_input_tokens_seen": 122779808, + "step": 7630 + }, + { + "epoch": 0.5345381631598763, + "grad_norm": 3.764261245727539, + "learning_rate": 4.65963642732049e-05, + "loss": 1.1111, + "num_input_tokens_seen": 122796192, + "step": 7631 + }, + { + "epoch": 0.5346082114056057, + "grad_norm": 3.8028204441070557, + "learning_rate": 4.658936602451839e-05, + "loss": 0.9435, + "num_input_tokens_seen": 122812576, + "step": 7632 + }, + { + "epoch": 0.5346782596513349, + "grad_norm": 4.397709846496582, + "learning_rate": 4.658236777583188e-05, + "loss": 0.9155, + "num_input_tokens_seen": 122828048, + "step": 7633 + }, + { + "epoch": 0.5347483078970641, + "grad_norm": 4.138652324676514, + "learning_rate": 4.657536952714536e-05, + "loss": 0.9613, + "num_input_tokens_seen": 122842976, + "step": 7634 + }, + { + "epoch": 0.5348183561427934, + "grad_norm": 3.9423744678497314, + "learning_rate": 4.656837127845885e-05, + "loss": 1.0464, + "num_input_tokens_seen": 122859360, + "step": 7635 + }, + { + "epoch": 0.5348884043885226, + "grad_norm": 4.257975101470947, + "learning_rate": 4.656137302977233e-05, + "loss": 1.1539, + "num_input_tokens_seen": 122874888, + "step": 7636 + }, + { + "epoch": 0.5349584526342518, + "grad_norm": 4.364237308502197, + "learning_rate": 4.655437478108582e-05, + "loss": 1.0326, + "num_input_tokens_seen": 122891272, + "step": 7637 + }, + { + "epoch": 0.5350285008799811, + "grad_norm": 4.238529205322266, + "learning_rate": 4.654737653239931e-05, + "loss": 1.2226, + "num_input_tokens_seen": 122906408, + "step": 7638 + }, + { + "epoch": 0.5350985491257103, + "grad_norm": 3.609213352203369, + "learning_rate": 4.654037828371278e-05, + "loss": 0.9925, + "num_input_tokens_seen": 122922792, + "step": 7639 + }, + { + "epoch": 0.5351685973714396, + "grad_norm": 3.5299439430236816, + "learning_rate": 4.653338003502628e-05, + "loss": 0.9586, + "num_input_tokens_seen": 122939120, + "step": 7640 + }, + { + "epoch": 0.5352386456171688, + "grad_norm": 3.879683494567871, + "learning_rate": 4.6526381786339754e-05, + "loss": 1.1882, + "num_input_tokens_seen": 122955504, + "step": 7641 + }, + { + "epoch": 0.535308693862898, + "grad_norm": 3.414780616760254, + "learning_rate": 4.651938353765324e-05, + "loss": 1.0047, + "num_input_tokens_seen": 122971848, + "step": 7642 + }, + { + "epoch": 0.5353787421086273, + "grad_norm": 3.307396173477173, + "learning_rate": 4.651238528896673e-05, + "loss": 0.8771, + "num_input_tokens_seen": 122987840, + "step": 7643 + }, + { + "epoch": 0.5354487903543566, + "grad_norm": 3.41166353225708, + "learning_rate": 4.650538704028021e-05, + "loss": 0.8481, + "num_input_tokens_seen": 123004224, + "step": 7644 + }, + { + "epoch": 0.5355188386000858, + "grad_norm": 4.273513317108154, + "learning_rate": 4.64983887915937e-05, + "loss": 1.2176, + "num_input_tokens_seen": 123020608, + "step": 7645 + }, + { + "epoch": 0.5355888868458151, + "grad_norm": 6.728183269500732, + "learning_rate": 4.649139054290718e-05, + "loss": 1.0303, + "num_input_tokens_seen": 123035928, + "step": 7646 + }, + { + "epoch": 0.5356589350915443, + "grad_norm": 3.6993179321289062, + "learning_rate": 4.648439229422067e-05, + "loss": 0.9471, + "num_input_tokens_seen": 123051784, + "step": 7647 + }, + { + "epoch": 0.5357289833372736, + "grad_norm": 4.173643112182617, + "learning_rate": 4.647739404553415e-05, + "loss": 1.0996, + "num_input_tokens_seen": 123068080, + "step": 7648 + }, + { + "epoch": 0.5357990315830028, + "grad_norm": 4.235645294189453, + "learning_rate": 4.6470395796847635e-05, + "loss": 1.0112, + "num_input_tokens_seen": 123084464, + "step": 7649 + }, + { + "epoch": 0.535869079828732, + "grad_norm": 4.8372344970703125, + "learning_rate": 4.646339754816113e-05, + "loss": 0.9532, + "num_input_tokens_seen": 123099480, + "step": 7650 + }, + { + "epoch": 0.5359391280744613, + "grad_norm": 3.768519878387451, + "learning_rate": 4.6456399299474606e-05, + "loss": 1.0942, + "num_input_tokens_seen": 123115592, + "step": 7651 + }, + { + "epoch": 0.5360091763201905, + "grad_norm": 5.204262733459473, + "learning_rate": 4.6449401050788094e-05, + "loss": 0.954, + "num_input_tokens_seen": 123131976, + "step": 7652 + }, + { + "epoch": 0.5360792245659198, + "grad_norm": 3.371913194656372, + "learning_rate": 4.6442402802101576e-05, + "loss": 0.8998, + "num_input_tokens_seen": 123148360, + "step": 7653 + }, + { + "epoch": 0.536149272811649, + "grad_norm": 6.089724540710449, + "learning_rate": 4.6435404553415065e-05, + "loss": 0.9717, + "num_input_tokens_seen": 123164744, + "step": 7654 + }, + { + "epoch": 0.5362193210573782, + "grad_norm": 4.112463474273682, + "learning_rate": 4.6428406304728553e-05, + "loss": 1.1899, + "num_input_tokens_seen": 123181128, + "step": 7655 + }, + { + "epoch": 0.5362893693031076, + "grad_norm": 3.9375081062316895, + "learning_rate": 4.6421408056042035e-05, + "loss": 1.0524, + "num_input_tokens_seen": 123197512, + "step": 7656 + }, + { + "epoch": 0.5363594175488368, + "grad_norm": 4.459086894989014, + "learning_rate": 4.6414409807355524e-05, + "loss": 1.206, + "num_input_tokens_seen": 123212816, + "step": 7657 + }, + { + "epoch": 0.536429465794566, + "grad_norm": 4.026162147521973, + "learning_rate": 4.6407411558669e-05, + "loss": 0.8556, + "num_input_tokens_seen": 123229200, + "step": 7658 + }, + { + "epoch": 0.5364995140402953, + "grad_norm": 3.6091065406799316, + "learning_rate": 4.640041330998249e-05, + "loss": 1.0515, + "num_input_tokens_seen": 123245360, + "step": 7659 + }, + { + "epoch": 0.5365695622860245, + "grad_norm": 4.105917453765869, + "learning_rate": 4.639341506129598e-05, + "loss": 1.3174, + "num_input_tokens_seen": 123261072, + "step": 7660 + }, + { + "epoch": 0.5366396105317538, + "grad_norm": 4.645833492279053, + "learning_rate": 4.638641681260946e-05, + "loss": 1.0443, + "num_input_tokens_seen": 123277216, + "step": 7661 + }, + { + "epoch": 0.536709658777483, + "grad_norm": 4.012742519378662, + "learning_rate": 4.6379418563922946e-05, + "loss": 1.0339, + "num_input_tokens_seen": 123293336, + "step": 7662 + }, + { + "epoch": 0.5367797070232122, + "grad_norm": 3.9238698482513428, + "learning_rate": 4.637242031523643e-05, + "loss": 1.0772, + "num_input_tokens_seen": 123309720, + "step": 7663 + }, + { + "epoch": 0.5368497552689415, + "grad_norm": 3.3745267391204834, + "learning_rate": 4.636542206654992e-05, + "loss": 1.0389, + "num_input_tokens_seen": 123326104, + "step": 7664 + }, + { + "epoch": 0.5369198035146707, + "grad_norm": 4.080345630645752, + "learning_rate": 4.6358423817863405e-05, + "loss": 1.0941, + "num_input_tokens_seen": 123342280, + "step": 7665 + }, + { + "epoch": 0.5369898517603999, + "grad_norm": 4.973495006561279, + "learning_rate": 4.635142556917689e-05, + "loss": 0.9355, + "num_input_tokens_seen": 123358664, + "step": 7666 + }, + { + "epoch": 0.5370599000061292, + "grad_norm": 3.8065357208251953, + "learning_rate": 4.6344427320490376e-05, + "loss": 1.0071, + "num_input_tokens_seen": 123374584, + "step": 7667 + }, + { + "epoch": 0.5371299482518584, + "grad_norm": 4.49127721786499, + "learning_rate": 4.633742907180385e-05, + "loss": 1.0562, + "num_input_tokens_seen": 123390208, + "step": 7668 + }, + { + "epoch": 0.5371999964975878, + "grad_norm": 4.231927394866943, + "learning_rate": 4.633043082311734e-05, + "loss": 1.0988, + "num_input_tokens_seen": 123406592, + "step": 7669 + }, + { + "epoch": 0.537270044743317, + "grad_norm": 3.7635555267333984, + "learning_rate": 4.6323432574430835e-05, + "loss": 1.1342, + "num_input_tokens_seen": 123422976, + "step": 7670 + }, + { + "epoch": 0.5373400929890462, + "grad_norm": 3.9398446083068848, + "learning_rate": 4.631643432574431e-05, + "loss": 1.1557, + "num_input_tokens_seen": 123439184, + "step": 7671 + }, + { + "epoch": 0.5374101412347755, + "grad_norm": 3.7720675468444824, + "learning_rate": 4.63094360770578e-05, + "loss": 1.1092, + "num_input_tokens_seen": 123455568, + "step": 7672 + }, + { + "epoch": 0.5374801894805047, + "grad_norm": 5.23007869720459, + "learning_rate": 4.630243782837128e-05, + "loss": 1.0874, + "num_input_tokens_seen": 123471952, + "step": 7673 + }, + { + "epoch": 0.5375502377262339, + "grad_norm": 4.356583118438721, + "learning_rate": 4.629543957968477e-05, + "loss": 1.1307, + "num_input_tokens_seen": 123487528, + "step": 7674 + }, + { + "epoch": 0.5376202859719632, + "grad_norm": 3.71581768989563, + "learning_rate": 4.6288441330998244e-05, + "loss": 0.9518, + "num_input_tokens_seen": 123503216, + "step": 7675 + }, + { + "epoch": 0.5376903342176924, + "grad_norm": 3.9850363731384277, + "learning_rate": 4.628144308231174e-05, + "loss": 1.0477, + "num_input_tokens_seen": 123519600, + "step": 7676 + }, + { + "epoch": 0.5377603824634217, + "grad_norm": 3.9007675647735596, + "learning_rate": 4.627444483362523e-05, + "loss": 1.1008, + "num_input_tokens_seen": 123535952, + "step": 7677 + }, + { + "epoch": 0.5378304307091509, + "grad_norm": 4.420581340789795, + "learning_rate": 4.62674465849387e-05, + "loss": 1.1284, + "num_input_tokens_seen": 123552336, + "step": 7678 + }, + { + "epoch": 0.5379004789548801, + "grad_norm": 3.788006067276001, + "learning_rate": 4.626044833625219e-05, + "loss": 1.0635, + "num_input_tokens_seen": 123568232, + "step": 7679 + }, + { + "epoch": 0.5379705272006094, + "grad_norm": 3.8466997146606445, + "learning_rate": 4.625345008756567e-05, + "loss": 1.0154, + "num_input_tokens_seen": 123583680, + "step": 7680 + }, + { + "epoch": 0.5380405754463387, + "grad_norm": 4.214776515960693, + "learning_rate": 4.624645183887916e-05, + "loss": 0.9577, + "num_input_tokens_seen": 123600064, + "step": 7681 + }, + { + "epoch": 0.5381106236920679, + "grad_norm": 4.797380447387695, + "learning_rate": 4.623945359019265e-05, + "loss": 0.9281, + "num_input_tokens_seen": 123615880, + "step": 7682 + }, + { + "epoch": 0.5381806719377972, + "grad_norm": 3.3792150020599365, + "learning_rate": 4.623245534150613e-05, + "loss": 0.97, + "num_input_tokens_seen": 123632264, + "step": 7683 + }, + { + "epoch": 0.5382507201835264, + "grad_norm": 4.794241428375244, + "learning_rate": 4.622545709281962e-05, + "loss": 0.9202, + "num_input_tokens_seen": 123647104, + "step": 7684 + }, + { + "epoch": 0.5383207684292557, + "grad_norm": 3.86734676361084, + "learning_rate": 4.6218458844133096e-05, + "loss": 1.2309, + "num_input_tokens_seen": 123662976, + "step": 7685 + }, + { + "epoch": 0.5383908166749849, + "grad_norm": 4.570960998535156, + "learning_rate": 4.621146059544659e-05, + "loss": 1.1882, + "num_input_tokens_seen": 123678648, + "step": 7686 + }, + { + "epoch": 0.5384608649207141, + "grad_norm": 4.53627347946167, + "learning_rate": 4.620446234676008e-05, + "loss": 0.996, + "num_input_tokens_seen": 123695032, + "step": 7687 + }, + { + "epoch": 0.5385309131664434, + "grad_norm": 3.517305612564087, + "learning_rate": 4.6197464098073555e-05, + "loss": 0.9094, + "num_input_tokens_seen": 123711416, + "step": 7688 + }, + { + "epoch": 0.5386009614121726, + "grad_norm": 3.955936908721924, + "learning_rate": 4.6190465849387043e-05, + "loss": 1.0099, + "num_input_tokens_seen": 123727800, + "step": 7689 + }, + { + "epoch": 0.5386710096579019, + "grad_norm": 3.3592917919158936, + "learning_rate": 4.6183467600700525e-05, + "loss": 1.0007, + "num_input_tokens_seen": 123744184, + "step": 7690 + }, + { + "epoch": 0.5387410579036311, + "grad_norm": 3.4240005016326904, + "learning_rate": 4.6176469352014014e-05, + "loss": 0.8842, + "num_input_tokens_seen": 123760544, + "step": 7691 + }, + { + "epoch": 0.5388111061493603, + "grad_norm": 4.404487609863281, + "learning_rate": 4.61694711033275e-05, + "loss": 1.2803, + "num_input_tokens_seen": 123776928, + "step": 7692 + }, + { + "epoch": 0.5388811543950897, + "grad_norm": 3.729642868041992, + "learning_rate": 4.6162472854640984e-05, + "loss": 1.0132, + "num_input_tokens_seen": 123793312, + "step": 7693 + }, + { + "epoch": 0.5389512026408189, + "grad_norm": 4.076940536499023, + "learning_rate": 4.615547460595447e-05, + "loss": 0.9274, + "num_input_tokens_seen": 123809696, + "step": 7694 + }, + { + "epoch": 0.5390212508865481, + "grad_norm": 3.436220407485962, + "learning_rate": 4.614847635726795e-05, + "loss": 1.0113, + "num_input_tokens_seen": 123826080, + "step": 7695 + }, + { + "epoch": 0.5390912991322774, + "grad_norm": 3.4559690952301025, + "learning_rate": 4.614147810858144e-05, + "loss": 1.0622, + "num_input_tokens_seen": 123841856, + "step": 7696 + }, + { + "epoch": 0.5391613473780066, + "grad_norm": 4.085431098937988, + "learning_rate": 4.613447985989493e-05, + "loss": 1.0832, + "num_input_tokens_seen": 123858240, + "step": 7697 + }, + { + "epoch": 0.5392313956237359, + "grad_norm": 4.373138427734375, + "learning_rate": 4.612748161120841e-05, + "loss": 1.0914, + "num_input_tokens_seen": 123874168, + "step": 7698 + }, + { + "epoch": 0.5393014438694651, + "grad_norm": 4.127585411071777, + "learning_rate": 4.6120483362521896e-05, + "loss": 1.1582, + "num_input_tokens_seen": 123890552, + "step": 7699 + }, + { + "epoch": 0.5393714921151943, + "grad_norm": 3.53139328956604, + "learning_rate": 4.611348511383538e-05, + "loss": 1.0876, + "num_input_tokens_seen": 123906936, + "step": 7700 + }, + { + "epoch": 0.5394415403609236, + "grad_norm": 3.8704042434692383, + "learning_rate": 4.6106486865148866e-05, + "loss": 0.9507, + "num_input_tokens_seen": 123922640, + "step": 7701 + }, + { + "epoch": 0.5395115886066528, + "grad_norm": 5.1702189445495605, + "learning_rate": 4.609948861646235e-05, + "loss": 1.1272, + "num_input_tokens_seen": 123938520, + "step": 7702 + }, + { + "epoch": 0.539581636852382, + "grad_norm": 4.382580280303955, + "learning_rate": 4.6092490367775836e-05, + "loss": 1.1366, + "num_input_tokens_seen": 123953728, + "step": 7703 + }, + { + "epoch": 0.5396516850981113, + "grad_norm": 4.136844635009766, + "learning_rate": 4.6085492119089325e-05, + "loss": 0.9004, + "num_input_tokens_seen": 123970112, + "step": 7704 + }, + { + "epoch": 0.5397217333438405, + "grad_norm": 3.9517366886138916, + "learning_rate": 4.60784938704028e-05, + "loss": 1.097, + "num_input_tokens_seen": 123986104, + "step": 7705 + }, + { + "epoch": 0.5397917815895699, + "grad_norm": 3.5815629959106445, + "learning_rate": 4.6071495621716295e-05, + "loss": 1.1862, + "num_input_tokens_seen": 124002488, + "step": 7706 + }, + { + "epoch": 0.5398618298352991, + "grad_norm": 3.8689863681793213, + "learning_rate": 4.606449737302977e-05, + "loss": 1.0321, + "num_input_tokens_seen": 124018184, + "step": 7707 + }, + { + "epoch": 0.5399318780810283, + "grad_norm": 3.911912679672241, + "learning_rate": 4.605749912434326e-05, + "loss": 1.2491, + "num_input_tokens_seen": 124034568, + "step": 7708 + }, + { + "epoch": 0.5400019263267576, + "grad_norm": 4.168681621551514, + "learning_rate": 4.605050087565675e-05, + "loss": 0.953, + "num_input_tokens_seen": 124050952, + "step": 7709 + }, + { + "epoch": 0.5400719745724868, + "grad_norm": 3.850926160812378, + "learning_rate": 4.604350262697023e-05, + "loss": 0.9641, + "num_input_tokens_seen": 124066688, + "step": 7710 + }, + { + "epoch": 0.540142022818216, + "grad_norm": 4.872866630554199, + "learning_rate": 4.603650437828372e-05, + "loss": 1.0421, + "num_input_tokens_seen": 124083072, + "step": 7711 + }, + { + "epoch": 0.5402120710639453, + "grad_norm": 3.4104743003845215, + "learning_rate": 4.60295061295972e-05, + "loss": 0.9064, + "num_input_tokens_seen": 124099144, + "step": 7712 + }, + { + "epoch": 0.5402821193096745, + "grad_norm": 4.460788249969482, + "learning_rate": 4.602250788091069e-05, + "loss": 1.258, + "num_input_tokens_seen": 124115528, + "step": 7713 + }, + { + "epoch": 0.5403521675554038, + "grad_norm": 4.264237880706787, + "learning_rate": 4.601550963222418e-05, + "loss": 0.9638, + "num_input_tokens_seen": 124131080, + "step": 7714 + }, + { + "epoch": 0.540422215801133, + "grad_norm": 3.436184883117676, + "learning_rate": 4.600851138353765e-05, + "loss": 1.0876, + "num_input_tokens_seen": 124147464, + "step": 7715 + }, + { + "epoch": 0.5404922640468622, + "grad_norm": 5.596844673156738, + "learning_rate": 4.600151313485115e-05, + "loss": 0.8719, + "num_input_tokens_seen": 124163848, + "step": 7716 + }, + { + "epoch": 0.5405623122925916, + "grad_norm": 3.6911396980285645, + "learning_rate": 4.599451488616462e-05, + "loss": 1.0812, + "num_input_tokens_seen": 124180000, + "step": 7717 + }, + { + "epoch": 0.5406323605383208, + "grad_norm": 5.705863952636719, + "learning_rate": 4.598751663747811e-05, + "loss": 1.0574, + "num_input_tokens_seen": 124196384, + "step": 7718 + }, + { + "epoch": 0.54070240878405, + "grad_norm": 3.699828863143921, + "learning_rate": 4.59805183887916e-05, + "loss": 1.0859, + "num_input_tokens_seen": 124212768, + "step": 7719 + }, + { + "epoch": 0.5407724570297793, + "grad_norm": 4.548577785491943, + "learning_rate": 4.597352014010508e-05, + "loss": 1.1576, + "num_input_tokens_seen": 124228520, + "step": 7720 + }, + { + "epoch": 0.5408425052755085, + "grad_norm": 3.9667632579803467, + "learning_rate": 4.596652189141857e-05, + "loss": 0.8426, + "num_input_tokens_seen": 124244312, + "step": 7721 + }, + { + "epoch": 0.5409125535212378, + "grad_norm": 5.471269130706787, + "learning_rate": 4.595952364273205e-05, + "loss": 1.154, + "num_input_tokens_seen": 124260448, + "step": 7722 + }, + { + "epoch": 0.540982601766967, + "grad_norm": 3.6126010417938232, + "learning_rate": 4.595252539404554e-05, + "loss": 0.9946, + "num_input_tokens_seen": 124276776, + "step": 7723 + }, + { + "epoch": 0.5410526500126962, + "grad_norm": 3.4256107807159424, + "learning_rate": 4.594552714535903e-05, + "loss": 0.8039, + "num_input_tokens_seen": 124293160, + "step": 7724 + }, + { + "epoch": 0.5411226982584255, + "grad_norm": 4.028780937194824, + "learning_rate": 4.5938528896672504e-05, + "loss": 0.986, + "num_input_tokens_seen": 124308648, + "step": 7725 + }, + { + "epoch": 0.5411927465041547, + "grad_norm": 4.520470142364502, + "learning_rate": 4.5931530647986e-05, + "loss": 1.2179, + "num_input_tokens_seen": 124324512, + "step": 7726 + }, + { + "epoch": 0.541262794749884, + "grad_norm": 6.498549938201904, + "learning_rate": 4.5924532399299474e-05, + "loss": 0.9941, + "num_input_tokens_seen": 124340888, + "step": 7727 + }, + { + "epoch": 0.5413328429956132, + "grad_norm": 6.183528900146484, + "learning_rate": 4.591753415061296e-05, + "loss": 0.9104, + "num_input_tokens_seen": 124355024, + "step": 7728 + }, + { + "epoch": 0.5414028912413424, + "grad_norm": 4.973779201507568, + "learning_rate": 4.5910535901926445e-05, + "loss": 1.0977, + "num_input_tokens_seen": 124370424, + "step": 7729 + }, + { + "epoch": 0.5414729394870718, + "grad_norm": 11.01496410369873, + "learning_rate": 4.5903537653239933e-05, + "loss": 1.2337, + "num_input_tokens_seen": 124386808, + "step": 7730 + }, + { + "epoch": 0.541542987732801, + "grad_norm": 5.617726802825928, + "learning_rate": 4.589653940455342e-05, + "loss": 1.1204, + "num_input_tokens_seen": 124403192, + "step": 7731 + }, + { + "epoch": 0.5416130359785302, + "grad_norm": 4.480281352996826, + "learning_rate": 4.5889541155866904e-05, + "loss": 1.1171, + "num_input_tokens_seen": 124419576, + "step": 7732 + }, + { + "epoch": 0.5416830842242595, + "grad_norm": 4.884644985198975, + "learning_rate": 4.588254290718039e-05, + "loss": 0.9479, + "num_input_tokens_seen": 124435816, + "step": 7733 + }, + { + "epoch": 0.5417531324699887, + "grad_norm": 3.540273666381836, + "learning_rate": 4.587554465849387e-05, + "loss": 1.1185, + "num_input_tokens_seen": 124452200, + "step": 7734 + }, + { + "epoch": 0.541823180715718, + "grad_norm": 3.7844882011413574, + "learning_rate": 4.5868546409807356e-05, + "loss": 0.8428, + "num_input_tokens_seen": 124468584, + "step": 7735 + }, + { + "epoch": 0.5418932289614472, + "grad_norm": 3.699333906173706, + "learning_rate": 4.586154816112085e-05, + "loss": 1.087, + "num_input_tokens_seen": 124484968, + "step": 7736 + }, + { + "epoch": 0.5419632772071764, + "grad_norm": 4.426324844360352, + "learning_rate": 4.5854549912434326e-05, + "loss": 1.0829, + "num_input_tokens_seen": 124501352, + "step": 7737 + }, + { + "epoch": 0.5420333254529057, + "grad_norm": 3.796420097351074, + "learning_rate": 4.5847551663747815e-05, + "loss": 0.8883, + "num_input_tokens_seen": 124517328, + "step": 7738 + }, + { + "epoch": 0.5421033736986349, + "grad_norm": 4.042966842651367, + "learning_rate": 4.58405534150613e-05, + "loss": 1.0737, + "num_input_tokens_seen": 124533408, + "step": 7739 + }, + { + "epoch": 0.5421734219443641, + "grad_norm": 4.4333977699279785, + "learning_rate": 4.5833555166374785e-05, + "loss": 1.0786, + "num_input_tokens_seen": 124549504, + "step": 7740 + }, + { + "epoch": 0.5422434701900934, + "grad_norm": 3.791276216506958, + "learning_rate": 4.5826556917688274e-05, + "loss": 0.9739, + "num_input_tokens_seen": 124564864, + "step": 7741 + }, + { + "epoch": 0.5423135184358227, + "grad_norm": 3.6679089069366455, + "learning_rate": 4.5819558669001756e-05, + "loss": 0.9717, + "num_input_tokens_seen": 124581248, + "step": 7742 + }, + { + "epoch": 0.542383566681552, + "grad_norm": 4.028548717498779, + "learning_rate": 4.5812560420315244e-05, + "loss": 1.2381, + "num_input_tokens_seen": 124597632, + "step": 7743 + }, + { + "epoch": 0.5424536149272812, + "grad_norm": 4.555594444274902, + "learning_rate": 4.580556217162872e-05, + "loss": 0.9839, + "num_input_tokens_seen": 124614016, + "step": 7744 + }, + { + "epoch": 0.5425236631730104, + "grad_norm": 4.0034589767456055, + "learning_rate": 4.579856392294221e-05, + "loss": 1.1126, + "num_input_tokens_seen": 124629696, + "step": 7745 + }, + { + "epoch": 0.5425937114187397, + "grad_norm": 5.23121452331543, + "learning_rate": 4.5791565674255703e-05, + "loss": 1.2308, + "num_input_tokens_seen": 124646080, + "step": 7746 + }, + { + "epoch": 0.5426637596644689, + "grad_norm": 3.759575605392456, + "learning_rate": 4.578456742556918e-05, + "loss": 0.9166, + "num_input_tokens_seen": 124662464, + "step": 7747 + }, + { + "epoch": 0.5427338079101981, + "grad_norm": 3.4041309356689453, + "learning_rate": 4.577756917688267e-05, + "loss": 0.9725, + "num_input_tokens_seen": 124678848, + "step": 7748 + }, + { + "epoch": 0.5428038561559274, + "grad_norm": 4.347851276397705, + "learning_rate": 4.577057092819615e-05, + "loss": 0.9738, + "num_input_tokens_seen": 124695232, + "step": 7749 + }, + { + "epoch": 0.5428739044016566, + "grad_norm": 3.457156181335449, + "learning_rate": 4.576357267950964e-05, + "loss": 0.9348, + "num_input_tokens_seen": 124711368, + "step": 7750 + }, + { + "epoch": 0.5429439526473859, + "grad_norm": 4.432048320770264, + "learning_rate": 4.5756574430823126e-05, + "loss": 1.1717, + "num_input_tokens_seen": 124727752, + "step": 7751 + }, + { + "epoch": 0.5430140008931151, + "grad_norm": 3.337639570236206, + "learning_rate": 4.574957618213661e-05, + "loss": 0.7409, + "num_input_tokens_seen": 124744136, + "step": 7752 + }, + { + "epoch": 0.5430840491388443, + "grad_norm": 5.203801155090332, + "learning_rate": 4.5742577933450096e-05, + "loss": 1.111, + "num_input_tokens_seen": 124760264, + "step": 7753 + }, + { + "epoch": 0.5431540973845737, + "grad_norm": 4.642807960510254, + "learning_rate": 4.573557968476357e-05, + "loss": 1.0197, + "num_input_tokens_seen": 124776456, + "step": 7754 + }, + { + "epoch": 0.5432241456303029, + "grad_norm": 4.211435317993164, + "learning_rate": 4.572858143607706e-05, + "loss": 1.1239, + "num_input_tokens_seen": 124792320, + "step": 7755 + }, + { + "epoch": 0.5432941938760322, + "grad_norm": 4.980574607849121, + "learning_rate": 4.572158318739054e-05, + "loss": 1.3394, + "num_input_tokens_seen": 124808704, + "step": 7756 + }, + { + "epoch": 0.5433642421217614, + "grad_norm": 3.556262493133545, + "learning_rate": 4.571458493870403e-05, + "loss": 1.047, + "num_input_tokens_seen": 124825088, + "step": 7757 + }, + { + "epoch": 0.5434342903674906, + "grad_norm": 4.556125164031982, + "learning_rate": 4.570758669001752e-05, + "loss": 0.9926, + "num_input_tokens_seen": 124841032, + "step": 7758 + }, + { + "epoch": 0.5435043386132199, + "grad_norm": 5.515524864196777, + "learning_rate": 4.5700588441331e-05, + "loss": 1.0712, + "num_input_tokens_seen": 124857416, + "step": 7759 + }, + { + "epoch": 0.5435743868589491, + "grad_norm": 3.912358283996582, + "learning_rate": 4.569359019264449e-05, + "loss": 0.9536, + "num_input_tokens_seen": 124873800, + "step": 7760 + }, + { + "epoch": 0.5436444351046783, + "grad_norm": 3.7982399463653564, + "learning_rate": 4.5686591943957965e-05, + "loss": 1.009, + "num_input_tokens_seen": 124890184, + "step": 7761 + }, + { + "epoch": 0.5437144833504076, + "grad_norm": 3.991724967956543, + "learning_rate": 4.567959369527146e-05, + "loss": 0.9333, + "num_input_tokens_seen": 124906568, + "step": 7762 + }, + { + "epoch": 0.5437845315961368, + "grad_norm": 5.313719272613525, + "learning_rate": 4.567259544658495e-05, + "loss": 1.0243, + "num_input_tokens_seen": 124922352, + "step": 7763 + }, + { + "epoch": 0.5438545798418661, + "grad_norm": 3.796652317047119, + "learning_rate": 4.5665597197898423e-05, + "loss": 0.905, + "num_input_tokens_seen": 124938472, + "step": 7764 + }, + { + "epoch": 0.5439246280875953, + "grad_norm": 4.874033451080322, + "learning_rate": 4.565859894921191e-05, + "loss": 1.0194, + "num_input_tokens_seen": 124954848, + "step": 7765 + }, + { + "epoch": 0.5439946763333245, + "grad_norm": 3.8010215759277344, + "learning_rate": 4.5651600700525394e-05, + "loss": 1.092, + "num_input_tokens_seen": 124970488, + "step": 7766 + }, + { + "epoch": 0.5440647245790539, + "grad_norm": 4.711667060852051, + "learning_rate": 4.564460245183888e-05, + "loss": 1.0575, + "num_input_tokens_seen": 124986656, + "step": 7767 + }, + { + "epoch": 0.5441347728247831, + "grad_norm": 5.9820356369018555, + "learning_rate": 4.563760420315237e-05, + "loss": 1.0015, + "num_input_tokens_seen": 125003040, + "step": 7768 + }, + { + "epoch": 0.5442048210705123, + "grad_norm": 4.217742443084717, + "learning_rate": 4.563060595446585e-05, + "loss": 0.907, + "num_input_tokens_seen": 125019008, + "step": 7769 + }, + { + "epoch": 0.5442748693162416, + "grad_norm": 3.3076283931732178, + "learning_rate": 4.562360770577934e-05, + "loss": 0.7289, + "num_input_tokens_seen": 125035272, + "step": 7770 + }, + { + "epoch": 0.5443449175619708, + "grad_norm": 3.409607172012329, + "learning_rate": 4.5616609457092817e-05, + "loss": 0.8898, + "num_input_tokens_seen": 125051656, + "step": 7771 + }, + { + "epoch": 0.5444149658077001, + "grad_norm": 5.260388374328613, + "learning_rate": 4.560961120840631e-05, + "loss": 0.9778, + "num_input_tokens_seen": 125068040, + "step": 7772 + }, + { + "epoch": 0.5444850140534293, + "grad_norm": 4.905508518218994, + "learning_rate": 4.56026129597198e-05, + "loss": 1.2631, + "num_input_tokens_seen": 125084192, + "step": 7773 + }, + { + "epoch": 0.5445550622991585, + "grad_norm": 4.701261043548584, + "learning_rate": 4.5595614711033276e-05, + "loss": 1.1305, + "num_input_tokens_seen": 125098976, + "step": 7774 + }, + { + "epoch": 0.5446251105448878, + "grad_norm": 4.822204113006592, + "learning_rate": 4.5588616462346764e-05, + "loss": 1.0407, + "num_input_tokens_seen": 125114624, + "step": 7775 + }, + { + "epoch": 0.544695158790617, + "grad_norm": 3.7025883197784424, + "learning_rate": 4.5581618213660246e-05, + "loss": 1.0469, + "num_input_tokens_seen": 125130560, + "step": 7776 + }, + { + "epoch": 0.5447652070363462, + "grad_norm": 4.998040676116943, + "learning_rate": 4.5574619964973735e-05, + "loss": 1.0568, + "num_input_tokens_seen": 125146944, + "step": 7777 + }, + { + "epoch": 0.5448352552820755, + "grad_norm": 3.457750082015991, + "learning_rate": 4.5567621716287216e-05, + "loss": 0.8956, + "num_input_tokens_seen": 125162520, + "step": 7778 + }, + { + "epoch": 0.5449053035278048, + "grad_norm": 3.417926788330078, + "learning_rate": 4.5560623467600705e-05, + "loss": 0.9547, + "num_input_tokens_seen": 125178656, + "step": 7779 + }, + { + "epoch": 0.5449753517735341, + "grad_norm": 4.075389385223389, + "learning_rate": 4.5553625218914194e-05, + "loss": 0.9655, + "num_input_tokens_seen": 125195040, + "step": 7780 + }, + { + "epoch": 0.5450454000192633, + "grad_norm": 4.12037992477417, + "learning_rate": 4.554662697022767e-05, + "loss": 1.3969, + "num_input_tokens_seen": 125211424, + "step": 7781 + }, + { + "epoch": 0.5451154482649925, + "grad_norm": 4.0456671714782715, + "learning_rate": 4.5539628721541164e-05, + "loss": 0.9851, + "num_input_tokens_seen": 125227752, + "step": 7782 + }, + { + "epoch": 0.5451854965107218, + "grad_norm": 4.972954273223877, + "learning_rate": 4.553263047285464e-05, + "loss": 1.0509, + "num_input_tokens_seen": 125242792, + "step": 7783 + }, + { + "epoch": 0.545255544756451, + "grad_norm": 3.667360544204712, + "learning_rate": 4.552563222416813e-05, + "loss": 1.194, + "num_input_tokens_seen": 125259176, + "step": 7784 + }, + { + "epoch": 0.5453255930021802, + "grad_norm": 3.54160737991333, + "learning_rate": 4.5518633975481616e-05, + "loss": 0.9758, + "num_input_tokens_seen": 125275560, + "step": 7785 + }, + { + "epoch": 0.5453956412479095, + "grad_norm": 3.7189040184020996, + "learning_rate": 4.55116357267951e-05, + "loss": 0.8118, + "num_input_tokens_seen": 125291944, + "step": 7786 + }, + { + "epoch": 0.5454656894936387, + "grad_norm": 3.435598611831665, + "learning_rate": 4.5504637478108587e-05, + "loss": 0.9343, + "num_input_tokens_seen": 125307528, + "step": 7787 + }, + { + "epoch": 0.545535737739368, + "grad_norm": 3.7623162269592285, + "learning_rate": 4.549763922942207e-05, + "loss": 1.2752, + "num_input_tokens_seen": 125323816, + "step": 7788 + }, + { + "epoch": 0.5456057859850972, + "grad_norm": 4.6416239738464355, + "learning_rate": 4.549064098073556e-05, + "loss": 1.0901, + "num_input_tokens_seen": 125340200, + "step": 7789 + }, + { + "epoch": 0.5456758342308264, + "grad_norm": 4.615113258361816, + "learning_rate": 4.5483642732049046e-05, + "loss": 0.977, + "num_input_tokens_seen": 125356584, + "step": 7790 + }, + { + "epoch": 0.5457458824765558, + "grad_norm": 3.8960089683532715, + "learning_rate": 4.547664448336252e-05, + "loss": 0.9974, + "num_input_tokens_seen": 125372968, + "step": 7791 + }, + { + "epoch": 0.545815930722285, + "grad_norm": 3.9642269611358643, + "learning_rate": 4.5469646234676016e-05, + "loss": 0.9801, + "num_input_tokens_seen": 125389352, + "step": 7792 + }, + { + "epoch": 0.5458859789680143, + "grad_norm": 5.444625377655029, + "learning_rate": 4.546264798598949e-05, + "loss": 1.1028, + "num_input_tokens_seen": 125404624, + "step": 7793 + }, + { + "epoch": 0.5459560272137435, + "grad_norm": 4.1974053382873535, + "learning_rate": 4.545564973730298e-05, + "loss": 0.9788, + "num_input_tokens_seen": 125420944, + "step": 7794 + }, + { + "epoch": 0.5460260754594727, + "grad_norm": 5.193080425262451, + "learning_rate": 4.544865148861647e-05, + "loss": 1.0504, + "num_input_tokens_seen": 125437328, + "step": 7795 + }, + { + "epoch": 0.546096123705202, + "grad_norm": 5.049325942993164, + "learning_rate": 4.544165323992995e-05, + "loss": 0.9977, + "num_input_tokens_seen": 125452656, + "step": 7796 + }, + { + "epoch": 0.5461661719509312, + "grad_norm": 4.1581549644470215, + "learning_rate": 4.543465499124344e-05, + "loss": 0.873, + "num_input_tokens_seen": 125469040, + "step": 7797 + }, + { + "epoch": 0.5462362201966604, + "grad_norm": 4.484875202178955, + "learning_rate": 4.542765674255692e-05, + "loss": 1.1117, + "num_input_tokens_seen": 125485168, + "step": 7798 + }, + { + "epoch": 0.5463062684423897, + "grad_norm": 4.153511047363281, + "learning_rate": 4.542065849387041e-05, + "loss": 1.2482, + "num_input_tokens_seen": 125501336, + "step": 7799 + }, + { + "epoch": 0.5463763166881189, + "grad_norm": 4.060786724090576, + "learning_rate": 4.54136602451839e-05, + "loss": 1.0232, + "num_input_tokens_seen": 125517688, + "step": 7800 + }, + { + "epoch": 0.5463763166881189, + "eval_loss": 1.121619462966919, + "eval_runtime": 0.2054, + "eval_samples_per_second": 4.868, + "eval_steps_per_second": 4.868, + "num_input_tokens_seen": 125517688, + "step": 7800 + }, + { + "epoch": 0.5464463649338482, + "grad_norm": 5.799534797668457, + "learning_rate": 4.540666199649737e-05, + "loss": 1.2131, + "num_input_tokens_seen": 125533408, + "step": 7801 + }, + { + "epoch": 0.5465164131795774, + "grad_norm": 3.7412962913513184, + "learning_rate": 4.539966374781087e-05, + "loss": 1.0203, + "num_input_tokens_seen": 125549792, + "step": 7802 + }, + { + "epoch": 0.5465864614253066, + "grad_norm": 3.978907346725464, + "learning_rate": 4.539266549912434e-05, + "loss": 1.0106, + "num_input_tokens_seen": 125565352, + "step": 7803 + }, + { + "epoch": 0.546656509671036, + "grad_norm": 4.388980865478516, + "learning_rate": 4.538566725043783e-05, + "loss": 1.0467, + "num_input_tokens_seen": 125580864, + "step": 7804 + }, + { + "epoch": 0.5467265579167652, + "grad_norm": 3.432842969894409, + "learning_rate": 4.5378669001751313e-05, + "loss": 1.0141, + "num_input_tokens_seen": 125597248, + "step": 7805 + }, + { + "epoch": 0.5467966061624944, + "grad_norm": 4.419676303863525, + "learning_rate": 4.53716707530648e-05, + "loss": 1.053, + "num_input_tokens_seen": 125613448, + "step": 7806 + }, + { + "epoch": 0.5468666544082237, + "grad_norm": 4.717494964599609, + "learning_rate": 4.536467250437829e-05, + "loss": 0.9455, + "num_input_tokens_seen": 125629728, + "step": 7807 + }, + { + "epoch": 0.5469367026539529, + "grad_norm": 3.8088278770446777, + "learning_rate": 4.535767425569177e-05, + "loss": 1.0265, + "num_input_tokens_seen": 125646112, + "step": 7808 + }, + { + "epoch": 0.5470067508996822, + "grad_norm": 4.486949443817139, + "learning_rate": 4.535067600700526e-05, + "loss": 1.0894, + "num_input_tokens_seen": 125662496, + "step": 7809 + }, + { + "epoch": 0.5470767991454114, + "grad_norm": 4.220696926116943, + "learning_rate": 4.5343677758318736e-05, + "loss": 1.1683, + "num_input_tokens_seen": 125678192, + "step": 7810 + }, + { + "epoch": 0.5471468473911406, + "grad_norm": 3.5514204502105713, + "learning_rate": 4.5336679509632225e-05, + "loss": 0.934, + "num_input_tokens_seen": 125693824, + "step": 7811 + }, + { + "epoch": 0.5472168956368699, + "grad_norm": 4.971661567687988, + "learning_rate": 4.532968126094572e-05, + "loss": 1.0883, + "num_input_tokens_seen": 125709184, + "step": 7812 + }, + { + "epoch": 0.5472869438825991, + "grad_norm": 4.215356349945068, + "learning_rate": 4.5322683012259195e-05, + "loss": 1.1258, + "num_input_tokens_seen": 125725568, + "step": 7813 + }, + { + "epoch": 0.5473569921283283, + "grad_norm": 3.7598018646240234, + "learning_rate": 4.5315684763572684e-05, + "loss": 0.9901, + "num_input_tokens_seen": 125741952, + "step": 7814 + }, + { + "epoch": 0.5474270403740576, + "grad_norm": 4.145439147949219, + "learning_rate": 4.5308686514886165e-05, + "loss": 1.1545, + "num_input_tokens_seen": 125757792, + "step": 7815 + }, + { + "epoch": 0.5474970886197869, + "grad_norm": 4.645499229431152, + "learning_rate": 4.5301688266199654e-05, + "loss": 0.964, + "num_input_tokens_seen": 125774176, + "step": 7816 + }, + { + "epoch": 0.5475671368655162, + "grad_norm": 3.526381015777588, + "learning_rate": 4.529469001751314e-05, + "loss": 1.098, + "num_input_tokens_seen": 125790264, + "step": 7817 + }, + { + "epoch": 0.5476371851112454, + "grad_norm": 4.389588356018066, + "learning_rate": 4.5287691768826624e-05, + "loss": 1.2777, + "num_input_tokens_seen": 125806648, + "step": 7818 + }, + { + "epoch": 0.5477072333569746, + "grad_norm": 3.5831832885742188, + "learning_rate": 4.528069352014011e-05, + "loss": 1.1427, + "num_input_tokens_seen": 125822848, + "step": 7819 + }, + { + "epoch": 0.5477772816027039, + "grad_norm": 5.075149059295654, + "learning_rate": 4.527369527145359e-05, + "loss": 0.962, + "num_input_tokens_seen": 125839232, + "step": 7820 + }, + { + "epoch": 0.5478473298484331, + "grad_norm": 3.7986133098602295, + "learning_rate": 4.526669702276708e-05, + "loss": 1.0552, + "num_input_tokens_seen": 125855616, + "step": 7821 + }, + { + "epoch": 0.5479173780941623, + "grad_norm": 3.6170661449432373, + "learning_rate": 4.525969877408057e-05, + "loss": 0.9207, + "num_input_tokens_seen": 125872000, + "step": 7822 + }, + { + "epoch": 0.5479874263398916, + "grad_norm": 3.752514123916626, + "learning_rate": 4.525270052539405e-05, + "loss": 0.9744, + "num_input_tokens_seen": 125888384, + "step": 7823 + }, + { + "epoch": 0.5480574745856208, + "grad_norm": 3.609358072280884, + "learning_rate": 4.5245702276707536e-05, + "loss": 1.0688, + "num_input_tokens_seen": 125904768, + "step": 7824 + }, + { + "epoch": 0.5481275228313501, + "grad_norm": 5.24570369720459, + "learning_rate": 4.523870402802102e-05, + "loss": 1.0225, + "num_input_tokens_seen": 125920592, + "step": 7825 + }, + { + "epoch": 0.5481975710770793, + "grad_norm": 5.084728240966797, + "learning_rate": 4.5231705779334506e-05, + "loss": 1.0211, + "num_input_tokens_seen": 125936976, + "step": 7826 + }, + { + "epoch": 0.5482676193228085, + "grad_norm": 4.076999664306641, + "learning_rate": 4.5224707530647995e-05, + "loss": 1.0143, + "num_input_tokens_seen": 125953048, + "step": 7827 + }, + { + "epoch": 0.5483376675685379, + "grad_norm": 3.8320982456207275, + "learning_rate": 4.5217709281961476e-05, + "loss": 0.9229, + "num_input_tokens_seen": 125969432, + "step": 7828 + }, + { + "epoch": 0.5484077158142671, + "grad_norm": 5.069493293762207, + "learning_rate": 4.5210711033274965e-05, + "loss": 0.8938, + "num_input_tokens_seen": 125985688, + "step": 7829 + }, + { + "epoch": 0.5484777640599964, + "grad_norm": 3.8555328845977783, + "learning_rate": 4.520371278458844e-05, + "loss": 1.0394, + "num_input_tokens_seen": 126002072, + "step": 7830 + }, + { + "epoch": 0.5485478123057256, + "grad_norm": 3.5486679077148438, + "learning_rate": 4.519671453590193e-05, + "loss": 1.0865, + "num_input_tokens_seen": 126018456, + "step": 7831 + }, + { + "epoch": 0.5486178605514548, + "grad_norm": 4.256968021392822, + "learning_rate": 4.518971628721541e-05, + "loss": 1.275, + "num_input_tokens_seen": 126034272, + "step": 7832 + }, + { + "epoch": 0.5486879087971841, + "grad_norm": 3.594381332397461, + "learning_rate": 4.51827180385289e-05, + "loss": 1.0079, + "num_input_tokens_seen": 126050656, + "step": 7833 + }, + { + "epoch": 0.5487579570429133, + "grad_norm": 3.4965176582336426, + "learning_rate": 4.517571978984239e-05, + "loss": 1.0182, + "num_input_tokens_seen": 126067040, + "step": 7834 + }, + { + "epoch": 0.5488280052886425, + "grad_norm": 3.762791395187378, + "learning_rate": 4.516872154115587e-05, + "loss": 1.048, + "num_input_tokens_seen": 126083384, + "step": 7835 + }, + { + "epoch": 0.5488980535343718, + "grad_norm": 4.816859245300293, + "learning_rate": 4.516172329246936e-05, + "loss": 1.1516, + "num_input_tokens_seen": 126099768, + "step": 7836 + }, + { + "epoch": 0.548968101780101, + "grad_norm": 4.410999774932861, + "learning_rate": 4.515472504378283e-05, + "loss": 1.1552, + "num_input_tokens_seen": 126114976, + "step": 7837 + }, + { + "epoch": 0.5490381500258303, + "grad_norm": 3.48974609375, + "learning_rate": 4.514772679509633e-05, + "loss": 0.8017, + "num_input_tokens_seen": 126131360, + "step": 7838 + }, + { + "epoch": 0.5491081982715595, + "grad_norm": 4.172264575958252, + "learning_rate": 4.514072854640982e-05, + "loss": 0.9508, + "num_input_tokens_seen": 126147080, + "step": 7839 + }, + { + "epoch": 0.5491782465172887, + "grad_norm": 4.311397075653076, + "learning_rate": 4.513373029772329e-05, + "loss": 1.1269, + "num_input_tokens_seen": 126163464, + "step": 7840 + }, + { + "epoch": 0.5492482947630181, + "grad_norm": 3.950122833251953, + "learning_rate": 4.512673204903678e-05, + "loss": 1.0341, + "num_input_tokens_seen": 126179848, + "step": 7841 + }, + { + "epoch": 0.5493183430087473, + "grad_norm": 3.5279722213745117, + "learning_rate": 4.511973380035026e-05, + "loss": 0.8949, + "num_input_tokens_seen": 126196232, + "step": 7842 + }, + { + "epoch": 0.5493883912544765, + "grad_norm": 3.958651065826416, + "learning_rate": 4.511273555166375e-05, + "loss": 1.2114, + "num_input_tokens_seen": 126212616, + "step": 7843 + }, + { + "epoch": 0.5494584395002058, + "grad_norm": 4.567907810211182, + "learning_rate": 4.510573730297724e-05, + "loss": 1.0151, + "num_input_tokens_seen": 126229000, + "step": 7844 + }, + { + "epoch": 0.549528487745935, + "grad_norm": 5.533442497253418, + "learning_rate": 4.509873905429072e-05, + "loss": 1.1584, + "num_input_tokens_seen": 126243808, + "step": 7845 + }, + { + "epoch": 0.5495985359916643, + "grad_norm": 3.7990267276763916, + "learning_rate": 4.509174080560421e-05, + "loss": 1.023, + "num_input_tokens_seen": 126260136, + "step": 7846 + }, + { + "epoch": 0.5496685842373935, + "grad_norm": 3.648163318634033, + "learning_rate": 4.5084742556917685e-05, + "loss": 1.1331, + "num_input_tokens_seen": 126276488, + "step": 7847 + }, + { + "epoch": 0.5497386324831227, + "grad_norm": 3.247767686843872, + "learning_rate": 4.507774430823118e-05, + "loss": 0.8663, + "num_input_tokens_seen": 126292872, + "step": 7848 + }, + { + "epoch": 0.549808680728852, + "grad_norm": 3.6284050941467285, + "learning_rate": 4.507074605954467e-05, + "loss": 1.2207, + "num_input_tokens_seen": 126309256, + "step": 7849 + }, + { + "epoch": 0.5498787289745812, + "grad_norm": 3.714810848236084, + "learning_rate": 4.5063747810858144e-05, + "loss": 0.8713, + "num_input_tokens_seen": 126324880, + "step": 7850 + }, + { + "epoch": 0.5499487772203104, + "grad_norm": 4.455381870269775, + "learning_rate": 4.505674956217163e-05, + "loss": 0.9736, + "num_input_tokens_seen": 126341264, + "step": 7851 + }, + { + "epoch": 0.5500188254660398, + "grad_norm": 6.539924144744873, + "learning_rate": 4.5049751313485115e-05, + "loss": 1.1598, + "num_input_tokens_seen": 126357648, + "step": 7852 + }, + { + "epoch": 0.550088873711769, + "grad_norm": 4.244390487670898, + "learning_rate": 4.50427530647986e-05, + "loss": 1.2593, + "num_input_tokens_seen": 126373888, + "step": 7853 + }, + { + "epoch": 0.5501589219574983, + "grad_norm": 4.415054798126221, + "learning_rate": 4.503575481611209e-05, + "loss": 1.2661, + "num_input_tokens_seen": 126389976, + "step": 7854 + }, + { + "epoch": 0.5502289702032275, + "grad_norm": 5.152358531951904, + "learning_rate": 4.5028756567425574e-05, + "loss": 1.0433, + "num_input_tokens_seen": 126406000, + "step": 7855 + }, + { + "epoch": 0.5502990184489567, + "grad_norm": 4.243201732635498, + "learning_rate": 4.502175831873906e-05, + "loss": 1.0326, + "num_input_tokens_seen": 126422384, + "step": 7856 + }, + { + "epoch": 0.550369066694686, + "grad_norm": 4.7400898933410645, + "learning_rate": 4.501476007005254e-05, + "loss": 1.1462, + "num_input_tokens_seen": 126438552, + "step": 7857 + }, + { + "epoch": 0.5504391149404152, + "grad_norm": 3.5386011600494385, + "learning_rate": 4.500776182136603e-05, + "loss": 0.9501, + "num_input_tokens_seen": 126454936, + "step": 7858 + }, + { + "epoch": 0.5505091631861445, + "grad_norm": 3.939976692199707, + "learning_rate": 4.500076357267951e-05, + "loss": 1.081, + "num_input_tokens_seen": 126471320, + "step": 7859 + }, + { + "epoch": 0.5505792114318737, + "grad_norm": 3.7520668506622314, + "learning_rate": 4.4993765323992996e-05, + "loss": 1.1897, + "num_input_tokens_seen": 126487144, + "step": 7860 + }, + { + "epoch": 0.5506492596776029, + "grad_norm": 3.8446950912475586, + "learning_rate": 4.4986767075306485e-05, + "loss": 1.1295, + "num_input_tokens_seen": 126503208, + "step": 7861 + }, + { + "epoch": 0.5507193079233322, + "grad_norm": 5.0048909187316895, + "learning_rate": 4.4979768826619967e-05, + "loss": 1.1618, + "num_input_tokens_seen": 126519592, + "step": 7862 + }, + { + "epoch": 0.5507893561690614, + "grad_norm": 3.4074764251708984, + "learning_rate": 4.4972770577933455e-05, + "loss": 0.9419, + "num_input_tokens_seen": 126535928, + "step": 7863 + }, + { + "epoch": 0.5508594044147906, + "grad_norm": 4.959311485290527, + "learning_rate": 4.496577232924694e-05, + "loss": 0.9927, + "num_input_tokens_seen": 126551768, + "step": 7864 + }, + { + "epoch": 0.55092945266052, + "grad_norm": 4.260848045349121, + "learning_rate": 4.4958774080560426e-05, + "loss": 1.1245, + "num_input_tokens_seen": 126566968, + "step": 7865 + }, + { + "epoch": 0.5509995009062492, + "grad_norm": 4.374674320220947, + "learning_rate": 4.4951775831873914e-05, + "loss": 1.1858, + "num_input_tokens_seen": 126582432, + "step": 7866 + }, + { + "epoch": 0.5510695491519785, + "grad_norm": 3.808882236480713, + "learning_rate": 4.494477758318739e-05, + "loss": 0.8445, + "num_input_tokens_seen": 126598272, + "step": 7867 + }, + { + "epoch": 0.5511395973977077, + "grad_norm": 4.428232192993164, + "learning_rate": 4.493777933450088e-05, + "loss": 1.0271, + "num_input_tokens_seen": 126614656, + "step": 7868 + }, + { + "epoch": 0.5512096456434369, + "grad_norm": 3.7343485355377197, + "learning_rate": 4.493078108581436e-05, + "loss": 0.9685, + "num_input_tokens_seen": 126630920, + "step": 7869 + }, + { + "epoch": 0.5512796938891662, + "grad_norm": 3.8342700004577637, + "learning_rate": 4.492378283712785e-05, + "loss": 1.145, + "num_input_tokens_seen": 126647304, + "step": 7870 + }, + { + "epoch": 0.5513497421348954, + "grad_norm": 3.969151496887207, + "learning_rate": 4.491678458844134e-05, + "loss": 1.1645, + "num_input_tokens_seen": 126663688, + "step": 7871 + }, + { + "epoch": 0.5514197903806246, + "grad_norm": 4.096588134765625, + "learning_rate": 4.490978633975482e-05, + "loss": 1.2369, + "num_input_tokens_seen": 126680072, + "step": 7872 + }, + { + "epoch": 0.5514898386263539, + "grad_norm": 3.6431949138641357, + "learning_rate": 4.490278809106831e-05, + "loss": 0.8899, + "num_input_tokens_seen": 126696456, + "step": 7873 + }, + { + "epoch": 0.5515598868720831, + "grad_norm": 3.6036217212677, + "learning_rate": 4.489578984238179e-05, + "loss": 0.9629, + "num_input_tokens_seen": 126712464, + "step": 7874 + }, + { + "epoch": 0.5516299351178124, + "grad_norm": 4.925845146179199, + "learning_rate": 4.488879159369528e-05, + "loss": 1.0951, + "num_input_tokens_seen": 126727664, + "step": 7875 + }, + { + "epoch": 0.5516999833635416, + "grad_norm": 3.822420120239258, + "learning_rate": 4.4881793345008766e-05, + "loss": 0.9803, + "num_input_tokens_seen": 126742280, + "step": 7876 + }, + { + "epoch": 0.5517700316092709, + "grad_norm": 4.6314005851745605, + "learning_rate": 4.487479509632224e-05, + "loss": 0.852, + "num_input_tokens_seen": 126758664, + "step": 7877 + }, + { + "epoch": 0.5518400798550002, + "grad_norm": 3.5483834743499756, + "learning_rate": 4.486779684763573e-05, + "loss": 0.9601, + "num_input_tokens_seen": 126774160, + "step": 7878 + }, + { + "epoch": 0.5519101281007294, + "grad_norm": 3.582298755645752, + "learning_rate": 4.486079859894921e-05, + "loss": 1.0404, + "num_input_tokens_seen": 126790344, + "step": 7879 + }, + { + "epoch": 0.5519801763464586, + "grad_norm": 6.529607772827148, + "learning_rate": 4.48538003502627e-05, + "loss": 0.8033, + "num_input_tokens_seen": 126806728, + "step": 7880 + }, + { + "epoch": 0.5520502245921879, + "grad_norm": 4.167466640472412, + "learning_rate": 4.484680210157619e-05, + "loss": 1.1219, + "num_input_tokens_seen": 126822792, + "step": 7881 + }, + { + "epoch": 0.5521202728379171, + "grad_norm": 4.332005500793457, + "learning_rate": 4.483980385288967e-05, + "loss": 1.093, + "num_input_tokens_seen": 126838952, + "step": 7882 + }, + { + "epoch": 0.5521903210836464, + "grad_norm": 7.588165283203125, + "learning_rate": 4.483280560420316e-05, + "loss": 1.075, + "num_input_tokens_seen": 126855336, + "step": 7883 + }, + { + "epoch": 0.5522603693293756, + "grad_norm": 5.086427688598633, + "learning_rate": 4.4825807355516634e-05, + "loss": 0.8955, + "num_input_tokens_seen": 126871448, + "step": 7884 + }, + { + "epoch": 0.5523304175751048, + "grad_norm": 5.5746355056762695, + "learning_rate": 4.481880910683013e-05, + "loss": 0.8979, + "num_input_tokens_seen": 126886424, + "step": 7885 + }, + { + "epoch": 0.5524004658208341, + "grad_norm": 3.833796262741089, + "learning_rate": 4.4811810858143605e-05, + "loss": 0.9682, + "num_input_tokens_seen": 126902808, + "step": 7886 + }, + { + "epoch": 0.5524705140665633, + "grad_norm": 3.436943769454956, + "learning_rate": 4.480481260945709e-05, + "loss": 0.8928, + "num_input_tokens_seen": 126918960, + "step": 7887 + }, + { + "epoch": 0.5525405623122925, + "grad_norm": 3.8391594886779785, + "learning_rate": 4.479781436077058e-05, + "loss": 1.0474, + "num_input_tokens_seen": 126934432, + "step": 7888 + }, + { + "epoch": 0.5526106105580219, + "grad_norm": 5.476257801055908, + "learning_rate": 4.4790816112084064e-05, + "loss": 1.0429, + "num_input_tokens_seen": 126949912, + "step": 7889 + }, + { + "epoch": 0.5526806588037511, + "grad_norm": 3.628587245941162, + "learning_rate": 4.478381786339755e-05, + "loss": 1.013, + "num_input_tokens_seen": 126966296, + "step": 7890 + }, + { + "epoch": 0.5527507070494804, + "grad_norm": 4.087871551513672, + "learning_rate": 4.4776819614711034e-05, + "loss": 1.0916, + "num_input_tokens_seen": 126982680, + "step": 7891 + }, + { + "epoch": 0.5528207552952096, + "grad_norm": 4.438427448272705, + "learning_rate": 4.476982136602452e-05, + "loss": 1.1005, + "num_input_tokens_seen": 126998072, + "step": 7892 + }, + { + "epoch": 0.5528908035409388, + "grad_norm": 3.5243074893951416, + "learning_rate": 4.476282311733801e-05, + "loss": 0.7855, + "num_input_tokens_seen": 127014104, + "step": 7893 + }, + { + "epoch": 0.5529608517866681, + "grad_norm": 10.454376220703125, + "learning_rate": 4.4755824868651486e-05, + "loss": 0.9935, + "num_input_tokens_seen": 127030488, + "step": 7894 + }, + { + "epoch": 0.5530309000323973, + "grad_norm": 3.5278947353363037, + "learning_rate": 4.474882661996498e-05, + "loss": 1.1274, + "num_input_tokens_seen": 127046536, + "step": 7895 + }, + { + "epoch": 0.5531009482781266, + "grad_norm": 3.8515357971191406, + "learning_rate": 4.474182837127846e-05, + "loss": 1.0061, + "num_input_tokens_seen": 127062200, + "step": 7896 + }, + { + "epoch": 0.5531709965238558, + "grad_norm": 4.094476699829102, + "learning_rate": 4.4734830122591945e-05, + "loss": 0.9606, + "num_input_tokens_seen": 127077656, + "step": 7897 + }, + { + "epoch": 0.553241044769585, + "grad_norm": 3.9371705055236816, + "learning_rate": 4.4727831873905434e-05, + "loss": 1.1099, + "num_input_tokens_seen": 127093464, + "step": 7898 + }, + { + "epoch": 0.5533110930153143, + "grad_norm": 5.384475231170654, + "learning_rate": 4.4720833625218916e-05, + "loss": 1.0207, + "num_input_tokens_seen": 127109848, + "step": 7899 + }, + { + "epoch": 0.5533811412610435, + "grad_norm": 4.609706878662109, + "learning_rate": 4.4713835376532404e-05, + "loss": 0.9562, + "num_input_tokens_seen": 127126232, + "step": 7900 + }, + { + "epoch": 0.5534511895067727, + "grad_norm": 5.60554838180542, + "learning_rate": 4.4706837127845886e-05, + "loss": 1.0052, + "num_input_tokens_seen": 127142616, + "step": 7901 + }, + { + "epoch": 0.5535212377525021, + "grad_norm": 4.736166477203369, + "learning_rate": 4.4699838879159375e-05, + "loss": 0.9599, + "num_input_tokens_seen": 127159000, + "step": 7902 + }, + { + "epoch": 0.5535912859982313, + "grad_norm": 3.9779133796691895, + "learning_rate": 4.469284063047286e-05, + "loss": 1.0068, + "num_input_tokens_seen": 127175384, + "step": 7903 + }, + { + "epoch": 0.5536613342439606, + "grad_norm": 4.673941612243652, + "learning_rate": 4.468584238178634e-05, + "loss": 1.2606, + "num_input_tokens_seen": 127191752, + "step": 7904 + }, + { + "epoch": 0.5537313824896898, + "grad_norm": 3.722839117050171, + "learning_rate": 4.4678844133099834e-05, + "loss": 1.0554, + "num_input_tokens_seen": 127207408, + "step": 7905 + }, + { + "epoch": 0.553801430735419, + "grad_norm": 4.31935453414917, + "learning_rate": 4.467184588441331e-05, + "loss": 1.0523, + "num_input_tokens_seen": 127223792, + "step": 7906 + }, + { + "epoch": 0.5538714789811483, + "grad_norm": 3.686140775680542, + "learning_rate": 4.46648476357268e-05, + "loss": 1.083, + "num_input_tokens_seen": 127240176, + "step": 7907 + }, + { + "epoch": 0.5539415272268775, + "grad_norm": 4.997079372406006, + "learning_rate": 4.4657849387040286e-05, + "loss": 1.2024, + "num_input_tokens_seen": 127255560, + "step": 7908 + }, + { + "epoch": 0.5540115754726067, + "grad_norm": 4.612823963165283, + "learning_rate": 4.465085113835377e-05, + "loss": 1.0067, + "num_input_tokens_seen": 127271336, + "step": 7909 + }, + { + "epoch": 0.554081623718336, + "grad_norm": 3.857698678970337, + "learning_rate": 4.4643852889667256e-05, + "loss": 1.2211, + "num_input_tokens_seen": 127287720, + "step": 7910 + }, + { + "epoch": 0.5541516719640652, + "grad_norm": 3.8385705947875977, + "learning_rate": 4.463685464098074e-05, + "loss": 1.2123, + "num_input_tokens_seen": 127304104, + "step": 7911 + }, + { + "epoch": 0.5542217202097945, + "grad_norm": 3.910621166229248, + "learning_rate": 4.462985639229423e-05, + "loss": 1.071, + "num_input_tokens_seen": 127319512, + "step": 7912 + }, + { + "epoch": 0.5542917684555237, + "grad_norm": 4.685849666595459, + "learning_rate": 4.46228581436077e-05, + "loss": 0.9889, + "num_input_tokens_seen": 127335736, + "step": 7913 + }, + { + "epoch": 0.554361816701253, + "grad_norm": 3.6313934326171875, + "learning_rate": 4.461585989492119e-05, + "loss": 0.9355, + "num_input_tokens_seen": 127351896, + "step": 7914 + }, + { + "epoch": 0.5544318649469823, + "grad_norm": 3.3593709468841553, + "learning_rate": 4.4608861646234686e-05, + "loss": 0.8789, + "num_input_tokens_seen": 127368032, + "step": 7915 + }, + { + "epoch": 0.5545019131927115, + "grad_norm": 3.924467086791992, + "learning_rate": 4.460186339754816e-05, + "loss": 1.0038, + "num_input_tokens_seen": 127384416, + "step": 7916 + }, + { + "epoch": 0.5545719614384407, + "grad_norm": 4.749776363372803, + "learning_rate": 4.459486514886165e-05, + "loss": 1.2994, + "num_input_tokens_seen": 127400800, + "step": 7917 + }, + { + "epoch": 0.55464200968417, + "grad_norm": 4.43412446975708, + "learning_rate": 4.458786690017513e-05, + "loss": 1.1877, + "num_input_tokens_seen": 127416184, + "step": 7918 + }, + { + "epoch": 0.5547120579298992, + "grad_norm": 3.626112699508667, + "learning_rate": 4.458086865148862e-05, + "loss": 0.9214, + "num_input_tokens_seen": 127432568, + "step": 7919 + }, + { + "epoch": 0.5547821061756285, + "grad_norm": 4.031342506408691, + "learning_rate": 4.457387040280211e-05, + "loss": 1.0058, + "num_input_tokens_seen": 127448656, + "step": 7920 + }, + { + "epoch": 0.5548521544213577, + "grad_norm": 4.554370403289795, + "learning_rate": 4.456687215411559e-05, + "loss": 1.101, + "num_input_tokens_seen": 127462584, + "step": 7921 + }, + { + "epoch": 0.5549222026670869, + "grad_norm": 4.46144437789917, + "learning_rate": 4.455987390542908e-05, + "loss": 1.1095, + "num_input_tokens_seen": 127478968, + "step": 7922 + }, + { + "epoch": 0.5549922509128162, + "grad_norm": 3.759772539138794, + "learning_rate": 4.4552875656742554e-05, + "loss": 1.1723, + "num_input_tokens_seen": 127494856, + "step": 7923 + }, + { + "epoch": 0.5550622991585454, + "grad_norm": 6.424376010894775, + "learning_rate": 4.454587740805604e-05, + "loss": 1.1691, + "num_input_tokens_seen": 127511240, + "step": 7924 + }, + { + "epoch": 0.5551323474042746, + "grad_norm": 4.5455708503723145, + "learning_rate": 4.453887915936954e-05, + "loss": 1.0578, + "num_input_tokens_seen": 127527624, + "step": 7925 + }, + { + "epoch": 0.555202395650004, + "grad_norm": 3.673654794692993, + "learning_rate": 4.453188091068301e-05, + "loss": 1.0097, + "num_input_tokens_seen": 127543200, + "step": 7926 + }, + { + "epoch": 0.5552724438957332, + "grad_norm": 5.912845134735107, + "learning_rate": 4.45248826619965e-05, + "loss": 1.1171, + "num_input_tokens_seen": 127558704, + "step": 7927 + }, + { + "epoch": 0.5553424921414625, + "grad_norm": 4.603636741638184, + "learning_rate": 4.451788441330998e-05, + "loss": 1.0536, + "num_input_tokens_seen": 127575088, + "step": 7928 + }, + { + "epoch": 0.5554125403871917, + "grad_norm": 4.067967414855957, + "learning_rate": 4.451088616462347e-05, + "loss": 1.3762, + "num_input_tokens_seen": 127590584, + "step": 7929 + }, + { + "epoch": 0.5554825886329209, + "grad_norm": 4.261424541473389, + "learning_rate": 4.450388791593696e-05, + "loss": 0.993, + "num_input_tokens_seen": 127606320, + "step": 7930 + }, + { + "epoch": 0.5555526368786502, + "grad_norm": 3.4832777976989746, + "learning_rate": 4.449688966725044e-05, + "loss": 1.0043, + "num_input_tokens_seen": 127622704, + "step": 7931 + }, + { + "epoch": 0.5556226851243794, + "grad_norm": 3.5916316509246826, + "learning_rate": 4.448989141856393e-05, + "loss": 1.014, + "num_input_tokens_seen": 127639088, + "step": 7932 + }, + { + "epoch": 0.5556927333701087, + "grad_norm": 4.151430130004883, + "learning_rate": 4.4482893169877406e-05, + "loss": 1.0182, + "num_input_tokens_seen": 127655240, + "step": 7933 + }, + { + "epoch": 0.5557627816158379, + "grad_norm": 4.024296283721924, + "learning_rate": 4.4475894921190894e-05, + "loss": 0.9699, + "num_input_tokens_seen": 127671624, + "step": 7934 + }, + { + "epoch": 0.5558328298615671, + "grad_norm": 3.977968454360962, + "learning_rate": 4.446889667250439e-05, + "loss": 1.1159, + "num_input_tokens_seen": 127688008, + "step": 7935 + }, + { + "epoch": 0.5559028781072964, + "grad_norm": 3.4853708744049072, + "learning_rate": 4.4461898423817865e-05, + "loss": 0.9936, + "num_input_tokens_seen": 127704352, + "step": 7936 + }, + { + "epoch": 0.5559729263530256, + "grad_norm": 3.8649024963378906, + "learning_rate": 4.445490017513135e-05, + "loss": 0.9168, + "num_input_tokens_seen": 127720488, + "step": 7937 + }, + { + "epoch": 0.5560429745987548, + "grad_norm": 3.8322534561157227, + "learning_rate": 4.4447901926444835e-05, + "loss": 1.0689, + "num_input_tokens_seen": 127736280, + "step": 7938 + }, + { + "epoch": 0.5561130228444842, + "grad_norm": 3.2827885150909424, + "learning_rate": 4.4440903677758324e-05, + "loss": 0.8871, + "num_input_tokens_seen": 127752664, + "step": 7939 + }, + { + "epoch": 0.5561830710902134, + "grad_norm": 3.632049560546875, + "learning_rate": 4.44339054290718e-05, + "loss": 0.9965, + "num_input_tokens_seen": 127769048, + "step": 7940 + }, + { + "epoch": 0.5562531193359427, + "grad_norm": 3.9161887168884277, + "learning_rate": 4.4426907180385294e-05, + "loss": 1.3189, + "num_input_tokens_seen": 127784576, + "step": 7941 + }, + { + "epoch": 0.5563231675816719, + "grad_norm": 3.6720736026763916, + "learning_rate": 4.441990893169878e-05, + "loss": 1.2914, + "num_input_tokens_seen": 127800960, + "step": 7942 + }, + { + "epoch": 0.5563932158274011, + "grad_norm": 4.027480602264404, + "learning_rate": 4.441291068301226e-05, + "loss": 1.0616, + "num_input_tokens_seen": 127816800, + "step": 7943 + }, + { + "epoch": 0.5564632640731304, + "grad_norm": 5.149120807647705, + "learning_rate": 4.4405912434325746e-05, + "loss": 0.9716, + "num_input_tokens_seen": 127832760, + "step": 7944 + }, + { + "epoch": 0.5565333123188596, + "grad_norm": 4.779052734375, + "learning_rate": 4.439891418563923e-05, + "loss": 1.0751, + "num_input_tokens_seen": 127848888, + "step": 7945 + }, + { + "epoch": 0.5566033605645888, + "grad_norm": 4.37583065032959, + "learning_rate": 4.439191593695272e-05, + "loss": 1.1268, + "num_input_tokens_seen": 127865216, + "step": 7946 + }, + { + "epoch": 0.5566734088103181, + "grad_norm": 4.8968963623046875, + "learning_rate": 4.4384917688266205e-05, + "loss": 1.139, + "num_input_tokens_seen": 127881600, + "step": 7947 + }, + { + "epoch": 0.5567434570560473, + "grad_norm": 3.862400770187378, + "learning_rate": 4.437791943957969e-05, + "loss": 1.0398, + "num_input_tokens_seen": 127897424, + "step": 7948 + }, + { + "epoch": 0.5568135053017766, + "grad_norm": 3.653930902481079, + "learning_rate": 4.4370921190893176e-05, + "loss": 1.0289, + "num_input_tokens_seen": 127913808, + "step": 7949 + }, + { + "epoch": 0.5568835535475058, + "grad_norm": 4.557390213012695, + "learning_rate": 4.436392294220665e-05, + "loss": 1.1048, + "num_input_tokens_seen": 127930192, + "step": 7950 + }, + { + "epoch": 0.556953601793235, + "grad_norm": 6.2530364990234375, + "learning_rate": 4.4356924693520146e-05, + "loss": 1.2491, + "num_input_tokens_seen": 127946576, + "step": 7951 + }, + { + "epoch": 0.5570236500389644, + "grad_norm": 3.9652791023254395, + "learning_rate": 4.4349926444833635e-05, + "loss": 0.9109, + "num_input_tokens_seen": 127962960, + "step": 7952 + }, + { + "epoch": 0.5570936982846936, + "grad_norm": 5.276636600494385, + "learning_rate": 4.434292819614711e-05, + "loss": 1.0179, + "num_input_tokens_seen": 127979016, + "step": 7953 + }, + { + "epoch": 0.5571637465304228, + "grad_norm": 3.445674180984497, + "learning_rate": 4.43359299474606e-05, + "loss": 1.033, + "num_input_tokens_seen": 127995400, + "step": 7954 + }, + { + "epoch": 0.5572337947761521, + "grad_norm": 4.23106575012207, + "learning_rate": 4.432893169877408e-05, + "loss": 1.1025, + "num_input_tokens_seen": 128011784, + "step": 7955 + }, + { + "epoch": 0.5573038430218813, + "grad_norm": 3.795984983444214, + "learning_rate": 4.432193345008757e-05, + "loss": 0.8922, + "num_input_tokens_seen": 128027504, + "step": 7956 + }, + { + "epoch": 0.5573738912676106, + "grad_norm": 4.286993980407715, + "learning_rate": 4.431493520140106e-05, + "loss": 1.0134, + "num_input_tokens_seen": 128043832, + "step": 7957 + }, + { + "epoch": 0.5574439395133398, + "grad_norm": 5.160306930541992, + "learning_rate": 4.430793695271454e-05, + "loss": 1.1087, + "num_input_tokens_seen": 128059152, + "step": 7958 + }, + { + "epoch": 0.557513987759069, + "grad_norm": 5.083519458770752, + "learning_rate": 4.430093870402803e-05, + "loss": 1.173, + "num_input_tokens_seen": 128074664, + "step": 7959 + }, + { + "epoch": 0.5575840360047983, + "grad_norm": 3.746807336807251, + "learning_rate": 4.42939404553415e-05, + "loss": 1.0455, + "num_input_tokens_seen": 128090984, + "step": 7960 + }, + { + "epoch": 0.5576540842505275, + "grad_norm": 5.527798175811768, + "learning_rate": 4.4286942206655e-05, + "loss": 0.9927, + "num_input_tokens_seen": 128107368, + "step": 7961 + }, + { + "epoch": 0.5577241324962567, + "grad_norm": 5.1027116775512695, + "learning_rate": 4.427994395796849e-05, + "loss": 1.116, + "num_input_tokens_seen": 128123752, + "step": 7962 + }, + { + "epoch": 0.5577941807419861, + "grad_norm": 3.7059028148651123, + "learning_rate": 4.427294570928196e-05, + "loss": 1.2798, + "num_input_tokens_seen": 128140136, + "step": 7963 + }, + { + "epoch": 0.5578642289877153, + "grad_norm": 3.936305522918701, + "learning_rate": 4.426594746059545e-05, + "loss": 0.88, + "num_input_tokens_seen": 128156520, + "step": 7964 + }, + { + "epoch": 0.5579342772334446, + "grad_norm": 4.8260369300842285, + "learning_rate": 4.425894921190893e-05, + "loss": 0.8951, + "num_input_tokens_seen": 128172904, + "step": 7965 + }, + { + "epoch": 0.5580043254791738, + "grad_norm": 4.908670425415039, + "learning_rate": 4.425195096322242e-05, + "loss": 0.9753, + "num_input_tokens_seen": 128188672, + "step": 7966 + }, + { + "epoch": 0.558074373724903, + "grad_norm": 4.453614711761475, + "learning_rate": 4.42449527145359e-05, + "loss": 1.1764, + "num_input_tokens_seen": 128205056, + "step": 7967 + }, + { + "epoch": 0.5581444219706323, + "grad_norm": 3.9060006141662598, + "learning_rate": 4.423795446584939e-05, + "loss": 1.1539, + "num_input_tokens_seen": 128221440, + "step": 7968 + }, + { + "epoch": 0.5582144702163615, + "grad_norm": 4.674314498901367, + "learning_rate": 4.423095621716288e-05, + "loss": 1.0709, + "num_input_tokens_seen": 128237824, + "step": 7969 + }, + { + "epoch": 0.5582845184620908, + "grad_norm": 5.411530017852783, + "learning_rate": 4.4223957968476355e-05, + "loss": 1.1191, + "num_input_tokens_seen": 128253488, + "step": 7970 + }, + { + "epoch": 0.55835456670782, + "grad_norm": 3.7647812366485596, + "learning_rate": 4.421695971978985e-05, + "loss": 1.0836, + "num_input_tokens_seen": 128269600, + "step": 7971 + }, + { + "epoch": 0.5584246149535492, + "grad_norm": 5.809626579284668, + "learning_rate": 4.4209961471103325e-05, + "loss": 1.2228, + "num_input_tokens_seen": 128285592, + "step": 7972 + }, + { + "epoch": 0.5584946631992785, + "grad_norm": 4.234646320343018, + "learning_rate": 4.4202963222416814e-05, + "loss": 1.2126, + "num_input_tokens_seen": 128301976, + "step": 7973 + }, + { + "epoch": 0.5585647114450077, + "grad_norm": 3.390713930130005, + "learning_rate": 4.41959649737303e-05, + "loss": 1.0575, + "num_input_tokens_seen": 128318360, + "step": 7974 + }, + { + "epoch": 0.558634759690737, + "grad_norm": 3.582789182662964, + "learning_rate": 4.4188966725043784e-05, + "loss": 1.0472, + "num_input_tokens_seen": 128334744, + "step": 7975 + }, + { + "epoch": 0.5587048079364663, + "grad_norm": 4.052979946136475, + "learning_rate": 4.418196847635727e-05, + "loss": 1.0512, + "num_input_tokens_seen": 128350920, + "step": 7976 + }, + { + "epoch": 0.5587748561821955, + "grad_norm": 4.101496696472168, + "learning_rate": 4.4174970227670755e-05, + "loss": 1.0432, + "num_input_tokens_seen": 128367144, + "step": 7977 + }, + { + "epoch": 0.5588449044279248, + "grad_norm": 4.708171367645264, + "learning_rate": 4.416797197898424e-05, + "loss": 0.9654, + "num_input_tokens_seen": 128381272, + "step": 7978 + }, + { + "epoch": 0.558914952673654, + "grad_norm": 4.02223539352417, + "learning_rate": 4.416097373029773e-05, + "loss": 1.1984, + "num_input_tokens_seen": 128397160, + "step": 7979 + }, + { + "epoch": 0.5589850009193832, + "grad_norm": 4.594113349914551, + "learning_rate": 4.415397548161121e-05, + "loss": 1.1523, + "num_input_tokens_seen": 128413248, + "step": 7980 + }, + { + "epoch": 0.5590550491651125, + "grad_norm": 4.255110263824463, + "learning_rate": 4.41469772329247e-05, + "loss": 0.9418, + "num_input_tokens_seen": 128429632, + "step": 7981 + }, + { + "epoch": 0.5591250974108417, + "grad_norm": 4.862103462219238, + "learning_rate": 4.413997898423818e-05, + "loss": 0.8901, + "num_input_tokens_seen": 128446016, + "step": 7982 + }, + { + "epoch": 0.5591951456565709, + "grad_norm": 4.902801036834717, + "learning_rate": 4.4132980735551666e-05, + "loss": 1.1788, + "num_input_tokens_seen": 128461344, + "step": 7983 + }, + { + "epoch": 0.5592651939023002, + "grad_norm": 3.762756824493408, + "learning_rate": 4.4125982486865154e-05, + "loss": 1.2135, + "num_input_tokens_seen": 128477080, + "step": 7984 + }, + { + "epoch": 0.5593352421480294, + "grad_norm": 4.081719398498535, + "learning_rate": 4.4118984238178636e-05, + "loss": 0.9034, + "num_input_tokens_seen": 128493112, + "step": 7985 + }, + { + "epoch": 0.5594052903937587, + "grad_norm": 4.764441013336182, + "learning_rate": 4.4111985989492125e-05, + "loss": 1.1384, + "num_input_tokens_seen": 128508816, + "step": 7986 + }, + { + "epoch": 0.559475338639488, + "grad_norm": 3.709524631500244, + "learning_rate": 4.410498774080561e-05, + "loss": 1.0331, + "num_input_tokens_seen": 128525200, + "step": 7987 + }, + { + "epoch": 0.5595453868852172, + "grad_norm": 3.648374557495117, + "learning_rate": 4.4097989492119095e-05, + "loss": 1.072, + "num_input_tokens_seen": 128541584, + "step": 7988 + }, + { + "epoch": 0.5596154351309465, + "grad_norm": 6.713209629058838, + "learning_rate": 4.4090991243432584e-05, + "loss": 1.0908, + "num_input_tokens_seen": 128557048, + "step": 7989 + }, + { + "epoch": 0.5596854833766757, + "grad_norm": 5.627123832702637, + "learning_rate": 4.408399299474606e-05, + "loss": 0.8656, + "num_input_tokens_seen": 128572232, + "step": 7990 + }, + { + "epoch": 0.5597555316224049, + "grad_norm": 3.928884267807007, + "learning_rate": 4.4076994746059554e-05, + "loss": 0.8768, + "num_input_tokens_seen": 128588616, + "step": 7991 + }, + { + "epoch": 0.5598255798681342, + "grad_norm": 4.144782543182373, + "learning_rate": 4.406999649737303e-05, + "loss": 1.2362, + "num_input_tokens_seen": 128604824, + "step": 7992 + }, + { + "epoch": 0.5598956281138634, + "grad_norm": 3.7674214839935303, + "learning_rate": 4.406299824868652e-05, + "loss": 1.0108, + "num_input_tokens_seen": 128621208, + "step": 7993 + }, + { + "epoch": 0.5599656763595927, + "grad_norm": 3.4347217082977295, + "learning_rate": 4.4056e-05, + "loss": 0.8531, + "num_input_tokens_seen": 128637592, + "step": 7994 + }, + { + "epoch": 0.5600357246053219, + "grad_norm": 4.309778690338135, + "learning_rate": 4.404900175131349e-05, + "loss": 1.1468, + "num_input_tokens_seen": 128653976, + "step": 7995 + }, + { + "epoch": 0.5601057728510511, + "grad_norm": 3.567622423171997, + "learning_rate": 4.404200350262698e-05, + "loss": 0.9106, + "num_input_tokens_seen": 128670360, + "step": 7996 + }, + { + "epoch": 0.5601758210967804, + "grad_norm": 5.184731483459473, + "learning_rate": 4.403500525394046e-05, + "loss": 0.8277, + "num_input_tokens_seen": 128684744, + "step": 7997 + }, + { + "epoch": 0.5602458693425096, + "grad_norm": 4.807730674743652, + "learning_rate": 4.402800700525395e-05, + "loss": 1.028, + "num_input_tokens_seen": 128701128, + "step": 7998 + }, + { + "epoch": 0.560315917588239, + "grad_norm": 6.368520736694336, + "learning_rate": 4.402100875656742e-05, + "loss": 1.3107, + "num_input_tokens_seen": 128717288, + "step": 7999 + }, + { + "epoch": 0.5603859658339682, + "grad_norm": 3.997955083847046, + "learning_rate": 4.401401050788091e-05, + "loss": 0.9302, + "num_input_tokens_seen": 128733328, + "step": 8000 + }, + { + "epoch": 0.5603859658339682, + "eval_loss": 1.1226634979248047, + "eval_runtime": 0.2085, + "eval_samples_per_second": 4.797, + "eval_steps_per_second": 4.797, + "num_input_tokens_seen": 128733328, + "step": 8000 + }, + { + "epoch": 0.5604560140796974, + "grad_norm": 7.1005635261535645, + "learning_rate": 4.4007012259194406e-05, + "loss": 1.0116, + "num_input_tokens_seen": 128749712, + "step": 8001 + }, + { + "epoch": 0.5605260623254267, + "grad_norm": 3.9482359886169434, + "learning_rate": 4.400001401050788e-05, + "loss": 0.8402, + "num_input_tokens_seen": 128765408, + "step": 8002 + }, + { + "epoch": 0.5605961105711559, + "grad_norm": 5.755213260650635, + "learning_rate": 4.399301576182137e-05, + "loss": 1.0476, + "num_input_tokens_seen": 128781256, + "step": 8003 + }, + { + "epoch": 0.5606661588168851, + "grad_norm": 4.391782760620117, + "learning_rate": 4.398601751313485e-05, + "loss": 0.9619, + "num_input_tokens_seen": 128796952, + "step": 8004 + }, + { + "epoch": 0.5607362070626144, + "grad_norm": 4.010289192199707, + "learning_rate": 4.397901926444834e-05, + "loss": 0.963, + "num_input_tokens_seen": 128813160, + "step": 8005 + }, + { + "epoch": 0.5608062553083436, + "grad_norm": 4.009171485900879, + "learning_rate": 4.397202101576183e-05, + "loss": 1.0096, + "num_input_tokens_seen": 128828712, + "step": 8006 + }, + { + "epoch": 0.5608763035540729, + "grad_norm": 3.7310292720794678, + "learning_rate": 4.396502276707531e-05, + "loss": 1.0376, + "num_input_tokens_seen": 128845096, + "step": 8007 + }, + { + "epoch": 0.5609463517998021, + "grad_norm": 4.350787162780762, + "learning_rate": 4.39580245183888e-05, + "loss": 1.1524, + "num_input_tokens_seen": 128861480, + "step": 8008 + }, + { + "epoch": 0.5610164000455313, + "grad_norm": 3.7701098918914795, + "learning_rate": 4.3951026269702274e-05, + "loss": 1.0272, + "num_input_tokens_seen": 128877160, + "step": 8009 + }, + { + "epoch": 0.5610864482912606, + "grad_norm": 3.878599166870117, + "learning_rate": 4.394402802101576e-05, + "loss": 0.9247, + "num_input_tokens_seen": 128893544, + "step": 8010 + }, + { + "epoch": 0.5611564965369898, + "grad_norm": 4.865396499633789, + "learning_rate": 4.393702977232926e-05, + "loss": 1.2614, + "num_input_tokens_seen": 128909496, + "step": 8011 + }, + { + "epoch": 0.561226544782719, + "grad_norm": 3.691734552383423, + "learning_rate": 4.393003152364273e-05, + "loss": 1.1862, + "num_input_tokens_seen": 128925440, + "step": 8012 + }, + { + "epoch": 0.5612965930284484, + "grad_norm": 3.890925884246826, + "learning_rate": 4.392303327495622e-05, + "loss": 0.8807, + "num_input_tokens_seen": 128941824, + "step": 8013 + }, + { + "epoch": 0.5613666412741776, + "grad_norm": 5.752060413360596, + "learning_rate": 4.3916035026269704e-05, + "loss": 1.1932, + "num_input_tokens_seen": 128955672, + "step": 8014 + }, + { + "epoch": 0.5614366895199069, + "grad_norm": 3.931739091873169, + "learning_rate": 4.390903677758319e-05, + "loss": 0.9507, + "num_input_tokens_seen": 128972056, + "step": 8015 + }, + { + "epoch": 0.5615067377656361, + "grad_norm": 5.384754657745361, + "learning_rate": 4.390203852889668e-05, + "loss": 1.0261, + "num_input_tokens_seen": 128988440, + "step": 8016 + }, + { + "epoch": 0.5615767860113653, + "grad_norm": 4.624659061431885, + "learning_rate": 4.389504028021016e-05, + "loss": 0.8866, + "num_input_tokens_seen": 129004528, + "step": 8017 + }, + { + "epoch": 0.5616468342570946, + "grad_norm": 3.9661507606506348, + "learning_rate": 4.388804203152365e-05, + "loss": 1.1405, + "num_input_tokens_seen": 129020912, + "step": 8018 + }, + { + "epoch": 0.5617168825028238, + "grad_norm": 3.5319230556488037, + "learning_rate": 4.3881043782837126e-05, + "loss": 1.0024, + "num_input_tokens_seen": 129037296, + "step": 8019 + }, + { + "epoch": 0.561786930748553, + "grad_norm": 4.236444473266602, + "learning_rate": 4.3874045534150615e-05, + "loss": 1.0596, + "num_input_tokens_seen": 129053680, + "step": 8020 + }, + { + "epoch": 0.5618569789942823, + "grad_norm": 3.69328236579895, + "learning_rate": 4.38670472854641e-05, + "loss": 1.0373, + "num_input_tokens_seen": 129070064, + "step": 8021 + }, + { + "epoch": 0.5619270272400115, + "grad_norm": 5.341061115264893, + "learning_rate": 4.3860049036777585e-05, + "loss": 0.9455, + "num_input_tokens_seen": 129086224, + "step": 8022 + }, + { + "epoch": 0.5619970754857408, + "grad_norm": 4.005245208740234, + "learning_rate": 4.3853050788091074e-05, + "loss": 0.9985, + "num_input_tokens_seen": 129102608, + "step": 8023 + }, + { + "epoch": 0.56206712373147, + "grad_norm": 4.377266883850098, + "learning_rate": 4.3846052539404556e-05, + "loss": 1.1311, + "num_input_tokens_seen": 129118992, + "step": 8024 + }, + { + "epoch": 0.5621371719771993, + "grad_norm": 3.7815823554992676, + "learning_rate": 4.3839054290718044e-05, + "loss": 1.192, + "num_input_tokens_seen": 129135376, + "step": 8025 + }, + { + "epoch": 0.5622072202229286, + "grad_norm": 4.8340744972229, + "learning_rate": 4.383205604203152e-05, + "loss": 1.0839, + "num_input_tokens_seen": 129151352, + "step": 8026 + }, + { + "epoch": 0.5622772684686578, + "grad_norm": 5.007735252380371, + "learning_rate": 4.3825057793345015e-05, + "loss": 1.094, + "num_input_tokens_seen": 129166992, + "step": 8027 + }, + { + "epoch": 0.562347316714387, + "grad_norm": 5.447760105133057, + "learning_rate": 4.38180595446585e-05, + "loss": 1.2057, + "num_input_tokens_seen": 129183376, + "step": 8028 + }, + { + "epoch": 0.5624173649601163, + "grad_norm": 4.995473861694336, + "learning_rate": 4.381106129597198e-05, + "loss": 0.9403, + "num_input_tokens_seen": 129198816, + "step": 8029 + }, + { + "epoch": 0.5624874132058455, + "grad_norm": 4.708920955657959, + "learning_rate": 4.380406304728547e-05, + "loss": 0.9984, + "num_input_tokens_seen": 129215200, + "step": 8030 + }, + { + "epoch": 0.5625574614515748, + "grad_norm": 3.8864386081695557, + "learning_rate": 4.379706479859895e-05, + "loss": 0.734, + "num_input_tokens_seen": 129231504, + "step": 8031 + }, + { + "epoch": 0.562627509697304, + "grad_norm": 3.467696189880371, + "learning_rate": 4.379006654991244e-05, + "loss": 0.9214, + "num_input_tokens_seen": 129247624, + "step": 8032 + }, + { + "epoch": 0.5626975579430332, + "grad_norm": 4.07413387298584, + "learning_rate": 4.3783068301225926e-05, + "loss": 1.0158, + "num_input_tokens_seen": 129264008, + "step": 8033 + }, + { + "epoch": 0.5627676061887625, + "grad_norm": 4.135556697845459, + "learning_rate": 4.377607005253941e-05, + "loss": 1.1483, + "num_input_tokens_seen": 129279136, + "step": 8034 + }, + { + "epoch": 0.5628376544344917, + "grad_norm": 4.153659820556641, + "learning_rate": 4.3769071803852896e-05, + "loss": 0.9118, + "num_input_tokens_seen": 129294192, + "step": 8035 + }, + { + "epoch": 0.5629077026802211, + "grad_norm": 3.6717982292175293, + "learning_rate": 4.376207355516637e-05, + "loss": 1.0144, + "num_input_tokens_seen": 129310312, + "step": 8036 + }, + { + "epoch": 0.5629777509259503, + "grad_norm": 4.013020038604736, + "learning_rate": 4.375507530647987e-05, + "loss": 1.1211, + "num_input_tokens_seen": 129326696, + "step": 8037 + }, + { + "epoch": 0.5630477991716795, + "grad_norm": 4.093492031097412, + "learning_rate": 4.3748077057793355e-05, + "loss": 1.0406, + "num_input_tokens_seen": 129343080, + "step": 8038 + }, + { + "epoch": 0.5631178474174088, + "grad_norm": 5.1094512939453125, + "learning_rate": 4.374107880910683e-05, + "loss": 1.1122, + "num_input_tokens_seen": 129359232, + "step": 8039 + }, + { + "epoch": 0.563187895663138, + "grad_norm": 4.3890275955200195, + "learning_rate": 4.373408056042032e-05, + "loss": 1.2691, + "num_input_tokens_seen": 129374992, + "step": 8040 + }, + { + "epoch": 0.5632579439088672, + "grad_norm": 3.4645395278930664, + "learning_rate": 4.37270823117338e-05, + "loss": 0.945, + "num_input_tokens_seen": 129390728, + "step": 8041 + }, + { + "epoch": 0.5633279921545965, + "grad_norm": 3.5011980533599854, + "learning_rate": 4.372008406304729e-05, + "loss": 0.9252, + "num_input_tokens_seen": 129406616, + "step": 8042 + }, + { + "epoch": 0.5633980404003257, + "grad_norm": 3.694739580154419, + "learning_rate": 4.371308581436078e-05, + "loss": 1.0331, + "num_input_tokens_seen": 129423000, + "step": 8043 + }, + { + "epoch": 0.563468088646055, + "grad_norm": 4.7144293785095215, + "learning_rate": 4.370608756567426e-05, + "loss": 0.9954, + "num_input_tokens_seen": 129438048, + "step": 8044 + }, + { + "epoch": 0.5635381368917842, + "grad_norm": 3.9834823608398438, + "learning_rate": 4.369908931698775e-05, + "loss": 1.1571, + "num_input_tokens_seen": 129454432, + "step": 8045 + }, + { + "epoch": 0.5636081851375134, + "grad_norm": 5.334307670593262, + "learning_rate": 4.3692091068301223e-05, + "loss": 1.1283, + "num_input_tokens_seen": 129470816, + "step": 8046 + }, + { + "epoch": 0.5636782333832427, + "grad_norm": 4.415291786193848, + "learning_rate": 4.368509281961472e-05, + "loss": 1.2554, + "num_input_tokens_seen": 129487200, + "step": 8047 + }, + { + "epoch": 0.563748281628972, + "grad_norm": 5.6152262687683105, + "learning_rate": 4.3678094570928194e-05, + "loss": 1.0074, + "num_input_tokens_seen": 129503584, + "step": 8048 + }, + { + "epoch": 0.5638183298747012, + "grad_norm": 5.171108245849609, + "learning_rate": 4.367109632224168e-05, + "loss": 0.9731, + "num_input_tokens_seen": 129519968, + "step": 8049 + }, + { + "epoch": 0.5638883781204305, + "grad_norm": 4.773746013641357, + "learning_rate": 4.366409807355517e-05, + "loss": 1.0941, + "num_input_tokens_seen": 129535432, + "step": 8050 + }, + { + "epoch": 0.5639584263661597, + "grad_norm": 4.129162311553955, + "learning_rate": 4.365709982486865e-05, + "loss": 0.9929, + "num_input_tokens_seen": 129550880, + "step": 8051 + }, + { + "epoch": 0.564028474611889, + "grad_norm": 4.851277828216553, + "learning_rate": 4.365010157618214e-05, + "loss": 0.9914, + "num_input_tokens_seen": 129565592, + "step": 8052 + }, + { + "epoch": 0.5640985228576182, + "grad_norm": 4.487455368041992, + "learning_rate": 4.364310332749562e-05, + "loss": 1.2058, + "num_input_tokens_seen": 129581976, + "step": 8053 + }, + { + "epoch": 0.5641685711033474, + "grad_norm": 4.226871013641357, + "learning_rate": 4.363610507880911e-05, + "loss": 1.1413, + "num_input_tokens_seen": 129598360, + "step": 8054 + }, + { + "epoch": 0.5642386193490767, + "grad_norm": 4.204067230224609, + "learning_rate": 4.36291068301226e-05, + "loss": 1.1353, + "num_input_tokens_seen": 129614232, + "step": 8055 + }, + { + "epoch": 0.5643086675948059, + "grad_norm": 3.792025566101074, + "learning_rate": 4.3622108581436075e-05, + "loss": 1.2165, + "num_input_tokens_seen": 129630616, + "step": 8056 + }, + { + "epoch": 0.5643787158405351, + "grad_norm": 3.966076135635376, + "learning_rate": 4.361511033274957e-05, + "loss": 1.1306, + "num_input_tokens_seen": 129647000, + "step": 8057 + }, + { + "epoch": 0.5644487640862644, + "grad_norm": 5.704814910888672, + "learning_rate": 4.3608112084063046e-05, + "loss": 1.0289, + "num_input_tokens_seen": 129663384, + "step": 8058 + }, + { + "epoch": 0.5645188123319936, + "grad_norm": 4.2781081199646, + "learning_rate": 4.3601113835376534e-05, + "loss": 1.173, + "num_input_tokens_seen": 129679768, + "step": 8059 + }, + { + "epoch": 0.564588860577723, + "grad_norm": 3.806344985961914, + "learning_rate": 4.359411558669002e-05, + "loss": 1.1536, + "num_input_tokens_seen": 129696152, + "step": 8060 + }, + { + "epoch": 0.5646589088234522, + "grad_norm": 4.551416873931885, + "learning_rate": 4.3587117338003505e-05, + "loss": 1.078, + "num_input_tokens_seen": 129711184, + "step": 8061 + }, + { + "epoch": 0.5647289570691814, + "grad_norm": 3.6475703716278076, + "learning_rate": 4.3580119089316993e-05, + "loss": 1.0125, + "num_input_tokens_seen": 129727568, + "step": 8062 + }, + { + "epoch": 0.5647990053149107, + "grad_norm": 4.113941192626953, + "learning_rate": 4.3573120840630475e-05, + "loss": 1.0104, + "num_input_tokens_seen": 129743952, + "step": 8063 + }, + { + "epoch": 0.5648690535606399, + "grad_norm": 3.9017693996429443, + "learning_rate": 4.3566122591943964e-05, + "loss": 0.9712, + "num_input_tokens_seen": 129760336, + "step": 8064 + }, + { + "epoch": 0.5649391018063691, + "grad_norm": 4.13060188293457, + "learning_rate": 4.355912434325745e-05, + "loss": 0.9723, + "num_input_tokens_seen": 129776144, + "step": 8065 + }, + { + "epoch": 0.5650091500520984, + "grad_norm": 4.518004417419434, + "learning_rate": 4.355212609457093e-05, + "loss": 0.9606, + "num_input_tokens_seen": 129792000, + "step": 8066 + }, + { + "epoch": 0.5650791982978276, + "grad_norm": 4.141806602478027, + "learning_rate": 4.354512784588442e-05, + "loss": 1.0458, + "num_input_tokens_seen": 129808384, + "step": 8067 + }, + { + "epoch": 0.5651492465435569, + "grad_norm": 4.177087783813477, + "learning_rate": 4.35381295971979e-05, + "loss": 1.1853, + "num_input_tokens_seen": 129824256, + "step": 8068 + }, + { + "epoch": 0.5652192947892861, + "grad_norm": 3.939929485321045, + "learning_rate": 4.3531131348511386e-05, + "loss": 0.995, + "num_input_tokens_seen": 129840640, + "step": 8069 + }, + { + "epoch": 0.5652893430350153, + "grad_norm": 4.335320472717285, + "learning_rate": 4.3524133099824875e-05, + "loss": 1.048, + "num_input_tokens_seen": 129856952, + "step": 8070 + }, + { + "epoch": 0.5653593912807446, + "grad_norm": 6.816842079162598, + "learning_rate": 4.351713485113836e-05, + "loss": 1.0009, + "num_input_tokens_seen": 129872552, + "step": 8071 + }, + { + "epoch": 0.5654294395264738, + "grad_norm": 4.088340759277344, + "learning_rate": 4.3510136602451845e-05, + "loss": 1.0941, + "num_input_tokens_seen": 129888936, + "step": 8072 + }, + { + "epoch": 0.5654994877722032, + "grad_norm": 3.65846586227417, + "learning_rate": 4.350313835376533e-05, + "loss": 1.1355, + "num_input_tokens_seen": 129905320, + "step": 8073 + }, + { + "epoch": 0.5655695360179324, + "grad_norm": 3.5550310611724854, + "learning_rate": 4.3496140105078816e-05, + "loss": 1.0121, + "num_input_tokens_seen": 129921704, + "step": 8074 + }, + { + "epoch": 0.5656395842636616, + "grad_norm": 4.011558532714844, + "learning_rate": 4.348914185639229e-05, + "loss": 1.0614, + "num_input_tokens_seen": 129937968, + "step": 8075 + }, + { + "epoch": 0.5657096325093909, + "grad_norm": 3.63883638381958, + "learning_rate": 4.348214360770578e-05, + "loss": 1.162, + "num_input_tokens_seen": 129954352, + "step": 8076 + }, + { + "epoch": 0.5657796807551201, + "grad_norm": 3.9487674236297607, + "learning_rate": 4.3475145359019275e-05, + "loss": 1.1023, + "num_input_tokens_seen": 129970528, + "step": 8077 + }, + { + "epoch": 0.5658497290008493, + "grad_norm": 4.0052266120910645, + "learning_rate": 4.346814711033275e-05, + "loss": 1.0462, + "num_input_tokens_seen": 129985568, + "step": 8078 + }, + { + "epoch": 0.5659197772465786, + "grad_norm": 4.147900581359863, + "learning_rate": 4.346114886164624e-05, + "loss": 1.0128, + "num_input_tokens_seen": 130000832, + "step": 8079 + }, + { + "epoch": 0.5659898254923078, + "grad_norm": 3.9152534008026123, + "learning_rate": 4.345415061295972e-05, + "loss": 1.1202, + "num_input_tokens_seen": 130017216, + "step": 8080 + }, + { + "epoch": 0.5660598737380371, + "grad_norm": 5.270138263702393, + "learning_rate": 4.344715236427321e-05, + "loss": 1.1368, + "num_input_tokens_seen": 130033600, + "step": 8081 + }, + { + "epoch": 0.5661299219837663, + "grad_norm": 3.7850892543792725, + "learning_rate": 4.34401541155867e-05, + "loss": 1.0064, + "num_input_tokens_seen": 130049576, + "step": 8082 + }, + { + "epoch": 0.5661999702294955, + "grad_norm": 3.8164186477661133, + "learning_rate": 4.343315586690018e-05, + "loss": 0.9948, + "num_input_tokens_seen": 130065960, + "step": 8083 + }, + { + "epoch": 0.5662700184752248, + "grad_norm": 4.439040660858154, + "learning_rate": 4.342615761821367e-05, + "loss": 0.8882, + "num_input_tokens_seen": 130082344, + "step": 8084 + }, + { + "epoch": 0.566340066720954, + "grad_norm": 4.9009599685668945, + "learning_rate": 4.341915936952714e-05, + "loss": 1.2654, + "num_input_tokens_seen": 130098432, + "step": 8085 + }, + { + "epoch": 0.5664101149666833, + "grad_norm": 3.7141706943511963, + "learning_rate": 4.341216112084063e-05, + "loss": 0.9878, + "num_input_tokens_seen": 130114816, + "step": 8086 + }, + { + "epoch": 0.5664801632124126, + "grad_norm": 3.661966323852539, + "learning_rate": 4.340516287215413e-05, + "loss": 0.9867, + "num_input_tokens_seen": 130131112, + "step": 8087 + }, + { + "epoch": 0.5665502114581418, + "grad_norm": 5.044811725616455, + "learning_rate": 4.33981646234676e-05, + "loss": 0.9719, + "num_input_tokens_seen": 130147496, + "step": 8088 + }, + { + "epoch": 0.5666202597038711, + "grad_norm": 5.159415245056152, + "learning_rate": 4.339116637478109e-05, + "loss": 1.1301, + "num_input_tokens_seen": 130163880, + "step": 8089 + }, + { + "epoch": 0.5666903079496003, + "grad_norm": 3.651123523712158, + "learning_rate": 4.338416812609457e-05, + "loss": 1.0788, + "num_input_tokens_seen": 130179888, + "step": 8090 + }, + { + "epoch": 0.5667603561953295, + "grad_norm": 4.067645072937012, + "learning_rate": 4.337716987740806e-05, + "loss": 1.0071, + "num_input_tokens_seen": 130196272, + "step": 8091 + }, + { + "epoch": 0.5668304044410588, + "grad_norm": 3.646764039993286, + "learning_rate": 4.337017162872155e-05, + "loss": 0.9172, + "num_input_tokens_seen": 130212656, + "step": 8092 + }, + { + "epoch": 0.566900452686788, + "grad_norm": 5.098438739776611, + "learning_rate": 4.336317338003503e-05, + "loss": 0.9031, + "num_input_tokens_seen": 130228608, + "step": 8093 + }, + { + "epoch": 0.5669705009325172, + "grad_norm": 3.536712408065796, + "learning_rate": 4.335617513134852e-05, + "loss": 0.9787, + "num_input_tokens_seen": 130244768, + "step": 8094 + }, + { + "epoch": 0.5670405491782465, + "grad_norm": 4.888411998748779, + "learning_rate": 4.3349176882661995e-05, + "loss": 1.1445, + "num_input_tokens_seen": 130260936, + "step": 8095 + }, + { + "epoch": 0.5671105974239757, + "grad_norm": 4.441121578216553, + "learning_rate": 4.3342178633975484e-05, + "loss": 1.1489, + "num_input_tokens_seen": 130277320, + "step": 8096 + }, + { + "epoch": 0.567180645669705, + "grad_norm": 4.9977545738220215, + "learning_rate": 4.333518038528898e-05, + "loss": 1.0849, + "num_input_tokens_seen": 130292984, + "step": 8097 + }, + { + "epoch": 0.5672506939154343, + "grad_norm": 3.602999448776245, + "learning_rate": 4.3328182136602454e-05, + "loss": 0.8972, + "num_input_tokens_seen": 130308376, + "step": 8098 + }, + { + "epoch": 0.5673207421611635, + "grad_norm": 4.618570804595947, + "learning_rate": 4.332118388791594e-05, + "loss": 1.1175, + "num_input_tokens_seen": 130324440, + "step": 8099 + }, + { + "epoch": 0.5673907904068928, + "grad_norm": 4.875861644744873, + "learning_rate": 4.3314185639229424e-05, + "loss": 1.036, + "num_input_tokens_seen": 130340160, + "step": 8100 + }, + { + "epoch": 0.567460838652622, + "grad_norm": 3.3949697017669678, + "learning_rate": 4.330718739054291e-05, + "loss": 0.9029, + "num_input_tokens_seen": 130355984, + "step": 8101 + }, + { + "epoch": 0.5675308868983513, + "grad_norm": 5.05001163482666, + "learning_rate": 4.330018914185639e-05, + "loss": 1.095, + "num_input_tokens_seen": 130372368, + "step": 8102 + }, + { + "epoch": 0.5676009351440805, + "grad_norm": 3.913780689239502, + "learning_rate": 4.329319089316988e-05, + "loss": 1.1253, + "num_input_tokens_seen": 130388752, + "step": 8103 + }, + { + "epoch": 0.5676709833898097, + "grad_norm": 3.778477668762207, + "learning_rate": 4.328619264448337e-05, + "loss": 1.0927, + "num_input_tokens_seen": 130405136, + "step": 8104 + }, + { + "epoch": 0.567741031635539, + "grad_norm": 4.065064430236816, + "learning_rate": 4.327919439579685e-05, + "loss": 1.1245, + "num_input_tokens_seen": 130421384, + "step": 8105 + }, + { + "epoch": 0.5678110798812682, + "grad_norm": 3.609297037124634, + "learning_rate": 4.3272196147110336e-05, + "loss": 1.0434, + "num_input_tokens_seen": 130437768, + "step": 8106 + }, + { + "epoch": 0.5678811281269974, + "grad_norm": 4.521945476531982, + "learning_rate": 4.326519789842382e-05, + "loss": 1.0285, + "num_input_tokens_seen": 130452640, + "step": 8107 + }, + { + "epoch": 0.5679511763727267, + "grad_norm": 4.647178649902344, + "learning_rate": 4.3258199649737306e-05, + "loss": 1.1492, + "num_input_tokens_seen": 130469024, + "step": 8108 + }, + { + "epoch": 0.5680212246184559, + "grad_norm": 3.594043254852295, + "learning_rate": 4.3251201401050795e-05, + "loss": 0.8569, + "num_input_tokens_seen": 130484648, + "step": 8109 + }, + { + "epoch": 0.5680912728641853, + "grad_norm": 4.731708526611328, + "learning_rate": 4.3244203152364276e-05, + "loss": 1.3175, + "num_input_tokens_seen": 130500024, + "step": 8110 + }, + { + "epoch": 0.5681613211099145, + "grad_norm": 4.488846302032471, + "learning_rate": 4.3237204903677765e-05, + "loss": 1.0714, + "num_input_tokens_seen": 130516408, + "step": 8111 + }, + { + "epoch": 0.5682313693556437, + "grad_norm": 4.828577995300293, + "learning_rate": 4.323020665499124e-05, + "loss": 1.1976, + "num_input_tokens_seen": 130532752, + "step": 8112 + }, + { + "epoch": 0.568301417601373, + "grad_norm": 5.491752624511719, + "learning_rate": 4.3223208406304735e-05, + "loss": 1.249, + "num_input_tokens_seen": 130549136, + "step": 8113 + }, + { + "epoch": 0.5683714658471022, + "grad_norm": 3.4913344383239746, + "learning_rate": 4.3216210157618224e-05, + "loss": 1.0322, + "num_input_tokens_seen": 130565448, + "step": 8114 + }, + { + "epoch": 0.5684415140928314, + "grad_norm": 3.519788980484009, + "learning_rate": 4.32092119089317e-05, + "loss": 0.8711, + "num_input_tokens_seen": 130581832, + "step": 8115 + }, + { + "epoch": 0.5685115623385607, + "grad_norm": 4.691020488739014, + "learning_rate": 4.320221366024519e-05, + "loss": 1.1926, + "num_input_tokens_seen": 130597608, + "step": 8116 + }, + { + "epoch": 0.5685816105842899, + "grad_norm": 4.807389736175537, + "learning_rate": 4.319521541155867e-05, + "loss": 1.0702, + "num_input_tokens_seen": 130613992, + "step": 8117 + }, + { + "epoch": 0.5686516588300192, + "grad_norm": 4.060730457305908, + "learning_rate": 4.318821716287216e-05, + "loss": 1.2357, + "num_input_tokens_seen": 130630376, + "step": 8118 + }, + { + "epoch": 0.5687217070757484, + "grad_norm": 4.58277702331543, + "learning_rate": 4.3181218914185647e-05, + "loss": 1.0893, + "num_input_tokens_seen": 130646112, + "step": 8119 + }, + { + "epoch": 0.5687917553214776, + "grad_norm": 4.290525913238525, + "learning_rate": 4.317422066549913e-05, + "loss": 1.177, + "num_input_tokens_seen": 130662496, + "step": 8120 + }, + { + "epoch": 0.568861803567207, + "grad_norm": 4.339941501617432, + "learning_rate": 4.316722241681262e-05, + "loss": 0.9364, + "num_input_tokens_seen": 130678880, + "step": 8121 + }, + { + "epoch": 0.5689318518129362, + "grad_norm": 7.363986492156982, + "learning_rate": 4.316022416812609e-05, + "loss": 1.0718, + "num_input_tokens_seen": 130693176, + "step": 8122 + }, + { + "epoch": 0.5690019000586654, + "grad_norm": 4.054739475250244, + "learning_rate": 4.315322591943959e-05, + "loss": 1.147, + "num_input_tokens_seen": 130709152, + "step": 8123 + }, + { + "epoch": 0.5690719483043947, + "grad_norm": 3.5982446670532227, + "learning_rate": 4.3146227670753076e-05, + "loss": 0.9578, + "num_input_tokens_seen": 130725424, + "step": 8124 + }, + { + "epoch": 0.5691419965501239, + "grad_norm": 4.437469005584717, + "learning_rate": 4.313922942206655e-05, + "loss": 1.0498, + "num_input_tokens_seen": 130740184, + "step": 8125 + }, + { + "epoch": 0.5692120447958532, + "grad_norm": 3.761885643005371, + "learning_rate": 4.313223117338004e-05, + "loss": 1.0214, + "num_input_tokens_seen": 130756568, + "step": 8126 + }, + { + "epoch": 0.5692820930415824, + "grad_norm": 4.0984416007995605, + "learning_rate": 4.312523292469352e-05, + "loss": 0.8935, + "num_input_tokens_seen": 130772248, + "step": 8127 + }, + { + "epoch": 0.5693521412873116, + "grad_norm": 3.3904314041137695, + "learning_rate": 4.311823467600701e-05, + "loss": 0.9048, + "num_input_tokens_seen": 130788632, + "step": 8128 + }, + { + "epoch": 0.5694221895330409, + "grad_norm": 4.1015214920043945, + "learning_rate": 4.311123642732049e-05, + "loss": 1.0574, + "num_input_tokens_seen": 130805016, + "step": 8129 + }, + { + "epoch": 0.5694922377787701, + "grad_norm": 4.696201801300049, + "learning_rate": 4.310423817863398e-05, + "loss": 1.0341, + "num_input_tokens_seen": 130821160, + "step": 8130 + }, + { + "epoch": 0.5695622860244993, + "grad_norm": 4.361652374267578, + "learning_rate": 4.309723992994747e-05, + "loss": 0.9995, + "num_input_tokens_seen": 130835640, + "step": 8131 + }, + { + "epoch": 0.5696323342702286, + "grad_norm": 3.435509204864502, + "learning_rate": 4.3090241681260944e-05, + "loss": 0.8642, + "num_input_tokens_seen": 130851176, + "step": 8132 + }, + { + "epoch": 0.5697023825159578, + "grad_norm": 5.87947416305542, + "learning_rate": 4.308324343257444e-05, + "loss": 1.0376, + "num_input_tokens_seen": 130866568, + "step": 8133 + }, + { + "epoch": 0.5697724307616872, + "grad_norm": 3.353489398956299, + "learning_rate": 4.3076245183887914e-05, + "loss": 0.8508, + "num_input_tokens_seen": 130882536, + "step": 8134 + }, + { + "epoch": 0.5698424790074164, + "grad_norm": 3.4409422874450684, + "learning_rate": 4.30692469352014e-05, + "loss": 0.86, + "num_input_tokens_seen": 130898848, + "step": 8135 + }, + { + "epoch": 0.5699125272531456, + "grad_norm": 4.050583362579346, + "learning_rate": 4.306224868651489e-05, + "loss": 1.0886, + "num_input_tokens_seen": 130915224, + "step": 8136 + }, + { + "epoch": 0.5699825754988749, + "grad_norm": 3.630647659301758, + "learning_rate": 4.3055250437828373e-05, + "loss": 0.9504, + "num_input_tokens_seen": 130931608, + "step": 8137 + }, + { + "epoch": 0.5700526237446041, + "grad_norm": 3.3654730319976807, + "learning_rate": 4.304825218914186e-05, + "loss": 0.9361, + "num_input_tokens_seen": 130947992, + "step": 8138 + }, + { + "epoch": 0.5701226719903334, + "grad_norm": 4.898937702178955, + "learning_rate": 4.3041253940455344e-05, + "loss": 1.0703, + "num_input_tokens_seen": 130963560, + "step": 8139 + }, + { + "epoch": 0.5701927202360626, + "grad_norm": 3.9583377838134766, + "learning_rate": 4.303425569176883e-05, + "loss": 0.9444, + "num_input_tokens_seen": 130978024, + "step": 8140 + }, + { + "epoch": 0.5702627684817918, + "grad_norm": 3.5538089275360107, + "learning_rate": 4.302725744308232e-05, + "loss": 1.0337, + "num_input_tokens_seen": 130993672, + "step": 8141 + }, + { + "epoch": 0.5703328167275211, + "grad_norm": 3.731968879699707, + "learning_rate": 4.3020259194395796e-05, + "loss": 1.0219, + "num_input_tokens_seen": 131010056, + "step": 8142 + }, + { + "epoch": 0.5704028649732503, + "grad_norm": 4.178092002868652, + "learning_rate": 4.301326094570929e-05, + "loss": 1.0606, + "num_input_tokens_seen": 131026440, + "step": 8143 + }, + { + "epoch": 0.5704729132189795, + "grad_norm": 4.041304588317871, + "learning_rate": 4.3006262697022766e-05, + "loss": 1.0485, + "num_input_tokens_seen": 131041816, + "step": 8144 + }, + { + "epoch": 0.5705429614647088, + "grad_norm": 3.405008316040039, + "learning_rate": 4.2999264448336255e-05, + "loss": 0.9009, + "num_input_tokens_seen": 131058200, + "step": 8145 + }, + { + "epoch": 0.570613009710438, + "grad_norm": 4.503276348114014, + "learning_rate": 4.2992266199649744e-05, + "loss": 0.8806, + "num_input_tokens_seen": 131074584, + "step": 8146 + }, + { + "epoch": 0.5706830579561674, + "grad_norm": 3.5301196575164795, + "learning_rate": 4.2985267950963225e-05, + "loss": 0.918, + "num_input_tokens_seen": 131090968, + "step": 8147 + }, + { + "epoch": 0.5707531062018966, + "grad_norm": 3.807494878768921, + "learning_rate": 4.2978269702276714e-05, + "loss": 1.1168, + "num_input_tokens_seen": 131106840, + "step": 8148 + }, + { + "epoch": 0.5708231544476258, + "grad_norm": 3.9565365314483643, + "learning_rate": 4.2971271453590196e-05, + "loss": 1.01, + "num_input_tokens_seen": 131123224, + "step": 8149 + }, + { + "epoch": 0.5708932026933551, + "grad_norm": 3.4169907569885254, + "learning_rate": 4.2964273204903684e-05, + "loss": 0.9627, + "num_input_tokens_seen": 131139608, + "step": 8150 + }, + { + "epoch": 0.5709632509390843, + "grad_norm": 3.7201099395751953, + "learning_rate": 4.295727495621717e-05, + "loss": 1.0652, + "num_input_tokens_seen": 131155896, + "step": 8151 + }, + { + "epoch": 0.5710332991848135, + "grad_norm": 4.128871440887451, + "learning_rate": 4.295027670753065e-05, + "loss": 1.1233, + "num_input_tokens_seen": 131171824, + "step": 8152 + }, + { + "epoch": 0.5711033474305428, + "grad_norm": 4.046292304992676, + "learning_rate": 4.294327845884414e-05, + "loss": 1.0088, + "num_input_tokens_seen": 131188208, + "step": 8153 + }, + { + "epoch": 0.571173395676272, + "grad_norm": 4.006393909454346, + "learning_rate": 4.293628021015762e-05, + "loss": 1.0339, + "num_input_tokens_seen": 131204592, + "step": 8154 + }, + { + "epoch": 0.5712434439220013, + "grad_norm": 3.7372918128967285, + "learning_rate": 4.292928196147111e-05, + "loss": 0.8374, + "num_input_tokens_seen": 131220552, + "step": 8155 + }, + { + "epoch": 0.5713134921677305, + "grad_norm": 3.778796672821045, + "learning_rate": 4.292228371278459e-05, + "loss": 1.193, + "num_input_tokens_seen": 131236936, + "step": 8156 + }, + { + "epoch": 0.5713835404134597, + "grad_norm": 5.442931652069092, + "learning_rate": 4.291528546409808e-05, + "loss": 1.3328, + "num_input_tokens_seen": 131253320, + "step": 8157 + }, + { + "epoch": 0.571453588659189, + "grad_norm": 3.9510107040405273, + "learning_rate": 4.2908287215411566e-05, + "loss": 1.1776, + "num_input_tokens_seen": 131269704, + "step": 8158 + }, + { + "epoch": 0.5715236369049183, + "grad_norm": 3.787173271179199, + "learning_rate": 4.290128896672505e-05, + "loss": 0.9507, + "num_input_tokens_seen": 131286088, + "step": 8159 + }, + { + "epoch": 0.5715936851506475, + "grad_norm": 4.486179828643799, + "learning_rate": 4.2894290718038536e-05, + "loss": 0.8241, + "num_input_tokens_seen": 131302472, + "step": 8160 + }, + { + "epoch": 0.5716637333963768, + "grad_norm": 5.035008907318115, + "learning_rate": 4.288729246935201e-05, + "loss": 1.0102, + "num_input_tokens_seen": 131318520, + "step": 8161 + }, + { + "epoch": 0.571733781642106, + "grad_norm": 3.7599833011627197, + "learning_rate": 4.28802942206655e-05, + "loss": 0.8591, + "num_input_tokens_seen": 131334904, + "step": 8162 + }, + { + "epoch": 0.5718038298878353, + "grad_norm": 4.14766263961792, + "learning_rate": 4.287329597197899e-05, + "loss": 1.1655, + "num_input_tokens_seen": 131351288, + "step": 8163 + }, + { + "epoch": 0.5718738781335645, + "grad_norm": 3.914499044418335, + "learning_rate": 4.286629772329247e-05, + "loss": 1.0247, + "num_input_tokens_seen": 131367672, + "step": 8164 + }, + { + "epoch": 0.5719439263792937, + "grad_norm": 3.5725250244140625, + "learning_rate": 4.285929947460596e-05, + "loss": 1.0252, + "num_input_tokens_seen": 131383968, + "step": 8165 + }, + { + "epoch": 0.572013974625023, + "grad_norm": 3.5275251865386963, + "learning_rate": 4.285230122591944e-05, + "loss": 1.0562, + "num_input_tokens_seen": 131400352, + "step": 8166 + }, + { + "epoch": 0.5720840228707522, + "grad_norm": 5.734055042266846, + "learning_rate": 4.284530297723293e-05, + "loss": 0.8822, + "num_input_tokens_seen": 131416168, + "step": 8167 + }, + { + "epoch": 0.5721540711164814, + "grad_norm": 6.255549907684326, + "learning_rate": 4.283830472854642e-05, + "loss": 1.0778, + "num_input_tokens_seen": 131432072, + "step": 8168 + }, + { + "epoch": 0.5722241193622107, + "grad_norm": 5.589477062225342, + "learning_rate": 4.283130647985989e-05, + "loss": 0.9766, + "num_input_tokens_seen": 131448456, + "step": 8169 + }, + { + "epoch": 0.5722941676079399, + "grad_norm": 4.749863147735596, + "learning_rate": 4.282430823117339e-05, + "loss": 1.122, + "num_input_tokens_seen": 131464840, + "step": 8170 + }, + { + "epoch": 0.5723642158536693, + "grad_norm": 4.904588222503662, + "learning_rate": 4.2817309982486864e-05, + "loss": 1.1841, + "num_input_tokens_seen": 131481224, + "step": 8171 + }, + { + "epoch": 0.5724342640993985, + "grad_norm": 3.5946271419525146, + "learning_rate": 4.281031173380035e-05, + "loss": 0.8804, + "num_input_tokens_seen": 131497608, + "step": 8172 + }, + { + "epoch": 0.5725043123451277, + "grad_norm": 4.187933921813965, + "learning_rate": 4.280331348511384e-05, + "loss": 0.8924, + "num_input_tokens_seen": 131513776, + "step": 8173 + }, + { + "epoch": 0.572574360590857, + "grad_norm": 4.336525917053223, + "learning_rate": 4.279631523642732e-05, + "loss": 1.0626, + "num_input_tokens_seen": 131529544, + "step": 8174 + }, + { + "epoch": 0.5726444088365862, + "grad_norm": 4.156813621520996, + "learning_rate": 4.278931698774081e-05, + "loss": 0.9311, + "num_input_tokens_seen": 131544880, + "step": 8175 + }, + { + "epoch": 0.5727144570823155, + "grad_norm": 4.064977169036865, + "learning_rate": 4.278231873905429e-05, + "loss": 1.2461, + "num_input_tokens_seen": 131560600, + "step": 8176 + }, + { + "epoch": 0.5727845053280447, + "grad_norm": 3.8093082904815674, + "learning_rate": 4.277532049036778e-05, + "loss": 1.0079, + "num_input_tokens_seen": 131576984, + "step": 8177 + }, + { + "epoch": 0.5728545535737739, + "grad_norm": 3.8263261318206787, + "learning_rate": 4.276832224168127e-05, + "loss": 1.0441, + "num_input_tokens_seen": 131592848, + "step": 8178 + }, + { + "epoch": 0.5729246018195032, + "grad_norm": 4.268642425537109, + "learning_rate": 4.2761323992994745e-05, + "loss": 1.0139, + "num_input_tokens_seen": 131609232, + "step": 8179 + }, + { + "epoch": 0.5729946500652324, + "grad_norm": 3.6199145317077637, + "learning_rate": 4.275432574430824e-05, + "loss": 1.0993, + "num_input_tokens_seen": 131625160, + "step": 8180 + }, + { + "epoch": 0.5730646983109616, + "grad_norm": 5.106734275817871, + "learning_rate": 4.2747327495621716e-05, + "loss": 1.2588, + "num_input_tokens_seen": 131641240, + "step": 8181 + }, + { + "epoch": 0.5731347465566909, + "grad_norm": 4.9245381355285645, + "learning_rate": 4.2740329246935204e-05, + "loss": 0.9496, + "num_input_tokens_seen": 131657624, + "step": 8182 + }, + { + "epoch": 0.5732047948024201, + "grad_norm": 4.108707904815674, + "learning_rate": 4.2733330998248686e-05, + "loss": 1.0043, + "num_input_tokens_seen": 131673400, + "step": 8183 + }, + { + "epoch": 0.5732748430481495, + "grad_norm": 4.672266006469727, + "learning_rate": 4.2726332749562175e-05, + "loss": 1.1272, + "num_input_tokens_seen": 131689784, + "step": 8184 + }, + { + "epoch": 0.5733448912938787, + "grad_norm": 5.033571720123291, + "learning_rate": 4.271933450087566e-05, + "loss": 1.067, + "num_input_tokens_seen": 131706168, + "step": 8185 + }, + { + "epoch": 0.5734149395396079, + "grad_norm": 4.336825847625732, + "learning_rate": 4.2712336252189145e-05, + "loss": 0.9631, + "num_input_tokens_seen": 131722496, + "step": 8186 + }, + { + "epoch": 0.5734849877853372, + "grad_norm": 3.6326375007629395, + "learning_rate": 4.2705338003502634e-05, + "loss": 0.8525, + "num_input_tokens_seen": 131738880, + "step": 8187 + }, + { + "epoch": 0.5735550360310664, + "grad_norm": 3.407284736633301, + "learning_rate": 4.269833975481611e-05, + "loss": 1.099, + "num_input_tokens_seen": 131755080, + "step": 8188 + }, + { + "epoch": 0.5736250842767956, + "grad_norm": 5.604672431945801, + "learning_rate": 4.26913415061296e-05, + "loss": 1.0606, + "num_input_tokens_seen": 131771464, + "step": 8189 + }, + { + "epoch": 0.5736951325225249, + "grad_norm": 3.4375531673431396, + "learning_rate": 4.268434325744309e-05, + "loss": 0.9977, + "num_input_tokens_seen": 131787848, + "step": 8190 + }, + { + "epoch": 0.5737651807682541, + "grad_norm": 4.113184928894043, + "learning_rate": 4.267734500875657e-05, + "loss": 1.0283, + "num_input_tokens_seen": 131804232, + "step": 8191 + }, + { + "epoch": 0.5738352290139834, + "grad_norm": 5.918920516967773, + "learning_rate": 4.2670346760070056e-05, + "loss": 1.168, + "num_input_tokens_seen": 131819608, + "step": 8192 + }, + { + "epoch": 0.5739052772597126, + "grad_norm": 4.387043476104736, + "learning_rate": 4.266334851138354e-05, + "loss": 1.2334, + "num_input_tokens_seen": 131835992, + "step": 8193 + }, + { + "epoch": 0.5739753255054418, + "grad_norm": 4.381311416625977, + "learning_rate": 4.2656350262697027e-05, + "loss": 1.0675, + "num_input_tokens_seen": 131851920, + "step": 8194 + }, + { + "epoch": 0.5740453737511712, + "grad_norm": 6.177340507507324, + "learning_rate": 4.2649352014010515e-05, + "loss": 1.1517, + "num_input_tokens_seen": 131867240, + "step": 8195 + }, + { + "epoch": 0.5741154219969004, + "grad_norm": 4.505295276641846, + "learning_rate": 4.2642353765324e-05, + "loss": 1.1147, + "num_input_tokens_seen": 131883408, + "step": 8196 + }, + { + "epoch": 0.5741854702426296, + "grad_norm": 3.9150309562683105, + "learning_rate": 4.2635355516637486e-05, + "loss": 0.9609, + "num_input_tokens_seen": 131899144, + "step": 8197 + }, + { + "epoch": 0.5742555184883589, + "grad_norm": 4.148092269897461, + "learning_rate": 4.262835726795096e-05, + "loss": 1.0709, + "num_input_tokens_seen": 131915528, + "step": 8198 + }, + { + "epoch": 0.5743255667340881, + "grad_norm": 3.353017807006836, + "learning_rate": 4.262135901926445e-05, + "loss": 1.0293, + "num_input_tokens_seen": 131931912, + "step": 8199 + }, + { + "epoch": 0.5743956149798174, + "grad_norm": 3.949207305908203, + "learning_rate": 4.2614360770577945e-05, + "loss": 0.8957, + "num_input_tokens_seen": 131947712, + "step": 8200 + }, + { + "epoch": 0.5743956149798174, + "eval_loss": 1.1200522184371948, + "eval_runtime": 0.2045, + "eval_samples_per_second": 4.889, + "eval_steps_per_second": 4.889, + "num_input_tokens_seen": 131947712, + "step": 8200 + }, + { + "epoch": 0.5744656632255466, + "grad_norm": 3.978220224380493, + "learning_rate": 4.260736252189142e-05, + "loss": 0.9035, + "num_input_tokens_seen": 131963208, + "step": 8201 + }, + { + "epoch": 0.5745357114712758, + "grad_norm": 3.7986531257629395, + "learning_rate": 4.260036427320491e-05, + "loss": 0.8985, + "num_input_tokens_seen": 131978048, + "step": 8202 + }, + { + "epoch": 0.5746057597170051, + "grad_norm": 3.9384713172912598, + "learning_rate": 4.259336602451839e-05, + "loss": 1.1815, + "num_input_tokens_seen": 131994432, + "step": 8203 + }, + { + "epoch": 0.5746758079627343, + "grad_norm": 4.360903739929199, + "learning_rate": 4.258636777583188e-05, + "loss": 1.107, + "num_input_tokens_seen": 132010504, + "step": 8204 + }, + { + "epoch": 0.5747458562084636, + "grad_norm": 4.957514762878418, + "learning_rate": 4.2579369527145354e-05, + "loss": 1.1793, + "num_input_tokens_seen": 132026192, + "step": 8205 + }, + { + "epoch": 0.5748159044541928, + "grad_norm": 4.4821882247924805, + "learning_rate": 4.257237127845885e-05, + "loss": 1.1506, + "num_input_tokens_seen": 132041624, + "step": 8206 + }, + { + "epoch": 0.574885952699922, + "grad_norm": 3.915302276611328, + "learning_rate": 4.256537302977234e-05, + "loss": 1.0654, + "num_input_tokens_seen": 132056600, + "step": 8207 + }, + { + "epoch": 0.5749560009456514, + "grad_norm": 3.67526912689209, + "learning_rate": 4.255837478108581e-05, + "loss": 1.101, + "num_input_tokens_seen": 132072984, + "step": 8208 + }, + { + "epoch": 0.5750260491913806, + "grad_norm": 5.455066204071045, + "learning_rate": 4.25513765323993e-05, + "loss": 1.1665, + "num_input_tokens_seen": 132089368, + "step": 8209 + }, + { + "epoch": 0.5750960974371098, + "grad_norm": 3.7879862785339355, + "learning_rate": 4.254437828371278e-05, + "loss": 1.2125, + "num_input_tokens_seen": 132105752, + "step": 8210 + }, + { + "epoch": 0.5751661456828391, + "grad_norm": 4.033114910125732, + "learning_rate": 4.253738003502627e-05, + "loss": 1.1133, + "num_input_tokens_seen": 132122136, + "step": 8211 + }, + { + "epoch": 0.5752361939285683, + "grad_norm": 3.596369743347168, + "learning_rate": 4.253038178633976e-05, + "loss": 1.0748, + "num_input_tokens_seen": 132138520, + "step": 8212 + }, + { + "epoch": 0.5753062421742976, + "grad_norm": 4.42296838760376, + "learning_rate": 4.252338353765324e-05, + "loss": 1.0508, + "num_input_tokens_seen": 132154904, + "step": 8213 + }, + { + "epoch": 0.5753762904200268, + "grad_norm": 3.5020558834075928, + "learning_rate": 4.251638528896673e-05, + "loss": 0.9663, + "num_input_tokens_seen": 132171248, + "step": 8214 + }, + { + "epoch": 0.575446338665756, + "grad_norm": 4.176201343536377, + "learning_rate": 4.2509387040280206e-05, + "loss": 1.0606, + "num_input_tokens_seen": 132187296, + "step": 8215 + }, + { + "epoch": 0.5755163869114853, + "grad_norm": 3.459134817123413, + "learning_rate": 4.25023887915937e-05, + "loss": 0.87, + "num_input_tokens_seen": 132203680, + "step": 8216 + }, + { + "epoch": 0.5755864351572145, + "grad_norm": 3.870122194290161, + "learning_rate": 4.249539054290719e-05, + "loss": 1.005, + "num_input_tokens_seen": 132220064, + "step": 8217 + }, + { + "epoch": 0.5756564834029437, + "grad_norm": 4.246501922607422, + "learning_rate": 4.2488392294220665e-05, + "loss": 1.246, + "num_input_tokens_seen": 132236248, + "step": 8218 + }, + { + "epoch": 0.575726531648673, + "grad_norm": 4.028504371643066, + "learning_rate": 4.248139404553415e-05, + "loss": 1.1537, + "num_input_tokens_seen": 132252624, + "step": 8219 + }, + { + "epoch": 0.5757965798944022, + "grad_norm": 5.083249092102051, + "learning_rate": 4.2474395796847635e-05, + "loss": 1.1357, + "num_input_tokens_seen": 132269008, + "step": 8220 + }, + { + "epoch": 0.5758666281401316, + "grad_norm": 4.459508419036865, + "learning_rate": 4.2467397548161124e-05, + "loss": 0.9581, + "num_input_tokens_seen": 132285392, + "step": 8221 + }, + { + "epoch": 0.5759366763858608, + "grad_norm": 3.5667686462402344, + "learning_rate": 4.246039929947461e-05, + "loss": 0.9399, + "num_input_tokens_seen": 132301584, + "step": 8222 + }, + { + "epoch": 0.57600672463159, + "grad_norm": 3.982933759689331, + "learning_rate": 4.2453401050788094e-05, + "loss": 1.2309, + "num_input_tokens_seen": 132317968, + "step": 8223 + }, + { + "epoch": 0.5760767728773193, + "grad_norm": 3.59114408493042, + "learning_rate": 4.244640280210158e-05, + "loss": 0.9939, + "num_input_tokens_seen": 132334352, + "step": 8224 + }, + { + "epoch": 0.5761468211230485, + "grad_norm": 4.596956729888916, + "learning_rate": 4.243940455341506e-05, + "loss": 1.1058, + "num_input_tokens_seen": 132350736, + "step": 8225 + }, + { + "epoch": 0.5762168693687777, + "grad_norm": 4.798111915588379, + "learning_rate": 4.243240630472855e-05, + "loss": 1.1164, + "num_input_tokens_seen": 132367120, + "step": 8226 + }, + { + "epoch": 0.576286917614507, + "grad_norm": 3.9915149211883545, + "learning_rate": 4.242540805604204e-05, + "loss": 1.1464, + "num_input_tokens_seen": 132382712, + "step": 8227 + }, + { + "epoch": 0.5763569658602362, + "grad_norm": 3.8071064949035645, + "learning_rate": 4.241840980735552e-05, + "loss": 0.8643, + "num_input_tokens_seen": 132399096, + "step": 8228 + }, + { + "epoch": 0.5764270141059655, + "grad_norm": 3.679708242416382, + "learning_rate": 4.2411411558669005e-05, + "loss": 1.1226, + "num_input_tokens_seen": 132415288, + "step": 8229 + }, + { + "epoch": 0.5764970623516947, + "grad_norm": 3.723848581314087, + "learning_rate": 4.240441330998249e-05, + "loss": 1.0517, + "num_input_tokens_seen": 132431672, + "step": 8230 + }, + { + "epoch": 0.5765671105974239, + "grad_norm": 4.438333988189697, + "learning_rate": 4.2397415061295976e-05, + "loss": 1.1601, + "num_input_tokens_seen": 132447408, + "step": 8231 + }, + { + "epoch": 0.5766371588431533, + "grad_norm": 3.4421169757843018, + "learning_rate": 4.239041681260946e-05, + "loss": 1.017, + "num_input_tokens_seen": 132463600, + "step": 8232 + }, + { + "epoch": 0.5767072070888825, + "grad_norm": 3.851832389831543, + "learning_rate": 4.2383418563922946e-05, + "loss": 1.0237, + "num_input_tokens_seen": 132479984, + "step": 8233 + }, + { + "epoch": 0.5767772553346117, + "grad_norm": 5.055593967437744, + "learning_rate": 4.2376420315236435e-05, + "loss": 1.0512, + "num_input_tokens_seen": 132495872, + "step": 8234 + }, + { + "epoch": 0.576847303580341, + "grad_norm": 3.958918809890747, + "learning_rate": 4.236942206654991e-05, + "loss": 1.0412, + "num_input_tokens_seen": 132512256, + "step": 8235 + }, + { + "epoch": 0.5769173518260702, + "grad_norm": 3.6936240196228027, + "learning_rate": 4.2362423817863405e-05, + "loss": 1.0251, + "num_input_tokens_seen": 132528640, + "step": 8236 + }, + { + "epoch": 0.5769874000717995, + "grad_norm": 4.020928859710693, + "learning_rate": 4.235542556917688e-05, + "loss": 0.979, + "num_input_tokens_seen": 132543712, + "step": 8237 + }, + { + "epoch": 0.5770574483175287, + "grad_norm": 3.5804123878479004, + "learning_rate": 4.234842732049037e-05, + "loss": 1.0581, + "num_input_tokens_seen": 132559568, + "step": 8238 + }, + { + "epoch": 0.5771274965632579, + "grad_norm": 5.450964450836182, + "learning_rate": 4.234142907180386e-05, + "loss": 1.0109, + "num_input_tokens_seen": 132575952, + "step": 8239 + }, + { + "epoch": 0.5771975448089872, + "grad_norm": 4.571927547454834, + "learning_rate": 4.233443082311734e-05, + "loss": 1.0427, + "num_input_tokens_seen": 132591832, + "step": 8240 + }, + { + "epoch": 0.5772675930547164, + "grad_norm": 5.09351921081543, + "learning_rate": 4.232743257443083e-05, + "loss": 0.9558, + "num_input_tokens_seen": 132608216, + "step": 8241 + }, + { + "epoch": 0.5773376413004457, + "grad_norm": 3.211693525314331, + "learning_rate": 4.232043432574431e-05, + "loss": 0.9804, + "num_input_tokens_seen": 132624600, + "step": 8242 + }, + { + "epoch": 0.5774076895461749, + "grad_norm": 3.822723865509033, + "learning_rate": 4.23134360770578e-05, + "loss": 1.0267, + "num_input_tokens_seen": 132640984, + "step": 8243 + }, + { + "epoch": 0.5774777377919041, + "grad_norm": 5.541271686553955, + "learning_rate": 4.230643782837129e-05, + "loss": 0.7341, + "num_input_tokens_seen": 132657368, + "step": 8244 + }, + { + "epoch": 0.5775477860376335, + "grad_norm": 6.682171821594238, + "learning_rate": 4.229943957968476e-05, + "loss": 0.9682, + "num_input_tokens_seen": 132673752, + "step": 8245 + }, + { + "epoch": 0.5776178342833627, + "grad_norm": 3.4161407947540283, + "learning_rate": 4.229244133099826e-05, + "loss": 0.9307, + "num_input_tokens_seen": 132690136, + "step": 8246 + }, + { + "epoch": 0.5776878825290919, + "grad_norm": 3.6526873111724854, + "learning_rate": 4.228544308231173e-05, + "loss": 1.0356, + "num_input_tokens_seen": 132706520, + "step": 8247 + }, + { + "epoch": 0.5777579307748212, + "grad_norm": 4.506532192230225, + "learning_rate": 4.227844483362522e-05, + "loss": 0.9916, + "num_input_tokens_seen": 132721600, + "step": 8248 + }, + { + "epoch": 0.5778279790205504, + "grad_norm": 3.9028801918029785, + "learning_rate": 4.227144658493871e-05, + "loss": 1.1071, + "num_input_tokens_seen": 132737728, + "step": 8249 + }, + { + "epoch": 0.5778980272662797, + "grad_norm": 4.508323669433594, + "learning_rate": 4.226444833625219e-05, + "loss": 1.044, + "num_input_tokens_seen": 132753240, + "step": 8250 + }, + { + "epoch": 0.5779680755120089, + "grad_norm": 4.045495986938477, + "learning_rate": 4.225745008756568e-05, + "loss": 1.0133, + "num_input_tokens_seen": 132769024, + "step": 8251 + }, + { + "epoch": 0.5780381237577381, + "grad_norm": 3.3747479915618896, + "learning_rate": 4.225045183887916e-05, + "loss": 0.9641, + "num_input_tokens_seen": 132785408, + "step": 8252 + }, + { + "epoch": 0.5781081720034674, + "grad_norm": 3.527862787246704, + "learning_rate": 4.224345359019265e-05, + "loss": 0.9351, + "num_input_tokens_seen": 132801792, + "step": 8253 + }, + { + "epoch": 0.5781782202491966, + "grad_norm": 5.42443323135376, + "learning_rate": 4.223645534150614e-05, + "loss": 0.9197, + "num_input_tokens_seen": 132817536, + "step": 8254 + }, + { + "epoch": 0.5782482684949258, + "grad_norm": 3.450995922088623, + "learning_rate": 4.2229457092819614e-05, + "loss": 1.1086, + "num_input_tokens_seen": 132833920, + "step": 8255 + }, + { + "epoch": 0.5783183167406551, + "grad_norm": 3.824470043182373, + "learning_rate": 4.222245884413311e-05, + "loss": 1.0504, + "num_input_tokens_seen": 132850304, + "step": 8256 + }, + { + "epoch": 0.5783883649863844, + "grad_norm": 4.278646945953369, + "learning_rate": 4.2215460595446584e-05, + "loss": 0.766, + "num_input_tokens_seen": 132865216, + "step": 8257 + }, + { + "epoch": 0.5784584132321137, + "grad_norm": 3.582697629928589, + "learning_rate": 4.220846234676007e-05, + "loss": 0.9996, + "num_input_tokens_seen": 132881600, + "step": 8258 + }, + { + "epoch": 0.5785284614778429, + "grad_norm": 4.3087849617004395, + "learning_rate": 4.2201464098073555e-05, + "loss": 0.9265, + "num_input_tokens_seen": 132897360, + "step": 8259 + }, + { + "epoch": 0.5785985097235721, + "grad_norm": 3.9091784954071045, + "learning_rate": 4.219446584938704e-05, + "loss": 1.0683, + "num_input_tokens_seen": 132913504, + "step": 8260 + }, + { + "epoch": 0.5786685579693014, + "grad_norm": 3.899811267852783, + "learning_rate": 4.218746760070053e-05, + "loss": 1.0876, + "num_input_tokens_seen": 132929184, + "step": 8261 + }, + { + "epoch": 0.5787386062150306, + "grad_norm": 4.090141773223877, + "learning_rate": 4.2180469352014014e-05, + "loss": 1.1917, + "num_input_tokens_seen": 132945016, + "step": 8262 + }, + { + "epoch": 0.5788086544607598, + "grad_norm": 4.3014631271362305, + "learning_rate": 4.21734711033275e-05, + "loss": 0.8849, + "num_input_tokens_seen": 132961224, + "step": 8263 + }, + { + "epoch": 0.5788787027064891, + "grad_norm": 4.155791282653809, + "learning_rate": 4.216647285464098e-05, + "loss": 1.2749, + "num_input_tokens_seen": 132977464, + "step": 8264 + }, + { + "epoch": 0.5789487509522183, + "grad_norm": 3.6278793811798096, + "learning_rate": 4.2159474605954466e-05, + "loss": 0.9852, + "num_input_tokens_seen": 132993848, + "step": 8265 + }, + { + "epoch": 0.5790187991979476, + "grad_norm": 3.477964401245117, + "learning_rate": 4.215247635726796e-05, + "loss": 0.8369, + "num_input_tokens_seen": 133010232, + "step": 8266 + }, + { + "epoch": 0.5790888474436768, + "grad_norm": 5.3280439376831055, + "learning_rate": 4.2145478108581436e-05, + "loss": 0.9703, + "num_input_tokens_seen": 133026616, + "step": 8267 + }, + { + "epoch": 0.579158895689406, + "grad_norm": 3.5057740211486816, + "learning_rate": 4.2138479859894925e-05, + "loss": 0.9546, + "num_input_tokens_seen": 133043000, + "step": 8268 + }, + { + "epoch": 0.5792289439351354, + "grad_norm": 6.108116149902344, + "learning_rate": 4.2131481611208407e-05, + "loss": 0.9677, + "num_input_tokens_seen": 133058584, + "step": 8269 + }, + { + "epoch": 0.5792989921808646, + "grad_norm": 3.764878273010254, + "learning_rate": 4.2124483362521895e-05, + "loss": 0.9287, + "num_input_tokens_seen": 133074176, + "step": 8270 + }, + { + "epoch": 0.5793690404265938, + "grad_norm": 3.9870357513427734, + "learning_rate": 4.2117485113835384e-05, + "loss": 1.1149, + "num_input_tokens_seen": 133090320, + "step": 8271 + }, + { + "epoch": 0.5794390886723231, + "grad_norm": 3.3531510829925537, + "learning_rate": 4.2110486865148866e-05, + "loss": 0.7976, + "num_input_tokens_seen": 133106312, + "step": 8272 + }, + { + "epoch": 0.5795091369180523, + "grad_norm": 3.321676731109619, + "learning_rate": 4.2103488616462354e-05, + "loss": 0.8953, + "num_input_tokens_seen": 133122696, + "step": 8273 + }, + { + "epoch": 0.5795791851637816, + "grad_norm": 4.315324306488037, + "learning_rate": 4.209649036777583e-05, + "loss": 1.0502, + "num_input_tokens_seen": 133139080, + "step": 8274 + }, + { + "epoch": 0.5796492334095108, + "grad_norm": 3.9912524223327637, + "learning_rate": 4.208949211908932e-05, + "loss": 1.0919, + "num_input_tokens_seen": 133155192, + "step": 8275 + }, + { + "epoch": 0.57971928165524, + "grad_norm": 3.8014609813690186, + "learning_rate": 4.208249387040281e-05, + "loss": 1.0997, + "num_input_tokens_seen": 133171576, + "step": 8276 + }, + { + "epoch": 0.5797893299009693, + "grad_norm": 3.790175437927246, + "learning_rate": 4.207549562171629e-05, + "loss": 0.9689, + "num_input_tokens_seen": 133187448, + "step": 8277 + }, + { + "epoch": 0.5798593781466985, + "grad_norm": 3.7050399780273438, + "learning_rate": 4.206849737302978e-05, + "loss": 1.0661, + "num_input_tokens_seen": 133203832, + "step": 8278 + }, + { + "epoch": 0.5799294263924278, + "grad_norm": 3.9394073486328125, + "learning_rate": 4.206149912434326e-05, + "loss": 1.0586, + "num_input_tokens_seen": 133219176, + "step": 8279 + }, + { + "epoch": 0.579999474638157, + "grad_norm": 3.8162503242492676, + "learning_rate": 4.205450087565675e-05, + "loss": 1.0055, + "num_input_tokens_seen": 133235272, + "step": 8280 + }, + { + "epoch": 0.5800695228838862, + "grad_norm": 4.20829439163208, + "learning_rate": 4.2047502626970236e-05, + "loss": 1.0363, + "num_input_tokens_seen": 133250872, + "step": 8281 + }, + { + "epoch": 0.5801395711296156, + "grad_norm": 4.279886722564697, + "learning_rate": 4.204050437828372e-05, + "loss": 1.1982, + "num_input_tokens_seen": 133266680, + "step": 8282 + }, + { + "epoch": 0.5802096193753448, + "grad_norm": 4.4257354736328125, + "learning_rate": 4.2033506129597206e-05, + "loss": 1.1011, + "num_input_tokens_seen": 133282928, + "step": 8283 + }, + { + "epoch": 0.580279667621074, + "grad_norm": 4.393752098083496, + "learning_rate": 4.202650788091068e-05, + "loss": 1.373, + "num_input_tokens_seen": 133299312, + "step": 8284 + }, + { + "epoch": 0.5803497158668033, + "grad_norm": 3.4351353645324707, + "learning_rate": 4.201950963222417e-05, + "loss": 0.9685, + "num_input_tokens_seen": 133315568, + "step": 8285 + }, + { + "epoch": 0.5804197641125325, + "grad_norm": 5.595382213592529, + "learning_rate": 4.201251138353765e-05, + "loss": 0.9129, + "num_input_tokens_seen": 133331952, + "step": 8286 + }, + { + "epoch": 0.5804898123582618, + "grad_norm": 3.8465957641601562, + "learning_rate": 4.200551313485114e-05, + "loss": 1.083, + "num_input_tokens_seen": 133347992, + "step": 8287 + }, + { + "epoch": 0.580559860603991, + "grad_norm": 3.53275465965271, + "learning_rate": 4.199851488616463e-05, + "loss": 0.8685, + "num_input_tokens_seen": 133364376, + "step": 8288 + }, + { + "epoch": 0.5806299088497202, + "grad_norm": 3.821805000305176, + "learning_rate": 4.199151663747811e-05, + "loss": 0.9433, + "num_input_tokens_seen": 133380760, + "step": 8289 + }, + { + "epoch": 0.5806999570954495, + "grad_norm": 3.843419313430786, + "learning_rate": 4.19845183887916e-05, + "loss": 1.0135, + "num_input_tokens_seen": 133397144, + "step": 8290 + }, + { + "epoch": 0.5807700053411787, + "grad_norm": 3.5370657444000244, + "learning_rate": 4.1977520140105074e-05, + "loss": 1.0259, + "num_input_tokens_seen": 133413528, + "step": 8291 + }, + { + "epoch": 0.5808400535869079, + "grad_norm": 3.9939937591552734, + "learning_rate": 4.197052189141857e-05, + "loss": 1.1104, + "num_input_tokens_seen": 133429912, + "step": 8292 + }, + { + "epoch": 0.5809101018326372, + "grad_norm": 3.8171210289001465, + "learning_rate": 4.196352364273206e-05, + "loss": 1.0962, + "num_input_tokens_seen": 133446280, + "step": 8293 + }, + { + "epoch": 0.5809801500783665, + "grad_norm": 5.993227481842041, + "learning_rate": 4.195652539404553e-05, + "loss": 1.0393, + "num_input_tokens_seen": 133462272, + "step": 8294 + }, + { + "epoch": 0.5810501983240958, + "grad_norm": 3.480929136276245, + "learning_rate": 4.194952714535902e-05, + "loss": 0.8842, + "num_input_tokens_seen": 133478656, + "step": 8295 + }, + { + "epoch": 0.581120246569825, + "grad_norm": 4.371162414550781, + "learning_rate": 4.1942528896672504e-05, + "loss": 0.861, + "num_input_tokens_seen": 133495040, + "step": 8296 + }, + { + "epoch": 0.5811902948155542, + "grad_norm": 3.9835524559020996, + "learning_rate": 4.193553064798599e-05, + "loss": 1.025, + "num_input_tokens_seen": 133510576, + "step": 8297 + }, + { + "epoch": 0.5812603430612835, + "grad_norm": 3.935680866241455, + "learning_rate": 4.192853239929948e-05, + "loss": 1.2121, + "num_input_tokens_seen": 133526960, + "step": 8298 + }, + { + "epoch": 0.5813303913070127, + "grad_norm": 5.499049186706543, + "learning_rate": 4.192153415061296e-05, + "loss": 0.9905, + "num_input_tokens_seen": 133542336, + "step": 8299 + }, + { + "epoch": 0.5814004395527419, + "grad_norm": 4.98264217376709, + "learning_rate": 4.191453590192645e-05, + "loss": 1.0016, + "num_input_tokens_seen": 133556464, + "step": 8300 + }, + { + "epoch": 0.5814704877984712, + "grad_norm": 4.091787338256836, + "learning_rate": 4.1907537653239926e-05, + "loss": 0.9863, + "num_input_tokens_seen": 133572280, + "step": 8301 + }, + { + "epoch": 0.5815405360442004, + "grad_norm": 3.9688591957092285, + "learning_rate": 4.190053940455342e-05, + "loss": 1.0631, + "num_input_tokens_seen": 133588224, + "step": 8302 + }, + { + "epoch": 0.5816105842899297, + "grad_norm": 3.7555758953094482, + "learning_rate": 4.189354115586691e-05, + "loss": 1.0418, + "num_input_tokens_seen": 133604016, + "step": 8303 + }, + { + "epoch": 0.5816806325356589, + "grad_norm": 3.423367977142334, + "learning_rate": 4.1886542907180385e-05, + "loss": 1.0608, + "num_input_tokens_seen": 133620400, + "step": 8304 + }, + { + "epoch": 0.5817506807813881, + "grad_norm": 4.812343597412109, + "learning_rate": 4.1879544658493874e-05, + "loss": 1.0295, + "num_input_tokens_seen": 133636008, + "step": 8305 + }, + { + "epoch": 0.5818207290271175, + "grad_norm": 4.64470100402832, + "learning_rate": 4.1872546409807356e-05, + "loss": 1.1065, + "num_input_tokens_seen": 133651944, + "step": 8306 + }, + { + "epoch": 0.5818907772728467, + "grad_norm": 3.9178478717803955, + "learning_rate": 4.1865548161120844e-05, + "loss": 1.162, + "num_input_tokens_seen": 133667336, + "step": 8307 + }, + { + "epoch": 0.5819608255185759, + "grad_norm": 3.7328476905822754, + "learning_rate": 4.185854991243433e-05, + "loss": 0.9681, + "num_input_tokens_seen": 133683720, + "step": 8308 + }, + { + "epoch": 0.5820308737643052, + "grad_norm": 5.883306980133057, + "learning_rate": 4.1851551663747815e-05, + "loss": 1.0203, + "num_input_tokens_seen": 133700104, + "step": 8309 + }, + { + "epoch": 0.5821009220100344, + "grad_norm": 3.6011807918548584, + "learning_rate": 4.18445534150613e-05, + "loss": 0.8344, + "num_input_tokens_seen": 133716184, + "step": 8310 + }, + { + "epoch": 0.5821709702557637, + "grad_norm": 5.971445083618164, + "learning_rate": 4.183755516637478e-05, + "loss": 1.0888, + "num_input_tokens_seen": 133732552, + "step": 8311 + }, + { + "epoch": 0.5822410185014929, + "grad_norm": 3.7484395503997803, + "learning_rate": 4.1830556917688274e-05, + "loss": 0.9394, + "num_input_tokens_seen": 133748400, + "step": 8312 + }, + { + "epoch": 0.5823110667472221, + "grad_norm": 4.420420169830322, + "learning_rate": 4.182355866900175e-05, + "loss": 0.8677, + "num_input_tokens_seen": 133764784, + "step": 8313 + }, + { + "epoch": 0.5823811149929514, + "grad_norm": 4.270792484283447, + "learning_rate": 4.181656042031524e-05, + "loss": 1.0086, + "num_input_tokens_seen": 133780472, + "step": 8314 + }, + { + "epoch": 0.5824511632386806, + "grad_norm": 3.99307918548584, + "learning_rate": 4.1809562171628726e-05, + "loss": 0.8987, + "num_input_tokens_seen": 133796856, + "step": 8315 + }, + { + "epoch": 0.5825212114844099, + "grad_norm": 3.3790438175201416, + "learning_rate": 4.180256392294221e-05, + "loss": 0.8995, + "num_input_tokens_seen": 133813240, + "step": 8316 + }, + { + "epoch": 0.5825912597301391, + "grad_norm": 3.9880869388580322, + "learning_rate": 4.1795565674255696e-05, + "loss": 1.0347, + "num_input_tokens_seen": 133829624, + "step": 8317 + }, + { + "epoch": 0.5826613079758683, + "grad_norm": 4.359625339508057, + "learning_rate": 4.178856742556918e-05, + "loss": 1.0979, + "num_input_tokens_seen": 133845048, + "step": 8318 + }, + { + "epoch": 0.5827313562215977, + "grad_norm": 3.856341600418091, + "learning_rate": 4.178156917688267e-05, + "loss": 1.0426, + "num_input_tokens_seen": 133861432, + "step": 8319 + }, + { + "epoch": 0.5828014044673269, + "grad_norm": 4.141717433929443, + "learning_rate": 4.1774570928196155e-05, + "loss": 1.1243, + "num_input_tokens_seen": 133877176, + "step": 8320 + }, + { + "epoch": 0.5828714527130561, + "grad_norm": 4.0302605628967285, + "learning_rate": 4.176757267950963e-05, + "loss": 0.9288, + "num_input_tokens_seen": 133892840, + "step": 8321 + }, + { + "epoch": 0.5829415009587854, + "grad_norm": 4.464367389678955, + "learning_rate": 4.1760574430823126e-05, + "loss": 1.2155, + "num_input_tokens_seen": 133909224, + "step": 8322 + }, + { + "epoch": 0.5830115492045146, + "grad_norm": 5.884219169616699, + "learning_rate": 4.17535761821366e-05, + "loss": 1.2055, + "num_input_tokens_seen": 133925432, + "step": 8323 + }, + { + "epoch": 0.5830815974502439, + "grad_norm": 4.006690979003906, + "learning_rate": 4.174657793345009e-05, + "loss": 1.0853, + "num_input_tokens_seen": 133941816, + "step": 8324 + }, + { + "epoch": 0.5831516456959731, + "grad_norm": 4.723453998565674, + "learning_rate": 4.173957968476358e-05, + "loss": 1.0426, + "num_input_tokens_seen": 133958200, + "step": 8325 + }, + { + "epoch": 0.5832216939417023, + "grad_norm": 4.328615665435791, + "learning_rate": 4.173258143607706e-05, + "loss": 1.3134, + "num_input_tokens_seen": 133974584, + "step": 8326 + }, + { + "epoch": 0.5832917421874316, + "grad_norm": 3.6810529232025146, + "learning_rate": 4.172558318739055e-05, + "loss": 1.0627, + "num_input_tokens_seen": 133990936, + "step": 8327 + }, + { + "epoch": 0.5833617904331608, + "grad_norm": 4.4679741859436035, + "learning_rate": 4.171858493870403e-05, + "loss": 1.0715, + "num_input_tokens_seen": 134006488, + "step": 8328 + }, + { + "epoch": 0.58343183867889, + "grad_norm": 3.5542099475860596, + "learning_rate": 4.171158669001752e-05, + "loss": 0.9576, + "num_input_tokens_seen": 134022128, + "step": 8329 + }, + { + "epoch": 0.5835018869246194, + "grad_norm": 5.318003177642822, + "learning_rate": 4.170458844133101e-05, + "loss": 1.0429, + "num_input_tokens_seen": 134038512, + "step": 8330 + }, + { + "epoch": 0.5835719351703486, + "grad_norm": 3.7052905559539795, + "learning_rate": 4.169759019264448e-05, + "loss": 1.0589, + "num_input_tokens_seen": 134054816, + "step": 8331 + }, + { + "epoch": 0.5836419834160779, + "grad_norm": 3.937094211578369, + "learning_rate": 4.169059194395798e-05, + "loss": 0.8523, + "num_input_tokens_seen": 134071200, + "step": 8332 + }, + { + "epoch": 0.5837120316618071, + "grad_norm": 4.033092498779297, + "learning_rate": 4.168359369527145e-05, + "loss": 1.0589, + "num_input_tokens_seen": 134087416, + "step": 8333 + }, + { + "epoch": 0.5837820799075363, + "grad_norm": 3.4819228649139404, + "learning_rate": 4.167659544658494e-05, + "loss": 0.8606, + "num_input_tokens_seen": 134103800, + "step": 8334 + }, + { + "epoch": 0.5838521281532656, + "grad_norm": 4.7478861808776855, + "learning_rate": 4.166959719789843e-05, + "loss": 1.0191, + "num_input_tokens_seen": 134119856, + "step": 8335 + }, + { + "epoch": 0.5839221763989948, + "grad_norm": 3.6348135471343994, + "learning_rate": 4.166259894921191e-05, + "loss": 0.9789, + "num_input_tokens_seen": 134135992, + "step": 8336 + }, + { + "epoch": 0.583992224644724, + "grad_norm": 3.5731043815612793, + "learning_rate": 4.16556007005254e-05, + "loss": 0.9688, + "num_input_tokens_seen": 134152376, + "step": 8337 + }, + { + "epoch": 0.5840622728904533, + "grad_norm": 4.3569254875183105, + "learning_rate": 4.164860245183888e-05, + "loss": 1.1394, + "num_input_tokens_seen": 134168760, + "step": 8338 + }, + { + "epoch": 0.5841323211361825, + "grad_norm": 3.468846082687378, + "learning_rate": 4.164160420315237e-05, + "loss": 0.9848, + "num_input_tokens_seen": 134185144, + "step": 8339 + }, + { + "epoch": 0.5842023693819118, + "grad_norm": 6.284951210021973, + "learning_rate": 4.1634605954465846e-05, + "loss": 0.9518, + "num_input_tokens_seen": 134200680, + "step": 8340 + }, + { + "epoch": 0.584272417627641, + "grad_norm": 4.579503536224365, + "learning_rate": 4.1627607705779334e-05, + "loss": 0.9929, + "num_input_tokens_seen": 134217064, + "step": 8341 + }, + { + "epoch": 0.5843424658733702, + "grad_norm": 3.716926336288452, + "learning_rate": 4.162060945709283e-05, + "loss": 1.0092, + "num_input_tokens_seen": 134233448, + "step": 8342 + }, + { + "epoch": 0.5844125141190996, + "grad_norm": 4.120275974273682, + "learning_rate": 4.1613611208406305e-05, + "loss": 1.097, + "num_input_tokens_seen": 134249832, + "step": 8343 + }, + { + "epoch": 0.5844825623648288, + "grad_norm": 5.749308109283447, + "learning_rate": 4.160661295971979e-05, + "loss": 1.0097, + "num_input_tokens_seen": 134266216, + "step": 8344 + }, + { + "epoch": 0.5845526106105581, + "grad_norm": 3.9694550037384033, + "learning_rate": 4.1599614711033275e-05, + "loss": 0.9815, + "num_input_tokens_seen": 134282600, + "step": 8345 + }, + { + "epoch": 0.5846226588562873, + "grad_norm": 4.208174705505371, + "learning_rate": 4.1592616462346764e-05, + "loss": 1.0167, + "num_input_tokens_seen": 134298088, + "step": 8346 + }, + { + "epoch": 0.5846927071020165, + "grad_norm": 3.957308530807495, + "learning_rate": 4.158561821366025e-05, + "loss": 1.0122, + "num_input_tokens_seen": 134314144, + "step": 8347 + }, + { + "epoch": 0.5847627553477458, + "grad_norm": 4.887307643890381, + "learning_rate": 4.1578619964973734e-05, + "loss": 0.9538, + "num_input_tokens_seen": 134330528, + "step": 8348 + }, + { + "epoch": 0.584832803593475, + "grad_norm": 4.353172302246094, + "learning_rate": 4.157162171628722e-05, + "loss": 1.1329, + "num_input_tokens_seen": 134346912, + "step": 8349 + }, + { + "epoch": 0.5849028518392042, + "grad_norm": 3.504237174987793, + "learning_rate": 4.15646234676007e-05, + "loss": 0.9481, + "num_input_tokens_seen": 134363296, + "step": 8350 + }, + { + "epoch": 0.5849729000849335, + "grad_norm": 4.646234512329102, + "learning_rate": 4.1557625218914186e-05, + "loss": 1.0612, + "num_input_tokens_seen": 134379376, + "step": 8351 + }, + { + "epoch": 0.5850429483306627, + "grad_norm": 4.285154819488525, + "learning_rate": 4.155062697022768e-05, + "loss": 0.9446, + "num_input_tokens_seen": 134395760, + "step": 8352 + }, + { + "epoch": 0.585112996576392, + "grad_norm": 3.5311827659606934, + "learning_rate": 4.154362872154116e-05, + "loss": 1.0646, + "num_input_tokens_seen": 134412144, + "step": 8353 + }, + { + "epoch": 0.5851830448221212, + "grad_norm": 4.413166522979736, + "learning_rate": 4.1536630472854645e-05, + "loss": 0.9335, + "num_input_tokens_seen": 134428136, + "step": 8354 + }, + { + "epoch": 0.5852530930678504, + "grad_norm": 4.07955265045166, + "learning_rate": 4.152963222416813e-05, + "loss": 0.8923, + "num_input_tokens_seen": 134444512, + "step": 8355 + }, + { + "epoch": 0.5853231413135798, + "grad_norm": 3.809666633605957, + "learning_rate": 4.1522633975481616e-05, + "loss": 0.8746, + "num_input_tokens_seen": 134460896, + "step": 8356 + }, + { + "epoch": 0.585393189559309, + "grad_norm": 3.916811466217041, + "learning_rate": 4.1515635726795104e-05, + "loss": 0.9548, + "num_input_tokens_seen": 134477128, + "step": 8357 + }, + { + "epoch": 0.5854632378050382, + "grad_norm": 4.052529811859131, + "learning_rate": 4.1508637478108586e-05, + "loss": 1.0494, + "num_input_tokens_seen": 134493512, + "step": 8358 + }, + { + "epoch": 0.5855332860507675, + "grad_norm": 4.45082950592041, + "learning_rate": 4.1501639229422075e-05, + "loss": 1.2332, + "num_input_tokens_seen": 134509192, + "step": 8359 + }, + { + "epoch": 0.5856033342964967, + "grad_norm": 4.140877723693848, + "learning_rate": 4.149464098073555e-05, + "loss": 1.1588, + "num_input_tokens_seen": 134524848, + "step": 8360 + }, + { + "epoch": 0.585673382542226, + "grad_norm": 3.7743544578552246, + "learning_rate": 4.148764273204904e-05, + "loss": 0.8837, + "num_input_tokens_seen": 134541232, + "step": 8361 + }, + { + "epoch": 0.5857434307879552, + "grad_norm": 3.654794692993164, + "learning_rate": 4.1480644483362534e-05, + "loss": 1.2079, + "num_input_tokens_seen": 134557616, + "step": 8362 + }, + { + "epoch": 0.5858134790336844, + "grad_norm": 3.4448959827423096, + "learning_rate": 4.147364623467601e-05, + "loss": 0.8832, + "num_input_tokens_seen": 134574000, + "step": 8363 + }, + { + "epoch": 0.5858835272794137, + "grad_norm": 4.829925537109375, + "learning_rate": 4.14666479859895e-05, + "loss": 1.1541, + "num_input_tokens_seen": 134590384, + "step": 8364 + }, + { + "epoch": 0.5859535755251429, + "grad_norm": 3.5955686569213867, + "learning_rate": 4.145964973730298e-05, + "loss": 1.0381, + "num_input_tokens_seen": 134606768, + "step": 8365 + }, + { + "epoch": 0.5860236237708721, + "grad_norm": 5.0735368728637695, + "learning_rate": 4.145265148861647e-05, + "loss": 1.238, + "num_input_tokens_seen": 134622752, + "step": 8366 + }, + { + "epoch": 0.5860936720166015, + "grad_norm": 3.8610787391662598, + "learning_rate": 4.144565323992994e-05, + "loss": 1.1311, + "num_input_tokens_seen": 134639112, + "step": 8367 + }, + { + "epoch": 0.5861637202623307, + "grad_norm": 3.630153179168701, + "learning_rate": 4.143865499124344e-05, + "loss": 0.9734, + "num_input_tokens_seen": 134655496, + "step": 8368 + }, + { + "epoch": 0.58623376850806, + "grad_norm": 4.367414951324463, + "learning_rate": 4.143165674255693e-05, + "loss": 1.0743, + "num_input_tokens_seen": 134671880, + "step": 8369 + }, + { + "epoch": 0.5863038167537892, + "grad_norm": 3.709831953048706, + "learning_rate": 4.14246584938704e-05, + "loss": 1.0332, + "num_input_tokens_seen": 134688168, + "step": 8370 + }, + { + "epoch": 0.5863738649995184, + "grad_norm": 4.649940490722656, + "learning_rate": 4.141766024518389e-05, + "loss": 1.3275, + "num_input_tokens_seen": 134704552, + "step": 8371 + }, + { + "epoch": 0.5864439132452477, + "grad_norm": 3.7334702014923096, + "learning_rate": 4.141066199649737e-05, + "loss": 0.7565, + "num_input_tokens_seen": 134719896, + "step": 8372 + }, + { + "epoch": 0.5865139614909769, + "grad_norm": 4.841366291046143, + "learning_rate": 4.140366374781086e-05, + "loss": 1.337, + "num_input_tokens_seen": 134735168, + "step": 8373 + }, + { + "epoch": 0.5865840097367061, + "grad_norm": 4.331602096557617, + "learning_rate": 4.139666549912435e-05, + "loss": 1.1276, + "num_input_tokens_seen": 134751552, + "step": 8374 + }, + { + "epoch": 0.5866540579824354, + "grad_norm": 4.3667497634887695, + "learning_rate": 4.138966725043783e-05, + "loss": 0.9588, + "num_input_tokens_seen": 134767272, + "step": 8375 + }, + { + "epoch": 0.5867241062281646, + "grad_norm": 3.6581032276153564, + "learning_rate": 4.138266900175132e-05, + "loss": 1.0778, + "num_input_tokens_seen": 134783656, + "step": 8376 + }, + { + "epoch": 0.5867941544738939, + "grad_norm": 3.8137452602386475, + "learning_rate": 4.1375670753064795e-05, + "loss": 1.1091, + "num_input_tokens_seen": 134799760, + "step": 8377 + }, + { + "epoch": 0.5868642027196231, + "grad_norm": 4.273350238800049, + "learning_rate": 4.136867250437829e-05, + "loss": 1.0878, + "num_input_tokens_seen": 134816144, + "step": 8378 + }, + { + "epoch": 0.5869342509653523, + "grad_norm": 4.527581214904785, + "learning_rate": 4.136167425569178e-05, + "loss": 1.0128, + "num_input_tokens_seen": 134832528, + "step": 8379 + }, + { + "epoch": 0.5870042992110817, + "grad_norm": 4.647453784942627, + "learning_rate": 4.1354676007005254e-05, + "loss": 1.0938, + "num_input_tokens_seen": 134847984, + "step": 8380 + }, + { + "epoch": 0.5870743474568109, + "grad_norm": 5.219020843505859, + "learning_rate": 4.134767775831874e-05, + "loss": 1.2273, + "num_input_tokens_seen": 134864368, + "step": 8381 + }, + { + "epoch": 0.5871443957025402, + "grad_norm": 4.104679107666016, + "learning_rate": 4.1340679509632224e-05, + "loss": 1.0368, + "num_input_tokens_seen": 134879776, + "step": 8382 + }, + { + "epoch": 0.5872144439482694, + "grad_norm": 4.659088134765625, + "learning_rate": 4.133368126094571e-05, + "loss": 1.258, + "num_input_tokens_seen": 134894656, + "step": 8383 + }, + { + "epoch": 0.5872844921939986, + "grad_norm": 5.709257125854492, + "learning_rate": 4.13266830122592e-05, + "loss": 1.0678, + "num_input_tokens_seen": 134910952, + "step": 8384 + }, + { + "epoch": 0.5873545404397279, + "grad_norm": 3.874393939971924, + "learning_rate": 4.131968476357268e-05, + "loss": 1.2776, + "num_input_tokens_seen": 134927320, + "step": 8385 + }, + { + "epoch": 0.5874245886854571, + "grad_norm": 3.5335848331451416, + "learning_rate": 4.131268651488617e-05, + "loss": 0.9319, + "num_input_tokens_seen": 134943704, + "step": 8386 + }, + { + "epoch": 0.5874946369311863, + "grad_norm": 7.110137462615967, + "learning_rate": 4.130568826619965e-05, + "loss": 1.0478, + "num_input_tokens_seen": 134960088, + "step": 8387 + }, + { + "epoch": 0.5875646851769156, + "grad_norm": 5.622186660766602, + "learning_rate": 4.129869001751314e-05, + "loss": 1.3012, + "num_input_tokens_seen": 134976232, + "step": 8388 + }, + { + "epoch": 0.5876347334226448, + "grad_norm": 4.596433162689209, + "learning_rate": 4.129169176882663e-05, + "loss": 1.1115, + "num_input_tokens_seen": 134992616, + "step": 8389 + }, + { + "epoch": 0.5877047816683741, + "grad_norm": 4.493381023406982, + "learning_rate": 4.1284693520140106e-05, + "loss": 0.9248, + "num_input_tokens_seen": 135008704, + "step": 8390 + }, + { + "epoch": 0.5877748299141033, + "grad_norm": 3.4309275150299072, + "learning_rate": 4.1277695271453594e-05, + "loss": 0.9195, + "num_input_tokens_seen": 135024360, + "step": 8391 + }, + { + "epoch": 0.5878448781598326, + "grad_norm": 3.7281200885772705, + "learning_rate": 4.1270697022767076e-05, + "loss": 1.1526, + "num_input_tokens_seen": 135040744, + "step": 8392 + }, + { + "epoch": 0.5879149264055619, + "grad_norm": 4.484415054321289, + "learning_rate": 4.1263698774080565e-05, + "loss": 0.9914, + "num_input_tokens_seen": 135057128, + "step": 8393 + }, + { + "epoch": 0.5879849746512911, + "grad_norm": 4.102346897125244, + "learning_rate": 4.125670052539405e-05, + "loss": 1.0439, + "num_input_tokens_seen": 135073248, + "step": 8394 + }, + { + "epoch": 0.5880550228970203, + "grad_norm": 7.703208923339844, + "learning_rate": 4.1249702276707535e-05, + "loss": 1.3581, + "num_input_tokens_seen": 135088344, + "step": 8395 + }, + { + "epoch": 0.5881250711427496, + "grad_norm": 6.113401889801025, + "learning_rate": 4.1242704028021024e-05, + "loss": 1.0845, + "num_input_tokens_seen": 135104728, + "step": 8396 + }, + { + "epoch": 0.5881951193884788, + "grad_norm": 3.8614649772644043, + "learning_rate": 4.12357057793345e-05, + "loss": 0.9821, + "num_input_tokens_seen": 135120272, + "step": 8397 + }, + { + "epoch": 0.5882651676342081, + "grad_norm": 3.9187567234039307, + "learning_rate": 4.1228707530647994e-05, + "loss": 0.9296, + "num_input_tokens_seen": 135136152, + "step": 8398 + }, + { + "epoch": 0.5883352158799373, + "grad_norm": 3.274703025817871, + "learning_rate": 4.122170928196147e-05, + "loss": 0.9896, + "num_input_tokens_seen": 135152536, + "step": 8399 + }, + { + "epoch": 0.5884052641256665, + "grad_norm": 3.8558948040008545, + "learning_rate": 4.121471103327496e-05, + "loss": 1.0359, + "num_input_tokens_seen": 135168920, + "step": 8400 + }, + { + "epoch": 0.5884052641256665, + "eval_loss": 1.1207953691482544, + "eval_runtime": 0.5047, + "eval_samples_per_second": 1.981, + "eval_steps_per_second": 1.981, + "num_input_tokens_seen": 135168920, + "step": 8400 + }, + { + "epoch": 0.5884753123713958, + "grad_norm": 3.8348987102508545, + "learning_rate": 4.1207712784588446e-05, + "loss": 1.1777, + "num_input_tokens_seen": 135185240, + "step": 8401 + }, + { + "epoch": 0.588545360617125, + "grad_norm": 4.601896286010742, + "learning_rate": 4.120071453590193e-05, + "loss": 1.1153, + "num_input_tokens_seen": 135201624, + "step": 8402 + }, + { + "epoch": 0.5886154088628542, + "grad_norm": 5.034974098205566, + "learning_rate": 4.119371628721542e-05, + "loss": 1.107, + "num_input_tokens_seen": 135217312, + "step": 8403 + }, + { + "epoch": 0.5886854571085836, + "grad_norm": 3.9467031955718994, + "learning_rate": 4.11867180385289e-05, + "loss": 1.156, + "num_input_tokens_seen": 135233696, + "step": 8404 + }, + { + "epoch": 0.5887555053543128, + "grad_norm": 4.382333278656006, + "learning_rate": 4.117971978984239e-05, + "loss": 1.1136, + "num_input_tokens_seen": 135250080, + "step": 8405 + }, + { + "epoch": 0.5888255536000421, + "grad_norm": 4.233513832092285, + "learning_rate": 4.1172721541155876e-05, + "loss": 0.9812, + "num_input_tokens_seen": 135266464, + "step": 8406 + }, + { + "epoch": 0.5888956018457713, + "grad_norm": 4.086855411529541, + "learning_rate": 4.116572329246935e-05, + "loss": 1.1262, + "num_input_tokens_seen": 135282056, + "step": 8407 + }, + { + "epoch": 0.5889656500915005, + "grad_norm": 5.637170791625977, + "learning_rate": 4.1158725043782846e-05, + "loss": 0.9163, + "num_input_tokens_seen": 135298440, + "step": 8408 + }, + { + "epoch": 0.5890356983372298, + "grad_norm": 4.02920389175415, + "learning_rate": 4.115172679509632e-05, + "loss": 1.1859, + "num_input_tokens_seen": 135314824, + "step": 8409 + }, + { + "epoch": 0.589105746582959, + "grad_norm": 4.483967304229736, + "learning_rate": 4.114472854640981e-05, + "loss": 1.2137, + "num_input_tokens_seen": 135331208, + "step": 8410 + }, + { + "epoch": 0.5891757948286882, + "grad_norm": 3.4211783409118652, + "learning_rate": 4.11377302977233e-05, + "loss": 1.0682, + "num_input_tokens_seen": 135347592, + "step": 8411 + }, + { + "epoch": 0.5892458430744175, + "grad_norm": 5.509677886962891, + "learning_rate": 4.113073204903678e-05, + "loss": 1.0495, + "num_input_tokens_seen": 135363976, + "step": 8412 + }, + { + "epoch": 0.5893158913201467, + "grad_norm": 3.858487129211426, + "learning_rate": 4.112373380035027e-05, + "loss": 0.9229, + "num_input_tokens_seen": 135380360, + "step": 8413 + }, + { + "epoch": 0.589385939565876, + "grad_norm": 3.9579319953918457, + "learning_rate": 4.111673555166375e-05, + "loss": 0.9867, + "num_input_tokens_seen": 135396520, + "step": 8414 + }, + { + "epoch": 0.5894559878116052, + "grad_norm": 4.196267127990723, + "learning_rate": 4.110973730297724e-05, + "loss": 1.0332, + "num_input_tokens_seen": 135412904, + "step": 8415 + }, + { + "epoch": 0.5895260360573344, + "grad_norm": 4.3482584953308105, + "learning_rate": 4.110273905429073e-05, + "loss": 1.153, + "num_input_tokens_seen": 135428608, + "step": 8416 + }, + { + "epoch": 0.5895960843030638, + "grad_norm": 4.085668087005615, + "learning_rate": 4.10957408056042e-05, + "loss": 0.8883, + "num_input_tokens_seen": 135444992, + "step": 8417 + }, + { + "epoch": 0.589666132548793, + "grad_norm": 5.926814556121826, + "learning_rate": 4.10887425569177e-05, + "loss": 1.1014, + "num_input_tokens_seen": 135461376, + "step": 8418 + }, + { + "epoch": 0.5897361807945223, + "grad_norm": 3.9502601623535156, + "learning_rate": 4.108174430823117e-05, + "loss": 1.0685, + "num_input_tokens_seen": 135477760, + "step": 8419 + }, + { + "epoch": 0.5898062290402515, + "grad_norm": 5.679574012756348, + "learning_rate": 4.107474605954466e-05, + "loss": 1.1308, + "num_input_tokens_seen": 135494144, + "step": 8420 + }, + { + "epoch": 0.5898762772859807, + "grad_norm": 3.5571305751800537, + "learning_rate": 4.1067747810858144e-05, + "loss": 0.9001, + "num_input_tokens_seen": 135510528, + "step": 8421 + }, + { + "epoch": 0.58994632553171, + "grad_norm": 4.572329998016357, + "learning_rate": 4.106074956217163e-05, + "loss": 0.8941, + "num_input_tokens_seen": 135526912, + "step": 8422 + }, + { + "epoch": 0.5900163737774392, + "grad_norm": 3.605867385864258, + "learning_rate": 4.105375131348512e-05, + "loss": 0.8755, + "num_input_tokens_seen": 135542984, + "step": 8423 + }, + { + "epoch": 0.5900864220231684, + "grad_norm": 3.735992193222046, + "learning_rate": 4.10467530647986e-05, + "loss": 1.2112, + "num_input_tokens_seen": 135559368, + "step": 8424 + }, + { + "epoch": 0.5901564702688977, + "grad_norm": 3.49092435836792, + "learning_rate": 4.103975481611209e-05, + "loss": 0.9727, + "num_input_tokens_seen": 135575752, + "step": 8425 + }, + { + "epoch": 0.5902265185146269, + "grad_norm": 4.640012741088867, + "learning_rate": 4.1032756567425566e-05, + "loss": 1.0578, + "num_input_tokens_seen": 135590864, + "step": 8426 + }, + { + "epoch": 0.5902965667603562, + "grad_norm": 4.801137447357178, + "learning_rate": 4.1025758318739055e-05, + "loss": 1.2412, + "num_input_tokens_seen": 135607032, + "step": 8427 + }, + { + "epoch": 0.5903666150060854, + "grad_norm": 3.724868059158325, + "learning_rate": 4.101876007005255e-05, + "loss": 0.8363, + "num_input_tokens_seen": 135623416, + "step": 8428 + }, + { + "epoch": 0.5904366632518147, + "grad_norm": 3.8564326763153076, + "learning_rate": 4.1011761821366025e-05, + "loss": 1.0217, + "num_input_tokens_seen": 135639800, + "step": 8429 + }, + { + "epoch": 0.590506711497544, + "grad_norm": 6.56883430480957, + "learning_rate": 4.1004763572679514e-05, + "loss": 1.2159, + "num_input_tokens_seen": 135656184, + "step": 8430 + }, + { + "epoch": 0.5905767597432732, + "grad_norm": 4.201202869415283, + "learning_rate": 4.0997765323992996e-05, + "loss": 1.0327, + "num_input_tokens_seen": 135671440, + "step": 8431 + }, + { + "epoch": 0.5906468079890024, + "grad_norm": 3.847327709197998, + "learning_rate": 4.0990767075306484e-05, + "loss": 1.1325, + "num_input_tokens_seen": 135687824, + "step": 8432 + }, + { + "epoch": 0.5907168562347317, + "grad_norm": 3.914062976837158, + "learning_rate": 4.098376882661997e-05, + "loss": 1.1359, + "num_input_tokens_seen": 135703984, + "step": 8433 + }, + { + "epoch": 0.5907869044804609, + "grad_norm": 3.8910293579101562, + "learning_rate": 4.0976770577933455e-05, + "loss": 1.0944, + "num_input_tokens_seen": 135720168, + "step": 8434 + }, + { + "epoch": 0.5908569527261902, + "grad_norm": 3.861595392227173, + "learning_rate": 4.096977232924694e-05, + "loss": 1.1301, + "num_input_tokens_seen": 135736552, + "step": 8435 + }, + { + "epoch": 0.5909270009719194, + "grad_norm": 3.977604866027832, + "learning_rate": 4.096277408056042e-05, + "loss": 1.1798, + "num_input_tokens_seen": 135751784, + "step": 8436 + }, + { + "epoch": 0.5909970492176486, + "grad_norm": 5.1813063621521, + "learning_rate": 4.095577583187391e-05, + "loss": 1.1489, + "num_input_tokens_seen": 135768168, + "step": 8437 + }, + { + "epoch": 0.5910670974633779, + "grad_norm": 4.813988208770752, + "learning_rate": 4.0948777583187396e-05, + "loss": 1.0265, + "num_input_tokens_seen": 135784056, + "step": 8438 + }, + { + "epoch": 0.5911371457091071, + "grad_norm": 3.533111572265625, + "learning_rate": 4.094177933450088e-05, + "loss": 1.1011, + "num_input_tokens_seen": 135800440, + "step": 8439 + }, + { + "epoch": 0.5912071939548363, + "grad_norm": 4.941068649291992, + "learning_rate": 4.0934781085814366e-05, + "loss": 1.0636, + "num_input_tokens_seen": 135816632, + "step": 8440 + }, + { + "epoch": 0.5912772422005657, + "grad_norm": 3.904463529586792, + "learning_rate": 4.092778283712785e-05, + "loss": 1.0524, + "num_input_tokens_seen": 135833016, + "step": 8441 + }, + { + "epoch": 0.5913472904462949, + "grad_norm": 3.8580570220947266, + "learning_rate": 4.0920784588441336e-05, + "loss": 1.0551, + "num_input_tokens_seen": 135848880, + "step": 8442 + }, + { + "epoch": 0.5914173386920242, + "grad_norm": 4.239645957946777, + "learning_rate": 4.0913786339754825e-05, + "loss": 1.0472, + "num_input_tokens_seen": 135864784, + "step": 8443 + }, + { + "epoch": 0.5914873869377534, + "grad_norm": 5.425406455993652, + "learning_rate": 4.090678809106831e-05, + "loss": 0.9734, + "num_input_tokens_seen": 135881168, + "step": 8444 + }, + { + "epoch": 0.5915574351834826, + "grad_norm": 5.343285083770752, + "learning_rate": 4.0899789842381795e-05, + "loss": 1.0865, + "num_input_tokens_seen": 135897160, + "step": 8445 + }, + { + "epoch": 0.5916274834292119, + "grad_norm": 3.98968768119812, + "learning_rate": 4.089279159369527e-05, + "loss": 1.0532, + "num_input_tokens_seen": 135913544, + "step": 8446 + }, + { + "epoch": 0.5916975316749411, + "grad_norm": 4.17509651184082, + "learning_rate": 4.088579334500876e-05, + "loss": 1.0097, + "num_input_tokens_seen": 135929856, + "step": 8447 + }, + { + "epoch": 0.5917675799206704, + "grad_norm": 3.902754545211792, + "learning_rate": 4.087879509632224e-05, + "loss": 1.0945, + "num_input_tokens_seen": 135946240, + "step": 8448 + }, + { + "epoch": 0.5918376281663996, + "grad_norm": 3.9351940155029297, + "learning_rate": 4.087179684763573e-05, + "loss": 1.0191, + "num_input_tokens_seen": 135962624, + "step": 8449 + }, + { + "epoch": 0.5919076764121288, + "grad_norm": 11.633938789367676, + "learning_rate": 4.086479859894922e-05, + "loss": 0.9441, + "num_input_tokens_seen": 135977400, + "step": 8450 + }, + { + "epoch": 0.5919777246578581, + "grad_norm": 5.2794952392578125, + "learning_rate": 4.08578003502627e-05, + "loss": 1.2718, + "num_input_tokens_seen": 135993784, + "step": 8451 + }, + { + "epoch": 0.5920477729035873, + "grad_norm": 4.678462982177734, + "learning_rate": 4.085080210157619e-05, + "loss": 0.9368, + "num_input_tokens_seen": 136009792, + "step": 8452 + }, + { + "epoch": 0.5921178211493165, + "grad_norm": 4.661433696746826, + "learning_rate": 4.0843803852889663e-05, + "loss": 0.9451, + "num_input_tokens_seen": 136026080, + "step": 8453 + }, + { + "epoch": 0.5921878693950459, + "grad_norm": 3.5805437564849854, + "learning_rate": 4.083680560420315e-05, + "loss": 1.1001, + "num_input_tokens_seen": 136042464, + "step": 8454 + }, + { + "epoch": 0.5922579176407751, + "grad_norm": 4.52186918258667, + "learning_rate": 4.082980735551665e-05, + "loss": 1.1597, + "num_input_tokens_seen": 136058848, + "step": 8455 + }, + { + "epoch": 0.5923279658865044, + "grad_norm": 4.3744096755981445, + "learning_rate": 4.082280910683012e-05, + "loss": 0.9786, + "num_input_tokens_seen": 136074592, + "step": 8456 + }, + { + "epoch": 0.5923980141322336, + "grad_norm": 3.618044137954712, + "learning_rate": 4.081581085814361e-05, + "loss": 0.9851, + "num_input_tokens_seen": 136090976, + "step": 8457 + }, + { + "epoch": 0.5924680623779628, + "grad_norm": 4.326976776123047, + "learning_rate": 4.080881260945709e-05, + "loss": 1.208, + "num_input_tokens_seen": 136106328, + "step": 8458 + }, + { + "epoch": 0.5925381106236921, + "grad_norm": 4.256892681121826, + "learning_rate": 4.080181436077058e-05, + "loss": 0.9317, + "num_input_tokens_seen": 136122712, + "step": 8459 + }, + { + "epoch": 0.5926081588694213, + "grad_norm": 7.176054954528809, + "learning_rate": 4.079481611208407e-05, + "loss": 1.1927, + "num_input_tokens_seen": 136138568, + "step": 8460 + }, + { + "epoch": 0.5926782071151505, + "grad_norm": 4.251017093658447, + "learning_rate": 4.078781786339755e-05, + "loss": 0.9107, + "num_input_tokens_seen": 136154536, + "step": 8461 + }, + { + "epoch": 0.5927482553608798, + "grad_norm": 4.4962358474731445, + "learning_rate": 4.078081961471104e-05, + "loss": 1.2891, + "num_input_tokens_seen": 136170904, + "step": 8462 + }, + { + "epoch": 0.592818303606609, + "grad_norm": 3.534278392791748, + "learning_rate": 4.0773821366024515e-05, + "loss": 1.0803, + "num_input_tokens_seen": 136187288, + "step": 8463 + }, + { + "epoch": 0.5928883518523383, + "grad_norm": 3.931110143661499, + "learning_rate": 4.0766823117338004e-05, + "loss": 1.0634, + "num_input_tokens_seen": 136203672, + "step": 8464 + }, + { + "epoch": 0.5929584000980676, + "grad_norm": 3.703648567199707, + "learning_rate": 4.07598248686515e-05, + "loss": 1.0554, + "num_input_tokens_seen": 136219608, + "step": 8465 + }, + { + "epoch": 0.5930284483437968, + "grad_norm": 3.503268003463745, + "learning_rate": 4.0752826619964974e-05, + "loss": 0.9267, + "num_input_tokens_seen": 136235736, + "step": 8466 + }, + { + "epoch": 0.5930984965895261, + "grad_norm": 3.9382851123809814, + "learning_rate": 4.074582837127846e-05, + "loss": 1.0025, + "num_input_tokens_seen": 136251280, + "step": 8467 + }, + { + "epoch": 0.5931685448352553, + "grad_norm": 4.2469892501831055, + "learning_rate": 4.0738830122591945e-05, + "loss": 1.2617, + "num_input_tokens_seen": 136267664, + "step": 8468 + }, + { + "epoch": 0.5932385930809845, + "grad_norm": 3.9730587005615234, + "learning_rate": 4.0731831873905433e-05, + "loss": 0.9621, + "num_input_tokens_seen": 136282392, + "step": 8469 + }, + { + "epoch": 0.5933086413267138, + "grad_norm": 4.624509334564209, + "learning_rate": 4.072483362521892e-05, + "loss": 1.0994, + "num_input_tokens_seen": 136298776, + "step": 8470 + }, + { + "epoch": 0.593378689572443, + "grad_norm": 3.6294236183166504, + "learning_rate": 4.0717835376532404e-05, + "loss": 0.9881, + "num_input_tokens_seen": 136315160, + "step": 8471 + }, + { + "epoch": 0.5934487378181723, + "grad_norm": 7.062867641448975, + "learning_rate": 4.071083712784589e-05, + "loss": 1.2028, + "num_input_tokens_seen": 136331184, + "step": 8472 + }, + { + "epoch": 0.5935187860639015, + "grad_norm": 6.272582054138184, + "learning_rate": 4.070383887915937e-05, + "loss": 0.9448, + "num_input_tokens_seen": 136347568, + "step": 8473 + }, + { + "epoch": 0.5935888343096307, + "grad_norm": 5.614204406738281, + "learning_rate": 4.0696840630472856e-05, + "loss": 1.0933, + "num_input_tokens_seen": 136362560, + "step": 8474 + }, + { + "epoch": 0.59365888255536, + "grad_norm": 5.849013805389404, + "learning_rate": 4.068984238178634e-05, + "loss": 0.9458, + "num_input_tokens_seen": 136378688, + "step": 8475 + }, + { + "epoch": 0.5937289308010892, + "grad_norm": 3.793728828430176, + "learning_rate": 4.0682844133099826e-05, + "loss": 1.0254, + "num_input_tokens_seen": 136395032, + "step": 8476 + }, + { + "epoch": 0.5937989790468184, + "grad_norm": 4.718780517578125, + "learning_rate": 4.0675845884413315e-05, + "loss": 1.1872, + "num_input_tokens_seen": 136411416, + "step": 8477 + }, + { + "epoch": 0.5938690272925478, + "grad_norm": 3.784729480743408, + "learning_rate": 4.06688476357268e-05, + "loss": 1.0111, + "num_input_tokens_seen": 136427800, + "step": 8478 + }, + { + "epoch": 0.593939075538277, + "grad_norm": 5.107780933380127, + "learning_rate": 4.0661849387040285e-05, + "loss": 1.0403, + "num_input_tokens_seen": 136443872, + "step": 8479 + }, + { + "epoch": 0.5940091237840063, + "grad_norm": 4.075433254241943, + "learning_rate": 4.065485113835376e-05, + "loss": 1.0314, + "num_input_tokens_seen": 136460256, + "step": 8480 + }, + { + "epoch": 0.5940791720297355, + "grad_norm": 4.13838005065918, + "learning_rate": 4.0647852889667256e-05, + "loss": 1.206, + "num_input_tokens_seen": 136476640, + "step": 8481 + }, + { + "epoch": 0.5941492202754647, + "grad_norm": 3.5712060928344727, + "learning_rate": 4.0640854640980744e-05, + "loss": 0.9423, + "num_input_tokens_seen": 136493024, + "step": 8482 + }, + { + "epoch": 0.594219268521194, + "grad_norm": 5.28206205368042, + "learning_rate": 4.063385639229422e-05, + "loss": 0.9835, + "num_input_tokens_seen": 136509408, + "step": 8483 + }, + { + "epoch": 0.5942893167669232, + "grad_norm": 3.5715935230255127, + "learning_rate": 4.062685814360771e-05, + "loss": 0.9044, + "num_input_tokens_seen": 136525792, + "step": 8484 + }, + { + "epoch": 0.5943593650126525, + "grad_norm": 6.536035060882568, + "learning_rate": 4.061985989492119e-05, + "loss": 1.1452, + "num_input_tokens_seen": 136542176, + "step": 8485 + }, + { + "epoch": 0.5944294132583817, + "grad_norm": 3.952908992767334, + "learning_rate": 4.061286164623468e-05, + "loss": 0.8111, + "num_input_tokens_seen": 136558560, + "step": 8486 + }, + { + "epoch": 0.5944994615041109, + "grad_norm": 5.087388038635254, + "learning_rate": 4.060586339754817e-05, + "loss": 0.8976, + "num_input_tokens_seen": 136574600, + "step": 8487 + }, + { + "epoch": 0.5945695097498402, + "grad_norm": 3.6439568996429443, + "learning_rate": 4.059886514886165e-05, + "loss": 0.8971, + "num_input_tokens_seen": 136590296, + "step": 8488 + }, + { + "epoch": 0.5946395579955694, + "grad_norm": 3.4764657020568848, + "learning_rate": 4.059186690017514e-05, + "loss": 0.9029, + "num_input_tokens_seen": 136606680, + "step": 8489 + }, + { + "epoch": 0.5947096062412986, + "grad_norm": 4.079936504364014, + "learning_rate": 4.058486865148861e-05, + "loss": 0.8968, + "num_input_tokens_seen": 136622560, + "step": 8490 + }, + { + "epoch": 0.594779654487028, + "grad_norm": 3.562714099884033, + "learning_rate": 4.057787040280211e-05, + "loss": 0.9019, + "num_input_tokens_seen": 136638720, + "step": 8491 + }, + { + "epoch": 0.5948497027327572, + "grad_norm": 3.68719482421875, + "learning_rate": 4.0570872154115596e-05, + "loss": 0.8945, + "num_input_tokens_seen": 136654616, + "step": 8492 + }, + { + "epoch": 0.5949197509784865, + "grad_norm": 4.2226457595825195, + "learning_rate": 4.056387390542907e-05, + "loss": 1.025, + "num_input_tokens_seen": 136671000, + "step": 8493 + }, + { + "epoch": 0.5949897992242157, + "grad_norm": 3.5887370109558105, + "learning_rate": 4.055687565674256e-05, + "loss": 1.0966, + "num_input_tokens_seen": 136686752, + "step": 8494 + }, + { + "epoch": 0.5950598474699449, + "grad_norm": 4.240508556365967, + "learning_rate": 4.054987740805604e-05, + "loss": 1.1577, + "num_input_tokens_seen": 136702304, + "step": 8495 + }, + { + "epoch": 0.5951298957156742, + "grad_norm": 4.054566383361816, + "learning_rate": 4.054287915936953e-05, + "loss": 1.1753, + "num_input_tokens_seen": 136718328, + "step": 8496 + }, + { + "epoch": 0.5951999439614034, + "grad_norm": 4.475510597229004, + "learning_rate": 4.053588091068302e-05, + "loss": 0.9763, + "num_input_tokens_seen": 136734712, + "step": 8497 + }, + { + "epoch": 0.5952699922071326, + "grad_norm": 4.823075771331787, + "learning_rate": 4.05288826619965e-05, + "loss": 1.0618, + "num_input_tokens_seen": 136751096, + "step": 8498 + }, + { + "epoch": 0.5953400404528619, + "grad_norm": 4.210450172424316, + "learning_rate": 4.052188441330999e-05, + "loss": 1.0448, + "num_input_tokens_seen": 136766608, + "step": 8499 + }, + { + "epoch": 0.5954100886985911, + "grad_norm": 3.9019668102264404, + "learning_rate": 4.0514886164623465e-05, + "loss": 1.0376, + "num_input_tokens_seen": 136781912, + "step": 8500 + }, + { + "epoch": 0.5954801369443204, + "grad_norm": 3.659578800201416, + "learning_rate": 4.050788791593696e-05, + "loss": 1.0362, + "num_input_tokens_seen": 136798296, + "step": 8501 + }, + { + "epoch": 0.5955501851900497, + "grad_norm": 3.483598232269287, + "learning_rate": 4.0500889667250435e-05, + "loss": 0.9742, + "num_input_tokens_seen": 136814680, + "step": 8502 + }, + { + "epoch": 0.5956202334357789, + "grad_norm": 3.6818151473999023, + "learning_rate": 4.0493891418563924e-05, + "loss": 0.9461, + "num_input_tokens_seen": 136830744, + "step": 8503 + }, + { + "epoch": 0.5956902816815082, + "grad_norm": 3.6853933334350586, + "learning_rate": 4.048689316987741e-05, + "loss": 0.8666, + "num_input_tokens_seen": 136846584, + "step": 8504 + }, + { + "epoch": 0.5957603299272374, + "grad_norm": 3.8736915588378906, + "learning_rate": 4.0479894921190894e-05, + "loss": 0.9954, + "num_input_tokens_seen": 136862640, + "step": 8505 + }, + { + "epoch": 0.5958303781729666, + "grad_norm": 3.9588091373443604, + "learning_rate": 4.047289667250438e-05, + "loss": 1.0243, + "num_input_tokens_seen": 136879024, + "step": 8506 + }, + { + "epoch": 0.5959004264186959, + "grad_norm": 4.486006259918213, + "learning_rate": 4.0465898423817864e-05, + "loss": 1.1943, + "num_input_tokens_seen": 136894936, + "step": 8507 + }, + { + "epoch": 0.5959704746644251, + "grad_norm": 6.0953474044799805, + "learning_rate": 4.045890017513135e-05, + "loss": 0.9425, + "num_input_tokens_seen": 136911320, + "step": 8508 + }, + { + "epoch": 0.5960405229101544, + "grad_norm": 3.4986250400543213, + "learning_rate": 4.045190192644484e-05, + "loss": 0.9987, + "num_input_tokens_seen": 136927040, + "step": 8509 + }, + { + "epoch": 0.5961105711558836, + "grad_norm": 4.692337989807129, + "learning_rate": 4.0444903677758317e-05, + "loss": 1.0939, + "num_input_tokens_seen": 136942256, + "step": 8510 + }, + { + "epoch": 0.5961806194016128, + "grad_norm": 4.562539100646973, + "learning_rate": 4.043790542907181e-05, + "loss": 1.0255, + "num_input_tokens_seen": 136958640, + "step": 8511 + }, + { + "epoch": 0.5962506676473421, + "grad_norm": 5.332449913024902, + "learning_rate": 4.043090718038529e-05, + "loss": 1.1716, + "num_input_tokens_seen": 136974184, + "step": 8512 + }, + { + "epoch": 0.5963207158930713, + "grad_norm": 3.3877415657043457, + "learning_rate": 4.0423908931698776e-05, + "loss": 0.9377, + "num_input_tokens_seen": 136990184, + "step": 8513 + }, + { + "epoch": 0.5963907641388005, + "grad_norm": 6.579495906829834, + "learning_rate": 4.0416910683012264e-05, + "loss": 1.1208, + "num_input_tokens_seen": 137005464, + "step": 8514 + }, + { + "epoch": 0.5964608123845299, + "grad_norm": 3.4276599884033203, + "learning_rate": 4.0409912434325746e-05, + "loss": 0.8508, + "num_input_tokens_seen": 137021560, + "step": 8515 + }, + { + "epoch": 0.5965308606302591, + "grad_norm": 4.2722320556640625, + "learning_rate": 4.0402914185639235e-05, + "loss": 1.0485, + "num_input_tokens_seen": 137037944, + "step": 8516 + }, + { + "epoch": 0.5966009088759884, + "grad_norm": 3.7961597442626953, + "learning_rate": 4.0395915936952716e-05, + "loss": 1.0189, + "num_input_tokens_seen": 137054272, + "step": 8517 + }, + { + "epoch": 0.5966709571217176, + "grad_norm": 4.86784029006958, + "learning_rate": 4.0388917688266205e-05, + "loss": 1.1087, + "num_input_tokens_seen": 137070384, + "step": 8518 + }, + { + "epoch": 0.5967410053674468, + "grad_norm": 4.233669757843018, + "learning_rate": 4.0381919439579694e-05, + "loss": 1.1851, + "num_input_tokens_seen": 137086400, + "step": 8519 + }, + { + "epoch": 0.5968110536131761, + "grad_norm": 3.619994640350342, + "learning_rate": 4.037492119089317e-05, + "loss": 0.9043, + "num_input_tokens_seen": 137102784, + "step": 8520 + }, + { + "epoch": 0.5968811018589053, + "grad_norm": 4.171209812164307, + "learning_rate": 4.0367922942206664e-05, + "loss": 1.122, + "num_input_tokens_seen": 137119144, + "step": 8521 + }, + { + "epoch": 0.5969511501046346, + "grad_norm": 3.9759716987609863, + "learning_rate": 4.036092469352014e-05, + "loss": 1.0569, + "num_input_tokens_seen": 137135432, + "step": 8522 + }, + { + "epoch": 0.5970211983503638, + "grad_norm": 4.9528045654296875, + "learning_rate": 4.035392644483363e-05, + "loss": 1.0995, + "num_input_tokens_seen": 137151360, + "step": 8523 + }, + { + "epoch": 0.597091246596093, + "grad_norm": 4.097436904907227, + "learning_rate": 4.0346928196147116e-05, + "loss": 1.171, + "num_input_tokens_seen": 137167736, + "step": 8524 + }, + { + "epoch": 0.5971612948418223, + "grad_norm": 3.5559403896331787, + "learning_rate": 4.03399299474606e-05, + "loss": 1.0854, + "num_input_tokens_seen": 137184120, + "step": 8525 + }, + { + "epoch": 0.5972313430875515, + "grad_norm": 4.606170654296875, + "learning_rate": 4.0332931698774087e-05, + "loss": 1.2476, + "num_input_tokens_seen": 137199432, + "step": 8526 + }, + { + "epoch": 0.5973013913332808, + "grad_norm": 3.667523145675659, + "learning_rate": 4.032593345008757e-05, + "loss": 0.9729, + "num_input_tokens_seen": 137215640, + "step": 8527 + }, + { + "epoch": 0.5973714395790101, + "grad_norm": 4.34551477432251, + "learning_rate": 4.031893520140106e-05, + "loss": 1.1817, + "num_input_tokens_seen": 137232024, + "step": 8528 + }, + { + "epoch": 0.5974414878247393, + "grad_norm": 5.5783538818359375, + "learning_rate": 4.031193695271453e-05, + "loss": 1.008, + "num_input_tokens_seen": 137248408, + "step": 8529 + }, + { + "epoch": 0.5975115360704686, + "grad_norm": 4.121201515197754, + "learning_rate": 4.030493870402802e-05, + "loss": 1.0626, + "num_input_tokens_seen": 137264792, + "step": 8530 + }, + { + "epoch": 0.5975815843161978, + "grad_norm": 3.946624279022217, + "learning_rate": 4.0297940455341516e-05, + "loss": 0.9995, + "num_input_tokens_seen": 137281176, + "step": 8531 + }, + { + "epoch": 0.597651632561927, + "grad_norm": 3.6946029663085938, + "learning_rate": 4.029094220665499e-05, + "loss": 1.0057, + "num_input_tokens_seen": 137297560, + "step": 8532 + }, + { + "epoch": 0.5977216808076563, + "grad_norm": 3.5353474617004395, + "learning_rate": 4.028394395796848e-05, + "loss": 1.0662, + "num_input_tokens_seen": 137313472, + "step": 8533 + }, + { + "epoch": 0.5977917290533855, + "grad_norm": 5.651511192321777, + "learning_rate": 4.027694570928196e-05, + "loss": 1.3013, + "num_input_tokens_seen": 137329520, + "step": 8534 + }, + { + "epoch": 0.5978617772991147, + "grad_norm": 4.05964469909668, + "learning_rate": 4.026994746059545e-05, + "loss": 1.0786, + "num_input_tokens_seen": 137345056, + "step": 8535 + }, + { + "epoch": 0.597931825544844, + "grad_norm": 4.253178119659424, + "learning_rate": 4.026294921190894e-05, + "loss": 0.9926, + "num_input_tokens_seen": 137361024, + "step": 8536 + }, + { + "epoch": 0.5980018737905732, + "grad_norm": 4.147807598114014, + "learning_rate": 4.025595096322242e-05, + "loss": 1.2473, + "num_input_tokens_seen": 137377408, + "step": 8537 + }, + { + "epoch": 0.5980719220363026, + "grad_norm": 5.016750812530518, + "learning_rate": 4.024895271453591e-05, + "loss": 0.8938, + "num_input_tokens_seen": 137393792, + "step": 8538 + }, + { + "epoch": 0.5981419702820318, + "grad_norm": 3.6434571743011475, + "learning_rate": 4.0241954465849384e-05, + "loss": 1.137, + "num_input_tokens_seen": 137410176, + "step": 8539 + }, + { + "epoch": 0.598212018527761, + "grad_norm": 4.901271343231201, + "learning_rate": 4.023495621716287e-05, + "loss": 1.0219, + "num_input_tokens_seen": 137426560, + "step": 8540 + }, + { + "epoch": 0.5982820667734903, + "grad_norm": 4.881519317626953, + "learning_rate": 4.022795796847637e-05, + "loss": 0.9401, + "num_input_tokens_seen": 137442944, + "step": 8541 + }, + { + "epoch": 0.5983521150192195, + "grad_norm": 3.551992177963257, + "learning_rate": 4.022095971978984e-05, + "loss": 0.9494, + "num_input_tokens_seen": 137459000, + "step": 8542 + }, + { + "epoch": 0.5984221632649487, + "grad_norm": 4.8773088455200195, + "learning_rate": 4.021396147110333e-05, + "loss": 0.8583, + "num_input_tokens_seen": 137475384, + "step": 8543 + }, + { + "epoch": 0.598492211510678, + "grad_norm": 6.254101276397705, + "learning_rate": 4.0206963222416813e-05, + "loss": 1.3598, + "num_input_tokens_seen": 137491472, + "step": 8544 + }, + { + "epoch": 0.5985622597564072, + "grad_norm": 3.8801498413085938, + "learning_rate": 4.01999649737303e-05, + "loss": 0.8926, + "num_input_tokens_seen": 137507104, + "step": 8545 + }, + { + "epoch": 0.5986323080021365, + "grad_norm": 5.767789363861084, + "learning_rate": 4.019296672504379e-05, + "loss": 0.8929, + "num_input_tokens_seen": 137522160, + "step": 8546 + }, + { + "epoch": 0.5987023562478657, + "grad_norm": 3.4490139484405518, + "learning_rate": 4.018596847635727e-05, + "loss": 0.8501, + "num_input_tokens_seen": 137538536, + "step": 8547 + }, + { + "epoch": 0.5987724044935949, + "grad_norm": 4.004854679107666, + "learning_rate": 4.017897022767076e-05, + "loss": 1.1142, + "num_input_tokens_seen": 137554496, + "step": 8548 + }, + { + "epoch": 0.5988424527393242, + "grad_norm": 4.666201591491699, + "learning_rate": 4.0171971978984236e-05, + "loss": 0.943, + "num_input_tokens_seen": 137570008, + "step": 8549 + }, + { + "epoch": 0.5989125009850534, + "grad_norm": 3.9355149269104004, + "learning_rate": 4.0164973730297725e-05, + "loss": 1.1007, + "num_input_tokens_seen": 137585048, + "step": 8550 + }, + { + "epoch": 0.5989825492307828, + "grad_norm": 4.14084005355835, + "learning_rate": 4.015797548161122e-05, + "loss": 0.9307, + "num_input_tokens_seen": 137601432, + "step": 8551 + }, + { + "epoch": 0.599052597476512, + "grad_norm": 5.831667423248291, + "learning_rate": 4.0150977232924695e-05, + "loss": 1.2231, + "num_input_tokens_seen": 137617816, + "step": 8552 + }, + { + "epoch": 0.5991226457222412, + "grad_norm": 5.509582996368408, + "learning_rate": 4.0143978984238184e-05, + "loss": 0.9859, + "num_input_tokens_seen": 137633688, + "step": 8553 + }, + { + "epoch": 0.5991926939679705, + "grad_norm": 3.6069693565368652, + "learning_rate": 4.0136980735551665e-05, + "loss": 1.105, + "num_input_tokens_seen": 137650072, + "step": 8554 + }, + { + "epoch": 0.5992627422136997, + "grad_norm": 3.864187717437744, + "learning_rate": 4.0129982486865154e-05, + "loss": 1.1324, + "num_input_tokens_seen": 137665624, + "step": 8555 + }, + { + "epoch": 0.5993327904594289, + "grad_norm": 5.807196140289307, + "learning_rate": 4.012298423817863e-05, + "loss": 1.1093, + "num_input_tokens_seen": 137680912, + "step": 8556 + }, + { + "epoch": 0.5994028387051582, + "grad_norm": 4.323429107666016, + "learning_rate": 4.0115985989492124e-05, + "loss": 1.115, + "num_input_tokens_seen": 137697296, + "step": 8557 + }, + { + "epoch": 0.5994728869508874, + "grad_norm": 4.901371955871582, + "learning_rate": 4.010898774080561e-05, + "loss": 1.056, + "num_input_tokens_seen": 137712728, + "step": 8558 + }, + { + "epoch": 0.5995429351966167, + "grad_norm": 4.160161972045898, + "learning_rate": 4.010198949211909e-05, + "loss": 0.9928, + "num_input_tokens_seen": 137728552, + "step": 8559 + }, + { + "epoch": 0.5996129834423459, + "grad_norm": 4.132303714752197, + "learning_rate": 4.009499124343258e-05, + "loss": 0.9725, + "num_input_tokens_seen": 137744936, + "step": 8560 + }, + { + "epoch": 0.5996830316880751, + "grad_norm": 3.91646146774292, + "learning_rate": 4.008799299474606e-05, + "loss": 1.1394, + "num_input_tokens_seen": 137761320, + "step": 8561 + }, + { + "epoch": 0.5997530799338044, + "grad_norm": 4.540143013000488, + "learning_rate": 4.008099474605955e-05, + "loss": 0.9597, + "num_input_tokens_seen": 137776280, + "step": 8562 + }, + { + "epoch": 0.5998231281795336, + "grad_norm": 4.037735939025879, + "learning_rate": 4.0073996497373036e-05, + "loss": 1.1567, + "num_input_tokens_seen": 137792376, + "step": 8563 + }, + { + "epoch": 0.5998931764252629, + "grad_norm": 3.7136945724487305, + "learning_rate": 4.006699824868652e-05, + "loss": 1.0078, + "num_input_tokens_seen": 137808760, + "step": 8564 + }, + { + "epoch": 0.5999632246709922, + "grad_norm": 3.7895519733428955, + "learning_rate": 4.0060000000000006e-05, + "loss": 1.0406, + "num_input_tokens_seen": 137825144, + "step": 8565 + }, + { + "epoch": 0.6000332729167214, + "grad_norm": 3.94679594039917, + "learning_rate": 4.005300175131348e-05, + "loss": 1.1398, + "num_input_tokens_seen": 137840792, + "step": 8566 + }, + { + "epoch": 0.6001033211624507, + "grad_norm": 3.698106527328491, + "learning_rate": 4.0046003502626976e-05, + "loss": 1.0232, + "num_input_tokens_seen": 137857176, + "step": 8567 + }, + { + "epoch": 0.6001733694081799, + "grad_norm": 3.581124782562256, + "learning_rate": 4.0039005253940465e-05, + "loss": 1.1008, + "num_input_tokens_seen": 137873560, + "step": 8568 + }, + { + "epoch": 0.6002434176539091, + "grad_norm": 3.516655206680298, + "learning_rate": 4.003200700525394e-05, + "loss": 0.9743, + "num_input_tokens_seen": 137889720, + "step": 8569 + }, + { + "epoch": 0.6003134658996384, + "grad_norm": 3.512657642364502, + "learning_rate": 4.002500875656743e-05, + "loss": 0.9692, + "num_input_tokens_seen": 137905952, + "step": 8570 + }, + { + "epoch": 0.6003835141453676, + "grad_norm": 3.8576531410217285, + "learning_rate": 4.001801050788091e-05, + "loss": 1.047, + "num_input_tokens_seen": 137922112, + "step": 8571 + }, + { + "epoch": 0.6004535623910968, + "grad_norm": 3.9889824390411377, + "learning_rate": 4.00110122591944e-05, + "loss": 1.0401, + "num_input_tokens_seen": 137938496, + "step": 8572 + }, + { + "epoch": 0.6005236106368261, + "grad_norm": 3.9449281692504883, + "learning_rate": 4.000401401050789e-05, + "loss": 1.0895, + "num_input_tokens_seen": 137954344, + "step": 8573 + }, + { + "epoch": 0.6005936588825553, + "grad_norm": 3.412224531173706, + "learning_rate": 3.999701576182137e-05, + "loss": 0.9309, + "num_input_tokens_seen": 137970560, + "step": 8574 + }, + { + "epoch": 0.6006637071282847, + "grad_norm": 4.4665398597717285, + "learning_rate": 3.999001751313486e-05, + "loss": 0.9608, + "num_input_tokens_seen": 137986944, + "step": 8575 + }, + { + "epoch": 0.6007337553740139, + "grad_norm": 5.645625114440918, + "learning_rate": 3.998301926444833e-05, + "loss": 1.1373, + "num_input_tokens_seen": 138003328, + "step": 8576 + }, + { + "epoch": 0.6008038036197431, + "grad_norm": 3.8064608573913574, + "learning_rate": 3.997602101576183e-05, + "loss": 1.2345, + "num_input_tokens_seen": 138019224, + "step": 8577 + }, + { + "epoch": 0.6008738518654724, + "grad_norm": 7.274814128875732, + "learning_rate": 3.996902276707532e-05, + "loss": 1.1195, + "num_input_tokens_seen": 138033656, + "step": 8578 + }, + { + "epoch": 0.6009439001112016, + "grad_norm": 5.181695461273193, + "learning_rate": 3.996202451838879e-05, + "loss": 1.2311, + "num_input_tokens_seen": 138049672, + "step": 8579 + }, + { + "epoch": 0.6010139483569308, + "grad_norm": 4.333089828491211, + "learning_rate": 3.995502626970228e-05, + "loss": 1.0354, + "num_input_tokens_seen": 138065464, + "step": 8580 + }, + { + "epoch": 0.6010839966026601, + "grad_norm": 4.825462341308594, + "learning_rate": 3.994802802101576e-05, + "loss": 1.156, + "num_input_tokens_seen": 138081848, + "step": 8581 + }, + { + "epoch": 0.6011540448483893, + "grad_norm": 4.870849132537842, + "learning_rate": 3.994102977232925e-05, + "loss": 0.9828, + "num_input_tokens_seen": 138098232, + "step": 8582 + }, + { + "epoch": 0.6012240930941186, + "grad_norm": 3.5566978454589844, + "learning_rate": 3.993403152364273e-05, + "loss": 0.9798, + "num_input_tokens_seen": 138114616, + "step": 8583 + }, + { + "epoch": 0.6012941413398478, + "grad_norm": 5.456608295440674, + "learning_rate": 3.992703327495622e-05, + "loss": 0.8018, + "num_input_tokens_seen": 138131000, + "step": 8584 + }, + { + "epoch": 0.601364189585577, + "grad_norm": 4.456579685211182, + "learning_rate": 3.992003502626971e-05, + "loss": 1.0531, + "num_input_tokens_seen": 138147384, + "step": 8585 + }, + { + "epoch": 0.6014342378313063, + "grad_norm": 3.383918046951294, + "learning_rate": 3.9913036777583185e-05, + "loss": 1.0281, + "num_input_tokens_seen": 138163568, + "step": 8586 + }, + { + "epoch": 0.6015042860770355, + "grad_norm": 4.197165012359619, + "learning_rate": 3.990603852889668e-05, + "loss": 1.0793, + "num_input_tokens_seen": 138179952, + "step": 8587 + }, + { + "epoch": 0.6015743343227649, + "grad_norm": 4.695816516876221, + "learning_rate": 3.9899040280210156e-05, + "loss": 0.9144, + "num_input_tokens_seen": 138196336, + "step": 8588 + }, + { + "epoch": 0.6016443825684941, + "grad_norm": 4.469571113586426, + "learning_rate": 3.9892042031523644e-05, + "loss": 0.9237, + "num_input_tokens_seen": 138211008, + "step": 8589 + }, + { + "epoch": 0.6017144308142233, + "grad_norm": 4.223603248596191, + "learning_rate": 3.988504378283713e-05, + "loss": 1.1692, + "num_input_tokens_seen": 138227264, + "step": 8590 + }, + { + "epoch": 0.6017844790599526, + "grad_norm": 4.124486446380615, + "learning_rate": 3.9878045534150615e-05, + "loss": 1.066, + "num_input_tokens_seen": 138243648, + "step": 8591 + }, + { + "epoch": 0.6018545273056818, + "grad_norm": 4.267524242401123, + "learning_rate": 3.98710472854641e-05, + "loss": 1.1311, + "num_input_tokens_seen": 138258584, + "step": 8592 + }, + { + "epoch": 0.601924575551411, + "grad_norm": 4.767324924468994, + "learning_rate": 3.9864049036777585e-05, + "loss": 1.0748, + "num_input_tokens_seen": 138274968, + "step": 8593 + }, + { + "epoch": 0.6019946237971403, + "grad_norm": 3.771775245666504, + "learning_rate": 3.9857050788091074e-05, + "loss": 1.0153, + "num_input_tokens_seen": 138291352, + "step": 8594 + }, + { + "epoch": 0.6020646720428695, + "grad_norm": 4.396296977996826, + "learning_rate": 3.985005253940456e-05, + "loss": 1.0143, + "num_input_tokens_seen": 138307448, + "step": 8595 + }, + { + "epoch": 0.6021347202885988, + "grad_norm": 3.7812373638153076, + "learning_rate": 3.984305429071804e-05, + "loss": 0.9706, + "num_input_tokens_seen": 138323832, + "step": 8596 + }, + { + "epoch": 0.602204768534328, + "grad_norm": 3.997318983078003, + "learning_rate": 3.983605604203153e-05, + "loss": 1.2324, + "num_input_tokens_seen": 138340096, + "step": 8597 + }, + { + "epoch": 0.6022748167800572, + "grad_norm": 4.59138822555542, + "learning_rate": 3.982905779334501e-05, + "loss": 1.2341, + "num_input_tokens_seen": 138356088, + "step": 8598 + }, + { + "epoch": 0.6023448650257865, + "grad_norm": 6.35938024520874, + "learning_rate": 3.9822059544658496e-05, + "loss": 1.014, + "num_input_tokens_seen": 138370400, + "step": 8599 + }, + { + "epoch": 0.6024149132715158, + "grad_norm": 3.6123111248016357, + "learning_rate": 3.9815061295971985e-05, + "loss": 1.0276, + "num_input_tokens_seen": 138386784, + "step": 8600 + }, + { + "epoch": 0.6024149132715158, + "eval_loss": 1.1186139583587646, + "eval_runtime": 0.2196, + "eval_samples_per_second": 4.555, + "eval_steps_per_second": 4.555, + "num_input_tokens_seen": 138386784, + "step": 8600 + }, + { + "epoch": 0.602484961517245, + "grad_norm": 6.006920337677002, + "learning_rate": 3.9808063047285467e-05, + "loss": 1.1676, + "num_input_tokens_seen": 138401864, + "step": 8601 + }, + { + "epoch": 0.6025550097629743, + "grad_norm": 5.827392101287842, + "learning_rate": 3.9801064798598955e-05, + "loss": 1.1777, + "num_input_tokens_seen": 138418048, + "step": 8602 + }, + { + "epoch": 0.6026250580087035, + "grad_norm": 3.5328269004821777, + "learning_rate": 3.979406654991244e-05, + "loss": 1.0939, + "num_input_tokens_seen": 138434432, + "step": 8603 + }, + { + "epoch": 0.6026951062544328, + "grad_norm": 8.836989402770996, + "learning_rate": 3.9787068301225926e-05, + "loss": 0.9377, + "num_input_tokens_seen": 138450768, + "step": 8604 + }, + { + "epoch": 0.602765154500162, + "grad_norm": 4.550467491149902, + "learning_rate": 3.9780070052539414e-05, + "loss": 0.9927, + "num_input_tokens_seen": 138467152, + "step": 8605 + }, + { + "epoch": 0.6028352027458912, + "grad_norm": 3.278535842895508, + "learning_rate": 3.977307180385289e-05, + "loss": 0.8975, + "num_input_tokens_seen": 138483336, + "step": 8606 + }, + { + "epoch": 0.6029052509916205, + "grad_norm": 4.926201820373535, + "learning_rate": 3.9766073555166385e-05, + "loss": 1.104, + "num_input_tokens_seen": 138499520, + "step": 8607 + }, + { + "epoch": 0.6029752992373497, + "grad_norm": 3.771263599395752, + "learning_rate": 3.975907530647986e-05, + "loss": 1.0829, + "num_input_tokens_seen": 138515312, + "step": 8608 + }, + { + "epoch": 0.6030453474830789, + "grad_norm": 3.7951557636260986, + "learning_rate": 3.975207705779335e-05, + "loss": 0.9362, + "num_input_tokens_seen": 138531696, + "step": 8609 + }, + { + "epoch": 0.6031153957288082, + "grad_norm": 3.843801736831665, + "learning_rate": 3.974507880910683e-05, + "loss": 1.109, + "num_input_tokens_seen": 138547288, + "step": 8610 + }, + { + "epoch": 0.6031854439745374, + "grad_norm": 4.3663458824157715, + "learning_rate": 3.973808056042032e-05, + "loss": 1.178, + "num_input_tokens_seen": 138562656, + "step": 8611 + }, + { + "epoch": 0.6032554922202668, + "grad_norm": 4.014590740203857, + "learning_rate": 3.973108231173381e-05, + "loss": 1.0005, + "num_input_tokens_seen": 138579040, + "step": 8612 + }, + { + "epoch": 0.603325540465996, + "grad_norm": 6.256049156188965, + "learning_rate": 3.972408406304729e-05, + "loss": 1.3283, + "num_input_tokens_seen": 138594768, + "step": 8613 + }, + { + "epoch": 0.6033955887117252, + "grad_norm": 3.753734588623047, + "learning_rate": 3.971708581436078e-05, + "loss": 1.1025, + "num_input_tokens_seen": 138611000, + "step": 8614 + }, + { + "epoch": 0.6034656369574545, + "grad_norm": 3.5668489933013916, + "learning_rate": 3.971008756567425e-05, + "loss": 0.9006, + "num_input_tokens_seen": 138627056, + "step": 8615 + }, + { + "epoch": 0.6035356852031837, + "grad_norm": 5.030658721923828, + "learning_rate": 3.970308931698774e-05, + "loss": 1.0404, + "num_input_tokens_seen": 138643440, + "step": 8616 + }, + { + "epoch": 0.6036057334489129, + "grad_norm": 3.952258825302124, + "learning_rate": 3.9696091068301237e-05, + "loss": 1.0489, + "num_input_tokens_seen": 138659824, + "step": 8617 + }, + { + "epoch": 0.6036757816946422, + "grad_norm": 3.7543816566467285, + "learning_rate": 3.968909281961471e-05, + "loss": 0.9313, + "num_input_tokens_seen": 138675464, + "step": 8618 + }, + { + "epoch": 0.6037458299403714, + "grad_norm": 3.3861594200134277, + "learning_rate": 3.96820945709282e-05, + "loss": 0.9698, + "num_input_tokens_seen": 138691848, + "step": 8619 + }, + { + "epoch": 0.6038158781861007, + "grad_norm": 4.57234001159668, + "learning_rate": 3.967509632224168e-05, + "loss": 1.082, + "num_input_tokens_seen": 138707336, + "step": 8620 + }, + { + "epoch": 0.6038859264318299, + "grad_norm": 4.1400017738342285, + "learning_rate": 3.966809807355517e-05, + "loss": 1.1072, + "num_input_tokens_seen": 138723632, + "step": 8621 + }, + { + "epoch": 0.6039559746775591, + "grad_norm": 4.089561939239502, + "learning_rate": 3.966109982486866e-05, + "loss": 1.1955, + "num_input_tokens_seen": 138739520, + "step": 8622 + }, + { + "epoch": 0.6040260229232884, + "grad_norm": 4.862533092498779, + "learning_rate": 3.965410157618214e-05, + "loss": 1.0813, + "num_input_tokens_seen": 138755904, + "step": 8623 + }, + { + "epoch": 0.6040960711690176, + "grad_norm": 3.7710344791412354, + "learning_rate": 3.964710332749563e-05, + "loss": 1.0692, + "num_input_tokens_seen": 138771896, + "step": 8624 + }, + { + "epoch": 0.604166119414747, + "grad_norm": 3.6991803646087646, + "learning_rate": 3.9640105078809105e-05, + "loss": 1.0352, + "num_input_tokens_seen": 138788200, + "step": 8625 + }, + { + "epoch": 0.6042361676604762, + "grad_norm": 4.2967000007629395, + "learning_rate": 3.963310683012259e-05, + "loss": 1.0592, + "num_input_tokens_seen": 138804304, + "step": 8626 + }, + { + "epoch": 0.6043062159062054, + "grad_norm": 3.801499605178833, + "learning_rate": 3.962610858143609e-05, + "loss": 1.1428, + "num_input_tokens_seen": 138820688, + "step": 8627 + }, + { + "epoch": 0.6043762641519347, + "grad_norm": 3.6990387439727783, + "learning_rate": 3.9619110332749564e-05, + "loss": 0.9586, + "num_input_tokens_seen": 138836736, + "step": 8628 + }, + { + "epoch": 0.6044463123976639, + "grad_norm": 4.749825954437256, + "learning_rate": 3.961211208406305e-05, + "loss": 0.9241, + "num_input_tokens_seen": 138852608, + "step": 8629 + }, + { + "epoch": 0.6045163606433931, + "grad_norm": 5.206172943115234, + "learning_rate": 3.9605113835376534e-05, + "loss": 0.9934, + "num_input_tokens_seen": 138868456, + "step": 8630 + }, + { + "epoch": 0.6045864088891224, + "grad_norm": 4.962406158447266, + "learning_rate": 3.959811558669002e-05, + "loss": 0.9899, + "num_input_tokens_seen": 138884840, + "step": 8631 + }, + { + "epoch": 0.6046564571348516, + "grad_norm": 3.799485445022583, + "learning_rate": 3.95911173380035e-05, + "loss": 1.0163, + "num_input_tokens_seen": 138901224, + "step": 8632 + }, + { + "epoch": 0.6047265053805809, + "grad_norm": 5.033595561981201, + "learning_rate": 3.958411908931699e-05, + "loss": 1.3195, + "num_input_tokens_seen": 138917520, + "step": 8633 + }, + { + "epoch": 0.6047965536263101, + "grad_norm": 3.578892469406128, + "learning_rate": 3.957712084063048e-05, + "loss": 0.8304, + "num_input_tokens_seen": 138933672, + "step": 8634 + }, + { + "epoch": 0.6048666018720393, + "grad_norm": 4.8468918800354, + "learning_rate": 3.957012259194396e-05, + "loss": 0.9684, + "num_input_tokens_seen": 138950056, + "step": 8635 + }, + { + "epoch": 0.6049366501177686, + "grad_norm": 4.073800086975098, + "learning_rate": 3.9563124343257445e-05, + "loss": 0.9949, + "num_input_tokens_seen": 138966440, + "step": 8636 + }, + { + "epoch": 0.6050066983634979, + "grad_norm": 3.830550193786621, + "learning_rate": 3.955612609457093e-05, + "loss": 0.9953, + "num_input_tokens_seen": 138982824, + "step": 8637 + }, + { + "epoch": 0.6050767466092271, + "grad_norm": 3.840424060821533, + "learning_rate": 3.9549127845884416e-05, + "loss": 1.0197, + "num_input_tokens_seen": 138999208, + "step": 8638 + }, + { + "epoch": 0.6051467948549564, + "grad_norm": 6.698359966278076, + "learning_rate": 3.9542129597197904e-05, + "loss": 1.1916, + "num_input_tokens_seen": 139014136, + "step": 8639 + }, + { + "epoch": 0.6052168431006856, + "grad_norm": 3.7611327171325684, + "learning_rate": 3.9535131348511386e-05, + "loss": 1.0069, + "num_input_tokens_seen": 139030224, + "step": 8640 + }, + { + "epoch": 0.6052868913464149, + "grad_norm": 3.665715217590332, + "learning_rate": 3.9528133099824875e-05, + "loss": 1.032, + "num_input_tokens_seen": 139046608, + "step": 8641 + }, + { + "epoch": 0.6053569395921441, + "grad_norm": 3.656867027282715, + "learning_rate": 3.952113485113835e-05, + "loss": 0.9664, + "num_input_tokens_seen": 139062576, + "step": 8642 + }, + { + "epoch": 0.6054269878378733, + "grad_norm": 4.180251121520996, + "learning_rate": 3.9514136602451845e-05, + "loss": 0.9523, + "num_input_tokens_seen": 139077152, + "step": 8643 + }, + { + "epoch": 0.6054970360836026, + "grad_norm": 4.83087158203125, + "learning_rate": 3.9507138353765334e-05, + "loss": 1.1481, + "num_input_tokens_seen": 139093368, + "step": 8644 + }, + { + "epoch": 0.6055670843293318, + "grad_norm": 4.232693672180176, + "learning_rate": 3.950014010507881e-05, + "loss": 1.0776, + "num_input_tokens_seen": 139109000, + "step": 8645 + }, + { + "epoch": 0.605637132575061, + "grad_norm": 3.773815631866455, + "learning_rate": 3.94931418563923e-05, + "loss": 0.9966, + "num_input_tokens_seen": 139125384, + "step": 8646 + }, + { + "epoch": 0.6057071808207903, + "grad_norm": 3.4463624954223633, + "learning_rate": 3.948614360770578e-05, + "loss": 0.8856, + "num_input_tokens_seen": 139141720, + "step": 8647 + }, + { + "epoch": 0.6057772290665195, + "grad_norm": 5.109038829803467, + "learning_rate": 3.947914535901927e-05, + "loss": 1.3271, + "num_input_tokens_seen": 139157584, + "step": 8648 + }, + { + "epoch": 0.6058472773122489, + "grad_norm": 3.7015492916107178, + "learning_rate": 3.9472147110332756e-05, + "loss": 1.0611, + "num_input_tokens_seen": 139173408, + "step": 8649 + }, + { + "epoch": 0.6059173255579781, + "grad_norm": 4.649860858917236, + "learning_rate": 3.946514886164624e-05, + "loss": 0.9677, + "num_input_tokens_seen": 139189792, + "step": 8650 + }, + { + "epoch": 0.6059873738037073, + "grad_norm": 4.192616939544678, + "learning_rate": 3.945815061295973e-05, + "loss": 0.946, + "num_input_tokens_seen": 139206176, + "step": 8651 + }, + { + "epoch": 0.6060574220494366, + "grad_norm": 4.1493239402771, + "learning_rate": 3.94511523642732e-05, + "loss": 1.0673, + "num_input_tokens_seen": 139222560, + "step": 8652 + }, + { + "epoch": 0.6061274702951658, + "grad_norm": 5.9447126388549805, + "learning_rate": 3.94441541155867e-05, + "loss": 1.1462, + "num_input_tokens_seen": 139238944, + "step": 8653 + }, + { + "epoch": 0.606197518540895, + "grad_norm": 3.4503183364868164, + "learning_rate": 3.9437155866900186e-05, + "loss": 0.9671, + "num_input_tokens_seen": 139254776, + "step": 8654 + }, + { + "epoch": 0.6062675667866243, + "grad_norm": 3.360295057296753, + "learning_rate": 3.943015761821366e-05, + "loss": 0.895, + "num_input_tokens_seen": 139270504, + "step": 8655 + }, + { + "epoch": 0.6063376150323535, + "grad_norm": 6.167716979980469, + "learning_rate": 3.942315936952715e-05, + "loss": 1.2048, + "num_input_tokens_seen": 139286888, + "step": 8656 + }, + { + "epoch": 0.6064076632780828, + "grad_norm": 4.070390701293945, + "learning_rate": 3.941616112084063e-05, + "loss": 1.0559, + "num_input_tokens_seen": 139302112, + "step": 8657 + }, + { + "epoch": 0.606477711523812, + "grad_norm": 4.5200910568237305, + "learning_rate": 3.940916287215412e-05, + "loss": 1.1955, + "num_input_tokens_seen": 139316640, + "step": 8658 + }, + { + "epoch": 0.6065477597695412, + "grad_norm": 4.460906028747559, + "learning_rate": 3.94021646234676e-05, + "loss": 1.1522, + "num_input_tokens_seen": 139332272, + "step": 8659 + }, + { + "epoch": 0.6066178080152705, + "grad_norm": 3.6839993000030518, + "learning_rate": 3.939516637478109e-05, + "loss": 1.066, + "num_input_tokens_seen": 139348656, + "step": 8660 + }, + { + "epoch": 0.6066878562609997, + "grad_norm": 3.7513279914855957, + "learning_rate": 3.938816812609458e-05, + "loss": 1.0835, + "num_input_tokens_seen": 139364320, + "step": 8661 + }, + { + "epoch": 0.6067579045067291, + "grad_norm": 3.7140040397644043, + "learning_rate": 3.9381169877408054e-05, + "loss": 0.9949, + "num_input_tokens_seen": 139380096, + "step": 8662 + }, + { + "epoch": 0.6068279527524583, + "grad_norm": 3.8038108348846436, + "learning_rate": 3.937417162872155e-05, + "loss": 1.1649, + "num_input_tokens_seen": 139396480, + "step": 8663 + }, + { + "epoch": 0.6068980009981875, + "grad_norm": 3.9177510738372803, + "learning_rate": 3.9367173380035024e-05, + "loss": 1.1795, + "num_input_tokens_seen": 139412696, + "step": 8664 + }, + { + "epoch": 0.6069680492439168, + "grad_norm": 3.6502163410186768, + "learning_rate": 3.936017513134851e-05, + "loss": 1.0512, + "num_input_tokens_seen": 139429080, + "step": 8665 + }, + { + "epoch": 0.607038097489646, + "grad_norm": 5.0769429206848145, + "learning_rate": 3.9353176882662e-05, + "loss": 1.2733, + "num_input_tokens_seen": 139445256, + "step": 8666 + }, + { + "epoch": 0.6071081457353752, + "grad_norm": 4.779011249542236, + "learning_rate": 3.934617863397548e-05, + "loss": 1.0737, + "num_input_tokens_seen": 139460904, + "step": 8667 + }, + { + "epoch": 0.6071781939811045, + "grad_norm": 6.152329921722412, + "learning_rate": 3.933918038528897e-05, + "loss": 1.0367, + "num_input_tokens_seen": 139477288, + "step": 8668 + }, + { + "epoch": 0.6072482422268337, + "grad_norm": 3.4945099353790283, + "learning_rate": 3.9332182136602454e-05, + "loss": 1.0752, + "num_input_tokens_seen": 139493672, + "step": 8669 + }, + { + "epoch": 0.607318290472563, + "grad_norm": 4.946517467498779, + "learning_rate": 3.932518388791594e-05, + "loss": 1.2263, + "num_input_tokens_seen": 139509536, + "step": 8670 + }, + { + "epoch": 0.6073883387182922, + "grad_norm": 3.9744396209716797, + "learning_rate": 3.931818563922943e-05, + "loss": 0.9351, + "num_input_tokens_seen": 139525920, + "step": 8671 + }, + { + "epoch": 0.6074583869640214, + "grad_norm": 3.706655263900757, + "learning_rate": 3.9311187390542906e-05, + "loss": 1.1001, + "num_input_tokens_seen": 139542304, + "step": 8672 + }, + { + "epoch": 0.6075284352097508, + "grad_norm": 4.078902244567871, + "learning_rate": 3.93041891418564e-05, + "loss": 1.0148, + "num_input_tokens_seen": 139558688, + "step": 8673 + }, + { + "epoch": 0.60759848345548, + "grad_norm": 4.942174911499023, + "learning_rate": 3.9297190893169876e-05, + "loss": 1.2956, + "num_input_tokens_seen": 139572960, + "step": 8674 + }, + { + "epoch": 0.6076685317012092, + "grad_norm": 4.287027835845947, + "learning_rate": 3.9290192644483365e-05, + "loss": 1.0743, + "num_input_tokens_seen": 139589296, + "step": 8675 + }, + { + "epoch": 0.6077385799469385, + "grad_norm": 5.903578758239746, + "learning_rate": 3.928319439579685e-05, + "loss": 1.1361, + "num_input_tokens_seen": 139605680, + "step": 8676 + }, + { + "epoch": 0.6078086281926677, + "grad_norm": 4.834617614746094, + "learning_rate": 3.9276196147110335e-05, + "loss": 1.0656, + "num_input_tokens_seen": 139622000, + "step": 8677 + }, + { + "epoch": 0.607878676438397, + "grad_norm": 3.5837063789367676, + "learning_rate": 3.9269197898423824e-05, + "loss": 1.0803, + "num_input_tokens_seen": 139638384, + "step": 8678 + }, + { + "epoch": 0.6079487246841262, + "grad_norm": 4.114101409912109, + "learning_rate": 3.9262199649737306e-05, + "loss": 0.9032, + "num_input_tokens_seen": 139654768, + "step": 8679 + }, + { + "epoch": 0.6080187729298554, + "grad_norm": 4.746354579925537, + "learning_rate": 3.9255201401050794e-05, + "loss": 1.1916, + "num_input_tokens_seen": 139671152, + "step": 8680 + }, + { + "epoch": 0.6080888211755847, + "grad_norm": 4.15376091003418, + "learning_rate": 3.924820315236428e-05, + "loss": 1.1337, + "num_input_tokens_seen": 139687536, + "step": 8681 + }, + { + "epoch": 0.6081588694213139, + "grad_norm": 3.82739520072937, + "learning_rate": 3.924120490367776e-05, + "loss": 0.9669, + "num_input_tokens_seen": 139703920, + "step": 8682 + }, + { + "epoch": 0.6082289176670431, + "grad_norm": 4.157196044921875, + "learning_rate": 3.923420665499125e-05, + "loss": 1.074, + "num_input_tokens_seen": 139719880, + "step": 8683 + }, + { + "epoch": 0.6082989659127724, + "grad_norm": 3.8748726844787598, + "learning_rate": 3.922720840630473e-05, + "loss": 1.1518, + "num_input_tokens_seen": 139736264, + "step": 8684 + }, + { + "epoch": 0.6083690141585016, + "grad_norm": 5.491943836212158, + "learning_rate": 3.922021015761822e-05, + "loss": 1.0592, + "num_input_tokens_seen": 139751880, + "step": 8685 + }, + { + "epoch": 0.608439062404231, + "grad_norm": 3.5256998538970947, + "learning_rate": 3.92132119089317e-05, + "loss": 0.9674, + "num_input_tokens_seen": 139768248, + "step": 8686 + }, + { + "epoch": 0.6085091106499602, + "grad_norm": 3.947817087173462, + "learning_rate": 3.920621366024519e-05, + "loss": 0.8453, + "num_input_tokens_seen": 139784632, + "step": 8687 + }, + { + "epoch": 0.6085791588956894, + "grad_norm": 3.964315414428711, + "learning_rate": 3.9199215411558676e-05, + "loss": 1.0547, + "num_input_tokens_seen": 139800720, + "step": 8688 + }, + { + "epoch": 0.6086492071414187, + "grad_norm": 4.725607395172119, + "learning_rate": 3.919221716287216e-05, + "loss": 1.3308, + "num_input_tokens_seen": 139817104, + "step": 8689 + }, + { + "epoch": 0.6087192553871479, + "grad_norm": 3.7689168453216553, + "learning_rate": 3.9185218914185646e-05, + "loss": 0.9974, + "num_input_tokens_seen": 139833136, + "step": 8690 + }, + { + "epoch": 0.6087893036328772, + "grad_norm": 3.5490543842315674, + "learning_rate": 3.917822066549912e-05, + "loss": 1.2286, + "num_input_tokens_seen": 139849520, + "step": 8691 + }, + { + "epoch": 0.6088593518786064, + "grad_norm": 3.6718966960906982, + "learning_rate": 3.917122241681261e-05, + "loss": 0.8859, + "num_input_tokens_seen": 139865272, + "step": 8692 + }, + { + "epoch": 0.6089294001243356, + "grad_norm": 3.8843367099761963, + "learning_rate": 3.9164224168126105e-05, + "loss": 0.9679, + "num_input_tokens_seen": 139881656, + "step": 8693 + }, + { + "epoch": 0.6089994483700649, + "grad_norm": 4.574242115020752, + "learning_rate": 3.915722591943958e-05, + "loss": 1.0865, + "num_input_tokens_seen": 139898040, + "step": 8694 + }, + { + "epoch": 0.6090694966157941, + "grad_norm": 5.347915172576904, + "learning_rate": 3.915022767075307e-05, + "loss": 1.2728, + "num_input_tokens_seen": 139914424, + "step": 8695 + }, + { + "epoch": 0.6091395448615233, + "grad_norm": 4.035526752471924, + "learning_rate": 3.914322942206655e-05, + "loss": 0.9864, + "num_input_tokens_seen": 139930808, + "step": 8696 + }, + { + "epoch": 0.6092095931072526, + "grad_norm": 4.355678081512451, + "learning_rate": 3.913623117338004e-05, + "loss": 1.0919, + "num_input_tokens_seen": 139947136, + "step": 8697 + }, + { + "epoch": 0.6092796413529818, + "grad_norm": 3.4143002033233643, + "learning_rate": 3.912923292469353e-05, + "loss": 0.8754, + "num_input_tokens_seen": 139963520, + "step": 8698 + }, + { + "epoch": 0.6093496895987112, + "grad_norm": 4.852843761444092, + "learning_rate": 3.912223467600701e-05, + "loss": 1.0508, + "num_input_tokens_seen": 139979080, + "step": 8699 + }, + { + "epoch": 0.6094197378444404, + "grad_norm": 4.338660717010498, + "learning_rate": 3.91152364273205e-05, + "loss": 1.1108, + "num_input_tokens_seen": 139994920, + "step": 8700 + }, + { + "epoch": 0.6094897860901696, + "grad_norm": 5.042794227600098, + "learning_rate": 3.910823817863397e-05, + "loss": 1.1471, + "num_input_tokens_seen": 140010992, + "step": 8701 + }, + { + "epoch": 0.6095598343358989, + "grad_norm": 6.307446002960205, + "learning_rate": 3.910123992994746e-05, + "loss": 1.0419, + "num_input_tokens_seen": 140026544, + "step": 8702 + }, + { + "epoch": 0.6096298825816281, + "grad_norm": 3.492285966873169, + "learning_rate": 3.909424168126096e-05, + "loss": 0.9893, + "num_input_tokens_seen": 140042928, + "step": 8703 + }, + { + "epoch": 0.6096999308273573, + "grad_norm": 3.170391798019409, + "learning_rate": 3.908724343257443e-05, + "loss": 0.8818, + "num_input_tokens_seen": 140059312, + "step": 8704 + }, + { + "epoch": 0.6097699790730866, + "grad_norm": 4.930979251861572, + "learning_rate": 3.908024518388792e-05, + "loss": 1.148, + "num_input_tokens_seen": 140075344, + "step": 8705 + }, + { + "epoch": 0.6098400273188158, + "grad_norm": 3.8034727573394775, + "learning_rate": 3.90732469352014e-05, + "loss": 1.1965, + "num_input_tokens_seen": 140091448, + "step": 8706 + }, + { + "epoch": 0.6099100755645451, + "grad_norm": 3.897214412689209, + "learning_rate": 3.906624868651489e-05, + "loss": 1.1226, + "num_input_tokens_seen": 140107368, + "step": 8707 + }, + { + "epoch": 0.6099801238102743, + "grad_norm": 4.290642738342285, + "learning_rate": 3.905925043782838e-05, + "loss": 1.0984, + "num_input_tokens_seen": 140122976, + "step": 8708 + }, + { + "epoch": 0.6100501720560035, + "grad_norm": 4.037378787994385, + "learning_rate": 3.905225218914186e-05, + "loss": 1.1394, + "num_input_tokens_seen": 140139360, + "step": 8709 + }, + { + "epoch": 0.6101202203017329, + "grad_norm": 4.367828369140625, + "learning_rate": 3.904525394045535e-05, + "loss": 1.0298, + "num_input_tokens_seen": 140155264, + "step": 8710 + }, + { + "epoch": 0.6101902685474621, + "grad_norm": 4.4513115882873535, + "learning_rate": 3.9038255691768825e-05, + "loss": 1.1901, + "num_input_tokens_seen": 140171648, + "step": 8711 + }, + { + "epoch": 0.6102603167931913, + "grad_norm": 3.7112040519714355, + "learning_rate": 3.9031257443082314e-05, + "loss": 1.0119, + "num_input_tokens_seen": 140188000, + "step": 8712 + }, + { + "epoch": 0.6103303650389206, + "grad_norm": 4.719367027282715, + "learning_rate": 3.9024259194395796e-05, + "loss": 1.0673, + "num_input_tokens_seen": 140204384, + "step": 8713 + }, + { + "epoch": 0.6104004132846498, + "grad_norm": 3.977473735809326, + "learning_rate": 3.9017260945709284e-05, + "loss": 1.0225, + "num_input_tokens_seen": 140220480, + "step": 8714 + }, + { + "epoch": 0.6104704615303791, + "grad_norm": 4.8350677490234375, + "learning_rate": 3.901026269702277e-05, + "loss": 0.9548, + "num_input_tokens_seen": 140236256, + "step": 8715 + }, + { + "epoch": 0.6105405097761083, + "grad_norm": 3.9946627616882324, + "learning_rate": 3.9003264448336255e-05, + "loss": 1.1326, + "num_input_tokens_seen": 140252472, + "step": 8716 + }, + { + "epoch": 0.6106105580218375, + "grad_norm": 4.122077941894531, + "learning_rate": 3.899626619964974e-05, + "loss": 1.1467, + "num_input_tokens_seen": 140268808, + "step": 8717 + }, + { + "epoch": 0.6106806062675668, + "grad_norm": 4.5335259437561035, + "learning_rate": 3.898926795096322e-05, + "loss": 1.1392, + "num_input_tokens_seen": 140285192, + "step": 8718 + }, + { + "epoch": 0.610750654513296, + "grad_norm": 4.293862819671631, + "learning_rate": 3.8982269702276714e-05, + "loss": 1.1738, + "num_input_tokens_seen": 140301472, + "step": 8719 + }, + { + "epoch": 0.6108207027590252, + "grad_norm": 4.643789291381836, + "learning_rate": 3.89752714535902e-05, + "loss": 1.1069, + "num_input_tokens_seen": 140317856, + "step": 8720 + }, + { + "epoch": 0.6108907510047545, + "grad_norm": 5.174399375915527, + "learning_rate": 3.896827320490368e-05, + "loss": 1.078, + "num_input_tokens_seen": 140334240, + "step": 8721 + }, + { + "epoch": 0.6109607992504837, + "grad_norm": 4.7002105712890625, + "learning_rate": 3.8961274956217166e-05, + "loss": 0.9888, + "num_input_tokens_seen": 140350400, + "step": 8722 + }, + { + "epoch": 0.6110308474962131, + "grad_norm": 4.760499000549316, + "learning_rate": 3.895427670753065e-05, + "loss": 1.1481, + "num_input_tokens_seen": 140365952, + "step": 8723 + }, + { + "epoch": 0.6111008957419423, + "grad_norm": 3.8696558475494385, + "learning_rate": 3.8947278458844136e-05, + "loss": 1.1217, + "num_input_tokens_seen": 140381768, + "step": 8724 + }, + { + "epoch": 0.6111709439876715, + "grad_norm": 4.472455978393555, + "learning_rate": 3.8940280210157625e-05, + "loss": 1.234, + "num_input_tokens_seen": 140397608, + "step": 8725 + }, + { + "epoch": 0.6112409922334008, + "grad_norm": 3.5356552600860596, + "learning_rate": 3.893328196147111e-05, + "loss": 1.0879, + "num_input_tokens_seen": 140413992, + "step": 8726 + }, + { + "epoch": 0.61131104047913, + "grad_norm": 3.6783483028411865, + "learning_rate": 3.8926283712784595e-05, + "loss": 1.1471, + "num_input_tokens_seen": 140429656, + "step": 8727 + }, + { + "epoch": 0.6113810887248593, + "grad_norm": 4.635765552520752, + "learning_rate": 3.891928546409807e-05, + "loss": 1.1236, + "num_input_tokens_seen": 140446040, + "step": 8728 + }, + { + "epoch": 0.6114511369705885, + "grad_norm": 3.8402459621429443, + "learning_rate": 3.8912287215411566e-05, + "loss": 1.0499, + "num_input_tokens_seen": 140462424, + "step": 8729 + }, + { + "epoch": 0.6115211852163177, + "grad_norm": 3.7482657432556152, + "learning_rate": 3.8905288966725054e-05, + "loss": 0.9362, + "num_input_tokens_seen": 140478424, + "step": 8730 + }, + { + "epoch": 0.611591233462047, + "grad_norm": 3.8513617515563965, + "learning_rate": 3.889829071803853e-05, + "loss": 1.2524, + "num_input_tokens_seen": 140494616, + "step": 8731 + }, + { + "epoch": 0.6116612817077762, + "grad_norm": 4.280187129974365, + "learning_rate": 3.889129246935202e-05, + "loss": 1.212, + "num_input_tokens_seen": 140511000, + "step": 8732 + }, + { + "epoch": 0.6117313299535054, + "grad_norm": 4.386607646942139, + "learning_rate": 3.88842942206655e-05, + "loss": 1.0173, + "num_input_tokens_seen": 140525664, + "step": 8733 + }, + { + "epoch": 0.6118013781992347, + "grad_norm": 4.0030670166015625, + "learning_rate": 3.887729597197899e-05, + "loss": 1.0556, + "num_input_tokens_seen": 140541960, + "step": 8734 + }, + { + "epoch": 0.611871426444964, + "grad_norm": 5.137122631072998, + "learning_rate": 3.887029772329248e-05, + "loss": 1.1654, + "num_input_tokens_seen": 140558344, + "step": 8735 + }, + { + "epoch": 0.6119414746906933, + "grad_norm": 4.097158908843994, + "learning_rate": 3.886329947460596e-05, + "loss": 1.0828, + "num_input_tokens_seen": 140574216, + "step": 8736 + }, + { + "epoch": 0.6120115229364225, + "grad_norm": 3.8837525844573975, + "learning_rate": 3.885630122591945e-05, + "loss": 0.8273, + "num_input_tokens_seen": 140590600, + "step": 8737 + }, + { + "epoch": 0.6120815711821517, + "grad_norm": 3.6691291332244873, + "learning_rate": 3.884930297723292e-05, + "loss": 0.9192, + "num_input_tokens_seen": 140606984, + "step": 8738 + }, + { + "epoch": 0.612151619427881, + "grad_norm": 4.032019138336182, + "learning_rate": 3.884230472854641e-05, + "loss": 1.1587, + "num_input_tokens_seen": 140623368, + "step": 8739 + }, + { + "epoch": 0.6122216676736102, + "grad_norm": 3.9690909385681152, + "learning_rate": 3.883530647985989e-05, + "loss": 1.1347, + "num_input_tokens_seen": 140639600, + "step": 8740 + }, + { + "epoch": 0.6122917159193394, + "grad_norm": 3.9645636081695557, + "learning_rate": 3.882830823117338e-05, + "loss": 1.052, + "num_input_tokens_seen": 140655984, + "step": 8741 + }, + { + "epoch": 0.6123617641650687, + "grad_norm": 4.980258464813232, + "learning_rate": 3.882130998248687e-05, + "loss": 0.8133, + "num_input_tokens_seen": 140672368, + "step": 8742 + }, + { + "epoch": 0.6124318124107979, + "grad_norm": 4.800034523010254, + "learning_rate": 3.881431173380035e-05, + "loss": 1.2895, + "num_input_tokens_seen": 140688136, + "step": 8743 + }, + { + "epoch": 0.6125018606565272, + "grad_norm": 3.5639631748199463, + "learning_rate": 3.880731348511384e-05, + "loss": 1.095, + "num_input_tokens_seen": 140704168, + "step": 8744 + }, + { + "epoch": 0.6125719089022564, + "grad_norm": 7.081785678863525, + "learning_rate": 3.880031523642732e-05, + "loss": 1.1957, + "num_input_tokens_seen": 140719576, + "step": 8745 + }, + { + "epoch": 0.6126419571479856, + "grad_norm": 4.816562652587891, + "learning_rate": 3.879331698774081e-05, + "loss": 0.9493, + "num_input_tokens_seen": 140735584, + "step": 8746 + }, + { + "epoch": 0.612712005393715, + "grad_norm": 4.442950248718262, + "learning_rate": 3.87863187390543e-05, + "loss": 1.0185, + "num_input_tokens_seen": 140751672, + "step": 8747 + }, + { + "epoch": 0.6127820536394442, + "grad_norm": 4.076030731201172, + "learning_rate": 3.8779320490367774e-05, + "loss": 0.9073, + "num_input_tokens_seen": 140768056, + "step": 8748 + }, + { + "epoch": 0.6128521018851734, + "grad_norm": 3.5422353744506836, + "learning_rate": 3.877232224168126e-05, + "loss": 0.7628, + "num_input_tokens_seen": 140784040, + "step": 8749 + }, + { + "epoch": 0.6129221501309027, + "grad_norm": 3.552335262298584, + "learning_rate": 3.8765323992994745e-05, + "loss": 1.043, + "num_input_tokens_seen": 140800424, + "step": 8750 + }, + { + "epoch": 0.6129921983766319, + "grad_norm": 3.5651915073394775, + "learning_rate": 3.875832574430823e-05, + "loss": 0.9862, + "num_input_tokens_seen": 140815848, + "step": 8751 + }, + { + "epoch": 0.6130622466223612, + "grad_norm": 4.0165629386901855, + "learning_rate": 3.875132749562172e-05, + "loss": 1.0807, + "num_input_tokens_seen": 140832112, + "step": 8752 + }, + { + "epoch": 0.6131322948680904, + "grad_norm": 4.543135643005371, + "learning_rate": 3.8744329246935204e-05, + "loss": 1.0683, + "num_input_tokens_seen": 140848496, + "step": 8753 + }, + { + "epoch": 0.6132023431138196, + "grad_norm": 3.670149326324463, + "learning_rate": 3.873733099824869e-05, + "loss": 0.9923, + "num_input_tokens_seen": 140864632, + "step": 8754 + }, + { + "epoch": 0.6132723913595489, + "grad_norm": 3.3616819381713867, + "learning_rate": 3.8730332749562174e-05, + "loss": 0.7386, + "num_input_tokens_seen": 140880696, + "step": 8755 + }, + { + "epoch": 0.6133424396052781, + "grad_norm": 3.9051620960235596, + "learning_rate": 3.872333450087566e-05, + "loss": 1.0067, + "num_input_tokens_seen": 140895992, + "step": 8756 + }, + { + "epoch": 0.6134124878510073, + "grad_norm": 3.7387287616729736, + "learning_rate": 3.871633625218915e-05, + "loss": 1.0493, + "num_input_tokens_seen": 140912376, + "step": 8757 + }, + { + "epoch": 0.6134825360967366, + "grad_norm": 3.7935774326324463, + "learning_rate": 3.8709338003502626e-05, + "loss": 0.9366, + "num_input_tokens_seen": 140927936, + "step": 8758 + }, + { + "epoch": 0.6135525843424658, + "grad_norm": 4.078311920166016, + "learning_rate": 3.8702339754816115e-05, + "loss": 1.1403, + "num_input_tokens_seen": 140944320, + "step": 8759 + }, + { + "epoch": 0.6136226325881952, + "grad_norm": 5.811614036560059, + "learning_rate": 3.86953415061296e-05, + "loss": 1.0728, + "num_input_tokens_seen": 140960704, + "step": 8760 + }, + { + "epoch": 0.6136926808339244, + "grad_norm": 3.9924943447113037, + "learning_rate": 3.8688343257443085e-05, + "loss": 1.0445, + "num_input_tokens_seen": 140976408, + "step": 8761 + }, + { + "epoch": 0.6137627290796536, + "grad_norm": 4.0190935134887695, + "learning_rate": 3.8681345008756574e-05, + "loss": 1.0733, + "num_input_tokens_seen": 140992792, + "step": 8762 + }, + { + "epoch": 0.6138327773253829, + "grad_norm": 4.201235294342041, + "learning_rate": 3.8674346760070056e-05, + "loss": 1.0284, + "num_input_tokens_seen": 141009176, + "step": 8763 + }, + { + "epoch": 0.6139028255711121, + "grad_norm": 3.501706123352051, + "learning_rate": 3.8667348511383544e-05, + "loss": 1.0776, + "num_input_tokens_seen": 141025560, + "step": 8764 + }, + { + "epoch": 0.6139728738168414, + "grad_norm": 3.998978614807129, + "learning_rate": 3.866035026269702e-05, + "loss": 0.9806, + "num_input_tokens_seen": 141041944, + "step": 8765 + }, + { + "epoch": 0.6140429220625706, + "grad_norm": 3.850513219833374, + "learning_rate": 3.8653352014010515e-05, + "loss": 0.9564, + "num_input_tokens_seen": 141058160, + "step": 8766 + }, + { + "epoch": 0.6141129703082998, + "grad_norm": 4.1797685623168945, + "learning_rate": 3.864635376532399e-05, + "loss": 1.0975, + "num_input_tokens_seen": 141074416, + "step": 8767 + }, + { + "epoch": 0.6141830185540291, + "grad_norm": 3.9805169105529785, + "learning_rate": 3.863935551663748e-05, + "loss": 1.1102, + "num_input_tokens_seen": 141089768, + "step": 8768 + }, + { + "epoch": 0.6142530667997583, + "grad_norm": 4.025511264801025, + "learning_rate": 3.863235726795097e-05, + "loss": 0.9587, + "num_input_tokens_seen": 141106144, + "step": 8769 + }, + { + "epoch": 0.6143231150454875, + "grad_norm": 8.855193138122559, + "learning_rate": 3.862535901926445e-05, + "loss": 1.3197, + "num_input_tokens_seen": 141122160, + "step": 8770 + }, + { + "epoch": 0.6143931632912168, + "grad_norm": 5.363997459411621, + "learning_rate": 3.861836077057794e-05, + "loss": 1.076, + "num_input_tokens_seen": 141138488, + "step": 8771 + }, + { + "epoch": 0.614463211536946, + "grad_norm": 3.576204776763916, + "learning_rate": 3.861136252189142e-05, + "loss": 0.9082, + "num_input_tokens_seen": 141154872, + "step": 8772 + }, + { + "epoch": 0.6145332597826754, + "grad_norm": 4.137770652770996, + "learning_rate": 3.860436427320491e-05, + "loss": 1.0111, + "num_input_tokens_seen": 141171256, + "step": 8773 + }, + { + "epoch": 0.6146033080284046, + "grad_norm": 7.550045490264893, + "learning_rate": 3.8597366024518396e-05, + "loss": 1.0635, + "num_input_tokens_seen": 141186096, + "step": 8774 + }, + { + "epoch": 0.6146733562741338, + "grad_norm": 4.097894668579102, + "learning_rate": 3.859036777583187e-05, + "loss": 1.1502, + "num_input_tokens_seen": 141201192, + "step": 8775 + }, + { + "epoch": 0.6147434045198631, + "grad_norm": 4.331363677978516, + "learning_rate": 3.858336952714537e-05, + "loss": 1.0661, + "num_input_tokens_seen": 141216688, + "step": 8776 + }, + { + "epoch": 0.6148134527655923, + "grad_norm": 4.026492595672607, + "learning_rate": 3.857637127845884e-05, + "loss": 1.0658, + "num_input_tokens_seen": 141233000, + "step": 8777 + }, + { + "epoch": 0.6148835010113215, + "grad_norm": 4.178946018218994, + "learning_rate": 3.856937302977233e-05, + "loss": 0.9163, + "num_input_tokens_seen": 141248792, + "step": 8778 + }, + { + "epoch": 0.6149535492570508, + "grad_norm": 4.316220760345459, + "learning_rate": 3.856237478108582e-05, + "loss": 1.0598, + "num_input_tokens_seen": 141265176, + "step": 8779 + }, + { + "epoch": 0.61502359750278, + "grad_norm": 4.140246868133545, + "learning_rate": 3.85553765323993e-05, + "loss": 1.1618, + "num_input_tokens_seen": 141280976, + "step": 8780 + }, + { + "epoch": 0.6150936457485093, + "grad_norm": 8.124638557434082, + "learning_rate": 3.854837828371279e-05, + "loss": 1.1448, + "num_input_tokens_seen": 141297248, + "step": 8781 + }, + { + "epoch": 0.6151636939942385, + "grad_norm": 3.7037198543548584, + "learning_rate": 3.854138003502627e-05, + "loss": 1.0785, + "num_input_tokens_seen": 141312880, + "step": 8782 + }, + { + "epoch": 0.6152337422399677, + "grad_norm": 3.8280651569366455, + "learning_rate": 3.853438178633976e-05, + "loss": 1.0771, + "num_input_tokens_seen": 141328992, + "step": 8783 + }, + { + "epoch": 0.615303790485697, + "grad_norm": 4.508422374725342, + "learning_rate": 3.852738353765325e-05, + "loss": 1.1592, + "num_input_tokens_seen": 141344416, + "step": 8784 + }, + { + "epoch": 0.6153738387314263, + "grad_norm": 4.738521099090576, + "learning_rate": 3.8520385288966723e-05, + "loss": 0.9237, + "num_input_tokens_seen": 141360800, + "step": 8785 + }, + { + "epoch": 0.6154438869771555, + "grad_norm": 3.75087833404541, + "learning_rate": 3.851338704028022e-05, + "loss": 1.137, + "num_input_tokens_seen": 141377184, + "step": 8786 + }, + { + "epoch": 0.6155139352228848, + "grad_norm": 6.626635551452637, + "learning_rate": 3.8506388791593694e-05, + "loss": 0.9702, + "num_input_tokens_seen": 141392736, + "step": 8787 + }, + { + "epoch": 0.615583983468614, + "grad_norm": 5.533083438873291, + "learning_rate": 3.849939054290718e-05, + "loss": 0.9885, + "num_input_tokens_seen": 141409120, + "step": 8788 + }, + { + "epoch": 0.6156540317143433, + "grad_norm": 3.5983059406280518, + "learning_rate": 3.849239229422067e-05, + "loss": 0.9806, + "num_input_tokens_seen": 141425472, + "step": 8789 + }, + { + "epoch": 0.6157240799600725, + "grad_norm": 4.028461456298828, + "learning_rate": 3.848539404553415e-05, + "loss": 1.0892, + "num_input_tokens_seen": 141441856, + "step": 8790 + }, + { + "epoch": 0.6157941282058017, + "grad_norm": 4.419944763183594, + "learning_rate": 3.847839579684764e-05, + "loss": 1.1227, + "num_input_tokens_seen": 141458240, + "step": 8791 + }, + { + "epoch": 0.615864176451531, + "grad_norm": 5.264087677001953, + "learning_rate": 3.847139754816112e-05, + "loss": 1.4229, + "num_input_tokens_seen": 141473648, + "step": 8792 + }, + { + "epoch": 0.6159342246972602, + "grad_norm": 4.207594394683838, + "learning_rate": 3.846439929947461e-05, + "loss": 0.8676, + "num_input_tokens_seen": 141489568, + "step": 8793 + }, + { + "epoch": 0.6160042729429895, + "grad_norm": 5.569483280181885, + "learning_rate": 3.845740105078809e-05, + "loss": 1.1212, + "num_input_tokens_seen": 141504496, + "step": 8794 + }, + { + "epoch": 0.6160743211887187, + "grad_norm": 4.781259536743164, + "learning_rate": 3.8450402802101575e-05, + "loss": 1.0974, + "num_input_tokens_seen": 141520328, + "step": 8795 + }, + { + "epoch": 0.616144369434448, + "grad_norm": 4.370481967926025, + "learning_rate": 3.844340455341507e-05, + "loss": 0.8102, + "num_input_tokens_seen": 141536712, + "step": 8796 + }, + { + "epoch": 0.6162144176801773, + "grad_norm": 3.913745641708374, + "learning_rate": 3.8436406304728546e-05, + "loss": 1.0474, + "num_input_tokens_seen": 141553096, + "step": 8797 + }, + { + "epoch": 0.6162844659259065, + "grad_norm": 3.646759271621704, + "learning_rate": 3.8429408056042034e-05, + "loss": 1.0241, + "num_input_tokens_seen": 141569480, + "step": 8798 + }, + { + "epoch": 0.6163545141716357, + "grad_norm": 4.749353885650635, + "learning_rate": 3.8422409807355516e-05, + "loss": 0.8867, + "num_input_tokens_seen": 141585864, + "step": 8799 + }, + { + "epoch": 0.616424562417365, + "grad_norm": 3.436282157897949, + "learning_rate": 3.8415411558669005e-05, + "loss": 0.8953, + "num_input_tokens_seen": 141602248, + "step": 8800 + }, + { + "epoch": 0.616424562417365, + "eval_loss": 1.1198780536651611, + "eval_runtime": 0.1825, + "eval_samples_per_second": 5.478, + "eval_steps_per_second": 5.478, + "num_input_tokens_seen": 141602248, + "step": 8800 + }, + { + "epoch": 0.6164946106630942, + "grad_norm": 3.844834566116333, + "learning_rate": 3.8408413309982493e-05, + "loss": 1.1384, + "num_input_tokens_seen": 141618632, + "step": 8801 + }, + { + "epoch": 0.6165646589088235, + "grad_norm": 6.700494766235352, + "learning_rate": 3.8401415061295975e-05, + "loss": 1.361, + "num_input_tokens_seen": 141635016, + "step": 8802 + }, + { + "epoch": 0.6166347071545527, + "grad_norm": 4.084661483764648, + "learning_rate": 3.8394416812609464e-05, + "loss": 1.0923, + "num_input_tokens_seen": 141651256, + "step": 8803 + }, + { + "epoch": 0.6167047554002819, + "grad_norm": 5.89295768737793, + "learning_rate": 3.838741856392294e-05, + "loss": 1.1316, + "num_input_tokens_seen": 141667368, + "step": 8804 + }, + { + "epoch": 0.6167748036460112, + "grad_norm": 3.4693360328674316, + "learning_rate": 3.838042031523643e-05, + "loss": 1.0346, + "num_input_tokens_seen": 141683360, + "step": 8805 + }, + { + "epoch": 0.6168448518917404, + "grad_norm": 5.216996669769287, + "learning_rate": 3.837342206654992e-05, + "loss": 1.0564, + "num_input_tokens_seen": 141699296, + "step": 8806 + }, + { + "epoch": 0.6169149001374696, + "grad_norm": 3.7782225608825684, + "learning_rate": 3.83664238178634e-05, + "loss": 1.0716, + "num_input_tokens_seen": 141715040, + "step": 8807 + }, + { + "epoch": 0.616984948383199, + "grad_norm": 3.692950487136841, + "learning_rate": 3.8359425569176887e-05, + "loss": 1.1667, + "num_input_tokens_seen": 141731424, + "step": 8808 + }, + { + "epoch": 0.6170549966289282, + "grad_norm": 3.532275676727295, + "learning_rate": 3.835242732049037e-05, + "loss": 0.9728, + "num_input_tokens_seen": 141747808, + "step": 8809 + }, + { + "epoch": 0.6171250448746575, + "grad_norm": 4.749434471130371, + "learning_rate": 3.834542907180386e-05, + "loss": 0.9784, + "num_input_tokens_seen": 141763936, + "step": 8810 + }, + { + "epoch": 0.6171950931203867, + "grad_norm": 4.1871442794799805, + "learning_rate": 3.8338430823117345e-05, + "loss": 0.9612, + "num_input_tokens_seen": 141780320, + "step": 8811 + }, + { + "epoch": 0.6172651413661159, + "grad_norm": 5.210129737854004, + "learning_rate": 3.833143257443083e-05, + "loss": 0.9569, + "num_input_tokens_seen": 141796552, + "step": 8812 + }, + { + "epoch": 0.6173351896118452, + "grad_norm": 4.053934097290039, + "learning_rate": 3.8324434325744316e-05, + "loss": 1.1493, + "num_input_tokens_seen": 141812320, + "step": 8813 + }, + { + "epoch": 0.6174052378575744, + "grad_norm": 3.872776746749878, + "learning_rate": 3.831743607705779e-05, + "loss": 1.1443, + "num_input_tokens_seen": 141828008, + "step": 8814 + }, + { + "epoch": 0.6174752861033036, + "grad_norm": 5.975555419921875, + "learning_rate": 3.831043782837128e-05, + "loss": 1.3241, + "num_input_tokens_seen": 141844392, + "step": 8815 + }, + { + "epoch": 0.6175453343490329, + "grad_norm": 4.658726692199707, + "learning_rate": 3.8303439579684775e-05, + "loss": 1.0224, + "num_input_tokens_seen": 141860776, + "step": 8816 + }, + { + "epoch": 0.6176153825947621, + "grad_norm": 4.645839214324951, + "learning_rate": 3.829644133099825e-05, + "loss": 1.1685, + "num_input_tokens_seen": 141877160, + "step": 8817 + }, + { + "epoch": 0.6176854308404914, + "grad_norm": 4.305716514587402, + "learning_rate": 3.828944308231174e-05, + "loss": 1.1367, + "num_input_tokens_seen": 141893296, + "step": 8818 + }, + { + "epoch": 0.6177554790862206, + "grad_norm": 6.359955787658691, + "learning_rate": 3.828244483362522e-05, + "loss": 0.9972, + "num_input_tokens_seen": 141909680, + "step": 8819 + }, + { + "epoch": 0.6178255273319498, + "grad_norm": 3.6261024475097656, + "learning_rate": 3.827544658493871e-05, + "loss": 1.1307, + "num_input_tokens_seen": 141926064, + "step": 8820 + }, + { + "epoch": 0.6178955755776792, + "grad_norm": 4.013975143432617, + "learning_rate": 3.8268448336252184e-05, + "loss": 0.9956, + "num_input_tokens_seen": 141942104, + "step": 8821 + }, + { + "epoch": 0.6179656238234084, + "grad_norm": 5.415589332580566, + "learning_rate": 3.826145008756568e-05, + "loss": 0.907, + "num_input_tokens_seen": 141957704, + "step": 8822 + }, + { + "epoch": 0.6180356720691376, + "grad_norm": 4.044425010681152, + "learning_rate": 3.825445183887917e-05, + "loss": 0.9269, + "num_input_tokens_seen": 141973888, + "step": 8823 + }, + { + "epoch": 0.6181057203148669, + "grad_norm": 4.470972537994385, + "learning_rate": 3.824745359019264e-05, + "loss": 1.2509, + "num_input_tokens_seen": 141990272, + "step": 8824 + }, + { + "epoch": 0.6181757685605961, + "grad_norm": 3.7580180168151855, + "learning_rate": 3.824045534150613e-05, + "loss": 1.0093, + "num_input_tokens_seen": 142006648, + "step": 8825 + }, + { + "epoch": 0.6182458168063254, + "grad_norm": 4.067566871643066, + "learning_rate": 3.823345709281961e-05, + "loss": 0.9987, + "num_input_tokens_seen": 142022640, + "step": 8826 + }, + { + "epoch": 0.6183158650520546, + "grad_norm": 3.7393903732299805, + "learning_rate": 3.82264588441331e-05, + "loss": 1.0583, + "num_input_tokens_seen": 142039024, + "step": 8827 + }, + { + "epoch": 0.6183859132977838, + "grad_norm": 4.278164863586426, + "learning_rate": 3.821946059544659e-05, + "loss": 0.8745, + "num_input_tokens_seen": 142055336, + "step": 8828 + }, + { + "epoch": 0.6184559615435131, + "grad_norm": 4.12859582901001, + "learning_rate": 3.821246234676007e-05, + "loss": 1.1159, + "num_input_tokens_seen": 142070608, + "step": 8829 + }, + { + "epoch": 0.6185260097892423, + "grad_norm": 5.109426975250244, + "learning_rate": 3.820546409807356e-05, + "loss": 1.0605, + "num_input_tokens_seen": 142086992, + "step": 8830 + }, + { + "epoch": 0.6185960580349716, + "grad_norm": 4.895833969116211, + "learning_rate": 3.8198465849387036e-05, + "loss": 1.0009, + "num_input_tokens_seen": 142103376, + "step": 8831 + }, + { + "epoch": 0.6186661062807008, + "grad_norm": 3.9808335304260254, + "learning_rate": 3.819146760070053e-05, + "loss": 1.0044, + "num_input_tokens_seen": 142119504, + "step": 8832 + }, + { + "epoch": 0.61873615452643, + "grad_norm": 3.944330930709839, + "learning_rate": 3.818446935201402e-05, + "loss": 1.1058, + "num_input_tokens_seen": 142135888, + "step": 8833 + }, + { + "epoch": 0.6188062027721594, + "grad_norm": 4.508646488189697, + "learning_rate": 3.8177471103327495e-05, + "loss": 1.0649, + "num_input_tokens_seen": 142152272, + "step": 8834 + }, + { + "epoch": 0.6188762510178886, + "grad_norm": 4.792259216308594, + "learning_rate": 3.8170472854640984e-05, + "loss": 0.9991, + "num_input_tokens_seen": 142168656, + "step": 8835 + }, + { + "epoch": 0.6189462992636178, + "grad_norm": 3.3887269496917725, + "learning_rate": 3.8163474605954465e-05, + "loss": 0.9316, + "num_input_tokens_seen": 142185040, + "step": 8836 + }, + { + "epoch": 0.6190163475093471, + "grad_norm": 4.3720831871032715, + "learning_rate": 3.8156476357267954e-05, + "loss": 1.0301, + "num_input_tokens_seen": 142201344, + "step": 8837 + }, + { + "epoch": 0.6190863957550763, + "grad_norm": 5.584211349487305, + "learning_rate": 3.814947810858144e-05, + "loss": 1.1305, + "num_input_tokens_seen": 142217728, + "step": 8838 + }, + { + "epoch": 0.6191564440008056, + "grad_norm": 3.439690113067627, + "learning_rate": 3.8142479859894924e-05, + "loss": 1.0307, + "num_input_tokens_seen": 142234112, + "step": 8839 + }, + { + "epoch": 0.6192264922465348, + "grad_norm": 3.5457499027252197, + "learning_rate": 3.813548161120841e-05, + "loss": 0.9315, + "num_input_tokens_seen": 142250456, + "step": 8840 + }, + { + "epoch": 0.619296540492264, + "grad_norm": 3.7961411476135254, + "learning_rate": 3.812848336252189e-05, + "loss": 1.0121, + "num_input_tokens_seen": 142266840, + "step": 8841 + }, + { + "epoch": 0.6193665887379933, + "grad_norm": 3.523218870162964, + "learning_rate": 3.812148511383538e-05, + "loss": 0.829, + "num_input_tokens_seen": 142282720, + "step": 8842 + }, + { + "epoch": 0.6194366369837225, + "grad_norm": 4.148589134216309, + "learning_rate": 3.811448686514887e-05, + "loss": 1.2733, + "num_input_tokens_seen": 142299104, + "step": 8843 + }, + { + "epoch": 0.6195066852294517, + "grad_norm": 3.8314008712768555, + "learning_rate": 3.810748861646235e-05, + "loss": 1.0721, + "num_input_tokens_seen": 142315288, + "step": 8844 + }, + { + "epoch": 0.619576733475181, + "grad_norm": 5.144819736480713, + "learning_rate": 3.8100490367775836e-05, + "loss": 1.1635, + "num_input_tokens_seen": 142330856, + "step": 8845 + }, + { + "epoch": 0.6196467817209103, + "grad_norm": 3.958228588104248, + "learning_rate": 3.809349211908932e-05, + "loss": 0.9761, + "num_input_tokens_seen": 142347056, + "step": 8846 + }, + { + "epoch": 0.6197168299666396, + "grad_norm": 4.0231404304504395, + "learning_rate": 3.8086493870402806e-05, + "loss": 1.1504, + "num_input_tokens_seen": 142363440, + "step": 8847 + }, + { + "epoch": 0.6197868782123688, + "grad_norm": 4.002577304840088, + "learning_rate": 3.807949562171629e-05, + "loss": 1.0443, + "num_input_tokens_seen": 142379824, + "step": 8848 + }, + { + "epoch": 0.619856926458098, + "grad_norm": 4.052623271942139, + "learning_rate": 3.8072497373029776e-05, + "loss": 1.1119, + "num_input_tokens_seen": 142395616, + "step": 8849 + }, + { + "epoch": 0.6199269747038273, + "grad_norm": 4.404623031616211, + "learning_rate": 3.8065499124343265e-05, + "loss": 1.0581, + "num_input_tokens_seen": 142412000, + "step": 8850 + }, + { + "epoch": 0.6199970229495565, + "grad_norm": 4.651787757873535, + "learning_rate": 3.805850087565674e-05, + "loss": 1.2277, + "num_input_tokens_seen": 142427584, + "step": 8851 + }, + { + "epoch": 0.6200670711952857, + "grad_norm": 4.038480758666992, + "learning_rate": 3.8051502626970235e-05, + "loss": 1.1064, + "num_input_tokens_seen": 142443968, + "step": 8852 + }, + { + "epoch": 0.620137119441015, + "grad_norm": 3.9735183715820312, + "learning_rate": 3.804450437828371e-05, + "loss": 0.9659, + "num_input_tokens_seen": 142460280, + "step": 8853 + }, + { + "epoch": 0.6202071676867442, + "grad_norm": 4.384612083435059, + "learning_rate": 3.80375061295972e-05, + "loss": 1.0265, + "num_input_tokens_seen": 142476664, + "step": 8854 + }, + { + "epoch": 0.6202772159324735, + "grad_norm": 4.33083963394165, + "learning_rate": 3.803050788091069e-05, + "loss": 1.1482, + "num_input_tokens_seen": 142493048, + "step": 8855 + }, + { + "epoch": 0.6203472641782027, + "grad_norm": 3.9577481746673584, + "learning_rate": 3.802350963222417e-05, + "loss": 1.0429, + "num_input_tokens_seen": 142509432, + "step": 8856 + }, + { + "epoch": 0.6204173124239319, + "grad_norm": 4.049169063568115, + "learning_rate": 3.801651138353766e-05, + "loss": 0.8656, + "num_input_tokens_seen": 142525816, + "step": 8857 + }, + { + "epoch": 0.6204873606696613, + "grad_norm": 4.449878692626953, + "learning_rate": 3.800951313485114e-05, + "loss": 0.8673, + "num_input_tokens_seen": 142541496, + "step": 8858 + }, + { + "epoch": 0.6205574089153905, + "grad_norm": 4.752803802490234, + "learning_rate": 3.800251488616463e-05, + "loss": 0.9456, + "num_input_tokens_seen": 142556728, + "step": 8859 + }, + { + "epoch": 0.6206274571611197, + "grad_norm": 3.426882743835449, + "learning_rate": 3.799551663747812e-05, + "loss": 0.9147, + "num_input_tokens_seen": 142573112, + "step": 8860 + }, + { + "epoch": 0.620697505406849, + "grad_norm": 3.8076608180999756, + "learning_rate": 3.798851838879159e-05, + "loss": 1.1146, + "num_input_tokens_seen": 142588688, + "step": 8861 + }, + { + "epoch": 0.6207675536525782, + "grad_norm": 4.169045448303223, + "learning_rate": 3.798152014010509e-05, + "loss": 0.9288, + "num_input_tokens_seen": 142605072, + "step": 8862 + }, + { + "epoch": 0.6208376018983075, + "grad_norm": 4.4878058433532715, + "learning_rate": 3.797452189141856e-05, + "loss": 0.9307, + "num_input_tokens_seen": 142621456, + "step": 8863 + }, + { + "epoch": 0.6209076501440367, + "grad_norm": 3.969947576522827, + "learning_rate": 3.796752364273205e-05, + "loss": 1.188, + "num_input_tokens_seen": 142637840, + "step": 8864 + }, + { + "epoch": 0.6209776983897659, + "grad_norm": 5.208865165710449, + "learning_rate": 3.796052539404554e-05, + "loss": 1.0355, + "num_input_tokens_seen": 142654224, + "step": 8865 + }, + { + "epoch": 0.6210477466354952, + "grad_norm": 3.6306381225585938, + "learning_rate": 3.795352714535902e-05, + "loss": 1.1135, + "num_input_tokens_seen": 142670608, + "step": 8866 + }, + { + "epoch": 0.6211177948812244, + "grad_norm": 6.061744689941406, + "learning_rate": 3.794652889667251e-05, + "loss": 0.8724, + "num_input_tokens_seen": 142686392, + "step": 8867 + }, + { + "epoch": 0.6211878431269537, + "grad_norm": 3.796785593032837, + "learning_rate": 3.793953064798599e-05, + "loss": 1.108, + "num_input_tokens_seen": 142702776, + "step": 8868 + }, + { + "epoch": 0.621257891372683, + "grad_norm": 4.68324613571167, + "learning_rate": 3.793253239929948e-05, + "loss": 1.0568, + "num_input_tokens_seen": 142718760, + "step": 8869 + }, + { + "epoch": 0.6213279396184122, + "grad_norm": 3.7691650390625, + "learning_rate": 3.792553415061297e-05, + "loss": 1.0131, + "num_input_tokens_seen": 142734608, + "step": 8870 + }, + { + "epoch": 0.6213979878641415, + "grad_norm": 4.684053897857666, + "learning_rate": 3.7918535901926444e-05, + "loss": 1.0531, + "num_input_tokens_seen": 142750992, + "step": 8871 + }, + { + "epoch": 0.6214680361098707, + "grad_norm": 4.86785364151001, + "learning_rate": 3.791153765323994e-05, + "loss": 1.1881, + "num_input_tokens_seen": 142766728, + "step": 8872 + }, + { + "epoch": 0.6215380843555999, + "grad_norm": 3.6730477809906006, + "learning_rate": 3.7904539404553414e-05, + "loss": 1.0903, + "num_input_tokens_seen": 142783112, + "step": 8873 + }, + { + "epoch": 0.6216081326013292, + "grad_norm": 4.139063358306885, + "learning_rate": 3.78975411558669e-05, + "loss": 1.0793, + "num_input_tokens_seen": 142799496, + "step": 8874 + }, + { + "epoch": 0.6216781808470584, + "grad_norm": 3.677797317504883, + "learning_rate": 3.7890542907180385e-05, + "loss": 1.0538, + "num_input_tokens_seen": 142815792, + "step": 8875 + }, + { + "epoch": 0.6217482290927877, + "grad_norm": 4.4752197265625, + "learning_rate": 3.7883544658493873e-05, + "loss": 1.1173, + "num_input_tokens_seen": 142832176, + "step": 8876 + }, + { + "epoch": 0.6218182773385169, + "grad_norm": 3.6812987327575684, + "learning_rate": 3.787654640980736e-05, + "loss": 0.9578, + "num_input_tokens_seen": 142848040, + "step": 8877 + }, + { + "epoch": 0.6218883255842461, + "grad_norm": 3.720367908477783, + "learning_rate": 3.7869548161120844e-05, + "loss": 1.0559, + "num_input_tokens_seen": 142864424, + "step": 8878 + }, + { + "epoch": 0.6219583738299754, + "grad_norm": 3.5268428325653076, + "learning_rate": 3.786254991243433e-05, + "loss": 0.9855, + "num_input_tokens_seen": 142880688, + "step": 8879 + }, + { + "epoch": 0.6220284220757046, + "grad_norm": 4.316417217254639, + "learning_rate": 3.785555166374781e-05, + "loss": 1.1222, + "num_input_tokens_seen": 142897072, + "step": 8880 + }, + { + "epoch": 0.6220984703214338, + "grad_norm": 6.519804954528809, + "learning_rate": 3.7848553415061296e-05, + "loss": 1.129, + "num_input_tokens_seen": 142912944, + "step": 8881 + }, + { + "epoch": 0.6221685185671632, + "grad_norm": 5.340738296508789, + "learning_rate": 3.784155516637479e-05, + "loss": 1.1761, + "num_input_tokens_seen": 142928856, + "step": 8882 + }, + { + "epoch": 0.6222385668128924, + "grad_norm": 5.973625183105469, + "learning_rate": 3.7834556917688267e-05, + "loss": 1.2331, + "num_input_tokens_seen": 142944056, + "step": 8883 + }, + { + "epoch": 0.6223086150586217, + "grad_norm": 3.400740385055542, + "learning_rate": 3.7827558669001755e-05, + "loss": 0.9725, + "num_input_tokens_seen": 142960440, + "step": 8884 + }, + { + "epoch": 0.6223786633043509, + "grad_norm": 4.189090728759766, + "learning_rate": 3.782056042031524e-05, + "loss": 1.1634, + "num_input_tokens_seen": 142976824, + "step": 8885 + }, + { + "epoch": 0.6224487115500801, + "grad_norm": 4.298450469970703, + "learning_rate": 3.7813562171628725e-05, + "loss": 1.0398, + "num_input_tokens_seen": 142993208, + "step": 8886 + }, + { + "epoch": 0.6225187597958094, + "grad_norm": 3.899477005004883, + "learning_rate": 3.7806563922942214e-05, + "loss": 1.0722, + "num_input_tokens_seen": 143009592, + "step": 8887 + }, + { + "epoch": 0.6225888080415386, + "grad_norm": 4.810802459716797, + "learning_rate": 3.7799565674255696e-05, + "loss": 1.0637, + "num_input_tokens_seen": 143024984, + "step": 8888 + }, + { + "epoch": 0.6226588562872678, + "grad_norm": 3.912600517272949, + "learning_rate": 3.7792567425569184e-05, + "loss": 1.1759, + "num_input_tokens_seen": 143041368, + "step": 8889 + }, + { + "epoch": 0.6227289045329971, + "grad_norm": 3.631242513656616, + "learning_rate": 3.778556917688266e-05, + "loss": 0.965, + "num_input_tokens_seen": 143057752, + "step": 8890 + }, + { + "epoch": 0.6227989527787263, + "grad_norm": 3.867696762084961, + "learning_rate": 3.777857092819615e-05, + "loss": 1.0452, + "num_input_tokens_seen": 143073792, + "step": 8891 + }, + { + "epoch": 0.6228690010244556, + "grad_norm": 4.482724189758301, + "learning_rate": 3.7771572679509643e-05, + "loss": 1.0962, + "num_input_tokens_seen": 143090176, + "step": 8892 + }, + { + "epoch": 0.6229390492701848, + "grad_norm": 3.575944423675537, + "learning_rate": 3.776457443082312e-05, + "loss": 0.9852, + "num_input_tokens_seen": 143106560, + "step": 8893 + }, + { + "epoch": 0.623009097515914, + "grad_norm": 4.862271785736084, + "learning_rate": 3.775757618213661e-05, + "loss": 1.1368, + "num_input_tokens_seen": 143122600, + "step": 8894 + }, + { + "epoch": 0.6230791457616434, + "grad_norm": 3.603167772293091, + "learning_rate": 3.775057793345009e-05, + "loss": 1.0526, + "num_input_tokens_seen": 143138872, + "step": 8895 + }, + { + "epoch": 0.6231491940073726, + "grad_norm": 4.46088981628418, + "learning_rate": 3.774357968476358e-05, + "loss": 0.9613, + "num_input_tokens_seen": 143155208, + "step": 8896 + }, + { + "epoch": 0.6232192422531019, + "grad_norm": 4.664116382598877, + "learning_rate": 3.7736581436077066e-05, + "loss": 1.1966, + "num_input_tokens_seen": 143171592, + "step": 8897 + }, + { + "epoch": 0.6232892904988311, + "grad_norm": 6.029530048370361, + "learning_rate": 3.772958318739055e-05, + "loss": 0.9586, + "num_input_tokens_seen": 143187432, + "step": 8898 + }, + { + "epoch": 0.6233593387445603, + "grad_norm": 5.297750473022461, + "learning_rate": 3.7722584938704037e-05, + "loss": 1.0174, + "num_input_tokens_seen": 143202976, + "step": 8899 + }, + { + "epoch": 0.6234293869902896, + "grad_norm": 4.243722438812256, + "learning_rate": 3.771558669001751e-05, + "loss": 1.0599, + "num_input_tokens_seen": 143218328, + "step": 8900 + }, + { + "epoch": 0.6234994352360188, + "grad_norm": 5.040163516998291, + "learning_rate": 3.7708588441331e-05, + "loss": 0.9463, + "num_input_tokens_seen": 143234496, + "step": 8901 + }, + { + "epoch": 0.623569483481748, + "grad_norm": 3.776176691055298, + "learning_rate": 3.770159019264448e-05, + "loss": 0.9702, + "num_input_tokens_seen": 143250264, + "step": 8902 + }, + { + "epoch": 0.6236395317274773, + "grad_norm": 4.226377964019775, + "learning_rate": 3.769459194395797e-05, + "loss": 0.9531, + "num_input_tokens_seen": 143265976, + "step": 8903 + }, + { + "epoch": 0.6237095799732065, + "grad_norm": 4.512108325958252, + "learning_rate": 3.768759369527146e-05, + "loss": 0.9165, + "num_input_tokens_seen": 143282360, + "step": 8904 + }, + { + "epoch": 0.6237796282189358, + "grad_norm": 3.6923460960388184, + "learning_rate": 3.768059544658494e-05, + "loss": 1.1001, + "num_input_tokens_seen": 143298744, + "step": 8905 + }, + { + "epoch": 0.623849676464665, + "grad_norm": 4.291098117828369, + "learning_rate": 3.767359719789843e-05, + "loss": 1.1216, + "num_input_tokens_seen": 143314184, + "step": 8906 + }, + { + "epoch": 0.6239197247103943, + "grad_norm": 4.348942756652832, + "learning_rate": 3.7666598949211905e-05, + "loss": 1.2986, + "num_input_tokens_seen": 143330568, + "step": 8907 + }, + { + "epoch": 0.6239897729561236, + "grad_norm": 4.6375412940979, + "learning_rate": 3.76596007005254e-05, + "loss": 0.8349, + "num_input_tokens_seen": 143346648, + "step": 8908 + }, + { + "epoch": 0.6240598212018528, + "grad_norm": 4.037846565246582, + "learning_rate": 3.765260245183889e-05, + "loss": 1.1271, + "num_input_tokens_seen": 143362280, + "step": 8909 + }, + { + "epoch": 0.624129869447582, + "grad_norm": 3.737903594970703, + "learning_rate": 3.7645604203152364e-05, + "loss": 1.168, + "num_input_tokens_seen": 143378664, + "step": 8910 + }, + { + "epoch": 0.6241999176933113, + "grad_norm": 3.7530734539031982, + "learning_rate": 3.763860595446585e-05, + "loss": 0.979, + "num_input_tokens_seen": 143394848, + "step": 8911 + }, + { + "epoch": 0.6242699659390405, + "grad_norm": 3.5531516075134277, + "learning_rate": 3.7631607705779334e-05, + "loss": 0.9539, + "num_input_tokens_seen": 143410864, + "step": 8912 + }, + { + "epoch": 0.6243400141847698, + "grad_norm": 3.59857439994812, + "learning_rate": 3.762460945709282e-05, + "loss": 1.0277, + "num_input_tokens_seen": 143427248, + "step": 8913 + }, + { + "epoch": 0.624410062430499, + "grad_norm": 4.993155002593994, + "learning_rate": 3.761761120840631e-05, + "loss": 0.9714, + "num_input_tokens_seen": 143442944, + "step": 8914 + }, + { + "epoch": 0.6244801106762282, + "grad_norm": 4.937681674957275, + "learning_rate": 3.761061295971979e-05, + "loss": 1.0694, + "num_input_tokens_seen": 143459328, + "step": 8915 + }, + { + "epoch": 0.6245501589219575, + "grad_norm": 3.3221328258514404, + "learning_rate": 3.760361471103328e-05, + "loss": 0.8962, + "num_input_tokens_seen": 143475440, + "step": 8916 + }, + { + "epoch": 0.6246202071676867, + "grad_norm": 4.134144306182861, + "learning_rate": 3.7596616462346757e-05, + "loss": 0.9835, + "num_input_tokens_seen": 143491824, + "step": 8917 + }, + { + "epoch": 0.6246902554134159, + "grad_norm": 3.695875644683838, + "learning_rate": 3.758961821366025e-05, + "loss": 0.9788, + "num_input_tokens_seen": 143507112, + "step": 8918 + }, + { + "epoch": 0.6247603036591453, + "grad_norm": 3.706967353820801, + "learning_rate": 3.758261996497374e-05, + "loss": 0.8975, + "num_input_tokens_seen": 143523368, + "step": 8919 + }, + { + "epoch": 0.6248303519048745, + "grad_norm": 3.5127205848693848, + "learning_rate": 3.7575621716287216e-05, + "loss": 1.0306, + "num_input_tokens_seen": 143539192, + "step": 8920 + }, + { + "epoch": 0.6249004001506038, + "grad_norm": 4.650593280792236, + "learning_rate": 3.7568623467600704e-05, + "loss": 1.0003, + "num_input_tokens_seen": 143555576, + "step": 8921 + }, + { + "epoch": 0.624970448396333, + "grad_norm": 4.415076732635498, + "learning_rate": 3.7561625218914186e-05, + "loss": 0.8477, + "num_input_tokens_seen": 143569312, + "step": 8922 + }, + { + "epoch": 0.6250404966420622, + "grad_norm": 4.305027961730957, + "learning_rate": 3.7554626970227675e-05, + "loss": 1.0733, + "num_input_tokens_seen": 143585696, + "step": 8923 + }, + { + "epoch": 0.6251105448877915, + "grad_norm": 3.63958477973938, + "learning_rate": 3.754762872154116e-05, + "loss": 1.1452, + "num_input_tokens_seen": 143602080, + "step": 8924 + }, + { + "epoch": 0.6251805931335207, + "grad_norm": 3.7688870429992676, + "learning_rate": 3.7540630472854645e-05, + "loss": 1.0149, + "num_input_tokens_seen": 143618088, + "step": 8925 + }, + { + "epoch": 0.6252506413792499, + "grad_norm": 3.8368911743164062, + "learning_rate": 3.7533632224168134e-05, + "loss": 0.9671, + "num_input_tokens_seen": 143634472, + "step": 8926 + }, + { + "epoch": 0.6253206896249792, + "grad_norm": 6.851833343505859, + "learning_rate": 3.752663397548161e-05, + "loss": 0.9675, + "num_input_tokens_seen": 143650856, + "step": 8927 + }, + { + "epoch": 0.6253907378707084, + "grad_norm": 3.400420904159546, + "learning_rate": 3.7519635726795104e-05, + "loss": 0.9035, + "num_input_tokens_seen": 143667240, + "step": 8928 + }, + { + "epoch": 0.6254607861164377, + "grad_norm": 3.409026622772217, + "learning_rate": 3.751263747810858e-05, + "loss": 0.9757, + "num_input_tokens_seen": 143683368, + "step": 8929 + }, + { + "epoch": 0.6255308343621669, + "grad_norm": 5.727250576019287, + "learning_rate": 3.750563922942207e-05, + "loss": 0.8788, + "num_input_tokens_seen": 143699568, + "step": 8930 + }, + { + "epoch": 0.6256008826078961, + "grad_norm": 3.4300310611724854, + "learning_rate": 3.7498640980735556e-05, + "loss": 0.9172, + "num_input_tokens_seen": 143715952, + "step": 8931 + }, + { + "epoch": 0.6256709308536255, + "grad_norm": 7.258231163024902, + "learning_rate": 3.749164273204904e-05, + "loss": 1.1165, + "num_input_tokens_seen": 143732336, + "step": 8932 + }, + { + "epoch": 0.6257409790993547, + "grad_norm": 3.3325328826904297, + "learning_rate": 3.748464448336253e-05, + "loss": 0.8748, + "num_input_tokens_seen": 143748688, + "step": 8933 + }, + { + "epoch": 0.625811027345084, + "grad_norm": 3.6152236461639404, + "learning_rate": 3.747764623467601e-05, + "loss": 0.9222, + "num_input_tokens_seen": 143764448, + "step": 8934 + }, + { + "epoch": 0.6258810755908132, + "grad_norm": 4.10430908203125, + "learning_rate": 3.74706479859895e-05, + "loss": 1.1151, + "num_input_tokens_seen": 143780776, + "step": 8935 + }, + { + "epoch": 0.6259511238365424, + "grad_norm": 3.966702461242676, + "learning_rate": 3.7463649737302986e-05, + "loss": 1.1749, + "num_input_tokens_seen": 143797064, + "step": 8936 + }, + { + "epoch": 0.6260211720822717, + "grad_norm": 4.03170108795166, + "learning_rate": 3.745665148861646e-05, + "loss": 1.0115, + "num_input_tokens_seen": 143813408, + "step": 8937 + }, + { + "epoch": 0.6260912203280009, + "grad_norm": 4.018350124359131, + "learning_rate": 3.7449653239929956e-05, + "loss": 1.0944, + "num_input_tokens_seen": 143829792, + "step": 8938 + }, + { + "epoch": 0.6261612685737301, + "grad_norm": 3.480238199234009, + "learning_rate": 3.744265499124343e-05, + "loss": 0.9981, + "num_input_tokens_seen": 143845888, + "step": 8939 + }, + { + "epoch": 0.6262313168194594, + "grad_norm": 5.4864020347595215, + "learning_rate": 3.743565674255692e-05, + "loss": 1.1133, + "num_input_tokens_seen": 143862272, + "step": 8940 + }, + { + "epoch": 0.6263013650651886, + "grad_norm": 4.835262775421143, + "learning_rate": 3.742865849387041e-05, + "loss": 1.0142, + "num_input_tokens_seen": 143878656, + "step": 8941 + }, + { + "epoch": 0.626371413310918, + "grad_norm": 3.9142558574676514, + "learning_rate": 3.742166024518389e-05, + "loss": 1.1068, + "num_input_tokens_seen": 143895040, + "step": 8942 + }, + { + "epoch": 0.6264414615566472, + "grad_norm": 4.131178379058838, + "learning_rate": 3.741466199649738e-05, + "loss": 1.0504, + "num_input_tokens_seen": 143910376, + "step": 8943 + }, + { + "epoch": 0.6265115098023764, + "grad_norm": 3.6397500038146973, + "learning_rate": 3.740766374781086e-05, + "loss": 0.9069, + "num_input_tokens_seen": 143926760, + "step": 8944 + }, + { + "epoch": 0.6265815580481057, + "grad_norm": 5.4881978034973145, + "learning_rate": 3.740066549912435e-05, + "loss": 1.0829, + "num_input_tokens_seen": 143943144, + "step": 8945 + }, + { + "epoch": 0.6266516062938349, + "grad_norm": 6.160380840301514, + "learning_rate": 3.739366725043784e-05, + "loss": 1.237, + "num_input_tokens_seen": 143958480, + "step": 8946 + }, + { + "epoch": 0.6267216545395641, + "grad_norm": 3.8040263652801514, + "learning_rate": 3.738666900175131e-05, + "loss": 0.9939, + "num_input_tokens_seen": 143974000, + "step": 8947 + }, + { + "epoch": 0.6267917027852934, + "grad_norm": 3.688765525817871, + "learning_rate": 3.737967075306481e-05, + "loss": 1.0005, + "num_input_tokens_seen": 143990192, + "step": 8948 + }, + { + "epoch": 0.6268617510310226, + "grad_norm": 4.164743900299072, + "learning_rate": 3.737267250437828e-05, + "loss": 1.0147, + "num_input_tokens_seen": 144006208, + "step": 8949 + }, + { + "epoch": 0.6269317992767519, + "grad_norm": 3.680326223373413, + "learning_rate": 3.736567425569177e-05, + "loss": 0.9095, + "num_input_tokens_seen": 144021816, + "step": 8950 + }, + { + "epoch": 0.6270018475224811, + "grad_norm": 4.087733745574951, + "learning_rate": 3.735867600700526e-05, + "loss": 1.1078, + "num_input_tokens_seen": 144038200, + "step": 8951 + }, + { + "epoch": 0.6270718957682103, + "grad_norm": 5.202064037322998, + "learning_rate": 3.735167775831874e-05, + "loss": 1.0315, + "num_input_tokens_seen": 144054584, + "step": 8952 + }, + { + "epoch": 0.6271419440139396, + "grad_norm": 4.578851222991943, + "learning_rate": 3.734467950963223e-05, + "loss": 1.0436, + "num_input_tokens_seen": 144070968, + "step": 8953 + }, + { + "epoch": 0.6272119922596688, + "grad_norm": 4.232603073120117, + "learning_rate": 3.733768126094571e-05, + "loss": 0.976, + "num_input_tokens_seen": 144087352, + "step": 8954 + }, + { + "epoch": 0.627282040505398, + "grad_norm": 6.200143337249756, + "learning_rate": 3.73306830122592e-05, + "loss": 1.1134, + "num_input_tokens_seen": 144103736, + "step": 8955 + }, + { + "epoch": 0.6273520887511274, + "grad_norm": 3.696054220199585, + "learning_rate": 3.7323684763572676e-05, + "loss": 1.0778, + "num_input_tokens_seen": 144120120, + "step": 8956 + }, + { + "epoch": 0.6274221369968566, + "grad_norm": 4.3352370262146, + "learning_rate": 3.7316686514886165e-05, + "loss": 0.9376, + "num_input_tokens_seen": 144135328, + "step": 8957 + }, + { + "epoch": 0.6274921852425859, + "grad_norm": 4.108107566833496, + "learning_rate": 3.730968826619966e-05, + "loss": 1.1491, + "num_input_tokens_seen": 144151288, + "step": 8958 + }, + { + "epoch": 0.6275622334883151, + "grad_norm": 4.748366355895996, + "learning_rate": 3.7302690017513135e-05, + "loss": 1.1301, + "num_input_tokens_seen": 144167672, + "step": 8959 + }, + { + "epoch": 0.6276322817340443, + "grad_norm": 3.789991855621338, + "learning_rate": 3.7295691768826624e-05, + "loss": 1.0851, + "num_input_tokens_seen": 144184056, + "step": 8960 + }, + { + "epoch": 0.6277023299797736, + "grad_norm": 4.499815464019775, + "learning_rate": 3.7288693520140105e-05, + "loss": 1.0044, + "num_input_tokens_seen": 144200120, + "step": 8961 + }, + { + "epoch": 0.6277723782255028, + "grad_norm": 4.267214775085449, + "learning_rate": 3.7281695271453594e-05, + "loss": 1.1354, + "num_input_tokens_seen": 144216504, + "step": 8962 + }, + { + "epoch": 0.627842426471232, + "grad_norm": 4.057802677154541, + "learning_rate": 3.727469702276708e-05, + "loss": 1.0281, + "num_input_tokens_seen": 144232160, + "step": 8963 + }, + { + "epoch": 0.6279124747169613, + "grad_norm": 4.479646682739258, + "learning_rate": 3.7267698774080564e-05, + "loss": 1.0394, + "num_input_tokens_seen": 144248288, + "step": 8964 + }, + { + "epoch": 0.6279825229626905, + "grad_norm": 4.205365180969238, + "learning_rate": 3.726070052539405e-05, + "loss": 1.2261, + "num_input_tokens_seen": 144264672, + "step": 8965 + }, + { + "epoch": 0.6280525712084198, + "grad_norm": 3.567383050918579, + "learning_rate": 3.725370227670753e-05, + "loss": 1.0032, + "num_input_tokens_seen": 144281056, + "step": 8966 + }, + { + "epoch": 0.628122619454149, + "grad_norm": 4.417497158050537, + "learning_rate": 3.724670402802102e-05, + "loss": 1.2401, + "num_input_tokens_seen": 144297088, + "step": 8967 + }, + { + "epoch": 0.6281926676998782, + "grad_norm": 3.6208767890930176, + "learning_rate": 3.723970577933451e-05, + "loss": 1.0565, + "num_input_tokens_seen": 144313472, + "step": 8968 + }, + { + "epoch": 0.6282627159456076, + "grad_norm": 3.9446747303009033, + "learning_rate": 3.723270753064799e-05, + "loss": 1.0335, + "num_input_tokens_seen": 144329856, + "step": 8969 + }, + { + "epoch": 0.6283327641913368, + "grad_norm": 3.8876211643218994, + "learning_rate": 3.7225709281961476e-05, + "loss": 1.0786, + "num_input_tokens_seen": 144345456, + "step": 8970 + }, + { + "epoch": 0.6284028124370661, + "grad_norm": 4.215117454528809, + "learning_rate": 3.721871103327496e-05, + "loss": 1.0279, + "num_input_tokens_seen": 144360688, + "step": 8971 + }, + { + "epoch": 0.6284728606827953, + "grad_norm": 4.418564796447754, + "learning_rate": 3.7211712784588446e-05, + "loss": 0.9532, + "num_input_tokens_seen": 144376952, + "step": 8972 + }, + { + "epoch": 0.6285429089285245, + "grad_norm": 4.910109519958496, + "learning_rate": 3.7204714535901935e-05, + "loss": 1.2107, + "num_input_tokens_seen": 144391224, + "step": 8973 + }, + { + "epoch": 0.6286129571742538, + "grad_norm": 6.908697128295898, + "learning_rate": 3.7197716287215417e-05, + "loss": 0.9197, + "num_input_tokens_seen": 144407608, + "step": 8974 + }, + { + "epoch": 0.628683005419983, + "grad_norm": 3.745656967163086, + "learning_rate": 3.7190718038528905e-05, + "loss": 1.1026, + "num_input_tokens_seen": 144423992, + "step": 8975 + }, + { + "epoch": 0.6287530536657122, + "grad_norm": 4.354073524475098, + "learning_rate": 3.718371978984238e-05, + "loss": 1.1037, + "num_input_tokens_seen": 144438736, + "step": 8976 + }, + { + "epoch": 0.6288231019114415, + "grad_norm": 3.270507335662842, + "learning_rate": 3.717672154115587e-05, + "loss": 0.855, + "num_input_tokens_seen": 144453984, + "step": 8977 + }, + { + "epoch": 0.6288931501571707, + "grad_norm": 3.6292295455932617, + "learning_rate": 3.7169723292469364e-05, + "loss": 1.1302, + "num_input_tokens_seen": 144469792, + "step": 8978 + }, + { + "epoch": 0.6289631984029, + "grad_norm": 3.5180768966674805, + "learning_rate": 3.716272504378284e-05, + "loss": 1.0444, + "num_input_tokens_seen": 144486176, + "step": 8979 + }, + { + "epoch": 0.6290332466486293, + "grad_norm": 4.382061958312988, + "learning_rate": 3.715572679509633e-05, + "loss": 1.0397, + "num_input_tokens_seen": 144502176, + "step": 8980 + }, + { + "epoch": 0.6291032948943585, + "grad_norm": 3.7982583045959473, + "learning_rate": 3.714872854640981e-05, + "loss": 1.0557, + "num_input_tokens_seen": 144518440, + "step": 8981 + }, + { + "epoch": 0.6291733431400878, + "grad_norm": 4.439223289489746, + "learning_rate": 3.71417302977233e-05, + "loss": 0.9305, + "num_input_tokens_seen": 144534424, + "step": 8982 + }, + { + "epoch": 0.629243391385817, + "grad_norm": 5.305637359619141, + "learning_rate": 3.713473204903677e-05, + "loss": 1.0753, + "num_input_tokens_seen": 144550728, + "step": 8983 + }, + { + "epoch": 0.6293134396315462, + "grad_norm": 4.007991790771484, + "learning_rate": 3.712773380035027e-05, + "loss": 1.0242, + "num_input_tokens_seen": 144565968, + "step": 8984 + }, + { + "epoch": 0.6293834878772755, + "grad_norm": 4.10146951675415, + "learning_rate": 3.712073555166376e-05, + "loss": 1.0327, + "num_input_tokens_seen": 144581240, + "step": 8985 + }, + { + "epoch": 0.6294535361230047, + "grad_norm": 4.507574081420898, + "learning_rate": 3.711373730297723e-05, + "loss": 1.2051, + "num_input_tokens_seen": 144597448, + "step": 8986 + }, + { + "epoch": 0.629523584368734, + "grad_norm": 3.7026100158691406, + "learning_rate": 3.710673905429072e-05, + "loss": 0.8061, + "num_input_tokens_seen": 144613040, + "step": 8987 + }, + { + "epoch": 0.6295936326144632, + "grad_norm": 3.481471061706543, + "learning_rate": 3.70997408056042e-05, + "loss": 1.0188, + "num_input_tokens_seen": 144629400, + "step": 8988 + }, + { + "epoch": 0.6296636808601924, + "grad_norm": 3.471844434738159, + "learning_rate": 3.709274255691769e-05, + "loss": 1.019, + "num_input_tokens_seen": 144645584, + "step": 8989 + }, + { + "epoch": 0.6297337291059217, + "grad_norm": 3.891248941421509, + "learning_rate": 3.708574430823118e-05, + "loss": 1.1486, + "num_input_tokens_seen": 144661832, + "step": 8990 + }, + { + "epoch": 0.6298037773516509, + "grad_norm": 6.118776321411133, + "learning_rate": 3.707874605954466e-05, + "loss": 0.9555, + "num_input_tokens_seen": 144678216, + "step": 8991 + }, + { + "epoch": 0.6298738255973801, + "grad_norm": 4.165067672729492, + "learning_rate": 3.707174781085815e-05, + "loss": 1.1964, + "num_input_tokens_seen": 144694248, + "step": 8992 + }, + { + "epoch": 0.6299438738431095, + "grad_norm": 4.20080041885376, + "learning_rate": 3.7064749562171625e-05, + "loss": 1.1404, + "num_input_tokens_seen": 144710632, + "step": 8993 + }, + { + "epoch": 0.6300139220888387, + "grad_norm": 3.5977840423583984, + "learning_rate": 3.705775131348512e-05, + "loss": 0.9842, + "num_input_tokens_seen": 144727016, + "step": 8994 + }, + { + "epoch": 0.630083970334568, + "grad_norm": 3.618051290512085, + "learning_rate": 3.705075306479861e-05, + "loss": 0.8845, + "num_input_tokens_seen": 144742016, + "step": 8995 + }, + { + "epoch": 0.6301540185802972, + "grad_norm": 5.349210739135742, + "learning_rate": 3.7043754816112084e-05, + "loss": 0.9286, + "num_input_tokens_seen": 144758400, + "step": 8996 + }, + { + "epoch": 0.6302240668260264, + "grad_norm": 4.067931175231934, + "learning_rate": 3.703675656742557e-05, + "loss": 1.158, + "num_input_tokens_seen": 144774000, + "step": 8997 + }, + { + "epoch": 0.6302941150717557, + "grad_norm": 5.214384078979492, + "learning_rate": 3.7029758318739055e-05, + "loss": 1.0415, + "num_input_tokens_seen": 144788968, + "step": 8998 + }, + { + "epoch": 0.6303641633174849, + "grad_norm": 3.694132089614868, + "learning_rate": 3.702276007005254e-05, + "loss": 0.9188, + "num_input_tokens_seen": 144805352, + "step": 8999 + }, + { + "epoch": 0.6304342115632141, + "grad_norm": 3.9240880012512207, + "learning_rate": 3.701576182136603e-05, + "loss": 0.9736, + "num_input_tokens_seen": 144821736, + "step": 9000 + }, + { + "epoch": 0.6304342115632141, + "eval_loss": 1.1185849905014038, + "eval_runtime": 0.1723, + "eval_samples_per_second": 5.803, + "eval_steps_per_second": 5.803, + "num_input_tokens_seen": 144821736, + "step": 9000 + }, + { + "epoch": 0.6305042598089434, + "grad_norm": 3.515113592147827, + "learning_rate": 3.7008763572679514e-05, + "loss": 0.9837, + "num_input_tokens_seen": 144837128, + "step": 9001 + }, + { + "epoch": 0.6305743080546726, + "grad_norm": 4.272019863128662, + "learning_rate": 3.7001765323993e-05, + "loss": 1.0164, + "num_input_tokens_seen": 144852640, + "step": 9002 + }, + { + "epoch": 0.6306443563004019, + "grad_norm": 3.971062183380127, + "learning_rate": 3.699476707530648e-05, + "loss": 0.7769, + "num_input_tokens_seen": 144868456, + "step": 9003 + }, + { + "epoch": 0.6307144045461311, + "grad_norm": 3.4022669792175293, + "learning_rate": 3.698776882661997e-05, + "loss": 1.0261, + "num_input_tokens_seen": 144884432, + "step": 9004 + }, + { + "epoch": 0.6307844527918604, + "grad_norm": 5.52700662612915, + "learning_rate": 3.698077057793346e-05, + "loss": 1.1147, + "num_input_tokens_seen": 144900640, + "step": 9005 + }, + { + "epoch": 0.6308545010375897, + "grad_norm": 4.156646728515625, + "learning_rate": 3.6973772329246936e-05, + "loss": 1.138, + "num_input_tokens_seen": 144915976, + "step": 9006 + }, + { + "epoch": 0.6309245492833189, + "grad_norm": 4.33062219619751, + "learning_rate": 3.6966774080560425e-05, + "loss": 0.9596, + "num_input_tokens_seen": 144932216, + "step": 9007 + }, + { + "epoch": 0.6309945975290482, + "grad_norm": 3.931755781173706, + "learning_rate": 3.695977583187391e-05, + "loss": 1.2341, + "num_input_tokens_seen": 144946896, + "step": 9008 + }, + { + "epoch": 0.6310646457747774, + "grad_norm": 4.5909743309021, + "learning_rate": 3.6952777583187395e-05, + "loss": 1.1408, + "num_input_tokens_seen": 144963280, + "step": 9009 + }, + { + "epoch": 0.6311346940205066, + "grad_norm": 4.214529514312744, + "learning_rate": 3.694577933450088e-05, + "loss": 0.9254, + "num_input_tokens_seen": 144979184, + "step": 9010 + }, + { + "epoch": 0.6312047422662359, + "grad_norm": 4.150051116943359, + "learning_rate": 3.6938781085814366e-05, + "loss": 1.1942, + "num_input_tokens_seen": 144995112, + "step": 9011 + }, + { + "epoch": 0.6312747905119651, + "grad_norm": 5.873388290405273, + "learning_rate": 3.6931782837127854e-05, + "loss": 1.1874, + "num_input_tokens_seen": 145010568, + "step": 9012 + }, + { + "epoch": 0.6313448387576943, + "grad_norm": 4.812868118286133, + "learning_rate": 3.692478458844133e-05, + "loss": 1.101, + "num_input_tokens_seen": 145025680, + "step": 9013 + }, + { + "epoch": 0.6314148870034236, + "grad_norm": 5.805929660797119, + "learning_rate": 3.6917786339754825e-05, + "loss": 0.9559, + "num_input_tokens_seen": 145041504, + "step": 9014 + }, + { + "epoch": 0.6314849352491528, + "grad_norm": 3.957404613494873, + "learning_rate": 3.69107880910683e-05, + "loss": 0.9265, + "num_input_tokens_seen": 145057552, + "step": 9015 + }, + { + "epoch": 0.6315549834948821, + "grad_norm": 6.144781589508057, + "learning_rate": 3.690378984238179e-05, + "loss": 1.3109, + "num_input_tokens_seen": 145073328, + "step": 9016 + }, + { + "epoch": 0.6316250317406114, + "grad_norm": 4.233129501342773, + "learning_rate": 3.689679159369528e-05, + "loss": 1.0079, + "num_input_tokens_seen": 145089288, + "step": 9017 + }, + { + "epoch": 0.6316950799863406, + "grad_norm": 4.213534832000732, + "learning_rate": 3.688979334500876e-05, + "loss": 1.0943, + "num_input_tokens_seen": 145105376, + "step": 9018 + }, + { + "epoch": 0.6317651282320699, + "grad_norm": 4.123173713684082, + "learning_rate": 3.688279509632225e-05, + "loss": 1.1625, + "num_input_tokens_seen": 145121552, + "step": 9019 + }, + { + "epoch": 0.6318351764777991, + "grad_norm": 4.385013103485107, + "learning_rate": 3.687579684763573e-05, + "loss": 1.1554, + "num_input_tokens_seen": 145137936, + "step": 9020 + }, + { + "epoch": 0.6319052247235283, + "grad_norm": 4.416991233825684, + "learning_rate": 3.686879859894922e-05, + "loss": 1.1212, + "num_input_tokens_seen": 145153200, + "step": 9021 + }, + { + "epoch": 0.6319752729692576, + "grad_norm": 3.6630640029907227, + "learning_rate": 3.6861800350262706e-05, + "loss": 1.076, + "num_input_tokens_seen": 145169456, + "step": 9022 + }, + { + "epoch": 0.6320453212149868, + "grad_norm": 3.651345729827881, + "learning_rate": 3.685480210157618e-05, + "loss": 1.0585, + "num_input_tokens_seen": 145184480, + "step": 9023 + }, + { + "epoch": 0.6321153694607161, + "grad_norm": 3.581141710281372, + "learning_rate": 3.684780385288967e-05, + "loss": 1.0545, + "num_input_tokens_seen": 145200864, + "step": 9024 + }, + { + "epoch": 0.6321854177064453, + "grad_norm": 5.30309534072876, + "learning_rate": 3.684080560420315e-05, + "loss": 1.1139, + "num_input_tokens_seen": 145216664, + "step": 9025 + }, + { + "epoch": 0.6322554659521745, + "grad_norm": 4.219841957092285, + "learning_rate": 3.683380735551664e-05, + "loss": 0.9877, + "num_input_tokens_seen": 145233048, + "step": 9026 + }, + { + "epoch": 0.6323255141979038, + "grad_norm": 4.988033294677734, + "learning_rate": 3.682680910683013e-05, + "loss": 1.0307, + "num_input_tokens_seen": 145248576, + "step": 9027 + }, + { + "epoch": 0.632395562443633, + "grad_norm": 4.326094150543213, + "learning_rate": 3.681981085814361e-05, + "loss": 1.0483, + "num_input_tokens_seen": 145264848, + "step": 9028 + }, + { + "epoch": 0.6324656106893622, + "grad_norm": 4.7632222175598145, + "learning_rate": 3.68128126094571e-05, + "loss": 1.1164, + "num_input_tokens_seen": 145281232, + "step": 9029 + }, + { + "epoch": 0.6325356589350916, + "grad_norm": 3.8900914192199707, + "learning_rate": 3.680581436077058e-05, + "loss": 1.1013, + "num_input_tokens_seen": 145297512, + "step": 9030 + }, + { + "epoch": 0.6326057071808208, + "grad_norm": 4.701690673828125, + "learning_rate": 3.679881611208407e-05, + "loss": 1.0772, + "num_input_tokens_seen": 145313896, + "step": 9031 + }, + { + "epoch": 0.6326757554265501, + "grad_norm": 4.644353866577148, + "learning_rate": 3.6791817863397545e-05, + "loss": 1.2127, + "num_input_tokens_seen": 145330280, + "step": 9032 + }, + { + "epoch": 0.6327458036722793, + "grad_norm": 5.412736415863037, + "learning_rate": 3.678481961471103e-05, + "loss": 1.1139, + "num_input_tokens_seen": 145346664, + "step": 9033 + }, + { + "epoch": 0.6328158519180085, + "grad_norm": 3.7632980346679688, + "learning_rate": 3.677782136602452e-05, + "loss": 0.9418, + "num_input_tokens_seen": 145363048, + "step": 9034 + }, + { + "epoch": 0.6328859001637378, + "grad_norm": 4.526835918426514, + "learning_rate": 3.6770823117338004e-05, + "loss": 1.032, + "num_input_tokens_seen": 145377904, + "step": 9035 + }, + { + "epoch": 0.632955948409467, + "grad_norm": 5.5759501457214355, + "learning_rate": 3.676382486865149e-05, + "loss": 1.0475, + "num_input_tokens_seen": 145393912, + "step": 9036 + }, + { + "epoch": 0.6330259966551963, + "grad_norm": 4.324702262878418, + "learning_rate": 3.6756826619964974e-05, + "loss": 1.0389, + "num_input_tokens_seen": 145410296, + "step": 9037 + }, + { + "epoch": 0.6330960449009255, + "grad_norm": 4.075634956359863, + "learning_rate": 3.674982837127846e-05, + "loss": 0.8591, + "num_input_tokens_seen": 145426072, + "step": 9038 + }, + { + "epoch": 0.6331660931466547, + "grad_norm": 3.9009335041046143, + "learning_rate": 3.674283012259195e-05, + "loss": 1.2391, + "num_input_tokens_seen": 145442456, + "step": 9039 + }, + { + "epoch": 0.633236141392384, + "grad_norm": 3.7311902046203613, + "learning_rate": 3.673583187390543e-05, + "loss": 1.0359, + "num_input_tokens_seen": 145458840, + "step": 9040 + }, + { + "epoch": 0.6333061896381132, + "grad_norm": 3.5607376098632812, + "learning_rate": 3.672883362521892e-05, + "loss": 1.0247, + "num_input_tokens_seen": 145475224, + "step": 9041 + }, + { + "epoch": 0.6333762378838425, + "grad_norm": 3.715390920639038, + "learning_rate": 3.67218353765324e-05, + "loss": 1.0373, + "num_input_tokens_seen": 145491608, + "step": 9042 + }, + { + "epoch": 0.6334462861295718, + "grad_norm": 4.086394786834717, + "learning_rate": 3.6714837127845885e-05, + "loss": 1.1904, + "num_input_tokens_seen": 145507848, + "step": 9043 + }, + { + "epoch": 0.633516334375301, + "grad_norm": 3.9277267456054688, + "learning_rate": 3.6707838879159374e-05, + "loss": 1.1869, + "num_input_tokens_seen": 145524232, + "step": 9044 + }, + { + "epoch": 0.6335863826210303, + "grad_norm": 3.712989091873169, + "learning_rate": 3.6700840630472856e-05, + "loss": 1.0027, + "num_input_tokens_seen": 145540616, + "step": 9045 + }, + { + "epoch": 0.6336564308667595, + "grad_norm": 3.9120595455169678, + "learning_rate": 3.6693842381786344e-05, + "loss": 1.1826, + "num_input_tokens_seen": 145557000, + "step": 9046 + }, + { + "epoch": 0.6337264791124887, + "grad_norm": 4.040353775024414, + "learning_rate": 3.6686844133099826e-05, + "loss": 1.0813, + "num_input_tokens_seen": 145573384, + "step": 9047 + }, + { + "epoch": 0.633796527358218, + "grad_norm": 5.6979851722717285, + "learning_rate": 3.6679845884413315e-05, + "loss": 0.9604, + "num_input_tokens_seen": 145588432, + "step": 9048 + }, + { + "epoch": 0.6338665756039472, + "grad_norm": 4.1016950607299805, + "learning_rate": 3.66728476357268e-05, + "loss": 1.1608, + "num_input_tokens_seen": 145604424, + "step": 9049 + }, + { + "epoch": 0.6339366238496764, + "grad_norm": 3.9995434284210205, + "learning_rate": 3.666584938704028e-05, + "loss": 1.094, + "num_input_tokens_seen": 145620176, + "step": 9050 + }, + { + "epoch": 0.6340066720954057, + "grad_norm": 3.9250869750976562, + "learning_rate": 3.6658851138353774e-05, + "loss": 0.9262, + "num_input_tokens_seen": 145636560, + "step": 9051 + }, + { + "epoch": 0.6340767203411349, + "grad_norm": 4.216888904571533, + "learning_rate": 3.665185288966725e-05, + "loss": 0.945, + "num_input_tokens_seen": 145652208, + "step": 9052 + }, + { + "epoch": 0.6341467685868643, + "grad_norm": 5.43098783493042, + "learning_rate": 3.664485464098074e-05, + "loss": 1.1256, + "num_input_tokens_seen": 145668592, + "step": 9053 + }, + { + "epoch": 0.6342168168325935, + "grad_norm": 3.8012657165527344, + "learning_rate": 3.6637856392294226e-05, + "loss": 1.0852, + "num_input_tokens_seen": 145684536, + "step": 9054 + }, + { + "epoch": 0.6342868650783227, + "grad_norm": 3.621824026107788, + "learning_rate": 3.663085814360771e-05, + "loss": 0.9809, + "num_input_tokens_seen": 145700920, + "step": 9055 + }, + { + "epoch": 0.634356913324052, + "grad_norm": 5.076141834259033, + "learning_rate": 3.6623859894921196e-05, + "loss": 1.3727, + "num_input_tokens_seen": 145717304, + "step": 9056 + }, + { + "epoch": 0.6344269615697812, + "grad_norm": 3.614422559738159, + "learning_rate": 3.661686164623468e-05, + "loss": 0.958, + "num_input_tokens_seen": 145733104, + "step": 9057 + }, + { + "epoch": 0.6344970098155104, + "grad_norm": 3.3869717121124268, + "learning_rate": 3.660986339754817e-05, + "loss": 0.8175, + "num_input_tokens_seen": 145748616, + "step": 9058 + }, + { + "epoch": 0.6345670580612397, + "grad_norm": 6.532785415649414, + "learning_rate": 3.660286514886164e-05, + "loss": 1.0466, + "num_input_tokens_seen": 145765000, + "step": 9059 + }, + { + "epoch": 0.6346371063069689, + "grad_norm": 5.094097137451172, + "learning_rate": 3.659586690017513e-05, + "loss": 1.0772, + "num_input_tokens_seen": 145780968, + "step": 9060 + }, + { + "epoch": 0.6347071545526982, + "grad_norm": 4.011769771575928, + "learning_rate": 3.6588868651488626e-05, + "loss": 1.2032, + "num_input_tokens_seen": 145797352, + "step": 9061 + }, + { + "epoch": 0.6347772027984274, + "grad_norm": 3.6385433673858643, + "learning_rate": 3.65818704028021e-05, + "loss": 0.97, + "num_input_tokens_seen": 145813472, + "step": 9062 + }, + { + "epoch": 0.6348472510441566, + "grad_norm": 5.0770463943481445, + "learning_rate": 3.657487215411559e-05, + "loss": 1.0833, + "num_input_tokens_seen": 145829528, + "step": 9063 + }, + { + "epoch": 0.6349172992898859, + "grad_norm": 4.842851161956787, + "learning_rate": 3.656787390542907e-05, + "loss": 0.908, + "num_input_tokens_seen": 145845912, + "step": 9064 + }, + { + "epoch": 0.6349873475356151, + "grad_norm": 4.052587032318115, + "learning_rate": 3.656087565674256e-05, + "loss": 1.1313, + "num_input_tokens_seen": 145861512, + "step": 9065 + }, + { + "epoch": 0.6350573957813443, + "grad_norm": 3.604140520095825, + "learning_rate": 3.655387740805605e-05, + "loss": 0.9014, + "num_input_tokens_seen": 145876792, + "step": 9066 + }, + { + "epoch": 0.6351274440270737, + "grad_norm": 3.613266706466675, + "learning_rate": 3.654687915936953e-05, + "loss": 0.9118, + "num_input_tokens_seen": 145892696, + "step": 9067 + }, + { + "epoch": 0.6351974922728029, + "grad_norm": 3.5152804851531982, + "learning_rate": 3.653988091068302e-05, + "loss": 0.8084, + "num_input_tokens_seen": 145909080, + "step": 9068 + }, + { + "epoch": 0.6352675405185322, + "grad_norm": 5.389699935913086, + "learning_rate": 3.6532882661996494e-05, + "loss": 1.1569, + "num_input_tokens_seen": 145925464, + "step": 9069 + }, + { + "epoch": 0.6353375887642614, + "grad_norm": 5.22952938079834, + "learning_rate": 3.652588441330998e-05, + "loss": 1.0022, + "num_input_tokens_seen": 145941848, + "step": 9070 + }, + { + "epoch": 0.6354076370099906, + "grad_norm": 6.273350238800049, + "learning_rate": 3.651888616462348e-05, + "loss": 1.3353, + "num_input_tokens_seen": 145958232, + "step": 9071 + }, + { + "epoch": 0.6354776852557199, + "grad_norm": 3.541773796081543, + "learning_rate": 3.651188791593695e-05, + "loss": 0.9381, + "num_input_tokens_seen": 145973800, + "step": 9072 + }, + { + "epoch": 0.6355477335014491, + "grad_norm": 4.250660419464111, + "learning_rate": 3.650488966725044e-05, + "loss": 0.9462, + "num_input_tokens_seen": 145990184, + "step": 9073 + }, + { + "epoch": 0.6356177817471784, + "grad_norm": 4.3455939292907715, + "learning_rate": 3.649789141856392e-05, + "loss": 0.8765, + "num_input_tokens_seen": 146006568, + "step": 9074 + }, + { + "epoch": 0.6356878299929076, + "grad_norm": 3.65165638923645, + "learning_rate": 3.649089316987741e-05, + "loss": 1.0574, + "num_input_tokens_seen": 146022952, + "step": 9075 + }, + { + "epoch": 0.6357578782386368, + "grad_norm": 5.635416030883789, + "learning_rate": 3.64838949211909e-05, + "loss": 1.0706, + "num_input_tokens_seen": 146038416, + "step": 9076 + }, + { + "epoch": 0.6358279264843661, + "grad_norm": 5.283999919891357, + "learning_rate": 3.647689667250438e-05, + "loss": 1.16, + "num_input_tokens_seen": 146054800, + "step": 9077 + }, + { + "epoch": 0.6358979747300954, + "grad_norm": 5.622769355773926, + "learning_rate": 3.646989842381787e-05, + "loss": 0.8713, + "num_input_tokens_seen": 146071184, + "step": 9078 + }, + { + "epoch": 0.6359680229758246, + "grad_norm": 5.744052410125732, + "learning_rate": 3.6462900175131346e-05, + "loss": 1.1526, + "num_input_tokens_seen": 146087568, + "step": 9079 + }, + { + "epoch": 0.6360380712215539, + "grad_norm": 3.9100217819213867, + "learning_rate": 3.6455901926444834e-05, + "loss": 1.1266, + "num_input_tokens_seen": 146103952, + "step": 9080 + }, + { + "epoch": 0.6361081194672831, + "grad_norm": 3.62276029586792, + "learning_rate": 3.644890367775833e-05, + "loss": 0.9099, + "num_input_tokens_seen": 146120336, + "step": 9081 + }, + { + "epoch": 0.6361781677130124, + "grad_norm": 3.9678568840026855, + "learning_rate": 3.6441905429071805e-05, + "loss": 1.1715, + "num_input_tokens_seen": 146136720, + "step": 9082 + }, + { + "epoch": 0.6362482159587416, + "grad_norm": 3.5281286239624023, + "learning_rate": 3.6434907180385293e-05, + "loss": 0.9672, + "num_input_tokens_seen": 146152848, + "step": 9083 + }, + { + "epoch": 0.6363182642044708, + "grad_norm": 3.5496249198913574, + "learning_rate": 3.6427908931698775e-05, + "loss": 0.963, + "num_input_tokens_seen": 146169232, + "step": 9084 + }, + { + "epoch": 0.6363883124502001, + "grad_norm": 4.662434101104736, + "learning_rate": 3.6420910683012264e-05, + "loss": 1.1378, + "num_input_tokens_seen": 146185480, + "step": 9085 + }, + { + "epoch": 0.6364583606959293, + "grad_norm": 3.470349073410034, + "learning_rate": 3.641391243432574e-05, + "loss": 1.0195, + "num_input_tokens_seen": 146201864, + "step": 9086 + }, + { + "epoch": 0.6365284089416585, + "grad_norm": 4.262399673461914, + "learning_rate": 3.6406914185639234e-05, + "loss": 1.0558, + "num_input_tokens_seen": 146218072, + "step": 9087 + }, + { + "epoch": 0.6365984571873878, + "grad_norm": 4.1229705810546875, + "learning_rate": 3.639991593695272e-05, + "loss": 1.2311, + "num_input_tokens_seen": 146234456, + "step": 9088 + }, + { + "epoch": 0.636668505433117, + "grad_norm": 3.8673205375671387, + "learning_rate": 3.63929176882662e-05, + "loss": 1.1264, + "num_input_tokens_seen": 146250840, + "step": 9089 + }, + { + "epoch": 0.6367385536788464, + "grad_norm": 4.517571449279785, + "learning_rate": 3.6385919439579686e-05, + "loss": 1.0453, + "num_input_tokens_seen": 146267224, + "step": 9090 + }, + { + "epoch": 0.6368086019245756, + "grad_norm": 4.085517883300781, + "learning_rate": 3.637892119089317e-05, + "loss": 0.9927, + "num_input_tokens_seen": 146283072, + "step": 9091 + }, + { + "epoch": 0.6368786501703048, + "grad_norm": 4.365269660949707, + "learning_rate": 3.637192294220666e-05, + "loss": 1.1771, + "num_input_tokens_seen": 146299456, + "step": 9092 + }, + { + "epoch": 0.6369486984160341, + "grad_norm": 3.9037249088287354, + "learning_rate": 3.6364924693520145e-05, + "loss": 0.9402, + "num_input_tokens_seen": 146315192, + "step": 9093 + }, + { + "epoch": 0.6370187466617633, + "grad_norm": 5.186809539794922, + "learning_rate": 3.635792644483363e-05, + "loss": 1.207, + "num_input_tokens_seen": 146331576, + "step": 9094 + }, + { + "epoch": 0.6370887949074925, + "grad_norm": 3.9498887062072754, + "learning_rate": 3.6350928196147116e-05, + "loss": 0.8991, + "num_input_tokens_seen": 146347768, + "step": 9095 + }, + { + "epoch": 0.6371588431532218, + "grad_norm": 3.9411630630493164, + "learning_rate": 3.634392994746059e-05, + "loss": 1.0749, + "num_input_tokens_seen": 146363104, + "step": 9096 + }, + { + "epoch": 0.637228891398951, + "grad_norm": 3.807798385620117, + "learning_rate": 3.6336931698774086e-05, + "loss": 0.9417, + "num_input_tokens_seen": 146378616, + "step": 9097 + }, + { + "epoch": 0.6372989396446803, + "grad_norm": 3.5480258464813232, + "learning_rate": 3.6329933450087575e-05, + "loss": 1.0565, + "num_input_tokens_seen": 146394704, + "step": 9098 + }, + { + "epoch": 0.6373689878904095, + "grad_norm": 3.5732553005218506, + "learning_rate": 3.632293520140105e-05, + "loss": 0.973, + "num_input_tokens_seen": 146411088, + "step": 9099 + }, + { + "epoch": 0.6374390361361387, + "grad_norm": 4.395323753356934, + "learning_rate": 3.631593695271454e-05, + "loss": 1.1055, + "num_input_tokens_seen": 146427472, + "step": 9100 + }, + { + "epoch": 0.637509084381868, + "grad_norm": 3.7190189361572266, + "learning_rate": 3.630893870402802e-05, + "loss": 0.9982, + "num_input_tokens_seen": 146443856, + "step": 9101 + }, + { + "epoch": 0.6375791326275972, + "grad_norm": 3.766615152359009, + "learning_rate": 3.630194045534151e-05, + "loss": 1.0493, + "num_input_tokens_seen": 146460240, + "step": 9102 + }, + { + "epoch": 0.6376491808733264, + "grad_norm": 3.6077804565429688, + "learning_rate": 3.6294942206655e-05, + "loss": 1.0525, + "num_input_tokens_seen": 146476624, + "step": 9103 + }, + { + "epoch": 0.6377192291190558, + "grad_norm": 3.5977163314819336, + "learning_rate": 3.628794395796848e-05, + "loss": 1.0919, + "num_input_tokens_seen": 146492984, + "step": 9104 + }, + { + "epoch": 0.637789277364785, + "grad_norm": 3.5252113342285156, + "learning_rate": 3.628094570928197e-05, + "loss": 0.9042, + "num_input_tokens_seen": 146509368, + "step": 9105 + }, + { + "epoch": 0.6378593256105143, + "grad_norm": 4.097792148590088, + "learning_rate": 3.627394746059544e-05, + "loss": 1.3521, + "num_input_tokens_seen": 146525752, + "step": 9106 + }, + { + "epoch": 0.6379293738562435, + "grad_norm": 3.623084306716919, + "learning_rate": 3.626694921190894e-05, + "loss": 0.9484, + "num_input_tokens_seen": 146542136, + "step": 9107 + }, + { + "epoch": 0.6379994221019727, + "grad_norm": 3.647857666015625, + "learning_rate": 3.625995096322243e-05, + "loss": 1.1339, + "num_input_tokens_seen": 146558520, + "step": 9108 + }, + { + "epoch": 0.638069470347702, + "grad_norm": 3.976710796356201, + "learning_rate": 3.62529527145359e-05, + "loss": 1.1682, + "num_input_tokens_seen": 146574720, + "step": 9109 + }, + { + "epoch": 0.6381395185934312, + "grad_norm": 4.908950328826904, + "learning_rate": 3.624595446584939e-05, + "loss": 1.2816, + "num_input_tokens_seen": 146590600, + "step": 9110 + }, + { + "epoch": 0.6382095668391605, + "grad_norm": 4.037857532501221, + "learning_rate": 3.623895621716287e-05, + "loss": 1.0935, + "num_input_tokens_seen": 146606984, + "step": 9111 + }, + { + "epoch": 0.6382796150848897, + "grad_norm": 4.615200042724609, + "learning_rate": 3.623195796847636e-05, + "loss": 1.0079, + "num_input_tokens_seen": 146623368, + "step": 9112 + }, + { + "epoch": 0.6383496633306189, + "grad_norm": 5.455233573913574, + "learning_rate": 3.622495971978984e-05, + "loss": 0.9509, + "num_input_tokens_seen": 146638992, + "step": 9113 + }, + { + "epoch": 0.6384197115763482, + "grad_norm": 4.4954447746276855, + "learning_rate": 3.621796147110333e-05, + "loss": 0.9418, + "num_input_tokens_seen": 146655000, + "step": 9114 + }, + { + "epoch": 0.6384897598220775, + "grad_norm": 5.171751976013184, + "learning_rate": 3.621096322241682e-05, + "loss": 1.0975, + "num_input_tokens_seen": 146671336, + "step": 9115 + }, + { + "epoch": 0.6385598080678067, + "grad_norm": 3.6252377033233643, + "learning_rate": 3.6203964973730295e-05, + "loss": 0.9766, + "num_input_tokens_seen": 146687720, + "step": 9116 + }, + { + "epoch": 0.638629856313536, + "grad_norm": 4.410569667816162, + "learning_rate": 3.619696672504379e-05, + "loss": 0.9673, + "num_input_tokens_seen": 146704064, + "step": 9117 + }, + { + "epoch": 0.6386999045592652, + "grad_norm": 4.368156909942627, + "learning_rate": 3.6189968476357265e-05, + "loss": 0.951, + "num_input_tokens_seen": 146720104, + "step": 9118 + }, + { + "epoch": 0.6387699528049945, + "grad_norm": 3.501434326171875, + "learning_rate": 3.6182970227670754e-05, + "loss": 1.0741, + "num_input_tokens_seen": 146736264, + "step": 9119 + }, + { + "epoch": 0.6388400010507237, + "grad_norm": 4.83900785446167, + "learning_rate": 3.617597197898424e-05, + "loss": 0.9214, + "num_input_tokens_seen": 146750936, + "step": 9120 + }, + { + "epoch": 0.6389100492964529, + "grad_norm": 4.444680690765381, + "learning_rate": 3.6168973730297724e-05, + "loss": 1.0691, + "num_input_tokens_seen": 146767200, + "step": 9121 + }, + { + "epoch": 0.6389800975421822, + "grad_norm": 4.317345142364502, + "learning_rate": 3.616197548161121e-05, + "loss": 0.9229, + "num_input_tokens_seen": 146783584, + "step": 9122 + }, + { + "epoch": 0.6390501457879114, + "grad_norm": 3.361476421356201, + "learning_rate": 3.6154977232924695e-05, + "loss": 0.9149, + "num_input_tokens_seen": 146799648, + "step": 9123 + }, + { + "epoch": 0.6391201940336406, + "grad_norm": 3.988877296447754, + "learning_rate": 3.614797898423818e-05, + "loss": 1.0484, + "num_input_tokens_seen": 146815192, + "step": 9124 + }, + { + "epoch": 0.6391902422793699, + "grad_norm": 3.815969228744507, + "learning_rate": 3.614098073555167e-05, + "loss": 1.0546, + "num_input_tokens_seen": 146831040, + "step": 9125 + }, + { + "epoch": 0.6392602905250991, + "grad_norm": 4.215094566345215, + "learning_rate": 3.613398248686515e-05, + "loss": 0.9423, + "num_input_tokens_seen": 146847424, + "step": 9126 + }, + { + "epoch": 0.6393303387708285, + "grad_norm": 4.739633083343506, + "learning_rate": 3.612698423817864e-05, + "loss": 1.1306, + "num_input_tokens_seen": 146863808, + "step": 9127 + }, + { + "epoch": 0.6394003870165577, + "grad_norm": 3.986490488052368, + "learning_rate": 3.611998598949212e-05, + "loss": 0.9754, + "num_input_tokens_seen": 146879600, + "step": 9128 + }, + { + "epoch": 0.6394704352622869, + "grad_norm": 4.4773030281066895, + "learning_rate": 3.6112987740805606e-05, + "loss": 1.061, + "num_input_tokens_seen": 146895400, + "step": 9129 + }, + { + "epoch": 0.6395404835080162, + "grad_norm": 3.2978334426879883, + "learning_rate": 3.6105989492119095e-05, + "loss": 0.9697, + "num_input_tokens_seen": 146911784, + "step": 9130 + }, + { + "epoch": 0.6396105317537454, + "grad_norm": 5.616436958312988, + "learning_rate": 3.6098991243432576e-05, + "loss": 1.097, + "num_input_tokens_seen": 146928168, + "step": 9131 + }, + { + "epoch": 0.6396805799994746, + "grad_norm": 5.068376541137695, + "learning_rate": 3.6091992994746065e-05, + "loss": 0.9522, + "num_input_tokens_seen": 146944480, + "step": 9132 + }, + { + "epoch": 0.6397506282452039, + "grad_norm": 3.696591854095459, + "learning_rate": 3.608499474605955e-05, + "loss": 1.0946, + "num_input_tokens_seen": 146960240, + "step": 9133 + }, + { + "epoch": 0.6398206764909331, + "grad_norm": 4.561679840087891, + "learning_rate": 3.6077996497373035e-05, + "loss": 1.0204, + "num_input_tokens_seen": 146976536, + "step": 9134 + }, + { + "epoch": 0.6398907247366624, + "grad_norm": 5.250302314758301, + "learning_rate": 3.6070998248686524e-05, + "loss": 1.239, + "num_input_tokens_seen": 146992920, + "step": 9135 + }, + { + "epoch": 0.6399607729823916, + "grad_norm": 3.600713014602661, + "learning_rate": 3.6064e-05, + "loss": 0.9548, + "num_input_tokens_seen": 147009304, + "step": 9136 + }, + { + "epoch": 0.6400308212281208, + "grad_norm": 4.353972434997559, + "learning_rate": 3.6057001751313494e-05, + "loss": 0.9308, + "num_input_tokens_seen": 147025688, + "step": 9137 + }, + { + "epoch": 0.6401008694738501, + "grad_norm": 3.946777820587158, + "learning_rate": 3.605000350262697e-05, + "loss": 1.0474, + "num_input_tokens_seen": 147041368, + "step": 9138 + }, + { + "epoch": 0.6401709177195793, + "grad_norm": 4.437119960784912, + "learning_rate": 3.604300525394046e-05, + "loss": 1.1828, + "num_input_tokens_seen": 147057448, + "step": 9139 + }, + { + "epoch": 0.6402409659653087, + "grad_norm": 4.4218339920043945, + "learning_rate": 3.603600700525394e-05, + "loss": 1.1593, + "num_input_tokens_seen": 147073832, + "step": 9140 + }, + { + "epoch": 0.6403110142110379, + "grad_norm": 4.643231391906738, + "learning_rate": 3.602900875656743e-05, + "loss": 0.9469, + "num_input_tokens_seen": 147089864, + "step": 9141 + }, + { + "epoch": 0.6403810624567671, + "grad_norm": 4.425054550170898, + "learning_rate": 3.602201050788092e-05, + "loss": 1.1542, + "num_input_tokens_seen": 147106248, + "step": 9142 + }, + { + "epoch": 0.6404511107024964, + "grad_norm": 4.280392646789551, + "learning_rate": 3.60150122591944e-05, + "loss": 1.1396, + "num_input_tokens_seen": 147122632, + "step": 9143 + }, + { + "epoch": 0.6405211589482256, + "grad_norm": 3.6606171131134033, + "learning_rate": 3.600801401050789e-05, + "loss": 1.0122, + "num_input_tokens_seen": 147138376, + "step": 9144 + }, + { + "epoch": 0.6405912071939548, + "grad_norm": 5.603803634643555, + "learning_rate": 3.600101576182136e-05, + "loss": 0.9538, + "num_input_tokens_seen": 147153792, + "step": 9145 + }, + { + "epoch": 0.6406612554396841, + "grad_norm": 5.50916051864624, + "learning_rate": 3.599401751313485e-05, + "loss": 1.0029, + "num_input_tokens_seen": 147170176, + "step": 9146 + }, + { + "epoch": 0.6407313036854133, + "grad_norm": 3.6515350341796875, + "learning_rate": 3.5987019264448346e-05, + "loss": 1.0022, + "num_input_tokens_seen": 147186120, + "step": 9147 + }, + { + "epoch": 0.6408013519311426, + "grad_norm": 3.7642056941986084, + "learning_rate": 3.598002101576182e-05, + "loss": 0.9936, + "num_input_tokens_seen": 147202504, + "step": 9148 + }, + { + "epoch": 0.6408714001768718, + "grad_norm": 3.904592752456665, + "learning_rate": 3.597302276707531e-05, + "loss": 0.9486, + "num_input_tokens_seen": 147217872, + "step": 9149 + }, + { + "epoch": 0.640941448422601, + "grad_norm": 3.846968412399292, + "learning_rate": 3.596602451838879e-05, + "loss": 1.1448, + "num_input_tokens_seen": 147233904, + "step": 9150 + }, + { + "epoch": 0.6410114966683303, + "grad_norm": 3.457204818725586, + "learning_rate": 3.595902626970228e-05, + "loss": 0.8744, + "num_input_tokens_seen": 147249600, + "step": 9151 + }, + { + "epoch": 0.6410815449140596, + "grad_norm": 3.725909471511841, + "learning_rate": 3.595202802101577e-05, + "loss": 1.1382, + "num_input_tokens_seen": 147265984, + "step": 9152 + }, + { + "epoch": 0.6411515931597888, + "grad_norm": 3.956817626953125, + "learning_rate": 3.594502977232925e-05, + "loss": 1.0834, + "num_input_tokens_seen": 147282248, + "step": 9153 + }, + { + "epoch": 0.6412216414055181, + "grad_norm": 3.784158945083618, + "learning_rate": 3.593803152364274e-05, + "loss": 1.1258, + "num_input_tokens_seen": 147298504, + "step": 9154 + }, + { + "epoch": 0.6412916896512473, + "grad_norm": 3.998906135559082, + "learning_rate": 3.5931033274956214e-05, + "loss": 0.9071, + "num_input_tokens_seen": 147314504, + "step": 9155 + }, + { + "epoch": 0.6413617378969766, + "grad_norm": 4.2124810218811035, + "learning_rate": 3.59240350262697e-05, + "loss": 0.9667, + "num_input_tokens_seen": 147330720, + "step": 9156 + }, + { + "epoch": 0.6414317861427058, + "grad_norm": 5.016319274902344, + "learning_rate": 3.59170367775832e-05, + "loss": 1.0815, + "num_input_tokens_seen": 147347104, + "step": 9157 + }, + { + "epoch": 0.641501834388435, + "grad_norm": 3.860468864440918, + "learning_rate": 3.5910038528896673e-05, + "loss": 0.9221, + "num_input_tokens_seen": 147362280, + "step": 9158 + }, + { + "epoch": 0.6415718826341643, + "grad_norm": 3.88521409034729, + "learning_rate": 3.590304028021016e-05, + "loss": 1.2401, + "num_input_tokens_seen": 147378232, + "step": 9159 + }, + { + "epoch": 0.6416419308798935, + "grad_norm": 4.083076477050781, + "learning_rate": 3.5896042031523644e-05, + "loss": 0.8933, + "num_input_tokens_seen": 147394616, + "step": 9160 + }, + { + "epoch": 0.6417119791256227, + "grad_norm": 3.9662280082702637, + "learning_rate": 3.588904378283713e-05, + "loss": 1.0784, + "num_input_tokens_seen": 147411000, + "step": 9161 + }, + { + "epoch": 0.641782027371352, + "grad_norm": 4.424450874328613, + "learning_rate": 3.588204553415062e-05, + "loss": 1.0046, + "num_input_tokens_seen": 147427384, + "step": 9162 + }, + { + "epoch": 0.6418520756170812, + "grad_norm": 3.7087998390197754, + "learning_rate": 3.58750472854641e-05, + "loss": 1.0228, + "num_input_tokens_seen": 147443768, + "step": 9163 + }, + { + "epoch": 0.6419221238628106, + "grad_norm": 5.173811912536621, + "learning_rate": 3.586804903677759e-05, + "loss": 0.8751, + "num_input_tokens_seen": 147459848, + "step": 9164 + }, + { + "epoch": 0.6419921721085398, + "grad_norm": 3.321122407913208, + "learning_rate": 3.5861050788091066e-05, + "loss": 0.8889, + "num_input_tokens_seen": 147475928, + "step": 9165 + }, + { + "epoch": 0.642062220354269, + "grad_norm": 4.102786064147949, + "learning_rate": 3.5854052539404555e-05, + "loss": 1.2166, + "num_input_tokens_seen": 147492312, + "step": 9166 + }, + { + "epoch": 0.6421322685999983, + "grad_norm": 3.7009875774383545, + "learning_rate": 3.584705429071804e-05, + "loss": 1.0663, + "num_input_tokens_seen": 147508528, + "step": 9167 + }, + { + "epoch": 0.6422023168457275, + "grad_norm": 4.664505481719971, + "learning_rate": 3.5840056042031525e-05, + "loss": 1.1342, + "num_input_tokens_seen": 147524888, + "step": 9168 + }, + { + "epoch": 0.6422723650914567, + "grad_norm": 4.598338603973389, + "learning_rate": 3.5833057793345014e-05, + "loss": 1.1406, + "num_input_tokens_seen": 147541272, + "step": 9169 + }, + { + "epoch": 0.642342413337186, + "grad_norm": 3.5933330059051514, + "learning_rate": 3.5826059544658496e-05, + "loss": 0.9554, + "num_input_tokens_seen": 147557656, + "step": 9170 + }, + { + "epoch": 0.6424124615829152, + "grad_norm": 4.736749172210693, + "learning_rate": 3.5819061295971984e-05, + "loss": 1.0302, + "num_input_tokens_seen": 147574000, + "step": 9171 + }, + { + "epoch": 0.6424825098286445, + "grad_norm": 4.330468654632568, + "learning_rate": 3.581206304728546e-05, + "loss": 1.0556, + "num_input_tokens_seen": 147590384, + "step": 9172 + }, + { + "epoch": 0.6425525580743737, + "grad_norm": 8.585768699645996, + "learning_rate": 3.5805064798598955e-05, + "loss": 1.1132, + "num_input_tokens_seen": 147605680, + "step": 9173 + }, + { + "epoch": 0.6426226063201029, + "grad_norm": 6.60148286819458, + "learning_rate": 3.5798066549912443e-05, + "loss": 0.7804, + "num_input_tokens_seen": 147621824, + "step": 9174 + }, + { + "epoch": 0.6426926545658322, + "grad_norm": 3.8450257778167725, + "learning_rate": 3.579106830122592e-05, + "loss": 1.1124, + "num_input_tokens_seen": 147638208, + "step": 9175 + }, + { + "epoch": 0.6427627028115614, + "grad_norm": 5.236792087554932, + "learning_rate": 3.578407005253941e-05, + "loss": 1.2241, + "num_input_tokens_seen": 147652800, + "step": 9176 + }, + { + "epoch": 0.6428327510572908, + "grad_norm": 4.240713596343994, + "learning_rate": 3.577707180385289e-05, + "loss": 1.2109, + "num_input_tokens_seen": 147669184, + "step": 9177 + }, + { + "epoch": 0.64290279930302, + "grad_norm": 3.857419967651367, + "learning_rate": 3.577007355516638e-05, + "loss": 0.8716, + "num_input_tokens_seen": 147685320, + "step": 9178 + }, + { + "epoch": 0.6429728475487492, + "grad_norm": 4.0418901443481445, + "learning_rate": 3.5763075306479866e-05, + "loss": 1.0149, + "num_input_tokens_seen": 147701704, + "step": 9179 + }, + { + "epoch": 0.6430428957944785, + "grad_norm": 4.092870235443115, + "learning_rate": 3.575607705779335e-05, + "loss": 0.9154, + "num_input_tokens_seen": 147717624, + "step": 9180 + }, + { + "epoch": 0.6431129440402077, + "grad_norm": 4.32139778137207, + "learning_rate": 3.5749078809106836e-05, + "loss": 1.0471, + "num_input_tokens_seen": 147733776, + "step": 9181 + }, + { + "epoch": 0.6431829922859369, + "grad_norm": 4.06821870803833, + "learning_rate": 3.574208056042031e-05, + "loss": 1.0499, + "num_input_tokens_seen": 147750160, + "step": 9182 + }, + { + "epoch": 0.6432530405316662, + "grad_norm": 4.179255962371826, + "learning_rate": 3.573508231173381e-05, + "loss": 1.0151, + "num_input_tokens_seen": 147766496, + "step": 9183 + }, + { + "epoch": 0.6433230887773954, + "grad_norm": 4.223190784454346, + "learning_rate": 3.5728084063047295e-05, + "loss": 0.9637, + "num_input_tokens_seen": 147782880, + "step": 9184 + }, + { + "epoch": 0.6433931370231247, + "grad_norm": 3.8488879203796387, + "learning_rate": 3.572108581436077e-05, + "loss": 0.8018, + "num_input_tokens_seen": 147798976, + "step": 9185 + }, + { + "epoch": 0.6434631852688539, + "grad_norm": 3.6767921447753906, + "learning_rate": 3.571408756567426e-05, + "loss": 1.0427, + "num_input_tokens_seen": 147815360, + "step": 9186 + }, + { + "epoch": 0.6435332335145831, + "grad_norm": 4.070343971252441, + "learning_rate": 3.570708931698774e-05, + "loss": 1.0902, + "num_input_tokens_seen": 147831744, + "step": 9187 + }, + { + "epoch": 0.6436032817603125, + "grad_norm": 4.876281261444092, + "learning_rate": 3.570009106830123e-05, + "loss": 0.9933, + "num_input_tokens_seen": 147846792, + "step": 9188 + }, + { + "epoch": 0.6436733300060417, + "grad_norm": 4.0807576179504395, + "learning_rate": 3.569309281961472e-05, + "loss": 1.0476, + "num_input_tokens_seen": 147862864, + "step": 9189 + }, + { + "epoch": 0.6437433782517709, + "grad_norm": 4.121328830718994, + "learning_rate": 3.56860945709282e-05, + "loss": 1.13, + "num_input_tokens_seen": 147878704, + "step": 9190 + }, + { + "epoch": 0.6438134264975002, + "grad_norm": 4.3336029052734375, + "learning_rate": 3.567909632224169e-05, + "loss": 1.0113, + "num_input_tokens_seen": 147894640, + "step": 9191 + }, + { + "epoch": 0.6438834747432294, + "grad_norm": 5.3807291984558105, + "learning_rate": 3.5672098073555163e-05, + "loss": 1.1592, + "num_input_tokens_seen": 147910288, + "step": 9192 + }, + { + "epoch": 0.6439535229889587, + "grad_norm": 5.2983808517456055, + "learning_rate": 3.566509982486866e-05, + "loss": 1.1139, + "num_input_tokens_seen": 147925288, + "step": 9193 + }, + { + "epoch": 0.6440235712346879, + "grad_norm": 4.485876560211182, + "learning_rate": 3.5658101576182134e-05, + "loss": 1.0128, + "num_input_tokens_seen": 147941672, + "step": 9194 + }, + { + "epoch": 0.6440936194804171, + "grad_norm": 5.0248799324035645, + "learning_rate": 3.565110332749562e-05, + "loss": 1.232, + "num_input_tokens_seen": 147958056, + "step": 9195 + }, + { + "epoch": 0.6441636677261464, + "grad_norm": 3.7837677001953125, + "learning_rate": 3.564410507880911e-05, + "loss": 1.0184, + "num_input_tokens_seen": 147974440, + "step": 9196 + }, + { + "epoch": 0.6442337159718756, + "grad_norm": 4.665341377258301, + "learning_rate": 3.563710683012259e-05, + "loss": 0.9346, + "num_input_tokens_seen": 147990824, + "step": 9197 + }, + { + "epoch": 0.6443037642176048, + "grad_norm": 5.135046482086182, + "learning_rate": 3.563010858143608e-05, + "loss": 1.1362, + "num_input_tokens_seen": 148007208, + "step": 9198 + }, + { + "epoch": 0.6443738124633341, + "grad_norm": 5.411355972290039, + "learning_rate": 3.562311033274956e-05, + "loss": 1.2388, + "num_input_tokens_seen": 148023472, + "step": 9199 + }, + { + "epoch": 0.6444438607090633, + "grad_norm": 3.8075788021087646, + "learning_rate": 3.561611208406305e-05, + "loss": 1.0358, + "num_input_tokens_seen": 148039856, + "step": 9200 + }, + { + "epoch": 0.6444438607090633, + "eval_loss": 1.1188684701919556, + "eval_runtime": 0.183, + "eval_samples_per_second": 5.464, + "eval_steps_per_second": 5.464, + "num_input_tokens_seen": 148039856, + "step": 9200 + }, + { + "epoch": 0.6445139089547927, + "grad_norm": 4.368557453155518, + "learning_rate": 3.560911383537654e-05, + "loss": 0.9135, + "num_input_tokens_seen": 148056160, + "step": 9201 + }, + { + "epoch": 0.6445839572005219, + "grad_norm": 4.345520496368408, + "learning_rate": 3.5602115586690016e-05, + "loss": 0.9864, + "num_input_tokens_seen": 148072544, + "step": 9202 + }, + { + "epoch": 0.6446540054462511, + "grad_norm": 3.673990249633789, + "learning_rate": 3.559511733800351e-05, + "loss": 1.0972, + "num_input_tokens_seen": 148088928, + "step": 9203 + }, + { + "epoch": 0.6447240536919804, + "grad_norm": 3.796632766723633, + "learning_rate": 3.5588119089316986e-05, + "loss": 1.0473, + "num_input_tokens_seen": 148105312, + "step": 9204 + }, + { + "epoch": 0.6447941019377096, + "grad_norm": 4.765102863311768, + "learning_rate": 3.5581120840630475e-05, + "loss": 0.9837, + "num_input_tokens_seen": 148121696, + "step": 9205 + }, + { + "epoch": 0.6448641501834388, + "grad_norm": 4.8779988288879395, + "learning_rate": 3.557412259194396e-05, + "loss": 1.071, + "num_input_tokens_seen": 148138080, + "step": 9206 + }, + { + "epoch": 0.6449341984291681, + "grad_norm": 3.821910858154297, + "learning_rate": 3.5567124343257445e-05, + "loss": 1.1547, + "num_input_tokens_seen": 148154464, + "step": 9207 + }, + { + "epoch": 0.6450042466748973, + "grad_norm": 4.575494766235352, + "learning_rate": 3.5560126094570934e-05, + "loss": 1.1585, + "num_input_tokens_seen": 148170672, + "step": 9208 + }, + { + "epoch": 0.6450742949206266, + "grad_norm": 3.5709569454193115, + "learning_rate": 3.5553127845884415e-05, + "loss": 0.8729, + "num_input_tokens_seen": 148187056, + "step": 9209 + }, + { + "epoch": 0.6451443431663558, + "grad_norm": 3.843435287475586, + "learning_rate": 3.5546129597197904e-05, + "loss": 1.0008, + "num_input_tokens_seen": 148203440, + "step": 9210 + }, + { + "epoch": 0.645214391412085, + "grad_norm": 4.258293151855469, + "learning_rate": 3.553913134851139e-05, + "loss": 1.1879, + "num_input_tokens_seen": 148218064, + "step": 9211 + }, + { + "epoch": 0.6452844396578143, + "grad_norm": 4.630987167358398, + "learning_rate": 3.553213309982487e-05, + "loss": 1.0215, + "num_input_tokens_seen": 148234448, + "step": 9212 + }, + { + "epoch": 0.6453544879035435, + "grad_norm": 5.691484451293945, + "learning_rate": 3.552513485113836e-05, + "loss": 1.078, + "num_input_tokens_seen": 148250056, + "step": 9213 + }, + { + "epoch": 0.6454245361492729, + "grad_norm": 3.4728190898895264, + "learning_rate": 3.551813660245184e-05, + "loss": 0.9487, + "num_input_tokens_seen": 148266056, + "step": 9214 + }, + { + "epoch": 0.6454945843950021, + "grad_norm": 5.698538303375244, + "learning_rate": 3.5511138353765327e-05, + "loss": 1.1827, + "num_input_tokens_seen": 148280744, + "step": 9215 + }, + { + "epoch": 0.6455646326407313, + "grad_norm": 4.433426380157471, + "learning_rate": 3.5504140105078815e-05, + "loss": 1.0796, + "num_input_tokens_seen": 148297128, + "step": 9216 + }, + { + "epoch": 0.6456346808864606, + "grad_norm": 3.776545763015747, + "learning_rate": 3.54971418563923e-05, + "loss": 1.216, + "num_input_tokens_seen": 148313512, + "step": 9217 + }, + { + "epoch": 0.6457047291321898, + "grad_norm": 6.483054161071777, + "learning_rate": 3.5490143607705786e-05, + "loss": 1.1004, + "num_input_tokens_seen": 148329896, + "step": 9218 + }, + { + "epoch": 0.645774777377919, + "grad_norm": 3.762585401535034, + "learning_rate": 3.548314535901927e-05, + "loss": 1.151, + "num_input_tokens_seen": 148346280, + "step": 9219 + }, + { + "epoch": 0.6458448256236483, + "grad_norm": 3.7533860206604004, + "learning_rate": 3.5476147110332756e-05, + "loss": 1.0433, + "num_input_tokens_seen": 148362664, + "step": 9220 + }, + { + "epoch": 0.6459148738693775, + "grad_norm": 4.7706193923950195, + "learning_rate": 3.546914886164623e-05, + "loss": 1.1348, + "num_input_tokens_seen": 148379048, + "step": 9221 + }, + { + "epoch": 0.6459849221151068, + "grad_norm": 8.966822624206543, + "learning_rate": 3.546215061295972e-05, + "loss": 0.9141, + "num_input_tokens_seen": 148394968, + "step": 9222 + }, + { + "epoch": 0.646054970360836, + "grad_norm": 6.148746967315674, + "learning_rate": 3.5455152364273215e-05, + "loss": 1.0967, + "num_input_tokens_seen": 148411120, + "step": 9223 + }, + { + "epoch": 0.6461250186065652, + "grad_norm": 5.433462142944336, + "learning_rate": 3.544815411558669e-05, + "loss": 0.9757, + "num_input_tokens_seen": 148427504, + "step": 9224 + }, + { + "epoch": 0.6461950668522946, + "grad_norm": 4.530987739562988, + "learning_rate": 3.544115586690018e-05, + "loss": 0.9837, + "num_input_tokens_seen": 148443888, + "step": 9225 + }, + { + "epoch": 0.6462651150980238, + "grad_norm": 3.7626450061798096, + "learning_rate": 3.543415761821366e-05, + "loss": 0.9667, + "num_input_tokens_seen": 148460272, + "step": 9226 + }, + { + "epoch": 0.646335163343753, + "grad_norm": 3.8700945377349854, + "learning_rate": 3.542715936952715e-05, + "loss": 0.9902, + "num_input_tokens_seen": 148476656, + "step": 9227 + }, + { + "epoch": 0.6464052115894823, + "grad_norm": 3.5958094596862793, + "learning_rate": 3.542016112084064e-05, + "loss": 1.0532, + "num_input_tokens_seen": 148493040, + "step": 9228 + }, + { + "epoch": 0.6464752598352115, + "grad_norm": 3.7037572860717773, + "learning_rate": 3.541316287215412e-05, + "loss": 0.9857, + "num_input_tokens_seen": 148509184, + "step": 9229 + }, + { + "epoch": 0.6465453080809408, + "grad_norm": 4.143620491027832, + "learning_rate": 3.540616462346761e-05, + "loss": 0.9872, + "num_input_tokens_seen": 148524384, + "step": 9230 + }, + { + "epoch": 0.64661535632667, + "grad_norm": 4.512960910797119, + "learning_rate": 3.539916637478108e-05, + "loss": 1.1361, + "num_input_tokens_seen": 148540768, + "step": 9231 + }, + { + "epoch": 0.6466854045723992, + "grad_norm": 4.066840648651123, + "learning_rate": 3.539216812609457e-05, + "loss": 1.0777, + "num_input_tokens_seen": 148556672, + "step": 9232 + }, + { + "epoch": 0.6467554528181285, + "grad_norm": 3.489027738571167, + "learning_rate": 3.538516987740807e-05, + "loss": 0.9007, + "num_input_tokens_seen": 148573056, + "step": 9233 + }, + { + "epoch": 0.6468255010638577, + "grad_norm": 4.056064128875732, + "learning_rate": 3.537817162872154e-05, + "loss": 0.9946, + "num_input_tokens_seen": 148589368, + "step": 9234 + }, + { + "epoch": 0.6468955493095869, + "grad_norm": 5.149455547332764, + "learning_rate": 3.537117338003503e-05, + "loss": 1.0218, + "num_input_tokens_seen": 148604888, + "step": 9235 + }, + { + "epoch": 0.6469655975553162, + "grad_norm": 4.626084804534912, + "learning_rate": 3.536417513134851e-05, + "loss": 1.1735, + "num_input_tokens_seen": 148621272, + "step": 9236 + }, + { + "epoch": 0.6470356458010454, + "grad_norm": 3.8108484745025635, + "learning_rate": 3.5357176882662e-05, + "loss": 0.8607, + "num_input_tokens_seen": 148637000, + "step": 9237 + }, + { + "epoch": 0.6471056940467748, + "grad_norm": 3.6130454540252686, + "learning_rate": 3.535017863397549e-05, + "loss": 0.9696, + "num_input_tokens_seen": 148651904, + "step": 9238 + }, + { + "epoch": 0.647175742292504, + "grad_norm": 3.6412675380706787, + "learning_rate": 3.534318038528897e-05, + "loss": 1.0942, + "num_input_tokens_seen": 148667936, + "step": 9239 + }, + { + "epoch": 0.6472457905382332, + "grad_norm": 3.7499804496765137, + "learning_rate": 3.533618213660246e-05, + "loss": 0.9465, + "num_input_tokens_seen": 148684320, + "step": 9240 + }, + { + "epoch": 0.6473158387839625, + "grad_norm": 4.286163330078125, + "learning_rate": 3.5329183887915935e-05, + "loss": 1.2049, + "num_input_tokens_seen": 148700704, + "step": 9241 + }, + { + "epoch": 0.6473858870296917, + "grad_norm": 3.9942524433135986, + "learning_rate": 3.5322185639229424e-05, + "loss": 1.229, + "num_input_tokens_seen": 148717088, + "step": 9242 + }, + { + "epoch": 0.6474559352754209, + "grad_norm": 4.105100631713867, + "learning_rate": 3.531518739054292e-05, + "loss": 0.9059, + "num_input_tokens_seen": 148733120, + "step": 9243 + }, + { + "epoch": 0.6475259835211502, + "grad_norm": 3.757993459701538, + "learning_rate": 3.5308189141856394e-05, + "loss": 1.0382, + "num_input_tokens_seen": 148749504, + "step": 9244 + }, + { + "epoch": 0.6475960317668794, + "grad_norm": 5.484358310699463, + "learning_rate": 3.530119089316988e-05, + "loss": 1.0398, + "num_input_tokens_seen": 148765088, + "step": 9245 + }, + { + "epoch": 0.6476660800126087, + "grad_norm": 4.352504730224609, + "learning_rate": 3.5294192644483364e-05, + "loss": 1.052, + "num_input_tokens_seen": 148780680, + "step": 9246 + }, + { + "epoch": 0.6477361282583379, + "grad_norm": 3.860248327255249, + "learning_rate": 3.528719439579685e-05, + "loss": 1.1673, + "num_input_tokens_seen": 148797064, + "step": 9247 + }, + { + "epoch": 0.6478061765040671, + "grad_norm": 4.548919200897217, + "learning_rate": 3.528019614711033e-05, + "loss": 1.1648, + "num_input_tokens_seen": 148813344, + "step": 9248 + }, + { + "epoch": 0.6478762247497964, + "grad_norm": 4.4975409507751465, + "learning_rate": 3.5273197898423823e-05, + "loss": 1.033, + "num_input_tokens_seen": 148829728, + "step": 9249 + }, + { + "epoch": 0.6479462729955257, + "grad_norm": 3.8438351154327393, + "learning_rate": 3.526619964973731e-05, + "loss": 1.1875, + "num_input_tokens_seen": 148846112, + "step": 9250 + }, + { + "epoch": 0.648016321241255, + "grad_norm": 4.706649303436279, + "learning_rate": 3.525920140105079e-05, + "loss": 0.9494, + "num_input_tokens_seen": 148862496, + "step": 9251 + }, + { + "epoch": 0.6480863694869842, + "grad_norm": 3.526735544204712, + "learning_rate": 3.5252203152364276e-05, + "loss": 0.9476, + "num_input_tokens_seen": 148878856, + "step": 9252 + }, + { + "epoch": 0.6481564177327134, + "grad_norm": 3.831486463546753, + "learning_rate": 3.524520490367776e-05, + "loss": 1.188, + "num_input_tokens_seen": 148894712, + "step": 9253 + }, + { + "epoch": 0.6482264659784427, + "grad_norm": 4.237249374389648, + "learning_rate": 3.5238206654991246e-05, + "loss": 1.022, + "num_input_tokens_seen": 148910360, + "step": 9254 + }, + { + "epoch": 0.6482965142241719, + "grad_norm": 5.18316125869751, + "learning_rate": 3.5231208406304735e-05, + "loss": 1.0039, + "num_input_tokens_seen": 148924856, + "step": 9255 + }, + { + "epoch": 0.6483665624699011, + "grad_norm": 4.504611492156982, + "learning_rate": 3.5224210157618216e-05, + "loss": 1.1726, + "num_input_tokens_seen": 148940888, + "step": 9256 + }, + { + "epoch": 0.6484366107156304, + "grad_norm": 4.189022064208984, + "learning_rate": 3.5217211908931705e-05, + "loss": 1.0573, + "num_input_tokens_seen": 148957272, + "step": 9257 + }, + { + "epoch": 0.6485066589613596, + "grad_norm": 3.785248041152954, + "learning_rate": 3.521021366024518e-05, + "loss": 1.1941, + "num_input_tokens_seen": 148973024, + "step": 9258 + }, + { + "epoch": 0.6485767072070889, + "grad_norm": 4.815038681030273, + "learning_rate": 3.5203215411558675e-05, + "loss": 0.8447, + "num_input_tokens_seen": 148988488, + "step": 9259 + }, + { + "epoch": 0.6486467554528181, + "grad_norm": 3.973761796951294, + "learning_rate": 3.5196217162872164e-05, + "loss": 1.0053, + "num_input_tokens_seen": 149004872, + "step": 9260 + }, + { + "epoch": 0.6487168036985473, + "grad_norm": 3.6426029205322266, + "learning_rate": 3.518921891418564e-05, + "loss": 0.9906, + "num_input_tokens_seen": 149021256, + "step": 9261 + }, + { + "epoch": 0.6487868519442767, + "grad_norm": 3.9852302074432373, + "learning_rate": 3.518222066549913e-05, + "loss": 0.9823, + "num_input_tokens_seen": 149037168, + "step": 9262 + }, + { + "epoch": 0.6488569001900059, + "grad_norm": 4.678459167480469, + "learning_rate": 3.517522241681261e-05, + "loss": 0.9528, + "num_input_tokens_seen": 149053216, + "step": 9263 + }, + { + "epoch": 0.6489269484357351, + "grad_norm": 4.394834518432617, + "learning_rate": 3.51682241681261e-05, + "loss": 1.0763, + "num_input_tokens_seen": 149068880, + "step": 9264 + }, + { + "epoch": 0.6489969966814644, + "grad_norm": 4.350287914276123, + "learning_rate": 3.516122591943959e-05, + "loss": 0.9263, + "num_input_tokens_seen": 149084672, + "step": 9265 + }, + { + "epoch": 0.6490670449271936, + "grad_norm": 4.098289966583252, + "learning_rate": 3.515422767075307e-05, + "loss": 1.3206, + "num_input_tokens_seen": 149101056, + "step": 9266 + }, + { + "epoch": 0.6491370931729229, + "grad_norm": 4.111892223358154, + "learning_rate": 3.514722942206656e-05, + "loss": 1.0648, + "num_input_tokens_seen": 149117136, + "step": 9267 + }, + { + "epoch": 0.6492071414186521, + "grad_norm": 3.617952346801758, + "learning_rate": 3.514023117338003e-05, + "loss": 1.1017, + "num_input_tokens_seen": 149133336, + "step": 9268 + }, + { + "epoch": 0.6492771896643813, + "grad_norm": 4.351487636566162, + "learning_rate": 3.513323292469353e-05, + "loss": 1.1425, + "num_input_tokens_seen": 149149352, + "step": 9269 + }, + { + "epoch": 0.6493472379101106, + "grad_norm": 3.7895679473876953, + "learning_rate": 3.5126234676007016e-05, + "loss": 1.1444, + "num_input_tokens_seen": 149165736, + "step": 9270 + }, + { + "epoch": 0.6494172861558398, + "grad_norm": 4.267433166503906, + "learning_rate": 3.511923642732049e-05, + "loss": 0.842, + "num_input_tokens_seen": 149181816, + "step": 9271 + }, + { + "epoch": 0.649487334401569, + "grad_norm": 3.528719663619995, + "learning_rate": 3.511223817863398e-05, + "loss": 0.9683, + "num_input_tokens_seen": 149198200, + "step": 9272 + }, + { + "epoch": 0.6495573826472983, + "grad_norm": 4.105490207672119, + "learning_rate": 3.510523992994746e-05, + "loss": 0.922, + "num_input_tokens_seen": 149214584, + "step": 9273 + }, + { + "epoch": 0.6496274308930275, + "grad_norm": 3.7803657054901123, + "learning_rate": 3.509824168126095e-05, + "loss": 1.0865, + "num_input_tokens_seen": 149230968, + "step": 9274 + }, + { + "epoch": 0.6496974791387569, + "grad_norm": 4.0793023109436035, + "learning_rate": 3.509124343257443e-05, + "loss": 1.0357, + "num_input_tokens_seen": 149246360, + "step": 9275 + }, + { + "epoch": 0.6497675273844861, + "grad_norm": 4.912314414978027, + "learning_rate": 3.508424518388792e-05, + "loss": 1.0405, + "num_input_tokens_seen": 149261864, + "step": 9276 + }, + { + "epoch": 0.6498375756302153, + "grad_norm": 4.330320358276367, + "learning_rate": 3.507724693520141e-05, + "loss": 0.968, + "num_input_tokens_seen": 149278248, + "step": 9277 + }, + { + "epoch": 0.6499076238759446, + "grad_norm": 5.587654113769531, + "learning_rate": 3.5070248686514884e-05, + "loss": 1.1403, + "num_input_tokens_seen": 149293864, + "step": 9278 + }, + { + "epoch": 0.6499776721216738, + "grad_norm": 3.7197189331054688, + "learning_rate": 3.506325043782838e-05, + "loss": 0.922, + "num_input_tokens_seen": 149309832, + "step": 9279 + }, + { + "epoch": 0.6500477203674031, + "grad_norm": 4.377011299133301, + "learning_rate": 3.5056252189141855e-05, + "loss": 1.0946, + "num_input_tokens_seen": 149326112, + "step": 9280 + }, + { + "epoch": 0.6501177686131323, + "grad_norm": 3.4686145782470703, + "learning_rate": 3.504925394045534e-05, + "loss": 1.112, + "num_input_tokens_seen": 149342496, + "step": 9281 + }, + { + "epoch": 0.6501878168588615, + "grad_norm": 4.899174213409424, + "learning_rate": 3.504225569176883e-05, + "loss": 1.0053, + "num_input_tokens_seen": 149358880, + "step": 9282 + }, + { + "epoch": 0.6502578651045908, + "grad_norm": 4.101467609405518, + "learning_rate": 3.5035257443082314e-05, + "loss": 1.0835, + "num_input_tokens_seen": 149374088, + "step": 9283 + }, + { + "epoch": 0.65032791335032, + "grad_norm": 3.850806951522827, + "learning_rate": 3.50282591943958e-05, + "loss": 1.0356, + "num_input_tokens_seen": 149389768, + "step": 9284 + }, + { + "epoch": 0.6503979615960492, + "grad_norm": 4.449591159820557, + "learning_rate": 3.5021260945709284e-05, + "loss": 0.9802, + "num_input_tokens_seen": 149406072, + "step": 9285 + }, + { + "epoch": 0.6504680098417785, + "grad_norm": 3.9867982864379883, + "learning_rate": 3.501426269702277e-05, + "loss": 0.9766, + "num_input_tokens_seen": 149421920, + "step": 9286 + }, + { + "epoch": 0.6505380580875078, + "grad_norm": 5.579376697540283, + "learning_rate": 3.500726444833626e-05, + "loss": 1.1088, + "num_input_tokens_seen": 149438304, + "step": 9287 + }, + { + "epoch": 0.6506081063332371, + "grad_norm": 3.6809887886047363, + "learning_rate": 3.5000266199649736e-05, + "loss": 1.105, + "num_input_tokens_seen": 149454688, + "step": 9288 + }, + { + "epoch": 0.6506781545789663, + "grad_norm": 3.4062514305114746, + "learning_rate": 3.499326795096323e-05, + "loss": 0.9478, + "num_input_tokens_seen": 149470128, + "step": 9289 + }, + { + "epoch": 0.6507482028246955, + "grad_norm": 3.659839630126953, + "learning_rate": 3.4986269702276707e-05, + "loss": 1.1343, + "num_input_tokens_seen": 149486512, + "step": 9290 + }, + { + "epoch": 0.6508182510704248, + "grad_norm": 4.694009304046631, + "learning_rate": 3.4979271453590195e-05, + "loss": 1.1365, + "num_input_tokens_seen": 149502896, + "step": 9291 + }, + { + "epoch": 0.650888299316154, + "grad_norm": 4.390425205230713, + "learning_rate": 3.4972273204903684e-05, + "loss": 1.0058, + "num_input_tokens_seen": 149519280, + "step": 9292 + }, + { + "epoch": 0.6509583475618832, + "grad_norm": 4.62302303314209, + "learning_rate": 3.4965274956217166e-05, + "loss": 1.0927, + "num_input_tokens_seen": 149534880, + "step": 9293 + }, + { + "epoch": 0.6510283958076125, + "grad_norm": 4.169375419616699, + "learning_rate": 3.4958276707530654e-05, + "loss": 1.1174, + "num_input_tokens_seen": 149551264, + "step": 9294 + }, + { + "epoch": 0.6510984440533417, + "grad_norm": 3.841010093688965, + "learning_rate": 3.4951278458844136e-05, + "loss": 1.0406, + "num_input_tokens_seen": 149567648, + "step": 9295 + }, + { + "epoch": 0.651168492299071, + "grad_norm": 4.527113914489746, + "learning_rate": 3.4944280210157625e-05, + "loss": 1.0398, + "num_input_tokens_seen": 149582688, + "step": 9296 + }, + { + "epoch": 0.6512385405448002, + "grad_norm": 3.6402499675750732, + "learning_rate": 3.493728196147111e-05, + "loss": 1.0945, + "num_input_tokens_seen": 149599072, + "step": 9297 + }, + { + "epoch": 0.6513085887905294, + "grad_norm": 3.7380270957946777, + "learning_rate": 3.493028371278459e-05, + "loss": 1.1785, + "num_input_tokens_seen": 149615424, + "step": 9298 + }, + { + "epoch": 0.6513786370362588, + "grad_norm": 5.974104404449463, + "learning_rate": 3.4923285464098084e-05, + "loss": 1.0541, + "num_input_tokens_seen": 149631808, + "step": 9299 + }, + { + "epoch": 0.651448685281988, + "grad_norm": 5.0363874435424805, + "learning_rate": 3.491628721541156e-05, + "loss": 0.8974, + "num_input_tokens_seen": 149648192, + "step": 9300 + }, + { + "epoch": 0.6515187335277172, + "grad_norm": 3.8840138912200928, + "learning_rate": 3.490928896672505e-05, + "loss": 0.9802, + "num_input_tokens_seen": 149664576, + "step": 9301 + }, + { + "epoch": 0.6515887817734465, + "grad_norm": 3.598795175552368, + "learning_rate": 3.490229071803853e-05, + "loss": 1.0979, + "num_input_tokens_seen": 149680928, + "step": 9302 + }, + { + "epoch": 0.6516588300191757, + "grad_norm": 3.7103326320648193, + "learning_rate": 3.489529246935202e-05, + "loss": 1.1326, + "num_input_tokens_seen": 149697312, + "step": 9303 + }, + { + "epoch": 0.651728878264905, + "grad_norm": 5.972287654876709, + "learning_rate": 3.4888294220665506e-05, + "loss": 1.0054, + "num_input_tokens_seen": 149713696, + "step": 9304 + }, + { + "epoch": 0.6517989265106342, + "grad_norm": 6.008615493774414, + "learning_rate": 3.488129597197899e-05, + "loss": 0.843, + "num_input_tokens_seen": 149729000, + "step": 9305 + }, + { + "epoch": 0.6518689747563634, + "grad_norm": 5.612421035766602, + "learning_rate": 3.4874297723292477e-05, + "loss": 1.0821, + "num_input_tokens_seen": 149745384, + "step": 9306 + }, + { + "epoch": 0.6519390230020927, + "grad_norm": 4.998090744018555, + "learning_rate": 3.486729947460595e-05, + "loss": 1.2086, + "num_input_tokens_seen": 149760264, + "step": 9307 + }, + { + "epoch": 0.6520090712478219, + "grad_norm": 3.4391870498657227, + "learning_rate": 3.486030122591944e-05, + "loss": 0.8801, + "num_input_tokens_seen": 149776648, + "step": 9308 + }, + { + "epoch": 0.6520791194935511, + "grad_norm": 6.887722969055176, + "learning_rate": 3.485330297723293e-05, + "loss": 1.0269, + "num_input_tokens_seen": 149793032, + "step": 9309 + }, + { + "epoch": 0.6521491677392804, + "grad_norm": 3.6658191680908203, + "learning_rate": 3.484630472854641e-05, + "loss": 1.1763, + "num_input_tokens_seen": 149809416, + "step": 9310 + }, + { + "epoch": 0.6522192159850096, + "grad_norm": 4.455888748168945, + "learning_rate": 3.48393064798599e-05, + "loss": 1.0886, + "num_input_tokens_seen": 149825800, + "step": 9311 + }, + { + "epoch": 0.652289264230739, + "grad_norm": 3.553740978240967, + "learning_rate": 3.483230823117338e-05, + "loss": 0.9932, + "num_input_tokens_seen": 149842184, + "step": 9312 + }, + { + "epoch": 0.6523593124764682, + "grad_norm": 4.304836273193359, + "learning_rate": 3.482530998248687e-05, + "loss": 0.9551, + "num_input_tokens_seen": 149857888, + "step": 9313 + }, + { + "epoch": 0.6524293607221974, + "grad_norm": 5.257026672363281, + "learning_rate": 3.481831173380036e-05, + "loss": 0.9986, + "num_input_tokens_seen": 149874272, + "step": 9314 + }, + { + "epoch": 0.6524994089679267, + "grad_norm": 4.301852703094482, + "learning_rate": 3.481131348511384e-05, + "loss": 1.0636, + "num_input_tokens_seen": 149890568, + "step": 9315 + }, + { + "epoch": 0.6525694572136559, + "grad_norm": 3.8146626949310303, + "learning_rate": 3.480431523642733e-05, + "loss": 0.9849, + "num_input_tokens_seen": 149906632, + "step": 9316 + }, + { + "epoch": 0.6526395054593852, + "grad_norm": 4.092647552490234, + "learning_rate": 3.4797316987740804e-05, + "loss": 1.1259, + "num_input_tokens_seen": 149922648, + "step": 9317 + }, + { + "epoch": 0.6527095537051144, + "grad_norm": 3.570619583129883, + "learning_rate": 3.479031873905429e-05, + "loss": 0.9859, + "num_input_tokens_seen": 149938936, + "step": 9318 + }, + { + "epoch": 0.6527796019508436, + "grad_norm": 4.6556077003479, + "learning_rate": 3.478332049036778e-05, + "loss": 0.9434, + "num_input_tokens_seen": 149954376, + "step": 9319 + }, + { + "epoch": 0.6528496501965729, + "grad_norm": 3.5924341678619385, + "learning_rate": 3.477632224168126e-05, + "loss": 1.0104, + "num_input_tokens_seen": 149970760, + "step": 9320 + }, + { + "epoch": 0.6529196984423021, + "grad_norm": 3.3704681396484375, + "learning_rate": 3.476932399299475e-05, + "loss": 1.0059, + "num_input_tokens_seen": 149987144, + "step": 9321 + }, + { + "epoch": 0.6529897466880313, + "grad_norm": 3.3549299240112305, + "learning_rate": 3.476232574430823e-05, + "loss": 0.7868, + "num_input_tokens_seen": 150002448, + "step": 9322 + }, + { + "epoch": 0.6530597949337607, + "grad_norm": 5.747979164123535, + "learning_rate": 3.475532749562172e-05, + "loss": 1.1781, + "num_input_tokens_seen": 150017320, + "step": 9323 + }, + { + "epoch": 0.6531298431794899, + "grad_norm": 4.000280857086182, + "learning_rate": 3.474832924693521e-05, + "loss": 1.0249, + "num_input_tokens_seen": 150033704, + "step": 9324 + }, + { + "epoch": 0.6531998914252192, + "grad_norm": 8.421374320983887, + "learning_rate": 3.474133099824869e-05, + "loss": 1.0694, + "num_input_tokens_seen": 150050088, + "step": 9325 + }, + { + "epoch": 0.6532699396709484, + "grad_norm": 5.783685207366943, + "learning_rate": 3.473433274956218e-05, + "loss": 1.063, + "num_input_tokens_seen": 150066472, + "step": 9326 + }, + { + "epoch": 0.6533399879166776, + "grad_norm": 4.068075656890869, + "learning_rate": 3.4727334500875656e-05, + "loss": 1.0543, + "num_input_tokens_seen": 150082296, + "step": 9327 + }, + { + "epoch": 0.6534100361624069, + "grad_norm": 3.5591349601745605, + "learning_rate": 3.4720336252189144e-05, + "loss": 0.9712, + "num_input_tokens_seen": 150098448, + "step": 9328 + }, + { + "epoch": 0.6534800844081361, + "grad_norm": 4.313381195068359, + "learning_rate": 3.4713338003502626e-05, + "loss": 0.9222, + "num_input_tokens_seen": 150114832, + "step": 9329 + }, + { + "epoch": 0.6535501326538653, + "grad_norm": 3.91219425201416, + "learning_rate": 3.4706339754816115e-05, + "loss": 0.7862, + "num_input_tokens_seen": 150131176, + "step": 9330 + }, + { + "epoch": 0.6536201808995946, + "grad_norm": 7.561997413635254, + "learning_rate": 3.46993415061296e-05, + "loss": 0.9901, + "num_input_tokens_seen": 150146312, + "step": 9331 + }, + { + "epoch": 0.6536902291453238, + "grad_norm": 5.7827839851379395, + "learning_rate": 3.4692343257443085e-05, + "loss": 1.0653, + "num_input_tokens_seen": 150161952, + "step": 9332 + }, + { + "epoch": 0.6537602773910531, + "grad_norm": 3.89255428314209, + "learning_rate": 3.4685345008756574e-05, + "loss": 1.1275, + "num_input_tokens_seen": 150178336, + "step": 9333 + }, + { + "epoch": 0.6538303256367823, + "grad_norm": 3.993328094482422, + "learning_rate": 3.467834676007005e-05, + "loss": 1.3535, + "num_input_tokens_seen": 150194720, + "step": 9334 + }, + { + "epoch": 0.6539003738825115, + "grad_norm": 5.949758529663086, + "learning_rate": 3.467134851138354e-05, + "loss": 1.0441, + "num_input_tokens_seen": 150211104, + "step": 9335 + }, + { + "epoch": 0.6539704221282409, + "grad_norm": 4.4025349617004395, + "learning_rate": 3.466435026269703e-05, + "loss": 1.0853, + "num_input_tokens_seen": 150227264, + "step": 9336 + }, + { + "epoch": 0.6540404703739701, + "grad_norm": 3.49371075630188, + "learning_rate": 3.465735201401051e-05, + "loss": 0.84, + "num_input_tokens_seen": 150243328, + "step": 9337 + }, + { + "epoch": 0.6541105186196993, + "grad_norm": 4.42230224609375, + "learning_rate": 3.4650353765323996e-05, + "loss": 1.023, + "num_input_tokens_seen": 150259616, + "step": 9338 + }, + { + "epoch": 0.6541805668654286, + "grad_norm": 3.894158363342285, + "learning_rate": 3.464335551663748e-05, + "loss": 1.0807, + "num_input_tokens_seen": 150275032, + "step": 9339 + }, + { + "epoch": 0.6542506151111578, + "grad_norm": 3.852571964263916, + "learning_rate": 3.463635726795097e-05, + "loss": 0.9884, + "num_input_tokens_seen": 150290496, + "step": 9340 + }, + { + "epoch": 0.6543206633568871, + "grad_norm": 3.6513307094573975, + "learning_rate": 3.4629359019264455e-05, + "loss": 1.0986, + "num_input_tokens_seen": 150306568, + "step": 9341 + }, + { + "epoch": 0.6543907116026163, + "grad_norm": 3.3683671951293945, + "learning_rate": 3.462236077057794e-05, + "loss": 0.9463, + "num_input_tokens_seen": 150322344, + "step": 9342 + }, + { + "epoch": 0.6544607598483455, + "grad_norm": 3.8671581745147705, + "learning_rate": 3.4615362521891426e-05, + "loss": 0.924, + "num_input_tokens_seen": 150337880, + "step": 9343 + }, + { + "epoch": 0.6545308080940748, + "grad_norm": 4.907329082489014, + "learning_rate": 3.46083642732049e-05, + "loss": 1.1068, + "num_input_tokens_seen": 150354192, + "step": 9344 + }, + { + "epoch": 0.654600856339804, + "grad_norm": 3.418144702911377, + "learning_rate": 3.460136602451839e-05, + "loss": 0.8838, + "num_input_tokens_seen": 150370400, + "step": 9345 + }, + { + "epoch": 0.6546709045855332, + "grad_norm": 4.830320358276367, + "learning_rate": 3.4594367775831885e-05, + "loss": 1.2755, + "num_input_tokens_seen": 150386784, + "step": 9346 + }, + { + "epoch": 0.6547409528312625, + "grad_norm": 3.5603537559509277, + "learning_rate": 3.458736952714536e-05, + "loss": 0.9978, + "num_input_tokens_seen": 150403168, + "step": 9347 + }, + { + "epoch": 0.6548110010769917, + "grad_norm": 4.346913814544678, + "learning_rate": 3.458037127845885e-05, + "loss": 1.2331, + "num_input_tokens_seen": 150419264, + "step": 9348 + }, + { + "epoch": 0.6548810493227211, + "grad_norm": 4.1000471115112305, + "learning_rate": 3.457337302977233e-05, + "loss": 0.9739, + "num_input_tokens_seen": 150434760, + "step": 9349 + }, + { + "epoch": 0.6549510975684503, + "grad_norm": 3.8785784244537354, + "learning_rate": 3.456637478108582e-05, + "loss": 1.0802, + "num_input_tokens_seen": 150450784, + "step": 9350 + }, + { + "epoch": 0.6550211458141795, + "grad_norm": 4.044277667999268, + "learning_rate": 3.455937653239931e-05, + "loss": 1.1494, + "num_input_tokens_seen": 150466704, + "step": 9351 + }, + { + "epoch": 0.6550911940599088, + "grad_norm": 4.537202835083008, + "learning_rate": 3.455237828371279e-05, + "loss": 1.3344, + "num_input_tokens_seen": 150482760, + "step": 9352 + }, + { + "epoch": 0.655161242305638, + "grad_norm": 3.4889261722564697, + "learning_rate": 3.454538003502628e-05, + "loss": 1.015, + "num_input_tokens_seen": 150498832, + "step": 9353 + }, + { + "epoch": 0.6552312905513673, + "grad_norm": 4.298145294189453, + "learning_rate": 3.453838178633975e-05, + "loss": 0.9788, + "num_input_tokens_seen": 150515040, + "step": 9354 + }, + { + "epoch": 0.6553013387970965, + "grad_norm": 4.11521053314209, + "learning_rate": 3.453138353765324e-05, + "loss": 1.0735, + "num_input_tokens_seen": 150531424, + "step": 9355 + }, + { + "epoch": 0.6553713870428257, + "grad_norm": 3.5442028045654297, + "learning_rate": 3.452438528896672e-05, + "loss": 0.9131, + "num_input_tokens_seen": 150547728, + "step": 9356 + }, + { + "epoch": 0.655441435288555, + "grad_norm": 3.299097776412964, + "learning_rate": 3.451738704028021e-05, + "loss": 0.8276, + "num_input_tokens_seen": 150564112, + "step": 9357 + }, + { + "epoch": 0.6555114835342842, + "grad_norm": 3.9883878231048584, + "learning_rate": 3.45103887915937e-05, + "loss": 1.1212, + "num_input_tokens_seen": 150580496, + "step": 9358 + }, + { + "epoch": 0.6555815317800134, + "grad_norm": 4.217438697814941, + "learning_rate": 3.450339054290718e-05, + "loss": 0.9461, + "num_input_tokens_seen": 150596880, + "step": 9359 + }, + { + "epoch": 0.6556515800257428, + "grad_norm": 4.405513286590576, + "learning_rate": 3.449639229422067e-05, + "loss": 1.2798, + "num_input_tokens_seen": 150613264, + "step": 9360 + }, + { + "epoch": 0.655721628271472, + "grad_norm": 3.7431020736694336, + "learning_rate": 3.4489394045534146e-05, + "loss": 1.0564, + "num_input_tokens_seen": 150629328, + "step": 9361 + }, + { + "epoch": 0.6557916765172013, + "grad_norm": 3.9886715412139893, + "learning_rate": 3.448239579684764e-05, + "loss": 1.0014, + "num_input_tokens_seen": 150645712, + "step": 9362 + }, + { + "epoch": 0.6558617247629305, + "grad_norm": 4.192348003387451, + "learning_rate": 3.447539754816113e-05, + "loss": 1.2656, + "num_input_tokens_seen": 150662096, + "step": 9363 + }, + { + "epoch": 0.6559317730086597, + "grad_norm": 3.472885847091675, + "learning_rate": 3.4468399299474605e-05, + "loss": 1.02, + "num_input_tokens_seen": 150678456, + "step": 9364 + }, + { + "epoch": 0.656001821254389, + "grad_norm": 5.588219165802002, + "learning_rate": 3.446140105078809e-05, + "loss": 1.0747, + "num_input_tokens_seen": 150694784, + "step": 9365 + }, + { + "epoch": 0.6560718695001182, + "grad_norm": 4.032818794250488, + "learning_rate": 3.4454402802101575e-05, + "loss": 0.9757, + "num_input_tokens_seen": 150710320, + "step": 9366 + }, + { + "epoch": 0.6561419177458474, + "grad_norm": 3.6986799240112305, + "learning_rate": 3.4447404553415064e-05, + "loss": 1.0547, + "num_input_tokens_seen": 150726704, + "step": 9367 + }, + { + "epoch": 0.6562119659915767, + "grad_norm": 4.5291008949279785, + "learning_rate": 3.444040630472855e-05, + "loss": 1.0274, + "num_input_tokens_seen": 150742008, + "step": 9368 + }, + { + "epoch": 0.6562820142373059, + "grad_norm": 4.255716800689697, + "learning_rate": 3.4433408056042034e-05, + "loss": 1.0174, + "num_input_tokens_seen": 150758392, + "step": 9369 + }, + { + "epoch": 0.6563520624830352, + "grad_norm": 4.012734889984131, + "learning_rate": 3.442640980735552e-05, + "loss": 1.1147, + "num_input_tokens_seen": 150774712, + "step": 9370 + }, + { + "epoch": 0.6564221107287644, + "grad_norm": 4.020108699798584, + "learning_rate": 3.4419411558669e-05, + "loss": 0.8917, + "num_input_tokens_seen": 150789832, + "step": 9371 + }, + { + "epoch": 0.6564921589744936, + "grad_norm": 3.965582847595215, + "learning_rate": 3.441241330998249e-05, + "loss": 1.1832, + "num_input_tokens_seen": 150805776, + "step": 9372 + }, + { + "epoch": 0.656562207220223, + "grad_norm": 3.7959187030792236, + "learning_rate": 3.440541506129598e-05, + "loss": 1.0402, + "num_input_tokens_seen": 150821984, + "step": 9373 + }, + { + "epoch": 0.6566322554659522, + "grad_norm": 3.8097522258758545, + "learning_rate": 3.439841681260946e-05, + "loss": 1.0232, + "num_input_tokens_seen": 150838368, + "step": 9374 + }, + { + "epoch": 0.6567023037116814, + "grad_norm": 4.871758937835693, + "learning_rate": 3.4391418563922945e-05, + "loss": 1.0021, + "num_input_tokens_seen": 150854632, + "step": 9375 + }, + { + "epoch": 0.6567723519574107, + "grad_norm": 3.6479547023773193, + "learning_rate": 3.438442031523643e-05, + "loss": 1.0738, + "num_input_tokens_seen": 150871016, + "step": 9376 + }, + { + "epoch": 0.6568424002031399, + "grad_norm": 3.564363479614258, + "learning_rate": 3.4377422066549916e-05, + "loss": 1.0555, + "num_input_tokens_seen": 150887272, + "step": 9377 + }, + { + "epoch": 0.6569124484488692, + "grad_norm": 3.351440906524658, + "learning_rate": 3.4370423817863404e-05, + "loss": 0.8708, + "num_input_tokens_seen": 150903584, + "step": 9378 + }, + { + "epoch": 0.6569824966945984, + "grad_norm": 5.013852119445801, + "learning_rate": 3.4363425569176886e-05, + "loss": 1.0927, + "num_input_tokens_seen": 150919968, + "step": 9379 + }, + { + "epoch": 0.6570525449403276, + "grad_norm": 3.935525894165039, + "learning_rate": 3.4356427320490375e-05, + "loss": 1.0541, + "num_input_tokens_seen": 150936352, + "step": 9380 + }, + { + "epoch": 0.6571225931860569, + "grad_norm": 3.793931484222412, + "learning_rate": 3.434942907180385e-05, + "loss": 0.9834, + "num_input_tokens_seen": 150951976, + "step": 9381 + }, + { + "epoch": 0.6571926414317861, + "grad_norm": 3.9709832668304443, + "learning_rate": 3.4342430823117345e-05, + "loss": 1.0398, + "num_input_tokens_seen": 150967696, + "step": 9382 + }, + { + "epoch": 0.6572626896775154, + "grad_norm": 3.6775920391082764, + "learning_rate": 3.433543257443082e-05, + "loss": 1.052, + "num_input_tokens_seen": 150984080, + "step": 9383 + }, + { + "epoch": 0.6573327379232446, + "grad_norm": 3.944552183151245, + "learning_rate": 3.432843432574431e-05, + "loss": 0.9619, + "num_input_tokens_seen": 151000464, + "step": 9384 + }, + { + "epoch": 0.6574027861689739, + "grad_norm": 4.884947776794434, + "learning_rate": 3.43214360770578e-05, + "loss": 0.962, + "num_input_tokens_seen": 151016336, + "step": 9385 + }, + { + "epoch": 0.6574728344147032, + "grad_norm": 3.7997472286224365, + "learning_rate": 3.431443782837128e-05, + "loss": 1.168, + "num_input_tokens_seen": 151032720, + "step": 9386 + }, + { + "epoch": 0.6575428826604324, + "grad_norm": 3.6229546070098877, + "learning_rate": 3.430743957968477e-05, + "loss": 1.041, + "num_input_tokens_seen": 151049104, + "step": 9387 + }, + { + "epoch": 0.6576129309061616, + "grad_norm": 3.550644636154175, + "learning_rate": 3.430044133099825e-05, + "loss": 0.9054, + "num_input_tokens_seen": 151065488, + "step": 9388 + }, + { + "epoch": 0.6576829791518909, + "grad_norm": 3.731940984725952, + "learning_rate": 3.429344308231174e-05, + "loss": 1.0072, + "num_input_tokens_seen": 151081872, + "step": 9389 + }, + { + "epoch": 0.6577530273976201, + "grad_norm": 4.065341949462891, + "learning_rate": 3.428644483362523e-05, + "loss": 1.2458, + "num_input_tokens_seen": 151098032, + "step": 9390 + }, + { + "epoch": 0.6578230756433494, + "grad_norm": 3.6913809776306152, + "learning_rate": 3.42794465849387e-05, + "loss": 1.0634, + "num_input_tokens_seen": 151114296, + "step": 9391 + }, + { + "epoch": 0.6578931238890786, + "grad_norm": 4.060647010803223, + "learning_rate": 3.42724483362522e-05, + "loss": 1.0957, + "num_input_tokens_seen": 151129696, + "step": 9392 + }, + { + "epoch": 0.6579631721348078, + "grad_norm": 3.552968978881836, + "learning_rate": 3.426545008756567e-05, + "loss": 1.0621, + "num_input_tokens_seen": 151146080, + "step": 9393 + }, + { + "epoch": 0.6580332203805371, + "grad_norm": 3.4175221920013428, + "learning_rate": 3.425845183887916e-05, + "loss": 1.0011, + "num_input_tokens_seen": 151162464, + "step": 9394 + }, + { + "epoch": 0.6581032686262663, + "grad_norm": 5.83115816116333, + "learning_rate": 3.425145359019265e-05, + "loss": 1.2742, + "num_input_tokens_seen": 151178848, + "step": 9395 + }, + { + "epoch": 0.6581733168719955, + "grad_norm": 3.870480537414551, + "learning_rate": 3.424445534150613e-05, + "loss": 1.0549, + "num_input_tokens_seen": 151195232, + "step": 9396 + }, + { + "epoch": 0.6582433651177249, + "grad_norm": 3.625668525695801, + "learning_rate": 3.423745709281962e-05, + "loss": 1.0928, + "num_input_tokens_seen": 151211120, + "step": 9397 + }, + { + "epoch": 0.6583134133634541, + "grad_norm": 3.956568956375122, + "learning_rate": 3.42304588441331e-05, + "loss": 1.1662, + "num_input_tokens_seen": 151227336, + "step": 9398 + }, + { + "epoch": 0.6583834616091834, + "grad_norm": 3.528535842895508, + "learning_rate": 3.422346059544659e-05, + "loss": 1.0421, + "num_input_tokens_seen": 151243688, + "step": 9399 + }, + { + "epoch": 0.6584535098549126, + "grad_norm": 3.476381540298462, + "learning_rate": 3.421646234676008e-05, + "loss": 0.936, + "num_input_tokens_seen": 151260072, + "step": 9400 + }, + { + "epoch": 0.6584535098549126, + "eval_loss": 1.1180161237716675, + "eval_runtime": 0.1789, + "eval_samples_per_second": 5.591, + "eval_steps_per_second": 5.591, + "num_input_tokens_seen": 151260072, + "step": 9400 + }, + { + "epoch": 0.6585235581006418, + "grad_norm": 3.5269508361816406, + "learning_rate": 3.4209464098073554e-05, + "loss": 1.0063, + "num_input_tokens_seen": 151276456, + "step": 9401 + }, + { + "epoch": 0.6585936063463711, + "grad_norm": 5.150668144226074, + "learning_rate": 3.420246584938705e-05, + "loss": 1.0781, + "num_input_tokens_seen": 151292496, + "step": 9402 + }, + { + "epoch": 0.6586636545921003, + "grad_norm": 3.690948009490967, + "learning_rate": 3.4195467600700524e-05, + "loss": 0.9374, + "num_input_tokens_seen": 151308448, + "step": 9403 + }, + { + "epoch": 0.6587337028378295, + "grad_norm": 3.9759581089019775, + "learning_rate": 3.418846935201401e-05, + "loss": 1.2101, + "num_input_tokens_seen": 151324832, + "step": 9404 + }, + { + "epoch": 0.6588037510835588, + "grad_norm": 4.860933780670166, + "learning_rate": 3.41814711033275e-05, + "loss": 1.0192, + "num_input_tokens_seen": 151341024, + "step": 9405 + }, + { + "epoch": 0.658873799329288, + "grad_norm": 5.007035255432129, + "learning_rate": 3.417447285464098e-05, + "loss": 1.0735, + "num_input_tokens_seen": 151357408, + "step": 9406 + }, + { + "epoch": 0.6589438475750173, + "grad_norm": 6.018434047698975, + "learning_rate": 3.416747460595447e-05, + "loss": 1.1597, + "num_input_tokens_seen": 151372816, + "step": 9407 + }, + { + "epoch": 0.6590138958207465, + "grad_norm": 3.7404720783233643, + "learning_rate": 3.4160476357267954e-05, + "loss": 1.0291, + "num_input_tokens_seen": 151389200, + "step": 9408 + }, + { + "epoch": 0.6590839440664757, + "grad_norm": 4.315739631652832, + "learning_rate": 3.415347810858144e-05, + "loss": 1.0067, + "num_input_tokens_seen": 151405272, + "step": 9409 + }, + { + "epoch": 0.6591539923122051, + "grad_norm": 4.934928894042969, + "learning_rate": 3.414647985989492e-05, + "loss": 0.8984, + "num_input_tokens_seen": 151421656, + "step": 9410 + }, + { + "epoch": 0.6592240405579343, + "grad_norm": 4.2328314781188965, + "learning_rate": 3.4139481611208406e-05, + "loss": 1.3312, + "num_input_tokens_seen": 151437936, + "step": 9411 + }, + { + "epoch": 0.6592940888036635, + "grad_norm": 4.081817150115967, + "learning_rate": 3.41324833625219e-05, + "loss": 1.2755, + "num_input_tokens_seen": 151453416, + "step": 9412 + }, + { + "epoch": 0.6593641370493928, + "grad_norm": 6.1436004638671875, + "learning_rate": 3.4125485113835376e-05, + "loss": 1.1423, + "num_input_tokens_seen": 151469800, + "step": 9413 + }, + { + "epoch": 0.659434185295122, + "grad_norm": 3.5476527214050293, + "learning_rate": 3.4118486865148865e-05, + "loss": 0.8508, + "num_input_tokens_seen": 151486184, + "step": 9414 + }, + { + "epoch": 0.6595042335408513, + "grad_norm": 5.349584579467773, + "learning_rate": 3.411148861646235e-05, + "loss": 1.0057, + "num_input_tokens_seen": 151502360, + "step": 9415 + }, + { + "epoch": 0.6595742817865805, + "grad_norm": 4.1030778884887695, + "learning_rate": 3.4104490367775835e-05, + "loss": 1.1046, + "num_input_tokens_seen": 151518704, + "step": 9416 + }, + { + "epoch": 0.6596443300323097, + "grad_norm": 3.699965238571167, + "learning_rate": 3.4097492119089324e-05, + "loss": 1.1088, + "num_input_tokens_seen": 151534288, + "step": 9417 + }, + { + "epoch": 0.659714378278039, + "grad_norm": 3.5363376140594482, + "learning_rate": 3.4090493870402806e-05, + "loss": 0.9081, + "num_input_tokens_seen": 151550672, + "step": 9418 + }, + { + "epoch": 0.6597844265237682, + "grad_norm": 4.74515962600708, + "learning_rate": 3.4083495621716294e-05, + "loss": 1.2152, + "num_input_tokens_seen": 151567056, + "step": 9419 + }, + { + "epoch": 0.6598544747694975, + "grad_norm": 3.872194766998291, + "learning_rate": 3.407649737302977e-05, + "loss": 1.0312, + "num_input_tokens_seen": 151583424, + "step": 9420 + }, + { + "epoch": 0.6599245230152267, + "grad_norm": 4.199446678161621, + "learning_rate": 3.406949912434326e-05, + "loss": 1.1338, + "num_input_tokens_seen": 151599160, + "step": 9421 + }, + { + "epoch": 0.659994571260956, + "grad_norm": 6.7564592361450195, + "learning_rate": 3.406250087565675e-05, + "loss": 1.1666, + "num_input_tokens_seen": 151615328, + "step": 9422 + }, + { + "epoch": 0.6600646195066853, + "grad_norm": 4.2356767654418945, + "learning_rate": 3.405550262697023e-05, + "loss": 0.972, + "num_input_tokens_seen": 151631304, + "step": 9423 + }, + { + "epoch": 0.6601346677524145, + "grad_norm": 3.542555332183838, + "learning_rate": 3.404850437828372e-05, + "loss": 0.9942, + "num_input_tokens_seen": 151647688, + "step": 9424 + }, + { + "epoch": 0.6602047159981437, + "grad_norm": 4.248206615447998, + "learning_rate": 3.40415061295972e-05, + "loss": 0.9974, + "num_input_tokens_seen": 151664072, + "step": 9425 + }, + { + "epoch": 0.660274764243873, + "grad_norm": 3.905214786529541, + "learning_rate": 3.403450788091069e-05, + "loss": 1.0054, + "num_input_tokens_seen": 151680456, + "step": 9426 + }, + { + "epoch": 0.6603448124896022, + "grad_norm": 6.451211929321289, + "learning_rate": 3.4027509632224176e-05, + "loss": 1.2838, + "num_input_tokens_seen": 151696840, + "step": 9427 + }, + { + "epoch": 0.6604148607353315, + "grad_norm": 3.9670305252075195, + "learning_rate": 3.402051138353766e-05, + "loss": 1.0709, + "num_input_tokens_seen": 151713224, + "step": 9428 + }, + { + "epoch": 0.6604849089810607, + "grad_norm": 4.485940456390381, + "learning_rate": 3.4013513134851146e-05, + "loss": 1.1487, + "num_input_tokens_seen": 151729000, + "step": 9429 + }, + { + "epoch": 0.6605549572267899, + "grad_norm": 3.700206995010376, + "learning_rate": 3.400651488616462e-05, + "loss": 0.9172, + "num_input_tokens_seen": 151745304, + "step": 9430 + }, + { + "epoch": 0.6606250054725192, + "grad_norm": 3.323678970336914, + "learning_rate": 3.399951663747811e-05, + "loss": 0.8349, + "num_input_tokens_seen": 151761592, + "step": 9431 + }, + { + "epoch": 0.6606950537182484, + "grad_norm": 4.245424747467041, + "learning_rate": 3.3992518388791605e-05, + "loss": 1.1232, + "num_input_tokens_seen": 151777976, + "step": 9432 + }, + { + "epoch": 0.6607651019639776, + "grad_norm": 4.301302909851074, + "learning_rate": 3.398552014010508e-05, + "loss": 0.9911, + "num_input_tokens_seen": 151793656, + "step": 9433 + }, + { + "epoch": 0.660835150209707, + "grad_norm": 3.6650359630584717, + "learning_rate": 3.397852189141857e-05, + "loss": 1.0688, + "num_input_tokens_seen": 151810040, + "step": 9434 + }, + { + "epoch": 0.6609051984554362, + "grad_norm": 4.1109418869018555, + "learning_rate": 3.397152364273205e-05, + "loss": 1.0724, + "num_input_tokens_seen": 151826392, + "step": 9435 + }, + { + "epoch": 0.6609752467011655, + "grad_norm": 4.112772464752197, + "learning_rate": 3.396452539404554e-05, + "loss": 0.898, + "num_input_tokens_seen": 151842776, + "step": 9436 + }, + { + "epoch": 0.6610452949468947, + "grad_norm": 3.760443925857544, + "learning_rate": 3.3957527145359014e-05, + "loss": 1.1901, + "num_input_tokens_seen": 151859160, + "step": 9437 + }, + { + "epoch": 0.6611153431926239, + "grad_norm": 3.621912956237793, + "learning_rate": 3.395052889667251e-05, + "loss": 0.8056, + "num_input_tokens_seen": 151875544, + "step": 9438 + }, + { + "epoch": 0.6611853914383532, + "grad_norm": 3.9284732341766357, + "learning_rate": 3.3943530647986e-05, + "loss": 1.2642, + "num_input_tokens_seen": 151891928, + "step": 9439 + }, + { + "epoch": 0.6612554396840824, + "grad_norm": 4.0845136642456055, + "learning_rate": 3.393653239929947e-05, + "loss": 1.0544, + "num_input_tokens_seen": 151907824, + "step": 9440 + }, + { + "epoch": 0.6613254879298116, + "grad_norm": 6.096484184265137, + "learning_rate": 3.392953415061296e-05, + "loss": 1.1214, + "num_input_tokens_seen": 151924208, + "step": 9441 + }, + { + "epoch": 0.6613955361755409, + "grad_norm": 3.7888896465301514, + "learning_rate": 3.3922535901926444e-05, + "loss": 1.1715, + "num_input_tokens_seen": 151940048, + "step": 9442 + }, + { + "epoch": 0.6614655844212701, + "grad_norm": 3.5970754623413086, + "learning_rate": 3.391553765323993e-05, + "loss": 0.9706, + "num_input_tokens_seen": 151956432, + "step": 9443 + }, + { + "epoch": 0.6615356326669994, + "grad_norm": 3.444704055786133, + "learning_rate": 3.390853940455342e-05, + "loss": 0.875, + "num_input_tokens_seen": 151972816, + "step": 9444 + }, + { + "epoch": 0.6616056809127286, + "grad_norm": 6.007751941680908, + "learning_rate": 3.39015411558669e-05, + "loss": 1.0048, + "num_input_tokens_seen": 151989200, + "step": 9445 + }, + { + "epoch": 0.6616757291584578, + "grad_norm": 3.6840016841888428, + "learning_rate": 3.389454290718039e-05, + "loss": 1.1341, + "num_input_tokens_seen": 152005576, + "step": 9446 + }, + { + "epoch": 0.6617457774041872, + "grad_norm": 4.422646522521973, + "learning_rate": 3.3887544658493866e-05, + "loss": 1.0368, + "num_input_tokens_seen": 152021768, + "step": 9447 + }, + { + "epoch": 0.6618158256499164, + "grad_norm": 6.254637718200684, + "learning_rate": 3.388054640980736e-05, + "loss": 1.0334, + "num_input_tokens_seen": 152038152, + "step": 9448 + }, + { + "epoch": 0.6618858738956456, + "grad_norm": 3.8378922939300537, + "learning_rate": 3.387354816112085e-05, + "loss": 1.068, + "num_input_tokens_seen": 152054536, + "step": 9449 + }, + { + "epoch": 0.6619559221413749, + "grad_norm": 3.7315855026245117, + "learning_rate": 3.3866549912434325e-05, + "loss": 0.951, + "num_input_tokens_seen": 152070920, + "step": 9450 + }, + { + "epoch": 0.6620259703871041, + "grad_norm": 3.883901834487915, + "learning_rate": 3.3859551663747814e-05, + "loss": 1.1214, + "num_input_tokens_seen": 152086456, + "step": 9451 + }, + { + "epoch": 0.6620960186328334, + "grad_norm": 5.336439609527588, + "learning_rate": 3.3852553415061296e-05, + "loss": 0.9823, + "num_input_tokens_seen": 152102440, + "step": 9452 + }, + { + "epoch": 0.6621660668785626, + "grad_norm": 3.723749876022339, + "learning_rate": 3.3845555166374784e-05, + "loss": 1.1013, + "num_input_tokens_seen": 152118440, + "step": 9453 + }, + { + "epoch": 0.6622361151242918, + "grad_norm": 4.6532301902771, + "learning_rate": 3.383855691768827e-05, + "loss": 0.9636, + "num_input_tokens_seen": 152134824, + "step": 9454 + }, + { + "epoch": 0.6623061633700211, + "grad_norm": 4.187502861022949, + "learning_rate": 3.3831558669001755e-05, + "loss": 1.0313, + "num_input_tokens_seen": 152151208, + "step": 9455 + }, + { + "epoch": 0.6623762116157503, + "grad_norm": 4.200648307800293, + "learning_rate": 3.382456042031524e-05, + "loss": 1.1569, + "num_input_tokens_seen": 152167592, + "step": 9456 + }, + { + "epoch": 0.6624462598614796, + "grad_norm": 3.9999473094940186, + "learning_rate": 3.381756217162872e-05, + "loss": 1.166, + "num_input_tokens_seen": 152183976, + "step": 9457 + }, + { + "epoch": 0.6625163081072089, + "grad_norm": 3.662642002105713, + "learning_rate": 3.3810563922942214e-05, + "loss": 0.8551, + "num_input_tokens_seen": 152200344, + "step": 9458 + }, + { + "epoch": 0.662586356352938, + "grad_norm": 4.253822326660156, + "learning_rate": 3.380356567425569e-05, + "loss": 0.9508, + "num_input_tokens_seen": 152216728, + "step": 9459 + }, + { + "epoch": 0.6626564045986674, + "grad_norm": 4.651457786560059, + "learning_rate": 3.379656742556918e-05, + "loss": 0.8668, + "num_input_tokens_seen": 152232888, + "step": 9460 + }, + { + "epoch": 0.6627264528443966, + "grad_norm": 3.5644173622131348, + "learning_rate": 3.3789569176882666e-05, + "loss": 0.843, + "num_input_tokens_seen": 152248992, + "step": 9461 + }, + { + "epoch": 0.6627965010901258, + "grad_norm": 3.687054395675659, + "learning_rate": 3.378257092819615e-05, + "loss": 1.0215, + "num_input_tokens_seen": 152265200, + "step": 9462 + }, + { + "epoch": 0.6628665493358551, + "grad_norm": 3.6727592945098877, + "learning_rate": 3.3775572679509636e-05, + "loss": 0.863, + "num_input_tokens_seen": 152281584, + "step": 9463 + }, + { + "epoch": 0.6629365975815843, + "grad_norm": 4.446970462799072, + "learning_rate": 3.376857443082312e-05, + "loss": 1.075, + "num_input_tokens_seen": 152297664, + "step": 9464 + }, + { + "epoch": 0.6630066458273136, + "grad_norm": 4.436252593994141, + "learning_rate": 3.376157618213661e-05, + "loss": 0.9757, + "num_input_tokens_seen": 152313576, + "step": 9465 + }, + { + "epoch": 0.6630766940730428, + "grad_norm": 4.0958571434021, + "learning_rate": 3.3754577933450095e-05, + "loss": 1.1412, + "num_input_tokens_seen": 152329168, + "step": 9466 + }, + { + "epoch": 0.663146742318772, + "grad_norm": 3.964369297027588, + "learning_rate": 3.374757968476357e-05, + "loss": 1.1048, + "num_input_tokens_seen": 152345552, + "step": 9467 + }, + { + "epoch": 0.6632167905645013, + "grad_norm": 3.828552007675171, + "learning_rate": 3.3740581436077066e-05, + "loss": 0.9588, + "num_input_tokens_seen": 152361752, + "step": 9468 + }, + { + "epoch": 0.6632868388102305, + "grad_norm": 7.937028884887695, + "learning_rate": 3.373358318739054e-05, + "loss": 0.9826, + "num_input_tokens_seen": 152377168, + "step": 9469 + }, + { + "epoch": 0.6633568870559597, + "grad_norm": 7.392448902130127, + "learning_rate": 3.372658493870403e-05, + "loss": 1.2274, + "num_input_tokens_seen": 152393512, + "step": 9470 + }, + { + "epoch": 0.6634269353016891, + "grad_norm": 4.474769592285156, + "learning_rate": 3.371958669001752e-05, + "loss": 1.084, + "num_input_tokens_seen": 152409896, + "step": 9471 + }, + { + "epoch": 0.6634969835474183, + "grad_norm": 5.453771114349365, + "learning_rate": 3.3712588441331e-05, + "loss": 1.0788, + "num_input_tokens_seen": 152424776, + "step": 9472 + }, + { + "epoch": 0.6635670317931476, + "grad_norm": 3.9744513034820557, + "learning_rate": 3.370559019264449e-05, + "loss": 0.8917, + "num_input_tokens_seen": 152441160, + "step": 9473 + }, + { + "epoch": 0.6636370800388768, + "grad_norm": 3.5041935443878174, + "learning_rate": 3.369859194395797e-05, + "loss": 1.0637, + "num_input_tokens_seen": 152456488, + "step": 9474 + }, + { + "epoch": 0.663707128284606, + "grad_norm": 3.9633948802948, + "learning_rate": 3.369159369527146e-05, + "loss": 1.041, + "num_input_tokens_seen": 152472872, + "step": 9475 + }, + { + "epoch": 0.6637771765303353, + "grad_norm": 3.540529489517212, + "learning_rate": 3.368459544658495e-05, + "loss": 0.966, + "num_input_tokens_seen": 152488664, + "step": 9476 + }, + { + "epoch": 0.6638472247760645, + "grad_norm": 3.701507806777954, + "learning_rate": 3.367759719789842e-05, + "loss": 1.112, + "num_input_tokens_seen": 152505048, + "step": 9477 + }, + { + "epoch": 0.6639172730217937, + "grad_norm": 3.892002582550049, + "learning_rate": 3.367059894921192e-05, + "loss": 0.9215, + "num_input_tokens_seen": 152521432, + "step": 9478 + }, + { + "epoch": 0.663987321267523, + "grad_norm": 5.297150611877441, + "learning_rate": 3.366360070052539e-05, + "loss": 1.1723, + "num_input_tokens_seen": 152537616, + "step": 9479 + }, + { + "epoch": 0.6640573695132522, + "grad_norm": 8.54346752166748, + "learning_rate": 3.365660245183888e-05, + "loss": 0.9812, + "num_input_tokens_seen": 152553184, + "step": 9480 + }, + { + "epoch": 0.6641274177589815, + "grad_norm": 3.787071466445923, + "learning_rate": 3.364960420315237e-05, + "loss": 1.0059, + "num_input_tokens_seen": 152569392, + "step": 9481 + }, + { + "epoch": 0.6641974660047107, + "grad_norm": 6.331751823425293, + "learning_rate": 3.364260595446585e-05, + "loss": 1.0972, + "num_input_tokens_seen": 152584976, + "step": 9482 + }, + { + "epoch": 0.66426751425044, + "grad_norm": 3.77357816696167, + "learning_rate": 3.363560770577934e-05, + "loss": 1.149, + "num_input_tokens_seen": 152601360, + "step": 9483 + }, + { + "epoch": 0.6643375624961693, + "grad_norm": 3.870004415512085, + "learning_rate": 3.362860945709282e-05, + "loss": 0.848, + "num_input_tokens_seen": 152616816, + "step": 9484 + }, + { + "epoch": 0.6644076107418985, + "grad_norm": 4.989454746246338, + "learning_rate": 3.362161120840631e-05, + "loss": 0.9917, + "num_input_tokens_seen": 152632280, + "step": 9485 + }, + { + "epoch": 0.6644776589876278, + "grad_norm": 3.9458837509155273, + "learning_rate": 3.3614612959719786e-05, + "loss": 0.9944, + "num_input_tokens_seen": 152647720, + "step": 9486 + }, + { + "epoch": 0.664547707233357, + "grad_norm": 4.702520847320557, + "learning_rate": 3.3607614711033274e-05, + "loss": 1.2036, + "num_input_tokens_seen": 152664104, + "step": 9487 + }, + { + "epoch": 0.6646177554790862, + "grad_norm": 3.8816921710968018, + "learning_rate": 3.360061646234677e-05, + "loss": 1.0434, + "num_input_tokens_seen": 152680488, + "step": 9488 + }, + { + "epoch": 0.6646878037248155, + "grad_norm": 3.8652398586273193, + "learning_rate": 3.3593618213660245e-05, + "loss": 1.065, + "num_input_tokens_seen": 152695968, + "step": 9489 + }, + { + "epoch": 0.6647578519705447, + "grad_norm": 3.4198479652404785, + "learning_rate": 3.3586619964973733e-05, + "loss": 1.0802, + "num_input_tokens_seen": 152712184, + "step": 9490 + }, + { + "epoch": 0.6648279002162739, + "grad_norm": 3.654895544052124, + "learning_rate": 3.3579621716287215e-05, + "loss": 0.8913, + "num_input_tokens_seen": 152728136, + "step": 9491 + }, + { + "epoch": 0.6648979484620032, + "grad_norm": 4.547909736633301, + "learning_rate": 3.3572623467600704e-05, + "loss": 0.8603, + "num_input_tokens_seen": 152744520, + "step": 9492 + }, + { + "epoch": 0.6649679967077324, + "grad_norm": 3.9165170192718506, + "learning_rate": 3.356562521891419e-05, + "loss": 1.153, + "num_input_tokens_seen": 152760904, + "step": 9493 + }, + { + "epoch": 0.6650380449534617, + "grad_norm": 3.5142409801483154, + "learning_rate": 3.3558626970227674e-05, + "loss": 1.079, + "num_input_tokens_seen": 152777288, + "step": 9494 + }, + { + "epoch": 0.665108093199191, + "grad_norm": 5.543309688568115, + "learning_rate": 3.355162872154116e-05, + "loss": 1.2104, + "num_input_tokens_seen": 152793672, + "step": 9495 + }, + { + "epoch": 0.6651781414449202, + "grad_norm": 3.869401693344116, + "learning_rate": 3.354463047285464e-05, + "loss": 0.989, + "num_input_tokens_seen": 152809920, + "step": 9496 + }, + { + "epoch": 0.6652481896906495, + "grad_norm": 5.091144561767578, + "learning_rate": 3.3537632224168126e-05, + "loss": 1.1099, + "num_input_tokens_seen": 152825408, + "step": 9497 + }, + { + "epoch": 0.6653182379363787, + "grad_norm": 3.599029302597046, + "learning_rate": 3.353063397548162e-05, + "loss": 1.0926, + "num_input_tokens_seen": 152841792, + "step": 9498 + }, + { + "epoch": 0.6653882861821079, + "grad_norm": 4.032376289367676, + "learning_rate": 3.35236357267951e-05, + "loss": 1.292, + "num_input_tokens_seen": 152858176, + "step": 9499 + }, + { + "epoch": 0.6654583344278372, + "grad_norm": 5.079924583435059, + "learning_rate": 3.3516637478108585e-05, + "loss": 1.0657, + "num_input_tokens_seen": 152873296, + "step": 9500 + }, + { + "epoch": 0.6655283826735664, + "grad_norm": 5.513190746307373, + "learning_rate": 3.350963922942207e-05, + "loss": 1.1515, + "num_input_tokens_seen": 152888280, + "step": 9501 + }, + { + "epoch": 0.6655984309192957, + "grad_norm": 3.9420206546783447, + "learning_rate": 3.3502640980735556e-05, + "loss": 1.051, + "num_input_tokens_seen": 152904008, + "step": 9502 + }, + { + "epoch": 0.6656684791650249, + "grad_norm": 4.6618123054504395, + "learning_rate": 3.3495642732049044e-05, + "loss": 0.9585, + "num_input_tokens_seen": 152920088, + "step": 9503 + }, + { + "epoch": 0.6657385274107541, + "grad_norm": 3.9407856464385986, + "learning_rate": 3.3488644483362526e-05, + "loss": 1.043, + "num_input_tokens_seen": 152936472, + "step": 9504 + }, + { + "epoch": 0.6658085756564834, + "grad_norm": 3.4341776371002197, + "learning_rate": 3.3481646234676015e-05, + "loss": 1.1322, + "num_input_tokens_seen": 152952416, + "step": 9505 + }, + { + "epoch": 0.6658786239022126, + "grad_norm": 4.084626197814941, + "learning_rate": 3.347464798598949e-05, + "loss": 1.1713, + "num_input_tokens_seen": 152968448, + "step": 9506 + }, + { + "epoch": 0.6659486721479418, + "grad_norm": 3.8343982696533203, + "learning_rate": 3.346764973730298e-05, + "loss": 1.0054, + "num_input_tokens_seen": 152984744, + "step": 9507 + }, + { + "epoch": 0.6660187203936712, + "grad_norm": 3.8985395431518555, + "learning_rate": 3.3460651488616474e-05, + "loss": 1.1044, + "num_input_tokens_seen": 153001128, + "step": 9508 + }, + { + "epoch": 0.6660887686394004, + "grad_norm": 3.4615519046783447, + "learning_rate": 3.345365323992995e-05, + "loss": 0.9899, + "num_input_tokens_seen": 153017328, + "step": 9509 + }, + { + "epoch": 0.6661588168851297, + "grad_norm": 4.714277744293213, + "learning_rate": 3.344665499124344e-05, + "loss": 1.1478, + "num_input_tokens_seen": 153032424, + "step": 9510 + }, + { + "epoch": 0.6662288651308589, + "grad_norm": 3.7734928131103516, + "learning_rate": 3.343965674255692e-05, + "loss": 0.9304, + "num_input_tokens_seen": 153048536, + "step": 9511 + }, + { + "epoch": 0.6662989133765881, + "grad_norm": 3.9096951484680176, + "learning_rate": 3.343265849387041e-05, + "loss": 0.9934, + "num_input_tokens_seen": 153064864, + "step": 9512 + }, + { + "epoch": 0.6663689616223174, + "grad_norm": 4.22433614730835, + "learning_rate": 3.342566024518388e-05, + "loss": 1.1047, + "num_input_tokens_seen": 153081248, + "step": 9513 + }, + { + "epoch": 0.6664390098680466, + "grad_norm": 3.6338961124420166, + "learning_rate": 3.341866199649738e-05, + "loss": 1.1064, + "num_input_tokens_seen": 153097192, + "step": 9514 + }, + { + "epoch": 0.6665090581137758, + "grad_norm": 3.903615713119507, + "learning_rate": 3.341166374781087e-05, + "loss": 0.8592, + "num_input_tokens_seen": 153113088, + "step": 9515 + }, + { + "epoch": 0.6665791063595051, + "grad_norm": 5.644516468048096, + "learning_rate": 3.340466549912434e-05, + "loss": 1.1684, + "num_input_tokens_seen": 153129472, + "step": 9516 + }, + { + "epoch": 0.6666491546052343, + "grad_norm": 3.7316622734069824, + "learning_rate": 3.339766725043783e-05, + "loss": 1.0655, + "num_input_tokens_seen": 153145856, + "step": 9517 + }, + { + "epoch": 0.6667192028509636, + "grad_norm": 4.2488908767700195, + "learning_rate": 3.339066900175131e-05, + "loss": 0.9961, + "num_input_tokens_seen": 153162224, + "step": 9518 + }, + { + "epoch": 0.6667892510966928, + "grad_norm": 4.319308757781982, + "learning_rate": 3.33836707530648e-05, + "loss": 0.9599, + "num_input_tokens_seen": 153178608, + "step": 9519 + }, + { + "epoch": 0.666859299342422, + "grad_norm": 3.928856372833252, + "learning_rate": 3.337667250437829e-05, + "loss": 1.3008, + "num_input_tokens_seen": 153194992, + "step": 9520 + }, + { + "epoch": 0.6669293475881514, + "grad_norm": 3.890669822692871, + "learning_rate": 3.336967425569177e-05, + "loss": 1.1411, + "num_input_tokens_seen": 153211376, + "step": 9521 + }, + { + "epoch": 0.6669993958338806, + "grad_norm": 3.9828126430511475, + "learning_rate": 3.336267600700526e-05, + "loss": 0.9936, + "num_input_tokens_seen": 153227760, + "step": 9522 + }, + { + "epoch": 0.6670694440796099, + "grad_norm": 4.098526477813721, + "learning_rate": 3.3355677758318735e-05, + "loss": 1.0702, + "num_input_tokens_seen": 153244144, + "step": 9523 + }, + { + "epoch": 0.6671394923253391, + "grad_norm": 3.6568076610565186, + "learning_rate": 3.334867950963223e-05, + "loss": 1.023, + "num_input_tokens_seen": 153260528, + "step": 9524 + }, + { + "epoch": 0.6672095405710683, + "grad_norm": 3.8984262943267822, + "learning_rate": 3.334168126094572e-05, + "loss": 1.1156, + "num_input_tokens_seen": 153276912, + "step": 9525 + }, + { + "epoch": 0.6672795888167976, + "grad_norm": 3.629810094833374, + "learning_rate": 3.3334683012259194e-05, + "loss": 0.9147, + "num_input_tokens_seen": 153293296, + "step": 9526 + }, + { + "epoch": 0.6673496370625268, + "grad_norm": 4.0950775146484375, + "learning_rate": 3.332768476357268e-05, + "loss": 1.2899, + "num_input_tokens_seen": 153309680, + "step": 9527 + }, + { + "epoch": 0.667419685308256, + "grad_norm": 3.8461296558380127, + "learning_rate": 3.3320686514886164e-05, + "loss": 1.012, + "num_input_tokens_seen": 153325248, + "step": 9528 + }, + { + "epoch": 0.6674897335539853, + "grad_norm": 5.369377136230469, + "learning_rate": 3.331368826619965e-05, + "loss": 0.8388, + "num_input_tokens_seen": 153341392, + "step": 9529 + }, + { + "epoch": 0.6675597817997145, + "grad_norm": 7.369980812072754, + "learning_rate": 3.330669001751314e-05, + "loss": 1.2141, + "num_input_tokens_seen": 153357776, + "step": 9530 + }, + { + "epoch": 0.6676298300454439, + "grad_norm": 3.59489369392395, + "learning_rate": 3.329969176882662e-05, + "loss": 0.956, + "num_input_tokens_seen": 153374160, + "step": 9531 + }, + { + "epoch": 0.667699878291173, + "grad_norm": 3.9876575469970703, + "learning_rate": 3.329269352014011e-05, + "loss": 0.9642, + "num_input_tokens_seen": 153390416, + "step": 9532 + }, + { + "epoch": 0.6677699265369023, + "grad_norm": 4.399029731750488, + "learning_rate": 3.328569527145359e-05, + "loss": 1.0592, + "num_input_tokens_seen": 153406320, + "step": 9533 + }, + { + "epoch": 0.6678399747826316, + "grad_norm": 5.089081764221191, + "learning_rate": 3.327869702276708e-05, + "loss": 0.989, + "num_input_tokens_seen": 153422536, + "step": 9534 + }, + { + "epoch": 0.6679100230283608, + "grad_norm": 4.612379550933838, + "learning_rate": 3.327169877408057e-05, + "loss": 1.1272, + "num_input_tokens_seen": 153438640, + "step": 9535 + }, + { + "epoch": 0.66798007127409, + "grad_norm": 3.9107158184051514, + "learning_rate": 3.3264700525394046e-05, + "loss": 0.9679, + "num_input_tokens_seen": 153455024, + "step": 9536 + }, + { + "epoch": 0.6680501195198193, + "grad_norm": 4.265791416168213, + "learning_rate": 3.3257702276707535e-05, + "loss": 1.1185, + "num_input_tokens_seen": 153471304, + "step": 9537 + }, + { + "epoch": 0.6681201677655485, + "grad_norm": 3.600353479385376, + "learning_rate": 3.3250704028021016e-05, + "loss": 0.9221, + "num_input_tokens_seen": 153487688, + "step": 9538 + }, + { + "epoch": 0.6681902160112778, + "grad_norm": 3.841486930847168, + "learning_rate": 3.3243705779334505e-05, + "loss": 1.0915, + "num_input_tokens_seen": 153504072, + "step": 9539 + }, + { + "epoch": 0.668260264257007, + "grad_norm": 3.8733341693878174, + "learning_rate": 3.323670753064799e-05, + "loss": 0.994, + "num_input_tokens_seen": 153519368, + "step": 9540 + }, + { + "epoch": 0.6683303125027362, + "grad_norm": 3.599177360534668, + "learning_rate": 3.3229709281961475e-05, + "loss": 1.119, + "num_input_tokens_seen": 153535752, + "step": 9541 + }, + { + "epoch": 0.6684003607484655, + "grad_norm": 3.652437925338745, + "learning_rate": 3.3222711033274964e-05, + "loss": 0.9226, + "num_input_tokens_seen": 153551752, + "step": 9542 + }, + { + "epoch": 0.6684704089941947, + "grad_norm": 3.7662746906280518, + "learning_rate": 3.321571278458844e-05, + "loss": 1.0907, + "num_input_tokens_seen": 153568136, + "step": 9543 + }, + { + "epoch": 0.668540457239924, + "grad_norm": 3.631655216217041, + "learning_rate": 3.3208714535901934e-05, + "loss": 0.9983, + "num_input_tokens_seen": 153583256, + "step": 9544 + }, + { + "epoch": 0.6686105054856533, + "grad_norm": 4.06046199798584, + "learning_rate": 3.320171628721541e-05, + "loss": 1.1752, + "num_input_tokens_seen": 153599552, + "step": 9545 + }, + { + "epoch": 0.6686805537313825, + "grad_norm": 4.241738796234131, + "learning_rate": 3.31947180385289e-05, + "loss": 0.95, + "num_input_tokens_seen": 153614752, + "step": 9546 + }, + { + "epoch": 0.6687506019771118, + "grad_norm": 4.933783531188965, + "learning_rate": 3.3187719789842387e-05, + "loss": 1.0228, + "num_input_tokens_seen": 153631096, + "step": 9547 + }, + { + "epoch": 0.668820650222841, + "grad_norm": 3.831364393234253, + "learning_rate": 3.318072154115587e-05, + "loss": 1.2669, + "num_input_tokens_seen": 153647416, + "step": 9548 + }, + { + "epoch": 0.6688906984685702, + "grad_norm": 3.838050365447998, + "learning_rate": 3.317372329246936e-05, + "loss": 1.1594, + "num_input_tokens_seen": 153663656, + "step": 9549 + }, + { + "epoch": 0.6689607467142995, + "grad_norm": 3.6540346145629883, + "learning_rate": 3.316672504378284e-05, + "loss": 1.1004, + "num_input_tokens_seen": 153679816, + "step": 9550 + }, + { + "epoch": 0.6690307949600287, + "grad_norm": 4.717043399810791, + "learning_rate": 3.315972679509633e-05, + "loss": 0.9198, + "num_input_tokens_seen": 153696200, + "step": 9551 + }, + { + "epoch": 0.6691008432057579, + "grad_norm": 4.063976287841797, + "learning_rate": 3.3152728546409816e-05, + "loss": 0.9141, + "num_input_tokens_seen": 153712584, + "step": 9552 + }, + { + "epoch": 0.6691708914514872, + "grad_norm": 4.467658042907715, + "learning_rate": 3.314573029772329e-05, + "loss": 1.0497, + "num_input_tokens_seen": 153728824, + "step": 9553 + }, + { + "epoch": 0.6692409396972164, + "grad_norm": 3.6834609508514404, + "learning_rate": 3.3138732049036786e-05, + "loss": 0.9845, + "num_input_tokens_seen": 153745208, + "step": 9554 + }, + { + "epoch": 0.6693109879429457, + "grad_norm": 3.4636154174804688, + "learning_rate": 3.313173380035026e-05, + "loss": 1.0364, + "num_input_tokens_seen": 153761544, + "step": 9555 + }, + { + "epoch": 0.669381036188675, + "grad_norm": 3.738222360610962, + "learning_rate": 3.312473555166375e-05, + "loss": 0.9403, + "num_input_tokens_seen": 153776952, + "step": 9556 + }, + { + "epoch": 0.6694510844344042, + "grad_norm": 3.5253403186798096, + "learning_rate": 3.311773730297724e-05, + "loss": 0.9064, + "num_input_tokens_seen": 153793256, + "step": 9557 + }, + { + "epoch": 0.6695211326801335, + "grad_norm": 3.7795767784118652, + "learning_rate": 3.311073905429072e-05, + "loss": 1.0937, + "num_input_tokens_seen": 153809112, + "step": 9558 + }, + { + "epoch": 0.6695911809258627, + "grad_norm": 9.336627006530762, + "learning_rate": 3.310374080560421e-05, + "loss": 1.1717, + "num_input_tokens_seen": 153823208, + "step": 9559 + }, + { + "epoch": 0.669661229171592, + "grad_norm": 3.6486527919769287, + "learning_rate": 3.309674255691769e-05, + "loss": 1.056, + "num_input_tokens_seen": 153839592, + "step": 9560 + }, + { + "epoch": 0.6697312774173212, + "grad_norm": 5.983423233032227, + "learning_rate": 3.308974430823118e-05, + "loss": 1.0208, + "num_input_tokens_seen": 153855872, + "step": 9561 + }, + { + "epoch": 0.6698013256630504, + "grad_norm": 3.978843927383423, + "learning_rate": 3.308274605954467e-05, + "loss": 1.0714, + "num_input_tokens_seen": 153872256, + "step": 9562 + }, + { + "epoch": 0.6698713739087797, + "grad_norm": 3.920125722885132, + "learning_rate": 3.307574781085814e-05, + "loss": 1.1294, + "num_input_tokens_seen": 153888464, + "step": 9563 + }, + { + "epoch": 0.6699414221545089, + "grad_norm": 4.259881019592285, + "learning_rate": 3.306874956217164e-05, + "loss": 1.1078, + "num_input_tokens_seen": 153904848, + "step": 9564 + }, + { + "epoch": 0.6700114704002381, + "grad_norm": 3.7385001182556152, + "learning_rate": 3.3061751313485113e-05, + "loss": 0.9286, + "num_input_tokens_seen": 153920664, + "step": 9565 + }, + { + "epoch": 0.6700815186459674, + "grad_norm": 4.7608418464660645, + "learning_rate": 3.30547530647986e-05, + "loss": 0.9329, + "num_input_tokens_seen": 153937048, + "step": 9566 + }, + { + "epoch": 0.6701515668916966, + "grad_norm": 3.9172866344451904, + "learning_rate": 3.3047754816112084e-05, + "loss": 0.9722, + "num_input_tokens_seen": 153953432, + "step": 9567 + }, + { + "epoch": 0.670221615137426, + "grad_norm": 5.326093673706055, + "learning_rate": 3.304075656742557e-05, + "loss": 1.2063, + "num_input_tokens_seen": 153969392, + "step": 9568 + }, + { + "epoch": 0.6702916633831552, + "grad_norm": 3.9417648315429688, + "learning_rate": 3.303375831873906e-05, + "loss": 1.3491, + "num_input_tokens_seen": 153985264, + "step": 9569 + }, + { + "epoch": 0.6703617116288844, + "grad_norm": 3.791285276412964, + "learning_rate": 3.302676007005254e-05, + "loss": 0.8604, + "num_input_tokens_seen": 154001080, + "step": 9570 + }, + { + "epoch": 0.6704317598746137, + "grad_norm": 4.892256259918213, + "learning_rate": 3.301976182136603e-05, + "loss": 1.0985, + "num_input_tokens_seen": 154016240, + "step": 9571 + }, + { + "epoch": 0.6705018081203429, + "grad_norm": 5.776528835296631, + "learning_rate": 3.3012763572679506e-05, + "loss": 0.9167, + "num_input_tokens_seen": 154032208, + "step": 9572 + }, + { + "epoch": 0.6705718563660721, + "grad_norm": 5.320072650909424, + "learning_rate": 3.3005765323992995e-05, + "loss": 1.0268, + "num_input_tokens_seen": 154048152, + "step": 9573 + }, + { + "epoch": 0.6706419046118014, + "grad_norm": 3.5747718811035156, + "learning_rate": 3.299876707530649e-05, + "loss": 1.1065, + "num_input_tokens_seen": 154064536, + "step": 9574 + }, + { + "epoch": 0.6707119528575306, + "grad_norm": 3.7374205589294434, + "learning_rate": 3.2991768826619965e-05, + "loss": 1.008, + "num_input_tokens_seen": 154080896, + "step": 9575 + }, + { + "epoch": 0.6707820011032599, + "grad_norm": 4.420716285705566, + "learning_rate": 3.2984770577933454e-05, + "loss": 0.954, + "num_input_tokens_seen": 154097040, + "step": 9576 + }, + { + "epoch": 0.6708520493489891, + "grad_norm": 3.814714193344116, + "learning_rate": 3.2977772329246936e-05, + "loss": 1.0808, + "num_input_tokens_seen": 154113248, + "step": 9577 + }, + { + "epoch": 0.6709220975947183, + "grad_norm": 3.7520110607147217, + "learning_rate": 3.2970774080560424e-05, + "loss": 0.9403, + "num_input_tokens_seen": 154129256, + "step": 9578 + }, + { + "epoch": 0.6709921458404476, + "grad_norm": 4.158069133758545, + "learning_rate": 3.296377583187391e-05, + "loss": 0.8843, + "num_input_tokens_seen": 154145424, + "step": 9579 + }, + { + "epoch": 0.6710621940861768, + "grad_norm": 3.803387403488159, + "learning_rate": 3.2956777583187395e-05, + "loss": 0.9005, + "num_input_tokens_seen": 154161808, + "step": 9580 + }, + { + "epoch": 0.671132242331906, + "grad_norm": 6.207254409790039, + "learning_rate": 3.2949779334500883e-05, + "loss": 1.1427, + "num_input_tokens_seen": 154178192, + "step": 9581 + }, + { + "epoch": 0.6712022905776354, + "grad_norm": 3.6358389854431152, + "learning_rate": 3.294278108581436e-05, + "loss": 0.9227, + "num_input_tokens_seen": 154194096, + "step": 9582 + }, + { + "epoch": 0.6712723388233646, + "grad_norm": 5.4410624504089355, + "learning_rate": 3.293578283712785e-05, + "loss": 1.0482, + "num_input_tokens_seen": 154209168, + "step": 9583 + }, + { + "epoch": 0.6713423870690939, + "grad_norm": 3.4521219730377197, + "learning_rate": 3.292878458844134e-05, + "loss": 0.9539, + "num_input_tokens_seen": 154225552, + "step": 9584 + }, + { + "epoch": 0.6714124353148231, + "grad_norm": 4.823176860809326, + "learning_rate": 3.292178633975482e-05, + "loss": 1.1321, + "num_input_tokens_seen": 154241936, + "step": 9585 + }, + { + "epoch": 0.6714824835605523, + "grad_norm": 5.205235481262207, + "learning_rate": 3.2914788091068306e-05, + "loss": 1.0839, + "num_input_tokens_seen": 154256976, + "step": 9586 + }, + { + "epoch": 0.6715525318062816, + "grad_norm": 4.533977031707764, + "learning_rate": 3.290778984238179e-05, + "loss": 1.1357, + "num_input_tokens_seen": 154272816, + "step": 9587 + }, + { + "epoch": 0.6716225800520108, + "grad_norm": 3.7653615474700928, + "learning_rate": 3.2900791593695276e-05, + "loss": 0.9452, + "num_input_tokens_seen": 154289200, + "step": 9588 + }, + { + "epoch": 0.67169262829774, + "grad_norm": 3.9661972522735596, + "learning_rate": 3.2893793345008765e-05, + "loss": 1.1368, + "num_input_tokens_seen": 154305584, + "step": 9589 + }, + { + "epoch": 0.6717626765434693, + "grad_norm": 5.597352504730225, + "learning_rate": 3.288679509632225e-05, + "loss": 1.1131, + "num_input_tokens_seen": 154321704, + "step": 9590 + }, + { + "epoch": 0.6718327247891985, + "grad_norm": 6.373961925506592, + "learning_rate": 3.2879796847635735e-05, + "loss": 1.132, + "num_input_tokens_seen": 154337640, + "step": 9591 + }, + { + "epoch": 0.6719027730349278, + "grad_norm": 5.681432723999023, + "learning_rate": 3.287279859894921e-05, + "loss": 1.0399, + "num_input_tokens_seen": 154354024, + "step": 9592 + }, + { + "epoch": 0.671972821280657, + "grad_norm": 4.054728031158447, + "learning_rate": 3.28658003502627e-05, + "loss": 1.2101, + "num_input_tokens_seen": 154370408, + "step": 9593 + }, + { + "epoch": 0.6720428695263863, + "grad_norm": 3.410970687866211, + "learning_rate": 3.285880210157618e-05, + "loss": 0.9283, + "num_input_tokens_seen": 154386792, + "step": 9594 + }, + { + "epoch": 0.6721129177721156, + "grad_norm": 4.009875297546387, + "learning_rate": 3.285180385288967e-05, + "loss": 0.8792, + "num_input_tokens_seen": 154403176, + "step": 9595 + }, + { + "epoch": 0.6721829660178448, + "grad_norm": 3.857598066329956, + "learning_rate": 3.284480560420316e-05, + "loss": 0.9828, + "num_input_tokens_seen": 154419328, + "step": 9596 + }, + { + "epoch": 0.6722530142635741, + "grad_norm": 4.585512638092041, + "learning_rate": 3.283780735551664e-05, + "loss": 1.0087, + "num_input_tokens_seen": 154434920, + "step": 9597 + }, + { + "epoch": 0.6723230625093033, + "grad_norm": 4.45999813079834, + "learning_rate": 3.283080910683013e-05, + "loss": 1.0064, + "num_input_tokens_seen": 154451128, + "step": 9598 + }, + { + "epoch": 0.6723931107550325, + "grad_norm": 4.1541876792907715, + "learning_rate": 3.2823810858143604e-05, + "loss": 1.0947, + "num_input_tokens_seen": 154467512, + "step": 9599 + }, + { + "epoch": 0.6724631590007618, + "grad_norm": 4.274892330169678, + "learning_rate": 3.28168126094571e-05, + "loss": 1.1441, + "num_input_tokens_seen": 154483896, + "step": 9600 + }, + { + "epoch": 0.6724631590007618, + "eval_loss": 1.1179660558700562, + "eval_runtime": 0.1772, + "eval_samples_per_second": 5.644, + "eval_steps_per_second": 5.644, + "num_input_tokens_seen": 154483896, + "step": 9600 + }, + { + "epoch": 0.672533207246491, + "grad_norm": 3.9469761848449707, + "learning_rate": 3.280981436077059e-05, + "loss": 0.9266, + "num_input_tokens_seen": 154500280, + "step": 9601 + }, + { + "epoch": 0.6726032554922202, + "grad_norm": 3.510401964187622, + "learning_rate": 3.280281611208406e-05, + "loss": 0.932, + "num_input_tokens_seen": 154516664, + "step": 9602 + }, + { + "epoch": 0.6726733037379495, + "grad_norm": 3.5583336353302, + "learning_rate": 3.279581786339755e-05, + "loss": 0.9116, + "num_input_tokens_seen": 154533048, + "step": 9603 + }, + { + "epoch": 0.6727433519836787, + "grad_norm": 4.121950626373291, + "learning_rate": 3.278881961471103e-05, + "loss": 1.112, + "num_input_tokens_seen": 154548744, + "step": 9604 + }, + { + "epoch": 0.672813400229408, + "grad_norm": 5.071646213531494, + "learning_rate": 3.278182136602452e-05, + "loss": 1.2609, + "num_input_tokens_seen": 154565128, + "step": 9605 + }, + { + "epoch": 0.6728834484751373, + "grad_norm": 3.7802047729492188, + "learning_rate": 3.277482311733801e-05, + "loss": 0.9625, + "num_input_tokens_seen": 154581512, + "step": 9606 + }, + { + "epoch": 0.6729534967208665, + "grad_norm": 4.104621410369873, + "learning_rate": 3.276782486865149e-05, + "loss": 1.0382, + "num_input_tokens_seen": 154597736, + "step": 9607 + }, + { + "epoch": 0.6730235449665958, + "grad_norm": 3.192068099975586, + "learning_rate": 3.276082661996498e-05, + "loss": 0.85, + "num_input_tokens_seen": 154613752, + "step": 9608 + }, + { + "epoch": 0.673093593212325, + "grad_norm": 5.587652683258057, + "learning_rate": 3.2753828371278456e-05, + "loss": 1.09, + "num_input_tokens_seen": 154628872, + "step": 9609 + }, + { + "epoch": 0.6731636414580542, + "grad_norm": 4.380537033081055, + "learning_rate": 3.274683012259195e-05, + "loss": 0.9495, + "num_input_tokens_seen": 154643360, + "step": 9610 + }, + { + "epoch": 0.6732336897037835, + "grad_norm": 4.789302825927734, + "learning_rate": 3.273983187390544e-05, + "loss": 1.0341, + "num_input_tokens_seen": 154659744, + "step": 9611 + }, + { + "epoch": 0.6733037379495127, + "grad_norm": 3.8109471797943115, + "learning_rate": 3.2732833625218915e-05, + "loss": 0.9579, + "num_input_tokens_seen": 154675632, + "step": 9612 + }, + { + "epoch": 0.673373786195242, + "grad_norm": 3.9217190742492676, + "learning_rate": 3.27258353765324e-05, + "loss": 0.9113, + "num_input_tokens_seen": 154691680, + "step": 9613 + }, + { + "epoch": 0.6734438344409712, + "grad_norm": 3.694230794906616, + "learning_rate": 3.2718837127845885e-05, + "loss": 1.0087, + "num_input_tokens_seen": 154708064, + "step": 9614 + }, + { + "epoch": 0.6735138826867004, + "grad_norm": 3.8731272220611572, + "learning_rate": 3.2711838879159374e-05, + "loss": 1.0436, + "num_input_tokens_seen": 154724344, + "step": 9615 + }, + { + "epoch": 0.6735839309324297, + "grad_norm": 3.8610358238220215, + "learning_rate": 3.270484063047286e-05, + "loss": 1.2051, + "num_input_tokens_seen": 154740728, + "step": 9616 + }, + { + "epoch": 0.673653979178159, + "grad_norm": 4.294131755828857, + "learning_rate": 3.2697842381786344e-05, + "loss": 0.99, + "num_input_tokens_seen": 154757112, + "step": 9617 + }, + { + "epoch": 0.6737240274238881, + "grad_norm": 3.6998608112335205, + "learning_rate": 3.269084413309983e-05, + "loss": 0.9926, + "num_input_tokens_seen": 154773496, + "step": 9618 + }, + { + "epoch": 0.6737940756696175, + "grad_norm": 5.402703285217285, + "learning_rate": 3.268384588441331e-05, + "loss": 0.9515, + "num_input_tokens_seen": 154788736, + "step": 9619 + }, + { + "epoch": 0.6738641239153467, + "grad_norm": 3.922522783279419, + "learning_rate": 3.2676847635726796e-05, + "loss": 1.0232, + "num_input_tokens_seen": 154803976, + "step": 9620 + }, + { + "epoch": 0.673934172161076, + "grad_norm": 3.6370649337768555, + "learning_rate": 3.266984938704028e-05, + "loss": 0.973, + "num_input_tokens_seen": 154820360, + "step": 9621 + }, + { + "epoch": 0.6740042204068052, + "grad_norm": 3.650177478790283, + "learning_rate": 3.2662851138353767e-05, + "loss": 1.0053, + "num_input_tokens_seen": 154836744, + "step": 9622 + }, + { + "epoch": 0.6740742686525344, + "grad_norm": 4.65486478805542, + "learning_rate": 3.2655852889667255e-05, + "loss": 0.9177, + "num_input_tokens_seen": 154853128, + "step": 9623 + }, + { + "epoch": 0.6741443168982637, + "grad_norm": 4.205890655517578, + "learning_rate": 3.264885464098074e-05, + "loss": 1.2154, + "num_input_tokens_seen": 154869512, + "step": 9624 + }, + { + "epoch": 0.6742143651439929, + "grad_norm": 3.813800573348999, + "learning_rate": 3.2641856392294226e-05, + "loss": 1.0328, + "num_input_tokens_seen": 154884872, + "step": 9625 + }, + { + "epoch": 0.6742844133897222, + "grad_norm": 4.3491435050964355, + "learning_rate": 3.263485814360771e-05, + "loss": 1.1031, + "num_input_tokens_seen": 154901256, + "step": 9626 + }, + { + "epoch": 0.6743544616354514, + "grad_norm": 4.027802467346191, + "learning_rate": 3.2627859894921196e-05, + "loss": 1.0148, + "num_input_tokens_seen": 154917216, + "step": 9627 + }, + { + "epoch": 0.6744245098811806, + "grad_norm": 4.4686479568481445, + "learning_rate": 3.2620861646234685e-05, + "loss": 0.8989, + "num_input_tokens_seen": 154933600, + "step": 9628 + }, + { + "epoch": 0.67449455812691, + "grad_norm": 3.568321466445923, + "learning_rate": 3.261386339754816e-05, + "loss": 0.9383, + "num_input_tokens_seen": 154949528, + "step": 9629 + }, + { + "epoch": 0.6745646063726392, + "grad_norm": 3.7910568714141846, + "learning_rate": 3.260686514886165e-05, + "loss": 1.0971, + "num_input_tokens_seen": 154965912, + "step": 9630 + }, + { + "epoch": 0.6746346546183684, + "grad_norm": 6.671166896820068, + "learning_rate": 3.259986690017513e-05, + "loss": 0.8993, + "num_input_tokens_seen": 154981704, + "step": 9631 + }, + { + "epoch": 0.6747047028640977, + "grad_norm": 4.055356025695801, + "learning_rate": 3.259286865148862e-05, + "loss": 0.9928, + "num_input_tokens_seen": 154997552, + "step": 9632 + }, + { + "epoch": 0.6747747511098269, + "grad_norm": 4.01240873336792, + "learning_rate": 3.258587040280211e-05, + "loss": 0.9884, + "num_input_tokens_seen": 155013936, + "step": 9633 + }, + { + "epoch": 0.6748447993555562, + "grad_norm": 4.451525688171387, + "learning_rate": 3.257887215411559e-05, + "loss": 1.0524, + "num_input_tokens_seen": 155029832, + "step": 9634 + }, + { + "epoch": 0.6749148476012854, + "grad_norm": 5.295654296875, + "learning_rate": 3.257187390542908e-05, + "loss": 0.8858, + "num_input_tokens_seen": 155045616, + "step": 9635 + }, + { + "epoch": 0.6749848958470146, + "grad_norm": 4.3195343017578125, + "learning_rate": 3.256487565674255e-05, + "loss": 0.977, + "num_input_tokens_seen": 155061104, + "step": 9636 + }, + { + "epoch": 0.6750549440927439, + "grad_norm": 4.976731777191162, + "learning_rate": 3.255787740805605e-05, + "loss": 1.0341, + "num_input_tokens_seen": 155076776, + "step": 9637 + }, + { + "epoch": 0.6751249923384731, + "grad_norm": 6.241532325744629, + "learning_rate": 3.2550879159369537e-05, + "loss": 1.1003, + "num_input_tokens_seen": 155093160, + "step": 9638 + }, + { + "epoch": 0.6751950405842023, + "grad_norm": 3.9951634407043457, + "learning_rate": 3.254388091068301e-05, + "loss": 0.916, + "num_input_tokens_seen": 155109160, + "step": 9639 + }, + { + "epoch": 0.6752650888299316, + "grad_norm": 4.132941722869873, + "learning_rate": 3.25368826619965e-05, + "loss": 0.9415, + "num_input_tokens_seen": 155125544, + "step": 9640 + }, + { + "epoch": 0.6753351370756608, + "grad_norm": 4.777303695678711, + "learning_rate": 3.252988441330998e-05, + "loss": 1.0636, + "num_input_tokens_seen": 155141928, + "step": 9641 + }, + { + "epoch": 0.6754051853213902, + "grad_norm": 5.361613750457764, + "learning_rate": 3.252288616462347e-05, + "loss": 1.0534, + "num_input_tokens_seen": 155157896, + "step": 9642 + }, + { + "epoch": 0.6754752335671194, + "grad_norm": 3.4499411582946777, + "learning_rate": 3.251588791593696e-05, + "loss": 0.9386, + "num_input_tokens_seen": 155173912, + "step": 9643 + }, + { + "epoch": 0.6755452818128486, + "grad_norm": 5.655459880828857, + "learning_rate": 3.250888966725044e-05, + "loss": 1.0044, + "num_input_tokens_seen": 155190136, + "step": 9644 + }, + { + "epoch": 0.6756153300585779, + "grad_norm": 7.2610578536987305, + "learning_rate": 3.250189141856393e-05, + "loss": 0.9489, + "num_input_tokens_seen": 155206464, + "step": 9645 + }, + { + "epoch": 0.6756853783043071, + "grad_norm": 7.128015518188477, + "learning_rate": 3.2494893169877405e-05, + "loss": 1.0651, + "num_input_tokens_seen": 155221928, + "step": 9646 + }, + { + "epoch": 0.6757554265500363, + "grad_norm": 3.7394115924835205, + "learning_rate": 3.24878949211909e-05, + "loss": 1.1557, + "num_input_tokens_seen": 155237360, + "step": 9647 + }, + { + "epoch": 0.6758254747957656, + "grad_norm": 4.402042388916016, + "learning_rate": 3.2480896672504375e-05, + "loss": 1.1021, + "num_input_tokens_seen": 155252640, + "step": 9648 + }, + { + "epoch": 0.6758955230414948, + "grad_norm": 3.378056764602661, + "learning_rate": 3.2473898423817864e-05, + "loss": 1.0828, + "num_input_tokens_seen": 155269024, + "step": 9649 + }, + { + "epoch": 0.6759655712872241, + "grad_norm": 4.998075008392334, + "learning_rate": 3.246690017513135e-05, + "loss": 1.1658, + "num_input_tokens_seen": 155285408, + "step": 9650 + }, + { + "epoch": 0.6760356195329533, + "grad_norm": 3.4548230171203613, + "learning_rate": 3.2459901926444834e-05, + "loss": 1.0301, + "num_input_tokens_seen": 155301792, + "step": 9651 + }, + { + "epoch": 0.6761056677786825, + "grad_norm": 3.98690128326416, + "learning_rate": 3.245290367775832e-05, + "loss": 1.1282, + "num_input_tokens_seen": 155317792, + "step": 9652 + }, + { + "epoch": 0.6761757160244118, + "grad_norm": 4.647696495056152, + "learning_rate": 3.2445905429071804e-05, + "loss": 1.0161, + "num_input_tokens_seen": 155334176, + "step": 9653 + }, + { + "epoch": 0.676245764270141, + "grad_norm": 5.285068511962891, + "learning_rate": 3.243890718038529e-05, + "loss": 0.8845, + "num_input_tokens_seen": 155350560, + "step": 9654 + }, + { + "epoch": 0.6763158125158703, + "grad_norm": 3.6858527660369873, + "learning_rate": 3.243190893169878e-05, + "loss": 1.0347, + "num_input_tokens_seen": 155366472, + "step": 9655 + }, + { + "epoch": 0.6763858607615996, + "grad_norm": 4.104813098907471, + "learning_rate": 3.242491068301226e-05, + "loss": 1.1455, + "num_input_tokens_seen": 155382856, + "step": 9656 + }, + { + "epoch": 0.6764559090073288, + "grad_norm": 3.8051059246063232, + "learning_rate": 3.241791243432575e-05, + "loss": 0.8834, + "num_input_tokens_seen": 155398440, + "step": 9657 + }, + { + "epoch": 0.6765259572530581, + "grad_norm": 4.418055534362793, + "learning_rate": 3.241091418563923e-05, + "loss": 0.9283, + "num_input_tokens_seen": 155414824, + "step": 9658 + }, + { + "epoch": 0.6765960054987873, + "grad_norm": 3.6005654335021973, + "learning_rate": 3.2403915936952716e-05, + "loss": 1.0635, + "num_input_tokens_seen": 155431208, + "step": 9659 + }, + { + "epoch": 0.6766660537445165, + "grad_norm": 4.436532497406006, + "learning_rate": 3.2396917688266204e-05, + "loss": 0.9997, + "num_input_tokens_seen": 155447592, + "step": 9660 + }, + { + "epoch": 0.6767361019902458, + "grad_norm": 3.9821643829345703, + "learning_rate": 3.2389919439579686e-05, + "loss": 1.037, + "num_input_tokens_seen": 155463976, + "step": 9661 + }, + { + "epoch": 0.676806150235975, + "grad_norm": 4.8469133377075195, + "learning_rate": 3.2382921190893175e-05, + "loss": 1.1514, + "num_input_tokens_seen": 155480360, + "step": 9662 + }, + { + "epoch": 0.6768761984817043, + "grad_norm": 3.7075741291046143, + "learning_rate": 3.2375922942206656e-05, + "loss": 1.1636, + "num_input_tokens_seen": 155496176, + "step": 9663 + }, + { + "epoch": 0.6769462467274335, + "grad_norm": 3.967015266418457, + "learning_rate": 3.2368924693520145e-05, + "loss": 0.9709, + "num_input_tokens_seen": 155510976, + "step": 9664 + }, + { + "epoch": 0.6770162949731627, + "grad_norm": 3.8911147117614746, + "learning_rate": 3.2361926444833634e-05, + "loss": 1.0653, + "num_input_tokens_seen": 155527360, + "step": 9665 + }, + { + "epoch": 0.677086343218892, + "grad_norm": 3.5894508361816406, + "learning_rate": 3.235492819614711e-05, + "loss": 0.9433, + "num_input_tokens_seen": 155543600, + "step": 9666 + }, + { + "epoch": 0.6771563914646213, + "grad_norm": 4.796082496643066, + "learning_rate": 3.2347929947460604e-05, + "loss": 1.0827, + "num_input_tokens_seen": 155559984, + "step": 9667 + }, + { + "epoch": 0.6772264397103505, + "grad_norm": 4.07179069519043, + "learning_rate": 3.234093169877408e-05, + "loss": 1.199, + "num_input_tokens_seen": 155576368, + "step": 9668 + }, + { + "epoch": 0.6772964879560798, + "grad_norm": 4.188907623291016, + "learning_rate": 3.233393345008757e-05, + "loss": 1.1571, + "num_input_tokens_seen": 155592536, + "step": 9669 + }, + { + "epoch": 0.677366536201809, + "grad_norm": 6.003124237060547, + "learning_rate": 3.2326935201401056e-05, + "loss": 1.1887, + "num_input_tokens_seen": 155608920, + "step": 9670 + }, + { + "epoch": 0.6774365844475383, + "grad_norm": 3.5606276988983154, + "learning_rate": 3.231993695271454e-05, + "loss": 0.8758, + "num_input_tokens_seen": 155625304, + "step": 9671 + }, + { + "epoch": 0.6775066326932675, + "grad_norm": 4.253870010375977, + "learning_rate": 3.231293870402803e-05, + "loss": 1.0689, + "num_input_tokens_seen": 155641688, + "step": 9672 + }, + { + "epoch": 0.6775766809389967, + "grad_norm": 5.083495616912842, + "learning_rate": 3.230594045534151e-05, + "loss": 1.0994, + "num_input_tokens_seen": 155657008, + "step": 9673 + }, + { + "epoch": 0.677646729184726, + "grad_norm": 3.9135494232177734, + "learning_rate": 3.2298942206655e-05, + "loss": 1.1357, + "num_input_tokens_seen": 155672616, + "step": 9674 + }, + { + "epoch": 0.6777167774304552, + "grad_norm": 3.749938726425171, + "learning_rate": 3.229194395796847e-05, + "loss": 1.0298, + "num_input_tokens_seen": 155689000, + "step": 9675 + }, + { + "epoch": 0.6777868256761844, + "grad_norm": 5.114835262298584, + "learning_rate": 3.228494570928196e-05, + "loss": 0.9209, + "num_input_tokens_seen": 155703920, + "step": 9676 + }, + { + "epoch": 0.6778568739219137, + "grad_norm": 3.9304583072662354, + "learning_rate": 3.2277947460595456e-05, + "loss": 1.2831, + "num_input_tokens_seen": 155719736, + "step": 9677 + }, + { + "epoch": 0.6779269221676429, + "grad_norm": 6.98004674911499, + "learning_rate": 3.227094921190893e-05, + "loss": 0.9394, + "num_input_tokens_seen": 155736120, + "step": 9678 + }, + { + "epoch": 0.6779969704133723, + "grad_norm": 3.8113934993743896, + "learning_rate": 3.226395096322242e-05, + "loss": 0.972, + "num_input_tokens_seen": 155752152, + "step": 9679 + }, + { + "epoch": 0.6780670186591015, + "grad_norm": 4.115365028381348, + "learning_rate": 3.22569527145359e-05, + "loss": 1.0365, + "num_input_tokens_seen": 155768216, + "step": 9680 + }, + { + "epoch": 0.6781370669048307, + "grad_norm": 4.222641468048096, + "learning_rate": 3.224995446584939e-05, + "loss": 0.9715, + "num_input_tokens_seen": 155783424, + "step": 9681 + }, + { + "epoch": 0.67820711515056, + "grad_norm": 4.028203010559082, + "learning_rate": 3.224295621716288e-05, + "loss": 1.153, + "num_input_tokens_seen": 155799400, + "step": 9682 + }, + { + "epoch": 0.6782771633962892, + "grad_norm": 4.193026542663574, + "learning_rate": 3.223595796847636e-05, + "loss": 0.9425, + "num_input_tokens_seen": 155815672, + "step": 9683 + }, + { + "epoch": 0.6783472116420184, + "grad_norm": 4.100712776184082, + "learning_rate": 3.222895971978985e-05, + "loss": 1.1333, + "num_input_tokens_seen": 155831480, + "step": 9684 + }, + { + "epoch": 0.6784172598877477, + "grad_norm": 4.624992847442627, + "learning_rate": 3.2221961471103324e-05, + "loss": 1.0315, + "num_input_tokens_seen": 155847864, + "step": 9685 + }, + { + "epoch": 0.6784873081334769, + "grad_norm": 3.9078385829925537, + "learning_rate": 3.221496322241681e-05, + "loss": 0.9722, + "num_input_tokens_seen": 155864248, + "step": 9686 + }, + { + "epoch": 0.6785573563792062, + "grad_norm": 4.793765068054199, + "learning_rate": 3.220796497373031e-05, + "loss": 1.0826, + "num_input_tokens_seen": 155879424, + "step": 9687 + }, + { + "epoch": 0.6786274046249354, + "grad_norm": 3.9083974361419678, + "learning_rate": 3.220096672504378e-05, + "loss": 1.0243, + "num_input_tokens_seen": 155895752, + "step": 9688 + }, + { + "epoch": 0.6786974528706646, + "grad_norm": 3.7716784477233887, + "learning_rate": 3.219396847635727e-05, + "loss": 0.8931, + "num_input_tokens_seen": 155912136, + "step": 9689 + }, + { + "epoch": 0.678767501116394, + "grad_norm": 3.297051429748535, + "learning_rate": 3.2186970227670754e-05, + "loss": 0.9477, + "num_input_tokens_seen": 155928208, + "step": 9690 + }, + { + "epoch": 0.6788375493621231, + "grad_norm": 4.009176254272461, + "learning_rate": 3.217997197898424e-05, + "loss": 1.2115, + "num_input_tokens_seen": 155944592, + "step": 9691 + }, + { + "epoch": 0.6789075976078524, + "grad_norm": 3.651552200317383, + "learning_rate": 3.217297373029773e-05, + "loss": 1.2017, + "num_input_tokens_seen": 155960712, + "step": 9692 + }, + { + "epoch": 0.6789776458535817, + "grad_norm": 3.483879804611206, + "learning_rate": 3.216597548161121e-05, + "loss": 1.0646, + "num_input_tokens_seen": 155976568, + "step": 9693 + }, + { + "epoch": 0.6790476940993109, + "grad_norm": 4.056620121002197, + "learning_rate": 3.21589772329247e-05, + "loss": 1.0249, + "num_input_tokens_seen": 155991824, + "step": 9694 + }, + { + "epoch": 0.6791177423450402, + "grad_norm": 4.6582536697387695, + "learning_rate": 3.2151978984238176e-05, + "loss": 1.016, + "num_input_tokens_seen": 156007672, + "step": 9695 + }, + { + "epoch": 0.6791877905907694, + "grad_norm": 5.168168544769287, + "learning_rate": 3.2144980735551665e-05, + "loss": 1.0872, + "num_input_tokens_seen": 156022624, + "step": 9696 + }, + { + "epoch": 0.6792578388364986, + "grad_norm": 4.291696548461914, + "learning_rate": 3.213798248686516e-05, + "loss": 1.0562, + "num_input_tokens_seen": 156039008, + "step": 9697 + }, + { + "epoch": 0.6793278870822279, + "grad_norm": 3.7295303344726562, + "learning_rate": 3.2130984238178635e-05, + "loss": 1.0577, + "num_input_tokens_seen": 156054784, + "step": 9698 + }, + { + "epoch": 0.6793979353279571, + "grad_norm": 3.5940639972686768, + "learning_rate": 3.2123985989492124e-05, + "loss": 1.1079, + "num_input_tokens_seen": 156071168, + "step": 9699 + }, + { + "epoch": 0.6794679835736864, + "grad_norm": 4.4046220779418945, + "learning_rate": 3.2116987740805606e-05, + "loss": 1.0488, + "num_input_tokens_seen": 156087520, + "step": 9700 + }, + { + "epoch": 0.6795380318194156, + "grad_norm": 3.5668511390686035, + "learning_rate": 3.2109989492119094e-05, + "loss": 1.0733, + "num_input_tokens_seen": 156103904, + "step": 9701 + }, + { + "epoch": 0.6796080800651448, + "grad_norm": 4.068506717681885, + "learning_rate": 3.210299124343257e-05, + "loss": 1.2341, + "num_input_tokens_seen": 156120192, + "step": 9702 + }, + { + "epoch": 0.6796781283108742, + "grad_norm": 3.8071236610412598, + "learning_rate": 3.2095992994746065e-05, + "loss": 0.8822, + "num_input_tokens_seen": 156136528, + "step": 9703 + }, + { + "epoch": 0.6797481765566034, + "grad_norm": 3.7498772144317627, + "learning_rate": 3.208899474605955e-05, + "loss": 0.9442, + "num_input_tokens_seen": 156152824, + "step": 9704 + }, + { + "epoch": 0.6798182248023326, + "grad_norm": 3.594505786895752, + "learning_rate": 3.208199649737303e-05, + "loss": 1.1437, + "num_input_tokens_seen": 156169208, + "step": 9705 + }, + { + "epoch": 0.6798882730480619, + "grad_norm": 4.215333938598633, + "learning_rate": 3.207499824868652e-05, + "loss": 0.8894, + "num_input_tokens_seen": 156185592, + "step": 9706 + }, + { + "epoch": 0.6799583212937911, + "grad_norm": 3.8823065757751465, + "learning_rate": 3.2068e-05, + "loss": 0.973, + "num_input_tokens_seen": 156201976, + "step": 9707 + }, + { + "epoch": 0.6800283695395204, + "grad_norm": 3.9521706104278564, + "learning_rate": 3.206100175131349e-05, + "loss": 0.9668, + "num_input_tokens_seen": 156218344, + "step": 9708 + }, + { + "epoch": 0.6800984177852496, + "grad_norm": 3.927607536315918, + "learning_rate": 3.2054003502626976e-05, + "loss": 1.0116, + "num_input_tokens_seen": 156234432, + "step": 9709 + }, + { + "epoch": 0.6801684660309788, + "grad_norm": 3.529273271560669, + "learning_rate": 3.204700525394046e-05, + "loss": 1.0027, + "num_input_tokens_seen": 156250080, + "step": 9710 + }, + { + "epoch": 0.6802385142767081, + "grad_norm": 3.4426844120025635, + "learning_rate": 3.2040007005253946e-05, + "loss": 0.9499, + "num_input_tokens_seen": 156266464, + "step": 9711 + }, + { + "epoch": 0.6803085625224373, + "grad_norm": 3.93259334564209, + "learning_rate": 3.203300875656742e-05, + "loss": 1.1965, + "num_input_tokens_seen": 156282848, + "step": 9712 + }, + { + "epoch": 0.6803786107681665, + "grad_norm": 4.213457107543945, + "learning_rate": 3.2026010507880917e-05, + "loss": 1.0243, + "num_input_tokens_seen": 156299232, + "step": 9713 + }, + { + "epoch": 0.6804486590138958, + "grad_norm": 3.9757726192474365, + "learning_rate": 3.2019012259194405e-05, + "loss": 1.1156, + "num_input_tokens_seen": 156315616, + "step": 9714 + }, + { + "epoch": 0.680518707259625, + "grad_norm": 3.6526811122894287, + "learning_rate": 3.201201401050788e-05, + "loss": 1.0091, + "num_input_tokens_seen": 156332000, + "step": 9715 + }, + { + "epoch": 0.6805887555053544, + "grad_norm": 3.941851854324341, + "learning_rate": 3.200501576182137e-05, + "loss": 1.1827, + "num_input_tokens_seen": 156348384, + "step": 9716 + }, + { + "epoch": 0.6806588037510836, + "grad_norm": 4.00084114074707, + "learning_rate": 3.199801751313485e-05, + "loss": 1.0499, + "num_input_tokens_seen": 156364768, + "step": 9717 + }, + { + "epoch": 0.6807288519968128, + "grad_norm": 5.883040428161621, + "learning_rate": 3.199101926444834e-05, + "loss": 1.0096, + "num_input_tokens_seen": 156380920, + "step": 9718 + }, + { + "epoch": 0.6807989002425421, + "grad_norm": 4.973423957824707, + "learning_rate": 3.198402101576183e-05, + "loss": 0.988, + "num_input_tokens_seen": 156397304, + "step": 9719 + }, + { + "epoch": 0.6808689484882713, + "grad_norm": 4.990798473358154, + "learning_rate": 3.197702276707531e-05, + "loss": 1.1515, + "num_input_tokens_seen": 156412320, + "step": 9720 + }, + { + "epoch": 0.6809389967340005, + "grad_norm": 3.8593194484710693, + "learning_rate": 3.19700245183888e-05, + "loss": 1.0715, + "num_input_tokens_seen": 156428096, + "step": 9721 + }, + { + "epoch": 0.6810090449797298, + "grad_norm": 4.024546146392822, + "learning_rate": 3.196302626970227e-05, + "loss": 1.0963, + "num_input_tokens_seen": 156444064, + "step": 9722 + }, + { + "epoch": 0.681079093225459, + "grad_norm": 3.9243736267089844, + "learning_rate": 3.195602802101577e-05, + "loss": 0.9093, + "num_input_tokens_seen": 156459856, + "step": 9723 + }, + { + "epoch": 0.6811491414711883, + "grad_norm": 5.202075958251953, + "learning_rate": 3.194902977232926e-05, + "loss": 1.2068, + "num_input_tokens_seen": 156475808, + "step": 9724 + }, + { + "epoch": 0.6812191897169175, + "grad_norm": 3.6658310890197754, + "learning_rate": 3.194203152364273e-05, + "loss": 1.0788, + "num_input_tokens_seen": 156491736, + "step": 9725 + }, + { + "epoch": 0.6812892379626467, + "grad_norm": 3.8188209533691406, + "learning_rate": 3.193503327495622e-05, + "loss": 0.9011, + "num_input_tokens_seen": 156508120, + "step": 9726 + }, + { + "epoch": 0.681359286208376, + "grad_norm": 3.819744825363159, + "learning_rate": 3.19280350262697e-05, + "loss": 1.1335, + "num_input_tokens_seen": 156524504, + "step": 9727 + }, + { + "epoch": 0.6814293344541053, + "grad_norm": 5.102762699127197, + "learning_rate": 3.192103677758319e-05, + "loss": 1.1391, + "num_input_tokens_seen": 156539992, + "step": 9728 + }, + { + "epoch": 0.6814993826998346, + "grad_norm": 4.1731109619140625, + "learning_rate": 3.191403852889667e-05, + "loss": 1.0393, + "num_input_tokens_seen": 156555864, + "step": 9729 + }, + { + "epoch": 0.6815694309455638, + "grad_norm": 7.489117622375488, + "learning_rate": 3.190704028021016e-05, + "loss": 0.9437, + "num_input_tokens_seen": 156570848, + "step": 9730 + }, + { + "epoch": 0.681639479191293, + "grad_norm": 4.853135108947754, + "learning_rate": 3.190004203152365e-05, + "loss": 1.1154, + "num_input_tokens_seen": 156585336, + "step": 9731 + }, + { + "epoch": 0.6817095274370223, + "grad_norm": 5.824226379394531, + "learning_rate": 3.1893043782837125e-05, + "loss": 1.0554, + "num_input_tokens_seen": 156601232, + "step": 9732 + }, + { + "epoch": 0.6817795756827515, + "grad_norm": 3.820237874984741, + "learning_rate": 3.188604553415062e-05, + "loss": 0.9357, + "num_input_tokens_seen": 156617312, + "step": 9733 + }, + { + "epoch": 0.6818496239284807, + "grad_norm": 3.6588523387908936, + "learning_rate": 3.1879047285464096e-05, + "loss": 0.8876, + "num_input_tokens_seen": 156633696, + "step": 9734 + }, + { + "epoch": 0.68191967217421, + "grad_norm": 4.0700812339782715, + "learning_rate": 3.1872049036777584e-05, + "loss": 0.9496, + "num_input_tokens_seen": 156650080, + "step": 9735 + }, + { + "epoch": 0.6819897204199392, + "grad_norm": 3.554778814315796, + "learning_rate": 3.186505078809107e-05, + "loss": 0.7421, + "num_input_tokens_seen": 156665072, + "step": 9736 + }, + { + "epoch": 0.6820597686656685, + "grad_norm": 3.735191583633423, + "learning_rate": 3.1858052539404555e-05, + "loss": 0.9248, + "num_input_tokens_seen": 156681456, + "step": 9737 + }, + { + "epoch": 0.6821298169113977, + "grad_norm": 3.893950939178467, + "learning_rate": 3.185105429071804e-05, + "loss": 1.1413, + "num_input_tokens_seen": 156697840, + "step": 9738 + }, + { + "epoch": 0.6821998651571269, + "grad_norm": 4.813759803771973, + "learning_rate": 3.1844056042031525e-05, + "loss": 0.9249, + "num_input_tokens_seen": 156714224, + "step": 9739 + }, + { + "epoch": 0.6822699134028563, + "grad_norm": 6.0400309562683105, + "learning_rate": 3.1837057793345014e-05, + "loss": 1.1508, + "num_input_tokens_seen": 156730536, + "step": 9740 + }, + { + "epoch": 0.6823399616485855, + "grad_norm": 6.131940841674805, + "learning_rate": 3.18300595446585e-05, + "loss": 1.0151, + "num_input_tokens_seen": 156746808, + "step": 9741 + }, + { + "epoch": 0.6824100098943147, + "grad_norm": 3.324862003326416, + "learning_rate": 3.182306129597198e-05, + "loss": 0.8944, + "num_input_tokens_seen": 156763192, + "step": 9742 + }, + { + "epoch": 0.682480058140044, + "grad_norm": 5.59218692779541, + "learning_rate": 3.181606304728547e-05, + "loss": 0.919, + "num_input_tokens_seen": 156779576, + "step": 9743 + }, + { + "epoch": 0.6825501063857732, + "grad_norm": 3.505868673324585, + "learning_rate": 3.180906479859895e-05, + "loss": 0.9869, + "num_input_tokens_seen": 156795960, + "step": 9744 + }, + { + "epoch": 0.6826201546315025, + "grad_norm": 3.6450185775756836, + "learning_rate": 3.1802066549912436e-05, + "loss": 0.9698, + "num_input_tokens_seen": 156812344, + "step": 9745 + }, + { + "epoch": 0.6826902028772317, + "grad_norm": 6.763232707977295, + "learning_rate": 3.1795068301225925e-05, + "loss": 1.0432, + "num_input_tokens_seen": 156828728, + "step": 9746 + }, + { + "epoch": 0.6827602511229609, + "grad_norm": 4.681893825531006, + "learning_rate": 3.178807005253941e-05, + "loss": 1.184, + "num_input_tokens_seen": 156843520, + "step": 9747 + }, + { + "epoch": 0.6828302993686902, + "grad_norm": 3.835857391357422, + "learning_rate": 3.1781071803852895e-05, + "loss": 1.0689, + "num_input_tokens_seen": 156859904, + "step": 9748 + }, + { + "epoch": 0.6829003476144194, + "grad_norm": 4.3855814933776855, + "learning_rate": 3.177407355516638e-05, + "loss": 1.0326, + "num_input_tokens_seen": 156876288, + "step": 9749 + }, + { + "epoch": 0.6829703958601486, + "grad_norm": 3.589956045150757, + "learning_rate": 3.1767075306479866e-05, + "loss": 0.9729, + "num_input_tokens_seen": 156892672, + "step": 9750 + }, + { + "epoch": 0.6830404441058779, + "grad_norm": 3.9540622234344482, + "learning_rate": 3.1760077057793354e-05, + "loss": 0.9998, + "num_input_tokens_seen": 156908008, + "step": 9751 + }, + { + "epoch": 0.6831104923516071, + "grad_norm": 3.7458391189575195, + "learning_rate": 3.175307880910683e-05, + "loss": 0.9379, + "num_input_tokens_seen": 156923872, + "step": 9752 + }, + { + "epoch": 0.6831805405973365, + "grad_norm": 4.343460559844971, + "learning_rate": 3.1746080560420325e-05, + "loss": 1.1989, + "num_input_tokens_seen": 156940256, + "step": 9753 + }, + { + "epoch": 0.6832505888430657, + "grad_norm": 4.1958088874816895, + "learning_rate": 3.17390823117338e-05, + "loss": 0.9803, + "num_input_tokens_seen": 156956368, + "step": 9754 + }, + { + "epoch": 0.6833206370887949, + "grad_norm": 4.1247992515563965, + "learning_rate": 3.173208406304729e-05, + "loss": 0.9539, + "num_input_tokens_seen": 156972104, + "step": 9755 + }, + { + "epoch": 0.6833906853345242, + "grad_norm": 4.311051845550537, + "learning_rate": 3.172508581436077e-05, + "loss": 0.9346, + "num_input_tokens_seen": 156988488, + "step": 9756 + }, + { + "epoch": 0.6834607335802534, + "grad_norm": 3.187197208404541, + "learning_rate": 3.171808756567426e-05, + "loss": 0.9492, + "num_input_tokens_seen": 157004872, + "step": 9757 + }, + { + "epoch": 0.6835307818259826, + "grad_norm": 4.677457332611084, + "learning_rate": 3.171108931698775e-05, + "loss": 1.0282, + "num_input_tokens_seen": 157021256, + "step": 9758 + }, + { + "epoch": 0.6836008300717119, + "grad_norm": 4.986778259277344, + "learning_rate": 3.170409106830123e-05, + "loss": 1.1071, + "num_input_tokens_seen": 157037640, + "step": 9759 + }, + { + "epoch": 0.6836708783174411, + "grad_norm": 5.576174259185791, + "learning_rate": 3.169709281961472e-05, + "loss": 0.9534, + "num_input_tokens_seen": 157052928, + "step": 9760 + }, + { + "epoch": 0.6837409265631704, + "grad_norm": 3.9162395000457764, + "learning_rate": 3.169009457092819e-05, + "loss": 1.0098, + "num_input_tokens_seen": 157068928, + "step": 9761 + }, + { + "epoch": 0.6838109748088996, + "grad_norm": 4.502906799316406, + "learning_rate": 3.168309632224168e-05, + "loss": 0.9377, + "num_input_tokens_seen": 157084792, + "step": 9762 + }, + { + "epoch": 0.6838810230546288, + "grad_norm": 6.99123477935791, + "learning_rate": 3.167609807355518e-05, + "loss": 0.9802, + "num_input_tokens_seen": 157101176, + "step": 9763 + }, + { + "epoch": 0.6839510713003581, + "grad_norm": 3.843501329421997, + "learning_rate": 3.166909982486865e-05, + "loss": 1.1354, + "num_input_tokens_seen": 157117560, + "step": 9764 + }, + { + "epoch": 0.6840211195460874, + "grad_norm": 5.376356601715088, + "learning_rate": 3.166210157618214e-05, + "loss": 0.9912, + "num_input_tokens_seen": 157133024, + "step": 9765 + }, + { + "epoch": 0.6840911677918167, + "grad_norm": 4.668597221374512, + "learning_rate": 3.165510332749562e-05, + "loss": 1.0249, + "num_input_tokens_seen": 157149408, + "step": 9766 + }, + { + "epoch": 0.6841612160375459, + "grad_norm": 5.503859519958496, + "learning_rate": 3.164810507880911e-05, + "loss": 1.2094, + "num_input_tokens_seen": 157165768, + "step": 9767 + }, + { + "epoch": 0.6842312642832751, + "grad_norm": 4.0992536544799805, + "learning_rate": 3.16411068301226e-05, + "loss": 0.9695, + "num_input_tokens_seen": 157182152, + "step": 9768 + }, + { + "epoch": 0.6843013125290044, + "grad_norm": 4.090651988983154, + "learning_rate": 3.163410858143608e-05, + "loss": 1.1875, + "num_input_tokens_seen": 157198448, + "step": 9769 + }, + { + "epoch": 0.6843713607747336, + "grad_norm": 3.8701817989349365, + "learning_rate": 3.162711033274957e-05, + "loss": 1.0895, + "num_input_tokens_seen": 157214008, + "step": 9770 + }, + { + "epoch": 0.6844414090204628, + "grad_norm": 4.262059688568115, + "learning_rate": 3.1620112084063045e-05, + "loss": 0.9372, + "num_input_tokens_seen": 157229784, + "step": 9771 + }, + { + "epoch": 0.6845114572661921, + "grad_norm": 3.4763944149017334, + "learning_rate": 3.161311383537653e-05, + "loss": 1.0625, + "num_input_tokens_seen": 157246168, + "step": 9772 + }, + { + "epoch": 0.6845815055119213, + "grad_norm": 3.617454767227173, + "learning_rate": 3.160611558669003e-05, + "loss": 0.9096, + "num_input_tokens_seen": 157261952, + "step": 9773 + }, + { + "epoch": 0.6846515537576506, + "grad_norm": 4.222943305969238, + "learning_rate": 3.1599117338003504e-05, + "loss": 1.1223, + "num_input_tokens_seen": 157277264, + "step": 9774 + }, + { + "epoch": 0.6847216020033798, + "grad_norm": 4.458327293395996, + "learning_rate": 3.159211908931699e-05, + "loss": 1.1044, + "num_input_tokens_seen": 157293648, + "step": 9775 + }, + { + "epoch": 0.684791650249109, + "grad_norm": 3.3364436626434326, + "learning_rate": 3.1585120840630474e-05, + "loss": 0.9301, + "num_input_tokens_seen": 157310032, + "step": 9776 + }, + { + "epoch": 0.6848616984948384, + "grad_norm": 4.5057373046875, + "learning_rate": 3.157812259194396e-05, + "loss": 1.1578, + "num_input_tokens_seen": 157326416, + "step": 9777 + }, + { + "epoch": 0.6849317467405676, + "grad_norm": 3.752368211746216, + "learning_rate": 3.157112434325745e-05, + "loss": 0.9265, + "num_input_tokens_seen": 157342800, + "step": 9778 + }, + { + "epoch": 0.6850017949862968, + "grad_norm": 3.8875732421875, + "learning_rate": 3.156412609457093e-05, + "loss": 1.0984, + "num_input_tokens_seen": 157358456, + "step": 9779 + }, + { + "epoch": 0.6850718432320261, + "grad_norm": 4.596309661865234, + "learning_rate": 3.155712784588442e-05, + "loss": 1.1893, + "num_input_tokens_seen": 157374840, + "step": 9780 + }, + { + "epoch": 0.6851418914777553, + "grad_norm": 4.421420097351074, + "learning_rate": 3.15501295971979e-05, + "loss": 0.932, + "num_input_tokens_seen": 157391224, + "step": 9781 + }, + { + "epoch": 0.6852119397234846, + "grad_norm": 4.805121421813965, + "learning_rate": 3.1543131348511385e-05, + "loss": 1.2236, + "num_input_tokens_seen": 157407608, + "step": 9782 + }, + { + "epoch": 0.6852819879692138, + "grad_norm": 3.776397943496704, + "learning_rate": 3.153613309982487e-05, + "loss": 0.9411, + "num_input_tokens_seen": 157423024, + "step": 9783 + }, + { + "epoch": 0.685352036214943, + "grad_norm": 3.696873903274536, + "learning_rate": 3.1529134851138356e-05, + "loss": 1.0628, + "num_input_tokens_seen": 157439408, + "step": 9784 + }, + { + "epoch": 0.6854220844606723, + "grad_norm": 3.949763774871826, + "learning_rate": 3.1522136602451844e-05, + "loss": 0.8908, + "num_input_tokens_seen": 157455792, + "step": 9785 + }, + { + "epoch": 0.6854921327064015, + "grad_norm": 4.893935203552246, + "learning_rate": 3.1515138353765326e-05, + "loss": 0.8883, + "num_input_tokens_seen": 157472072, + "step": 9786 + }, + { + "epoch": 0.6855621809521307, + "grad_norm": 3.7111427783966064, + "learning_rate": 3.1508140105078815e-05, + "loss": 0.8686, + "num_input_tokens_seen": 157488456, + "step": 9787 + }, + { + "epoch": 0.68563222919786, + "grad_norm": 3.719639301300049, + "learning_rate": 3.150114185639229e-05, + "loss": 0.8673, + "num_input_tokens_seen": 157504728, + "step": 9788 + }, + { + "epoch": 0.6857022774435892, + "grad_norm": 3.4434754848480225, + "learning_rate": 3.1494143607705785e-05, + "loss": 0.7908, + "num_input_tokens_seen": 157520432, + "step": 9789 + }, + { + "epoch": 0.6857723256893186, + "grad_norm": 4.634060382843018, + "learning_rate": 3.1487145359019274e-05, + "loss": 1.0427, + "num_input_tokens_seen": 157536592, + "step": 9790 + }, + { + "epoch": 0.6858423739350478, + "grad_norm": 4.497537136077881, + "learning_rate": 3.148014711033275e-05, + "loss": 1.0239, + "num_input_tokens_seen": 157552136, + "step": 9791 + }, + { + "epoch": 0.685912422180777, + "grad_norm": 5.154428005218506, + "learning_rate": 3.147314886164624e-05, + "loss": 1.3121, + "num_input_tokens_seen": 157568520, + "step": 9792 + }, + { + "epoch": 0.6859824704265063, + "grad_norm": 5.402700901031494, + "learning_rate": 3.146615061295972e-05, + "loss": 0.7471, + "num_input_tokens_seen": 157583768, + "step": 9793 + }, + { + "epoch": 0.6860525186722355, + "grad_norm": 3.83581805229187, + "learning_rate": 3.145915236427321e-05, + "loss": 1.185, + "num_input_tokens_seen": 157600152, + "step": 9794 + }, + { + "epoch": 0.6861225669179647, + "grad_norm": 4.120491027832031, + "learning_rate": 3.1452154115586696e-05, + "loss": 1.0647, + "num_input_tokens_seen": 157616312, + "step": 9795 + }, + { + "epoch": 0.686192615163694, + "grad_norm": 3.6193275451660156, + "learning_rate": 3.144515586690018e-05, + "loss": 1.0026, + "num_input_tokens_seen": 157632240, + "step": 9796 + }, + { + "epoch": 0.6862626634094232, + "grad_norm": 5.425684452056885, + "learning_rate": 3.143815761821367e-05, + "loss": 1.0789, + "num_input_tokens_seen": 157648216, + "step": 9797 + }, + { + "epoch": 0.6863327116551525, + "grad_norm": 4.484485149383545, + "learning_rate": 3.143115936952714e-05, + "loss": 1.0247, + "num_input_tokens_seen": 157663960, + "step": 9798 + }, + { + "epoch": 0.6864027599008817, + "grad_norm": 3.604705572128296, + "learning_rate": 3.142416112084064e-05, + "loss": 0.9853, + "num_input_tokens_seen": 157679528, + "step": 9799 + }, + { + "epoch": 0.6864728081466109, + "grad_norm": 3.754070520401001, + "learning_rate": 3.1417162872154126e-05, + "loss": 1.0059, + "num_input_tokens_seen": 157695912, + "step": 9800 + }, + { + "epoch": 0.6864728081466109, + "eval_loss": 1.1177294254302979, + "eval_runtime": 0.1852, + "eval_samples_per_second": 5.398, + "eval_steps_per_second": 5.398, + "num_input_tokens_seen": 157695912, + "step": 9800 + }, + { + "epoch": 0.6865428563923403, + "grad_norm": 4.581052780151367, + "learning_rate": 3.14101646234676e-05, + "loss": 1.0741, + "num_input_tokens_seen": 157711648, + "step": 9801 + }, + { + "epoch": 0.6866129046380695, + "grad_norm": 4.674559116363525, + "learning_rate": 3.140316637478109e-05, + "loss": 1.2022, + "num_input_tokens_seen": 157727880, + "step": 9802 + }, + { + "epoch": 0.6866829528837988, + "grad_norm": 4.420334815979004, + "learning_rate": 3.139616812609457e-05, + "loss": 0.9036, + "num_input_tokens_seen": 157743320, + "step": 9803 + }, + { + "epoch": 0.686753001129528, + "grad_norm": 3.9861433506011963, + "learning_rate": 3.138916987740806e-05, + "loss": 1.0934, + "num_input_tokens_seen": 157759704, + "step": 9804 + }, + { + "epoch": 0.6868230493752572, + "grad_norm": 3.53330135345459, + "learning_rate": 3.138217162872155e-05, + "loss": 0.9765, + "num_input_tokens_seen": 157775760, + "step": 9805 + }, + { + "epoch": 0.6868930976209865, + "grad_norm": 6.029666423797607, + "learning_rate": 3.137517338003503e-05, + "loss": 1.1081, + "num_input_tokens_seen": 157792144, + "step": 9806 + }, + { + "epoch": 0.6869631458667157, + "grad_norm": 3.950190782546997, + "learning_rate": 3.136817513134852e-05, + "loss": 1.1905, + "num_input_tokens_seen": 157808528, + "step": 9807 + }, + { + "epoch": 0.6870331941124449, + "grad_norm": 5.544137954711914, + "learning_rate": 3.1361176882661994e-05, + "loss": 1.1867, + "num_input_tokens_seen": 157823896, + "step": 9808 + }, + { + "epoch": 0.6871032423581742, + "grad_norm": 3.8273894786834717, + "learning_rate": 3.135417863397549e-05, + "loss": 0.98, + "num_input_tokens_seen": 157839272, + "step": 9809 + }, + { + "epoch": 0.6871732906039034, + "grad_norm": 3.6053881645202637, + "learning_rate": 3.1347180385288964e-05, + "loss": 1.0828, + "num_input_tokens_seen": 157855656, + "step": 9810 + }, + { + "epoch": 0.6872433388496327, + "grad_norm": 4.042213439941406, + "learning_rate": 3.134018213660245e-05, + "loss": 0.9724, + "num_input_tokens_seen": 157871032, + "step": 9811 + }, + { + "epoch": 0.6873133870953619, + "grad_norm": 4.079894542694092, + "learning_rate": 3.133318388791594e-05, + "loss": 0.9703, + "num_input_tokens_seen": 157887416, + "step": 9812 + }, + { + "epoch": 0.6873834353410911, + "grad_norm": 3.9334349632263184, + "learning_rate": 3.132618563922942e-05, + "loss": 1.0362, + "num_input_tokens_seen": 157903800, + "step": 9813 + }, + { + "epoch": 0.6874534835868205, + "grad_norm": 3.788583517074585, + "learning_rate": 3.131918739054291e-05, + "loss": 1.0279, + "num_input_tokens_seen": 157920184, + "step": 9814 + }, + { + "epoch": 0.6875235318325497, + "grad_norm": 4.044598579406738, + "learning_rate": 3.1312189141856394e-05, + "loss": 1.1396, + "num_input_tokens_seen": 157936240, + "step": 9815 + }, + { + "epoch": 0.6875935800782789, + "grad_norm": 3.6873228549957275, + "learning_rate": 3.130519089316988e-05, + "loss": 1.0249, + "num_input_tokens_seen": 157952544, + "step": 9816 + }, + { + "epoch": 0.6876636283240082, + "grad_norm": 3.804030418395996, + "learning_rate": 3.129819264448337e-05, + "loss": 0.8939, + "num_input_tokens_seen": 157968352, + "step": 9817 + }, + { + "epoch": 0.6877336765697374, + "grad_norm": 3.826918840408325, + "learning_rate": 3.1291194395796846e-05, + "loss": 0.9813, + "num_input_tokens_seen": 157984392, + "step": 9818 + }, + { + "epoch": 0.6878037248154667, + "grad_norm": 3.5948212146759033, + "learning_rate": 3.128419614711034e-05, + "loss": 1.109, + "num_input_tokens_seen": 158000776, + "step": 9819 + }, + { + "epoch": 0.6878737730611959, + "grad_norm": 4.4051594734191895, + "learning_rate": 3.1277197898423816e-05, + "loss": 0.8722, + "num_input_tokens_seen": 158017160, + "step": 9820 + }, + { + "epoch": 0.6879438213069251, + "grad_norm": 4.117359638214111, + "learning_rate": 3.1270199649737305e-05, + "loss": 1.1546, + "num_input_tokens_seen": 158033544, + "step": 9821 + }, + { + "epoch": 0.6880138695526544, + "grad_norm": 4.026952743530273, + "learning_rate": 3.1263201401050793e-05, + "loss": 0.9462, + "num_input_tokens_seen": 158048736, + "step": 9822 + }, + { + "epoch": 0.6880839177983836, + "grad_norm": 4.486753940582275, + "learning_rate": 3.1256203152364275e-05, + "loss": 1.0434, + "num_input_tokens_seen": 158064712, + "step": 9823 + }, + { + "epoch": 0.6881539660441128, + "grad_norm": 4.287067890167236, + "learning_rate": 3.1249204903677764e-05, + "loss": 1.0611, + "num_input_tokens_seen": 158081096, + "step": 9824 + }, + { + "epoch": 0.6882240142898421, + "grad_norm": 4.286116123199463, + "learning_rate": 3.1242206654991246e-05, + "loss": 1.0567, + "num_input_tokens_seen": 158097304, + "step": 9825 + }, + { + "epoch": 0.6882940625355713, + "grad_norm": 4.457149505615234, + "learning_rate": 3.1235208406304734e-05, + "loss": 1.0229, + "num_input_tokens_seen": 158113688, + "step": 9826 + }, + { + "epoch": 0.6883641107813007, + "grad_norm": 4.044060230255127, + "learning_rate": 3.122821015761822e-05, + "loss": 0.8945, + "num_input_tokens_seen": 158130072, + "step": 9827 + }, + { + "epoch": 0.6884341590270299, + "grad_norm": 4.375307083129883, + "learning_rate": 3.12212119089317e-05, + "loss": 0.9683, + "num_input_tokens_seen": 158146456, + "step": 9828 + }, + { + "epoch": 0.6885042072727591, + "grad_norm": 4.88633394241333, + "learning_rate": 3.121421366024519e-05, + "loss": 1.0133, + "num_input_tokens_seen": 158162840, + "step": 9829 + }, + { + "epoch": 0.6885742555184884, + "grad_norm": 4.796840667724609, + "learning_rate": 3.120721541155867e-05, + "loss": 1.2833, + "num_input_tokens_seen": 158179224, + "step": 9830 + }, + { + "epoch": 0.6886443037642176, + "grad_norm": 4.500367164611816, + "learning_rate": 3.120021716287216e-05, + "loss": 1.1633, + "num_input_tokens_seen": 158194696, + "step": 9831 + }, + { + "epoch": 0.6887143520099469, + "grad_norm": 3.6117005348205566, + "learning_rate": 3.1193218914185645e-05, + "loss": 1.0584, + "num_input_tokens_seen": 158211080, + "step": 9832 + }, + { + "epoch": 0.6887844002556761, + "grad_norm": 4.293186664581299, + "learning_rate": 3.118622066549913e-05, + "loss": 0.885, + "num_input_tokens_seen": 158226472, + "step": 9833 + }, + { + "epoch": 0.6888544485014053, + "grad_norm": 5.183225154876709, + "learning_rate": 3.1179222416812616e-05, + "loss": 0.9569, + "num_input_tokens_seen": 158242856, + "step": 9834 + }, + { + "epoch": 0.6889244967471346, + "grad_norm": 6.489871501922607, + "learning_rate": 3.11722241681261e-05, + "loss": 0.9373, + "num_input_tokens_seen": 158258160, + "step": 9835 + }, + { + "epoch": 0.6889945449928638, + "grad_norm": 3.918494701385498, + "learning_rate": 3.1165225919439586e-05, + "loss": 0.943, + "num_input_tokens_seen": 158274120, + "step": 9836 + }, + { + "epoch": 0.689064593238593, + "grad_norm": 3.6617259979248047, + "learning_rate": 3.115822767075306e-05, + "loss": 1.0878, + "num_input_tokens_seen": 158289856, + "step": 9837 + }, + { + "epoch": 0.6891346414843224, + "grad_norm": 3.6620688438415527, + "learning_rate": 3.115122942206655e-05, + "loss": 1.071, + "num_input_tokens_seen": 158306240, + "step": 9838 + }, + { + "epoch": 0.6892046897300516, + "grad_norm": 3.364589214324951, + "learning_rate": 3.1144231173380045e-05, + "loss": 0.9745, + "num_input_tokens_seen": 158322624, + "step": 9839 + }, + { + "epoch": 0.6892747379757809, + "grad_norm": 4.892593860626221, + "learning_rate": 3.113723292469352e-05, + "loss": 1.1136, + "num_input_tokens_seen": 158337888, + "step": 9840 + }, + { + "epoch": 0.6893447862215101, + "grad_norm": 6.718249797821045, + "learning_rate": 3.113023467600701e-05, + "loss": 1.1577, + "num_input_tokens_seen": 158354272, + "step": 9841 + }, + { + "epoch": 0.6894148344672393, + "grad_norm": 5.81899356842041, + "learning_rate": 3.112323642732049e-05, + "loss": 1.0319, + "num_input_tokens_seen": 158370656, + "step": 9842 + }, + { + "epoch": 0.6894848827129686, + "grad_norm": 4.793848514556885, + "learning_rate": 3.111623817863398e-05, + "loss": 1.1874, + "num_input_tokens_seen": 158387040, + "step": 9843 + }, + { + "epoch": 0.6895549309586978, + "grad_norm": 4.389505863189697, + "learning_rate": 3.110923992994747e-05, + "loss": 1.2588, + "num_input_tokens_seen": 158403360, + "step": 9844 + }, + { + "epoch": 0.689624979204427, + "grad_norm": 3.752194881439209, + "learning_rate": 3.110224168126095e-05, + "loss": 1.021, + "num_input_tokens_seen": 158419744, + "step": 9845 + }, + { + "epoch": 0.6896950274501563, + "grad_norm": 3.7187631130218506, + "learning_rate": 3.109524343257444e-05, + "loss": 0.9153, + "num_input_tokens_seen": 158435760, + "step": 9846 + }, + { + "epoch": 0.6897650756958855, + "grad_norm": 4.190175533294678, + "learning_rate": 3.108824518388791e-05, + "loss": 1.2234, + "num_input_tokens_seen": 158452144, + "step": 9847 + }, + { + "epoch": 0.6898351239416148, + "grad_norm": 5.718031883239746, + "learning_rate": 3.10812469352014e-05, + "loss": 0.9419, + "num_input_tokens_seen": 158467000, + "step": 9848 + }, + { + "epoch": 0.689905172187344, + "grad_norm": 3.7210283279418945, + "learning_rate": 3.10742486865149e-05, + "loss": 1.0347, + "num_input_tokens_seen": 158483384, + "step": 9849 + }, + { + "epoch": 0.6899752204330732, + "grad_norm": 4.186606407165527, + "learning_rate": 3.106725043782837e-05, + "loss": 0.9937, + "num_input_tokens_seen": 158499264, + "step": 9850 + }, + { + "epoch": 0.6900452686788026, + "grad_norm": 4.367093086242676, + "learning_rate": 3.106025218914186e-05, + "loss": 1.03, + "num_input_tokens_seen": 158514376, + "step": 9851 + }, + { + "epoch": 0.6901153169245318, + "grad_norm": 3.856431245803833, + "learning_rate": 3.105325394045534e-05, + "loss": 1.1087, + "num_input_tokens_seen": 158529696, + "step": 9852 + }, + { + "epoch": 0.690185365170261, + "grad_norm": 4.161552429199219, + "learning_rate": 3.104625569176883e-05, + "loss": 1.1218, + "num_input_tokens_seen": 158545696, + "step": 9853 + }, + { + "epoch": 0.6902554134159903, + "grad_norm": 3.5685617923736572, + "learning_rate": 3.103925744308232e-05, + "loss": 1.0921, + "num_input_tokens_seen": 158562080, + "step": 9854 + }, + { + "epoch": 0.6903254616617195, + "grad_norm": 3.916200876235962, + "learning_rate": 3.10322591943958e-05, + "loss": 0.8789, + "num_input_tokens_seen": 158578464, + "step": 9855 + }, + { + "epoch": 0.6903955099074488, + "grad_norm": 5.1914496421813965, + "learning_rate": 3.102526094570929e-05, + "loss": 1.0995, + "num_input_tokens_seen": 158594848, + "step": 9856 + }, + { + "epoch": 0.690465558153178, + "grad_norm": 6.285881996154785, + "learning_rate": 3.1018262697022765e-05, + "loss": 0.9264, + "num_input_tokens_seen": 158610064, + "step": 9857 + }, + { + "epoch": 0.6905356063989072, + "grad_norm": 4.552140712738037, + "learning_rate": 3.1011264448336254e-05, + "loss": 0.9701, + "num_input_tokens_seen": 158624504, + "step": 9858 + }, + { + "epoch": 0.6906056546446365, + "grad_norm": 5.056686878204346, + "learning_rate": 3.100426619964975e-05, + "loss": 1.0656, + "num_input_tokens_seen": 158640360, + "step": 9859 + }, + { + "epoch": 0.6906757028903657, + "grad_norm": 8.140069007873535, + "learning_rate": 3.0997267950963224e-05, + "loss": 1.1361, + "num_input_tokens_seen": 158656744, + "step": 9860 + }, + { + "epoch": 0.6907457511360949, + "grad_norm": 4.607056617736816, + "learning_rate": 3.099026970227671e-05, + "loss": 1.0282, + "num_input_tokens_seen": 158672576, + "step": 9861 + }, + { + "epoch": 0.6908157993818242, + "grad_norm": 5.533421039581299, + "learning_rate": 3.0983271453590195e-05, + "loss": 1.1357, + "num_input_tokens_seen": 158687136, + "step": 9862 + }, + { + "epoch": 0.6908858476275535, + "grad_norm": 4.765280723571777, + "learning_rate": 3.097627320490368e-05, + "loss": 1.0519, + "num_input_tokens_seen": 158702584, + "step": 9863 + }, + { + "epoch": 0.6909558958732828, + "grad_norm": 3.8743479251861572, + "learning_rate": 3.096927495621716e-05, + "loss": 0.9754, + "num_input_tokens_seen": 158717832, + "step": 9864 + }, + { + "epoch": 0.691025944119012, + "grad_norm": 4.586508274078369, + "learning_rate": 3.0962276707530654e-05, + "loss": 1.1755, + "num_input_tokens_seen": 158734032, + "step": 9865 + }, + { + "epoch": 0.6910959923647412, + "grad_norm": 5.059841156005859, + "learning_rate": 3.095527845884414e-05, + "loss": 0.9512, + "num_input_tokens_seen": 158750416, + "step": 9866 + }, + { + "epoch": 0.6911660406104705, + "grad_norm": 3.5932974815368652, + "learning_rate": 3.094828021015762e-05, + "loss": 1.0244, + "num_input_tokens_seen": 158765880, + "step": 9867 + }, + { + "epoch": 0.6912360888561997, + "grad_norm": 3.622814893722534, + "learning_rate": 3.0941281961471106e-05, + "loss": 0.9653, + "num_input_tokens_seen": 158782264, + "step": 9868 + }, + { + "epoch": 0.691306137101929, + "grad_norm": 4.406857013702393, + "learning_rate": 3.093428371278459e-05, + "loss": 1.2304, + "num_input_tokens_seen": 158798648, + "step": 9869 + }, + { + "epoch": 0.6913761853476582, + "grad_norm": 5.478177070617676, + "learning_rate": 3.0927285464098076e-05, + "loss": 0.9475, + "num_input_tokens_seen": 158814744, + "step": 9870 + }, + { + "epoch": 0.6914462335933874, + "grad_norm": 4.930840015411377, + "learning_rate": 3.0920287215411565e-05, + "loss": 1.0326, + "num_input_tokens_seen": 158830544, + "step": 9871 + }, + { + "epoch": 0.6915162818391167, + "grad_norm": 4.093355178833008, + "learning_rate": 3.091328896672505e-05, + "loss": 1.1184, + "num_input_tokens_seen": 158846928, + "step": 9872 + }, + { + "epoch": 0.6915863300848459, + "grad_norm": 5.412513732910156, + "learning_rate": 3.0906290718038535e-05, + "loss": 1.199, + "num_input_tokens_seen": 158862288, + "step": 9873 + }, + { + "epoch": 0.6916563783305751, + "grad_norm": 3.9262020587921143, + "learning_rate": 3.089929246935201e-05, + "loss": 1.0103, + "num_input_tokens_seen": 158878360, + "step": 9874 + }, + { + "epoch": 0.6917264265763045, + "grad_norm": 4.176063060760498, + "learning_rate": 3.0892294220665506e-05, + "loss": 1.0794, + "num_input_tokens_seen": 158894744, + "step": 9875 + }, + { + "epoch": 0.6917964748220337, + "grad_norm": 3.5418083667755127, + "learning_rate": 3.0885295971978994e-05, + "loss": 1.1219, + "num_input_tokens_seen": 158911128, + "step": 9876 + }, + { + "epoch": 0.691866523067763, + "grad_norm": 4.5964789390563965, + "learning_rate": 3.087829772329247e-05, + "loss": 1.0354, + "num_input_tokens_seen": 158926976, + "step": 9877 + }, + { + "epoch": 0.6919365713134922, + "grad_norm": 5.64787483215332, + "learning_rate": 3.087129947460596e-05, + "loss": 1.377, + "num_input_tokens_seen": 158941272, + "step": 9878 + }, + { + "epoch": 0.6920066195592214, + "grad_norm": 3.6600422859191895, + "learning_rate": 3.086430122591944e-05, + "loss": 0.9638, + "num_input_tokens_seen": 158957248, + "step": 9879 + }, + { + "epoch": 0.6920766678049507, + "grad_norm": 3.4376487731933594, + "learning_rate": 3.085730297723293e-05, + "loss": 0.9191, + "num_input_tokens_seen": 158973632, + "step": 9880 + }, + { + "epoch": 0.6921467160506799, + "grad_norm": 3.4287545680999756, + "learning_rate": 3.085030472854642e-05, + "loss": 0.926, + "num_input_tokens_seen": 158989960, + "step": 9881 + }, + { + "epoch": 0.6922167642964091, + "grad_norm": 5.2852783203125, + "learning_rate": 3.08433064798599e-05, + "loss": 1.0603, + "num_input_tokens_seen": 159006264, + "step": 9882 + }, + { + "epoch": 0.6922868125421384, + "grad_norm": 3.751561164855957, + "learning_rate": 3.083630823117339e-05, + "loss": 0.9054, + "num_input_tokens_seen": 159022648, + "step": 9883 + }, + { + "epoch": 0.6923568607878676, + "grad_norm": 4.51289701461792, + "learning_rate": 3.082930998248686e-05, + "loss": 0.9889, + "num_input_tokens_seen": 159038568, + "step": 9884 + }, + { + "epoch": 0.6924269090335969, + "grad_norm": 4.42695426940918, + "learning_rate": 3.082231173380036e-05, + "loss": 1.1481, + "num_input_tokens_seen": 159053992, + "step": 9885 + }, + { + "epoch": 0.6924969572793261, + "grad_norm": 4.036323070526123, + "learning_rate": 3.081531348511383e-05, + "loss": 1.1686, + "num_input_tokens_seen": 159070256, + "step": 9886 + }, + { + "epoch": 0.6925670055250553, + "grad_norm": 4.326934814453125, + "learning_rate": 3.080831523642732e-05, + "loss": 1.3935, + "num_input_tokens_seen": 159086640, + "step": 9887 + }, + { + "epoch": 0.6926370537707847, + "grad_norm": 4.341803073883057, + "learning_rate": 3.080131698774081e-05, + "loss": 0.9545, + "num_input_tokens_seen": 159102880, + "step": 9888 + }, + { + "epoch": 0.6927071020165139, + "grad_norm": 3.982081413269043, + "learning_rate": 3.079431873905429e-05, + "loss": 1.1245, + "num_input_tokens_seen": 159119136, + "step": 9889 + }, + { + "epoch": 0.6927771502622431, + "grad_norm": 3.9021053314208984, + "learning_rate": 3.078732049036778e-05, + "loss": 1.1032, + "num_input_tokens_seen": 159135520, + "step": 9890 + }, + { + "epoch": 0.6928471985079724, + "grad_norm": 6.730637073516846, + "learning_rate": 3.078032224168126e-05, + "loss": 1.0817, + "num_input_tokens_seen": 159150928, + "step": 9891 + }, + { + "epoch": 0.6929172467537016, + "grad_norm": 5.079481601715088, + "learning_rate": 3.077332399299475e-05, + "loss": 1.092, + "num_input_tokens_seen": 159166680, + "step": 9892 + }, + { + "epoch": 0.6929872949994309, + "grad_norm": 3.9414215087890625, + "learning_rate": 3.076632574430824e-05, + "loss": 1.0367, + "num_input_tokens_seen": 159183064, + "step": 9893 + }, + { + "epoch": 0.6930573432451601, + "grad_norm": 3.4023585319519043, + "learning_rate": 3.0759327495621714e-05, + "loss": 1.0794, + "num_input_tokens_seen": 159199104, + "step": 9894 + }, + { + "epoch": 0.6931273914908893, + "grad_norm": 3.946047067642212, + "learning_rate": 3.075232924693521e-05, + "loss": 1.0406, + "num_input_tokens_seen": 159215488, + "step": 9895 + }, + { + "epoch": 0.6931974397366186, + "grad_norm": 5.388001441955566, + "learning_rate": 3.0745330998248685e-05, + "loss": 1.0193, + "num_input_tokens_seen": 159231472, + "step": 9896 + }, + { + "epoch": 0.6932674879823478, + "grad_norm": 4.3835129737854, + "learning_rate": 3.0738332749562173e-05, + "loss": 1.0889, + "num_input_tokens_seen": 159247856, + "step": 9897 + }, + { + "epoch": 0.693337536228077, + "grad_norm": 4.293518543243408, + "learning_rate": 3.073133450087566e-05, + "loss": 1.0315, + "num_input_tokens_seen": 159264032, + "step": 9898 + }, + { + "epoch": 0.6934075844738063, + "grad_norm": 3.974782705307007, + "learning_rate": 3.0724336252189144e-05, + "loss": 0.9496, + "num_input_tokens_seen": 159280152, + "step": 9899 + }, + { + "epoch": 0.6934776327195356, + "grad_norm": 4.195755958557129, + "learning_rate": 3.071733800350263e-05, + "loss": 0.9556, + "num_input_tokens_seen": 159296536, + "step": 9900 + }, + { + "epoch": 0.6935476809652649, + "grad_norm": 3.6928093433380127, + "learning_rate": 3.0710339754816114e-05, + "loss": 0.9666, + "num_input_tokens_seen": 159312920, + "step": 9901 + }, + { + "epoch": 0.6936177292109941, + "grad_norm": 3.8017020225524902, + "learning_rate": 3.07033415061296e-05, + "loss": 1.0447, + "num_input_tokens_seen": 159328432, + "step": 9902 + }, + { + "epoch": 0.6936877774567233, + "grad_norm": 4.495299816131592, + "learning_rate": 3.069634325744309e-05, + "loss": 1.2429, + "num_input_tokens_seen": 159344816, + "step": 9903 + }, + { + "epoch": 0.6937578257024526, + "grad_norm": 5.789355754852295, + "learning_rate": 3.0689345008756566e-05, + "loss": 1.0239, + "num_input_tokens_seen": 159361200, + "step": 9904 + }, + { + "epoch": 0.6938278739481818, + "grad_norm": 4.642138481140137, + "learning_rate": 3.0682346760070055e-05, + "loss": 0.9583, + "num_input_tokens_seen": 159377264, + "step": 9905 + }, + { + "epoch": 0.6938979221939111, + "grad_norm": 3.754173517227173, + "learning_rate": 3.067534851138354e-05, + "loss": 1.0314, + "num_input_tokens_seen": 159393040, + "step": 9906 + }, + { + "epoch": 0.6939679704396403, + "grad_norm": 3.646178960800171, + "learning_rate": 3.0668350262697025e-05, + "loss": 1.0091, + "num_input_tokens_seen": 159409304, + "step": 9907 + }, + { + "epoch": 0.6940380186853695, + "grad_norm": 4.762131690979004, + "learning_rate": 3.0661352014010514e-05, + "loss": 0.8843, + "num_input_tokens_seen": 159424968, + "step": 9908 + }, + { + "epoch": 0.6941080669310988, + "grad_norm": 4.467316150665283, + "learning_rate": 3.0654353765323996e-05, + "loss": 0.9526, + "num_input_tokens_seen": 159439800, + "step": 9909 + }, + { + "epoch": 0.694178115176828, + "grad_norm": 4.280580043792725, + "learning_rate": 3.0647355516637484e-05, + "loss": 0.9283, + "num_input_tokens_seen": 159456184, + "step": 9910 + }, + { + "epoch": 0.6942481634225572, + "grad_norm": 4.713488578796387, + "learning_rate": 3.0640357267950966e-05, + "loss": 1.0294, + "num_input_tokens_seen": 159471712, + "step": 9911 + }, + { + "epoch": 0.6943182116682866, + "grad_norm": 5.159468650817871, + "learning_rate": 3.0633359019264455e-05, + "loss": 0.9041, + "num_input_tokens_seen": 159487648, + "step": 9912 + }, + { + "epoch": 0.6943882599140158, + "grad_norm": 3.66630220413208, + "learning_rate": 3.062636077057793e-05, + "loss": 0.9883, + "num_input_tokens_seen": 159502912, + "step": 9913 + }, + { + "epoch": 0.6944583081597451, + "grad_norm": 3.744642734527588, + "learning_rate": 3.061936252189142e-05, + "loss": 0.9382, + "num_input_tokens_seen": 159519296, + "step": 9914 + }, + { + "epoch": 0.6945283564054743, + "grad_norm": 5.04609489440918, + "learning_rate": 3.061236427320491e-05, + "loss": 0.8768, + "num_input_tokens_seen": 159535680, + "step": 9915 + }, + { + "epoch": 0.6945984046512035, + "grad_norm": 3.8748037815093994, + "learning_rate": 3.060536602451839e-05, + "loss": 0.9885, + "num_input_tokens_seen": 159551984, + "step": 9916 + }, + { + "epoch": 0.6946684528969328, + "grad_norm": 3.5732550621032715, + "learning_rate": 3.059836777583188e-05, + "loss": 0.91, + "num_input_tokens_seen": 159568272, + "step": 9917 + }, + { + "epoch": 0.694738501142662, + "grad_norm": 3.7340505123138428, + "learning_rate": 3.059136952714535e-05, + "loss": 0.8805, + "num_input_tokens_seen": 159583464, + "step": 9918 + }, + { + "epoch": 0.6948085493883912, + "grad_norm": 5.672329425811768, + "learning_rate": 3.058437127845885e-05, + "loss": 0.9648, + "num_input_tokens_seen": 159599848, + "step": 9919 + }, + { + "epoch": 0.6948785976341205, + "grad_norm": 4.033182621002197, + "learning_rate": 3.0577373029772336e-05, + "loss": 1.1314, + "num_input_tokens_seen": 159616232, + "step": 9920 + }, + { + "epoch": 0.6949486458798497, + "grad_norm": 4.055893898010254, + "learning_rate": 3.057037478108581e-05, + "loss": 1.0322, + "num_input_tokens_seen": 159631864, + "step": 9921 + }, + { + "epoch": 0.695018694125579, + "grad_norm": 3.9675509929656982, + "learning_rate": 3.05633765323993e-05, + "loss": 0.9239, + "num_input_tokens_seen": 159647144, + "step": 9922 + }, + { + "epoch": 0.6950887423713082, + "grad_norm": 3.558152675628662, + "learning_rate": 3.055637828371278e-05, + "loss": 1.11, + "num_input_tokens_seen": 159662816, + "step": 9923 + }, + { + "epoch": 0.6951587906170374, + "grad_norm": 4.272154331207275, + "learning_rate": 3.054938003502627e-05, + "loss": 0.8675, + "num_input_tokens_seen": 159678848, + "step": 9924 + }, + { + "epoch": 0.6952288388627668, + "grad_norm": 4.0221333503723145, + "learning_rate": 3.054238178633976e-05, + "loss": 1.1281, + "num_input_tokens_seen": 159694472, + "step": 9925 + }, + { + "epoch": 0.695298887108496, + "grad_norm": 3.8302323818206787, + "learning_rate": 3.053538353765324e-05, + "loss": 0.9524, + "num_input_tokens_seen": 159710456, + "step": 9926 + }, + { + "epoch": 0.6953689353542252, + "grad_norm": 4.294325828552246, + "learning_rate": 3.052838528896673e-05, + "loss": 1.0062, + "num_input_tokens_seen": 159725664, + "step": 9927 + }, + { + "epoch": 0.6954389835999545, + "grad_norm": 4.0839433670043945, + "learning_rate": 3.0521387040280205e-05, + "loss": 1.0537, + "num_input_tokens_seen": 159742048, + "step": 9928 + }, + { + "epoch": 0.6955090318456837, + "grad_norm": 8.5400972366333, + "learning_rate": 3.0514388791593697e-05, + "loss": 1.1738, + "num_input_tokens_seen": 159758432, + "step": 9929 + }, + { + "epoch": 0.695579080091413, + "grad_norm": 3.7397027015686035, + "learning_rate": 3.0507390542907182e-05, + "loss": 0.9087, + "num_input_tokens_seen": 159774816, + "step": 9930 + }, + { + "epoch": 0.6956491283371422, + "grad_norm": 6.724664688110352, + "learning_rate": 3.0500392294220664e-05, + "loss": 0.9931, + "num_input_tokens_seen": 159790408, + "step": 9931 + }, + { + "epoch": 0.6957191765828714, + "grad_norm": 6.519052982330322, + "learning_rate": 3.0493394045534152e-05, + "loss": 1.1114, + "num_input_tokens_seen": 159806544, + "step": 9932 + }, + { + "epoch": 0.6957892248286007, + "grad_norm": 3.7849678993225098, + "learning_rate": 3.0486395796847634e-05, + "loss": 1.0276, + "num_input_tokens_seen": 159822824, + "step": 9933 + }, + { + "epoch": 0.6958592730743299, + "grad_norm": 5.704104900360107, + "learning_rate": 3.047939754816112e-05, + "loss": 1.109, + "num_input_tokens_seen": 159839208, + "step": 9934 + }, + { + "epoch": 0.6959293213200591, + "grad_norm": 4.74746036529541, + "learning_rate": 3.0472399299474615e-05, + "loss": 1.0648, + "num_input_tokens_seen": 159855472, + "step": 9935 + }, + { + "epoch": 0.6959993695657885, + "grad_norm": 3.6369898319244385, + "learning_rate": 3.046540105078809e-05, + "loss": 0.976, + "num_input_tokens_seen": 159871856, + "step": 9936 + }, + { + "epoch": 0.6960694178115177, + "grad_norm": 7.172027587890625, + "learning_rate": 3.0458402802101578e-05, + "loss": 1.0349, + "num_input_tokens_seen": 159888240, + "step": 9937 + }, + { + "epoch": 0.696139466057247, + "grad_norm": 4.013808727264404, + "learning_rate": 3.045140455341506e-05, + "loss": 1.1096, + "num_input_tokens_seen": 159904624, + "step": 9938 + }, + { + "epoch": 0.6962095143029762, + "grad_norm": 4.1765265464782715, + "learning_rate": 3.0444406304728552e-05, + "loss": 1.1875, + "num_input_tokens_seen": 159921008, + "step": 9939 + }, + { + "epoch": 0.6962795625487054, + "grad_norm": 4.282069206237793, + "learning_rate": 3.0437408056042027e-05, + "loss": 0.9603, + "num_input_tokens_seen": 159937392, + "step": 9940 + }, + { + "epoch": 0.6963496107944347, + "grad_norm": 6.7467570304870605, + "learning_rate": 3.0430409807355516e-05, + "loss": 0.9501, + "num_input_tokens_seen": 159952432, + "step": 9941 + }, + { + "epoch": 0.6964196590401639, + "grad_norm": 6.585689544677734, + "learning_rate": 3.0423411558669e-05, + "loss": 1.0867, + "num_input_tokens_seen": 159968816, + "step": 9942 + }, + { + "epoch": 0.6964897072858932, + "grad_norm": 4.122556209564209, + "learning_rate": 3.0416413309982483e-05, + "loss": 1.1605, + "num_input_tokens_seen": 159984808, + "step": 9943 + }, + { + "epoch": 0.6965597555316224, + "grad_norm": 6.386577606201172, + "learning_rate": 3.0409415061295975e-05, + "loss": 1.0542, + "num_input_tokens_seen": 160000224, + "step": 9944 + }, + { + "epoch": 0.6966298037773516, + "grad_norm": 4.382735729217529, + "learning_rate": 3.0402416812609453e-05, + "loss": 1.0503, + "num_input_tokens_seen": 160016608, + "step": 9945 + }, + { + "epoch": 0.6966998520230809, + "grad_norm": 3.7992031574249268, + "learning_rate": 3.0395418563922938e-05, + "loss": 0.9457, + "num_input_tokens_seen": 160032992, + "step": 9946 + }, + { + "epoch": 0.6967699002688101, + "grad_norm": 3.7470431327819824, + "learning_rate": 3.0388420315236434e-05, + "loss": 1.1664, + "num_input_tokens_seen": 160049376, + "step": 9947 + }, + { + "epoch": 0.6968399485145393, + "grad_norm": 3.7542309761047363, + "learning_rate": 3.038142206654991e-05, + "loss": 1.0668, + "num_input_tokens_seen": 160065736, + "step": 9948 + }, + { + "epoch": 0.6969099967602687, + "grad_norm": 5.810544013977051, + "learning_rate": 3.03744238178634e-05, + "loss": 1.0868, + "num_input_tokens_seen": 160081528, + "step": 9949 + }, + { + "epoch": 0.6969800450059979, + "grad_norm": 5.0998406410217285, + "learning_rate": 3.0367425569176876e-05, + "loss": 0.906, + "num_input_tokens_seen": 160097600, + "step": 9950 + }, + { + "epoch": 0.6970500932517272, + "grad_norm": 4.198676586151123, + "learning_rate": 3.036042732049037e-05, + "loss": 1.0983, + "num_input_tokens_seen": 160113984, + "step": 9951 + }, + { + "epoch": 0.6971201414974564, + "grad_norm": 3.7066121101379395, + "learning_rate": 3.0353429071803856e-05, + "loss": 0.9996, + "num_input_tokens_seen": 160129296, + "step": 9952 + }, + { + "epoch": 0.6971901897431856, + "grad_norm": 4.445394992828369, + "learning_rate": 3.0346430823117335e-05, + "loss": 1.0595, + "num_input_tokens_seen": 160145216, + "step": 9953 + }, + { + "epoch": 0.6972602379889149, + "grad_norm": 4.308450222015381, + "learning_rate": 3.0339432574430827e-05, + "loss": 1.0971, + "num_input_tokens_seen": 160160584, + "step": 9954 + }, + { + "epoch": 0.6973302862346441, + "grad_norm": 3.5946972370147705, + "learning_rate": 3.033243432574431e-05, + "loss": 0.9127, + "num_input_tokens_seen": 160176960, + "step": 9955 + }, + { + "epoch": 0.6974003344803733, + "grad_norm": 3.5399606227874756, + "learning_rate": 3.0325436077057794e-05, + "loss": 0.8822, + "num_input_tokens_seen": 160192424, + "step": 9956 + }, + { + "epoch": 0.6974703827261026, + "grad_norm": 4.733034610748291, + "learning_rate": 3.0318437828371282e-05, + "loss": 1.071, + "num_input_tokens_seen": 160208720, + "step": 9957 + }, + { + "epoch": 0.6975404309718318, + "grad_norm": 4.197108268737793, + "learning_rate": 3.0311439579684757e-05, + "loss": 0.9587, + "num_input_tokens_seen": 160225104, + "step": 9958 + }, + { + "epoch": 0.6976104792175611, + "grad_norm": 3.231484889984131, + "learning_rate": 3.0304441330998253e-05, + "loss": 0.7644, + "num_input_tokens_seen": 160241488, + "step": 9959 + }, + { + "epoch": 0.6976805274632903, + "grad_norm": 3.5635123252868652, + "learning_rate": 3.0297443082311734e-05, + "loss": 1.0122, + "num_input_tokens_seen": 160257616, + "step": 9960 + }, + { + "epoch": 0.6977505757090195, + "grad_norm": 4.850098609924316, + "learning_rate": 3.029044483362522e-05, + "loss": 1.0965, + "num_input_tokens_seen": 160274000, + "step": 9961 + }, + { + "epoch": 0.6978206239547489, + "grad_norm": 4.113264083862305, + "learning_rate": 3.0283446584938708e-05, + "loss": 1.1065, + "num_input_tokens_seen": 160290384, + "step": 9962 + }, + { + "epoch": 0.6978906722004781, + "grad_norm": 3.8856465816497803, + "learning_rate": 3.027644833625219e-05, + "loss": 0.9973, + "num_input_tokens_seen": 160306088, + "step": 9963 + }, + { + "epoch": 0.6979607204462073, + "grad_norm": 3.9685230255126953, + "learning_rate": 3.0269450087565675e-05, + "loss": 1.0881, + "num_input_tokens_seen": 160322472, + "step": 9964 + }, + { + "epoch": 0.6980307686919366, + "grad_norm": 4.484827995300293, + "learning_rate": 3.0262451838879157e-05, + "loss": 0.9674, + "num_input_tokens_seen": 160338632, + "step": 9965 + }, + { + "epoch": 0.6981008169376658, + "grad_norm": 3.810039520263672, + "learning_rate": 3.0255453590192646e-05, + "loss": 1.0702, + "num_input_tokens_seen": 160355016, + "step": 9966 + }, + { + "epoch": 0.6981708651833951, + "grad_norm": 4.685659408569336, + "learning_rate": 3.0248455341506127e-05, + "loss": 1.0486, + "num_input_tokens_seen": 160371400, + "step": 9967 + }, + { + "epoch": 0.6982409134291243, + "grad_norm": 4.03449010848999, + "learning_rate": 3.0241457092819613e-05, + "loss": 0.8593, + "num_input_tokens_seen": 160387784, + "step": 9968 + }, + { + "epoch": 0.6983109616748535, + "grad_norm": 4.901271820068359, + "learning_rate": 3.0234458844133108e-05, + "loss": 0.9302, + "num_input_tokens_seen": 160404160, + "step": 9969 + }, + { + "epoch": 0.6983810099205828, + "grad_norm": 3.7507076263427734, + "learning_rate": 3.0227460595446583e-05, + "loss": 1.0123, + "num_input_tokens_seen": 160420352, + "step": 9970 + }, + { + "epoch": 0.698451058166312, + "grad_norm": 4.233783721923828, + "learning_rate": 3.022046234676007e-05, + "loss": 1.17, + "num_input_tokens_seen": 160436424, + "step": 9971 + }, + { + "epoch": 0.6985211064120413, + "grad_norm": 4.30189323425293, + "learning_rate": 3.021346409807355e-05, + "loss": 1.135, + "num_input_tokens_seen": 160452808, + "step": 9972 + }, + { + "epoch": 0.6985911546577706, + "grad_norm": 3.7838134765625, + "learning_rate": 3.020646584938704e-05, + "loss": 0.9985, + "num_input_tokens_seen": 160469168, + "step": 9973 + }, + { + "epoch": 0.6986612029034998, + "grad_norm": 4.582680702209473, + "learning_rate": 3.019946760070053e-05, + "loss": 1.0683, + "num_input_tokens_seen": 160485048, + "step": 9974 + }, + { + "epoch": 0.6987312511492291, + "grad_norm": 3.8949317932128906, + "learning_rate": 3.019246935201401e-05, + "loss": 1.109, + "num_input_tokens_seen": 160501432, + "step": 9975 + }, + { + "epoch": 0.6988012993949583, + "grad_norm": 3.563477039337158, + "learning_rate": 3.0185471103327494e-05, + "loss": 0.9017, + "num_input_tokens_seen": 160517816, + "step": 9976 + }, + { + "epoch": 0.6988713476406875, + "grad_norm": 4.100624084472656, + "learning_rate": 3.0178472854640976e-05, + "loss": 1.1129, + "num_input_tokens_seen": 160533864, + "step": 9977 + }, + { + "epoch": 0.6989413958864168, + "grad_norm": 4.587320804595947, + "learning_rate": 3.0171474605954465e-05, + "loss": 1.1402, + "num_input_tokens_seen": 160550248, + "step": 9978 + }, + { + "epoch": 0.699011444132146, + "grad_norm": 3.8500516414642334, + "learning_rate": 3.0164476357267957e-05, + "loss": 1.1349, + "num_input_tokens_seen": 160565896, + "step": 9979 + }, + { + "epoch": 0.6990814923778753, + "grad_norm": 3.507326126098633, + "learning_rate": 3.015747810858143e-05, + "loss": 0.9827, + "num_input_tokens_seen": 160581136, + "step": 9980 + }, + { + "epoch": 0.6991515406236045, + "grad_norm": 4.412599563598633, + "learning_rate": 3.0150479859894927e-05, + "loss": 1.1103, + "num_input_tokens_seen": 160597520, + "step": 9981 + }, + { + "epoch": 0.6992215888693337, + "grad_norm": 3.683112621307373, + "learning_rate": 3.0143481611208402e-05, + "loss": 0.9766, + "num_input_tokens_seen": 160613904, + "step": 9982 + }, + { + "epoch": 0.699291637115063, + "grad_norm": 3.94653058052063, + "learning_rate": 3.0136483362521887e-05, + "loss": 0.9589, + "num_input_tokens_seen": 160629072, + "step": 9983 + }, + { + "epoch": 0.6993616853607922, + "grad_norm": 4.542760372161865, + "learning_rate": 3.0129485113835383e-05, + "loss": 1.1109, + "num_input_tokens_seen": 160645456, + "step": 9984 + }, + { + "epoch": 0.6994317336065214, + "grad_norm": 4.816009998321533, + "learning_rate": 3.0122486865148864e-05, + "loss": 0.9927, + "num_input_tokens_seen": 160661472, + "step": 9985 + }, + { + "epoch": 0.6995017818522508, + "grad_norm": 3.939324378967285, + "learning_rate": 3.011548861646235e-05, + "loss": 1.0349, + "num_input_tokens_seen": 160677856, + "step": 9986 + }, + { + "epoch": 0.69957183009798, + "grad_norm": 4.211832523345947, + "learning_rate": 3.0108490367775828e-05, + "loss": 1.2934, + "num_input_tokens_seen": 160694240, + "step": 9987 + }, + { + "epoch": 0.6996418783437093, + "grad_norm": 4.728166580200195, + "learning_rate": 3.0101492119089313e-05, + "loss": 1.2775, + "num_input_tokens_seen": 160710584, + "step": 9988 + }, + { + "epoch": 0.6997119265894385, + "grad_norm": 4.537207126617432, + "learning_rate": 3.0094493870402805e-05, + "loss": 1.2192, + "num_input_tokens_seen": 160726936, + "step": 9989 + }, + { + "epoch": 0.6997819748351677, + "grad_norm": 4.751252174377441, + "learning_rate": 3.0087495621716287e-05, + "loss": 0.9631, + "num_input_tokens_seen": 160743024, + "step": 9990 + }, + { + "epoch": 0.699852023080897, + "grad_norm": 5.063358783721924, + "learning_rate": 3.0080497373029776e-05, + "loss": 1.1344, + "num_input_tokens_seen": 160759408, + "step": 9991 + }, + { + "epoch": 0.6999220713266262, + "grad_norm": 4.767810344696045, + "learning_rate": 3.007349912434325e-05, + "loss": 1.3158, + "num_input_tokens_seen": 160774464, + "step": 9992 + }, + { + "epoch": 0.6999921195723554, + "grad_norm": 5.853913307189941, + "learning_rate": 3.0066500875656746e-05, + "loss": 1.1619, + "num_input_tokens_seen": 160790136, + "step": 9993 + }, + { + "epoch": 0.7000621678180847, + "grad_norm": 3.6670284271240234, + "learning_rate": 3.005950262697022e-05, + "loss": 0.9999, + "num_input_tokens_seen": 160806520, + "step": 9994 + }, + { + "epoch": 0.7001322160638139, + "grad_norm": 5.497128009796143, + "learning_rate": 3.0052504378283713e-05, + "loss": 1.066, + "num_input_tokens_seen": 160822904, + "step": 9995 + }, + { + "epoch": 0.7002022643095432, + "grad_norm": 3.90018892288208, + "learning_rate": 3.00455061295972e-05, + "loss": 1.032, + "num_input_tokens_seen": 160838928, + "step": 9996 + }, + { + "epoch": 0.7002723125552724, + "grad_norm": 4.37199592590332, + "learning_rate": 3.0038507880910683e-05, + "loss": 1.0756, + "num_input_tokens_seen": 160854696, + "step": 9997 + }, + { + "epoch": 0.7003423608010017, + "grad_norm": 4.189897060394287, + "learning_rate": 3.003150963222417e-05, + "loss": 0.9732, + "num_input_tokens_seen": 160870312, + "step": 9998 + }, + { + "epoch": 0.700412409046731, + "grad_norm": 3.815664291381836, + "learning_rate": 3.0024511383537647e-05, + "loss": 1.073, + "num_input_tokens_seen": 160886160, + "step": 9999 + }, + { + "epoch": 0.7004824572924602, + "grad_norm": 3.390901565551758, + "learning_rate": 3.001751313485114e-05, + "loss": 0.9494, + "num_input_tokens_seen": 160902096, + "step": 10000 + }, + { + "epoch": 0.7004824572924602, + "eval_loss": 1.1161283254623413, + "eval_runtime": 0.1926, + "eval_samples_per_second": 5.193, + "eval_steps_per_second": 5.193, + "num_input_tokens_seen": 160902096, + "step": 10000 + }, + { + "epoch": 0.7005525055381894, + "grad_norm": 4.175451755523682, + "learning_rate": 3.0010514886164624e-05, + "loss": 0.9272, + "num_input_tokens_seen": 160917936, + "step": 10001 + }, + { + "epoch": 0.7006225537839187, + "grad_norm": 3.91949200630188, + "learning_rate": 3.0003516637478106e-05, + "loss": 1.0464, + "num_input_tokens_seen": 160933912, + "step": 10002 + }, + { + "epoch": 0.7006926020296479, + "grad_norm": 3.6604700088500977, + "learning_rate": 2.9996518388791595e-05, + "loss": 1.1365, + "num_input_tokens_seen": 160950104, + "step": 10003 + }, + { + "epoch": 0.7007626502753772, + "grad_norm": 4.147017478942871, + "learning_rate": 2.998952014010507e-05, + "loss": 1.0801, + "num_input_tokens_seen": 160965536, + "step": 10004 + }, + { + "epoch": 0.7008326985211064, + "grad_norm": 3.8158257007598877, + "learning_rate": 2.9982521891418565e-05, + "loss": 1.077, + "num_input_tokens_seen": 160981104, + "step": 10005 + }, + { + "epoch": 0.7009027467668356, + "grad_norm": 3.5689399242401123, + "learning_rate": 2.9975523642732057e-05, + "loss": 1.0179, + "num_input_tokens_seen": 160997488, + "step": 10006 + }, + { + "epoch": 0.7009727950125649, + "grad_norm": 3.8672614097595215, + "learning_rate": 2.9968525394045532e-05, + "loss": 1.025, + "num_input_tokens_seen": 161013376, + "step": 10007 + }, + { + "epoch": 0.7010428432582941, + "grad_norm": 3.7843289375305176, + "learning_rate": 2.996152714535902e-05, + "loss": 1.0705, + "num_input_tokens_seen": 161028872, + "step": 10008 + }, + { + "epoch": 0.7011128915040234, + "grad_norm": 5.51763916015625, + "learning_rate": 2.9954528896672503e-05, + "loss": 0.9481, + "num_input_tokens_seen": 161045256, + "step": 10009 + }, + { + "epoch": 0.7011829397497527, + "grad_norm": 3.911912679672241, + "learning_rate": 2.9947530647985988e-05, + "loss": 0.888, + "num_input_tokens_seen": 161060880, + "step": 10010 + }, + { + "epoch": 0.7012529879954819, + "grad_norm": 3.8247056007385254, + "learning_rate": 2.994053239929948e-05, + "loss": 0.9951, + "num_input_tokens_seen": 161076560, + "step": 10011 + }, + { + "epoch": 0.7013230362412112, + "grad_norm": 4.128506183624268, + "learning_rate": 2.9933534150612958e-05, + "loss": 1.0566, + "num_input_tokens_seen": 161092944, + "step": 10012 + }, + { + "epoch": 0.7013930844869404, + "grad_norm": 3.8099727630615234, + "learning_rate": 2.9926535901926443e-05, + "loss": 0.91, + "num_input_tokens_seen": 161108632, + "step": 10013 + }, + { + "epoch": 0.7014631327326696, + "grad_norm": 3.801342010498047, + "learning_rate": 2.9919537653239925e-05, + "loss": 0.9862, + "num_input_tokens_seen": 161125016, + "step": 10014 + }, + { + "epoch": 0.7015331809783989, + "grad_norm": 4.915028095245361, + "learning_rate": 2.991253940455342e-05, + "loss": 1.1119, + "num_input_tokens_seen": 161141400, + "step": 10015 + }, + { + "epoch": 0.7016032292241281, + "grad_norm": 4.670784950256348, + "learning_rate": 2.9905541155866906e-05, + "loss": 1.1207, + "num_input_tokens_seen": 161157784, + "step": 10016 + }, + { + "epoch": 0.7016732774698574, + "grad_norm": 4.292564392089844, + "learning_rate": 2.989854290718038e-05, + "loss": 1.129, + "num_input_tokens_seen": 161174168, + "step": 10017 + }, + { + "epoch": 0.7017433257155866, + "grad_norm": 7.292533874511719, + "learning_rate": 2.9891544658493876e-05, + "loss": 1.192, + "num_input_tokens_seen": 161190552, + "step": 10018 + }, + { + "epoch": 0.7018133739613158, + "grad_norm": 5.342220783233643, + "learning_rate": 2.988454640980735e-05, + "loss": 1.0906, + "num_input_tokens_seen": 161206936, + "step": 10019 + }, + { + "epoch": 0.7018834222070451, + "grad_norm": 3.818959951400757, + "learning_rate": 2.9877548161120843e-05, + "loss": 1.0778, + "num_input_tokens_seen": 161223320, + "step": 10020 + }, + { + "epoch": 0.7019534704527743, + "grad_norm": 4.779533386230469, + "learning_rate": 2.987054991243432e-05, + "loss": 1.2013, + "num_input_tokens_seen": 161239224, + "step": 10021 + }, + { + "epoch": 0.7020235186985035, + "grad_norm": 4.7241363525390625, + "learning_rate": 2.9863551663747807e-05, + "loss": 0.9601, + "num_input_tokens_seen": 161254944, + "step": 10022 + }, + { + "epoch": 0.7020935669442329, + "grad_norm": 4.112974643707275, + "learning_rate": 2.98565534150613e-05, + "loss": 1.1438, + "num_input_tokens_seen": 161271216, + "step": 10023 + }, + { + "epoch": 0.7021636151899621, + "grad_norm": 4.047839641571045, + "learning_rate": 2.9849555166374777e-05, + "loss": 0.9126, + "num_input_tokens_seen": 161286872, + "step": 10024 + }, + { + "epoch": 0.7022336634356914, + "grad_norm": 3.397982597351074, + "learning_rate": 2.984255691768827e-05, + "loss": 0.8711, + "num_input_tokens_seen": 161303088, + "step": 10025 + }, + { + "epoch": 0.7023037116814206, + "grad_norm": 4.357393741607666, + "learning_rate": 2.9835558669001744e-05, + "loss": 1.0642, + "num_input_tokens_seen": 161319472, + "step": 10026 + }, + { + "epoch": 0.7023737599271498, + "grad_norm": 5.623782157897949, + "learning_rate": 2.982856042031524e-05, + "loss": 0.8932, + "num_input_tokens_seen": 161335472, + "step": 10027 + }, + { + "epoch": 0.7024438081728791, + "grad_norm": 4.734277248382568, + "learning_rate": 2.9821562171628725e-05, + "loss": 1.2361, + "num_input_tokens_seen": 161351856, + "step": 10028 + }, + { + "epoch": 0.7025138564186083, + "grad_norm": 4.250011444091797, + "learning_rate": 2.98145639229422e-05, + "loss": 0.9704, + "num_input_tokens_seen": 161368240, + "step": 10029 + }, + { + "epoch": 0.7025839046643375, + "grad_norm": 4.014016628265381, + "learning_rate": 2.9807565674255695e-05, + "loss": 1.047, + "num_input_tokens_seen": 161384624, + "step": 10030 + }, + { + "epoch": 0.7026539529100668, + "grad_norm": 4.941507816314697, + "learning_rate": 2.9800567425569177e-05, + "loss": 0.8968, + "num_input_tokens_seen": 161400152, + "step": 10031 + }, + { + "epoch": 0.702724001155796, + "grad_norm": 5.39419412612915, + "learning_rate": 2.9793569176882662e-05, + "loss": 0.8279, + "num_input_tokens_seen": 161416264, + "step": 10032 + }, + { + "epoch": 0.7027940494015253, + "grad_norm": 5.776906490325928, + "learning_rate": 2.978657092819615e-05, + "loss": 1.0014, + "num_input_tokens_seen": 161432648, + "step": 10033 + }, + { + "epoch": 0.7028640976472545, + "grad_norm": 5.846651077270508, + "learning_rate": 2.9779572679509633e-05, + "loss": 1.2137, + "num_input_tokens_seen": 161449032, + "step": 10034 + }, + { + "epoch": 0.7029341458929838, + "grad_norm": 4.244979381561279, + "learning_rate": 2.9772574430823118e-05, + "loss": 1.2836, + "num_input_tokens_seen": 161465416, + "step": 10035 + }, + { + "epoch": 0.7030041941387131, + "grad_norm": 3.282975435256958, + "learning_rate": 2.97655761821366e-05, + "loss": 0.8286, + "num_input_tokens_seen": 161481800, + "step": 10036 + }, + { + "epoch": 0.7030742423844423, + "grad_norm": 3.5289463996887207, + "learning_rate": 2.9758577933450088e-05, + "loss": 1.0605, + "num_input_tokens_seen": 161498184, + "step": 10037 + }, + { + "epoch": 0.7031442906301715, + "grad_norm": 6.254805564880371, + "learning_rate": 2.9751579684763577e-05, + "loss": 1.0343, + "num_input_tokens_seen": 161514144, + "step": 10038 + }, + { + "epoch": 0.7032143388759008, + "grad_norm": 3.361053705215454, + "learning_rate": 2.974458143607706e-05, + "loss": 0.8051, + "num_input_tokens_seen": 161530528, + "step": 10039 + }, + { + "epoch": 0.70328438712163, + "grad_norm": 7.596766948699951, + "learning_rate": 2.9737583187390544e-05, + "loss": 1.1947, + "num_input_tokens_seen": 161546912, + "step": 10040 + }, + { + "epoch": 0.7033544353673593, + "grad_norm": 3.704799175262451, + "learning_rate": 2.9730584938704026e-05, + "loss": 1.1108, + "num_input_tokens_seen": 161562232, + "step": 10041 + }, + { + "epoch": 0.7034244836130885, + "grad_norm": 4.690667629241943, + "learning_rate": 2.9723586690017514e-05, + "loss": 1.0278, + "num_input_tokens_seen": 161578120, + "step": 10042 + }, + { + "epoch": 0.7034945318588177, + "grad_norm": 3.6758084297180176, + "learning_rate": 2.9716588441331e-05, + "loss": 0.8591, + "num_input_tokens_seen": 161594504, + "step": 10043 + }, + { + "epoch": 0.703564580104547, + "grad_norm": 6.599920272827148, + "learning_rate": 2.970959019264448e-05, + "loss": 1.0026, + "num_input_tokens_seen": 161610536, + "step": 10044 + }, + { + "epoch": 0.7036346283502762, + "grad_norm": 7.760443687438965, + "learning_rate": 2.970259194395797e-05, + "loss": 1.099, + "num_input_tokens_seen": 161626256, + "step": 10045 + }, + { + "epoch": 0.7037046765960056, + "grad_norm": 3.263460636138916, + "learning_rate": 2.969559369527145e-05, + "loss": 0.8826, + "num_input_tokens_seen": 161642640, + "step": 10046 + }, + { + "epoch": 0.7037747248417348, + "grad_norm": 3.6771352291107178, + "learning_rate": 2.9688595446584937e-05, + "loss": 1.134, + "num_input_tokens_seen": 161658208, + "step": 10047 + }, + { + "epoch": 0.703844773087464, + "grad_norm": 3.5002529621124268, + "learning_rate": 2.968159719789842e-05, + "loss": 0.9892, + "num_input_tokens_seen": 161674144, + "step": 10048 + }, + { + "epoch": 0.7039148213331933, + "grad_norm": 4.207690238952637, + "learning_rate": 2.9674598949211907e-05, + "loss": 1.0664, + "num_input_tokens_seen": 161690528, + "step": 10049 + }, + { + "epoch": 0.7039848695789225, + "grad_norm": 6.3054423332214355, + "learning_rate": 2.9667600700525396e-05, + "loss": 0.9844, + "num_input_tokens_seen": 161706776, + "step": 10050 + }, + { + "epoch": 0.7040549178246517, + "grad_norm": 3.508957624435425, + "learning_rate": 2.9660602451838874e-05, + "loss": 1.1167, + "num_input_tokens_seen": 161723160, + "step": 10051 + }, + { + "epoch": 0.704124966070381, + "grad_norm": 3.7781097888946533, + "learning_rate": 2.965360420315237e-05, + "loss": 1.0289, + "num_input_tokens_seen": 161739544, + "step": 10052 + }, + { + "epoch": 0.7041950143161102, + "grad_norm": 4.113853454589844, + "learning_rate": 2.9646605954465845e-05, + "loss": 1.1348, + "num_input_tokens_seen": 161755928, + "step": 10053 + }, + { + "epoch": 0.7042650625618395, + "grad_norm": 4.110382080078125, + "learning_rate": 2.9639607705779333e-05, + "loss": 1.1124, + "num_input_tokens_seen": 161771920, + "step": 10054 + }, + { + "epoch": 0.7043351108075687, + "grad_norm": 4.413727760314941, + "learning_rate": 2.963260945709282e-05, + "loss": 0.988, + "num_input_tokens_seen": 161788032, + "step": 10055 + }, + { + "epoch": 0.7044051590532979, + "grad_norm": 5.492150783538818, + "learning_rate": 2.96256112084063e-05, + "loss": 1.3986, + "num_input_tokens_seen": 161803432, + "step": 10056 + }, + { + "epoch": 0.7044752072990272, + "grad_norm": 4.049644947052002, + "learning_rate": 2.9618612959719792e-05, + "loss": 1.146, + "num_input_tokens_seen": 161819816, + "step": 10057 + }, + { + "epoch": 0.7045452555447564, + "grad_norm": 3.4189934730529785, + "learning_rate": 2.961161471103327e-05, + "loss": 0.8556, + "num_input_tokens_seen": 161836200, + "step": 10058 + }, + { + "epoch": 0.7046153037904856, + "grad_norm": 3.3457398414611816, + "learning_rate": 2.9604616462346756e-05, + "loss": 0.8749, + "num_input_tokens_seen": 161852584, + "step": 10059 + }, + { + "epoch": 0.704685352036215, + "grad_norm": 4.639912128448486, + "learning_rate": 2.959761821366025e-05, + "loss": 1.0921, + "num_input_tokens_seen": 161868960, + "step": 10060 + }, + { + "epoch": 0.7047554002819442, + "grad_norm": 4.668527126312256, + "learning_rate": 2.9590619964973726e-05, + "loss": 1.1327, + "num_input_tokens_seen": 161885344, + "step": 10061 + }, + { + "epoch": 0.7048254485276735, + "grad_norm": 4.452634811401367, + "learning_rate": 2.9583621716287218e-05, + "loss": 1.3208, + "num_input_tokens_seen": 161900728, + "step": 10062 + }, + { + "epoch": 0.7048954967734027, + "grad_norm": 6.427389621734619, + "learning_rate": 2.9576623467600693e-05, + "loss": 0.9741, + "num_input_tokens_seen": 161916888, + "step": 10063 + }, + { + "epoch": 0.7049655450191319, + "grad_norm": 3.9564223289489746, + "learning_rate": 2.956962521891419e-05, + "loss": 1.1183, + "num_input_tokens_seen": 161933272, + "step": 10064 + }, + { + "epoch": 0.7050355932648612, + "grad_norm": 3.6865665912628174, + "learning_rate": 2.9562626970227674e-05, + "loss": 0.9469, + "num_input_tokens_seen": 161949288, + "step": 10065 + }, + { + "epoch": 0.7051056415105904, + "grad_norm": 4.036081790924072, + "learning_rate": 2.9555628721541152e-05, + "loss": 1.1211, + "num_input_tokens_seen": 161965672, + "step": 10066 + }, + { + "epoch": 0.7051756897563196, + "grad_norm": 3.775333881378174, + "learning_rate": 2.9548630472854644e-05, + "loss": 0.9414, + "num_input_tokens_seen": 161981600, + "step": 10067 + }, + { + "epoch": 0.7052457380020489, + "grad_norm": 4.128803253173828, + "learning_rate": 2.9541632224168126e-05, + "loss": 1.1112, + "num_input_tokens_seen": 161997984, + "step": 10068 + }, + { + "epoch": 0.7053157862477781, + "grad_norm": 4.627614498138428, + "learning_rate": 2.953463397548161e-05, + "loss": 0.8422, + "num_input_tokens_seen": 162014368, + "step": 10069 + }, + { + "epoch": 0.7053858344935074, + "grad_norm": 9.56108570098877, + "learning_rate": 2.95276357267951e-05, + "loss": 1.1856, + "num_input_tokens_seen": 162030512, + "step": 10070 + }, + { + "epoch": 0.7054558827392367, + "grad_norm": 3.9285595417022705, + "learning_rate": 2.9520637478108575e-05, + "loss": 1.0792, + "num_input_tokens_seen": 162046504, + "step": 10071 + }, + { + "epoch": 0.7055259309849659, + "grad_norm": 6.086811065673828, + "learning_rate": 2.951363922942207e-05, + "loss": 1.0556, + "num_input_tokens_seen": 162062888, + "step": 10072 + }, + { + "epoch": 0.7055959792306952, + "grad_norm": 3.8708689212799072, + "learning_rate": 2.950664098073555e-05, + "loss": 0.884, + "num_input_tokens_seen": 162078496, + "step": 10073 + }, + { + "epoch": 0.7056660274764244, + "grad_norm": 4.602541923522949, + "learning_rate": 2.9499642732049037e-05, + "loss": 1.2931, + "num_input_tokens_seen": 162094880, + "step": 10074 + }, + { + "epoch": 0.7057360757221537, + "grad_norm": 3.6065030097961426, + "learning_rate": 2.9492644483362512e-05, + "loss": 1.1081, + "num_input_tokens_seen": 162110432, + "step": 10075 + }, + { + "epoch": 0.7058061239678829, + "grad_norm": 6.054698467254639, + "learning_rate": 2.9485646234676008e-05, + "loss": 0.9708, + "num_input_tokens_seen": 162125776, + "step": 10076 + }, + { + "epoch": 0.7058761722136121, + "grad_norm": 3.5533549785614014, + "learning_rate": 2.9478647985989493e-05, + "loss": 1.1084, + "num_input_tokens_seen": 162142160, + "step": 10077 + }, + { + "epoch": 0.7059462204593414, + "grad_norm": 6.547595024108887, + "learning_rate": 2.9471649737302975e-05, + "loss": 0.8649, + "num_input_tokens_seen": 162158120, + "step": 10078 + }, + { + "epoch": 0.7060162687050706, + "grad_norm": 3.7578535079956055, + "learning_rate": 2.9464651488616463e-05, + "loss": 1.0864, + "num_input_tokens_seen": 162174264, + "step": 10079 + }, + { + "epoch": 0.7060863169507998, + "grad_norm": 3.6316750049591064, + "learning_rate": 2.9457653239929945e-05, + "loss": 0.9306, + "num_input_tokens_seen": 162189632, + "step": 10080 + }, + { + "epoch": 0.7061563651965291, + "grad_norm": 3.7164456844329834, + "learning_rate": 2.945065499124343e-05, + "loss": 0.9419, + "num_input_tokens_seen": 162205376, + "step": 10081 + }, + { + "epoch": 0.7062264134422583, + "grad_norm": 3.5166351795196533, + "learning_rate": 2.9443656742556926e-05, + "loss": 1.0768, + "num_input_tokens_seen": 162221760, + "step": 10082 + }, + { + "epoch": 0.7062964616879877, + "grad_norm": 4.940329551696777, + "learning_rate": 2.94366584938704e-05, + "loss": 1.0603, + "num_input_tokens_seen": 162237592, + "step": 10083 + }, + { + "epoch": 0.7063665099337169, + "grad_norm": 5.47645902633667, + "learning_rate": 2.942966024518389e-05, + "loss": 1.0955, + "num_input_tokens_seen": 162253976, + "step": 10084 + }, + { + "epoch": 0.7064365581794461, + "grad_norm": 4.766308784484863, + "learning_rate": 2.9422661996497368e-05, + "loss": 1.1437, + "num_input_tokens_seen": 162269344, + "step": 10085 + }, + { + "epoch": 0.7065066064251754, + "grad_norm": 4.8654375076293945, + "learning_rate": 2.9415663747810856e-05, + "loss": 0.9426, + "num_input_tokens_seen": 162285112, + "step": 10086 + }, + { + "epoch": 0.7065766546709046, + "grad_norm": 3.4488401412963867, + "learning_rate": 2.940866549912435e-05, + "loss": 0.9307, + "num_input_tokens_seen": 162301432, + "step": 10087 + }, + { + "epoch": 0.7066467029166338, + "grad_norm": 4.035887718200684, + "learning_rate": 2.9401667250437827e-05, + "loss": 0.9301, + "num_input_tokens_seen": 162317776, + "step": 10088 + }, + { + "epoch": 0.7067167511623631, + "grad_norm": 3.6484487056732178, + "learning_rate": 2.9394669001751312e-05, + "loss": 0.9782, + "num_input_tokens_seen": 162333872, + "step": 10089 + }, + { + "epoch": 0.7067867994080923, + "grad_norm": 4.192502498626709, + "learning_rate": 2.9387670753064794e-05, + "loss": 1.2717, + "num_input_tokens_seen": 162350256, + "step": 10090 + }, + { + "epoch": 0.7068568476538216, + "grad_norm": 4.043034076690674, + "learning_rate": 2.9380672504378282e-05, + "loss": 1.0269, + "num_input_tokens_seen": 162366640, + "step": 10091 + }, + { + "epoch": 0.7069268958995508, + "grad_norm": 5.189533710479736, + "learning_rate": 2.9373674255691774e-05, + "loss": 1.1543, + "num_input_tokens_seen": 162382832, + "step": 10092 + }, + { + "epoch": 0.70699694414528, + "grad_norm": 4.4720869064331055, + "learning_rate": 2.936667600700525e-05, + "loss": 1.149, + "num_input_tokens_seen": 162399216, + "step": 10093 + }, + { + "epoch": 0.7070669923910093, + "grad_norm": 6.456845283508301, + "learning_rate": 2.9359677758318745e-05, + "loss": 1.1416, + "num_input_tokens_seen": 162415600, + "step": 10094 + }, + { + "epoch": 0.7071370406367385, + "grad_norm": 4.491336822509766, + "learning_rate": 2.935267950963222e-05, + "loss": 1.0357, + "num_input_tokens_seen": 162431944, + "step": 10095 + }, + { + "epoch": 0.7072070888824677, + "grad_norm": 7.4843430519104, + "learning_rate": 2.9345681260945705e-05, + "loss": 1.2356, + "num_input_tokens_seen": 162448328, + "step": 10096 + }, + { + "epoch": 0.7072771371281971, + "grad_norm": 4.887217044830322, + "learning_rate": 2.93386830122592e-05, + "loss": 1.169, + "num_input_tokens_seen": 162463952, + "step": 10097 + }, + { + "epoch": 0.7073471853739263, + "grad_norm": 3.566239595413208, + "learning_rate": 2.9331684763572682e-05, + "loss": 1.0325, + "num_input_tokens_seen": 162480336, + "step": 10098 + }, + { + "epoch": 0.7074172336196556, + "grad_norm": 3.828345537185669, + "learning_rate": 2.9324686514886167e-05, + "loss": 1.0109, + "num_input_tokens_seen": 162496184, + "step": 10099 + }, + { + "epoch": 0.7074872818653848, + "grad_norm": 4.090230464935303, + "learning_rate": 2.9317688266199646e-05, + "loss": 1.055, + "num_input_tokens_seen": 162512456, + "step": 10100 + }, + { + "epoch": 0.707557330111114, + "grad_norm": 3.957228422164917, + "learning_rate": 2.931069001751313e-05, + "loss": 1.1194, + "num_input_tokens_seen": 162528840, + "step": 10101 + }, + { + "epoch": 0.7076273783568433, + "grad_norm": 4.136928558349609, + "learning_rate": 2.9303691768826613e-05, + "loss": 1.1885, + "num_input_tokens_seen": 162545224, + "step": 10102 + }, + { + "epoch": 0.7076974266025725, + "grad_norm": 4.075767517089844, + "learning_rate": 2.9296693520140105e-05, + "loss": 0.9965, + "num_input_tokens_seen": 162560352, + "step": 10103 + }, + { + "epoch": 0.7077674748483017, + "grad_norm": 4.019789695739746, + "learning_rate": 2.9289695271453593e-05, + "loss": 1.0464, + "num_input_tokens_seen": 162576304, + "step": 10104 + }, + { + "epoch": 0.707837523094031, + "grad_norm": 3.8875513076782227, + "learning_rate": 2.928269702276707e-05, + "loss": 0.9253, + "num_input_tokens_seen": 162592688, + "step": 10105 + }, + { + "epoch": 0.7079075713397602, + "grad_norm": 6.7977471351623535, + "learning_rate": 2.9275698774080564e-05, + "loss": 1.1282, + "num_input_tokens_seen": 162609072, + "step": 10106 + }, + { + "epoch": 0.7079776195854895, + "grad_norm": 4.503828525543213, + "learning_rate": 2.926870052539404e-05, + "loss": 1.1017, + "num_input_tokens_seen": 162624960, + "step": 10107 + }, + { + "epoch": 0.7080476678312188, + "grad_norm": 3.684790849685669, + "learning_rate": 2.926170227670753e-05, + "loss": 1.0087, + "num_input_tokens_seen": 162641152, + "step": 10108 + }, + { + "epoch": 0.708117716076948, + "grad_norm": 4.349503517150879, + "learning_rate": 2.925470402802102e-05, + "loss": 1.243, + "num_input_tokens_seen": 162657064, + "step": 10109 + }, + { + "epoch": 0.7081877643226773, + "grad_norm": 4.057005882263184, + "learning_rate": 2.92477057793345e-05, + "loss": 1.1071, + "num_input_tokens_seen": 162673448, + "step": 10110 + }, + { + "epoch": 0.7082578125684065, + "grad_norm": 4.564140796661377, + "learning_rate": 2.9240707530647986e-05, + "loss": 0.8929, + "num_input_tokens_seen": 162689832, + "step": 10111 + }, + { + "epoch": 0.7083278608141358, + "grad_norm": 3.9894440174102783, + "learning_rate": 2.9233709281961465e-05, + "loss": 1.2122, + "num_input_tokens_seen": 162705680, + "step": 10112 + }, + { + "epoch": 0.708397909059865, + "grad_norm": 3.654106855392456, + "learning_rate": 2.9226711033274957e-05, + "loss": 0.9486, + "num_input_tokens_seen": 162720896, + "step": 10113 + }, + { + "epoch": 0.7084679573055942, + "grad_norm": 4.905731201171875, + "learning_rate": 2.9219712784588442e-05, + "loss": 1.2464, + "num_input_tokens_seen": 162737280, + "step": 10114 + }, + { + "epoch": 0.7085380055513235, + "grad_norm": 3.9459686279296875, + "learning_rate": 2.9212714535901924e-05, + "loss": 0.8647, + "num_input_tokens_seen": 162753664, + "step": 10115 + }, + { + "epoch": 0.7086080537970527, + "grad_norm": 4.94581937789917, + "learning_rate": 2.9205716287215412e-05, + "loss": 1.098, + "num_input_tokens_seen": 162769848, + "step": 10116 + }, + { + "epoch": 0.7086781020427819, + "grad_norm": 3.5704598426818848, + "learning_rate": 2.9198718038528887e-05, + "loss": 1.1402, + "num_input_tokens_seen": 162786160, + "step": 10117 + }, + { + "epoch": 0.7087481502885112, + "grad_norm": 5.69130277633667, + "learning_rate": 2.919171978984238e-05, + "loss": 1.3884, + "num_input_tokens_seen": 162802544, + "step": 10118 + }, + { + "epoch": 0.7088181985342404, + "grad_norm": 8.142668724060059, + "learning_rate": 2.9184721541155875e-05, + "loss": 0.9914, + "num_input_tokens_seen": 162818928, + "step": 10119 + }, + { + "epoch": 0.7088882467799698, + "grad_norm": 5.4760918617248535, + "learning_rate": 2.917772329246935e-05, + "loss": 0.9987, + "num_input_tokens_seen": 162835312, + "step": 10120 + }, + { + "epoch": 0.708958295025699, + "grad_norm": 3.6087825298309326, + "learning_rate": 2.917072504378284e-05, + "loss": 1.1799, + "num_input_tokens_seen": 162851648, + "step": 10121 + }, + { + "epoch": 0.7090283432714282, + "grad_norm": 4.520792007446289, + "learning_rate": 2.916372679509632e-05, + "loss": 1.2501, + "num_input_tokens_seen": 162868032, + "step": 10122 + }, + { + "epoch": 0.7090983915171575, + "grad_norm": 3.704040765762329, + "learning_rate": 2.9156728546409805e-05, + "loss": 1.0613, + "num_input_tokens_seen": 162884416, + "step": 10123 + }, + { + "epoch": 0.7091684397628867, + "grad_norm": 3.810776472091675, + "learning_rate": 2.9149730297723297e-05, + "loss": 0.9037, + "num_input_tokens_seen": 162900560, + "step": 10124 + }, + { + "epoch": 0.7092384880086159, + "grad_norm": 5.723479270935059, + "learning_rate": 2.9142732049036776e-05, + "loss": 1.1245, + "num_input_tokens_seen": 162916944, + "step": 10125 + }, + { + "epoch": 0.7093085362543452, + "grad_norm": 4.733275413513184, + "learning_rate": 2.913573380035026e-05, + "loss": 1.1192, + "num_input_tokens_seen": 162932616, + "step": 10126 + }, + { + "epoch": 0.7093785845000744, + "grad_norm": 4.826333522796631, + "learning_rate": 2.9128735551663743e-05, + "loss": 1.0966, + "num_input_tokens_seen": 162947728, + "step": 10127 + }, + { + "epoch": 0.7094486327458037, + "grad_norm": 4.671482086181641, + "learning_rate": 2.9121737302977238e-05, + "loss": 1.1226, + "num_input_tokens_seen": 162964112, + "step": 10128 + }, + { + "epoch": 0.7095186809915329, + "grad_norm": 3.523709774017334, + "learning_rate": 2.9114739054290713e-05, + "loss": 0.9052, + "num_input_tokens_seen": 162980400, + "step": 10129 + }, + { + "epoch": 0.7095887292372621, + "grad_norm": 3.397223711013794, + "learning_rate": 2.91077408056042e-05, + "loss": 1.0204, + "num_input_tokens_seen": 162996704, + "step": 10130 + }, + { + "epoch": 0.7096587774829914, + "grad_norm": 5.219079494476318, + "learning_rate": 2.9100742556917694e-05, + "loss": 1.1563, + "num_input_tokens_seen": 163012048, + "step": 10131 + }, + { + "epoch": 0.7097288257287206, + "grad_norm": 4.322859287261963, + "learning_rate": 2.909374430823117e-05, + "loss": 1.2806, + "num_input_tokens_seen": 163028432, + "step": 10132 + }, + { + "epoch": 0.7097988739744499, + "grad_norm": 3.52075457572937, + "learning_rate": 2.908674605954466e-05, + "loss": 0.9903, + "num_input_tokens_seen": 163044816, + "step": 10133 + }, + { + "epoch": 0.7098689222201792, + "grad_norm": 3.5259647369384766, + "learning_rate": 2.907974781085814e-05, + "loss": 0.9446, + "num_input_tokens_seen": 163061152, + "step": 10134 + }, + { + "epoch": 0.7099389704659084, + "grad_norm": 3.873305082321167, + "learning_rate": 2.907274956217163e-05, + "loss": 1.1303, + "num_input_tokens_seen": 163076976, + "step": 10135 + }, + { + "epoch": 0.7100090187116377, + "grad_norm": 3.702538251876831, + "learning_rate": 2.9065751313485116e-05, + "loss": 1.1233, + "num_input_tokens_seen": 163093360, + "step": 10136 + }, + { + "epoch": 0.7100790669573669, + "grad_norm": 4.439333915710449, + "learning_rate": 2.9058753064798595e-05, + "loss": 0.9589, + "num_input_tokens_seen": 163109744, + "step": 10137 + }, + { + "epoch": 0.7101491152030961, + "grad_norm": 4.237407207489014, + "learning_rate": 2.9051754816112087e-05, + "loss": 1.1367, + "num_input_tokens_seen": 163126128, + "step": 10138 + }, + { + "epoch": 0.7102191634488254, + "grad_norm": 7.355090618133545, + "learning_rate": 2.9044756567425562e-05, + "loss": 1.1257, + "num_input_tokens_seen": 163139560, + "step": 10139 + }, + { + "epoch": 0.7102892116945546, + "grad_norm": 4.0328216552734375, + "learning_rate": 2.9037758318739057e-05, + "loss": 1.1562, + "num_input_tokens_seen": 163155944, + "step": 10140 + }, + { + "epoch": 0.7103592599402838, + "grad_norm": 5.896379470825195, + "learning_rate": 2.9030760070052542e-05, + "loss": 0.9636, + "num_input_tokens_seen": 163172328, + "step": 10141 + }, + { + "epoch": 0.7104293081860131, + "grad_norm": 4.2958807945251465, + "learning_rate": 2.9023761821366017e-05, + "loss": 1.0225, + "num_input_tokens_seen": 163188712, + "step": 10142 + }, + { + "epoch": 0.7104993564317423, + "grad_norm": 5.122844696044922, + "learning_rate": 2.9016763572679513e-05, + "loss": 1.0207, + "num_input_tokens_seen": 163205032, + "step": 10143 + }, + { + "epoch": 0.7105694046774716, + "grad_norm": 3.780733823776245, + "learning_rate": 2.9009765323992995e-05, + "loss": 1.179, + "num_input_tokens_seen": 163221408, + "step": 10144 + }, + { + "epoch": 0.7106394529232009, + "grad_norm": 3.5465266704559326, + "learning_rate": 2.900276707530648e-05, + "loss": 1.0262, + "num_input_tokens_seen": 163237640, + "step": 10145 + }, + { + "epoch": 0.7107095011689301, + "grad_norm": 3.7843377590179443, + "learning_rate": 2.899576882661997e-05, + "loss": 1.0915, + "num_input_tokens_seen": 163254024, + "step": 10146 + }, + { + "epoch": 0.7107795494146594, + "grad_norm": 6.503340721130371, + "learning_rate": 2.898877057793345e-05, + "loss": 1.0151, + "num_input_tokens_seen": 163270408, + "step": 10147 + }, + { + "epoch": 0.7108495976603886, + "grad_norm": 3.9190897941589355, + "learning_rate": 2.8981772329246935e-05, + "loss": 1.2116, + "num_input_tokens_seen": 163286792, + "step": 10148 + }, + { + "epoch": 0.7109196459061179, + "grad_norm": 4.070304870605469, + "learning_rate": 2.8974774080560417e-05, + "loss": 0.9191, + "num_input_tokens_seen": 163303152, + "step": 10149 + }, + { + "epoch": 0.7109896941518471, + "grad_norm": 5.0177226066589355, + "learning_rate": 2.8967775831873906e-05, + "loss": 1.3018, + "num_input_tokens_seen": 163319536, + "step": 10150 + }, + { + "epoch": 0.7110597423975763, + "grad_norm": 3.3853724002838135, + "learning_rate": 2.8960777583187394e-05, + "loss": 1.0065, + "num_input_tokens_seen": 163335816, + "step": 10151 + }, + { + "epoch": 0.7111297906433056, + "grad_norm": 3.6592559814453125, + "learning_rate": 2.8953779334500873e-05, + "loss": 1.0425, + "num_input_tokens_seen": 163352200, + "step": 10152 + }, + { + "epoch": 0.7111998388890348, + "grad_norm": 4.346684455871582, + "learning_rate": 2.8946781085814368e-05, + "loss": 1.0804, + "num_input_tokens_seen": 163367560, + "step": 10153 + }, + { + "epoch": 0.711269887134764, + "grad_norm": 5.302639007568359, + "learning_rate": 2.8939782837127843e-05, + "loss": 1.054, + "num_input_tokens_seen": 163383944, + "step": 10154 + }, + { + "epoch": 0.7113399353804933, + "grad_norm": 3.6945741176605225, + "learning_rate": 2.8932784588441332e-05, + "loss": 1.0826, + "num_input_tokens_seen": 163400328, + "step": 10155 + }, + { + "epoch": 0.7114099836262225, + "grad_norm": 4.606947898864746, + "learning_rate": 2.8925786339754814e-05, + "loss": 1.248, + "num_input_tokens_seen": 163415648, + "step": 10156 + }, + { + "epoch": 0.7114800318719519, + "grad_norm": 5.846989154815674, + "learning_rate": 2.89187880910683e-05, + "loss": 0.9289, + "num_input_tokens_seen": 163431272, + "step": 10157 + }, + { + "epoch": 0.7115500801176811, + "grad_norm": 3.610605239868164, + "learning_rate": 2.891178984238179e-05, + "loss": 1.0373, + "num_input_tokens_seen": 163447472, + "step": 10158 + }, + { + "epoch": 0.7116201283634103, + "grad_norm": 3.9474918842315674, + "learning_rate": 2.890479159369527e-05, + "loss": 1.1583, + "num_input_tokens_seen": 163463032, + "step": 10159 + }, + { + "epoch": 0.7116901766091396, + "grad_norm": 5.435600280761719, + "learning_rate": 2.8897793345008755e-05, + "loss": 1.1946, + "num_input_tokens_seen": 163478904, + "step": 10160 + }, + { + "epoch": 0.7117602248548688, + "grad_norm": 6.368228912353516, + "learning_rate": 2.8890795096322236e-05, + "loss": 1.3101, + "num_input_tokens_seen": 163494480, + "step": 10161 + }, + { + "epoch": 0.711830273100598, + "grad_norm": 4.408838272094727, + "learning_rate": 2.8883796847635725e-05, + "loss": 1.2293, + "num_input_tokens_seen": 163509680, + "step": 10162 + }, + { + "epoch": 0.7119003213463273, + "grad_norm": 4.071101665496826, + "learning_rate": 2.8876798598949217e-05, + "loss": 0.9217, + "num_input_tokens_seen": 163524376, + "step": 10163 + }, + { + "epoch": 0.7119703695920565, + "grad_norm": 5.084413051605225, + "learning_rate": 2.8869800350262692e-05, + "loss": 1.1191, + "num_input_tokens_seen": 163540760, + "step": 10164 + }, + { + "epoch": 0.7120404178377858, + "grad_norm": 3.890895366668701, + "learning_rate": 2.8862802101576187e-05, + "loss": 1.0115, + "num_input_tokens_seen": 163556488, + "step": 10165 + }, + { + "epoch": 0.712110466083515, + "grad_norm": 3.4019622802734375, + "learning_rate": 2.8855803852889662e-05, + "loss": 0.9896, + "num_input_tokens_seen": 163572872, + "step": 10166 + }, + { + "epoch": 0.7121805143292442, + "grad_norm": 4.469103813171387, + "learning_rate": 2.884880560420315e-05, + "loss": 1.058, + "num_input_tokens_seen": 163589256, + "step": 10167 + }, + { + "epoch": 0.7122505625749735, + "grad_norm": 6.599802017211914, + "learning_rate": 2.8841807355516643e-05, + "loss": 0.9967, + "num_input_tokens_seen": 163605640, + "step": 10168 + }, + { + "epoch": 0.7123206108207027, + "grad_norm": 3.251373767852783, + "learning_rate": 2.8834809106830125e-05, + "loss": 0.9301, + "num_input_tokens_seen": 163621856, + "step": 10169 + }, + { + "epoch": 0.712390659066432, + "grad_norm": 4.436539173126221, + "learning_rate": 2.882781085814361e-05, + "loss": 1.0816, + "num_input_tokens_seen": 163638240, + "step": 10170 + }, + { + "epoch": 0.7124607073121613, + "grad_norm": 3.8655641078948975, + "learning_rate": 2.882081260945709e-05, + "loss": 1.1814, + "num_input_tokens_seen": 163654624, + "step": 10171 + }, + { + "epoch": 0.7125307555578905, + "grad_norm": 3.812119483947754, + "learning_rate": 2.8813814360770574e-05, + "loss": 1.0635, + "num_input_tokens_seen": 163671008, + "step": 10172 + }, + { + "epoch": 0.7126008038036198, + "grad_norm": 3.9413108825683594, + "learning_rate": 2.880681611208407e-05, + "loss": 0.9148, + "num_input_tokens_seen": 163686760, + "step": 10173 + }, + { + "epoch": 0.712670852049349, + "grad_norm": 4.039848327636719, + "learning_rate": 2.879981786339755e-05, + "loss": 1.0447, + "num_input_tokens_seen": 163703144, + "step": 10174 + }, + { + "epoch": 0.7127409002950782, + "grad_norm": 3.8388967514038086, + "learning_rate": 2.8792819614711036e-05, + "loss": 1.177, + "num_input_tokens_seen": 163719528, + "step": 10175 + }, + { + "epoch": 0.7128109485408075, + "grad_norm": 5.222955226898193, + "learning_rate": 2.878582136602451e-05, + "loss": 1.0926, + "num_input_tokens_seen": 163735912, + "step": 10176 + }, + { + "epoch": 0.7128809967865367, + "grad_norm": 3.552682876586914, + "learning_rate": 2.8778823117338006e-05, + "loss": 0.9928, + "num_input_tokens_seen": 163752296, + "step": 10177 + }, + { + "epoch": 0.712951045032266, + "grad_norm": 3.854668140411377, + "learning_rate": 2.877182486865149e-05, + "loss": 1.1885, + "num_input_tokens_seen": 163768000, + "step": 10178 + }, + { + "epoch": 0.7130210932779952, + "grad_norm": 3.7080280780792236, + "learning_rate": 2.8764826619964973e-05, + "loss": 1.0274, + "num_input_tokens_seen": 163783576, + "step": 10179 + }, + { + "epoch": 0.7130911415237244, + "grad_norm": 4.082748889923096, + "learning_rate": 2.8757828371278462e-05, + "loss": 1.1456, + "num_input_tokens_seen": 163799960, + "step": 10180 + }, + { + "epoch": 0.7131611897694538, + "grad_norm": 3.812077283859253, + "learning_rate": 2.8750830122591944e-05, + "loss": 1.0388, + "num_input_tokens_seen": 163816344, + "step": 10181 + }, + { + "epoch": 0.713231238015183, + "grad_norm": 4.2239580154418945, + "learning_rate": 2.874383187390543e-05, + "loss": 1.0814, + "num_input_tokens_seen": 163832728, + "step": 10182 + }, + { + "epoch": 0.7133012862609122, + "grad_norm": 4.545609951019287, + "learning_rate": 2.8736833625218907e-05, + "loss": 1.0128, + "num_input_tokens_seen": 163847944, + "step": 10183 + }, + { + "epoch": 0.7133713345066415, + "grad_norm": 3.715818166732788, + "learning_rate": 2.87298353765324e-05, + "loss": 1.0009, + "num_input_tokens_seen": 163863672, + "step": 10184 + }, + { + "epoch": 0.7134413827523707, + "grad_norm": 3.7492432594299316, + "learning_rate": 2.8722837127845888e-05, + "loss": 1.0395, + "num_input_tokens_seen": 163880056, + "step": 10185 + }, + { + "epoch": 0.7135114309981, + "grad_norm": 4.049376487731934, + "learning_rate": 2.8715838879159366e-05, + "loss": 1.1694, + "num_input_tokens_seen": 163896440, + "step": 10186 + }, + { + "epoch": 0.7135814792438292, + "grad_norm": 3.32942795753479, + "learning_rate": 2.8708840630472855e-05, + "loss": 0.8517, + "num_input_tokens_seen": 163912224, + "step": 10187 + }, + { + "epoch": 0.7136515274895584, + "grad_norm": 3.8010544776916504, + "learning_rate": 2.870184238178633e-05, + "loss": 0.8891, + "num_input_tokens_seen": 163928608, + "step": 10188 + }, + { + "epoch": 0.7137215757352877, + "grad_norm": 6.62272310256958, + "learning_rate": 2.8694844133099825e-05, + "loss": 1.0394, + "num_input_tokens_seen": 163944840, + "step": 10189 + }, + { + "epoch": 0.7137916239810169, + "grad_norm": 3.8584375381469727, + "learning_rate": 2.868784588441331e-05, + "loss": 1.1386, + "num_input_tokens_seen": 163961224, + "step": 10190 + }, + { + "epoch": 0.7138616722267461, + "grad_norm": 5.351186275482178, + "learning_rate": 2.8680847635726792e-05, + "loss": 1.0391, + "num_input_tokens_seen": 163977608, + "step": 10191 + }, + { + "epoch": 0.7139317204724754, + "grad_norm": 6.586696624755859, + "learning_rate": 2.867384938704028e-05, + "loss": 1.0712, + "num_input_tokens_seen": 163993368, + "step": 10192 + }, + { + "epoch": 0.7140017687182046, + "grad_norm": 3.8578314781188965, + "learning_rate": 2.8666851138353763e-05, + "loss": 0.8987, + "num_input_tokens_seen": 164009560, + "step": 10193 + }, + { + "epoch": 0.714071816963934, + "grad_norm": 3.9483256340026855, + "learning_rate": 2.8659852889667248e-05, + "loss": 0.9672, + "num_input_tokens_seen": 164025456, + "step": 10194 + }, + { + "epoch": 0.7141418652096632, + "grad_norm": 5.458444595336914, + "learning_rate": 2.8652854640980743e-05, + "loss": 0.9112, + "num_input_tokens_seen": 164041376, + "step": 10195 + }, + { + "epoch": 0.7142119134553924, + "grad_norm": 3.439246892929077, + "learning_rate": 2.864585639229422e-05, + "loss": 0.9042, + "num_input_tokens_seen": 164057760, + "step": 10196 + }, + { + "epoch": 0.7142819617011217, + "grad_norm": 4.876694202423096, + "learning_rate": 2.8638858143607704e-05, + "loss": 0.9997, + "num_input_tokens_seen": 164074144, + "step": 10197 + }, + { + "epoch": 0.7143520099468509, + "grad_norm": 5.811089992523193, + "learning_rate": 2.8631859894921185e-05, + "loss": 1.1991, + "num_input_tokens_seen": 164089512, + "step": 10198 + }, + { + "epoch": 0.7144220581925801, + "grad_norm": 3.49867844581604, + "learning_rate": 2.862486164623468e-05, + "loss": 0.9684, + "num_input_tokens_seen": 164105856, + "step": 10199 + }, + { + "epoch": 0.7144921064383094, + "grad_norm": 4.251277446746826, + "learning_rate": 2.8617863397548166e-05, + "loss": 1.0473, + "num_input_tokens_seen": 164121816, + "step": 10200 + }, + { + "epoch": 0.7144921064383094, + "eval_loss": 1.1186010837554932, + "eval_runtime": 0.1801, + "eval_samples_per_second": 5.552, + "eval_steps_per_second": 5.552, + "num_input_tokens_seen": 164121816, + "step": 10200 + }, + { + "epoch": 0.7145621546840386, + "grad_norm": 3.9915730953216553, + "learning_rate": 2.8610865148861644e-05, + "loss": 1.2284, + "num_input_tokens_seen": 164138072, + "step": 10201 + }, + { + "epoch": 0.7146322029297679, + "grad_norm": 4.045751571655273, + "learning_rate": 2.860386690017513e-05, + "loss": 1.0396, + "num_input_tokens_seen": 164154456, + "step": 10202 + }, + { + "epoch": 0.7147022511754971, + "grad_norm": 3.8065125942230225, + "learning_rate": 2.859686865148861e-05, + "loss": 1.0566, + "num_input_tokens_seen": 164170840, + "step": 10203 + }, + { + "epoch": 0.7147722994212263, + "grad_norm": 3.6807355880737305, + "learning_rate": 2.8589870402802103e-05, + "loss": 1.0031, + "num_input_tokens_seen": 164187224, + "step": 10204 + }, + { + "epoch": 0.7148423476669556, + "grad_norm": 4.320464134216309, + "learning_rate": 2.8582872154115592e-05, + "loss": 1.3319, + "num_input_tokens_seen": 164202208, + "step": 10205 + }, + { + "epoch": 0.7149123959126849, + "grad_norm": 4.6180853843688965, + "learning_rate": 2.8575873905429067e-05, + "loss": 1.1077, + "num_input_tokens_seen": 164218016, + "step": 10206 + }, + { + "epoch": 0.714982444158414, + "grad_norm": 4.053890705108643, + "learning_rate": 2.8568875656742562e-05, + "loss": 1.2112, + "num_input_tokens_seen": 164234400, + "step": 10207 + }, + { + "epoch": 0.7150524924041434, + "grad_norm": 3.825147867202759, + "learning_rate": 2.8561877408056037e-05, + "loss": 0.8833, + "num_input_tokens_seen": 164250280, + "step": 10208 + }, + { + "epoch": 0.7151225406498726, + "grad_norm": 5.286179542541504, + "learning_rate": 2.855487915936953e-05, + "loss": 1.0059, + "num_input_tokens_seen": 164266080, + "step": 10209 + }, + { + "epoch": 0.7151925888956019, + "grad_norm": 4.496762275695801, + "learning_rate": 2.8547880910683004e-05, + "loss": 1.3002, + "num_input_tokens_seen": 164281888, + "step": 10210 + }, + { + "epoch": 0.7152626371413311, + "grad_norm": 4.441507339477539, + "learning_rate": 2.85408826619965e-05, + "loss": 1.0516, + "num_input_tokens_seen": 164296624, + "step": 10211 + }, + { + "epoch": 0.7153326853870603, + "grad_norm": 4.207118988037109, + "learning_rate": 2.8533884413309985e-05, + "loss": 1.4361, + "num_input_tokens_seen": 164312424, + "step": 10212 + }, + { + "epoch": 0.7154027336327896, + "grad_norm": 5.8654279708862305, + "learning_rate": 2.8526886164623463e-05, + "loss": 1.0943, + "num_input_tokens_seen": 164328808, + "step": 10213 + }, + { + "epoch": 0.7154727818785188, + "grad_norm": 3.9512927532196045, + "learning_rate": 2.8519887915936955e-05, + "loss": 1.0309, + "num_input_tokens_seen": 164345192, + "step": 10214 + }, + { + "epoch": 0.7155428301242481, + "grad_norm": 4.126234531402588, + "learning_rate": 2.8512889667250437e-05, + "loss": 1.1437, + "num_input_tokens_seen": 164361576, + "step": 10215 + }, + { + "epoch": 0.7156128783699773, + "grad_norm": 3.878683090209961, + "learning_rate": 2.8505891418563922e-05, + "loss": 0.9248, + "num_input_tokens_seen": 164377960, + "step": 10216 + }, + { + "epoch": 0.7156829266157065, + "grad_norm": 4.521285057067871, + "learning_rate": 2.849889316987741e-05, + "loss": 1.2594, + "num_input_tokens_seen": 164394344, + "step": 10217 + }, + { + "epoch": 0.7157529748614359, + "grad_norm": 4.424064636230469, + "learning_rate": 2.8491894921190886e-05, + "loss": 1.0233, + "num_input_tokens_seen": 164410168, + "step": 10218 + }, + { + "epoch": 0.7158230231071651, + "grad_norm": 3.9379990100860596, + "learning_rate": 2.848489667250438e-05, + "loss": 1.02, + "num_input_tokens_seen": 164425256, + "step": 10219 + }, + { + "epoch": 0.7158930713528943, + "grad_norm": 4.564741134643555, + "learning_rate": 2.847789842381786e-05, + "loss": 0.9766, + "num_input_tokens_seen": 164441640, + "step": 10220 + }, + { + "epoch": 0.7159631195986236, + "grad_norm": 3.6122641563415527, + "learning_rate": 2.847090017513135e-05, + "loss": 1.0835, + "num_input_tokens_seen": 164458024, + "step": 10221 + }, + { + "epoch": 0.7160331678443528, + "grad_norm": 4.062864780426025, + "learning_rate": 2.8463901926444837e-05, + "loss": 0.9343, + "num_input_tokens_seen": 164474096, + "step": 10222 + }, + { + "epoch": 0.7161032160900821, + "grad_norm": 4.965015888214111, + "learning_rate": 2.845690367775832e-05, + "loss": 1.4288, + "num_input_tokens_seen": 164490480, + "step": 10223 + }, + { + "epoch": 0.7161732643358113, + "grad_norm": 4.075133800506592, + "learning_rate": 2.8449905429071804e-05, + "loss": 1.1901, + "num_input_tokens_seen": 164506864, + "step": 10224 + }, + { + "epoch": 0.7162433125815405, + "grad_norm": 3.3851492404937744, + "learning_rate": 2.8442907180385286e-05, + "loss": 0.9236, + "num_input_tokens_seen": 164523248, + "step": 10225 + }, + { + "epoch": 0.7163133608272698, + "grad_norm": 3.893324136734009, + "learning_rate": 2.8435908931698774e-05, + "loss": 1.1086, + "num_input_tokens_seen": 164539632, + "step": 10226 + }, + { + "epoch": 0.716383409072999, + "grad_norm": 5.058289527893066, + "learning_rate": 2.842891068301226e-05, + "loss": 1.1043, + "num_input_tokens_seen": 164556016, + "step": 10227 + }, + { + "epoch": 0.7164534573187282, + "grad_norm": 3.836785078048706, + "learning_rate": 2.842191243432574e-05, + "loss": 1.0504, + "num_input_tokens_seen": 164572400, + "step": 10228 + }, + { + "epoch": 0.7165235055644575, + "grad_norm": 3.5536303520202637, + "learning_rate": 2.8414914185639237e-05, + "loss": 0.8815, + "num_input_tokens_seen": 164588784, + "step": 10229 + }, + { + "epoch": 0.7165935538101867, + "grad_norm": 3.2314844131469727, + "learning_rate": 2.8407915936952712e-05, + "loss": 0.9303, + "num_input_tokens_seen": 164604560, + "step": 10230 + }, + { + "epoch": 0.7166636020559161, + "grad_norm": 5.271570205688477, + "learning_rate": 2.8400917688266197e-05, + "loss": 1.1015, + "num_input_tokens_seen": 164620944, + "step": 10231 + }, + { + "epoch": 0.7167336503016453, + "grad_norm": 4.171263694763184, + "learning_rate": 2.8393919439579692e-05, + "loss": 0.9802, + "num_input_tokens_seen": 164637328, + "step": 10232 + }, + { + "epoch": 0.7168036985473745, + "grad_norm": 5.417962551116943, + "learning_rate": 2.8386921190893168e-05, + "loss": 1.0356, + "num_input_tokens_seen": 164653216, + "step": 10233 + }, + { + "epoch": 0.7168737467931038, + "grad_norm": 3.821176767349243, + "learning_rate": 2.837992294220666e-05, + "loss": 0.9461, + "num_input_tokens_seen": 164669408, + "step": 10234 + }, + { + "epoch": 0.716943795038833, + "grad_norm": 3.861858606338501, + "learning_rate": 2.8372924693520138e-05, + "loss": 1.0511, + "num_input_tokens_seen": 164685792, + "step": 10235 + }, + { + "epoch": 0.7170138432845622, + "grad_norm": 3.862168550491333, + "learning_rate": 2.836592644483363e-05, + "loss": 0.9552, + "num_input_tokens_seen": 164702176, + "step": 10236 + }, + { + "epoch": 0.7170838915302915, + "grad_norm": 4.597825527191162, + "learning_rate": 2.8358928196147105e-05, + "loss": 0.9609, + "num_input_tokens_seen": 164718472, + "step": 10237 + }, + { + "epoch": 0.7171539397760207, + "grad_norm": 3.795511245727539, + "learning_rate": 2.8351929947460594e-05, + "loss": 1.0721, + "num_input_tokens_seen": 164734632, + "step": 10238 + }, + { + "epoch": 0.71722398802175, + "grad_norm": 5.189326763153076, + "learning_rate": 2.8344931698774085e-05, + "loss": 1.1053, + "num_input_tokens_seen": 164750456, + "step": 10239 + }, + { + "epoch": 0.7172940362674792, + "grad_norm": 4.004899501800537, + "learning_rate": 2.833793345008756e-05, + "loss": 0.9984, + "num_input_tokens_seen": 164765152, + "step": 10240 + }, + { + "epoch": 0.7173640845132084, + "grad_norm": 3.9022834300994873, + "learning_rate": 2.8330935201401056e-05, + "loss": 1.1144, + "num_input_tokens_seen": 164780352, + "step": 10241 + }, + { + "epoch": 0.7174341327589377, + "grad_norm": 4.132354259490967, + "learning_rate": 2.832393695271453e-05, + "loss": 1.0381, + "num_input_tokens_seen": 164796736, + "step": 10242 + }, + { + "epoch": 0.717504181004667, + "grad_norm": 3.935626983642578, + "learning_rate": 2.8316938704028016e-05, + "loss": 1.0919, + "num_input_tokens_seen": 164813120, + "step": 10243 + }, + { + "epoch": 0.7175742292503962, + "grad_norm": 4.009433269500732, + "learning_rate": 2.830994045534151e-05, + "loss": 1.0977, + "num_input_tokens_seen": 164828848, + "step": 10244 + }, + { + "epoch": 0.7176442774961255, + "grad_norm": 4.445498943328857, + "learning_rate": 2.8302942206654993e-05, + "loss": 1.1814, + "num_input_tokens_seen": 164845232, + "step": 10245 + }, + { + "epoch": 0.7177143257418547, + "grad_norm": 5.5955119132995605, + "learning_rate": 2.829594395796848e-05, + "loss": 1.2163, + "num_input_tokens_seen": 164860696, + "step": 10246 + }, + { + "epoch": 0.717784373987584, + "grad_norm": 3.8706002235412598, + "learning_rate": 2.8288945709281957e-05, + "loss": 0.9127, + "num_input_tokens_seen": 164877080, + "step": 10247 + }, + { + "epoch": 0.7178544222333132, + "grad_norm": 3.740467071533203, + "learning_rate": 2.828194746059545e-05, + "loss": 1.1449, + "num_input_tokens_seen": 164893464, + "step": 10248 + }, + { + "epoch": 0.7179244704790424, + "grad_norm": 3.4275166988372803, + "learning_rate": 2.8274949211908934e-05, + "loss": 1.052, + "num_input_tokens_seen": 164909848, + "step": 10249 + }, + { + "epoch": 0.7179945187247717, + "grad_norm": 4.689492225646973, + "learning_rate": 2.8267950963222416e-05, + "loss": 1.0974, + "num_input_tokens_seen": 164926232, + "step": 10250 + }, + { + "epoch": 0.7180645669705009, + "grad_norm": 4.994006633758545, + "learning_rate": 2.8260952714535905e-05, + "loss": 1.0365, + "num_input_tokens_seen": 164942072, + "step": 10251 + }, + { + "epoch": 0.7181346152162302, + "grad_norm": 4.38145112991333, + "learning_rate": 2.825395446584938e-05, + "loss": 1.1655, + "num_input_tokens_seen": 164958456, + "step": 10252 + }, + { + "epoch": 0.7182046634619594, + "grad_norm": 4.806957244873047, + "learning_rate": 2.824695621716287e-05, + "loss": 1.0492, + "num_input_tokens_seen": 164974840, + "step": 10253 + }, + { + "epoch": 0.7182747117076886, + "grad_norm": 4.238828182220459, + "learning_rate": 2.8239957968476367e-05, + "loss": 1.1168, + "num_input_tokens_seen": 164991224, + "step": 10254 + }, + { + "epoch": 0.718344759953418, + "grad_norm": 3.530094623565674, + "learning_rate": 2.8232959719789842e-05, + "loss": 1.0449, + "num_input_tokens_seen": 165007608, + "step": 10255 + }, + { + "epoch": 0.7184148081991472, + "grad_norm": 3.990522623062134, + "learning_rate": 2.822596147110333e-05, + "loss": 1.1437, + "num_input_tokens_seen": 165023008, + "step": 10256 + }, + { + "epoch": 0.7184848564448764, + "grad_norm": 4.1331658363342285, + "learning_rate": 2.8218963222416812e-05, + "loss": 0.9851, + "num_input_tokens_seen": 165039392, + "step": 10257 + }, + { + "epoch": 0.7185549046906057, + "grad_norm": 6.6662211418151855, + "learning_rate": 2.8211964973730298e-05, + "loss": 0.9901, + "num_input_tokens_seen": 165055096, + "step": 10258 + }, + { + "epoch": 0.7186249529363349, + "grad_norm": 4.196070194244385, + "learning_rate": 2.820496672504379e-05, + "loss": 1.1699, + "num_input_tokens_seen": 165071352, + "step": 10259 + }, + { + "epoch": 0.7186950011820642, + "grad_norm": 3.613032579421997, + "learning_rate": 2.8197968476357268e-05, + "loss": 1.0881, + "num_input_tokens_seen": 165087192, + "step": 10260 + }, + { + "epoch": 0.7187650494277934, + "grad_norm": 4.036563396453857, + "learning_rate": 2.8190970227670753e-05, + "loss": 1.0783, + "num_input_tokens_seen": 165103280, + "step": 10261 + }, + { + "epoch": 0.7188350976735226, + "grad_norm": 3.9039313793182373, + "learning_rate": 2.8183971978984235e-05, + "loss": 0.9553, + "num_input_tokens_seen": 165119232, + "step": 10262 + }, + { + "epoch": 0.7189051459192519, + "grad_norm": 5.912670612335205, + "learning_rate": 2.8176973730297724e-05, + "loss": 1.2245, + "num_input_tokens_seen": 165135616, + "step": 10263 + }, + { + "epoch": 0.7189751941649811, + "grad_norm": 4.601632118225098, + "learning_rate": 2.81699754816112e-05, + "loss": 1.1137, + "num_input_tokens_seen": 165150624, + "step": 10264 + }, + { + "epoch": 0.7190452424107103, + "grad_norm": 4.7475152015686035, + "learning_rate": 2.816297723292469e-05, + "loss": 0.8658, + "num_input_tokens_seen": 165165864, + "step": 10265 + }, + { + "epoch": 0.7191152906564396, + "grad_norm": 3.858160972595215, + "learning_rate": 2.8155978984238186e-05, + "loss": 1.1066, + "num_input_tokens_seen": 165182248, + "step": 10266 + }, + { + "epoch": 0.7191853389021688, + "grad_norm": 4.388938903808594, + "learning_rate": 2.814898073555166e-05, + "loss": 1.145, + "num_input_tokens_seen": 165198520, + "step": 10267 + }, + { + "epoch": 0.7192553871478982, + "grad_norm": 3.8434929847717285, + "learning_rate": 2.814198248686515e-05, + "loss": 0.8912, + "num_input_tokens_seen": 165214032, + "step": 10268 + }, + { + "epoch": 0.7193254353936274, + "grad_norm": 3.855112075805664, + "learning_rate": 2.813498423817863e-05, + "loss": 0.9904, + "num_input_tokens_seen": 165230416, + "step": 10269 + }, + { + "epoch": 0.7193954836393566, + "grad_norm": 3.56986141204834, + "learning_rate": 2.8127985989492123e-05, + "loss": 0.9421, + "num_input_tokens_seen": 165246800, + "step": 10270 + }, + { + "epoch": 0.7194655318850859, + "grad_norm": 6.553144454956055, + "learning_rate": 2.812098774080561e-05, + "loss": 1.054, + "num_input_tokens_seen": 165263184, + "step": 10271 + }, + { + "epoch": 0.7195355801308151, + "grad_norm": 3.7116053104400635, + "learning_rate": 2.8113989492119087e-05, + "loss": 1.0871, + "num_input_tokens_seen": 165279568, + "step": 10272 + }, + { + "epoch": 0.7196056283765443, + "grad_norm": 4.297267436981201, + "learning_rate": 2.8106991243432572e-05, + "loss": 0.9588, + "num_input_tokens_seen": 165295952, + "step": 10273 + }, + { + "epoch": 0.7196756766222736, + "grad_norm": 3.4298181533813477, + "learning_rate": 2.8099992994746054e-05, + "loss": 0.9327, + "num_input_tokens_seen": 165312184, + "step": 10274 + }, + { + "epoch": 0.7197457248680028, + "grad_norm": 4.35665225982666, + "learning_rate": 2.809299474605955e-05, + "loss": 1.1581, + "num_input_tokens_seen": 165328072, + "step": 10275 + }, + { + "epoch": 0.7198157731137321, + "grad_norm": 4.917849063873291, + "learning_rate": 2.8085996497373035e-05, + "loss": 1.0367, + "num_input_tokens_seen": 165344296, + "step": 10276 + }, + { + "epoch": 0.7198858213594613, + "grad_norm": 3.7641944885253906, + "learning_rate": 2.807899824868651e-05, + "loss": 1.04, + "num_input_tokens_seen": 165360416, + "step": 10277 + }, + { + "epoch": 0.7199558696051905, + "grad_norm": 3.7544620037078857, + "learning_rate": 2.8072000000000005e-05, + "loss": 1.2186, + "num_input_tokens_seen": 165376800, + "step": 10278 + }, + { + "epoch": 0.7200259178509198, + "grad_norm": 4.157299041748047, + "learning_rate": 2.806500175131348e-05, + "loss": 1.0123, + "num_input_tokens_seen": 165392984, + "step": 10279 + }, + { + "epoch": 0.720095966096649, + "grad_norm": 4.2357635498046875, + "learning_rate": 2.8058003502626972e-05, + "loss": 0.9621, + "num_input_tokens_seen": 165408392, + "step": 10280 + }, + { + "epoch": 0.7201660143423783, + "grad_norm": 4.561477184295654, + "learning_rate": 2.805100525394046e-05, + "loss": 1.156, + "num_input_tokens_seen": 165424336, + "step": 10281 + }, + { + "epoch": 0.7202360625881076, + "grad_norm": 4.365609169006348, + "learning_rate": 2.8044007005253942e-05, + "loss": 1.0937, + "num_input_tokens_seen": 165440432, + "step": 10282 + }, + { + "epoch": 0.7203061108338368, + "grad_norm": 3.7303407192230225, + "learning_rate": 2.8037008756567428e-05, + "loss": 0.9721, + "num_input_tokens_seen": 165456816, + "step": 10283 + }, + { + "epoch": 0.7203761590795661, + "grad_norm": 4.923166275024414, + "learning_rate": 2.8030010507880906e-05, + "loss": 1.0347, + "num_input_tokens_seen": 165473200, + "step": 10284 + }, + { + "epoch": 0.7204462073252953, + "grad_norm": 3.866345167160034, + "learning_rate": 2.8023012259194398e-05, + "loss": 0.9164, + "num_input_tokens_seen": 165489584, + "step": 10285 + }, + { + "epoch": 0.7205162555710245, + "grad_norm": 4.118798732757568, + "learning_rate": 2.8016014010507887e-05, + "loss": 1.0242, + "num_input_tokens_seen": 165505968, + "step": 10286 + }, + { + "epoch": 0.7205863038167538, + "grad_norm": 3.8411672115325928, + "learning_rate": 2.8009015761821365e-05, + "loss": 0.9728, + "num_input_tokens_seen": 165522352, + "step": 10287 + }, + { + "epoch": 0.720656352062483, + "grad_norm": 3.7325501441955566, + "learning_rate": 2.8002017513134854e-05, + "loss": 1.0796, + "num_input_tokens_seen": 165538736, + "step": 10288 + }, + { + "epoch": 0.7207264003082123, + "grad_norm": 5.980128288269043, + "learning_rate": 2.799501926444833e-05, + "loss": 0.9491, + "num_input_tokens_seen": 165554584, + "step": 10289 + }, + { + "epoch": 0.7207964485539415, + "grad_norm": 8.208232879638672, + "learning_rate": 2.7988021015761824e-05, + "loss": 1.0137, + "num_input_tokens_seen": 165570392, + "step": 10290 + }, + { + "epoch": 0.7208664967996707, + "grad_norm": 3.569568634033203, + "learning_rate": 2.7981022767075306e-05, + "loss": 0.838, + "num_input_tokens_seen": 165586776, + "step": 10291 + }, + { + "epoch": 0.7209365450454001, + "grad_norm": 3.5389115810394287, + "learning_rate": 2.797402451838879e-05, + "loss": 1.117, + "num_input_tokens_seen": 165603160, + "step": 10292 + }, + { + "epoch": 0.7210065932911293, + "grad_norm": 3.325836658477783, + "learning_rate": 2.796702626970228e-05, + "loss": 0.9836, + "num_input_tokens_seen": 165619544, + "step": 10293 + }, + { + "epoch": 0.7210766415368585, + "grad_norm": 5.3725972175598145, + "learning_rate": 2.796002802101576e-05, + "loss": 1.1432, + "num_input_tokens_seen": 165634704, + "step": 10294 + }, + { + "epoch": 0.7211466897825878, + "grad_norm": 4.499677658081055, + "learning_rate": 2.7953029772329247e-05, + "loss": 1.1066, + "num_input_tokens_seen": 165651088, + "step": 10295 + }, + { + "epoch": 0.721216738028317, + "grad_norm": 6.7855305671691895, + "learning_rate": 2.794603152364273e-05, + "loss": 1.2053, + "num_input_tokens_seen": 165666672, + "step": 10296 + }, + { + "epoch": 0.7212867862740463, + "grad_norm": 5.920072555541992, + "learning_rate": 2.7939033274956217e-05, + "loss": 0.8738, + "num_input_tokens_seen": 165683056, + "step": 10297 + }, + { + "epoch": 0.7213568345197755, + "grad_norm": 4.631723880767822, + "learning_rate": 2.7932035026269702e-05, + "loss": 1.244, + "num_input_tokens_seen": 165698272, + "step": 10298 + }, + { + "epoch": 0.7214268827655047, + "grad_norm": 4.384388446807861, + "learning_rate": 2.7925036777583184e-05, + "loss": 1.0547, + "num_input_tokens_seen": 165714656, + "step": 10299 + }, + { + "epoch": 0.721496931011234, + "grad_norm": 3.871509075164795, + "learning_rate": 2.791803852889668e-05, + "loss": 1.1946, + "num_input_tokens_seen": 165731040, + "step": 10300 + }, + { + "epoch": 0.7215669792569632, + "grad_norm": 3.4678781032562256, + "learning_rate": 2.7911040280210154e-05, + "loss": 1.0712, + "num_input_tokens_seen": 165747176, + "step": 10301 + }, + { + "epoch": 0.7216370275026924, + "grad_norm": 3.8726000785827637, + "learning_rate": 2.7904042031523643e-05, + "loss": 0.9682, + "num_input_tokens_seen": 165763560, + "step": 10302 + }, + { + "epoch": 0.7217070757484217, + "grad_norm": 3.821103096008301, + "learning_rate": 2.7897043782837128e-05, + "loss": 1.092, + "num_input_tokens_seen": 165779720, + "step": 10303 + }, + { + "epoch": 0.721777123994151, + "grad_norm": 3.7635152339935303, + "learning_rate": 2.789004553415061e-05, + "loss": 1.0633, + "num_input_tokens_seen": 165796016, + "step": 10304 + }, + { + "epoch": 0.7218471722398803, + "grad_norm": 3.477344036102295, + "learning_rate": 2.7883047285464102e-05, + "loss": 1.038, + "num_input_tokens_seen": 165812400, + "step": 10305 + }, + { + "epoch": 0.7219172204856095, + "grad_norm": 4.083456039428711, + "learning_rate": 2.787604903677758e-05, + "loss": 1.1106, + "num_input_tokens_seen": 165828464, + "step": 10306 + }, + { + "epoch": 0.7219872687313387, + "grad_norm": 3.9229588508605957, + "learning_rate": 2.7869050788091066e-05, + "loss": 1.1126, + "num_input_tokens_seen": 165844328, + "step": 10307 + }, + { + "epoch": 0.722057316977068, + "grad_norm": 3.7151594161987305, + "learning_rate": 2.786205253940456e-05, + "loss": 1.0188, + "num_input_tokens_seen": 165860632, + "step": 10308 + }, + { + "epoch": 0.7221273652227972, + "grad_norm": 3.99238920211792, + "learning_rate": 2.7855054290718036e-05, + "loss": 0.929, + "num_input_tokens_seen": 165876984, + "step": 10309 + }, + { + "epoch": 0.7221974134685264, + "grad_norm": 4.6939377784729, + "learning_rate": 2.7848056042031528e-05, + "loss": 1.0269, + "num_input_tokens_seen": 165893208, + "step": 10310 + }, + { + "epoch": 0.7222674617142557, + "grad_norm": 3.9611010551452637, + "learning_rate": 2.7841057793345003e-05, + "loss": 1.191, + "num_input_tokens_seen": 165909400, + "step": 10311 + }, + { + "epoch": 0.7223375099599849, + "grad_norm": 3.9842464923858643, + "learning_rate": 2.78340595446585e-05, + "loss": 0.9288, + "num_input_tokens_seen": 165925064, + "step": 10312 + }, + { + "epoch": 0.7224075582057142, + "grad_norm": 3.5810182094573975, + "learning_rate": 2.7827061295971974e-05, + "loss": 1.0607, + "num_input_tokens_seen": 165941448, + "step": 10313 + }, + { + "epoch": 0.7224776064514434, + "grad_norm": 3.8511481285095215, + "learning_rate": 2.7820063047285462e-05, + "loss": 1.1114, + "num_input_tokens_seen": 165957720, + "step": 10314 + }, + { + "epoch": 0.7225476546971726, + "grad_norm": 3.8797762393951416, + "learning_rate": 2.7813064798598954e-05, + "loss": 1.0452, + "num_input_tokens_seen": 165973928, + "step": 10315 + }, + { + "epoch": 0.722617702942902, + "grad_norm": 3.9000251293182373, + "learning_rate": 2.7806066549912436e-05, + "loss": 1.0848, + "num_input_tokens_seen": 165989416, + "step": 10316 + }, + { + "epoch": 0.7226877511886312, + "grad_norm": 6.081394195556641, + "learning_rate": 2.779906830122592e-05, + "loss": 1.0619, + "num_input_tokens_seen": 166005680, + "step": 10317 + }, + { + "epoch": 0.7227577994343605, + "grad_norm": 3.676379919052124, + "learning_rate": 2.77920700525394e-05, + "loss": 0.9908, + "num_input_tokens_seen": 166022064, + "step": 10318 + }, + { + "epoch": 0.7228278476800897, + "grad_norm": 4.574796199798584, + "learning_rate": 2.7785071803852885e-05, + "loss": 1.139, + "num_input_tokens_seen": 166037800, + "step": 10319 + }, + { + "epoch": 0.7228978959258189, + "grad_norm": 3.7007546424865723, + "learning_rate": 2.777807355516638e-05, + "loss": 0.8979, + "num_input_tokens_seen": 166054184, + "step": 10320 + }, + { + "epoch": 0.7229679441715482, + "grad_norm": 3.6782896518707275, + "learning_rate": 2.777107530647986e-05, + "loss": 1.0965, + "num_input_tokens_seen": 166070568, + "step": 10321 + }, + { + "epoch": 0.7230379924172774, + "grad_norm": 5.105771064758301, + "learning_rate": 2.7764077057793347e-05, + "loss": 1.3118, + "num_input_tokens_seen": 166086952, + "step": 10322 + }, + { + "epoch": 0.7231080406630066, + "grad_norm": 3.6661746501922607, + "learning_rate": 2.7757078809106822e-05, + "loss": 0.9266, + "num_input_tokens_seen": 166102936, + "step": 10323 + }, + { + "epoch": 0.7231780889087359, + "grad_norm": 4.583796977996826, + "learning_rate": 2.7750080560420318e-05, + "loss": 0.9997, + "num_input_tokens_seen": 166119320, + "step": 10324 + }, + { + "epoch": 0.7232481371544651, + "grad_norm": 4.289971828460693, + "learning_rate": 2.7743082311733803e-05, + "loss": 1.0232, + "num_input_tokens_seen": 166135704, + "step": 10325 + }, + { + "epoch": 0.7233181854001944, + "grad_norm": 4.3797807693481445, + "learning_rate": 2.7736084063047285e-05, + "loss": 1.0123, + "num_input_tokens_seen": 166152088, + "step": 10326 + }, + { + "epoch": 0.7233882336459236, + "grad_norm": 4.879162788391113, + "learning_rate": 2.7729085814360773e-05, + "loss": 0.8722, + "num_input_tokens_seen": 166168168, + "step": 10327 + }, + { + "epoch": 0.7234582818916528, + "grad_norm": 4.368032455444336, + "learning_rate": 2.7722087565674255e-05, + "loss": 0.892, + "num_input_tokens_seen": 166184072, + "step": 10328 + }, + { + "epoch": 0.7235283301373822, + "grad_norm": 4.858910083770752, + "learning_rate": 2.771508931698774e-05, + "loss": 1.1479, + "num_input_tokens_seen": 166199168, + "step": 10329 + }, + { + "epoch": 0.7235983783831114, + "grad_norm": 5.889763355255127, + "learning_rate": 2.770809106830123e-05, + "loss": 1.1444, + "num_input_tokens_seen": 166215552, + "step": 10330 + }, + { + "epoch": 0.7236684266288406, + "grad_norm": 6.488298416137695, + "learning_rate": 2.770109281961471e-05, + "loss": 1.026, + "num_input_tokens_seen": 166231560, + "step": 10331 + }, + { + "epoch": 0.7237384748745699, + "grad_norm": 5.49334716796875, + "learning_rate": 2.7694094570928196e-05, + "loss": 1.0944, + "num_input_tokens_seen": 166247944, + "step": 10332 + }, + { + "epoch": 0.7238085231202991, + "grad_norm": 4.890691757202148, + "learning_rate": 2.7687096322241678e-05, + "loss": 0.8141, + "num_input_tokens_seen": 166264328, + "step": 10333 + }, + { + "epoch": 0.7238785713660284, + "grad_norm": 3.3884124755859375, + "learning_rate": 2.7680098073555166e-05, + "loss": 0.9279, + "num_input_tokens_seen": 166280712, + "step": 10334 + }, + { + "epoch": 0.7239486196117576, + "grad_norm": 5.4549736976623535, + "learning_rate": 2.7673099824868655e-05, + "loss": 1.0552, + "num_input_tokens_seen": 166295736, + "step": 10335 + }, + { + "epoch": 0.7240186678574868, + "grad_norm": 4.9658966064453125, + "learning_rate": 2.7666101576182137e-05, + "loss": 0.926, + "num_input_tokens_seen": 166312120, + "step": 10336 + }, + { + "epoch": 0.7240887161032161, + "grad_norm": 5.078591823577881, + "learning_rate": 2.7659103327495622e-05, + "loss": 1.0816, + "num_input_tokens_seen": 166328504, + "step": 10337 + }, + { + "epoch": 0.7241587643489453, + "grad_norm": 3.609013319015503, + "learning_rate": 2.7652105078809104e-05, + "loss": 0.9452, + "num_input_tokens_seen": 166344888, + "step": 10338 + }, + { + "epoch": 0.7242288125946745, + "grad_norm": 3.5383999347686768, + "learning_rate": 2.7645106830122592e-05, + "loss": 0.9776, + "num_input_tokens_seen": 166361272, + "step": 10339 + }, + { + "epoch": 0.7242988608404038, + "grad_norm": 4.391222953796387, + "learning_rate": 2.7638108581436074e-05, + "loss": 1.1571, + "num_input_tokens_seen": 166376416, + "step": 10340 + }, + { + "epoch": 0.724368909086133, + "grad_norm": 4.0183234214782715, + "learning_rate": 2.763111033274956e-05, + "loss": 1.0534, + "num_input_tokens_seen": 166392800, + "step": 10341 + }, + { + "epoch": 0.7244389573318624, + "grad_norm": 5.366726875305176, + "learning_rate": 2.7624112084063055e-05, + "loss": 1.0972, + "num_input_tokens_seen": 166408600, + "step": 10342 + }, + { + "epoch": 0.7245090055775916, + "grad_norm": 4.471798419952393, + "learning_rate": 2.761711383537653e-05, + "loss": 1.1936, + "num_input_tokens_seen": 166424984, + "step": 10343 + }, + { + "epoch": 0.7245790538233208, + "grad_norm": 3.707608222961426, + "learning_rate": 2.7610115586690015e-05, + "loss": 0.8556, + "num_input_tokens_seen": 166441368, + "step": 10344 + }, + { + "epoch": 0.7246491020690501, + "grad_norm": 3.525665283203125, + "learning_rate": 2.7603117338003497e-05, + "loss": 0.9541, + "num_input_tokens_seen": 166457752, + "step": 10345 + }, + { + "epoch": 0.7247191503147793, + "grad_norm": 4.538028240203857, + "learning_rate": 2.7596119089316985e-05, + "loss": 1.0511, + "num_input_tokens_seen": 166474136, + "step": 10346 + }, + { + "epoch": 0.7247891985605085, + "grad_norm": 3.4679977893829346, + "learning_rate": 2.7589120840630477e-05, + "loss": 0.9038, + "num_input_tokens_seen": 166490520, + "step": 10347 + }, + { + "epoch": 0.7248592468062378, + "grad_norm": 3.8179333209991455, + "learning_rate": 2.7582122591943956e-05, + "loss": 1.1053, + "num_input_tokens_seen": 166506904, + "step": 10348 + }, + { + "epoch": 0.724929295051967, + "grad_norm": 4.212880611419678, + "learning_rate": 2.7575124343257448e-05, + "loss": 1.022, + "num_input_tokens_seen": 166523288, + "step": 10349 + }, + { + "epoch": 0.7249993432976963, + "grad_norm": 3.875537157058716, + "learning_rate": 2.7568126094570923e-05, + "loss": 1.0622, + "num_input_tokens_seen": 166539672, + "step": 10350 + }, + { + "epoch": 0.7250693915434255, + "grad_norm": 3.7325987815856934, + "learning_rate": 2.756112784588441e-05, + "loss": 1.0495, + "num_input_tokens_seen": 166555616, + "step": 10351 + }, + { + "epoch": 0.7251394397891547, + "grad_norm": 4.300967693328857, + "learning_rate": 2.7554129597197903e-05, + "loss": 1.1065, + "num_input_tokens_seen": 166571368, + "step": 10352 + }, + { + "epoch": 0.725209488034884, + "grad_norm": 4.1681952476501465, + "learning_rate": 2.7547131348511378e-05, + "loss": 1.1373, + "num_input_tokens_seen": 166587752, + "step": 10353 + }, + { + "epoch": 0.7252795362806133, + "grad_norm": 5.550619125366211, + "learning_rate": 2.754013309982487e-05, + "loss": 0.9134, + "num_input_tokens_seen": 166603864, + "step": 10354 + }, + { + "epoch": 0.7253495845263426, + "grad_norm": 4.052687644958496, + "learning_rate": 2.753313485113835e-05, + "loss": 1.2837, + "num_input_tokens_seen": 166620248, + "step": 10355 + }, + { + "epoch": 0.7254196327720718, + "grad_norm": 6.030454635620117, + "learning_rate": 2.7526136602451834e-05, + "loss": 1.1705, + "num_input_tokens_seen": 166636632, + "step": 10356 + }, + { + "epoch": 0.725489681017801, + "grad_norm": 4.460616588592529, + "learning_rate": 2.751913835376533e-05, + "loss": 0.9572, + "num_input_tokens_seen": 166652952, + "step": 10357 + }, + { + "epoch": 0.7255597292635303, + "grad_norm": 4.548913478851318, + "learning_rate": 2.751214010507881e-05, + "loss": 1.3622, + "num_input_tokens_seen": 166668352, + "step": 10358 + }, + { + "epoch": 0.7256297775092595, + "grad_norm": 3.6430022716522217, + "learning_rate": 2.7505141856392296e-05, + "loss": 1.0496, + "num_input_tokens_seen": 166684736, + "step": 10359 + }, + { + "epoch": 0.7256998257549887, + "grad_norm": 3.8594837188720703, + "learning_rate": 2.7498143607705775e-05, + "loss": 0.9465, + "num_input_tokens_seen": 166700160, + "step": 10360 + }, + { + "epoch": 0.725769874000718, + "grad_norm": 3.8263142108917236, + "learning_rate": 2.7491145359019267e-05, + "loss": 0.9949, + "num_input_tokens_seen": 166715784, + "step": 10361 + }, + { + "epoch": 0.7258399222464472, + "grad_norm": 4.155397415161133, + "learning_rate": 2.7484147110332752e-05, + "loss": 1.0083, + "num_input_tokens_seen": 166732168, + "step": 10362 + }, + { + "epoch": 0.7259099704921765, + "grad_norm": 4.515734672546387, + "learning_rate": 2.7477148861646234e-05, + "loss": 0.8461, + "num_input_tokens_seen": 166747576, + "step": 10363 + }, + { + "epoch": 0.7259800187379057, + "grad_norm": 4.12512731552124, + "learning_rate": 2.7470150612959722e-05, + "loss": 1.1981, + "num_input_tokens_seen": 166762712, + "step": 10364 + }, + { + "epoch": 0.726050066983635, + "grad_norm": 4.325915336608887, + "learning_rate": 2.7463152364273197e-05, + "loss": 1.1736, + "num_input_tokens_seen": 166778208, + "step": 10365 + }, + { + "epoch": 0.7261201152293643, + "grad_norm": 4.159921169281006, + "learning_rate": 2.745615411558669e-05, + "loss": 1.2615, + "num_input_tokens_seen": 166793592, + "step": 10366 + }, + { + "epoch": 0.7261901634750935, + "grad_norm": 4.307136535644531, + "learning_rate": 2.744915586690017e-05, + "loss": 1.2221, + "num_input_tokens_seen": 166809976, + "step": 10367 + }, + { + "epoch": 0.7262602117208227, + "grad_norm": 3.4330127239227295, + "learning_rate": 2.744215761821366e-05, + "loss": 0.9535, + "num_input_tokens_seen": 166826168, + "step": 10368 + }, + { + "epoch": 0.726330259966552, + "grad_norm": 3.9667155742645264, + "learning_rate": 2.7435159369527148e-05, + "loss": 0.9927, + "num_input_tokens_seen": 166841696, + "step": 10369 + }, + { + "epoch": 0.7264003082122812, + "grad_norm": 5.606724262237549, + "learning_rate": 2.742816112084063e-05, + "loss": 0.9524, + "num_input_tokens_seen": 166858080, + "step": 10370 + }, + { + "epoch": 0.7264703564580105, + "grad_norm": 4.5639801025390625, + "learning_rate": 2.7421162872154115e-05, + "loss": 1.0551, + "num_input_tokens_seen": 166874152, + "step": 10371 + }, + { + "epoch": 0.7265404047037397, + "grad_norm": 4.040156364440918, + "learning_rate": 2.741416462346759e-05, + "loss": 1.0179, + "num_input_tokens_seen": 166890112, + "step": 10372 + }, + { + "epoch": 0.7266104529494689, + "grad_norm": 5.87882137298584, + "learning_rate": 2.7407166374781086e-05, + "loss": 0.9229, + "num_input_tokens_seen": 166906496, + "step": 10373 + }, + { + "epoch": 0.7266805011951982, + "grad_norm": 3.4096579551696777, + "learning_rate": 2.740016812609457e-05, + "loss": 0.9523, + "num_input_tokens_seen": 166922840, + "step": 10374 + }, + { + "epoch": 0.7267505494409274, + "grad_norm": 6.920338153839111, + "learning_rate": 2.7393169877408053e-05, + "loss": 1.2237, + "num_input_tokens_seen": 166939224, + "step": 10375 + }, + { + "epoch": 0.7268205976866566, + "grad_norm": 4.722144603729248, + "learning_rate": 2.738617162872154e-05, + "loss": 1.1155, + "num_input_tokens_seen": 166955288, + "step": 10376 + }, + { + "epoch": 0.726890645932386, + "grad_norm": 3.888092517852783, + "learning_rate": 2.7379173380035016e-05, + "loss": 1.123, + "num_input_tokens_seen": 166971672, + "step": 10377 + }, + { + "epoch": 0.7269606941781152, + "grad_norm": 4.694401264190674, + "learning_rate": 2.7372175131348508e-05, + "loss": 1.0697, + "num_input_tokens_seen": 166987792, + "step": 10378 + }, + { + "epoch": 0.7270307424238445, + "grad_norm": 3.4407129287719727, + "learning_rate": 2.7365176882662004e-05, + "loss": 0.8966, + "num_input_tokens_seen": 167003544, + "step": 10379 + }, + { + "epoch": 0.7271007906695737, + "grad_norm": 4.054849147796631, + "learning_rate": 2.735817863397548e-05, + "loss": 0.9385, + "num_input_tokens_seen": 167018464, + "step": 10380 + }, + { + "epoch": 0.7271708389153029, + "grad_norm": 3.5279064178466797, + "learning_rate": 2.7351180385288967e-05, + "loss": 0.986, + "num_input_tokens_seen": 167034120, + "step": 10381 + }, + { + "epoch": 0.7272408871610322, + "grad_norm": 3.570910692214966, + "learning_rate": 2.734418213660245e-05, + "loss": 0.9576, + "num_input_tokens_seen": 167050504, + "step": 10382 + }, + { + "epoch": 0.7273109354067614, + "grad_norm": 3.892106771469116, + "learning_rate": 2.733718388791594e-05, + "loss": 1.0227, + "num_input_tokens_seen": 167065736, + "step": 10383 + }, + { + "epoch": 0.7273809836524906, + "grad_norm": 4.2454094886779785, + "learning_rate": 2.7330185639229426e-05, + "loss": 1.117, + "num_input_tokens_seen": 167081600, + "step": 10384 + }, + { + "epoch": 0.7274510318982199, + "grad_norm": 3.517366886138916, + "learning_rate": 2.7323187390542905e-05, + "loss": 0.7899, + "num_input_tokens_seen": 167097576, + "step": 10385 + }, + { + "epoch": 0.7275210801439491, + "grad_norm": 4.315273761749268, + "learning_rate": 2.731618914185639e-05, + "loss": 1.1475, + "num_input_tokens_seen": 167113664, + "step": 10386 + }, + { + "epoch": 0.7275911283896784, + "grad_norm": 4.031404972076416, + "learning_rate": 2.7309190893169872e-05, + "loss": 0.9619, + "num_input_tokens_seen": 167129280, + "step": 10387 + }, + { + "epoch": 0.7276611766354076, + "grad_norm": 4.945637226104736, + "learning_rate": 2.7302192644483364e-05, + "loss": 0.9487, + "num_input_tokens_seen": 167145664, + "step": 10388 + }, + { + "epoch": 0.7277312248811368, + "grad_norm": 4.158786773681641, + "learning_rate": 2.7295194395796852e-05, + "loss": 1.1751, + "num_input_tokens_seen": 167162048, + "step": 10389 + }, + { + "epoch": 0.7278012731268662, + "grad_norm": 4.599488258361816, + "learning_rate": 2.7288196147110327e-05, + "loss": 1.0968, + "num_input_tokens_seen": 167178432, + "step": 10390 + }, + { + "epoch": 0.7278713213725954, + "grad_norm": 4.452554702758789, + "learning_rate": 2.7281197898423823e-05, + "loss": 1.0499, + "num_input_tokens_seen": 167194816, + "step": 10391 + }, + { + "epoch": 0.7279413696183247, + "grad_norm": 5.320280075073242, + "learning_rate": 2.7274199649737298e-05, + "loss": 0.9845, + "num_input_tokens_seen": 167209824, + "step": 10392 + }, + { + "epoch": 0.7280114178640539, + "grad_norm": 3.955512285232544, + "learning_rate": 2.726720140105079e-05, + "loss": 1.1117, + "num_input_tokens_seen": 167226208, + "step": 10393 + }, + { + "epoch": 0.7280814661097831, + "grad_norm": 3.848613977432251, + "learning_rate": 2.7260203152364268e-05, + "loss": 1.0484, + "num_input_tokens_seen": 167242232, + "step": 10394 + }, + { + "epoch": 0.7281515143555124, + "grad_norm": 5.797041893005371, + "learning_rate": 2.725320490367776e-05, + "loss": 1.0885, + "num_input_tokens_seen": 167257312, + "step": 10395 + }, + { + "epoch": 0.7282215626012416, + "grad_norm": 4.228905200958252, + "learning_rate": 2.7246206654991245e-05, + "loss": 1.087, + "num_input_tokens_seen": 167273696, + "step": 10396 + }, + { + "epoch": 0.7282916108469708, + "grad_norm": 5.637840270996094, + "learning_rate": 2.7239208406304724e-05, + "loss": 1.1459, + "num_input_tokens_seen": 167290080, + "step": 10397 + }, + { + "epoch": 0.7283616590927001, + "grad_norm": 3.923344850540161, + "learning_rate": 2.7232210157618216e-05, + "loss": 1.0636, + "num_input_tokens_seen": 167306464, + "step": 10398 + }, + { + "epoch": 0.7284317073384293, + "grad_norm": 4.073136806488037, + "learning_rate": 2.722521190893169e-05, + "loss": 1.2086, + "num_input_tokens_seen": 167322848, + "step": 10399 + }, + { + "epoch": 0.7285017555841586, + "grad_norm": 3.957937717437744, + "learning_rate": 2.7218213660245183e-05, + "loss": 1.1213, + "num_input_tokens_seen": 167339232, + "step": 10400 + }, + { + "epoch": 0.7285017555841586, + "eval_loss": 1.1175782680511475, + "eval_runtime": 0.192, + "eval_samples_per_second": 5.208, + "eval_steps_per_second": 5.208, + "num_input_tokens_seen": 167339232, + "step": 10400 + }, + { + "epoch": 0.7285718038298878, + "grad_norm": 4.422255992889404, + "learning_rate": 2.721121541155867e-05, + "loss": 1.018, + "num_input_tokens_seen": 167355312, + "step": 10401 + }, + { + "epoch": 0.728641852075617, + "grad_norm": 3.6482644081115723, + "learning_rate": 2.7204217162872146e-05, + "loss": 1.1249, + "num_input_tokens_seen": 167371552, + "step": 10402 + }, + { + "epoch": 0.7287119003213464, + "grad_norm": 3.3811533451080322, + "learning_rate": 2.7197218914185642e-05, + "loss": 0.9752, + "num_input_tokens_seen": 167387936, + "step": 10403 + }, + { + "epoch": 0.7287819485670756, + "grad_norm": 5.434871196746826, + "learning_rate": 2.7190220665499124e-05, + "loss": 1.0555, + "num_input_tokens_seen": 167403208, + "step": 10404 + }, + { + "epoch": 0.7288519968128048, + "grad_norm": 3.4232592582702637, + "learning_rate": 2.718322241681261e-05, + "loss": 0.9457, + "num_input_tokens_seen": 167419336, + "step": 10405 + }, + { + "epoch": 0.7289220450585341, + "grad_norm": 3.75134539604187, + "learning_rate": 2.7176224168126097e-05, + "loss": 0.9731, + "num_input_tokens_seen": 167435544, + "step": 10406 + }, + { + "epoch": 0.7289920933042633, + "grad_norm": 3.936197519302368, + "learning_rate": 2.716922591943958e-05, + "loss": 1.023, + "num_input_tokens_seen": 167451600, + "step": 10407 + }, + { + "epoch": 0.7290621415499926, + "grad_norm": 5.55162239074707, + "learning_rate": 2.7162227670753064e-05, + "loss": 1.1445, + "num_input_tokens_seen": 167467728, + "step": 10408 + }, + { + "epoch": 0.7291321897957218, + "grad_norm": 4.733914852142334, + "learning_rate": 2.7155229422066546e-05, + "loss": 1.1873, + "num_input_tokens_seen": 167483800, + "step": 10409 + }, + { + "epoch": 0.729202238041451, + "grad_norm": 4.848385810852051, + "learning_rate": 2.7148231173380035e-05, + "loss": 1.1684, + "num_input_tokens_seen": 167500184, + "step": 10410 + }, + { + "epoch": 0.7292722862871803, + "grad_norm": 4.961422443389893, + "learning_rate": 2.714123292469352e-05, + "loss": 1.1039, + "num_input_tokens_seen": 167516512, + "step": 10411 + }, + { + "epoch": 0.7293423345329095, + "grad_norm": 4.644773006439209, + "learning_rate": 2.7134234676007002e-05, + "loss": 1.2127, + "num_input_tokens_seen": 167532896, + "step": 10412 + }, + { + "epoch": 0.7294123827786387, + "grad_norm": 3.76466965675354, + "learning_rate": 2.7127236427320497e-05, + "loss": 1.0597, + "num_input_tokens_seen": 167549280, + "step": 10413 + }, + { + "epoch": 0.729482431024368, + "grad_norm": 4.336117267608643, + "learning_rate": 2.7120238178633972e-05, + "loss": 1.0865, + "num_input_tokens_seen": 167565664, + "step": 10414 + }, + { + "epoch": 0.7295524792700973, + "grad_norm": 4.7359232902526855, + "learning_rate": 2.711323992994746e-05, + "loss": 0.9568, + "num_input_tokens_seen": 167581528, + "step": 10415 + }, + { + "epoch": 0.7296225275158266, + "grad_norm": 4.192590713500977, + "learning_rate": 2.7106241681260953e-05, + "loss": 1.0085, + "num_input_tokens_seen": 167597552, + "step": 10416 + }, + { + "epoch": 0.7296925757615558, + "grad_norm": 3.452561616897583, + "learning_rate": 2.7099243432574428e-05, + "loss": 0.9331, + "num_input_tokens_seen": 167613864, + "step": 10417 + }, + { + "epoch": 0.729762624007285, + "grad_norm": 3.6941444873809814, + "learning_rate": 2.709224518388792e-05, + "loss": 0.9944, + "num_input_tokens_seen": 167630248, + "step": 10418 + }, + { + "epoch": 0.7298326722530143, + "grad_norm": 4.149344444274902, + "learning_rate": 2.7085246935201398e-05, + "loss": 1.1534, + "num_input_tokens_seen": 167646632, + "step": 10419 + }, + { + "epoch": 0.7299027204987435, + "grad_norm": 3.694056510925293, + "learning_rate": 2.7078248686514883e-05, + "loss": 0.9395, + "num_input_tokens_seen": 167663016, + "step": 10420 + }, + { + "epoch": 0.7299727687444728, + "grad_norm": 4.3449788093566895, + "learning_rate": 2.7071250437828365e-05, + "loss": 0.7934, + "num_input_tokens_seen": 167679400, + "step": 10421 + }, + { + "epoch": 0.730042816990202, + "grad_norm": 3.6112940311431885, + "learning_rate": 2.7064252189141854e-05, + "loss": 0.9397, + "num_input_tokens_seen": 167695288, + "step": 10422 + }, + { + "epoch": 0.7301128652359312, + "grad_norm": 3.5492844581604004, + "learning_rate": 2.7057253940455346e-05, + "loss": 1.1144, + "num_input_tokens_seen": 167711672, + "step": 10423 + }, + { + "epoch": 0.7301829134816605, + "grad_norm": 3.9729115962982178, + "learning_rate": 2.705025569176882e-05, + "loss": 0.9258, + "num_input_tokens_seen": 167728056, + "step": 10424 + }, + { + "epoch": 0.7302529617273897, + "grad_norm": 3.777050018310547, + "learning_rate": 2.7043257443082316e-05, + "loss": 0.997, + "num_input_tokens_seen": 167744440, + "step": 10425 + }, + { + "epoch": 0.7303230099731189, + "grad_norm": 5.670863151550293, + "learning_rate": 2.703625919439579e-05, + "loss": 0.9291, + "num_input_tokens_seen": 167760408, + "step": 10426 + }, + { + "epoch": 0.7303930582188483, + "grad_norm": 4.042688369750977, + "learning_rate": 2.702926094570928e-05, + "loss": 0.9844, + "num_input_tokens_seen": 167776792, + "step": 10427 + }, + { + "epoch": 0.7304631064645775, + "grad_norm": 4.464125633239746, + "learning_rate": 2.7022262697022772e-05, + "loss": 1.0739, + "num_input_tokens_seen": 167792136, + "step": 10428 + }, + { + "epoch": 0.7305331547103068, + "grad_norm": 4.0579023361206055, + "learning_rate": 2.7015264448336254e-05, + "loss": 0.9619, + "num_input_tokens_seen": 167808296, + "step": 10429 + }, + { + "epoch": 0.730603202956036, + "grad_norm": 3.825371026992798, + "learning_rate": 2.700826619964974e-05, + "loss": 1.1639, + "num_input_tokens_seen": 167824680, + "step": 10430 + }, + { + "epoch": 0.7306732512017652, + "grad_norm": 4.315201282501221, + "learning_rate": 2.7001267950963217e-05, + "loss": 0.8776, + "num_input_tokens_seen": 167841064, + "step": 10431 + }, + { + "epoch": 0.7307432994474945, + "grad_norm": 4.290385723114014, + "learning_rate": 2.6994269702276702e-05, + "loss": 0.8949, + "num_input_tokens_seen": 167857248, + "step": 10432 + }, + { + "epoch": 0.7308133476932237, + "grad_norm": 4.67999267578125, + "learning_rate": 2.6987271453590194e-05, + "loss": 0.8328, + "num_input_tokens_seen": 167873592, + "step": 10433 + }, + { + "epoch": 0.7308833959389529, + "grad_norm": 3.963594675064087, + "learning_rate": 2.6980273204903676e-05, + "loss": 0.9998, + "num_input_tokens_seen": 167889976, + "step": 10434 + }, + { + "epoch": 0.7309534441846822, + "grad_norm": 3.8906006813049316, + "learning_rate": 2.6973274956217165e-05, + "loss": 1.0557, + "num_input_tokens_seen": 167906360, + "step": 10435 + }, + { + "epoch": 0.7310234924304114, + "grad_norm": 5.195767879486084, + "learning_rate": 2.696627670753064e-05, + "loss": 1.2075, + "num_input_tokens_seen": 167922744, + "step": 10436 + }, + { + "epoch": 0.7310935406761407, + "grad_norm": 3.912337064743042, + "learning_rate": 2.6959278458844135e-05, + "loss": 1.2091, + "num_input_tokens_seen": 167939128, + "step": 10437 + }, + { + "epoch": 0.73116358892187, + "grad_norm": 4.685909748077393, + "learning_rate": 2.695228021015762e-05, + "loss": 1.2258, + "num_input_tokens_seen": 167955328, + "step": 10438 + }, + { + "epoch": 0.7312336371675991, + "grad_norm": 4.122082710266113, + "learning_rate": 2.6945281961471102e-05, + "loss": 1.1744, + "num_input_tokens_seen": 167971712, + "step": 10439 + }, + { + "epoch": 0.7313036854133285, + "grad_norm": 3.7890889644622803, + "learning_rate": 2.693828371278459e-05, + "loss": 1.0068, + "num_input_tokens_seen": 167988056, + "step": 10440 + }, + { + "epoch": 0.7313737336590577, + "grad_norm": 4.497992992401123, + "learning_rate": 2.6931285464098073e-05, + "loss": 1.0587, + "num_input_tokens_seen": 168004048, + "step": 10441 + }, + { + "epoch": 0.7314437819047869, + "grad_norm": 4.838020324707031, + "learning_rate": 2.6924287215411558e-05, + "loss": 1.1897, + "num_input_tokens_seen": 168019200, + "step": 10442 + }, + { + "epoch": 0.7315138301505162, + "grad_norm": 4.309928894042969, + "learning_rate": 2.6917288966725053e-05, + "loss": 1.0791, + "num_input_tokens_seen": 168035480, + "step": 10443 + }, + { + "epoch": 0.7315838783962454, + "grad_norm": 3.6449790000915527, + "learning_rate": 2.6910290718038528e-05, + "loss": 0.918, + "num_input_tokens_seen": 168051840, + "step": 10444 + }, + { + "epoch": 0.7316539266419747, + "grad_norm": 3.5886402130126953, + "learning_rate": 2.6903292469352013e-05, + "loss": 0.9919, + "num_input_tokens_seen": 168068224, + "step": 10445 + }, + { + "epoch": 0.7317239748877039, + "grad_norm": 4.587193965911865, + "learning_rate": 2.6896294220665495e-05, + "loss": 1.0713, + "num_input_tokens_seen": 168082352, + "step": 10446 + }, + { + "epoch": 0.7317940231334331, + "grad_norm": 3.7961313724517822, + "learning_rate": 2.6889295971978984e-05, + "loss": 1.0661, + "num_input_tokens_seen": 168098576, + "step": 10447 + }, + { + "epoch": 0.7318640713791624, + "grad_norm": 3.649651527404785, + "learning_rate": 2.688229772329246e-05, + "loss": 0.8768, + "num_input_tokens_seen": 168114704, + "step": 10448 + }, + { + "epoch": 0.7319341196248916, + "grad_norm": 3.8223345279693604, + "learning_rate": 2.6875299474605954e-05, + "loss": 1.0781, + "num_input_tokens_seen": 168131088, + "step": 10449 + }, + { + "epoch": 0.7320041678706208, + "grad_norm": 4.501837253570557, + "learning_rate": 2.6868301225919446e-05, + "loss": 1.0512, + "num_input_tokens_seen": 168147472, + "step": 10450 + }, + { + "epoch": 0.7320742161163502, + "grad_norm": 3.5899078845977783, + "learning_rate": 2.686130297723292e-05, + "loss": 0.9448, + "num_input_tokens_seen": 168163760, + "step": 10451 + }, + { + "epoch": 0.7321442643620794, + "grad_norm": 4.342123985290527, + "learning_rate": 2.685430472854641e-05, + "loss": 1.0678, + "num_input_tokens_seen": 168180144, + "step": 10452 + }, + { + "epoch": 0.7322143126078087, + "grad_norm": 4.743003845214844, + "learning_rate": 2.684730647985989e-05, + "loss": 0.987, + "num_input_tokens_seen": 168196528, + "step": 10453 + }, + { + "epoch": 0.7322843608535379, + "grad_norm": 3.6461548805236816, + "learning_rate": 2.6840308231173377e-05, + "loss": 1.055, + "num_input_tokens_seen": 168212912, + "step": 10454 + }, + { + "epoch": 0.7323544090992671, + "grad_norm": 4.148495197296143, + "learning_rate": 2.6833309982486872e-05, + "loss": 1.3233, + "num_input_tokens_seen": 168228624, + "step": 10455 + }, + { + "epoch": 0.7324244573449964, + "grad_norm": 4.9329609870910645, + "learning_rate": 2.6826311733800347e-05, + "loss": 1.1733, + "num_input_tokens_seen": 168245008, + "step": 10456 + }, + { + "epoch": 0.7324945055907256, + "grad_norm": 4.5730881690979, + "learning_rate": 2.6819313485113832e-05, + "loss": 0.9909, + "num_input_tokens_seen": 168261128, + "step": 10457 + }, + { + "epoch": 0.7325645538364549, + "grad_norm": 3.742635488510132, + "learning_rate": 2.6812315236427314e-05, + "loss": 0.9072, + "num_input_tokens_seen": 168277328, + "step": 10458 + }, + { + "epoch": 0.7326346020821841, + "grad_norm": 7.511114597320557, + "learning_rate": 2.680531698774081e-05, + "loss": 1.2551, + "num_input_tokens_seen": 168292808, + "step": 10459 + }, + { + "epoch": 0.7327046503279133, + "grad_norm": 4.715389251708984, + "learning_rate": 2.6798318739054295e-05, + "loss": 0.9265, + "num_input_tokens_seen": 168309192, + "step": 10460 + }, + { + "epoch": 0.7327746985736426, + "grad_norm": 3.19683837890625, + "learning_rate": 2.6791320490367773e-05, + "loss": 0.921, + "num_input_tokens_seen": 168325576, + "step": 10461 + }, + { + "epoch": 0.7328447468193718, + "grad_norm": 4.128951549530029, + "learning_rate": 2.6784322241681265e-05, + "loss": 0.7789, + "num_input_tokens_seen": 168341192, + "step": 10462 + }, + { + "epoch": 0.732914795065101, + "grad_norm": 4.0677080154418945, + "learning_rate": 2.677732399299474e-05, + "loss": 0.9569, + "num_input_tokens_seen": 168357576, + "step": 10463 + }, + { + "epoch": 0.7329848433108304, + "grad_norm": 4.675279140472412, + "learning_rate": 2.6770325744308232e-05, + "loss": 1.2201, + "num_input_tokens_seen": 168373960, + "step": 10464 + }, + { + "epoch": 0.7330548915565596, + "grad_norm": 3.493903875350952, + "learning_rate": 2.676332749562172e-05, + "loss": 1.033, + "num_input_tokens_seen": 168390344, + "step": 10465 + }, + { + "epoch": 0.7331249398022889, + "grad_norm": 4.765762805938721, + "learning_rate": 2.6756329246935196e-05, + "loss": 1.0877, + "num_input_tokens_seen": 168406704, + "step": 10466 + }, + { + "epoch": 0.7331949880480181, + "grad_norm": 4.0836286544799805, + "learning_rate": 2.6749330998248688e-05, + "loss": 1.1247, + "num_input_tokens_seen": 168422768, + "step": 10467 + }, + { + "epoch": 0.7332650362937473, + "grad_norm": 3.715101718902588, + "learning_rate": 2.6742332749562166e-05, + "loss": 0.9895, + "num_input_tokens_seen": 168439152, + "step": 10468 + }, + { + "epoch": 0.7333350845394766, + "grad_norm": 4.004080772399902, + "learning_rate": 2.6735334500875658e-05, + "loss": 1.2341, + "num_input_tokens_seen": 168455536, + "step": 10469 + }, + { + "epoch": 0.7334051327852058, + "grad_norm": 3.3160030841827393, + "learning_rate": 2.6728336252189147e-05, + "loss": 1.0855, + "num_input_tokens_seen": 168471920, + "step": 10470 + }, + { + "epoch": 0.733475181030935, + "grad_norm": 3.8586158752441406, + "learning_rate": 2.672133800350263e-05, + "loss": 0.9817, + "num_input_tokens_seen": 168487472, + "step": 10471 + }, + { + "epoch": 0.7335452292766643, + "grad_norm": 4.187497615814209, + "learning_rate": 2.6714339754816114e-05, + "loss": 1.1406, + "num_input_tokens_seen": 168503856, + "step": 10472 + }, + { + "epoch": 0.7336152775223935, + "grad_norm": 6.3048295974731445, + "learning_rate": 2.6707341506129592e-05, + "loss": 1.1001, + "num_input_tokens_seen": 168520240, + "step": 10473 + }, + { + "epoch": 0.7336853257681228, + "grad_norm": 5.55105447769165, + "learning_rate": 2.6700343257443084e-05, + "loss": 0.8683, + "num_input_tokens_seen": 168535768, + "step": 10474 + }, + { + "epoch": 0.733755374013852, + "grad_norm": 3.6796741485595703, + "learning_rate": 2.6693345008756566e-05, + "loss": 1.187, + "num_input_tokens_seen": 168551976, + "step": 10475 + }, + { + "epoch": 0.7338254222595813, + "grad_norm": 3.8010668754577637, + "learning_rate": 2.668634676007005e-05, + "loss": 0.9265, + "num_input_tokens_seen": 168567800, + "step": 10476 + }, + { + "epoch": 0.7338954705053106, + "grad_norm": 3.4310591220855713, + "learning_rate": 2.667934851138354e-05, + "loss": 0.9819, + "num_input_tokens_seen": 168584168, + "step": 10477 + }, + { + "epoch": 0.7339655187510398, + "grad_norm": 3.798675775527954, + "learning_rate": 2.6672350262697015e-05, + "loss": 0.9576, + "num_input_tokens_seen": 168600552, + "step": 10478 + }, + { + "epoch": 0.734035566996769, + "grad_norm": 4.0093092918396, + "learning_rate": 2.6665352014010507e-05, + "loss": 0.9996, + "num_input_tokens_seen": 168616936, + "step": 10479 + }, + { + "epoch": 0.7341056152424983, + "grad_norm": 4.396603107452393, + "learning_rate": 2.665835376532399e-05, + "loss": 1.0612, + "num_input_tokens_seen": 168633320, + "step": 10480 + }, + { + "epoch": 0.7341756634882275, + "grad_norm": 3.693871021270752, + "learning_rate": 2.6651355516637477e-05, + "loss": 1.0184, + "num_input_tokens_seen": 168649704, + "step": 10481 + }, + { + "epoch": 0.7342457117339568, + "grad_norm": 4.1276092529296875, + "learning_rate": 2.6644357267950966e-05, + "loss": 1.0159, + "num_input_tokens_seen": 168665576, + "step": 10482 + }, + { + "epoch": 0.734315759979686, + "grad_norm": 6.173518180847168, + "learning_rate": 2.6637359019264448e-05, + "loss": 1.0765, + "num_input_tokens_seen": 168681568, + "step": 10483 + }, + { + "epoch": 0.7343858082254152, + "grad_norm": 3.5717406272888184, + "learning_rate": 2.663036077057794e-05, + "loss": 0.9033, + "num_input_tokens_seen": 168697016, + "step": 10484 + }, + { + "epoch": 0.7344558564711445, + "grad_norm": 3.7915477752685547, + "learning_rate": 2.6623362521891415e-05, + "loss": 0.813, + "num_input_tokens_seen": 168713064, + "step": 10485 + }, + { + "epoch": 0.7345259047168737, + "grad_norm": 4.762414932250977, + "learning_rate": 2.6616364273204903e-05, + "loss": 1.0752, + "num_input_tokens_seen": 168729448, + "step": 10486 + }, + { + "epoch": 0.7345959529626029, + "grad_norm": 3.334425687789917, + "learning_rate": 2.660936602451839e-05, + "loss": 0.9167, + "num_input_tokens_seen": 168745832, + "step": 10487 + }, + { + "epoch": 0.7346660012083323, + "grad_norm": 4.164660930633545, + "learning_rate": 2.660236777583187e-05, + "loss": 1.0758, + "num_input_tokens_seen": 168761192, + "step": 10488 + }, + { + "epoch": 0.7347360494540615, + "grad_norm": 3.635967254638672, + "learning_rate": 2.6595369527145362e-05, + "loss": 1.0774, + "num_input_tokens_seen": 168776896, + "step": 10489 + }, + { + "epoch": 0.7348060976997908, + "grad_norm": 4.33154821395874, + "learning_rate": 2.658837127845884e-05, + "loss": 1.2035, + "num_input_tokens_seen": 168793216, + "step": 10490 + }, + { + "epoch": 0.73487614594552, + "grad_norm": 5.088499069213867, + "learning_rate": 2.6581373029772326e-05, + "loss": 0.9548, + "num_input_tokens_seen": 168809600, + "step": 10491 + }, + { + "epoch": 0.7349461941912492, + "grad_norm": 4.5256218910217285, + "learning_rate": 2.657437478108582e-05, + "loss": 0.9798, + "num_input_tokens_seen": 168825984, + "step": 10492 + }, + { + "epoch": 0.7350162424369785, + "grad_norm": 4.051859378814697, + "learning_rate": 2.6567376532399296e-05, + "loss": 1.084, + "num_input_tokens_seen": 168842368, + "step": 10493 + }, + { + "epoch": 0.7350862906827077, + "grad_norm": 3.9187848567962646, + "learning_rate": 2.656037828371279e-05, + "loss": 1.0996, + "num_input_tokens_seen": 168858752, + "step": 10494 + }, + { + "epoch": 0.735156338928437, + "grad_norm": 4.167224884033203, + "learning_rate": 2.6553380035026267e-05, + "loss": 0.9906, + "num_input_tokens_seen": 168874744, + "step": 10495 + }, + { + "epoch": 0.7352263871741662, + "grad_norm": 4.158847332000732, + "learning_rate": 2.654638178633976e-05, + "loss": 0.9981, + "num_input_tokens_seen": 168891128, + "step": 10496 + }, + { + "epoch": 0.7352964354198954, + "grad_norm": 3.7300357818603516, + "learning_rate": 2.6539383537653244e-05, + "loss": 1.0339, + "num_input_tokens_seen": 168907512, + "step": 10497 + }, + { + "epoch": 0.7353664836656247, + "grad_norm": 3.5747900009155273, + "learning_rate": 2.6532385288966722e-05, + "loss": 1.0778, + "num_input_tokens_seen": 168923840, + "step": 10498 + }, + { + "epoch": 0.7354365319113539, + "grad_norm": 4.034585475921631, + "learning_rate": 2.6525387040280214e-05, + "loss": 0.9481, + "num_input_tokens_seen": 168939024, + "step": 10499 + }, + { + "epoch": 0.7355065801570831, + "grad_norm": 3.4616448879241943, + "learning_rate": 2.651838879159369e-05, + "loss": 0.9596, + "num_input_tokens_seen": 168954784, + "step": 10500 + }, + { + "epoch": 0.7355766284028125, + "grad_norm": 3.660223960876465, + "learning_rate": 2.651139054290718e-05, + "loss": 1.1675, + "num_input_tokens_seen": 168970792, + "step": 10501 + }, + { + "epoch": 0.7356466766485417, + "grad_norm": 4.153808116912842, + "learning_rate": 2.650439229422066e-05, + "loss": 1.3198, + "num_input_tokens_seen": 168987176, + "step": 10502 + }, + { + "epoch": 0.735716724894271, + "grad_norm": 3.6979668140411377, + "learning_rate": 2.6497394045534145e-05, + "loss": 1.0639, + "num_input_tokens_seen": 169002920, + "step": 10503 + }, + { + "epoch": 0.7357867731400002, + "grad_norm": 5.158874988555908, + "learning_rate": 2.649039579684764e-05, + "loss": 0.8628, + "num_input_tokens_seen": 169019176, + "step": 10504 + }, + { + "epoch": 0.7358568213857294, + "grad_norm": 3.8838446140289307, + "learning_rate": 2.6483397548161122e-05, + "loss": 1.1046, + "num_input_tokens_seen": 169034944, + "step": 10505 + }, + { + "epoch": 0.7359268696314587, + "grad_norm": 3.431797504425049, + "learning_rate": 2.6476399299474607e-05, + "loss": 0.9229, + "num_input_tokens_seen": 169051200, + "step": 10506 + }, + { + "epoch": 0.7359969178771879, + "grad_norm": 3.956989288330078, + "learning_rate": 2.6469401050788082e-05, + "loss": 0.869, + "num_input_tokens_seen": 169066856, + "step": 10507 + }, + { + "epoch": 0.7360669661229171, + "grad_norm": 4.84227991104126, + "learning_rate": 2.6462402802101578e-05, + "loss": 1.1559, + "num_input_tokens_seen": 169083240, + "step": 10508 + }, + { + "epoch": 0.7361370143686464, + "grad_norm": 3.379727363586426, + "learning_rate": 2.6455404553415063e-05, + "loss": 0.9674, + "num_input_tokens_seen": 169099392, + "step": 10509 + }, + { + "epoch": 0.7362070626143756, + "grad_norm": 5.2606425285339355, + "learning_rate": 2.6448406304728545e-05, + "loss": 0.9467, + "num_input_tokens_seen": 169114776, + "step": 10510 + }, + { + "epoch": 0.7362771108601049, + "grad_norm": 3.6076810359954834, + "learning_rate": 2.6441408056042033e-05, + "loss": 1.258, + "num_input_tokens_seen": 169130936, + "step": 10511 + }, + { + "epoch": 0.7363471591058341, + "grad_norm": 5.483637809753418, + "learning_rate": 2.643440980735551e-05, + "loss": 1.1859, + "num_input_tokens_seen": 169147320, + "step": 10512 + }, + { + "epoch": 0.7364172073515634, + "grad_norm": 3.4674081802368164, + "learning_rate": 2.6427411558669e-05, + "loss": 1.1012, + "num_input_tokens_seen": 169163704, + "step": 10513 + }, + { + "epoch": 0.7364872555972927, + "grad_norm": 3.9137392044067383, + "learning_rate": 2.6420413309982496e-05, + "loss": 1.0542, + "num_input_tokens_seen": 169180088, + "step": 10514 + }, + { + "epoch": 0.7365573038430219, + "grad_norm": 5.275045871734619, + "learning_rate": 2.641341506129597e-05, + "loss": 0.9421, + "num_input_tokens_seen": 169196472, + "step": 10515 + }, + { + "epoch": 0.7366273520887511, + "grad_norm": 3.5561769008636475, + "learning_rate": 2.640641681260946e-05, + "loss": 0.937, + "num_input_tokens_seen": 169212720, + "step": 10516 + }, + { + "epoch": 0.7366974003344804, + "grad_norm": 4.008266448974609, + "learning_rate": 2.639941856392294e-05, + "loss": 0.9478, + "num_input_tokens_seen": 169229104, + "step": 10517 + }, + { + "epoch": 0.7367674485802096, + "grad_norm": 3.5651564598083496, + "learning_rate": 2.6392420315236426e-05, + "loss": 1.1196, + "num_input_tokens_seen": 169245488, + "step": 10518 + }, + { + "epoch": 0.7368374968259389, + "grad_norm": 4.301855087280273, + "learning_rate": 2.638542206654992e-05, + "loss": 1.2025, + "num_input_tokens_seen": 169261768, + "step": 10519 + }, + { + "epoch": 0.7369075450716681, + "grad_norm": 4.726162433624268, + "learning_rate": 2.6378423817863397e-05, + "loss": 1.0374, + "num_input_tokens_seen": 169278152, + "step": 10520 + }, + { + "epoch": 0.7369775933173973, + "grad_norm": 3.7057480812072754, + "learning_rate": 2.6371425569176882e-05, + "loss": 1.0371, + "num_input_tokens_seen": 169294536, + "step": 10521 + }, + { + "epoch": 0.7370476415631266, + "grad_norm": 3.4197916984558105, + "learning_rate": 2.6364427320490364e-05, + "loss": 0.9889, + "num_input_tokens_seen": 169310848, + "step": 10522 + }, + { + "epoch": 0.7371176898088558, + "grad_norm": 4.718552112579346, + "learning_rate": 2.6357429071803852e-05, + "loss": 1.2608, + "num_input_tokens_seen": 169327232, + "step": 10523 + }, + { + "epoch": 0.737187738054585, + "grad_norm": 3.8711113929748535, + "learning_rate": 2.6350430823117344e-05, + "loss": 1.0354, + "num_input_tokens_seen": 169343616, + "step": 10524 + }, + { + "epoch": 0.7372577863003144, + "grad_norm": 4.5464396476745605, + "learning_rate": 2.634343257443082e-05, + "loss": 1.1143, + "num_input_tokens_seen": 169360000, + "step": 10525 + }, + { + "epoch": 0.7373278345460436, + "grad_norm": 6.192793369293213, + "learning_rate": 2.6336434325744315e-05, + "loss": 1.2314, + "num_input_tokens_seen": 169376008, + "step": 10526 + }, + { + "epoch": 0.7373978827917729, + "grad_norm": 5.170640468597412, + "learning_rate": 2.632943607705779e-05, + "loss": 1.0557, + "num_input_tokens_seen": 169390816, + "step": 10527 + }, + { + "epoch": 0.7374679310375021, + "grad_norm": 3.7066495418548584, + "learning_rate": 2.632243782837128e-05, + "loss": 0.8756, + "num_input_tokens_seen": 169407200, + "step": 10528 + }, + { + "epoch": 0.7375379792832313, + "grad_norm": 3.887248992919922, + "learning_rate": 2.631543957968476e-05, + "loss": 1.2238, + "num_input_tokens_seen": 169423584, + "step": 10529 + }, + { + "epoch": 0.7376080275289606, + "grad_norm": 3.9181246757507324, + "learning_rate": 2.6308441330998252e-05, + "loss": 1.0416, + "num_input_tokens_seen": 169439528, + "step": 10530 + }, + { + "epoch": 0.7376780757746898, + "grad_norm": 5.559629440307617, + "learning_rate": 2.6301443082311737e-05, + "loss": 1.0438, + "num_input_tokens_seen": 169454808, + "step": 10531 + }, + { + "epoch": 0.7377481240204191, + "grad_norm": 3.934699535369873, + "learning_rate": 2.6294444833625216e-05, + "loss": 1.129, + "num_input_tokens_seen": 169470432, + "step": 10532 + }, + { + "epoch": 0.7378181722661483, + "grad_norm": 4.632133960723877, + "learning_rate": 2.62874465849387e-05, + "loss": 1.233, + "num_input_tokens_seen": 169486816, + "step": 10533 + }, + { + "epoch": 0.7378882205118775, + "grad_norm": 5.4470438957214355, + "learning_rate": 2.6280448336252183e-05, + "loss": 0.9313, + "num_input_tokens_seen": 169501088, + "step": 10534 + }, + { + "epoch": 0.7379582687576068, + "grad_norm": 5.563885688781738, + "learning_rate": 2.6273450087565675e-05, + "loss": 1.0469, + "num_input_tokens_seen": 169517384, + "step": 10535 + }, + { + "epoch": 0.738028317003336, + "grad_norm": 3.4310665130615234, + "learning_rate": 2.6266451838879163e-05, + "loss": 0.8574, + "num_input_tokens_seen": 169533400, + "step": 10536 + }, + { + "epoch": 0.7380983652490652, + "grad_norm": 4.377484321594238, + "learning_rate": 2.625945359019264e-05, + "loss": 1.1315, + "num_input_tokens_seen": 169549728, + "step": 10537 + }, + { + "epoch": 0.7381684134947946, + "grad_norm": 6.437469959259033, + "learning_rate": 2.6252455341506134e-05, + "loss": 1.1526, + "num_input_tokens_seen": 169565536, + "step": 10538 + }, + { + "epoch": 0.7382384617405238, + "grad_norm": 7.9621052742004395, + "learning_rate": 2.624545709281961e-05, + "loss": 1.162, + "num_input_tokens_seen": 169581560, + "step": 10539 + }, + { + "epoch": 0.7383085099862531, + "grad_norm": 4.019838809967041, + "learning_rate": 2.62384588441331e-05, + "loss": 1.1473, + "num_input_tokens_seen": 169597584, + "step": 10540 + }, + { + "epoch": 0.7383785582319823, + "grad_norm": 3.6662468910217285, + "learning_rate": 2.623146059544659e-05, + "loss": 0.93, + "num_input_tokens_seen": 169613968, + "step": 10541 + }, + { + "epoch": 0.7384486064777115, + "grad_norm": 5.373934745788574, + "learning_rate": 2.622446234676007e-05, + "loss": 0.9703, + "num_input_tokens_seen": 169630016, + "step": 10542 + }, + { + "epoch": 0.7385186547234408, + "grad_norm": 3.7633867263793945, + "learning_rate": 2.6217464098073556e-05, + "loss": 0.994, + "num_input_tokens_seen": 169645096, + "step": 10543 + }, + { + "epoch": 0.73858870296917, + "grad_norm": 3.6099295616149902, + "learning_rate": 2.6210465849387035e-05, + "loss": 1.0996, + "num_input_tokens_seen": 169661480, + "step": 10544 + }, + { + "epoch": 0.7386587512148992, + "grad_norm": 5.264440536499023, + "learning_rate": 2.6203467600700527e-05, + "loss": 1.1081, + "num_input_tokens_seen": 169677864, + "step": 10545 + }, + { + "epoch": 0.7387287994606285, + "grad_norm": 4.800827980041504, + "learning_rate": 2.6196469352014012e-05, + "loss": 0.9133, + "num_input_tokens_seen": 169692464, + "step": 10546 + }, + { + "epoch": 0.7387988477063577, + "grad_norm": 5.129746913909912, + "learning_rate": 2.6189471103327494e-05, + "loss": 1.2092, + "num_input_tokens_seen": 169708848, + "step": 10547 + }, + { + "epoch": 0.738868895952087, + "grad_norm": 7.417953014373779, + "learning_rate": 2.6182472854640982e-05, + "loss": 1.0533, + "num_input_tokens_seen": 169723528, + "step": 10548 + }, + { + "epoch": 0.7389389441978162, + "grad_norm": 4.942826271057129, + "learning_rate": 2.6175474605954458e-05, + "loss": 1.0438, + "num_input_tokens_seen": 169739232, + "step": 10549 + }, + { + "epoch": 0.7390089924435455, + "grad_norm": 3.784309148788452, + "learning_rate": 2.6168476357267953e-05, + "loss": 0.9626, + "num_input_tokens_seen": 169754944, + "step": 10550 + }, + { + "epoch": 0.7390790406892748, + "grad_norm": 6.448162078857422, + "learning_rate": 2.6161478108581445e-05, + "loss": 1.0245, + "num_input_tokens_seen": 169770832, + "step": 10551 + }, + { + "epoch": 0.739149088935004, + "grad_norm": 3.8399481773376465, + "learning_rate": 2.615447985989492e-05, + "loss": 0.9778, + "num_input_tokens_seen": 169787216, + "step": 10552 + }, + { + "epoch": 0.7392191371807332, + "grad_norm": 3.907796621322632, + "learning_rate": 2.614748161120841e-05, + "loss": 0.9861, + "num_input_tokens_seen": 169803352, + "step": 10553 + }, + { + "epoch": 0.7392891854264625, + "grad_norm": 5.517360687255859, + "learning_rate": 2.614048336252189e-05, + "loss": 1.0895, + "num_input_tokens_seen": 169819736, + "step": 10554 + }, + { + "epoch": 0.7393592336721917, + "grad_norm": 5.287674427032471, + "learning_rate": 2.6133485113835376e-05, + "loss": 1.07, + "num_input_tokens_seen": 169835168, + "step": 10555 + }, + { + "epoch": 0.739429281917921, + "grad_norm": 6.481996536254883, + "learning_rate": 2.6126486865148857e-05, + "loss": 1.1045, + "num_input_tokens_seen": 169850664, + "step": 10556 + }, + { + "epoch": 0.7394993301636502, + "grad_norm": 4.7750749588012695, + "learning_rate": 2.6119488616462346e-05, + "loss": 1.1844, + "num_input_tokens_seen": 169865968, + "step": 10557 + }, + { + "epoch": 0.7395693784093794, + "grad_norm": 4.263725757598877, + "learning_rate": 2.611249036777583e-05, + "loss": 1.2777, + "num_input_tokens_seen": 169882128, + "step": 10558 + }, + { + "epoch": 0.7396394266551087, + "grad_norm": 4.387961387634277, + "learning_rate": 2.6105492119089313e-05, + "loss": 0.9462, + "num_input_tokens_seen": 169897504, + "step": 10559 + }, + { + "epoch": 0.7397094749008379, + "grad_norm": 4.31755256652832, + "learning_rate": 2.6098493870402808e-05, + "loss": 0.9729, + "num_input_tokens_seen": 169913888, + "step": 10560 + }, + { + "epoch": 0.7397795231465673, + "grad_norm": 3.586451768875122, + "learning_rate": 2.6091495621716283e-05, + "loss": 0.993, + "num_input_tokens_seen": 169930272, + "step": 10561 + }, + { + "epoch": 0.7398495713922965, + "grad_norm": 3.8307273387908936, + "learning_rate": 2.6084497373029772e-05, + "loss": 1.0185, + "num_input_tokens_seen": 169946656, + "step": 10562 + }, + { + "epoch": 0.7399196196380257, + "grad_norm": 4.19478702545166, + "learning_rate": 2.6077499124343264e-05, + "loss": 1.041, + "num_input_tokens_seen": 169962136, + "step": 10563 + }, + { + "epoch": 0.739989667883755, + "grad_norm": 4.226399898529053, + "learning_rate": 2.607050087565674e-05, + "loss": 1.2628, + "num_input_tokens_seen": 169978520, + "step": 10564 + }, + { + "epoch": 0.7400597161294842, + "grad_norm": 4.149684906005859, + "learning_rate": 2.606350262697023e-05, + "loss": 0.954, + "num_input_tokens_seen": 169994904, + "step": 10565 + }, + { + "epoch": 0.7401297643752134, + "grad_norm": 4.815145015716553, + "learning_rate": 2.605650437828371e-05, + "loss": 1.2183, + "num_input_tokens_seen": 170011064, + "step": 10566 + }, + { + "epoch": 0.7401998126209427, + "grad_norm": 3.9827334880828857, + "learning_rate": 2.6049506129597195e-05, + "loss": 1.186, + "num_input_tokens_seen": 170027448, + "step": 10567 + }, + { + "epoch": 0.7402698608666719, + "grad_norm": 3.891735076904297, + "learning_rate": 2.6042507880910687e-05, + "loss": 1.0315, + "num_input_tokens_seen": 170042472, + "step": 10568 + }, + { + "epoch": 0.7403399091124012, + "grad_norm": 4.232996940612793, + "learning_rate": 2.6035509632224165e-05, + "loss": 1.0434, + "num_input_tokens_seen": 170058856, + "step": 10569 + }, + { + "epoch": 0.7404099573581304, + "grad_norm": 3.8315019607543945, + "learning_rate": 2.6028511383537657e-05, + "loss": 1.0733, + "num_input_tokens_seen": 170074296, + "step": 10570 + }, + { + "epoch": 0.7404800056038596, + "grad_norm": 4.026500225067139, + "learning_rate": 2.6021513134851132e-05, + "loss": 1.0227, + "num_input_tokens_seen": 170088760, + "step": 10571 + }, + { + "epoch": 0.7405500538495889, + "grad_norm": 4.838707447052002, + "learning_rate": 2.6014514886164627e-05, + "loss": 1.0781, + "num_input_tokens_seen": 170105024, + "step": 10572 + }, + { + "epoch": 0.7406201020953181, + "grad_norm": 4.13600492477417, + "learning_rate": 2.6007516637478113e-05, + "loss": 1.1114, + "num_input_tokens_seen": 170120800, + "step": 10573 + }, + { + "epoch": 0.7406901503410473, + "grad_norm": 3.9262802600860596, + "learning_rate": 2.600051838879159e-05, + "loss": 1.0231, + "num_input_tokens_seen": 170137184, + "step": 10574 + }, + { + "epoch": 0.7407601985867767, + "grad_norm": 3.807204246520996, + "learning_rate": 2.5993520140105083e-05, + "loss": 1.1283, + "num_input_tokens_seen": 170153568, + "step": 10575 + }, + { + "epoch": 0.7408302468325059, + "grad_norm": 5.639378547668457, + "learning_rate": 2.5986521891418565e-05, + "loss": 1.0768, + "num_input_tokens_seen": 170168960, + "step": 10576 + }, + { + "epoch": 0.7409002950782352, + "grad_norm": 6.665270805358887, + "learning_rate": 2.597952364273205e-05, + "loss": 1.0528, + "num_input_tokens_seen": 170185288, + "step": 10577 + }, + { + "epoch": 0.7409703433239644, + "grad_norm": 3.8074893951416016, + "learning_rate": 2.597252539404554e-05, + "loss": 1.0076, + "num_input_tokens_seen": 170201672, + "step": 10578 + }, + { + "epoch": 0.7410403915696936, + "grad_norm": 4.729522705078125, + "learning_rate": 2.5965527145359014e-05, + "loss": 0.8868, + "num_input_tokens_seen": 170217912, + "step": 10579 + }, + { + "epoch": 0.7411104398154229, + "grad_norm": 4.224606990814209, + "learning_rate": 2.5958528896672506e-05, + "loss": 1.2899, + "num_input_tokens_seen": 170234160, + "step": 10580 + }, + { + "epoch": 0.7411804880611521, + "grad_norm": 4.445167064666748, + "learning_rate": 2.5951530647985987e-05, + "loss": 1.0837, + "num_input_tokens_seen": 170249832, + "step": 10581 + }, + { + "epoch": 0.7412505363068813, + "grad_norm": 3.750044584274292, + "learning_rate": 2.5944532399299476e-05, + "loss": 1.0933, + "num_input_tokens_seen": 170265800, + "step": 10582 + }, + { + "epoch": 0.7413205845526106, + "grad_norm": 3.5876152515411377, + "learning_rate": 2.593753415061295e-05, + "loss": 0.922, + "num_input_tokens_seen": 170281648, + "step": 10583 + }, + { + "epoch": 0.7413906327983398, + "grad_norm": 3.395641565322876, + "learning_rate": 2.5930535901926446e-05, + "loss": 0.904, + "num_input_tokens_seen": 170297160, + "step": 10584 + }, + { + "epoch": 0.7414606810440691, + "grad_norm": 4.398489952087402, + "learning_rate": 2.592353765323994e-05, + "loss": 0.9261, + "num_input_tokens_seen": 170312448, + "step": 10585 + }, + { + "epoch": 0.7415307292897984, + "grad_norm": 5.254413604736328, + "learning_rate": 2.5916539404553413e-05, + "loss": 1.0327, + "num_input_tokens_seen": 170328832, + "step": 10586 + }, + { + "epoch": 0.7416007775355276, + "grad_norm": 3.7298836708068848, + "learning_rate": 2.5909541155866902e-05, + "loss": 1.0702, + "num_input_tokens_seen": 170345216, + "step": 10587 + }, + { + "epoch": 0.7416708257812569, + "grad_norm": 3.779209852218628, + "learning_rate": 2.5902542907180384e-05, + "loss": 0.9811, + "num_input_tokens_seen": 170361600, + "step": 10588 + }, + { + "epoch": 0.7417408740269861, + "grad_norm": 3.5138185024261475, + "learning_rate": 2.589554465849387e-05, + "loss": 0.9033, + "num_input_tokens_seen": 170377984, + "step": 10589 + }, + { + "epoch": 0.7418109222727153, + "grad_norm": 5.928292751312256, + "learning_rate": 2.5888546409807364e-05, + "loss": 0.9679, + "num_input_tokens_seen": 170394280, + "step": 10590 + }, + { + "epoch": 0.7418809705184446, + "grad_norm": 3.648385524749756, + "learning_rate": 2.588154816112084e-05, + "loss": 1.0114, + "num_input_tokens_seen": 170410664, + "step": 10591 + }, + { + "epoch": 0.7419510187641738, + "grad_norm": 3.9961655139923096, + "learning_rate": 2.5874549912434325e-05, + "loss": 0.8669, + "num_input_tokens_seen": 170427048, + "step": 10592 + }, + { + "epoch": 0.7420210670099031, + "grad_norm": 3.4764299392700195, + "learning_rate": 2.5867551663747806e-05, + "loss": 0.9029, + "num_input_tokens_seen": 170443432, + "step": 10593 + }, + { + "epoch": 0.7420911152556323, + "grad_norm": 3.867755889892578, + "learning_rate": 2.5860553415061295e-05, + "loss": 1.1045, + "num_input_tokens_seen": 170459512, + "step": 10594 + }, + { + "epoch": 0.7421611635013615, + "grad_norm": 3.87514066696167, + "learning_rate": 2.5853555166374787e-05, + "loss": 1.1876, + "num_input_tokens_seen": 170475608, + "step": 10595 + }, + { + "epoch": 0.7422312117470908, + "grad_norm": 3.9780101776123047, + "learning_rate": 2.5846556917688265e-05, + "loss": 1.0626, + "num_input_tokens_seen": 170491992, + "step": 10596 + }, + { + "epoch": 0.74230125999282, + "grad_norm": 4.703016757965088, + "learning_rate": 2.5839558669001757e-05, + "loss": 1.0437, + "num_input_tokens_seen": 170507400, + "step": 10597 + }, + { + "epoch": 0.7423713082385494, + "grad_norm": 4.587261199951172, + "learning_rate": 2.5832560420315232e-05, + "loss": 0.9361, + "num_input_tokens_seen": 170523512, + "step": 10598 + }, + { + "epoch": 0.7424413564842786, + "grad_norm": 3.7180306911468506, + "learning_rate": 2.582556217162872e-05, + "loss": 1.0559, + "num_input_tokens_seen": 170539896, + "step": 10599 + }, + { + "epoch": 0.7425114047300078, + "grad_norm": 3.655656099319458, + "learning_rate": 2.5818563922942213e-05, + "loss": 0.9954, + "num_input_tokens_seen": 170556280, + "step": 10600 + }, + { + "epoch": 0.7425114047300078, + "eval_loss": 1.11661958694458, + "eval_runtime": 0.1836, + "eval_samples_per_second": 5.446, + "eval_steps_per_second": 5.446, + "num_input_tokens_seen": 170556280, + "step": 10600 + }, + { + "epoch": 0.7425814529757371, + "grad_norm": 5.531803131103516, + "learning_rate": 2.5811565674255688e-05, + "loss": 1.0918, + "num_input_tokens_seen": 170571728, + "step": 10601 + }, + { + "epoch": 0.7426515012214663, + "grad_norm": 4.434628486633301, + "learning_rate": 2.580456742556918e-05, + "loss": 1.1212, + "num_input_tokens_seen": 170588112, + "step": 10602 + }, + { + "epoch": 0.7427215494671955, + "grad_norm": 3.665743827819824, + "learning_rate": 2.579756917688266e-05, + "loss": 1.0545, + "num_input_tokens_seen": 170604016, + "step": 10603 + }, + { + "epoch": 0.7427915977129248, + "grad_norm": 3.749884605407715, + "learning_rate": 2.5790570928196144e-05, + "loss": 1.0363, + "num_input_tokens_seen": 170620152, + "step": 10604 + }, + { + "epoch": 0.742861645958654, + "grad_norm": 4.121984004974365, + "learning_rate": 2.578357267950964e-05, + "loss": 1.2097, + "num_input_tokens_seen": 170636536, + "step": 10605 + }, + { + "epoch": 0.7429316942043833, + "grad_norm": 3.4826276302337646, + "learning_rate": 2.577657443082312e-05, + "loss": 1.1107, + "num_input_tokens_seen": 170652920, + "step": 10606 + }, + { + "epoch": 0.7430017424501125, + "grad_norm": 3.600797414779663, + "learning_rate": 2.5769576182136606e-05, + "loss": 1.1297, + "num_input_tokens_seen": 170669304, + "step": 10607 + }, + { + "epoch": 0.7430717906958417, + "grad_norm": 3.976576805114746, + "learning_rate": 2.576257793345008e-05, + "loss": 1.2266, + "num_input_tokens_seen": 170685528, + "step": 10608 + }, + { + "epoch": 0.743141838941571, + "grad_norm": 3.749668836593628, + "learning_rate": 2.5755579684763576e-05, + "loss": 1.0035, + "num_input_tokens_seen": 170701912, + "step": 10609 + }, + { + "epoch": 0.7432118871873002, + "grad_norm": 4.767414569854736, + "learning_rate": 2.574858143607705e-05, + "loss": 0.9334, + "num_input_tokens_seen": 170718296, + "step": 10610 + }, + { + "epoch": 0.7432819354330295, + "grad_norm": 4.388510704040527, + "learning_rate": 2.5741583187390543e-05, + "loss": 1.1993, + "num_input_tokens_seen": 170734152, + "step": 10611 + }, + { + "epoch": 0.7433519836787588, + "grad_norm": 3.796036720275879, + "learning_rate": 2.5734584938704032e-05, + "loss": 1.1017, + "num_input_tokens_seen": 170750536, + "step": 10612 + }, + { + "epoch": 0.743422031924488, + "grad_norm": 3.8187673091888428, + "learning_rate": 2.5727586690017507e-05, + "loss": 1.185, + "num_input_tokens_seen": 170766312, + "step": 10613 + }, + { + "epoch": 0.7434920801702173, + "grad_norm": 3.6440799236297607, + "learning_rate": 2.5720588441331e-05, + "loss": 0.8955, + "num_input_tokens_seen": 170782696, + "step": 10614 + }, + { + "epoch": 0.7435621284159465, + "grad_norm": 5.519860744476318, + "learning_rate": 2.5713590192644477e-05, + "loss": 1.16, + "num_input_tokens_seen": 170799080, + "step": 10615 + }, + { + "epoch": 0.7436321766616757, + "grad_norm": 4.185689449310303, + "learning_rate": 2.570659194395797e-05, + "loss": 1.0197, + "num_input_tokens_seen": 170815464, + "step": 10616 + }, + { + "epoch": 0.743702224907405, + "grad_norm": 3.762596607208252, + "learning_rate": 2.5699593695271458e-05, + "loss": 1.0832, + "num_input_tokens_seen": 170831376, + "step": 10617 + }, + { + "epoch": 0.7437722731531342, + "grad_norm": 4.324756622314453, + "learning_rate": 2.569259544658494e-05, + "loss": 1.0614, + "num_input_tokens_seen": 170847760, + "step": 10618 + }, + { + "epoch": 0.7438423213988634, + "grad_norm": 4.862603187561035, + "learning_rate": 2.5685597197898425e-05, + "loss": 1.0072, + "num_input_tokens_seen": 170863032, + "step": 10619 + }, + { + "epoch": 0.7439123696445927, + "grad_norm": 3.5700461864471436, + "learning_rate": 2.56785989492119e-05, + "loss": 0.9967, + "num_input_tokens_seen": 170879056, + "step": 10620 + }, + { + "epoch": 0.7439824178903219, + "grad_norm": 3.4935829639434814, + "learning_rate": 2.5671600700525395e-05, + "loss": 0.9368, + "num_input_tokens_seen": 170895440, + "step": 10621 + }, + { + "epoch": 0.7440524661360512, + "grad_norm": 3.730043649673462, + "learning_rate": 2.566460245183888e-05, + "loss": 0.9705, + "num_input_tokens_seen": 170911824, + "step": 10622 + }, + { + "epoch": 0.7441225143817805, + "grad_norm": 3.3017678260803223, + "learning_rate": 2.5657604203152362e-05, + "loss": 0.9188, + "num_input_tokens_seen": 170928208, + "step": 10623 + }, + { + "epoch": 0.7441925626275097, + "grad_norm": 4.462038993835449, + "learning_rate": 2.565060595446585e-05, + "loss": 1.0623, + "num_input_tokens_seen": 170944592, + "step": 10624 + }, + { + "epoch": 0.744262610873239, + "grad_norm": 3.3981754779815674, + "learning_rate": 2.5643607705779333e-05, + "loss": 0.9908, + "num_input_tokens_seen": 170960976, + "step": 10625 + }, + { + "epoch": 0.7443326591189682, + "grad_norm": 3.9813733100891113, + "learning_rate": 2.5636609457092818e-05, + "loss": 0.9697, + "num_input_tokens_seen": 170977216, + "step": 10626 + }, + { + "epoch": 0.7444027073646974, + "grad_norm": 4.091405868530273, + "learning_rate": 2.5629611208406313e-05, + "loss": 1.1804, + "num_input_tokens_seen": 170993600, + "step": 10627 + }, + { + "epoch": 0.7444727556104267, + "grad_norm": 4.190169334411621, + "learning_rate": 2.562261295971979e-05, + "loss": 1.2369, + "num_input_tokens_seen": 171009384, + "step": 10628 + }, + { + "epoch": 0.7445428038561559, + "grad_norm": 3.8834333419799805, + "learning_rate": 2.5615614711033277e-05, + "loss": 1.0872, + "num_input_tokens_seen": 171025768, + "step": 10629 + }, + { + "epoch": 0.7446128521018852, + "grad_norm": 4.439603328704834, + "learning_rate": 2.560861646234676e-05, + "loss": 1.0134, + "num_input_tokens_seen": 171041112, + "step": 10630 + }, + { + "epoch": 0.7446829003476144, + "grad_norm": 4.1875901222229, + "learning_rate": 2.5601618213660244e-05, + "loss": 1.2061, + "num_input_tokens_seen": 171056392, + "step": 10631 + }, + { + "epoch": 0.7447529485933436, + "grad_norm": 3.7817699909210205, + "learning_rate": 2.5594619964973736e-05, + "loss": 1.0379, + "num_input_tokens_seen": 171072776, + "step": 10632 + }, + { + "epoch": 0.7448229968390729, + "grad_norm": 4.192639350891113, + "learning_rate": 2.5587621716287215e-05, + "loss": 1.081, + "num_input_tokens_seen": 171088600, + "step": 10633 + }, + { + "epoch": 0.7448930450848021, + "grad_norm": 3.8280909061431885, + "learning_rate": 2.55806234676007e-05, + "loss": 1.2012, + "num_input_tokens_seen": 171104832, + "step": 10634 + }, + { + "epoch": 0.7449630933305315, + "grad_norm": 4.556401252746582, + "learning_rate": 2.557362521891418e-05, + "loss": 0.9528, + "num_input_tokens_seen": 171120808, + "step": 10635 + }, + { + "epoch": 0.7450331415762607, + "grad_norm": 4.06897497177124, + "learning_rate": 2.556662697022767e-05, + "loss": 1.0774, + "num_input_tokens_seen": 171136672, + "step": 10636 + }, + { + "epoch": 0.7451031898219899, + "grad_norm": 6.814166069030762, + "learning_rate": 2.5559628721541152e-05, + "loss": 0.9488, + "num_input_tokens_seen": 171152440, + "step": 10637 + }, + { + "epoch": 0.7451732380677192, + "grad_norm": 3.775862693786621, + "learning_rate": 2.5552630472854637e-05, + "loss": 0.9929, + "num_input_tokens_seen": 171168824, + "step": 10638 + }, + { + "epoch": 0.7452432863134484, + "grad_norm": 3.5109944343566895, + "learning_rate": 2.5545632224168132e-05, + "loss": 0.981, + "num_input_tokens_seen": 171184608, + "step": 10639 + }, + { + "epoch": 0.7453133345591776, + "grad_norm": 4.050136089324951, + "learning_rate": 2.5538633975481608e-05, + "loss": 0.875, + "num_input_tokens_seen": 171200992, + "step": 10640 + }, + { + "epoch": 0.7453833828049069, + "grad_norm": 4.239757537841797, + "learning_rate": 2.5531635726795096e-05, + "loss": 0.956, + "num_input_tokens_seen": 171216248, + "step": 10641 + }, + { + "epoch": 0.7454534310506361, + "grad_norm": 3.6020138263702393, + "learning_rate": 2.5524637478108575e-05, + "loss": 0.9343, + "num_input_tokens_seen": 171232280, + "step": 10642 + }, + { + "epoch": 0.7455234792963654, + "grad_norm": 3.815358877182007, + "learning_rate": 2.551763922942207e-05, + "loss": 0.995, + "num_input_tokens_seen": 171248664, + "step": 10643 + }, + { + "epoch": 0.7455935275420946, + "grad_norm": 3.7581701278686523, + "learning_rate": 2.5510640980735555e-05, + "loss": 1.1521, + "num_input_tokens_seen": 171264968, + "step": 10644 + }, + { + "epoch": 0.7456635757878238, + "grad_norm": 3.853900909423828, + "learning_rate": 2.5503642732049034e-05, + "loss": 1.0782, + "num_input_tokens_seen": 171281352, + "step": 10645 + }, + { + "epoch": 0.7457336240335531, + "grad_norm": 4.2661261558532715, + "learning_rate": 2.549664448336252e-05, + "loss": 1.0724, + "num_input_tokens_seen": 171296872, + "step": 10646 + }, + { + "epoch": 0.7458036722792823, + "grad_norm": 4.543593883514404, + "learning_rate": 2.5489646234676e-05, + "loss": 1.2378, + "num_input_tokens_seen": 171313128, + "step": 10647 + }, + { + "epoch": 0.7458737205250116, + "grad_norm": 4.210000038146973, + "learning_rate": 2.5482647985989493e-05, + "loss": 1.0777, + "num_input_tokens_seen": 171329512, + "step": 10648 + }, + { + "epoch": 0.7459437687707409, + "grad_norm": 3.93757700920105, + "learning_rate": 2.547564973730298e-05, + "loss": 0.9252, + "num_input_tokens_seen": 171345552, + "step": 10649 + }, + { + "epoch": 0.7460138170164701, + "grad_norm": 3.522759199142456, + "learning_rate": 2.5468651488616456e-05, + "loss": 1.0239, + "num_input_tokens_seen": 171361936, + "step": 10650 + }, + { + "epoch": 0.7460838652621994, + "grad_norm": 4.195476055145264, + "learning_rate": 2.546165323992995e-05, + "loss": 1.1127, + "num_input_tokens_seen": 171378320, + "step": 10651 + }, + { + "epoch": 0.7461539135079286, + "grad_norm": 3.5321669578552246, + "learning_rate": 2.5454654991243433e-05, + "loss": 0.9256, + "num_input_tokens_seen": 171393672, + "step": 10652 + }, + { + "epoch": 0.7462239617536578, + "grad_norm": 4.739002227783203, + "learning_rate": 2.544765674255692e-05, + "loss": 1.1916, + "num_input_tokens_seen": 171409184, + "step": 10653 + }, + { + "epoch": 0.7462940099993871, + "grad_norm": 3.63215970993042, + "learning_rate": 2.5440658493870407e-05, + "loss": 1.0442, + "num_input_tokens_seen": 171425568, + "step": 10654 + }, + { + "epoch": 0.7463640582451163, + "grad_norm": 3.468248128890991, + "learning_rate": 2.543366024518389e-05, + "loss": 0.8852, + "num_input_tokens_seen": 171441600, + "step": 10655 + }, + { + "epoch": 0.7464341064908455, + "grad_norm": 3.406644582748413, + "learning_rate": 2.5426661996497374e-05, + "loss": 0.9319, + "num_input_tokens_seen": 171457592, + "step": 10656 + }, + { + "epoch": 0.7465041547365748, + "grad_norm": 4.434091567993164, + "learning_rate": 2.5419663747810853e-05, + "loss": 1.0928, + "num_input_tokens_seen": 171473976, + "step": 10657 + }, + { + "epoch": 0.746574202982304, + "grad_norm": 3.3733532428741455, + "learning_rate": 2.5412665499124345e-05, + "loss": 0.9817, + "num_input_tokens_seen": 171490360, + "step": 10658 + }, + { + "epoch": 0.7466442512280334, + "grad_norm": 3.8291285037994385, + "learning_rate": 2.540566725043783e-05, + "loss": 1.0406, + "num_input_tokens_seen": 171506744, + "step": 10659 + }, + { + "epoch": 0.7467142994737626, + "grad_norm": 4.024164199829102, + "learning_rate": 2.539866900175131e-05, + "loss": 1.1402, + "num_input_tokens_seen": 171523128, + "step": 10660 + }, + { + "epoch": 0.7467843477194918, + "grad_norm": 3.767835855484009, + "learning_rate": 2.53916707530648e-05, + "loss": 1.012, + "num_input_tokens_seen": 171539512, + "step": 10661 + }, + { + "epoch": 0.7468543959652211, + "grad_norm": 3.981430768966675, + "learning_rate": 2.5384672504378275e-05, + "loss": 1.1532, + "num_input_tokens_seen": 171555744, + "step": 10662 + }, + { + "epoch": 0.7469244442109503, + "grad_norm": 3.8949761390686035, + "learning_rate": 2.537767425569177e-05, + "loss": 1.0614, + "num_input_tokens_seen": 171572128, + "step": 10663 + }, + { + "epoch": 0.7469944924566796, + "grad_norm": 3.7961387634277344, + "learning_rate": 2.5370676007005252e-05, + "loss": 1.092, + "num_input_tokens_seen": 171587968, + "step": 10664 + }, + { + "epoch": 0.7470645407024088, + "grad_norm": 5.627418041229248, + "learning_rate": 2.5363677758318738e-05, + "loss": 1.1872, + "num_input_tokens_seen": 171604240, + "step": 10665 + }, + { + "epoch": 0.747134588948138, + "grad_norm": 5.008187294006348, + "learning_rate": 2.5356679509632226e-05, + "loss": 1.1522, + "num_input_tokens_seen": 171620216, + "step": 10666 + }, + { + "epoch": 0.7472046371938673, + "grad_norm": 3.4284491539001465, + "learning_rate": 2.5349681260945708e-05, + "loss": 0.9582, + "num_input_tokens_seen": 171636600, + "step": 10667 + }, + { + "epoch": 0.7472746854395965, + "grad_norm": 4.255136966705322, + "learning_rate": 2.5342683012259193e-05, + "loss": 1.1406, + "num_input_tokens_seen": 171652984, + "step": 10668 + }, + { + "epoch": 0.7473447336853257, + "grad_norm": 3.8285486698150635, + "learning_rate": 2.5335684763572675e-05, + "loss": 1.0823, + "num_input_tokens_seen": 171669192, + "step": 10669 + }, + { + "epoch": 0.747414781931055, + "grad_norm": 4.412316799163818, + "learning_rate": 2.5328686514886164e-05, + "loss": 1.1782, + "num_input_tokens_seen": 171685576, + "step": 10670 + }, + { + "epoch": 0.7474848301767842, + "grad_norm": 5.281959056854248, + "learning_rate": 2.532168826619965e-05, + "loss": 1.1132, + "num_input_tokens_seen": 171701456, + "step": 10671 + }, + { + "epoch": 0.7475548784225136, + "grad_norm": 3.9816291332244873, + "learning_rate": 2.531469001751313e-05, + "loss": 0.9646, + "num_input_tokens_seen": 171716544, + "step": 10672 + }, + { + "epoch": 0.7476249266682428, + "grad_norm": 3.970984935760498, + "learning_rate": 2.5307691768826626e-05, + "loss": 0.9505, + "num_input_tokens_seen": 171731928, + "step": 10673 + }, + { + "epoch": 0.747694974913972, + "grad_norm": 4.070466995239258, + "learning_rate": 2.53006935201401e-05, + "loss": 1.0459, + "num_input_tokens_seen": 171748312, + "step": 10674 + }, + { + "epoch": 0.7477650231597013, + "grad_norm": 3.77146315574646, + "learning_rate": 2.529369527145359e-05, + "loss": 1.0332, + "num_input_tokens_seen": 171764320, + "step": 10675 + }, + { + "epoch": 0.7478350714054305, + "grad_norm": 4.411896228790283, + "learning_rate": 2.528669702276708e-05, + "loss": 1.1426, + "num_input_tokens_seen": 171780704, + "step": 10676 + }, + { + "epoch": 0.7479051196511597, + "grad_norm": 4.011537075042725, + "learning_rate": 2.5279698774080557e-05, + "loss": 1.02, + "num_input_tokens_seen": 171797032, + "step": 10677 + }, + { + "epoch": 0.747975167896889, + "grad_norm": 4.461518287658691, + "learning_rate": 2.527270052539405e-05, + "loss": 1.0812, + "num_input_tokens_seen": 171813416, + "step": 10678 + }, + { + "epoch": 0.7480452161426182, + "grad_norm": 3.4736366271972656, + "learning_rate": 2.5265702276707527e-05, + "loss": 0.8958, + "num_input_tokens_seen": 171829800, + "step": 10679 + }, + { + "epoch": 0.7481152643883475, + "grad_norm": 4.153388500213623, + "learning_rate": 2.5258704028021012e-05, + "loss": 1.0535, + "num_input_tokens_seen": 171846184, + "step": 10680 + }, + { + "epoch": 0.7481853126340767, + "grad_norm": 4.170673370361328, + "learning_rate": 2.5251705779334504e-05, + "loss": 1.1873, + "num_input_tokens_seen": 171862568, + "step": 10681 + }, + { + "epoch": 0.7482553608798059, + "grad_norm": 3.533921480178833, + "learning_rate": 2.5244707530647983e-05, + "loss": 0.8793, + "num_input_tokens_seen": 171878952, + "step": 10682 + }, + { + "epoch": 0.7483254091255352, + "grad_norm": 3.7430715560913086, + "learning_rate": 2.5237709281961475e-05, + "loss": 1.0652, + "num_input_tokens_seen": 171895336, + "step": 10683 + }, + { + "epoch": 0.7483954573712644, + "grad_norm": 3.664469003677368, + "learning_rate": 2.523071103327495e-05, + "loss": 1.2357, + "num_input_tokens_seen": 171911720, + "step": 10684 + }, + { + "epoch": 0.7484655056169937, + "grad_norm": 6.700071811676025, + "learning_rate": 2.5223712784588445e-05, + "loss": 1.0798, + "num_input_tokens_seen": 171927152, + "step": 10685 + }, + { + "epoch": 0.748535553862723, + "grad_norm": 4.5248494148254395, + "learning_rate": 2.521671453590193e-05, + "loss": 1.0959, + "num_input_tokens_seen": 171943536, + "step": 10686 + }, + { + "epoch": 0.7486056021084522, + "grad_norm": 3.665634870529175, + "learning_rate": 2.5209716287215405e-05, + "loss": 1.0225, + "num_input_tokens_seen": 171959920, + "step": 10687 + }, + { + "epoch": 0.7486756503541815, + "grad_norm": 3.7656190395355225, + "learning_rate": 2.52027180385289e-05, + "loss": 1.1329, + "num_input_tokens_seen": 171976304, + "step": 10688 + }, + { + "epoch": 0.7487456985999107, + "grad_norm": 3.964611291885376, + "learning_rate": 2.5195719789842382e-05, + "loss": 0.9297, + "num_input_tokens_seen": 171992688, + "step": 10689 + }, + { + "epoch": 0.7488157468456399, + "grad_norm": 5.222901344299316, + "learning_rate": 2.5188721541155868e-05, + "loss": 1.1624, + "num_input_tokens_seen": 172009072, + "step": 10690 + }, + { + "epoch": 0.7488857950913692, + "grad_norm": 3.472870111465454, + "learning_rate": 2.5181723292469346e-05, + "loss": 1.0387, + "num_input_tokens_seen": 172025136, + "step": 10691 + }, + { + "epoch": 0.7489558433370984, + "grad_norm": 3.6522982120513916, + "learning_rate": 2.517472504378283e-05, + "loss": 0.9499, + "num_input_tokens_seen": 172041520, + "step": 10692 + }, + { + "epoch": 0.7490258915828276, + "grad_norm": 4.379208087921143, + "learning_rate": 2.5167726795096323e-05, + "loss": 1.0675, + "num_input_tokens_seen": 172057704, + "step": 10693 + }, + { + "epoch": 0.7490959398285569, + "grad_norm": 4.277910232543945, + "learning_rate": 2.5160728546409805e-05, + "loss": 1.2339, + "num_input_tokens_seen": 172074088, + "step": 10694 + }, + { + "epoch": 0.7491659880742861, + "grad_norm": 3.756138324737549, + "learning_rate": 2.5153730297723294e-05, + "loss": 1.1307, + "num_input_tokens_seen": 172090472, + "step": 10695 + }, + { + "epoch": 0.7492360363200155, + "grad_norm": 3.303793430328369, + "learning_rate": 2.514673204903677e-05, + "loss": 0.86, + "num_input_tokens_seen": 172106856, + "step": 10696 + }, + { + "epoch": 0.7493060845657447, + "grad_norm": 4.1949005126953125, + "learning_rate": 2.5139733800350264e-05, + "loss": 1.1436, + "num_input_tokens_seen": 172123240, + "step": 10697 + }, + { + "epoch": 0.7493761328114739, + "grad_norm": 3.550511121749878, + "learning_rate": 2.5132735551663756e-05, + "loss": 0.936, + "num_input_tokens_seen": 172139624, + "step": 10698 + }, + { + "epoch": 0.7494461810572032, + "grad_norm": 4.004504203796387, + "learning_rate": 2.512573730297723e-05, + "loss": 0.9329, + "num_input_tokens_seen": 172156008, + "step": 10699 + }, + { + "epoch": 0.7495162293029324, + "grad_norm": 4.565281391143799, + "learning_rate": 2.511873905429072e-05, + "loss": 1.2725, + "num_input_tokens_seen": 172172392, + "step": 10700 + }, + { + "epoch": 0.7495862775486617, + "grad_norm": 5.487757682800293, + "learning_rate": 2.51117408056042e-05, + "loss": 0.9488, + "num_input_tokens_seen": 172188776, + "step": 10701 + }, + { + "epoch": 0.7496563257943909, + "grad_norm": 5.526664733886719, + "learning_rate": 2.5104742556917687e-05, + "loss": 1.0077, + "num_input_tokens_seen": 172204664, + "step": 10702 + }, + { + "epoch": 0.7497263740401201, + "grad_norm": 3.900934934616089, + "learning_rate": 2.509774430823118e-05, + "loss": 0.9928, + "num_input_tokens_seen": 172221048, + "step": 10703 + }, + { + "epoch": 0.7497964222858494, + "grad_norm": 4.71485710144043, + "learning_rate": 2.5090746059544657e-05, + "loss": 1.0861, + "num_input_tokens_seen": 172236872, + "step": 10704 + }, + { + "epoch": 0.7498664705315786, + "grad_norm": 4.480246543884277, + "learning_rate": 2.5083747810858142e-05, + "loss": 0.9637, + "num_input_tokens_seen": 172253256, + "step": 10705 + }, + { + "epoch": 0.7499365187773078, + "grad_norm": 3.812025785446167, + "learning_rate": 2.5076749562171624e-05, + "loss": 1.0175, + "num_input_tokens_seen": 172269112, + "step": 10706 + }, + { + "epoch": 0.7500065670230371, + "grad_norm": 12.59459400177002, + "learning_rate": 2.5069751313485113e-05, + "loss": 1.399, + "num_input_tokens_seen": 172282184, + "step": 10707 + }, + { + "epoch": 0.7500766152687663, + "grad_norm": 4.123873710632324, + "learning_rate": 2.5062753064798605e-05, + "loss": 0.9393, + "num_input_tokens_seen": 172297056, + "step": 10708 + }, + { + "epoch": 0.7501466635144957, + "grad_norm": 3.4555020332336426, + "learning_rate": 2.5055754816112083e-05, + "loss": 0.866, + "num_input_tokens_seen": 172313440, + "step": 10709 + }, + { + "epoch": 0.7502167117602249, + "grad_norm": 5.149211883544922, + "learning_rate": 2.5048756567425575e-05, + "loss": 1.1031, + "num_input_tokens_seen": 172329824, + "step": 10710 + }, + { + "epoch": 0.7502867600059541, + "grad_norm": 4.03378963470459, + "learning_rate": 2.504175831873905e-05, + "loss": 1.0769, + "num_input_tokens_seen": 172346208, + "step": 10711 + }, + { + "epoch": 0.7503568082516834, + "grad_norm": 3.7540276050567627, + "learning_rate": 2.503476007005254e-05, + "loss": 1.0931, + "num_input_tokens_seen": 172362592, + "step": 10712 + }, + { + "epoch": 0.7504268564974126, + "grad_norm": 4.295251369476318, + "learning_rate": 2.502776182136602e-05, + "loss": 1.1462, + "num_input_tokens_seen": 172378976, + "step": 10713 + }, + { + "epoch": 0.7504969047431418, + "grad_norm": 3.810828924179077, + "learning_rate": 2.5020763572679506e-05, + "loss": 1.0009, + "num_input_tokens_seen": 172395360, + "step": 10714 + }, + { + "epoch": 0.7505669529888711, + "grad_norm": 4.037637710571289, + "learning_rate": 2.5013765323992998e-05, + "loss": 0.9055, + "num_input_tokens_seen": 172410976, + "step": 10715 + }, + { + "epoch": 0.7506370012346003, + "grad_norm": 3.7160496711730957, + "learning_rate": 2.5006767075306476e-05, + "loss": 0.9728, + "num_input_tokens_seen": 172427208, + "step": 10716 + }, + { + "epoch": 0.7507070494803296, + "grad_norm": 3.940626382827759, + "learning_rate": 2.4999768826619965e-05, + "loss": 1.0152, + "num_input_tokens_seen": 172443216, + "step": 10717 + }, + { + "epoch": 0.7507770977260588, + "grad_norm": 3.6022868156433105, + "learning_rate": 2.4992770577933443e-05, + "loss": 1.0453, + "num_input_tokens_seen": 172458464, + "step": 10718 + }, + { + "epoch": 0.750847145971788, + "grad_norm": 3.9998247623443604, + "learning_rate": 2.4985772329246935e-05, + "loss": 1.1271, + "num_input_tokens_seen": 172474848, + "step": 10719 + }, + { + "epoch": 0.7509171942175173, + "grad_norm": 3.877952814102173, + "learning_rate": 2.4978774080560424e-05, + "loss": 1.0513, + "num_input_tokens_seen": 172491232, + "step": 10720 + }, + { + "epoch": 0.7509872424632466, + "grad_norm": 6.412514686584473, + "learning_rate": 2.4971775831873902e-05, + "loss": 1.1406, + "num_input_tokens_seen": 172506120, + "step": 10721 + }, + { + "epoch": 0.7510572907089758, + "grad_norm": 4.753631591796875, + "learning_rate": 2.496477758318739e-05, + "loss": 0.9748, + "num_input_tokens_seen": 172521856, + "step": 10722 + }, + { + "epoch": 0.7511273389547051, + "grad_norm": 3.470590353012085, + "learning_rate": 2.495777933450087e-05, + "loss": 1.0101, + "num_input_tokens_seen": 172538240, + "step": 10723 + }, + { + "epoch": 0.7511973872004343, + "grad_norm": 4.858765602111816, + "learning_rate": 2.495078108581436e-05, + "loss": 1.1645, + "num_input_tokens_seen": 172554256, + "step": 10724 + }, + { + "epoch": 0.7512674354461636, + "grad_norm": 6.36366605758667, + "learning_rate": 2.494378283712785e-05, + "loss": 1.165, + "num_input_tokens_seen": 172570416, + "step": 10725 + }, + { + "epoch": 0.7513374836918928, + "grad_norm": 3.953922748565674, + "learning_rate": 2.4936784588441328e-05, + "loss": 1.085, + "num_input_tokens_seen": 172586704, + "step": 10726 + }, + { + "epoch": 0.751407531937622, + "grad_norm": 4.349438190460205, + "learning_rate": 2.4929786339754817e-05, + "loss": 1.0934, + "num_input_tokens_seen": 172603088, + "step": 10727 + }, + { + "epoch": 0.7514775801833513, + "grad_norm": 3.8826959133148193, + "learning_rate": 2.4922788091068295e-05, + "loss": 0.9542, + "num_input_tokens_seen": 172618040, + "step": 10728 + }, + { + "epoch": 0.7515476284290805, + "grad_norm": 6.644694805145264, + "learning_rate": 2.4915789842381787e-05, + "loss": 1.3346, + "num_input_tokens_seen": 172633872, + "step": 10729 + }, + { + "epoch": 0.7516176766748097, + "grad_norm": 4.31898832321167, + "learning_rate": 2.4908791593695276e-05, + "loss": 1.1856, + "num_input_tokens_seen": 172650256, + "step": 10730 + }, + { + "epoch": 0.751687724920539, + "grad_norm": 3.7034151554107666, + "learning_rate": 2.4901793345008754e-05, + "loss": 0.9519, + "num_input_tokens_seen": 172666472, + "step": 10731 + }, + { + "epoch": 0.7517577731662682, + "grad_norm": 4.434296131134033, + "learning_rate": 2.4894795096322243e-05, + "loss": 0.9864, + "num_input_tokens_seen": 172682856, + "step": 10732 + }, + { + "epoch": 0.7518278214119976, + "grad_norm": 5.469362258911133, + "learning_rate": 2.488779684763572e-05, + "loss": 1.1553, + "num_input_tokens_seen": 172699240, + "step": 10733 + }, + { + "epoch": 0.7518978696577268, + "grad_norm": 4.098570346832275, + "learning_rate": 2.4880798598949213e-05, + "loss": 0.9323, + "num_input_tokens_seen": 172714344, + "step": 10734 + }, + { + "epoch": 0.751967917903456, + "grad_norm": 3.6279666423797607, + "learning_rate": 2.4873800350262702e-05, + "loss": 0.8597, + "num_input_tokens_seen": 172730728, + "step": 10735 + }, + { + "epoch": 0.7520379661491853, + "grad_norm": 3.586550235748291, + "learning_rate": 2.486680210157618e-05, + "loss": 1.0993, + "num_input_tokens_seen": 172747112, + "step": 10736 + }, + { + "epoch": 0.7521080143949145, + "grad_norm": 5.849387168884277, + "learning_rate": 2.485980385288967e-05, + "loss": 1.0777, + "num_input_tokens_seen": 172763496, + "step": 10737 + }, + { + "epoch": 0.7521780626406438, + "grad_norm": 4.139583110809326, + "learning_rate": 2.4852805604203147e-05, + "loss": 0.9387, + "num_input_tokens_seen": 172779880, + "step": 10738 + }, + { + "epoch": 0.752248110886373, + "grad_norm": 4.490769863128662, + "learning_rate": 2.484580735551664e-05, + "loss": 1.1018, + "num_input_tokens_seen": 172796040, + "step": 10739 + }, + { + "epoch": 0.7523181591321022, + "grad_norm": 4.846589088439941, + "learning_rate": 2.4838809106830118e-05, + "loss": 1.0814, + "num_input_tokens_seen": 172812352, + "step": 10740 + }, + { + "epoch": 0.7523882073778315, + "grad_norm": 4.20737886428833, + "learning_rate": 2.4831810858143606e-05, + "loss": 1.1148, + "num_input_tokens_seen": 172828272, + "step": 10741 + }, + { + "epoch": 0.7524582556235607, + "grad_norm": 3.8492374420166016, + "learning_rate": 2.4824812609457095e-05, + "loss": 1.1103, + "num_input_tokens_seen": 172844240, + "step": 10742 + }, + { + "epoch": 0.7525283038692899, + "grad_norm": 5.275799751281738, + "learning_rate": 2.4817814360770573e-05, + "loss": 1.179, + "num_input_tokens_seen": 172860624, + "step": 10743 + }, + { + "epoch": 0.7525983521150192, + "grad_norm": 3.8567919731140137, + "learning_rate": 2.4810816112084065e-05, + "loss": 1.0729, + "num_input_tokens_seen": 172876168, + "step": 10744 + }, + { + "epoch": 0.7526684003607484, + "grad_norm": 4.366634845733643, + "learning_rate": 2.4803817863397544e-05, + "loss": 1.442, + "num_input_tokens_seen": 172892296, + "step": 10745 + }, + { + "epoch": 0.7527384486064778, + "grad_norm": 4.501208782196045, + "learning_rate": 2.4796819614711032e-05, + "loss": 1.059, + "num_input_tokens_seen": 172907768, + "step": 10746 + }, + { + "epoch": 0.752808496852207, + "grad_norm": 5.248892307281494, + "learning_rate": 2.478982136602452e-05, + "loss": 1.1966, + "num_input_tokens_seen": 172923120, + "step": 10747 + }, + { + "epoch": 0.7528785450979362, + "grad_norm": 6.624785900115967, + "learning_rate": 2.4782823117338e-05, + "loss": 1.0983, + "num_input_tokens_seen": 172937688, + "step": 10748 + }, + { + "epoch": 0.7529485933436655, + "grad_norm": 3.677090644836426, + "learning_rate": 2.477582486865149e-05, + "loss": 1.0815, + "num_input_tokens_seen": 172953568, + "step": 10749 + }, + { + "epoch": 0.7530186415893947, + "grad_norm": 3.6275782585144043, + "learning_rate": 2.476882661996497e-05, + "loss": 0.9615, + "num_input_tokens_seen": 172969952, + "step": 10750 + }, + { + "epoch": 0.7530886898351239, + "grad_norm": 3.8345489501953125, + "learning_rate": 2.4761828371278458e-05, + "loss": 1.0902, + "num_input_tokens_seen": 172986336, + "step": 10751 + }, + { + "epoch": 0.7531587380808532, + "grad_norm": 3.5912234783172607, + "learning_rate": 2.4754830122591947e-05, + "loss": 1.0174, + "num_input_tokens_seen": 173002720, + "step": 10752 + }, + { + "epoch": 0.7532287863265824, + "grad_norm": 4.213565826416016, + "learning_rate": 2.4747831873905425e-05, + "loss": 1.1462, + "num_input_tokens_seen": 173018544, + "step": 10753 + }, + { + "epoch": 0.7532988345723117, + "grad_norm": 4.045791149139404, + "learning_rate": 2.4740833625218917e-05, + "loss": 1.0354, + "num_input_tokens_seen": 173034472, + "step": 10754 + }, + { + "epoch": 0.7533688828180409, + "grad_norm": 4.47079610824585, + "learning_rate": 2.4733835376532396e-05, + "loss": 1.0681, + "num_input_tokens_seen": 173050856, + "step": 10755 + }, + { + "epoch": 0.7534389310637701, + "grad_norm": 3.6233744621276855, + "learning_rate": 2.4726837127845884e-05, + "loss": 0.9676, + "num_input_tokens_seen": 173067240, + "step": 10756 + }, + { + "epoch": 0.7535089793094994, + "grad_norm": 3.5118415355682373, + "learning_rate": 2.4719838879159373e-05, + "loss": 1.0032, + "num_input_tokens_seen": 173083624, + "step": 10757 + }, + { + "epoch": 0.7535790275552287, + "grad_norm": 5.812018394470215, + "learning_rate": 2.471284063047285e-05, + "loss": 1.004, + "num_input_tokens_seen": 173100008, + "step": 10758 + }, + { + "epoch": 0.7536490758009579, + "grad_norm": 3.775524377822876, + "learning_rate": 2.4705842381786343e-05, + "loss": 1.123, + "num_input_tokens_seen": 173115392, + "step": 10759 + }, + { + "epoch": 0.7537191240466872, + "grad_norm": 3.8594207763671875, + "learning_rate": 2.469884413309982e-05, + "loss": 0.9503, + "num_input_tokens_seen": 173131776, + "step": 10760 + }, + { + "epoch": 0.7537891722924164, + "grad_norm": 4.109124183654785, + "learning_rate": 2.469184588441331e-05, + "loss": 0.939, + "num_input_tokens_seen": 173147608, + "step": 10761 + }, + { + "epoch": 0.7538592205381457, + "grad_norm": 4.234561920166016, + "learning_rate": 2.46848476357268e-05, + "loss": 0.9041, + "num_input_tokens_seen": 173163992, + "step": 10762 + }, + { + "epoch": 0.7539292687838749, + "grad_norm": 5.9067583084106445, + "learning_rate": 2.4677849387040277e-05, + "loss": 1.1977, + "num_input_tokens_seen": 173179344, + "step": 10763 + }, + { + "epoch": 0.7539993170296041, + "grad_norm": 4.289483547210693, + "learning_rate": 2.467085113835377e-05, + "loss": 1.1619, + "num_input_tokens_seen": 173195728, + "step": 10764 + }, + { + "epoch": 0.7540693652753334, + "grad_norm": 3.674454689025879, + "learning_rate": 2.4663852889667248e-05, + "loss": 0.9901, + "num_input_tokens_seen": 173212112, + "step": 10765 + }, + { + "epoch": 0.7541394135210626, + "grad_norm": 4.733745574951172, + "learning_rate": 2.4656854640980736e-05, + "loss": 1.0325, + "num_input_tokens_seen": 173227816, + "step": 10766 + }, + { + "epoch": 0.7542094617667919, + "grad_norm": 4.427460670471191, + "learning_rate": 2.4649856392294215e-05, + "loss": 1.0205, + "num_input_tokens_seen": 173244200, + "step": 10767 + }, + { + "epoch": 0.7542795100125211, + "grad_norm": 4.920037746429443, + "learning_rate": 2.4642858143607703e-05, + "loss": 0.9407, + "num_input_tokens_seen": 173259304, + "step": 10768 + }, + { + "epoch": 0.7543495582582503, + "grad_norm": 4.884332656860352, + "learning_rate": 2.4635859894921195e-05, + "loss": 1.1477, + "num_input_tokens_seen": 173275688, + "step": 10769 + }, + { + "epoch": 0.7544196065039797, + "grad_norm": 3.8365814685821533, + "learning_rate": 2.4628861646234674e-05, + "loss": 1.0598, + "num_input_tokens_seen": 173292072, + "step": 10770 + }, + { + "epoch": 0.7544896547497089, + "grad_norm": 4.100483417510986, + "learning_rate": 2.4621863397548162e-05, + "loss": 1.1897, + "num_input_tokens_seen": 173307184, + "step": 10771 + }, + { + "epoch": 0.7545597029954381, + "grad_norm": 3.719986915588379, + "learning_rate": 2.461486514886164e-05, + "loss": 0.916, + "num_input_tokens_seen": 173323568, + "step": 10772 + }, + { + "epoch": 0.7546297512411674, + "grad_norm": 4.147826671600342, + "learning_rate": 2.460786690017513e-05, + "loss": 0.9038, + "num_input_tokens_seen": 173339112, + "step": 10773 + }, + { + "epoch": 0.7546997994868966, + "grad_norm": 3.911804676055908, + "learning_rate": 2.460086865148862e-05, + "loss": 0.9965, + "num_input_tokens_seen": 173355096, + "step": 10774 + }, + { + "epoch": 0.7547698477326259, + "grad_norm": 3.8877179622650146, + "learning_rate": 2.45938704028021e-05, + "loss": 1.1977, + "num_input_tokens_seen": 173371256, + "step": 10775 + }, + { + "epoch": 0.7548398959783551, + "grad_norm": 4.481347560882568, + "learning_rate": 2.4586872154115588e-05, + "loss": 1.0792, + "num_input_tokens_seen": 173387640, + "step": 10776 + }, + { + "epoch": 0.7549099442240843, + "grad_norm": 4.284441947937012, + "learning_rate": 2.4579873905429067e-05, + "loss": 1.0014, + "num_input_tokens_seen": 173403032, + "step": 10777 + }, + { + "epoch": 0.7549799924698136, + "grad_norm": 3.4101266860961914, + "learning_rate": 2.4572875656742555e-05, + "loss": 1.121, + "num_input_tokens_seen": 173419416, + "step": 10778 + }, + { + "epoch": 0.7550500407155428, + "grad_norm": 3.4709014892578125, + "learning_rate": 2.4565877408056047e-05, + "loss": 0.9836, + "num_input_tokens_seen": 173435800, + "step": 10779 + }, + { + "epoch": 0.755120088961272, + "grad_norm": 4.048801898956299, + "learning_rate": 2.4558879159369526e-05, + "loss": 1.194, + "num_input_tokens_seen": 173451448, + "step": 10780 + }, + { + "epoch": 0.7551901372070013, + "grad_norm": 5.075813293457031, + "learning_rate": 2.4551880910683014e-05, + "loss": 1.044, + "num_input_tokens_seen": 173467832, + "step": 10781 + }, + { + "epoch": 0.7552601854527305, + "grad_norm": 4.117324352264404, + "learning_rate": 2.4544882661996493e-05, + "loss": 1.1465, + "num_input_tokens_seen": 173484216, + "step": 10782 + }, + { + "epoch": 0.7553302336984599, + "grad_norm": 3.5591964721679688, + "learning_rate": 2.453788441330998e-05, + "loss": 1.0813, + "num_input_tokens_seen": 173500600, + "step": 10783 + }, + { + "epoch": 0.7554002819441891, + "grad_norm": 4.860742092132568, + "learning_rate": 2.4530886164623473e-05, + "loss": 1.2786, + "num_input_tokens_seen": 173516312, + "step": 10784 + }, + { + "epoch": 0.7554703301899183, + "grad_norm": 3.7154600620269775, + "learning_rate": 2.452388791593695e-05, + "loss": 0.9805, + "num_input_tokens_seen": 173532696, + "step": 10785 + }, + { + "epoch": 0.7555403784356476, + "grad_norm": 3.7386090755462646, + "learning_rate": 2.451688966725044e-05, + "loss": 0.9193, + "num_input_tokens_seen": 173549080, + "step": 10786 + }, + { + "epoch": 0.7556104266813768, + "grad_norm": 4.206442356109619, + "learning_rate": 2.450989141856392e-05, + "loss": 0.9542, + "num_input_tokens_seen": 173564832, + "step": 10787 + }, + { + "epoch": 0.755680474927106, + "grad_norm": 6.350236415863037, + "learning_rate": 2.4502893169877407e-05, + "loss": 1.0349, + "num_input_tokens_seen": 173581216, + "step": 10788 + }, + { + "epoch": 0.7557505231728353, + "grad_norm": 5.970548152923584, + "learning_rate": 2.44958949211909e-05, + "loss": 1.0021, + "num_input_tokens_seen": 173597024, + "step": 10789 + }, + { + "epoch": 0.7558205714185645, + "grad_norm": 4.426954746246338, + "learning_rate": 2.4488896672504378e-05, + "loss": 1.1455, + "num_input_tokens_seen": 173613408, + "step": 10790 + }, + { + "epoch": 0.7558906196642938, + "grad_norm": 4.1317057609558105, + "learning_rate": 2.4481898423817866e-05, + "loss": 0.9448, + "num_input_tokens_seen": 173629792, + "step": 10791 + }, + { + "epoch": 0.755960667910023, + "grad_norm": 3.616722583770752, + "learning_rate": 2.4474900175131345e-05, + "loss": 1.0943, + "num_input_tokens_seen": 173646176, + "step": 10792 + }, + { + "epoch": 0.7560307161557522, + "grad_norm": 4.258011341094971, + "learning_rate": 2.4467901926444833e-05, + "loss": 1.0948, + "num_input_tokens_seen": 173662296, + "step": 10793 + }, + { + "epoch": 0.7561007644014816, + "grad_norm": 3.8636648654937744, + "learning_rate": 2.4460903677758312e-05, + "loss": 1.1171, + "num_input_tokens_seen": 173678680, + "step": 10794 + }, + { + "epoch": 0.7561708126472108, + "grad_norm": 3.9988362789154053, + "learning_rate": 2.4453905429071804e-05, + "loss": 1.0216, + "num_input_tokens_seen": 173694856, + "step": 10795 + }, + { + "epoch": 0.75624086089294, + "grad_norm": 4.590046405792236, + "learning_rate": 2.4446907180385292e-05, + "loss": 0.9452, + "num_input_tokens_seen": 173711240, + "step": 10796 + }, + { + "epoch": 0.7563109091386693, + "grad_norm": 3.7221243381500244, + "learning_rate": 2.443990893169877e-05, + "loss": 1.0149, + "num_input_tokens_seen": 173727624, + "step": 10797 + }, + { + "epoch": 0.7563809573843985, + "grad_norm": 4.296387672424316, + "learning_rate": 2.443291068301226e-05, + "loss": 0.9746, + "num_input_tokens_seen": 173743728, + "step": 10798 + }, + { + "epoch": 0.7564510056301278, + "grad_norm": 3.5614006519317627, + "learning_rate": 2.4425912434325738e-05, + "loss": 1.1506, + "num_input_tokens_seen": 173759824, + "step": 10799 + }, + { + "epoch": 0.756521053875857, + "grad_norm": 3.760089635848999, + "learning_rate": 2.441891418563923e-05, + "loss": 1.0306, + "num_input_tokens_seen": 173776208, + "step": 10800 + }, + { + "epoch": 0.756521053875857, + "eval_loss": 1.1171311140060425, + "eval_runtime": 0.1848, + "eval_samples_per_second": 5.412, + "eval_steps_per_second": 5.412, + "num_input_tokens_seen": 173776208, + "step": 10800 + }, + { + "epoch": 0.7565911021215862, + "grad_norm": 3.775071144104004, + "learning_rate": 2.441191593695272e-05, + "loss": 0.954, + "num_input_tokens_seen": 173791544, + "step": 10801 + }, + { + "epoch": 0.7566611503673155, + "grad_norm": 3.529268503189087, + "learning_rate": 2.4404917688266197e-05, + "loss": 0.8514, + "num_input_tokens_seen": 173807800, + "step": 10802 + }, + { + "epoch": 0.7567311986130447, + "grad_norm": 4.296457290649414, + "learning_rate": 2.4397919439579685e-05, + "loss": 1.1704, + "num_input_tokens_seen": 173823048, + "step": 10803 + }, + { + "epoch": 0.756801246858774, + "grad_norm": 4.295823574066162, + "learning_rate": 2.4390921190893164e-05, + "loss": 1.2958, + "num_input_tokens_seen": 173839320, + "step": 10804 + }, + { + "epoch": 0.7568712951045032, + "grad_norm": 4.27138090133667, + "learning_rate": 2.4383922942206656e-05, + "loss": 1.1363, + "num_input_tokens_seen": 173854896, + "step": 10805 + }, + { + "epoch": 0.7569413433502324, + "grad_norm": 3.802520513534546, + "learning_rate": 2.4376924693520144e-05, + "loss": 0.9698, + "num_input_tokens_seen": 173871056, + "step": 10806 + }, + { + "epoch": 0.7570113915959618, + "grad_norm": 5.144099235534668, + "learning_rate": 2.4369926444833623e-05, + "loss": 0.906, + "num_input_tokens_seen": 173887296, + "step": 10807 + }, + { + "epoch": 0.757081439841691, + "grad_norm": 3.779414176940918, + "learning_rate": 2.436292819614711e-05, + "loss": 1.1526, + "num_input_tokens_seen": 173903680, + "step": 10808 + }, + { + "epoch": 0.7571514880874202, + "grad_norm": 3.640977144241333, + "learning_rate": 2.435592994746059e-05, + "loss": 1.0439, + "num_input_tokens_seen": 173919512, + "step": 10809 + }, + { + "epoch": 0.7572215363331495, + "grad_norm": 4.017859935760498, + "learning_rate": 2.4348931698774082e-05, + "loss": 0.9482, + "num_input_tokens_seen": 173935720, + "step": 10810 + }, + { + "epoch": 0.7572915845788787, + "grad_norm": 4.146195888519287, + "learning_rate": 2.434193345008757e-05, + "loss": 1.0628, + "num_input_tokens_seen": 173951552, + "step": 10811 + }, + { + "epoch": 0.757361632824608, + "grad_norm": 3.6384456157684326, + "learning_rate": 2.433493520140105e-05, + "loss": 0.982, + "num_input_tokens_seen": 173967936, + "step": 10812 + }, + { + "epoch": 0.7574316810703372, + "grad_norm": 5.597431182861328, + "learning_rate": 2.4327936952714537e-05, + "loss": 0.9578, + "num_input_tokens_seen": 173984320, + "step": 10813 + }, + { + "epoch": 0.7575017293160664, + "grad_norm": 3.394164562225342, + "learning_rate": 2.4320938704028016e-05, + "loss": 1.0082, + "num_input_tokens_seen": 174000704, + "step": 10814 + }, + { + "epoch": 0.7575717775617957, + "grad_norm": 4.733256816864014, + "learning_rate": 2.4313940455341508e-05, + "loss": 1.1408, + "num_input_tokens_seen": 174016176, + "step": 10815 + }, + { + "epoch": 0.7576418258075249, + "grad_norm": 4.949265956878662, + "learning_rate": 2.4306942206654996e-05, + "loss": 1.2202, + "num_input_tokens_seen": 174031456, + "step": 10816 + }, + { + "epoch": 0.7577118740532541, + "grad_norm": 3.99837589263916, + "learning_rate": 2.4299943957968475e-05, + "loss": 0.9, + "num_input_tokens_seen": 174046344, + "step": 10817 + }, + { + "epoch": 0.7577819222989834, + "grad_norm": 3.62160062789917, + "learning_rate": 2.4292945709281963e-05, + "loss": 1.1249, + "num_input_tokens_seen": 174062688, + "step": 10818 + }, + { + "epoch": 0.7578519705447126, + "grad_norm": 5.388552188873291, + "learning_rate": 2.4285947460595442e-05, + "loss": 1.1428, + "num_input_tokens_seen": 174078016, + "step": 10819 + }, + { + "epoch": 0.757922018790442, + "grad_norm": 3.3548665046691895, + "learning_rate": 2.4278949211908934e-05, + "loss": 0.8758, + "num_input_tokens_seen": 174094320, + "step": 10820 + }, + { + "epoch": 0.7579920670361712, + "grad_norm": 4.163790225982666, + "learning_rate": 2.4271950963222412e-05, + "loss": 1.0792, + "num_input_tokens_seen": 174109912, + "step": 10821 + }, + { + "epoch": 0.7580621152819004, + "grad_norm": 3.7474441528320312, + "learning_rate": 2.42649527145359e-05, + "loss": 0.9192, + "num_input_tokens_seen": 174126296, + "step": 10822 + }, + { + "epoch": 0.7581321635276297, + "grad_norm": 3.224600315093994, + "learning_rate": 2.425795446584939e-05, + "loss": 0.8651, + "num_input_tokens_seen": 174142384, + "step": 10823 + }, + { + "epoch": 0.7582022117733589, + "grad_norm": 3.754689931869507, + "learning_rate": 2.4250956217162868e-05, + "loss": 1.1617, + "num_input_tokens_seen": 174158224, + "step": 10824 + }, + { + "epoch": 0.7582722600190881, + "grad_norm": 3.606917381286621, + "learning_rate": 2.424395796847636e-05, + "loss": 0.9601, + "num_input_tokens_seen": 174174608, + "step": 10825 + }, + { + "epoch": 0.7583423082648174, + "grad_norm": 4.322174072265625, + "learning_rate": 2.4236959719789838e-05, + "loss": 1.0369, + "num_input_tokens_seen": 174190992, + "step": 10826 + }, + { + "epoch": 0.7584123565105466, + "grad_norm": 4.417646408081055, + "learning_rate": 2.4229961471103327e-05, + "loss": 1.1, + "num_input_tokens_seen": 174207376, + "step": 10827 + }, + { + "epoch": 0.7584824047562759, + "grad_norm": 3.81874942779541, + "learning_rate": 2.4222963222416815e-05, + "loss": 1.1717, + "num_input_tokens_seen": 174222872, + "step": 10828 + }, + { + "epoch": 0.7585524530020051, + "grad_norm": 4.576498508453369, + "learning_rate": 2.4215964973730294e-05, + "loss": 1.0673, + "num_input_tokens_seen": 174239256, + "step": 10829 + }, + { + "epoch": 0.7586225012477343, + "grad_norm": 3.9401471614837646, + "learning_rate": 2.4208966725043786e-05, + "loss": 1.0168, + "num_input_tokens_seen": 174255640, + "step": 10830 + }, + { + "epoch": 0.7586925494934637, + "grad_norm": 3.8458027839660645, + "learning_rate": 2.4201968476357264e-05, + "loss": 0.9402, + "num_input_tokens_seen": 174272024, + "step": 10831 + }, + { + "epoch": 0.7587625977391929, + "grad_norm": 4.383147239685059, + "learning_rate": 2.4194970227670753e-05, + "loss": 1.0253, + "num_input_tokens_seen": 174287984, + "step": 10832 + }, + { + "epoch": 0.7588326459849221, + "grad_norm": 3.6222259998321533, + "learning_rate": 2.418797197898424e-05, + "loss": 1.0813, + "num_input_tokens_seen": 174304368, + "step": 10833 + }, + { + "epoch": 0.7589026942306514, + "grad_norm": 4.346301078796387, + "learning_rate": 2.418097373029772e-05, + "loss": 1.0178, + "num_input_tokens_seen": 174320256, + "step": 10834 + }, + { + "epoch": 0.7589727424763806, + "grad_norm": 4.459402561187744, + "learning_rate": 2.4173975481611212e-05, + "loss": 1.2717, + "num_input_tokens_seen": 174335680, + "step": 10835 + }, + { + "epoch": 0.7590427907221099, + "grad_norm": 4.185834884643555, + "learning_rate": 2.416697723292469e-05, + "loss": 1.1265, + "num_input_tokens_seen": 174352064, + "step": 10836 + }, + { + "epoch": 0.7591128389678391, + "grad_norm": 4.23890495300293, + "learning_rate": 2.415997898423818e-05, + "loss": 1.1752, + "num_input_tokens_seen": 174368448, + "step": 10837 + }, + { + "epoch": 0.7591828872135683, + "grad_norm": 3.566504716873169, + "learning_rate": 2.4152980735551667e-05, + "loss": 0.8887, + "num_input_tokens_seen": 174384640, + "step": 10838 + }, + { + "epoch": 0.7592529354592976, + "grad_norm": 3.4228594303131104, + "learning_rate": 2.4145982486865146e-05, + "loss": 0.8998, + "num_input_tokens_seen": 174401024, + "step": 10839 + }, + { + "epoch": 0.7593229837050268, + "grad_norm": 3.7111098766326904, + "learning_rate": 2.4138984238178638e-05, + "loss": 1.0423, + "num_input_tokens_seen": 174417408, + "step": 10840 + }, + { + "epoch": 0.7593930319507561, + "grad_norm": 3.930331230163574, + "learning_rate": 2.4131985989492116e-05, + "loss": 1.1201, + "num_input_tokens_seen": 174433792, + "step": 10841 + }, + { + "epoch": 0.7594630801964853, + "grad_norm": 5.416530132293701, + "learning_rate": 2.4124987740805605e-05, + "loss": 1.271, + "num_input_tokens_seen": 174450160, + "step": 10842 + }, + { + "epoch": 0.7595331284422145, + "grad_norm": 4.794942855834961, + "learning_rate": 2.4117989492119093e-05, + "loss": 1.2907, + "num_input_tokens_seen": 174466544, + "step": 10843 + }, + { + "epoch": 0.7596031766879439, + "grad_norm": 3.8518733978271484, + "learning_rate": 2.4110991243432572e-05, + "loss": 0.9957, + "num_input_tokens_seen": 174482912, + "step": 10844 + }, + { + "epoch": 0.7596732249336731, + "grad_norm": 5.308676719665527, + "learning_rate": 2.4103992994746064e-05, + "loss": 1.0372, + "num_input_tokens_seen": 174498096, + "step": 10845 + }, + { + "epoch": 0.7597432731794023, + "grad_norm": 4.467613220214844, + "learning_rate": 2.4096994746059542e-05, + "loss": 0.8803, + "num_input_tokens_seen": 174514320, + "step": 10846 + }, + { + "epoch": 0.7598133214251316, + "grad_norm": 3.555307626724243, + "learning_rate": 2.408999649737303e-05, + "loss": 1.0849, + "num_input_tokens_seen": 174530704, + "step": 10847 + }, + { + "epoch": 0.7598833696708608, + "grad_norm": 6.008337497711182, + "learning_rate": 2.408299824868651e-05, + "loss": 0.9208, + "num_input_tokens_seen": 174547064, + "step": 10848 + }, + { + "epoch": 0.7599534179165901, + "grad_norm": 4.8769636154174805, + "learning_rate": 2.4075999999999998e-05, + "loss": 1.0604, + "num_input_tokens_seen": 174563368, + "step": 10849 + }, + { + "epoch": 0.7600234661623193, + "grad_norm": 4.796581268310547, + "learning_rate": 2.406900175131349e-05, + "loss": 0.9504, + "num_input_tokens_seen": 174579752, + "step": 10850 + }, + { + "epoch": 0.7600935144080485, + "grad_norm": 3.9061808586120605, + "learning_rate": 2.4062003502626968e-05, + "loss": 1.0074, + "num_input_tokens_seen": 174595512, + "step": 10851 + }, + { + "epoch": 0.7601635626537778, + "grad_norm": 4.698152542114258, + "learning_rate": 2.4055005253940457e-05, + "loss": 0.828, + "num_input_tokens_seen": 174611896, + "step": 10852 + }, + { + "epoch": 0.760233610899507, + "grad_norm": 3.6942756175994873, + "learning_rate": 2.4048007005253935e-05, + "loss": 0.9096, + "num_input_tokens_seen": 174627776, + "step": 10853 + }, + { + "epoch": 0.7603036591452362, + "grad_norm": 3.9673640727996826, + "learning_rate": 2.4041008756567424e-05, + "loss": 1.1105, + "num_input_tokens_seen": 174644160, + "step": 10854 + }, + { + "epoch": 0.7603737073909655, + "grad_norm": 4.190493583679199, + "learning_rate": 2.4034010507880916e-05, + "loss": 1.0197, + "num_input_tokens_seen": 174659080, + "step": 10855 + }, + { + "epoch": 0.7604437556366948, + "grad_norm": 4.390242576599121, + "learning_rate": 2.4027012259194394e-05, + "loss": 0.907, + "num_input_tokens_seen": 174674712, + "step": 10856 + }, + { + "epoch": 0.7605138038824241, + "grad_norm": 4.082337856292725, + "learning_rate": 2.4020014010507883e-05, + "loss": 1.0256, + "num_input_tokens_seen": 174691096, + "step": 10857 + }, + { + "epoch": 0.7605838521281533, + "grad_norm": 3.8295276165008545, + "learning_rate": 2.401301576182136e-05, + "loss": 0.9256, + "num_input_tokens_seen": 174707432, + "step": 10858 + }, + { + "epoch": 0.7606539003738825, + "grad_norm": 4.0778679847717285, + "learning_rate": 2.400601751313485e-05, + "loss": 1.2012, + "num_input_tokens_seen": 174723328, + "step": 10859 + }, + { + "epoch": 0.7607239486196118, + "grad_norm": 3.4296936988830566, + "learning_rate": 2.3999019264448342e-05, + "loss": 0.9987, + "num_input_tokens_seen": 174739528, + "step": 10860 + }, + { + "epoch": 0.760793996865341, + "grad_norm": 3.534679889678955, + "learning_rate": 2.399202101576182e-05, + "loss": 0.9243, + "num_input_tokens_seen": 174755728, + "step": 10861 + }, + { + "epoch": 0.7608640451110702, + "grad_norm": 5.9626288414001465, + "learning_rate": 2.398502276707531e-05, + "loss": 0.9154, + "num_input_tokens_seen": 174771336, + "step": 10862 + }, + { + "epoch": 0.7609340933567995, + "grad_norm": 3.7579596042633057, + "learning_rate": 2.3978024518388787e-05, + "loss": 1.062, + "num_input_tokens_seen": 174787720, + "step": 10863 + }, + { + "epoch": 0.7610041416025287, + "grad_norm": 3.9172112941741943, + "learning_rate": 2.3971026269702276e-05, + "loss": 0.9741, + "num_input_tokens_seen": 174804088, + "step": 10864 + }, + { + "epoch": 0.761074189848258, + "grad_norm": 4.5351386070251465, + "learning_rate": 2.3964028021015768e-05, + "loss": 0.981, + "num_input_tokens_seen": 174819248, + "step": 10865 + }, + { + "epoch": 0.7611442380939872, + "grad_norm": 5.079945087432861, + "learning_rate": 2.3957029772329246e-05, + "loss": 1.2276, + "num_input_tokens_seen": 174835512, + "step": 10866 + }, + { + "epoch": 0.7612142863397164, + "grad_norm": 5.498474597930908, + "learning_rate": 2.3950031523642735e-05, + "loss": 1.1228, + "num_input_tokens_seen": 174850776, + "step": 10867 + }, + { + "epoch": 0.7612843345854458, + "grad_norm": 3.717484712600708, + "learning_rate": 2.3943033274956213e-05, + "loss": 1.014, + "num_input_tokens_seen": 174866968, + "step": 10868 + }, + { + "epoch": 0.761354382831175, + "grad_norm": 3.489631414413452, + "learning_rate": 2.3936035026269702e-05, + "loss": 1.0056, + "num_input_tokens_seen": 174883352, + "step": 10869 + }, + { + "epoch": 0.7614244310769042, + "grad_norm": 4.097408771514893, + "learning_rate": 2.3929036777583194e-05, + "loss": 1.2541, + "num_input_tokens_seen": 174899064, + "step": 10870 + }, + { + "epoch": 0.7614944793226335, + "grad_norm": 3.422544240951538, + "learning_rate": 2.3922038528896672e-05, + "loss": 0.9816, + "num_input_tokens_seen": 174915448, + "step": 10871 + }, + { + "epoch": 0.7615645275683627, + "grad_norm": 3.7063145637512207, + "learning_rate": 2.391504028021016e-05, + "loss": 1.0127, + "num_input_tokens_seen": 174931832, + "step": 10872 + }, + { + "epoch": 0.761634575814092, + "grad_norm": 3.2933335304260254, + "learning_rate": 2.390804203152364e-05, + "loss": 0.8284, + "num_input_tokens_seen": 174948216, + "step": 10873 + }, + { + "epoch": 0.7617046240598212, + "grad_norm": 3.884831428527832, + "learning_rate": 2.3901043782837128e-05, + "loss": 0.8372, + "num_input_tokens_seen": 174962808, + "step": 10874 + }, + { + "epoch": 0.7617746723055504, + "grad_norm": 5.1886773109436035, + "learning_rate": 2.3894045534150606e-05, + "loss": 1.017, + "num_input_tokens_seen": 174979192, + "step": 10875 + }, + { + "epoch": 0.7618447205512797, + "grad_norm": 4.502350330352783, + "learning_rate": 2.38870472854641e-05, + "loss": 1.1775, + "num_input_tokens_seen": 174995408, + "step": 10876 + }, + { + "epoch": 0.7619147687970089, + "grad_norm": 4.571274280548096, + "learning_rate": 2.3880049036777587e-05, + "loss": 1.0156, + "num_input_tokens_seen": 175011792, + "step": 10877 + }, + { + "epoch": 0.7619848170427382, + "grad_norm": 3.6366827487945557, + "learning_rate": 2.3873050788091065e-05, + "loss": 1.0852, + "num_input_tokens_seen": 175028176, + "step": 10878 + }, + { + "epoch": 0.7620548652884674, + "grad_norm": 3.9528684616088867, + "learning_rate": 2.3866052539404554e-05, + "loss": 1.0043, + "num_input_tokens_seen": 175044264, + "step": 10879 + }, + { + "epoch": 0.7621249135341966, + "grad_norm": 4.14894437789917, + "learning_rate": 2.3859054290718032e-05, + "loss": 1.097, + "num_input_tokens_seen": 175060648, + "step": 10880 + }, + { + "epoch": 0.762194961779926, + "grad_norm": 4.631619453430176, + "learning_rate": 2.3852056042031524e-05, + "loss": 1.0653, + "num_input_tokens_seen": 175075664, + "step": 10881 + }, + { + "epoch": 0.7622650100256552, + "grad_norm": 3.883129835128784, + "learning_rate": 2.3845057793345013e-05, + "loss": 1.14, + "num_input_tokens_seen": 175091464, + "step": 10882 + }, + { + "epoch": 0.7623350582713844, + "grad_norm": 3.919706344604492, + "learning_rate": 2.383805954465849e-05, + "loss": 1.0074, + "num_input_tokens_seen": 175107848, + "step": 10883 + }, + { + "epoch": 0.7624051065171137, + "grad_norm": 4.015553951263428, + "learning_rate": 2.383106129597198e-05, + "loss": 1.1203, + "num_input_tokens_seen": 175124080, + "step": 10884 + }, + { + "epoch": 0.7624751547628429, + "grad_norm": 3.8675665855407715, + "learning_rate": 2.382406304728546e-05, + "loss": 1.1375, + "num_input_tokens_seen": 175140264, + "step": 10885 + }, + { + "epoch": 0.7625452030085722, + "grad_norm": 4.491522312164307, + "learning_rate": 2.381706479859895e-05, + "loss": 1.0429, + "num_input_tokens_seen": 175156648, + "step": 10886 + }, + { + "epoch": 0.7626152512543014, + "grad_norm": 3.524600028991699, + "learning_rate": 2.381006654991244e-05, + "loss": 1.0733, + "num_input_tokens_seen": 175173032, + "step": 10887 + }, + { + "epoch": 0.7626852995000306, + "grad_norm": 4.3140106201171875, + "learning_rate": 2.3803068301225917e-05, + "loss": 0.9641, + "num_input_tokens_seen": 175189416, + "step": 10888 + }, + { + "epoch": 0.7627553477457599, + "grad_norm": 3.734637498855591, + "learning_rate": 2.3796070052539406e-05, + "loss": 1.0189, + "num_input_tokens_seen": 175205800, + "step": 10889 + }, + { + "epoch": 0.7628253959914891, + "grad_norm": 4.976166725158691, + "learning_rate": 2.3789071803852884e-05, + "loss": 1.0808, + "num_input_tokens_seen": 175221592, + "step": 10890 + }, + { + "epoch": 0.7628954442372183, + "grad_norm": 3.521130084991455, + "learning_rate": 2.3782073555166376e-05, + "loss": 0.8699, + "num_input_tokens_seen": 175237536, + "step": 10891 + }, + { + "epoch": 0.7629654924829476, + "grad_norm": 4.5201640129089355, + "learning_rate": 2.3775075306479865e-05, + "loss": 1.0113, + "num_input_tokens_seen": 175253144, + "step": 10892 + }, + { + "epoch": 0.7630355407286769, + "grad_norm": 4.005626201629639, + "learning_rate": 2.3768077057793343e-05, + "loss": 0.9182, + "num_input_tokens_seen": 175268384, + "step": 10893 + }, + { + "epoch": 0.7631055889744062, + "grad_norm": 4.754047870635986, + "learning_rate": 2.3761078809106832e-05, + "loss": 1.0392, + "num_input_tokens_seen": 175284464, + "step": 10894 + }, + { + "epoch": 0.7631756372201354, + "grad_norm": 4.085216045379639, + "learning_rate": 2.375408056042031e-05, + "loss": 1.0309, + "num_input_tokens_seen": 175300848, + "step": 10895 + }, + { + "epoch": 0.7632456854658646, + "grad_norm": 3.820629596710205, + "learning_rate": 2.3747082311733802e-05, + "loss": 0.9596, + "num_input_tokens_seen": 175317136, + "step": 10896 + }, + { + "epoch": 0.7633157337115939, + "grad_norm": 4.2085347175598145, + "learning_rate": 2.374008406304729e-05, + "loss": 1.0951, + "num_input_tokens_seen": 175333520, + "step": 10897 + }, + { + "epoch": 0.7633857819573231, + "grad_norm": 4.95050573348999, + "learning_rate": 2.373308581436077e-05, + "loss": 1.0552, + "num_input_tokens_seen": 175349568, + "step": 10898 + }, + { + "epoch": 0.7634558302030523, + "grad_norm": 3.6052463054656982, + "learning_rate": 2.3726087565674258e-05, + "loss": 0.8943, + "num_input_tokens_seen": 175365952, + "step": 10899 + }, + { + "epoch": 0.7635258784487816, + "grad_norm": 4.774879455566406, + "learning_rate": 2.3719089316987736e-05, + "loss": 0.9766, + "num_input_tokens_seen": 175381840, + "step": 10900 + }, + { + "epoch": 0.7635959266945108, + "grad_norm": 4.303728103637695, + "learning_rate": 2.371209106830123e-05, + "loss": 1.0285, + "num_input_tokens_seen": 175398224, + "step": 10901 + }, + { + "epoch": 0.7636659749402401, + "grad_norm": 3.7838282585144043, + "learning_rate": 2.3705092819614707e-05, + "loss": 1.0188, + "num_input_tokens_seen": 175414072, + "step": 10902 + }, + { + "epoch": 0.7637360231859693, + "grad_norm": 4.05057954788208, + "learning_rate": 2.3698094570928195e-05, + "loss": 1.1078, + "num_input_tokens_seen": 175430456, + "step": 10903 + }, + { + "epoch": 0.7638060714316985, + "grad_norm": 7.066936492919922, + "learning_rate": 2.3691096322241684e-05, + "loss": 1.102, + "num_input_tokens_seen": 175446472, + "step": 10904 + }, + { + "epoch": 0.7638761196774279, + "grad_norm": 3.308185338973999, + "learning_rate": 2.3684098073555162e-05, + "loss": 0.8776, + "num_input_tokens_seen": 175462784, + "step": 10905 + }, + { + "epoch": 0.7639461679231571, + "grad_norm": 3.7892000675201416, + "learning_rate": 2.3677099824868654e-05, + "loss": 1.1141, + "num_input_tokens_seen": 175478040, + "step": 10906 + }, + { + "epoch": 0.7640162161688864, + "grad_norm": 4.590544700622559, + "learning_rate": 2.3670101576182133e-05, + "loss": 1.0282, + "num_input_tokens_seen": 175493888, + "step": 10907 + }, + { + "epoch": 0.7640862644146156, + "grad_norm": 6.460153579711914, + "learning_rate": 2.366310332749562e-05, + "loss": 0.8352, + "num_input_tokens_seen": 175510272, + "step": 10908 + }, + { + "epoch": 0.7641563126603448, + "grad_norm": 4.30807638168335, + "learning_rate": 2.365610507880911e-05, + "loss": 0.9714, + "num_input_tokens_seen": 175526656, + "step": 10909 + }, + { + "epoch": 0.7642263609060741, + "grad_norm": 3.388319253921509, + "learning_rate": 2.364910683012259e-05, + "loss": 0.879, + "num_input_tokens_seen": 175542504, + "step": 10910 + }, + { + "epoch": 0.7642964091518033, + "grad_norm": 4.062392234802246, + "learning_rate": 2.364210858143608e-05, + "loss": 0.8467, + "num_input_tokens_seen": 175558888, + "step": 10911 + }, + { + "epoch": 0.7643664573975325, + "grad_norm": 6.293281078338623, + "learning_rate": 2.363511033274956e-05, + "loss": 0.9193, + "num_input_tokens_seen": 175574504, + "step": 10912 + }, + { + "epoch": 0.7644365056432618, + "grad_norm": 4.105156421661377, + "learning_rate": 2.3628112084063047e-05, + "loss": 1.1285, + "num_input_tokens_seen": 175590416, + "step": 10913 + }, + { + "epoch": 0.764506553888991, + "grad_norm": 4.037670135498047, + "learning_rate": 2.3621113835376536e-05, + "loss": 0.9733, + "num_input_tokens_seen": 175606800, + "step": 10914 + }, + { + "epoch": 0.7645766021347203, + "grad_norm": 3.798976182937622, + "learning_rate": 2.3614115586690014e-05, + "loss": 0.9725, + "num_input_tokens_seen": 175623032, + "step": 10915 + }, + { + "epoch": 0.7646466503804495, + "grad_norm": 3.4292778968811035, + "learning_rate": 2.3607117338003503e-05, + "loss": 0.9248, + "num_input_tokens_seen": 175639112, + "step": 10916 + }, + { + "epoch": 0.7647166986261787, + "grad_norm": 4.909303665161133, + "learning_rate": 2.3600119089316985e-05, + "loss": 0.959, + "num_input_tokens_seen": 175655496, + "step": 10917 + }, + { + "epoch": 0.7647867468719081, + "grad_norm": 5.337521553039551, + "learning_rate": 2.3593120840630473e-05, + "loss": 1.2015, + "num_input_tokens_seen": 175671640, + "step": 10918 + }, + { + "epoch": 0.7648567951176373, + "grad_norm": 5.79332971572876, + "learning_rate": 2.3586122591943962e-05, + "loss": 1.0753, + "num_input_tokens_seen": 175688024, + "step": 10919 + }, + { + "epoch": 0.7649268433633665, + "grad_norm": 3.7936301231384277, + "learning_rate": 2.357912434325744e-05, + "loss": 1.1318, + "num_input_tokens_seen": 175704408, + "step": 10920 + }, + { + "epoch": 0.7649968916090958, + "grad_norm": 4.191605091094971, + "learning_rate": 2.357212609457093e-05, + "loss": 1.1497, + "num_input_tokens_seen": 175720728, + "step": 10921 + }, + { + "epoch": 0.765066939854825, + "grad_norm": 3.4121246337890625, + "learning_rate": 2.356512784588441e-05, + "loss": 0.9877, + "num_input_tokens_seen": 175737112, + "step": 10922 + }, + { + "epoch": 0.7651369881005543, + "grad_norm": 4.149606227874756, + "learning_rate": 2.35581295971979e-05, + "loss": 1.077, + "num_input_tokens_seen": 175753496, + "step": 10923 + }, + { + "epoch": 0.7652070363462835, + "grad_norm": 3.96986985206604, + "learning_rate": 2.3551131348511388e-05, + "loss": 0.9999, + "num_input_tokens_seen": 175769384, + "step": 10924 + }, + { + "epoch": 0.7652770845920127, + "grad_norm": 4.02364444732666, + "learning_rate": 2.3544133099824866e-05, + "loss": 1.1324, + "num_input_tokens_seen": 175785768, + "step": 10925 + }, + { + "epoch": 0.765347132837742, + "grad_norm": 3.7053544521331787, + "learning_rate": 2.3537134851138355e-05, + "loss": 1.0221, + "num_input_tokens_seen": 175802152, + "step": 10926 + }, + { + "epoch": 0.7654171810834712, + "grad_norm": 3.8683714866638184, + "learning_rate": 2.3530136602451837e-05, + "loss": 1.0399, + "num_input_tokens_seen": 175817632, + "step": 10927 + }, + { + "epoch": 0.7654872293292004, + "grad_norm": 3.568403959274292, + "learning_rate": 2.3523138353765325e-05, + "loss": 1.1004, + "num_input_tokens_seen": 175834016, + "step": 10928 + }, + { + "epoch": 0.7655572775749298, + "grad_norm": 3.6663753986358643, + "learning_rate": 2.3516140105078804e-05, + "loss": 1.0213, + "num_input_tokens_seen": 175850400, + "step": 10929 + }, + { + "epoch": 0.765627325820659, + "grad_norm": 3.825601816177368, + "learning_rate": 2.3509141856392292e-05, + "loss": 1.1246, + "num_input_tokens_seen": 175866784, + "step": 10930 + }, + { + "epoch": 0.7656973740663883, + "grad_norm": 5.494607448577881, + "learning_rate": 2.350214360770578e-05, + "loss": 1.1104, + "num_input_tokens_seen": 175881160, + "step": 10931 + }, + { + "epoch": 0.7657674223121175, + "grad_norm": 4.0407538414001465, + "learning_rate": 2.3495145359019263e-05, + "loss": 1.165, + "num_input_tokens_seen": 175897544, + "step": 10932 + }, + { + "epoch": 0.7658374705578467, + "grad_norm": 3.6634268760681152, + "learning_rate": 2.348814711033275e-05, + "loss": 1.0675, + "num_input_tokens_seen": 175913928, + "step": 10933 + }, + { + "epoch": 0.765907518803576, + "grad_norm": 4.5524492263793945, + "learning_rate": 2.348114886164623e-05, + "loss": 1.0328, + "num_input_tokens_seen": 175929360, + "step": 10934 + }, + { + "epoch": 0.7659775670493052, + "grad_norm": 4.216582775115967, + "learning_rate": 2.347415061295972e-05, + "loss": 1.037, + "num_input_tokens_seen": 175945520, + "step": 10935 + }, + { + "epoch": 0.7660476152950344, + "grad_norm": 4.1431379318237305, + "learning_rate": 2.3467152364273207e-05, + "loss": 0.9936, + "num_input_tokens_seen": 175961504, + "step": 10936 + }, + { + "epoch": 0.7661176635407637, + "grad_norm": 3.594326972961426, + "learning_rate": 2.3460154115586685e-05, + "loss": 0.9905, + "num_input_tokens_seen": 175976664, + "step": 10937 + }, + { + "epoch": 0.7661877117864929, + "grad_norm": 3.392112970352173, + "learning_rate": 2.3453155866900177e-05, + "loss": 0.9289, + "num_input_tokens_seen": 175993048, + "step": 10938 + }, + { + "epoch": 0.7662577600322222, + "grad_norm": 4.7548909187316895, + "learning_rate": 2.3446157618213656e-05, + "loss": 1.1439, + "num_input_tokens_seen": 176009432, + "step": 10939 + }, + { + "epoch": 0.7663278082779514, + "grad_norm": 4.433189868927002, + "learning_rate": 2.3439159369527144e-05, + "loss": 0.9154, + "num_input_tokens_seen": 176025816, + "step": 10940 + }, + { + "epoch": 0.7663978565236806, + "grad_norm": 4.065654754638672, + "learning_rate": 2.3432161120840633e-05, + "loss": 1.0214, + "num_input_tokens_seen": 176042200, + "step": 10941 + }, + { + "epoch": 0.76646790476941, + "grad_norm": 4.149123668670654, + "learning_rate": 2.342516287215411e-05, + "loss": 1.0638, + "num_input_tokens_seen": 176058256, + "step": 10942 + }, + { + "epoch": 0.7665379530151392, + "grad_norm": 4.360565662384033, + "learning_rate": 2.3418164623467603e-05, + "loss": 0.9893, + "num_input_tokens_seen": 176074144, + "step": 10943 + }, + { + "epoch": 0.7666080012608685, + "grad_norm": 3.967151641845703, + "learning_rate": 2.3411166374781082e-05, + "loss": 1.0952, + "num_input_tokens_seen": 176090112, + "step": 10944 + }, + { + "epoch": 0.7666780495065977, + "grad_norm": 4.118748664855957, + "learning_rate": 2.340416812609457e-05, + "loss": 0.9322, + "num_input_tokens_seen": 176106496, + "step": 10945 + }, + { + "epoch": 0.7667480977523269, + "grad_norm": 6.3276753425598145, + "learning_rate": 2.339716987740806e-05, + "loss": 0.8985, + "num_input_tokens_seen": 176122704, + "step": 10946 + }, + { + "epoch": 0.7668181459980562, + "grad_norm": 5.161777019500732, + "learning_rate": 2.3390171628721537e-05, + "loss": 1.0065, + "num_input_tokens_seen": 176138192, + "step": 10947 + }, + { + "epoch": 0.7668881942437854, + "grad_norm": 3.5885562896728516, + "learning_rate": 2.338317338003503e-05, + "loss": 1.0055, + "num_input_tokens_seen": 176154576, + "step": 10948 + }, + { + "epoch": 0.7669582424895146, + "grad_norm": 5.125253677368164, + "learning_rate": 2.3376175131348508e-05, + "loss": 0.9902, + "num_input_tokens_seen": 176170504, + "step": 10949 + }, + { + "epoch": 0.7670282907352439, + "grad_norm": 3.6423850059509277, + "learning_rate": 2.3369176882661996e-05, + "loss": 1.1144, + "num_input_tokens_seen": 176186888, + "step": 10950 + }, + { + "epoch": 0.7670983389809731, + "grad_norm": 4.082790851593018, + "learning_rate": 2.3362178633975485e-05, + "loss": 1.0866, + "num_input_tokens_seen": 176203272, + "step": 10951 + }, + { + "epoch": 0.7671683872267024, + "grad_norm": 3.837416648864746, + "learning_rate": 2.3355180385288964e-05, + "loss": 0.9698, + "num_input_tokens_seen": 176219656, + "step": 10952 + }, + { + "epoch": 0.7672384354724316, + "grad_norm": 3.817786931991577, + "learning_rate": 2.3348182136602455e-05, + "loss": 1.1364, + "num_input_tokens_seen": 176235584, + "step": 10953 + }, + { + "epoch": 0.7673084837181608, + "grad_norm": 6.18491792678833, + "learning_rate": 2.3341183887915934e-05, + "loss": 0.9112, + "num_input_tokens_seen": 176251616, + "step": 10954 + }, + { + "epoch": 0.7673785319638902, + "grad_norm": 4.2666401863098145, + "learning_rate": 2.3334185639229423e-05, + "loss": 1.0764, + "num_input_tokens_seen": 176267648, + "step": 10955 + }, + { + "epoch": 0.7674485802096194, + "grad_norm": 3.369572162628174, + "learning_rate": 2.33271873905429e-05, + "loss": 0.873, + "num_input_tokens_seen": 176284032, + "step": 10956 + }, + { + "epoch": 0.7675186284553486, + "grad_norm": 3.7854626178741455, + "learning_rate": 2.332018914185639e-05, + "loss": 1.2152, + "num_input_tokens_seen": 176300416, + "step": 10957 + }, + { + "epoch": 0.7675886767010779, + "grad_norm": 4.096500873565674, + "learning_rate": 2.331319089316988e-05, + "loss": 0.8921, + "num_input_tokens_seen": 176316032, + "step": 10958 + }, + { + "epoch": 0.7676587249468071, + "grad_norm": 9.417652130126953, + "learning_rate": 2.330619264448336e-05, + "loss": 1.1325, + "num_input_tokens_seen": 176329960, + "step": 10959 + }, + { + "epoch": 0.7677287731925364, + "grad_norm": 3.7446489334106445, + "learning_rate": 2.329919439579685e-05, + "loss": 0.9503, + "num_input_tokens_seen": 176345736, + "step": 10960 + }, + { + "epoch": 0.7677988214382656, + "grad_norm": 3.8959639072418213, + "learning_rate": 2.3292196147110327e-05, + "loss": 1.2024, + "num_input_tokens_seen": 176362040, + "step": 10961 + }, + { + "epoch": 0.7678688696839948, + "grad_norm": 4.6875691413879395, + "learning_rate": 2.3285197898423816e-05, + "loss": 0.8816, + "num_input_tokens_seen": 176378424, + "step": 10962 + }, + { + "epoch": 0.7679389179297241, + "grad_norm": 4.101948261260986, + "learning_rate": 2.3278199649737308e-05, + "loss": 1.2086, + "num_input_tokens_seen": 176394552, + "step": 10963 + }, + { + "epoch": 0.7680089661754533, + "grad_norm": 4.199262619018555, + "learning_rate": 2.3271201401050786e-05, + "loss": 1.0635, + "num_input_tokens_seen": 176410936, + "step": 10964 + }, + { + "epoch": 0.7680790144211825, + "grad_norm": 3.7464520931243896, + "learning_rate": 2.3264203152364275e-05, + "loss": 1.0705, + "num_input_tokens_seen": 176427320, + "step": 10965 + }, + { + "epoch": 0.7681490626669119, + "grad_norm": 3.9267444610595703, + "learning_rate": 2.3257204903677753e-05, + "loss": 1.1777, + "num_input_tokens_seen": 176443704, + "step": 10966 + }, + { + "epoch": 0.7682191109126411, + "grad_norm": 4.333981037139893, + "learning_rate": 2.325020665499124e-05, + "loss": 0.9345, + "num_input_tokens_seen": 176460088, + "step": 10967 + }, + { + "epoch": 0.7682891591583704, + "grad_norm": 3.815565347671509, + "learning_rate": 2.3243208406304734e-05, + "loss": 0.8651, + "num_input_tokens_seen": 176476088, + "step": 10968 + }, + { + "epoch": 0.7683592074040996, + "grad_norm": 4.132841110229492, + "learning_rate": 2.3236210157618212e-05, + "loss": 1.1412, + "num_input_tokens_seen": 176491480, + "step": 10969 + }, + { + "epoch": 0.7684292556498288, + "grad_norm": 3.620159149169922, + "learning_rate": 2.32292119089317e-05, + "loss": 0.9307, + "num_input_tokens_seen": 176507864, + "step": 10970 + }, + { + "epoch": 0.7684993038955581, + "grad_norm": 4.625896453857422, + "learning_rate": 2.322221366024518e-05, + "loss": 0.9208, + "num_input_tokens_seen": 176524248, + "step": 10971 + }, + { + "epoch": 0.7685693521412873, + "grad_norm": 3.7396461963653564, + "learning_rate": 2.3215215411558668e-05, + "loss": 1.0332, + "num_input_tokens_seen": 176540056, + "step": 10972 + }, + { + "epoch": 0.7686394003870165, + "grad_norm": 5.772690773010254, + "learning_rate": 2.320821716287216e-05, + "loss": 1.151, + "num_input_tokens_seen": 176556360, + "step": 10973 + }, + { + "epoch": 0.7687094486327458, + "grad_norm": 4.765915393829346, + "learning_rate": 2.3201218914185638e-05, + "loss": 1.1234, + "num_input_tokens_seen": 176572096, + "step": 10974 + }, + { + "epoch": 0.768779496878475, + "grad_norm": 3.7485620975494385, + "learning_rate": 2.3194220665499127e-05, + "loss": 1.1528, + "num_input_tokens_seen": 176587784, + "step": 10975 + }, + { + "epoch": 0.7688495451242043, + "grad_norm": 3.533437490463257, + "learning_rate": 2.3187222416812605e-05, + "loss": 1.1026, + "num_input_tokens_seen": 176604168, + "step": 10976 + }, + { + "epoch": 0.7689195933699335, + "grad_norm": 3.5751569271087646, + "learning_rate": 2.3180224168126094e-05, + "loss": 1.0064, + "num_input_tokens_seen": 176620552, + "step": 10977 + }, + { + "epoch": 0.7689896416156627, + "grad_norm": 5.081743240356445, + "learning_rate": 2.3173225919439586e-05, + "loss": 1.2081, + "num_input_tokens_seen": 176636936, + "step": 10978 + }, + { + "epoch": 0.7690596898613921, + "grad_norm": 3.95900297164917, + "learning_rate": 2.3166227670753064e-05, + "loss": 1.0193, + "num_input_tokens_seen": 176653128, + "step": 10979 + }, + { + "epoch": 0.7691297381071213, + "grad_norm": 3.596835136413574, + "learning_rate": 2.3159229422066553e-05, + "loss": 0.9351, + "num_input_tokens_seen": 176669512, + "step": 10980 + }, + { + "epoch": 0.7691997863528506, + "grad_norm": 3.747396469116211, + "learning_rate": 2.315223117338003e-05, + "loss": 1.0759, + "num_input_tokens_seen": 176685824, + "step": 10981 + }, + { + "epoch": 0.7692698345985798, + "grad_norm": 4.430510997772217, + "learning_rate": 2.314523292469352e-05, + "loss": 1.166, + "num_input_tokens_seen": 176701808, + "step": 10982 + }, + { + "epoch": 0.769339882844309, + "grad_norm": 3.7171995639801025, + "learning_rate": 2.3138234676006998e-05, + "loss": 1.109, + "num_input_tokens_seen": 176718192, + "step": 10983 + }, + { + "epoch": 0.7694099310900383, + "grad_norm": 4.377289295196533, + "learning_rate": 2.313123642732049e-05, + "loss": 1.0733, + "num_input_tokens_seen": 176733320, + "step": 10984 + }, + { + "epoch": 0.7694799793357675, + "grad_norm": 3.497921943664551, + "learning_rate": 2.312423817863398e-05, + "loss": 1.0091, + "num_input_tokens_seen": 176748888, + "step": 10985 + }, + { + "epoch": 0.7695500275814967, + "grad_norm": 4.382628440856934, + "learning_rate": 2.3117239929947457e-05, + "loss": 1.2198, + "num_input_tokens_seen": 176764224, + "step": 10986 + }, + { + "epoch": 0.769620075827226, + "grad_norm": 3.4535982608795166, + "learning_rate": 2.3110241681260946e-05, + "loss": 1.0184, + "num_input_tokens_seen": 176780408, + "step": 10987 + }, + { + "epoch": 0.7696901240729552, + "grad_norm": 4.277075290679932, + "learning_rate": 2.3103243432574424e-05, + "loss": 1.1277, + "num_input_tokens_seen": 176795176, + "step": 10988 + }, + { + "epoch": 0.7697601723186845, + "grad_norm": 6.811451435089111, + "learning_rate": 2.3096245183887916e-05, + "loss": 1.1644, + "num_input_tokens_seen": 176811560, + "step": 10989 + }, + { + "epoch": 0.7698302205644137, + "grad_norm": 4.666324615478516, + "learning_rate": 2.3089246935201405e-05, + "loss": 1.082, + "num_input_tokens_seen": 176827944, + "step": 10990 + }, + { + "epoch": 0.769900268810143, + "grad_norm": 3.647332191467285, + "learning_rate": 2.3082248686514883e-05, + "loss": 1.0947, + "num_input_tokens_seen": 176843808, + "step": 10991 + }, + { + "epoch": 0.7699703170558723, + "grad_norm": 3.65639591217041, + "learning_rate": 2.307525043782837e-05, + "loss": 0.9373, + "num_input_tokens_seen": 176860088, + "step": 10992 + }, + { + "epoch": 0.7700403653016015, + "grad_norm": 4.904346466064453, + "learning_rate": 2.306825218914185e-05, + "loss": 0.9482, + "num_input_tokens_seen": 176876256, + "step": 10993 + }, + { + "epoch": 0.7701104135473307, + "grad_norm": 4.203798294067383, + "learning_rate": 2.3061253940455342e-05, + "loss": 1.1913, + "num_input_tokens_seen": 176892152, + "step": 10994 + }, + { + "epoch": 0.77018046179306, + "grad_norm": 4.432934761047363, + "learning_rate": 2.305425569176883e-05, + "loss": 1.0471, + "num_input_tokens_seen": 176906808, + "step": 10995 + }, + { + "epoch": 0.7702505100387892, + "grad_norm": 4.936946868896484, + "learning_rate": 2.304725744308231e-05, + "loss": 1.0809, + "num_input_tokens_seen": 176921576, + "step": 10996 + }, + { + "epoch": 0.7703205582845185, + "grad_norm": 4.698544979095459, + "learning_rate": 2.3040259194395798e-05, + "loss": 1.1073, + "num_input_tokens_seen": 176937960, + "step": 10997 + }, + { + "epoch": 0.7703906065302477, + "grad_norm": 4.584660053253174, + "learning_rate": 2.3033260945709276e-05, + "loss": 1.068, + "num_input_tokens_seen": 176954344, + "step": 10998 + }, + { + "epoch": 0.7704606547759769, + "grad_norm": 3.440721273422241, + "learning_rate": 2.3026262697022768e-05, + "loss": 0.9593, + "num_input_tokens_seen": 176970728, + "step": 10999 + }, + { + "epoch": 0.7705307030217062, + "grad_norm": 3.762274980545044, + "learning_rate": 2.3019264448336257e-05, + "loss": 0.8176, + "num_input_tokens_seen": 176986320, + "step": 11000 + }, + { + "epoch": 0.7705307030217062, + "eval_loss": 1.115494728088379, + "eval_runtime": 0.1833, + "eval_samples_per_second": 5.455, + "eval_steps_per_second": 5.455, + "num_input_tokens_seen": 176986320, + "step": 11000 + }, + { + "epoch": 0.7706007512674354, + "grad_norm": 4.450917720794678, + "learning_rate": 2.3012266199649735e-05, + "loss": 1.1314, + "num_input_tokens_seen": 177002600, + "step": 11001 + }, + { + "epoch": 0.7706707995131646, + "grad_norm": 4.702643394470215, + "learning_rate": 2.3005267950963224e-05, + "loss": 1.0185, + "num_input_tokens_seen": 177017880, + "step": 11002 + }, + { + "epoch": 0.770740847758894, + "grad_norm": 3.704450845718384, + "learning_rate": 2.2998269702276702e-05, + "loss": 1.1464, + "num_input_tokens_seen": 177033896, + "step": 11003 + }, + { + "epoch": 0.7708108960046232, + "grad_norm": 3.834083080291748, + "learning_rate": 2.2991271453590194e-05, + "loss": 0.9217, + "num_input_tokens_seen": 177049584, + "step": 11004 + }, + { + "epoch": 0.7708809442503525, + "grad_norm": 5.260603427886963, + "learning_rate": 2.2984273204903683e-05, + "loss": 1.0564, + "num_input_tokens_seen": 177065504, + "step": 11005 + }, + { + "epoch": 0.7709509924960817, + "grad_norm": 4.318889617919922, + "learning_rate": 2.297727495621716e-05, + "loss": 1.0566, + "num_input_tokens_seen": 177081672, + "step": 11006 + }, + { + "epoch": 0.7710210407418109, + "grad_norm": 3.8280837535858154, + "learning_rate": 2.297027670753065e-05, + "loss": 1.0264, + "num_input_tokens_seen": 177097808, + "step": 11007 + }, + { + "epoch": 0.7710910889875402, + "grad_norm": 3.924556016921997, + "learning_rate": 2.2963278458844128e-05, + "loss": 1.1024, + "num_input_tokens_seen": 177114192, + "step": 11008 + }, + { + "epoch": 0.7711611372332694, + "grad_norm": 3.771416187286377, + "learning_rate": 2.295628021015762e-05, + "loss": 1.2015, + "num_input_tokens_seen": 177130264, + "step": 11009 + }, + { + "epoch": 0.7712311854789987, + "grad_norm": 4.259891510009766, + "learning_rate": 2.29492819614711e-05, + "loss": 1.0529, + "num_input_tokens_seen": 177146648, + "step": 11010 + }, + { + "epoch": 0.7713012337247279, + "grad_norm": 3.779045343399048, + "learning_rate": 2.2942283712784587e-05, + "loss": 0.9459, + "num_input_tokens_seen": 177162296, + "step": 11011 + }, + { + "epoch": 0.7713712819704571, + "grad_norm": 3.9357433319091797, + "learning_rate": 2.2935285464098076e-05, + "loss": 1.1298, + "num_input_tokens_seen": 177178680, + "step": 11012 + }, + { + "epoch": 0.7714413302161864, + "grad_norm": 4.12274694442749, + "learning_rate": 2.2928287215411554e-05, + "loss": 1.0558, + "num_input_tokens_seen": 177194224, + "step": 11013 + }, + { + "epoch": 0.7715113784619156, + "grad_norm": 4.758367538452148, + "learning_rate": 2.2921288966725046e-05, + "loss": 0.9735, + "num_input_tokens_seen": 177209912, + "step": 11014 + }, + { + "epoch": 0.7715814267076448, + "grad_norm": 4.080582618713379, + "learning_rate": 2.2914290718038524e-05, + "loss": 1.0397, + "num_input_tokens_seen": 177226104, + "step": 11015 + }, + { + "epoch": 0.7716514749533742, + "grad_norm": 6.203343868255615, + "learning_rate": 2.2907292469352013e-05, + "loss": 1.1659, + "num_input_tokens_seen": 177242488, + "step": 11016 + }, + { + "epoch": 0.7717215231991034, + "grad_norm": 4.42435884475708, + "learning_rate": 2.29002942206655e-05, + "loss": 0.9362, + "num_input_tokens_seen": 177258872, + "step": 11017 + }, + { + "epoch": 0.7717915714448327, + "grad_norm": 3.661391496658325, + "learning_rate": 2.289329597197898e-05, + "loss": 0.9878, + "num_input_tokens_seen": 177274784, + "step": 11018 + }, + { + "epoch": 0.7718616196905619, + "grad_norm": 3.793839931488037, + "learning_rate": 2.2886297723292472e-05, + "loss": 1.1428, + "num_input_tokens_seen": 177291168, + "step": 11019 + }, + { + "epoch": 0.7719316679362911, + "grad_norm": 5.829233169555664, + "learning_rate": 2.287929947460595e-05, + "loss": 1.2178, + "num_input_tokens_seen": 177306488, + "step": 11020 + }, + { + "epoch": 0.7720017161820204, + "grad_norm": 4.071543216705322, + "learning_rate": 2.287230122591944e-05, + "loss": 1.1912, + "num_input_tokens_seen": 177322872, + "step": 11021 + }, + { + "epoch": 0.7720717644277496, + "grad_norm": 4.263513565063477, + "learning_rate": 2.2865302977232928e-05, + "loss": 1.1436, + "num_input_tokens_seen": 177338928, + "step": 11022 + }, + { + "epoch": 0.7721418126734788, + "grad_norm": 3.701070547103882, + "learning_rate": 2.2858304728546406e-05, + "loss": 0.9002, + "num_input_tokens_seen": 177355200, + "step": 11023 + }, + { + "epoch": 0.7722118609192081, + "grad_norm": 4.204559803009033, + "learning_rate": 2.2851306479859898e-05, + "loss": 0.9633, + "num_input_tokens_seen": 177371560, + "step": 11024 + }, + { + "epoch": 0.7722819091649373, + "grad_norm": 5.5023627281188965, + "learning_rate": 2.2844308231173376e-05, + "loss": 1.1311, + "num_input_tokens_seen": 177387944, + "step": 11025 + }, + { + "epoch": 0.7723519574106666, + "grad_norm": 3.8904125690460205, + "learning_rate": 2.2837309982486865e-05, + "loss": 1.0775, + "num_input_tokens_seen": 177404328, + "step": 11026 + }, + { + "epoch": 0.7724220056563958, + "grad_norm": 5.065434455871582, + "learning_rate": 2.2830311733800354e-05, + "loss": 1.0124, + "num_input_tokens_seen": 177420624, + "step": 11027 + }, + { + "epoch": 0.772492053902125, + "grad_norm": 3.711656093597412, + "learning_rate": 2.2823313485113832e-05, + "loss": 0.9186, + "num_input_tokens_seen": 177436832, + "step": 11028 + }, + { + "epoch": 0.7725621021478544, + "grad_norm": 3.6835408210754395, + "learning_rate": 2.2816315236427324e-05, + "loss": 1.0708, + "num_input_tokens_seen": 177453184, + "step": 11029 + }, + { + "epoch": 0.7726321503935836, + "grad_norm": 4.245560169219971, + "learning_rate": 2.2809316987740803e-05, + "loss": 0.8995, + "num_input_tokens_seen": 177469256, + "step": 11030 + }, + { + "epoch": 0.7727021986393128, + "grad_norm": 3.8374834060668945, + "learning_rate": 2.280231873905429e-05, + "loss": 1.0472, + "num_input_tokens_seen": 177485592, + "step": 11031 + }, + { + "epoch": 0.7727722468850421, + "grad_norm": 4.166322231292725, + "learning_rate": 2.279532049036778e-05, + "loss": 1.0709, + "num_input_tokens_seen": 177501944, + "step": 11032 + }, + { + "epoch": 0.7728422951307713, + "grad_norm": 3.858267307281494, + "learning_rate": 2.2788322241681258e-05, + "loss": 1.1467, + "num_input_tokens_seen": 177518328, + "step": 11033 + }, + { + "epoch": 0.7729123433765006, + "grad_norm": 4.947542190551758, + "learning_rate": 2.278132399299475e-05, + "loss": 1.1679, + "num_input_tokens_seen": 177534624, + "step": 11034 + }, + { + "epoch": 0.7729823916222298, + "grad_norm": 3.4847195148468018, + "learning_rate": 2.277432574430823e-05, + "loss": 0.9058, + "num_input_tokens_seen": 177551008, + "step": 11035 + }, + { + "epoch": 0.773052439867959, + "grad_norm": 5.226019382476807, + "learning_rate": 2.2767327495621717e-05, + "loss": 1.1298, + "num_input_tokens_seen": 177567392, + "step": 11036 + }, + { + "epoch": 0.7731224881136883, + "grad_norm": 3.9027740955352783, + "learning_rate": 2.2760329246935196e-05, + "loss": 1.0184, + "num_input_tokens_seen": 177583472, + "step": 11037 + }, + { + "epoch": 0.7731925363594175, + "grad_norm": 4.359976291656494, + "learning_rate": 2.2753330998248684e-05, + "loss": 0.9496, + "num_input_tokens_seen": 177599856, + "step": 11038 + }, + { + "epoch": 0.7732625846051467, + "grad_norm": 4.629324913024902, + "learning_rate": 2.2746332749562176e-05, + "loss": 1.0986, + "num_input_tokens_seen": 177616208, + "step": 11039 + }, + { + "epoch": 0.7733326328508761, + "grad_norm": 3.466921329498291, + "learning_rate": 2.2739334500875655e-05, + "loss": 0.7733, + "num_input_tokens_seen": 177632464, + "step": 11040 + }, + { + "epoch": 0.7734026810966053, + "grad_norm": 4.756137371063232, + "learning_rate": 2.2732336252189143e-05, + "loss": 1.2632, + "num_input_tokens_seen": 177648848, + "step": 11041 + }, + { + "epoch": 0.7734727293423346, + "grad_norm": 3.735460042953491, + "learning_rate": 2.272533800350262e-05, + "loss": 0.7861, + "num_input_tokens_seen": 177665232, + "step": 11042 + }, + { + "epoch": 0.7735427775880638, + "grad_norm": 3.3415637016296387, + "learning_rate": 2.271833975481611e-05, + "loss": 0.8076, + "num_input_tokens_seen": 177681616, + "step": 11043 + }, + { + "epoch": 0.773612825833793, + "grad_norm": 3.9260153770446777, + "learning_rate": 2.2711341506129602e-05, + "loss": 1.1635, + "num_input_tokens_seen": 177698000, + "step": 11044 + }, + { + "epoch": 0.7736828740795223, + "grad_norm": 3.3624448776245117, + "learning_rate": 2.270434325744308e-05, + "loss": 0.9623, + "num_input_tokens_seen": 177714384, + "step": 11045 + }, + { + "epoch": 0.7737529223252515, + "grad_norm": 3.386521100997925, + "learning_rate": 2.269734500875657e-05, + "loss": 0.8836, + "num_input_tokens_seen": 177730632, + "step": 11046 + }, + { + "epoch": 0.7738229705709808, + "grad_norm": 3.912233352661133, + "learning_rate": 2.2690346760070048e-05, + "loss": 1.0458, + "num_input_tokens_seen": 177747016, + "step": 11047 + }, + { + "epoch": 0.77389301881671, + "grad_norm": 4.177756309509277, + "learning_rate": 2.2683348511383536e-05, + "loss": 0.9833, + "num_input_tokens_seen": 177763400, + "step": 11048 + }, + { + "epoch": 0.7739630670624392, + "grad_norm": 3.2670862674713135, + "learning_rate": 2.2676350262697028e-05, + "loss": 0.941, + "num_input_tokens_seen": 177778904, + "step": 11049 + }, + { + "epoch": 0.7740331153081685, + "grad_norm": 4.859564781188965, + "learning_rate": 2.2669352014010507e-05, + "loss": 1.1018, + "num_input_tokens_seen": 177794944, + "step": 11050 + }, + { + "epoch": 0.7741031635538977, + "grad_norm": 3.744669198989868, + "learning_rate": 2.2662353765323995e-05, + "loss": 0.967, + "num_input_tokens_seen": 177811240, + "step": 11051 + }, + { + "epoch": 0.774173211799627, + "grad_norm": 4.515638828277588, + "learning_rate": 2.2655355516637474e-05, + "loss": 1.0198, + "num_input_tokens_seen": 177827472, + "step": 11052 + }, + { + "epoch": 0.7742432600453563, + "grad_norm": 3.6140289306640625, + "learning_rate": 2.2648357267950962e-05, + "loss": 0.9566, + "num_input_tokens_seen": 177843336, + "step": 11053 + }, + { + "epoch": 0.7743133082910855, + "grad_norm": 3.7888247966766357, + "learning_rate": 2.2641359019264454e-05, + "loss": 1.102, + "num_input_tokens_seen": 177859624, + "step": 11054 + }, + { + "epoch": 0.7743833565368148, + "grad_norm": 3.5480916500091553, + "learning_rate": 2.2634360770577933e-05, + "loss": 1.0271, + "num_input_tokens_seen": 177875512, + "step": 11055 + }, + { + "epoch": 0.774453404782544, + "grad_norm": 3.606433629989624, + "learning_rate": 2.262736252189142e-05, + "loss": 1.0311, + "num_input_tokens_seen": 177891880, + "step": 11056 + }, + { + "epoch": 0.7745234530282732, + "grad_norm": 4.603903770446777, + "learning_rate": 2.26203642732049e-05, + "loss": 1.1401, + "num_input_tokens_seen": 177906952, + "step": 11057 + }, + { + "epoch": 0.7745935012740025, + "grad_norm": 3.6753571033477783, + "learning_rate": 2.2613366024518388e-05, + "loss": 1.1768, + "num_input_tokens_seen": 177923152, + "step": 11058 + }, + { + "epoch": 0.7746635495197317, + "grad_norm": 3.6185266971588135, + "learning_rate": 2.260636777583188e-05, + "loss": 0.9244, + "num_input_tokens_seen": 177939536, + "step": 11059 + }, + { + "epoch": 0.7747335977654609, + "grad_norm": 4.618793487548828, + "learning_rate": 2.259936952714536e-05, + "loss": 1.0394, + "num_input_tokens_seen": 177954272, + "step": 11060 + }, + { + "epoch": 0.7748036460111902, + "grad_norm": 4.139368534088135, + "learning_rate": 2.2592371278458847e-05, + "loss": 1.0821, + "num_input_tokens_seen": 177970656, + "step": 11061 + }, + { + "epoch": 0.7748736942569194, + "grad_norm": 3.5731890201568604, + "learning_rate": 2.2585373029772326e-05, + "loss": 1.0302, + "num_input_tokens_seen": 177987040, + "step": 11062 + }, + { + "epoch": 0.7749437425026487, + "grad_norm": 8.048038482666016, + "learning_rate": 2.2578374781085814e-05, + "loss": 1.1078, + "num_input_tokens_seen": 178003424, + "step": 11063 + }, + { + "epoch": 0.775013790748378, + "grad_norm": 3.8885443210601807, + "learning_rate": 2.2571376532399293e-05, + "loss": 1.1492, + "num_input_tokens_seen": 178018848, + "step": 11064 + }, + { + "epoch": 0.7750838389941072, + "grad_norm": 4.066343307495117, + "learning_rate": 2.2564378283712785e-05, + "loss": 1.2294, + "num_input_tokens_seen": 178035232, + "step": 11065 + }, + { + "epoch": 0.7751538872398365, + "grad_norm": 3.7317886352539062, + "learning_rate": 2.2557380035026273e-05, + "loss": 1.0826, + "num_input_tokens_seen": 178051616, + "step": 11066 + }, + { + "epoch": 0.7752239354855657, + "grad_norm": 4.256893157958984, + "learning_rate": 2.255038178633975e-05, + "loss": 1.1273, + "num_input_tokens_seen": 178067664, + "step": 11067 + }, + { + "epoch": 0.7752939837312949, + "grad_norm": 4.537008762359619, + "learning_rate": 2.254338353765324e-05, + "loss": 0.9344, + "num_input_tokens_seen": 178083536, + "step": 11068 + }, + { + "epoch": 0.7753640319770242, + "grad_norm": 4.971778869628906, + "learning_rate": 2.253638528896672e-05, + "loss": 1.0578, + "num_input_tokens_seen": 178099800, + "step": 11069 + }, + { + "epoch": 0.7754340802227534, + "grad_norm": 3.7119333744049072, + "learning_rate": 2.252938704028021e-05, + "loss": 1.1037, + "num_input_tokens_seen": 178116168, + "step": 11070 + }, + { + "epoch": 0.7755041284684827, + "grad_norm": 4.262823104858398, + "learning_rate": 2.25223887915937e-05, + "loss": 1.0534, + "num_input_tokens_seen": 178131864, + "step": 11071 + }, + { + "epoch": 0.7755741767142119, + "grad_norm": 3.8917129039764404, + "learning_rate": 2.2515390542907178e-05, + "loss": 1.0993, + "num_input_tokens_seen": 178148248, + "step": 11072 + }, + { + "epoch": 0.7756442249599411, + "grad_norm": 3.733821392059326, + "learning_rate": 2.2508392294220666e-05, + "loss": 0.895, + "num_input_tokens_seen": 178164336, + "step": 11073 + }, + { + "epoch": 0.7757142732056704, + "grad_norm": 5.182557106018066, + "learning_rate": 2.2501394045534145e-05, + "loss": 0.9354, + "num_input_tokens_seen": 178180720, + "step": 11074 + }, + { + "epoch": 0.7757843214513996, + "grad_norm": 3.80326771736145, + "learning_rate": 2.2494395796847637e-05, + "loss": 1.0529, + "num_input_tokens_seen": 178197104, + "step": 11075 + }, + { + "epoch": 0.7758543696971288, + "grad_norm": 3.9499120712280273, + "learning_rate": 2.2487397548161125e-05, + "loss": 0.9928, + "num_input_tokens_seen": 178213488, + "step": 11076 + }, + { + "epoch": 0.7759244179428582, + "grad_norm": 4.4923014640808105, + "learning_rate": 2.2480399299474604e-05, + "loss": 1.2847, + "num_input_tokens_seen": 178228648, + "step": 11077 + }, + { + "epoch": 0.7759944661885874, + "grad_norm": 4.1589202880859375, + "learning_rate": 2.2473401050788092e-05, + "loss": 1.1556, + "num_input_tokens_seen": 178243976, + "step": 11078 + }, + { + "epoch": 0.7760645144343167, + "grad_norm": 3.496601104736328, + "learning_rate": 2.246640280210157e-05, + "loss": 1.016, + "num_input_tokens_seen": 178260360, + "step": 11079 + }, + { + "epoch": 0.7761345626800459, + "grad_norm": 3.5509777069091797, + "learning_rate": 2.2459404553415063e-05, + "loss": 0.9848, + "num_input_tokens_seen": 178276744, + "step": 11080 + }, + { + "epoch": 0.7762046109257751, + "grad_norm": 4.304171085357666, + "learning_rate": 2.245240630472855e-05, + "loss": 1.1059, + "num_input_tokens_seen": 178293128, + "step": 11081 + }, + { + "epoch": 0.7762746591715044, + "grad_norm": 4.849355697631836, + "learning_rate": 2.244540805604203e-05, + "loss": 0.9175, + "num_input_tokens_seen": 178308856, + "step": 11082 + }, + { + "epoch": 0.7763447074172336, + "grad_norm": 3.8573389053344727, + "learning_rate": 2.2438409807355518e-05, + "loss": 1.105, + "num_input_tokens_seen": 178325240, + "step": 11083 + }, + { + "epoch": 0.7764147556629629, + "grad_norm": 5.154492378234863, + "learning_rate": 2.2431411558668997e-05, + "loss": 1.1206, + "num_input_tokens_seen": 178341032, + "step": 11084 + }, + { + "epoch": 0.7764848039086921, + "grad_norm": 4.064823627471924, + "learning_rate": 2.242441330998249e-05, + "loss": 1.2233, + "num_input_tokens_seen": 178356552, + "step": 11085 + }, + { + "epoch": 0.7765548521544213, + "grad_norm": 4.686689376831055, + "learning_rate": 2.2417415061295977e-05, + "loss": 1.0179, + "num_input_tokens_seen": 178372296, + "step": 11086 + }, + { + "epoch": 0.7766249004001506, + "grad_norm": 3.728402614593506, + "learning_rate": 2.2410416812609456e-05, + "loss": 1.0734, + "num_input_tokens_seen": 178388680, + "step": 11087 + }, + { + "epoch": 0.7766949486458798, + "grad_norm": 3.6905949115753174, + "learning_rate": 2.2403418563922944e-05, + "loss": 1.1887, + "num_input_tokens_seen": 178404744, + "step": 11088 + }, + { + "epoch": 0.776764996891609, + "grad_norm": 3.360192060470581, + "learning_rate": 2.2396420315236423e-05, + "loss": 0.999, + "num_input_tokens_seen": 178421128, + "step": 11089 + }, + { + "epoch": 0.7768350451373384, + "grad_norm": 3.650238513946533, + "learning_rate": 2.2389422066549915e-05, + "loss": 1.128, + "num_input_tokens_seen": 178437184, + "step": 11090 + }, + { + "epoch": 0.7769050933830676, + "grad_norm": 3.88167142868042, + "learning_rate": 2.2382423817863393e-05, + "loss": 1.1531, + "num_input_tokens_seen": 178452592, + "step": 11091 + }, + { + "epoch": 0.7769751416287969, + "grad_norm": 3.857001543045044, + "learning_rate": 2.237542556917688e-05, + "loss": 1.0737, + "num_input_tokens_seen": 178467952, + "step": 11092 + }, + { + "epoch": 0.7770451898745261, + "grad_norm": 3.7196145057678223, + "learning_rate": 2.236842732049037e-05, + "loss": 1.0974, + "num_input_tokens_seen": 178484336, + "step": 11093 + }, + { + "epoch": 0.7771152381202553, + "grad_norm": 4.762890815734863, + "learning_rate": 2.236142907180385e-05, + "loss": 1.0765, + "num_input_tokens_seen": 178499496, + "step": 11094 + }, + { + "epoch": 0.7771852863659846, + "grad_norm": 4.216147422790527, + "learning_rate": 2.235443082311734e-05, + "loss": 1.0206, + "num_input_tokens_seen": 178515880, + "step": 11095 + }, + { + "epoch": 0.7772553346117138, + "grad_norm": 3.626659870147705, + "learning_rate": 2.234743257443082e-05, + "loss": 1.0263, + "num_input_tokens_seen": 178532264, + "step": 11096 + }, + { + "epoch": 0.777325382857443, + "grad_norm": 4.0801897048950195, + "learning_rate": 2.2340434325744308e-05, + "loss": 1.1804, + "num_input_tokens_seen": 178548560, + "step": 11097 + }, + { + "epoch": 0.7773954311031723, + "grad_norm": 5.473814487457275, + "learning_rate": 2.2333436077057796e-05, + "loss": 1.0431, + "num_input_tokens_seen": 178564360, + "step": 11098 + }, + { + "epoch": 0.7774654793489015, + "grad_norm": 6.363767623901367, + "learning_rate": 2.2326437828371275e-05, + "loss": 1.0929, + "num_input_tokens_seen": 178579888, + "step": 11099 + }, + { + "epoch": 0.7775355275946308, + "grad_norm": 3.2703661918640137, + "learning_rate": 2.2319439579684767e-05, + "loss": 0.9722, + "num_input_tokens_seen": 178595872, + "step": 11100 + }, + { + "epoch": 0.77760557584036, + "grad_norm": 3.642777919769287, + "learning_rate": 2.2312441330998245e-05, + "loss": 1.1422, + "num_input_tokens_seen": 178612256, + "step": 11101 + }, + { + "epoch": 0.7776756240860893, + "grad_norm": 5.101443767547607, + "learning_rate": 2.2305443082311734e-05, + "loss": 1.1866, + "num_input_tokens_seen": 178628512, + "step": 11102 + }, + { + "epoch": 0.7777456723318186, + "grad_norm": 3.7501959800720215, + "learning_rate": 2.2298444833625222e-05, + "loss": 1.0299, + "num_input_tokens_seen": 178644080, + "step": 11103 + }, + { + "epoch": 0.7778157205775478, + "grad_norm": 4.421937465667725, + "learning_rate": 2.22914465849387e-05, + "loss": 1.1835, + "num_input_tokens_seen": 178660192, + "step": 11104 + }, + { + "epoch": 0.777885768823277, + "grad_norm": 4.5925493240356445, + "learning_rate": 2.2284448336252193e-05, + "loss": 1.0908, + "num_input_tokens_seen": 178676488, + "step": 11105 + }, + { + "epoch": 0.7779558170690063, + "grad_norm": 4.042560577392578, + "learning_rate": 2.227745008756567e-05, + "loss": 1.0841, + "num_input_tokens_seen": 178692376, + "step": 11106 + }, + { + "epoch": 0.7780258653147355, + "grad_norm": 3.441215991973877, + "learning_rate": 2.227045183887916e-05, + "loss": 0.9225, + "num_input_tokens_seen": 178708288, + "step": 11107 + }, + { + "epoch": 0.7780959135604648, + "grad_norm": 3.469501256942749, + "learning_rate": 2.2263453590192648e-05, + "loss": 0.8682, + "num_input_tokens_seen": 178724672, + "step": 11108 + }, + { + "epoch": 0.778165961806194, + "grad_norm": 3.389233350753784, + "learning_rate": 2.2256455341506127e-05, + "loss": 1.0063, + "num_input_tokens_seen": 178741056, + "step": 11109 + }, + { + "epoch": 0.7782360100519232, + "grad_norm": 3.8499207496643066, + "learning_rate": 2.224945709281962e-05, + "loss": 0.9342, + "num_input_tokens_seen": 178757440, + "step": 11110 + }, + { + "epoch": 0.7783060582976525, + "grad_norm": 3.4075815677642822, + "learning_rate": 2.2242458844133097e-05, + "loss": 0.8925, + "num_input_tokens_seen": 178773736, + "step": 11111 + }, + { + "epoch": 0.7783761065433817, + "grad_norm": 6.01906156539917, + "learning_rate": 2.2235460595446586e-05, + "loss": 1.0487, + "num_input_tokens_seen": 178790120, + "step": 11112 + }, + { + "epoch": 0.7784461547891111, + "grad_norm": 4.80954122543335, + "learning_rate": 2.2228462346760074e-05, + "loss": 1.186, + "num_input_tokens_seen": 178805936, + "step": 11113 + }, + { + "epoch": 0.7785162030348403, + "grad_norm": 4.8050432205200195, + "learning_rate": 2.2221464098073553e-05, + "loss": 0.8957, + "num_input_tokens_seen": 178821024, + "step": 11114 + }, + { + "epoch": 0.7785862512805695, + "grad_norm": 3.6945126056671143, + "learning_rate": 2.2214465849387045e-05, + "loss": 1.0561, + "num_input_tokens_seen": 178837008, + "step": 11115 + }, + { + "epoch": 0.7786562995262988, + "grad_norm": 4.055240631103516, + "learning_rate": 2.2207467600700523e-05, + "loss": 1.0659, + "num_input_tokens_seen": 178853048, + "step": 11116 + }, + { + "epoch": 0.778726347772028, + "grad_norm": 5.597837448120117, + "learning_rate": 2.2200469352014012e-05, + "loss": 1.0019, + "num_input_tokens_seen": 178869432, + "step": 11117 + }, + { + "epoch": 0.7787963960177572, + "grad_norm": 3.767123222351074, + "learning_rate": 2.219347110332749e-05, + "loss": 1.054, + "num_input_tokens_seen": 178885816, + "step": 11118 + }, + { + "epoch": 0.7788664442634865, + "grad_norm": 3.965877056121826, + "learning_rate": 2.218647285464098e-05, + "loss": 0.9653, + "num_input_tokens_seen": 178902200, + "step": 11119 + }, + { + "epoch": 0.7789364925092157, + "grad_norm": 3.781658887863159, + "learning_rate": 2.217947460595447e-05, + "loss": 1.0831, + "num_input_tokens_seen": 178918584, + "step": 11120 + }, + { + "epoch": 0.779006540754945, + "grad_norm": 4.278552532196045, + "learning_rate": 2.217247635726795e-05, + "loss": 1.0968, + "num_input_tokens_seen": 178934576, + "step": 11121 + }, + { + "epoch": 0.7790765890006742, + "grad_norm": 5.066880702972412, + "learning_rate": 2.2165478108581438e-05, + "loss": 1.1125, + "num_input_tokens_seen": 178950784, + "step": 11122 + }, + { + "epoch": 0.7791466372464034, + "grad_norm": 3.3851606845855713, + "learning_rate": 2.2158479859894916e-05, + "loss": 0.9323, + "num_input_tokens_seen": 178967168, + "step": 11123 + }, + { + "epoch": 0.7792166854921327, + "grad_norm": 4.09178352355957, + "learning_rate": 2.2151481611208405e-05, + "loss": 1.1745, + "num_input_tokens_seen": 178983552, + "step": 11124 + }, + { + "epoch": 0.779286733737862, + "grad_norm": 3.8570780754089355, + "learning_rate": 2.2144483362521897e-05, + "loss": 1.0276, + "num_input_tokens_seen": 178999936, + "step": 11125 + }, + { + "epoch": 0.7793567819835912, + "grad_norm": 3.8036327362060547, + "learning_rate": 2.2137485113835375e-05, + "loss": 1.0696, + "num_input_tokens_seen": 179016320, + "step": 11126 + }, + { + "epoch": 0.7794268302293205, + "grad_norm": 3.749253749847412, + "learning_rate": 2.2130486865148864e-05, + "loss": 1.0895, + "num_input_tokens_seen": 179032704, + "step": 11127 + }, + { + "epoch": 0.7794968784750497, + "grad_norm": 3.4970247745513916, + "learning_rate": 2.2123488616462342e-05, + "loss": 1.0821, + "num_input_tokens_seen": 179049088, + "step": 11128 + }, + { + "epoch": 0.779566926720779, + "grad_norm": 4.560108184814453, + "learning_rate": 2.211649036777583e-05, + "loss": 0.881, + "num_input_tokens_seen": 179065216, + "step": 11129 + }, + { + "epoch": 0.7796369749665082, + "grad_norm": 4.630332946777344, + "learning_rate": 2.2109492119089323e-05, + "loss": 1.1236, + "num_input_tokens_seen": 179081456, + "step": 11130 + }, + { + "epoch": 0.7797070232122374, + "grad_norm": 4.403186798095703, + "learning_rate": 2.21024938704028e-05, + "loss": 1.0367, + "num_input_tokens_seen": 179097464, + "step": 11131 + }, + { + "epoch": 0.7797770714579667, + "grad_norm": 4.453989505767822, + "learning_rate": 2.209549562171629e-05, + "loss": 1.0915, + "num_input_tokens_seen": 179113848, + "step": 11132 + }, + { + "epoch": 0.7798471197036959, + "grad_norm": 3.8054792881011963, + "learning_rate": 2.2088497373029768e-05, + "loss": 0.9978, + "num_input_tokens_seen": 179129096, + "step": 11133 + }, + { + "epoch": 0.7799171679494251, + "grad_norm": 3.9605648517608643, + "learning_rate": 2.2081499124343257e-05, + "loss": 0.9003, + "num_input_tokens_seen": 179145232, + "step": 11134 + }, + { + "epoch": 0.7799872161951544, + "grad_norm": 3.8227620124816895, + "learning_rate": 2.207450087565675e-05, + "loss": 0.9717, + "num_input_tokens_seen": 179160936, + "step": 11135 + }, + { + "epoch": 0.7800572644408836, + "grad_norm": 4.243280410766602, + "learning_rate": 2.2067502626970227e-05, + "loss": 1.2835, + "num_input_tokens_seen": 179177320, + "step": 11136 + }, + { + "epoch": 0.780127312686613, + "grad_norm": 4.412675857543945, + "learning_rate": 2.2060504378283716e-05, + "loss": 1.1201, + "num_input_tokens_seen": 179193704, + "step": 11137 + }, + { + "epoch": 0.7801973609323422, + "grad_norm": 3.889578342437744, + "learning_rate": 2.2053506129597194e-05, + "loss": 1.1438, + "num_input_tokens_seen": 179209656, + "step": 11138 + }, + { + "epoch": 0.7802674091780714, + "grad_norm": 3.8184735774993896, + "learning_rate": 2.2046507880910683e-05, + "loss": 0.9368, + "num_input_tokens_seen": 179225536, + "step": 11139 + }, + { + "epoch": 0.7803374574238007, + "grad_norm": 3.7255520820617676, + "learning_rate": 2.203950963222416e-05, + "loss": 1.0922, + "num_input_tokens_seen": 179241680, + "step": 11140 + }, + { + "epoch": 0.7804075056695299, + "grad_norm": 4.123396396636963, + "learning_rate": 2.2032511383537653e-05, + "loss": 1.1033, + "num_input_tokens_seen": 179257720, + "step": 11141 + }, + { + "epoch": 0.7804775539152591, + "grad_norm": 3.8988237380981445, + "learning_rate": 2.2025513134851142e-05, + "loss": 1.1302, + "num_input_tokens_seen": 179273600, + "step": 11142 + }, + { + "epoch": 0.7805476021609884, + "grad_norm": 3.654130220413208, + "learning_rate": 2.201851488616462e-05, + "loss": 0.9274, + "num_input_tokens_seen": 179289328, + "step": 11143 + }, + { + "epoch": 0.7806176504067176, + "grad_norm": 3.6523854732513428, + "learning_rate": 2.201151663747811e-05, + "loss": 1.022, + "num_input_tokens_seen": 179305712, + "step": 11144 + }, + { + "epoch": 0.7806876986524469, + "grad_norm": 3.626288652420044, + "learning_rate": 2.2004518388791587e-05, + "loss": 0.8051, + "num_input_tokens_seen": 179320880, + "step": 11145 + }, + { + "epoch": 0.7807577468981761, + "grad_norm": 3.7162528038024902, + "learning_rate": 2.199752014010508e-05, + "loss": 0.9319, + "num_input_tokens_seen": 179336648, + "step": 11146 + }, + { + "epoch": 0.7808277951439053, + "grad_norm": 3.375903606414795, + "learning_rate": 2.1990521891418568e-05, + "loss": 0.9605, + "num_input_tokens_seen": 179353032, + "step": 11147 + }, + { + "epoch": 0.7808978433896346, + "grad_norm": 4.131130218505859, + "learning_rate": 2.1983523642732046e-05, + "loss": 1.1105, + "num_input_tokens_seen": 179368264, + "step": 11148 + }, + { + "epoch": 0.7809678916353638, + "grad_norm": 4.54514741897583, + "learning_rate": 2.1976525394045535e-05, + "loss": 1.0131, + "num_input_tokens_seen": 179384648, + "step": 11149 + }, + { + "epoch": 0.7810379398810932, + "grad_norm": 4.0040154457092285, + "learning_rate": 2.1969527145359013e-05, + "loss": 1.0012, + "num_input_tokens_seen": 179399616, + "step": 11150 + }, + { + "epoch": 0.7811079881268224, + "grad_norm": 3.6492838859558105, + "learning_rate": 2.1962528896672505e-05, + "loss": 0.8699, + "num_input_tokens_seen": 179415608, + "step": 11151 + }, + { + "epoch": 0.7811780363725516, + "grad_norm": 3.7898054122924805, + "learning_rate": 2.1955530647985994e-05, + "loss": 1.0518, + "num_input_tokens_seen": 179431992, + "step": 11152 + }, + { + "epoch": 0.7812480846182809, + "grad_norm": 3.7069926261901855, + "learning_rate": 2.1948532399299472e-05, + "loss": 0.8825, + "num_input_tokens_seen": 179448000, + "step": 11153 + }, + { + "epoch": 0.7813181328640101, + "grad_norm": 3.597917079925537, + "learning_rate": 2.194153415061296e-05, + "loss": 1.0129, + "num_input_tokens_seen": 179464384, + "step": 11154 + }, + { + "epoch": 0.7813881811097393, + "grad_norm": 4.146019458770752, + "learning_rate": 2.193453590192644e-05, + "loss": 0.9118, + "num_input_tokens_seen": 179480320, + "step": 11155 + }, + { + "epoch": 0.7814582293554686, + "grad_norm": 4.128831386566162, + "learning_rate": 2.192753765323993e-05, + "loss": 1.0328, + "num_input_tokens_seen": 179496536, + "step": 11156 + }, + { + "epoch": 0.7815282776011978, + "grad_norm": 3.5229671001434326, + "learning_rate": 2.192053940455342e-05, + "loss": 0.937, + "num_input_tokens_seen": 179512864, + "step": 11157 + }, + { + "epoch": 0.7815983258469271, + "grad_norm": 5.65916109085083, + "learning_rate": 2.1913541155866898e-05, + "loss": 1.0107, + "num_input_tokens_seen": 179529248, + "step": 11158 + }, + { + "epoch": 0.7816683740926563, + "grad_norm": 4.523099422454834, + "learning_rate": 2.1906542907180387e-05, + "loss": 1.2558, + "num_input_tokens_seen": 179545632, + "step": 11159 + }, + { + "epoch": 0.7817384223383855, + "grad_norm": 3.654788017272949, + "learning_rate": 2.1899544658493865e-05, + "loss": 0.9321, + "num_input_tokens_seen": 179562016, + "step": 11160 + }, + { + "epoch": 0.7818084705841148, + "grad_norm": 5.122577667236328, + "learning_rate": 2.1892546409807357e-05, + "loss": 1.1264, + "num_input_tokens_seen": 179578400, + "step": 11161 + }, + { + "epoch": 0.781878518829844, + "grad_norm": 3.730133295059204, + "learning_rate": 2.1885548161120846e-05, + "loss": 1.0509, + "num_input_tokens_seen": 179593928, + "step": 11162 + }, + { + "epoch": 0.7819485670755733, + "grad_norm": 5.275971412658691, + "learning_rate": 2.1878549912434324e-05, + "loss": 1.1848, + "num_input_tokens_seen": 179609840, + "step": 11163 + }, + { + "epoch": 0.7820186153213026, + "grad_norm": 3.608773946762085, + "learning_rate": 2.1871551663747813e-05, + "loss": 1.0714, + "num_input_tokens_seen": 179625536, + "step": 11164 + }, + { + "epoch": 0.7820886635670318, + "grad_norm": 4.253992080688477, + "learning_rate": 2.186455341506129e-05, + "loss": 0.9543, + "num_input_tokens_seen": 179641920, + "step": 11165 + }, + { + "epoch": 0.7821587118127611, + "grad_norm": 3.8503220081329346, + "learning_rate": 2.1857555166374783e-05, + "loss": 1.1, + "num_input_tokens_seen": 179658304, + "step": 11166 + }, + { + "epoch": 0.7822287600584903, + "grad_norm": 4.298995494842529, + "learning_rate": 2.185055691768826e-05, + "loss": 1.2346, + "num_input_tokens_seen": 179674688, + "step": 11167 + }, + { + "epoch": 0.7822988083042195, + "grad_norm": 4.771897792816162, + "learning_rate": 2.184355866900175e-05, + "loss": 1.0468, + "num_input_tokens_seen": 179690488, + "step": 11168 + }, + { + "epoch": 0.7823688565499488, + "grad_norm": 3.6129636764526367, + "learning_rate": 2.183656042031524e-05, + "loss": 1.1117, + "num_input_tokens_seen": 179706872, + "step": 11169 + }, + { + "epoch": 0.782438904795678, + "grad_norm": 3.4395971298217773, + "learning_rate": 2.1829562171628717e-05, + "loss": 0.8858, + "num_input_tokens_seen": 179723256, + "step": 11170 + }, + { + "epoch": 0.7825089530414072, + "grad_norm": 3.811906337738037, + "learning_rate": 2.182256392294221e-05, + "loss": 1.0964, + "num_input_tokens_seen": 179739344, + "step": 11171 + }, + { + "epoch": 0.7825790012871365, + "grad_norm": 3.708089828491211, + "learning_rate": 2.1815565674255688e-05, + "loss": 0.9984, + "num_input_tokens_seen": 179754752, + "step": 11172 + }, + { + "epoch": 0.7826490495328657, + "grad_norm": 4.111413478851318, + "learning_rate": 2.1808567425569176e-05, + "loss": 1.0622, + "num_input_tokens_seen": 179770832, + "step": 11173 + }, + { + "epoch": 0.782719097778595, + "grad_norm": 5.688427925109863, + "learning_rate": 2.1801569176882665e-05, + "loss": 0.9494, + "num_input_tokens_seen": 179786608, + "step": 11174 + }, + { + "epoch": 0.7827891460243243, + "grad_norm": 4.156343936920166, + "learning_rate": 2.1794570928196143e-05, + "loss": 1.0549, + "num_input_tokens_seen": 179802992, + "step": 11175 + }, + { + "epoch": 0.7828591942700535, + "grad_norm": 4.151462554931641, + "learning_rate": 2.1787572679509635e-05, + "loss": 1.0392, + "num_input_tokens_seen": 179819376, + "step": 11176 + }, + { + "epoch": 0.7829292425157828, + "grad_norm": 6.055819034576416, + "learning_rate": 2.1780574430823114e-05, + "loss": 1.1437, + "num_input_tokens_seen": 179835760, + "step": 11177 + }, + { + "epoch": 0.782999290761512, + "grad_norm": 3.6185219287872314, + "learning_rate": 2.1773576182136602e-05, + "loss": 0.9914, + "num_input_tokens_seen": 179852144, + "step": 11178 + }, + { + "epoch": 0.7830693390072412, + "grad_norm": 4.237154960632324, + "learning_rate": 2.176657793345009e-05, + "loss": 1.0547, + "num_input_tokens_seen": 179867776, + "step": 11179 + }, + { + "epoch": 0.7831393872529705, + "grad_norm": 3.831233263015747, + "learning_rate": 2.175957968476357e-05, + "loss": 0.8952, + "num_input_tokens_seen": 179884160, + "step": 11180 + }, + { + "epoch": 0.7832094354986997, + "grad_norm": 5.775389194488525, + "learning_rate": 2.175258143607706e-05, + "loss": 1.1499, + "num_input_tokens_seen": 179900312, + "step": 11181 + }, + { + "epoch": 0.783279483744429, + "grad_norm": 3.97485613822937, + "learning_rate": 2.174558318739054e-05, + "loss": 1.0049, + "num_input_tokens_seen": 179916696, + "step": 11182 + }, + { + "epoch": 0.7833495319901582, + "grad_norm": 3.626641035079956, + "learning_rate": 2.1738584938704028e-05, + "loss": 0.8712, + "num_input_tokens_seen": 179933080, + "step": 11183 + }, + { + "epoch": 0.7834195802358874, + "grad_norm": 3.821401596069336, + "learning_rate": 2.1731586690017517e-05, + "loss": 1.162, + "num_input_tokens_seen": 179948552, + "step": 11184 + }, + { + "epoch": 0.7834896284816167, + "grad_norm": 3.838019609451294, + "learning_rate": 2.1724588441330995e-05, + "loss": 0.8636, + "num_input_tokens_seen": 179964936, + "step": 11185 + }, + { + "epoch": 0.7835596767273459, + "grad_norm": 3.678387403488159, + "learning_rate": 2.1717590192644487e-05, + "loss": 0.8991, + "num_input_tokens_seen": 179980536, + "step": 11186 + }, + { + "epoch": 0.7836297249730753, + "grad_norm": 3.8727855682373047, + "learning_rate": 2.1710591943957966e-05, + "loss": 1.1796, + "num_input_tokens_seen": 179996384, + "step": 11187 + }, + { + "epoch": 0.7836997732188045, + "grad_norm": 4.235408306121826, + "learning_rate": 2.1703593695271454e-05, + "loss": 1.2263, + "num_input_tokens_seen": 180012736, + "step": 11188 + }, + { + "epoch": 0.7837698214645337, + "grad_norm": 3.6635186672210693, + "learning_rate": 2.1696595446584943e-05, + "loss": 1.0274, + "num_input_tokens_seen": 180029120, + "step": 11189 + }, + { + "epoch": 0.783839869710263, + "grad_norm": 3.868584394454956, + "learning_rate": 2.168959719789842e-05, + "loss": 1.1493, + "num_input_tokens_seen": 180044768, + "step": 11190 + }, + { + "epoch": 0.7839099179559922, + "grad_norm": 3.6237564086914062, + "learning_rate": 2.1682598949211913e-05, + "loss": 0.8888, + "num_input_tokens_seen": 180061024, + "step": 11191 + }, + { + "epoch": 0.7839799662017214, + "grad_norm": 4.136575698852539, + "learning_rate": 2.1675600700525392e-05, + "loss": 0.8805, + "num_input_tokens_seen": 180077408, + "step": 11192 + }, + { + "epoch": 0.7840500144474507, + "grad_norm": 3.5634124279022217, + "learning_rate": 2.166860245183888e-05, + "loss": 0.9457, + "num_input_tokens_seen": 180093792, + "step": 11193 + }, + { + "epoch": 0.7841200626931799, + "grad_norm": 3.8136534690856934, + "learning_rate": 2.166160420315236e-05, + "loss": 0.9846, + "num_input_tokens_seen": 180110176, + "step": 11194 + }, + { + "epoch": 0.7841901109389092, + "grad_norm": 3.8151447772979736, + "learning_rate": 2.1654605954465847e-05, + "loss": 0.9512, + "num_input_tokens_seen": 180126560, + "step": 11195 + }, + { + "epoch": 0.7842601591846384, + "grad_norm": 4.109495639801025, + "learning_rate": 2.164760770577934e-05, + "loss": 1.1096, + "num_input_tokens_seen": 180142456, + "step": 11196 + }, + { + "epoch": 0.7843302074303676, + "grad_norm": 3.628901481628418, + "learning_rate": 2.1640609457092818e-05, + "loss": 1.0511, + "num_input_tokens_seen": 180158840, + "step": 11197 + }, + { + "epoch": 0.784400255676097, + "grad_norm": 3.702497720718384, + "learning_rate": 2.1633611208406306e-05, + "loss": 0.9646, + "num_input_tokens_seen": 180175056, + "step": 11198 + }, + { + "epoch": 0.7844703039218262, + "grad_norm": 3.5293333530426025, + "learning_rate": 2.1626612959719785e-05, + "loss": 1.0095, + "num_input_tokens_seen": 180191200, + "step": 11199 + }, + { + "epoch": 0.7845403521675554, + "grad_norm": 3.7145540714263916, + "learning_rate": 2.1619614711033273e-05, + "loss": 1.0218, + "num_input_tokens_seen": 180207368, + "step": 11200 + }, + { + "epoch": 0.7845403521675554, + "eval_loss": 1.1162842512130737, + "eval_runtime": 0.2077, + "eval_samples_per_second": 4.814, + "eval_steps_per_second": 4.814, + "num_input_tokens_seen": 180207368, + "step": 11200 + }, + { + "epoch": 0.7846104004132847, + "grad_norm": 3.517585277557373, + "learning_rate": 2.1612616462346762e-05, + "loss": 0.8385, + "num_input_tokens_seen": 180223632, + "step": 11201 + }, + { + "epoch": 0.7846804486590139, + "grad_norm": 4.2850565910339355, + "learning_rate": 2.1605618213660244e-05, + "loss": 0.995, + "num_input_tokens_seen": 180240016, + "step": 11202 + }, + { + "epoch": 0.7847504969047432, + "grad_norm": 4.445461273193359, + "learning_rate": 2.1598619964973732e-05, + "loss": 1.2609, + "num_input_tokens_seen": 180256400, + "step": 11203 + }, + { + "epoch": 0.7848205451504724, + "grad_norm": 4.16579532623291, + "learning_rate": 2.159162171628721e-05, + "loss": 0.9434, + "num_input_tokens_seen": 180272224, + "step": 11204 + }, + { + "epoch": 0.7848905933962016, + "grad_norm": 5.920172691345215, + "learning_rate": 2.15846234676007e-05, + "loss": 1.1388, + "num_input_tokens_seen": 180288608, + "step": 11205 + }, + { + "epoch": 0.7849606416419309, + "grad_norm": 3.652376890182495, + "learning_rate": 2.1577625218914188e-05, + "loss": 1.0563, + "num_input_tokens_seen": 180304992, + "step": 11206 + }, + { + "epoch": 0.7850306898876601, + "grad_norm": 4.855906963348389, + "learning_rate": 2.157062697022767e-05, + "loss": 0.9549, + "num_input_tokens_seen": 180321376, + "step": 11207 + }, + { + "epoch": 0.7851007381333893, + "grad_norm": 3.6191766262054443, + "learning_rate": 2.156362872154116e-05, + "loss": 1.0248, + "num_input_tokens_seen": 180337760, + "step": 11208 + }, + { + "epoch": 0.7851707863791186, + "grad_norm": 4.0187788009643555, + "learning_rate": 2.1556630472854637e-05, + "loss": 1.0197, + "num_input_tokens_seen": 180352680, + "step": 11209 + }, + { + "epoch": 0.7852408346248478, + "grad_norm": 3.948625326156616, + "learning_rate": 2.1549632224168125e-05, + "loss": 1.024, + "num_input_tokens_seen": 180368752, + "step": 11210 + }, + { + "epoch": 0.7853108828705772, + "grad_norm": 3.801917552947998, + "learning_rate": 2.1542633975481614e-05, + "loss": 1.0794, + "num_input_tokens_seen": 180385032, + "step": 11211 + }, + { + "epoch": 0.7853809311163064, + "grad_norm": 4.274776458740234, + "learning_rate": 2.1535635726795096e-05, + "loss": 1.1627, + "num_input_tokens_seen": 180401288, + "step": 11212 + }, + { + "epoch": 0.7854509793620356, + "grad_norm": 4.011796474456787, + "learning_rate": 2.1528637478108584e-05, + "loss": 1.3131, + "num_input_tokens_seen": 180417672, + "step": 11213 + }, + { + "epoch": 0.7855210276077649, + "grad_norm": 3.523197889328003, + "learning_rate": 2.1521639229422063e-05, + "loss": 1.144, + "num_input_tokens_seen": 180433656, + "step": 11214 + }, + { + "epoch": 0.7855910758534941, + "grad_norm": 3.7762699127197266, + "learning_rate": 2.151464098073555e-05, + "loss": 1.2101, + "num_input_tokens_seen": 180450040, + "step": 11215 + }, + { + "epoch": 0.7856611240992233, + "grad_norm": 3.684281349182129, + "learning_rate": 2.150764273204904e-05, + "loss": 1.0681, + "num_input_tokens_seen": 180466224, + "step": 11216 + }, + { + "epoch": 0.7857311723449526, + "grad_norm": 4.406097412109375, + "learning_rate": 2.1500644483362522e-05, + "loss": 1.1465, + "num_input_tokens_seen": 180482608, + "step": 11217 + }, + { + "epoch": 0.7858012205906818, + "grad_norm": 3.767179489135742, + "learning_rate": 2.149364623467601e-05, + "loss": 1.141, + "num_input_tokens_seen": 180498992, + "step": 11218 + }, + { + "epoch": 0.7858712688364111, + "grad_norm": 3.7642788887023926, + "learning_rate": 2.148664798598949e-05, + "loss": 1.057, + "num_input_tokens_seen": 180514760, + "step": 11219 + }, + { + "epoch": 0.7859413170821403, + "grad_norm": 3.8525826930999756, + "learning_rate": 2.1479649737302977e-05, + "loss": 1.1105, + "num_input_tokens_seen": 180530864, + "step": 11220 + }, + { + "epoch": 0.7860113653278695, + "grad_norm": 4.174134254455566, + "learning_rate": 2.1472651488616456e-05, + "loss": 1.0653, + "num_input_tokens_seen": 180547248, + "step": 11221 + }, + { + "epoch": 0.7860814135735988, + "grad_norm": 4.353362083435059, + "learning_rate": 2.1465653239929944e-05, + "loss": 1.1903, + "num_input_tokens_seen": 180563632, + "step": 11222 + }, + { + "epoch": 0.786151461819328, + "grad_norm": 4.762564659118652, + "learning_rate": 2.1458654991243436e-05, + "loss": 0.9808, + "num_input_tokens_seen": 180579720, + "step": 11223 + }, + { + "epoch": 0.7862215100650574, + "grad_norm": 3.649965763092041, + "learning_rate": 2.1451656742556915e-05, + "loss": 1.2329, + "num_input_tokens_seen": 180596104, + "step": 11224 + }, + { + "epoch": 0.7862915583107866, + "grad_norm": 4.545001029968262, + "learning_rate": 2.1444658493870403e-05, + "loss": 0.9458, + "num_input_tokens_seen": 180611048, + "step": 11225 + }, + { + "epoch": 0.7863616065565158, + "grad_norm": 5.953097343444824, + "learning_rate": 2.1437660245183882e-05, + "loss": 1.0589, + "num_input_tokens_seen": 180626616, + "step": 11226 + }, + { + "epoch": 0.7864316548022451, + "grad_norm": 3.7630393505096436, + "learning_rate": 2.143066199649737e-05, + "loss": 1.0426, + "num_input_tokens_seen": 180643000, + "step": 11227 + }, + { + "epoch": 0.7865017030479743, + "grad_norm": 3.8025143146514893, + "learning_rate": 2.1423663747810862e-05, + "loss": 1.123, + "num_input_tokens_seen": 180659384, + "step": 11228 + }, + { + "epoch": 0.7865717512937035, + "grad_norm": 4.255170822143555, + "learning_rate": 2.141666549912434e-05, + "loss": 0.9671, + "num_input_tokens_seen": 180674824, + "step": 11229 + }, + { + "epoch": 0.7866417995394328, + "grad_norm": 4.719873428344727, + "learning_rate": 2.140966725043783e-05, + "loss": 1.1543, + "num_input_tokens_seen": 180691208, + "step": 11230 + }, + { + "epoch": 0.786711847785162, + "grad_norm": 3.941986560821533, + "learning_rate": 2.1402669001751308e-05, + "loss": 0.9294, + "num_input_tokens_seen": 180707592, + "step": 11231 + }, + { + "epoch": 0.7867818960308913, + "grad_norm": 4.0840535163879395, + "learning_rate": 2.1395670753064796e-05, + "loss": 1.2593, + "num_input_tokens_seen": 180723976, + "step": 11232 + }, + { + "epoch": 0.7868519442766205, + "grad_norm": 3.9683802127838135, + "learning_rate": 2.138867250437829e-05, + "loss": 0.9931, + "num_input_tokens_seen": 180739048, + "step": 11233 + }, + { + "epoch": 0.7869219925223497, + "grad_norm": 5.056516170501709, + "learning_rate": 2.1381674255691767e-05, + "loss": 1.047, + "num_input_tokens_seen": 180755432, + "step": 11234 + }, + { + "epoch": 0.786992040768079, + "grad_norm": 3.950148344039917, + "learning_rate": 2.1374676007005255e-05, + "loss": 0.9857, + "num_input_tokens_seen": 180771776, + "step": 11235 + }, + { + "epoch": 0.7870620890138083, + "grad_norm": 5.751395225524902, + "learning_rate": 2.1367677758318734e-05, + "loss": 1.1594, + "num_input_tokens_seen": 180788160, + "step": 11236 + }, + { + "epoch": 0.7871321372595375, + "grad_norm": 3.804720878601074, + "learning_rate": 2.1360679509632222e-05, + "loss": 1.1296, + "num_input_tokens_seen": 180804544, + "step": 11237 + }, + { + "epoch": 0.7872021855052668, + "grad_norm": 4.841265678405762, + "learning_rate": 2.1353681260945714e-05, + "loss": 1.1028, + "num_input_tokens_seen": 180820928, + "step": 11238 + }, + { + "epoch": 0.787272233750996, + "grad_norm": 4.771214008331299, + "learning_rate": 2.1346683012259193e-05, + "loss": 1.0543, + "num_input_tokens_seen": 180837312, + "step": 11239 + }, + { + "epoch": 0.7873422819967253, + "grad_norm": 3.95639705657959, + "learning_rate": 2.133968476357268e-05, + "loss": 1.0559, + "num_input_tokens_seen": 180853224, + "step": 11240 + }, + { + "epoch": 0.7874123302424545, + "grad_norm": 4.844511032104492, + "learning_rate": 2.133268651488616e-05, + "loss": 1.0115, + "num_input_tokens_seen": 180869112, + "step": 11241 + }, + { + "epoch": 0.7874823784881837, + "grad_norm": 4.355325698852539, + "learning_rate": 2.132568826619965e-05, + "loss": 1.2253, + "num_input_tokens_seen": 180885496, + "step": 11242 + }, + { + "epoch": 0.787552426733913, + "grad_norm": 4.854236125946045, + "learning_rate": 2.131869001751314e-05, + "loss": 1.0765, + "num_input_tokens_seen": 180901432, + "step": 11243 + }, + { + "epoch": 0.7876224749796422, + "grad_norm": 4.370208263397217, + "learning_rate": 2.131169176882662e-05, + "loss": 0.8695, + "num_input_tokens_seen": 180917616, + "step": 11244 + }, + { + "epoch": 0.7876925232253714, + "grad_norm": 3.437425136566162, + "learning_rate": 2.1304693520140107e-05, + "loss": 1.0363, + "num_input_tokens_seen": 180934000, + "step": 11245 + }, + { + "epoch": 0.7877625714711007, + "grad_norm": 4.6216959953308105, + "learning_rate": 2.1297695271453586e-05, + "loss": 1.1129, + "num_input_tokens_seen": 180950384, + "step": 11246 + }, + { + "epoch": 0.7878326197168299, + "grad_norm": 3.739614725112915, + "learning_rate": 2.1290697022767074e-05, + "loss": 0.9559, + "num_input_tokens_seen": 180966768, + "step": 11247 + }, + { + "epoch": 0.7879026679625593, + "grad_norm": 4.21837854385376, + "learning_rate": 2.1283698774080553e-05, + "loss": 1.3233, + "num_input_tokens_seen": 180983152, + "step": 11248 + }, + { + "epoch": 0.7879727162082885, + "grad_norm": 3.828267812728882, + "learning_rate": 2.1276700525394045e-05, + "loss": 1.072, + "num_input_tokens_seen": 180999536, + "step": 11249 + }, + { + "epoch": 0.7880427644540177, + "grad_norm": 3.731003999710083, + "learning_rate": 2.1269702276707533e-05, + "loss": 1.1281, + "num_input_tokens_seen": 181015864, + "step": 11250 + }, + { + "epoch": 0.788112812699747, + "grad_norm": 3.7999229431152344, + "learning_rate": 2.1262704028021012e-05, + "loss": 1.0471, + "num_input_tokens_seen": 181031808, + "step": 11251 + }, + { + "epoch": 0.7881828609454762, + "grad_norm": 3.710433006286621, + "learning_rate": 2.12557057793345e-05, + "loss": 1.1441, + "num_input_tokens_seen": 181048192, + "step": 11252 + }, + { + "epoch": 0.7882529091912055, + "grad_norm": 3.8095359802246094, + "learning_rate": 2.124870753064798e-05, + "loss": 1.0115, + "num_input_tokens_seen": 181063912, + "step": 11253 + }, + { + "epoch": 0.7883229574369347, + "grad_norm": 3.7094736099243164, + "learning_rate": 2.124170928196147e-05, + "loss": 0.9399, + "num_input_tokens_seen": 181079560, + "step": 11254 + }, + { + "epoch": 0.7883930056826639, + "grad_norm": 5.188350677490234, + "learning_rate": 2.123471103327496e-05, + "loss": 0.9658, + "num_input_tokens_seen": 181095704, + "step": 11255 + }, + { + "epoch": 0.7884630539283932, + "grad_norm": 3.5094776153564453, + "learning_rate": 2.1227712784588438e-05, + "loss": 0.8918, + "num_input_tokens_seen": 181111688, + "step": 11256 + }, + { + "epoch": 0.7885331021741224, + "grad_norm": 4.854673385620117, + "learning_rate": 2.1220714535901926e-05, + "loss": 1.0057, + "num_input_tokens_seen": 181128048, + "step": 11257 + }, + { + "epoch": 0.7886031504198516, + "grad_norm": 3.709547281265259, + "learning_rate": 2.1213716287215405e-05, + "loss": 0.9936, + "num_input_tokens_seen": 181144280, + "step": 11258 + }, + { + "epoch": 0.7886731986655809, + "grad_norm": 4.277520179748535, + "learning_rate": 2.1206718038528897e-05, + "loss": 1.0723, + "num_input_tokens_seen": 181160656, + "step": 11259 + }, + { + "epoch": 0.7887432469113101, + "grad_norm": 3.8040671348571777, + "learning_rate": 2.1199719789842385e-05, + "loss": 0.9185, + "num_input_tokens_seen": 181177040, + "step": 11260 + }, + { + "epoch": 0.7888132951570395, + "grad_norm": 3.4783263206481934, + "learning_rate": 2.1192721541155864e-05, + "loss": 1.103, + "num_input_tokens_seen": 181193424, + "step": 11261 + }, + { + "epoch": 0.7888833434027687, + "grad_norm": 4.194344997406006, + "learning_rate": 2.1185723292469352e-05, + "loss": 0.9811, + "num_input_tokens_seen": 181207960, + "step": 11262 + }, + { + "epoch": 0.7889533916484979, + "grad_norm": 3.5899999141693115, + "learning_rate": 2.117872504378283e-05, + "loss": 1.2261, + "num_input_tokens_seen": 181224344, + "step": 11263 + }, + { + "epoch": 0.7890234398942272, + "grad_norm": 3.708371162414551, + "learning_rate": 2.1171726795096323e-05, + "loss": 1.0946, + "num_input_tokens_seen": 181240728, + "step": 11264 + }, + { + "epoch": 0.7890934881399564, + "grad_norm": 4.351925373077393, + "learning_rate": 2.116472854640981e-05, + "loss": 1.0771, + "num_input_tokens_seen": 181255616, + "step": 11265 + }, + { + "epoch": 0.7891635363856856, + "grad_norm": 4.753475666046143, + "learning_rate": 2.115773029772329e-05, + "loss": 1.0606, + "num_input_tokens_seen": 181272000, + "step": 11266 + }, + { + "epoch": 0.7892335846314149, + "grad_norm": 4.911532402038574, + "learning_rate": 2.115073204903678e-05, + "loss": 1.0957, + "num_input_tokens_seen": 181287856, + "step": 11267 + }, + { + "epoch": 0.7893036328771441, + "grad_norm": 4.128995895385742, + "learning_rate": 2.1143733800350257e-05, + "loss": 0.9628, + "num_input_tokens_seen": 181304240, + "step": 11268 + }, + { + "epoch": 0.7893736811228734, + "grad_norm": 5.62176513671875, + "learning_rate": 2.113673555166375e-05, + "loss": 1.1872, + "num_input_tokens_seen": 181320480, + "step": 11269 + }, + { + "epoch": 0.7894437293686026, + "grad_norm": 3.5677547454833984, + "learning_rate": 2.1129737302977237e-05, + "loss": 0.8981, + "num_input_tokens_seen": 181336792, + "step": 11270 + }, + { + "epoch": 0.7895137776143318, + "grad_norm": 4.244653224945068, + "learning_rate": 2.1122739054290716e-05, + "loss": 1.2572, + "num_input_tokens_seen": 181352752, + "step": 11271 + }, + { + "epoch": 0.7895838258600612, + "grad_norm": 5.975011348724365, + "learning_rate": 2.1115740805604204e-05, + "loss": 1.1289, + "num_input_tokens_seen": 181368704, + "step": 11272 + }, + { + "epoch": 0.7896538741057904, + "grad_norm": 3.858487367630005, + "learning_rate": 2.1108742556917683e-05, + "loss": 1.1103, + "num_input_tokens_seen": 181384272, + "step": 11273 + }, + { + "epoch": 0.7897239223515196, + "grad_norm": 3.730074167251587, + "learning_rate": 2.1101744308231175e-05, + "loss": 0.875, + "num_input_tokens_seen": 181400656, + "step": 11274 + }, + { + "epoch": 0.7897939705972489, + "grad_norm": 4.2110595703125, + "learning_rate": 2.1094746059544653e-05, + "loss": 1.0633, + "num_input_tokens_seen": 181415968, + "step": 11275 + }, + { + "epoch": 0.7898640188429781, + "grad_norm": 3.253934860229492, + "learning_rate": 2.1087747810858142e-05, + "loss": 0.8833, + "num_input_tokens_seen": 181432352, + "step": 11276 + }, + { + "epoch": 0.7899340670887074, + "grad_norm": 3.9285271167755127, + "learning_rate": 2.108074956217163e-05, + "loss": 0.9725, + "num_input_tokens_seen": 181448624, + "step": 11277 + }, + { + "epoch": 0.7900041153344366, + "grad_norm": 4.236669540405273, + "learning_rate": 2.107375131348511e-05, + "loss": 1.0333, + "num_input_tokens_seen": 181464720, + "step": 11278 + }, + { + "epoch": 0.7900741635801658, + "grad_norm": 4.774725437164307, + "learning_rate": 2.10667530647986e-05, + "loss": 1.2076, + "num_input_tokens_seen": 181481104, + "step": 11279 + }, + { + "epoch": 0.7901442118258951, + "grad_norm": 3.8202743530273438, + "learning_rate": 2.105975481611208e-05, + "loss": 1.2396, + "num_input_tokens_seen": 181497488, + "step": 11280 + }, + { + "epoch": 0.7902142600716243, + "grad_norm": 4.039727687835693, + "learning_rate": 2.1052756567425568e-05, + "loss": 1.2327, + "num_input_tokens_seen": 181513872, + "step": 11281 + }, + { + "epoch": 0.7902843083173535, + "grad_norm": 3.7093374729156494, + "learning_rate": 2.1045758318739057e-05, + "loss": 1.1251, + "num_input_tokens_seen": 181530160, + "step": 11282 + }, + { + "epoch": 0.7903543565630828, + "grad_norm": 4.2075605392456055, + "learning_rate": 2.1038760070052535e-05, + "loss": 1.067, + "num_input_tokens_seen": 181545624, + "step": 11283 + }, + { + "epoch": 0.790424404808812, + "grad_norm": 3.7703986167907715, + "learning_rate": 2.1031761821366027e-05, + "loss": 0.9682, + "num_input_tokens_seen": 181562008, + "step": 11284 + }, + { + "epoch": 0.7904944530545414, + "grad_norm": 4.008590221405029, + "learning_rate": 2.1024763572679505e-05, + "loss": 1.2152, + "num_input_tokens_seen": 181578392, + "step": 11285 + }, + { + "epoch": 0.7905645013002706, + "grad_norm": 3.915849447250366, + "learning_rate": 2.1017765323992994e-05, + "loss": 0.9857, + "num_input_tokens_seen": 181594224, + "step": 11286 + }, + { + "epoch": 0.7906345495459998, + "grad_norm": 4.103799343109131, + "learning_rate": 2.1010767075306483e-05, + "loss": 1.2242, + "num_input_tokens_seen": 181609696, + "step": 11287 + }, + { + "epoch": 0.7907045977917291, + "grad_norm": 3.6777029037475586, + "learning_rate": 2.100376882661996e-05, + "loss": 1.0343, + "num_input_tokens_seen": 181625856, + "step": 11288 + }, + { + "epoch": 0.7907746460374583, + "grad_norm": 4.076472759246826, + "learning_rate": 2.0996770577933453e-05, + "loss": 0.9566, + "num_input_tokens_seen": 181642240, + "step": 11289 + }, + { + "epoch": 0.7908446942831876, + "grad_norm": 4.521420478820801, + "learning_rate": 2.098977232924693e-05, + "loss": 1.2266, + "num_input_tokens_seen": 181658624, + "step": 11290 + }, + { + "epoch": 0.7909147425289168, + "grad_norm": 3.4832510948181152, + "learning_rate": 2.098277408056042e-05, + "loss": 1.0993, + "num_input_tokens_seen": 181675008, + "step": 11291 + }, + { + "epoch": 0.790984790774646, + "grad_norm": 4.041224002838135, + "learning_rate": 2.097577583187391e-05, + "loss": 1.112, + "num_input_tokens_seen": 181690440, + "step": 11292 + }, + { + "epoch": 0.7910548390203753, + "grad_norm": 4.497851371765137, + "learning_rate": 2.0968777583187387e-05, + "loss": 0.8725, + "num_input_tokens_seen": 181706616, + "step": 11293 + }, + { + "epoch": 0.7911248872661045, + "grad_norm": 5.505173683166504, + "learning_rate": 2.096177933450088e-05, + "loss": 1.1036, + "num_input_tokens_seen": 181723000, + "step": 11294 + }, + { + "epoch": 0.7911949355118337, + "grad_norm": 4.2964372634887695, + "learning_rate": 2.0954781085814357e-05, + "loss": 0.9947, + "num_input_tokens_seen": 181739384, + "step": 11295 + }, + { + "epoch": 0.791264983757563, + "grad_norm": 4.1496477127075195, + "learning_rate": 2.0947782837127846e-05, + "loss": 1.0611, + "num_input_tokens_seen": 181755416, + "step": 11296 + }, + { + "epoch": 0.7913350320032922, + "grad_norm": 4.342074394226074, + "learning_rate": 2.0940784588441335e-05, + "loss": 1.163, + "num_input_tokens_seen": 181771360, + "step": 11297 + }, + { + "epoch": 0.7914050802490216, + "grad_norm": 3.9076614379882812, + "learning_rate": 2.0933786339754813e-05, + "loss": 1.1324, + "num_input_tokens_seen": 181787056, + "step": 11298 + }, + { + "epoch": 0.7914751284947508, + "grad_norm": 4.443563461303711, + "learning_rate": 2.0926788091068305e-05, + "loss": 1.0609, + "num_input_tokens_seen": 181803440, + "step": 11299 + }, + { + "epoch": 0.79154517674048, + "grad_norm": 4.043490886688232, + "learning_rate": 2.0919789842381783e-05, + "loss": 1.0759, + "num_input_tokens_seen": 181819824, + "step": 11300 + }, + { + "epoch": 0.7916152249862093, + "grad_norm": 4.256930828094482, + "learning_rate": 2.0912791593695272e-05, + "loss": 1.1434, + "num_input_tokens_seen": 181836208, + "step": 11301 + }, + { + "epoch": 0.7916852732319385, + "grad_norm": 3.8439314365386963, + "learning_rate": 2.090579334500875e-05, + "loss": 1.1055, + "num_input_tokens_seen": 181852592, + "step": 11302 + }, + { + "epoch": 0.7917553214776677, + "grad_norm": 3.8805763721466064, + "learning_rate": 2.089879509632224e-05, + "loss": 1.068, + "num_input_tokens_seen": 181868976, + "step": 11303 + }, + { + "epoch": 0.791825369723397, + "grad_norm": 3.8867344856262207, + "learning_rate": 2.089179684763573e-05, + "loss": 1.1392, + "num_input_tokens_seen": 181885096, + "step": 11304 + }, + { + "epoch": 0.7918954179691262, + "grad_norm": 3.846310615539551, + "learning_rate": 2.088479859894921e-05, + "loss": 0.9033, + "num_input_tokens_seen": 181901480, + "step": 11305 + }, + { + "epoch": 0.7919654662148555, + "grad_norm": 3.904285430908203, + "learning_rate": 2.0877800350262698e-05, + "loss": 1.1218, + "num_input_tokens_seen": 181917864, + "step": 11306 + }, + { + "epoch": 0.7920355144605847, + "grad_norm": 3.855797052383423, + "learning_rate": 2.0870802101576176e-05, + "loss": 1.0024, + "num_input_tokens_seen": 181934248, + "step": 11307 + }, + { + "epoch": 0.7921055627063139, + "grad_norm": 3.9284582138061523, + "learning_rate": 2.0863803852889665e-05, + "loss": 0.9784, + "num_input_tokens_seen": 181950632, + "step": 11308 + }, + { + "epoch": 0.7921756109520433, + "grad_norm": 3.838575601577759, + "learning_rate": 2.0856805604203157e-05, + "loss": 1.0943, + "num_input_tokens_seen": 181967016, + "step": 11309 + }, + { + "epoch": 0.7922456591977725, + "grad_norm": 6.6384429931640625, + "learning_rate": 2.0849807355516635e-05, + "loss": 1.2178, + "num_input_tokens_seen": 181983400, + "step": 11310 + }, + { + "epoch": 0.7923157074435017, + "grad_norm": 5.020096778869629, + "learning_rate": 2.0842809106830124e-05, + "loss": 1.1222, + "num_input_tokens_seen": 181998920, + "step": 11311 + }, + { + "epoch": 0.792385755689231, + "grad_norm": 3.59102463722229, + "learning_rate": 2.0835810858143602e-05, + "loss": 1.01, + "num_input_tokens_seen": 182015304, + "step": 11312 + }, + { + "epoch": 0.7924558039349602, + "grad_norm": 4.053440570831299, + "learning_rate": 2.082881260945709e-05, + "loss": 1.0415, + "num_input_tokens_seen": 182031688, + "step": 11313 + }, + { + "epoch": 0.7925258521806895, + "grad_norm": 4.084712505340576, + "learning_rate": 2.0821814360770583e-05, + "loss": 1.2866, + "num_input_tokens_seen": 182048072, + "step": 11314 + }, + { + "epoch": 0.7925959004264187, + "grad_norm": 3.659452199935913, + "learning_rate": 2.081481611208406e-05, + "loss": 0.9148, + "num_input_tokens_seen": 182064432, + "step": 11315 + }, + { + "epoch": 0.7926659486721479, + "grad_norm": 3.504512071609497, + "learning_rate": 2.080781786339755e-05, + "loss": 1.0907, + "num_input_tokens_seen": 182080816, + "step": 11316 + }, + { + "epoch": 0.7927359969178772, + "grad_norm": 3.4160404205322266, + "learning_rate": 2.080081961471103e-05, + "loss": 0.9507, + "num_input_tokens_seen": 182097080, + "step": 11317 + }, + { + "epoch": 0.7928060451636064, + "grad_norm": 3.595651865005493, + "learning_rate": 2.0793821366024517e-05, + "loss": 0.9885, + "num_input_tokens_seen": 182113352, + "step": 11318 + }, + { + "epoch": 0.7928760934093356, + "grad_norm": 3.862454891204834, + "learning_rate": 2.078682311733801e-05, + "loss": 1.107, + "num_input_tokens_seen": 182129736, + "step": 11319 + }, + { + "epoch": 0.7929461416550649, + "grad_norm": 3.12123441696167, + "learning_rate": 2.0779824868651487e-05, + "loss": 0.9059, + "num_input_tokens_seen": 182146120, + "step": 11320 + }, + { + "epoch": 0.7930161899007941, + "grad_norm": 4.7422895431518555, + "learning_rate": 2.0772826619964976e-05, + "loss": 1.0911, + "num_input_tokens_seen": 182162504, + "step": 11321 + }, + { + "epoch": 0.7930862381465235, + "grad_norm": 4.290980339050293, + "learning_rate": 2.0765828371278454e-05, + "loss": 1.0607, + "num_input_tokens_seen": 182178464, + "step": 11322 + }, + { + "epoch": 0.7931562863922527, + "grad_norm": 3.974414348602295, + "learning_rate": 2.0758830122591943e-05, + "loss": 1.0683, + "num_input_tokens_seen": 182194456, + "step": 11323 + }, + { + "epoch": 0.7932263346379819, + "grad_norm": 4.047220230102539, + "learning_rate": 2.0751831873905435e-05, + "loss": 1.0574, + "num_input_tokens_seen": 182210608, + "step": 11324 + }, + { + "epoch": 0.7932963828837112, + "grad_norm": 3.7999606132507324, + "learning_rate": 2.0744833625218913e-05, + "loss": 1.0857, + "num_input_tokens_seen": 182226968, + "step": 11325 + }, + { + "epoch": 0.7933664311294404, + "grad_norm": 3.909945011138916, + "learning_rate": 2.0737835376532402e-05, + "loss": 1.0642, + "num_input_tokens_seen": 182242200, + "step": 11326 + }, + { + "epoch": 0.7934364793751697, + "grad_norm": 6.329525947570801, + "learning_rate": 2.073083712784588e-05, + "loss": 1.0751, + "num_input_tokens_seen": 182258584, + "step": 11327 + }, + { + "epoch": 0.7935065276208989, + "grad_norm": 3.7989511489868164, + "learning_rate": 2.072383887915937e-05, + "loss": 1.0627, + "num_input_tokens_seen": 182274968, + "step": 11328 + }, + { + "epoch": 0.7935765758666281, + "grad_norm": 5.7271575927734375, + "learning_rate": 2.0716840630472847e-05, + "loss": 1.2508, + "num_input_tokens_seen": 182290632, + "step": 11329 + }, + { + "epoch": 0.7936466241123574, + "grad_norm": 4.999852180480957, + "learning_rate": 2.070984238178634e-05, + "loss": 1.1098, + "num_input_tokens_seen": 182307016, + "step": 11330 + }, + { + "epoch": 0.7937166723580866, + "grad_norm": 4.996677398681641, + "learning_rate": 2.0702844133099828e-05, + "loss": 0.9826, + "num_input_tokens_seen": 182322848, + "step": 11331 + }, + { + "epoch": 0.7937867206038158, + "grad_norm": 4.535329341888428, + "learning_rate": 2.0695845884413306e-05, + "loss": 1.048, + "num_input_tokens_seen": 182338720, + "step": 11332 + }, + { + "epoch": 0.7938567688495451, + "grad_norm": 5.787573337554932, + "learning_rate": 2.0688847635726795e-05, + "loss": 1.1436, + "num_input_tokens_seen": 182355104, + "step": 11333 + }, + { + "epoch": 0.7939268170952744, + "grad_norm": 3.9647014141082764, + "learning_rate": 2.0681849387040273e-05, + "loss": 0.8304, + "num_input_tokens_seen": 182371048, + "step": 11334 + }, + { + "epoch": 0.7939968653410037, + "grad_norm": 4.165703296661377, + "learning_rate": 2.0674851138353765e-05, + "loss": 1.1829, + "num_input_tokens_seen": 182387432, + "step": 11335 + }, + { + "epoch": 0.7940669135867329, + "grad_norm": 4.344659328460693, + "learning_rate": 2.0667852889667254e-05, + "loss": 1.0764, + "num_input_tokens_seen": 182403816, + "step": 11336 + }, + { + "epoch": 0.7941369618324621, + "grad_norm": 6.075753211975098, + "learning_rate": 2.0660854640980732e-05, + "loss": 1.0757, + "num_input_tokens_seen": 182419640, + "step": 11337 + }, + { + "epoch": 0.7942070100781914, + "grad_norm": 3.590771198272705, + "learning_rate": 2.065385639229422e-05, + "loss": 0.9962, + "num_input_tokens_seen": 182435928, + "step": 11338 + }, + { + "epoch": 0.7942770583239206, + "grad_norm": 4.162105083465576, + "learning_rate": 2.06468581436077e-05, + "loss": 0.9751, + "num_input_tokens_seen": 182451512, + "step": 11339 + }, + { + "epoch": 0.7943471065696498, + "grad_norm": 4.185384273529053, + "learning_rate": 2.063985989492119e-05, + "loss": 1.1115, + "num_input_tokens_seen": 182467192, + "step": 11340 + }, + { + "epoch": 0.7944171548153791, + "grad_norm": 3.6298980712890625, + "learning_rate": 2.063286164623468e-05, + "loss": 0.9632, + "num_input_tokens_seen": 182483576, + "step": 11341 + }, + { + "epoch": 0.7944872030611083, + "grad_norm": 4.535846710205078, + "learning_rate": 2.062586339754816e-05, + "loss": 1.0607, + "num_input_tokens_seen": 182499960, + "step": 11342 + }, + { + "epoch": 0.7945572513068376, + "grad_norm": 3.503600835800171, + "learning_rate": 2.0618865148861647e-05, + "loss": 1.0163, + "num_input_tokens_seen": 182516160, + "step": 11343 + }, + { + "epoch": 0.7946272995525668, + "grad_norm": 3.950356960296631, + "learning_rate": 2.0611866900175126e-05, + "loss": 0.9231, + "num_input_tokens_seen": 182532240, + "step": 11344 + }, + { + "epoch": 0.794697347798296, + "grad_norm": 3.8892312049865723, + "learning_rate": 2.0604868651488617e-05, + "loss": 1.3508, + "num_input_tokens_seen": 182548624, + "step": 11345 + }, + { + "epoch": 0.7947673960440254, + "grad_norm": 3.5516936779022217, + "learning_rate": 2.0597870402802106e-05, + "loss": 0.9957, + "num_input_tokens_seen": 182565008, + "step": 11346 + }, + { + "epoch": 0.7948374442897546, + "grad_norm": 4.0330305099487305, + "learning_rate": 2.0590872154115584e-05, + "loss": 1.2745, + "num_input_tokens_seen": 182581392, + "step": 11347 + }, + { + "epoch": 0.7949074925354838, + "grad_norm": 4.059846878051758, + "learning_rate": 2.0583873905429073e-05, + "loss": 0.9983, + "num_input_tokens_seen": 182596960, + "step": 11348 + }, + { + "epoch": 0.7949775407812131, + "grad_norm": 3.9882805347442627, + "learning_rate": 2.057687565674255e-05, + "loss": 1.0865, + "num_input_tokens_seen": 182613168, + "step": 11349 + }, + { + "epoch": 0.7950475890269423, + "grad_norm": 3.5204317569732666, + "learning_rate": 2.0569877408056043e-05, + "loss": 0.9665, + "num_input_tokens_seen": 182629168, + "step": 11350 + }, + { + "epoch": 0.7951176372726716, + "grad_norm": 4.002301216125488, + "learning_rate": 2.0562879159369532e-05, + "loss": 1.0354, + "num_input_tokens_seen": 182645552, + "step": 11351 + }, + { + "epoch": 0.7951876855184008, + "grad_norm": 3.811558246612549, + "learning_rate": 2.055588091068301e-05, + "loss": 0.9077, + "num_input_tokens_seen": 182660496, + "step": 11352 + }, + { + "epoch": 0.79525773376413, + "grad_norm": 4.055780410766602, + "learning_rate": 2.05488826619965e-05, + "loss": 1.1783, + "num_input_tokens_seen": 182676880, + "step": 11353 + }, + { + "epoch": 0.7953277820098593, + "grad_norm": 3.93117356300354, + "learning_rate": 2.0541884413309978e-05, + "loss": 1.0448, + "num_input_tokens_seen": 182692808, + "step": 11354 + }, + { + "epoch": 0.7953978302555885, + "grad_norm": 3.9845149517059326, + "learning_rate": 2.053488616462347e-05, + "loss": 1.0495, + "num_input_tokens_seen": 182709192, + "step": 11355 + }, + { + "epoch": 0.7954678785013178, + "grad_norm": 4.662755966186523, + "learning_rate": 2.0527887915936948e-05, + "loss": 1.0482, + "num_input_tokens_seen": 182725072, + "step": 11356 + }, + { + "epoch": 0.795537926747047, + "grad_norm": 3.9220123291015625, + "learning_rate": 2.0520889667250437e-05, + "loss": 1.0601, + "num_input_tokens_seen": 182741456, + "step": 11357 + }, + { + "epoch": 0.7956079749927762, + "grad_norm": 3.3682405948638916, + "learning_rate": 2.0513891418563925e-05, + "loss": 0.946, + "num_input_tokens_seen": 182757456, + "step": 11358 + }, + { + "epoch": 0.7956780232385056, + "grad_norm": 3.815948009490967, + "learning_rate": 2.0506893169877404e-05, + "loss": 0.9925, + "num_input_tokens_seen": 182772952, + "step": 11359 + }, + { + "epoch": 0.7957480714842348, + "grad_norm": 3.5300791263580322, + "learning_rate": 2.0499894921190896e-05, + "loss": 0.9281, + "num_input_tokens_seen": 182789336, + "step": 11360 + }, + { + "epoch": 0.795818119729964, + "grad_norm": 4.161052703857422, + "learning_rate": 2.0492896672504374e-05, + "loss": 0.9703, + "num_input_tokens_seen": 182805368, + "step": 11361 + }, + { + "epoch": 0.7958881679756933, + "grad_norm": 3.6891372203826904, + "learning_rate": 2.0485898423817863e-05, + "loss": 0.8745, + "num_input_tokens_seen": 182821752, + "step": 11362 + }, + { + "epoch": 0.7959582162214225, + "grad_norm": 5.8315348625183105, + "learning_rate": 2.047890017513135e-05, + "loss": 1.1455, + "num_input_tokens_seen": 182838136, + "step": 11363 + }, + { + "epoch": 0.7960282644671518, + "grad_norm": 4.368020534515381, + "learning_rate": 2.047190192644483e-05, + "loss": 1.0259, + "num_input_tokens_seen": 182852576, + "step": 11364 + }, + { + "epoch": 0.796098312712881, + "grad_norm": 3.6869943141937256, + "learning_rate": 2.046490367775832e-05, + "loss": 0.9517, + "num_input_tokens_seen": 182868960, + "step": 11365 + }, + { + "epoch": 0.7961683609586102, + "grad_norm": 4.402252674102783, + "learning_rate": 2.04579054290718e-05, + "loss": 1.0195, + "num_input_tokens_seen": 182884584, + "step": 11366 + }, + { + "epoch": 0.7962384092043395, + "grad_norm": 3.6449546813964844, + "learning_rate": 2.045090718038529e-05, + "loss": 1.0691, + "num_input_tokens_seen": 182900968, + "step": 11367 + }, + { + "epoch": 0.7963084574500687, + "grad_norm": 3.772266149520874, + "learning_rate": 2.0443908931698777e-05, + "loss": 1.139, + "num_input_tokens_seen": 182917352, + "step": 11368 + }, + { + "epoch": 0.7963785056957979, + "grad_norm": 4.096329689025879, + "learning_rate": 2.0436910683012256e-05, + "loss": 1.0898, + "num_input_tokens_seen": 182933680, + "step": 11369 + }, + { + "epoch": 0.7964485539415272, + "grad_norm": 3.4767210483551025, + "learning_rate": 2.0429912434325748e-05, + "loss": 1.0523, + "num_input_tokens_seen": 182950064, + "step": 11370 + }, + { + "epoch": 0.7965186021872565, + "grad_norm": 3.794715404510498, + "learning_rate": 2.0422914185639226e-05, + "loss": 1.2056, + "num_input_tokens_seen": 182965992, + "step": 11371 + }, + { + "epoch": 0.7965886504329858, + "grad_norm": 3.6804189682006836, + "learning_rate": 2.0415915936952715e-05, + "loss": 0.9848, + "num_input_tokens_seen": 182981264, + "step": 11372 + }, + { + "epoch": 0.796658698678715, + "grad_norm": 4.1626129150390625, + "learning_rate": 2.0408917688266203e-05, + "loss": 1.1721, + "num_input_tokens_seen": 182997648, + "step": 11373 + }, + { + "epoch": 0.7967287469244442, + "grad_norm": 4.327353000640869, + "learning_rate": 2.040191943957968e-05, + "loss": 1.1883, + "num_input_tokens_seen": 183014032, + "step": 11374 + }, + { + "epoch": 0.7967987951701735, + "grad_norm": 3.603639841079712, + "learning_rate": 2.0394921190893174e-05, + "loss": 0.8396, + "num_input_tokens_seen": 183030304, + "step": 11375 + }, + { + "epoch": 0.7968688434159027, + "grad_norm": 3.773019790649414, + "learning_rate": 2.0387922942206652e-05, + "loss": 1.1181, + "num_input_tokens_seen": 183046688, + "step": 11376 + }, + { + "epoch": 0.7969388916616319, + "grad_norm": 3.8155059814453125, + "learning_rate": 2.038092469352014e-05, + "loss": 0.9679, + "num_input_tokens_seen": 183063072, + "step": 11377 + }, + { + "epoch": 0.7970089399073612, + "grad_norm": 4.775983810424805, + "learning_rate": 2.037392644483363e-05, + "loss": 1.0981, + "num_input_tokens_seen": 183079280, + "step": 11378 + }, + { + "epoch": 0.7970789881530904, + "grad_norm": 3.984560489654541, + "learning_rate": 2.0366928196147108e-05, + "loss": 1.0158, + "num_input_tokens_seen": 183095440, + "step": 11379 + }, + { + "epoch": 0.7971490363988197, + "grad_norm": 5.132417678833008, + "learning_rate": 2.03599299474606e-05, + "loss": 1.2594, + "num_input_tokens_seen": 183111824, + "step": 11380 + }, + { + "epoch": 0.7972190846445489, + "grad_norm": 3.7097878456115723, + "learning_rate": 2.0352931698774078e-05, + "loss": 0.8499, + "num_input_tokens_seen": 183127208, + "step": 11381 + }, + { + "epoch": 0.7972891328902781, + "grad_norm": 3.6541168689727783, + "learning_rate": 2.0345933450087567e-05, + "loss": 0.9806, + "num_input_tokens_seen": 183143168, + "step": 11382 + }, + { + "epoch": 0.7973591811360075, + "grad_norm": 3.8408548831939697, + "learning_rate": 2.0338935201401045e-05, + "loss": 1.0416, + "num_input_tokens_seen": 183159552, + "step": 11383 + }, + { + "epoch": 0.7974292293817367, + "grad_norm": 3.6520988941192627, + "learning_rate": 2.0331936952714534e-05, + "loss": 1.055, + "num_input_tokens_seen": 183175936, + "step": 11384 + }, + { + "epoch": 0.7974992776274659, + "grad_norm": 3.8350319862365723, + "learning_rate": 2.0324938704028026e-05, + "loss": 1.0135, + "num_input_tokens_seen": 183191016, + "step": 11385 + }, + { + "epoch": 0.7975693258731952, + "grad_norm": 4.353374004364014, + "learning_rate": 2.0317940455341504e-05, + "loss": 1.1538, + "num_input_tokens_seen": 183207400, + "step": 11386 + }, + { + "epoch": 0.7976393741189244, + "grad_norm": 4.206396579742432, + "learning_rate": 2.0310942206654993e-05, + "loss": 1.0561, + "num_input_tokens_seen": 183223784, + "step": 11387 + }, + { + "epoch": 0.7977094223646537, + "grad_norm": 3.8964884281158447, + "learning_rate": 2.030394395796847e-05, + "loss": 1.1915, + "num_input_tokens_seen": 183240168, + "step": 11388 + }, + { + "epoch": 0.7977794706103829, + "grad_norm": 3.6234006881713867, + "learning_rate": 2.029694570928196e-05, + "loss": 0.9458, + "num_input_tokens_seen": 183256552, + "step": 11389 + }, + { + "epoch": 0.7978495188561121, + "grad_norm": 4.340541839599609, + "learning_rate": 2.028994746059545e-05, + "loss": 1.0248, + "num_input_tokens_seen": 183272936, + "step": 11390 + }, + { + "epoch": 0.7979195671018414, + "grad_norm": 3.7413766384124756, + "learning_rate": 2.028294921190893e-05, + "loss": 0.8919, + "num_input_tokens_seen": 183289320, + "step": 11391 + }, + { + "epoch": 0.7979896153475706, + "grad_norm": 3.9003493785858154, + "learning_rate": 2.027595096322242e-05, + "loss": 1.093, + "num_input_tokens_seen": 183305704, + "step": 11392 + }, + { + "epoch": 0.7980596635932999, + "grad_norm": 3.697850227355957, + "learning_rate": 2.0268952714535897e-05, + "loss": 0.9745, + "num_input_tokens_seen": 183322064, + "step": 11393 + }, + { + "epoch": 0.7981297118390291, + "grad_norm": 3.7919769287109375, + "learning_rate": 2.0261954465849386e-05, + "loss": 0.9197, + "num_input_tokens_seen": 183338272, + "step": 11394 + }, + { + "epoch": 0.7981997600847583, + "grad_norm": 5.054806709289551, + "learning_rate": 2.0254956217162878e-05, + "loss": 1.1679, + "num_input_tokens_seen": 183354448, + "step": 11395 + }, + { + "epoch": 0.7982698083304877, + "grad_norm": 3.903092622756958, + "learning_rate": 2.0247957968476356e-05, + "loss": 1.1211, + "num_input_tokens_seen": 183370384, + "step": 11396 + }, + { + "epoch": 0.7983398565762169, + "grad_norm": 3.9635748863220215, + "learning_rate": 2.0240959719789845e-05, + "loss": 1.1393, + "num_input_tokens_seen": 183386736, + "step": 11397 + }, + { + "epoch": 0.7984099048219461, + "grad_norm": 3.979397773742676, + "learning_rate": 2.0233961471103323e-05, + "loss": 0.91, + "num_input_tokens_seen": 183402096, + "step": 11398 + }, + { + "epoch": 0.7984799530676754, + "grad_norm": 4.025575160980225, + "learning_rate": 2.022696322241681e-05, + "loss": 1.1079, + "num_input_tokens_seen": 183418480, + "step": 11399 + }, + { + "epoch": 0.7985500013134046, + "grad_norm": 3.5414528846740723, + "learning_rate": 2.0219964973730304e-05, + "loss": 1.1122, + "num_input_tokens_seen": 183434616, + "step": 11400 + }, + { + "epoch": 0.7985500013134046, + "eval_loss": 1.1172616481781006, + "eval_runtime": 0.198, + "eval_samples_per_second": 5.049, + "eval_steps_per_second": 5.049, + "num_input_tokens_seen": 183434616, + "step": 11400 + }, + { + "epoch": 0.7986200495591339, + "grad_norm": 3.9279558658599854, + "learning_rate": 2.0212966725043782e-05, + "loss": 0.8576, + "num_input_tokens_seen": 183450944, + "step": 11401 + }, + { + "epoch": 0.7986900978048631, + "grad_norm": 4.41232442855835, + "learning_rate": 2.020596847635727e-05, + "loss": 0.835, + "num_input_tokens_seen": 183467056, + "step": 11402 + }, + { + "epoch": 0.7987601460505923, + "grad_norm": 3.450037956237793, + "learning_rate": 2.019897022767075e-05, + "loss": 0.9643, + "num_input_tokens_seen": 183483440, + "step": 11403 + }, + { + "epoch": 0.7988301942963216, + "grad_norm": 3.9011387825012207, + "learning_rate": 2.0191971978984238e-05, + "loss": 1.0658, + "num_input_tokens_seen": 183499824, + "step": 11404 + }, + { + "epoch": 0.7989002425420508, + "grad_norm": 4.462986946105957, + "learning_rate": 2.018497373029773e-05, + "loss": 1.3129, + "num_input_tokens_seen": 183516208, + "step": 11405 + }, + { + "epoch": 0.79897029078778, + "grad_norm": 3.870410442352295, + "learning_rate": 2.0177975481611208e-05, + "loss": 1.0809, + "num_input_tokens_seen": 183532592, + "step": 11406 + }, + { + "epoch": 0.7990403390335094, + "grad_norm": 4.198379993438721, + "learning_rate": 2.0170977232924697e-05, + "loss": 1.1086, + "num_input_tokens_seen": 183547672, + "step": 11407 + }, + { + "epoch": 0.7991103872792386, + "grad_norm": 3.9977095127105713, + "learning_rate": 2.0163978984238175e-05, + "loss": 1.0858, + "num_input_tokens_seen": 183564056, + "step": 11408 + }, + { + "epoch": 0.7991804355249679, + "grad_norm": 4.107580184936523, + "learning_rate": 2.0156980735551664e-05, + "loss": 1.089, + "num_input_tokens_seen": 183580440, + "step": 11409 + }, + { + "epoch": 0.7992504837706971, + "grad_norm": 4.774752140045166, + "learning_rate": 2.0149982486865142e-05, + "loss": 1.1756, + "num_input_tokens_seen": 183596824, + "step": 11410 + }, + { + "epoch": 0.7993205320164263, + "grad_norm": 3.695551633834839, + "learning_rate": 2.0142984238178634e-05, + "loss": 1.0525, + "num_input_tokens_seen": 183613208, + "step": 11411 + }, + { + "epoch": 0.7993905802621556, + "grad_norm": 3.5243430137634277, + "learning_rate": 2.0135985989492123e-05, + "loss": 0.9827, + "num_input_tokens_seen": 183629592, + "step": 11412 + }, + { + "epoch": 0.7994606285078848, + "grad_norm": 5.24747371673584, + "learning_rate": 2.01289877408056e-05, + "loss": 0.9394, + "num_input_tokens_seen": 183645976, + "step": 11413 + }, + { + "epoch": 0.799530676753614, + "grad_norm": 4.435072898864746, + "learning_rate": 2.012198949211909e-05, + "loss": 1.1643, + "num_input_tokens_seen": 183662360, + "step": 11414 + }, + { + "epoch": 0.7996007249993433, + "grad_norm": 4.691938877105713, + "learning_rate": 2.0114991243432568e-05, + "loss": 0.9411, + "num_input_tokens_seen": 183678736, + "step": 11415 + }, + { + "epoch": 0.7996707732450725, + "grad_norm": 3.341764450073242, + "learning_rate": 2.010799299474606e-05, + "loss": 0.9212, + "num_input_tokens_seen": 183694952, + "step": 11416 + }, + { + "epoch": 0.7997408214908018, + "grad_norm": 3.469421625137329, + "learning_rate": 2.010099474605955e-05, + "loss": 0.8929, + "num_input_tokens_seen": 183711304, + "step": 11417 + }, + { + "epoch": 0.799810869736531, + "grad_norm": 3.6374168395996094, + "learning_rate": 2.0093996497373027e-05, + "loss": 0.9958, + "num_input_tokens_seen": 183727688, + "step": 11418 + }, + { + "epoch": 0.7998809179822602, + "grad_norm": 4.266983509063721, + "learning_rate": 2.0086998248686516e-05, + "loss": 1.301, + "num_input_tokens_seen": 183744072, + "step": 11419 + }, + { + "epoch": 0.7999509662279896, + "grad_norm": 3.485584020614624, + "learning_rate": 2.0079999999999994e-05, + "loss": 0.8097, + "num_input_tokens_seen": 183759712, + "step": 11420 + }, + { + "epoch": 0.8000210144737188, + "grad_norm": 4.297959327697754, + "learning_rate": 2.0073001751313486e-05, + "loss": 1.1424, + "num_input_tokens_seen": 183776096, + "step": 11421 + }, + { + "epoch": 0.800091062719448, + "grad_norm": 3.9543375968933105, + "learning_rate": 2.0066003502626975e-05, + "loss": 1.2436, + "num_input_tokens_seen": 183791768, + "step": 11422 + }, + { + "epoch": 0.8001611109651773, + "grad_norm": 3.55887770652771, + "learning_rate": 2.0059005253940453e-05, + "loss": 0.9818, + "num_input_tokens_seen": 183808152, + "step": 11423 + }, + { + "epoch": 0.8002311592109065, + "grad_norm": 4.114664077758789, + "learning_rate": 2.005200700525394e-05, + "loss": 1.0449, + "num_input_tokens_seen": 183824464, + "step": 11424 + }, + { + "epoch": 0.8003012074566358, + "grad_norm": 3.702018976211548, + "learning_rate": 2.004500875656742e-05, + "loss": 1.0082, + "num_input_tokens_seen": 183840848, + "step": 11425 + }, + { + "epoch": 0.800371255702365, + "grad_norm": 4.891937255859375, + "learning_rate": 2.0038010507880912e-05, + "loss": 1.1321, + "num_input_tokens_seen": 183854384, + "step": 11426 + }, + { + "epoch": 0.8004413039480942, + "grad_norm": 4.264565944671631, + "learning_rate": 2.00310122591944e-05, + "loss": 0.9476, + "num_input_tokens_seen": 183869888, + "step": 11427 + }, + { + "epoch": 0.8005113521938235, + "grad_norm": 4.940980434417725, + "learning_rate": 2.002401401050788e-05, + "loss": 1.0723, + "num_input_tokens_seen": 183885768, + "step": 11428 + }, + { + "epoch": 0.8005814004395527, + "grad_norm": 4.278599739074707, + "learning_rate": 2.0017015761821368e-05, + "loss": 0.9159, + "num_input_tokens_seen": 183902064, + "step": 11429 + }, + { + "epoch": 0.800651448685282, + "grad_norm": 3.145711660385132, + "learning_rate": 2.0010017513134846e-05, + "loss": 0.9637, + "num_input_tokens_seen": 183918448, + "step": 11430 + }, + { + "epoch": 0.8007214969310112, + "grad_norm": 4.543213367462158, + "learning_rate": 2.0003019264448338e-05, + "loss": 1.0782, + "num_input_tokens_seen": 183934792, + "step": 11431 + }, + { + "epoch": 0.8007915451767404, + "grad_norm": 3.8029520511627197, + "learning_rate": 1.9996021015761827e-05, + "loss": 1.0676, + "num_input_tokens_seen": 183951176, + "step": 11432 + }, + { + "epoch": 0.8008615934224698, + "grad_norm": 5.314518928527832, + "learning_rate": 1.9989022767075305e-05, + "loss": 1.2114, + "num_input_tokens_seen": 183967560, + "step": 11433 + }, + { + "epoch": 0.800931641668199, + "grad_norm": 5.387121200561523, + "learning_rate": 1.9982024518388794e-05, + "loss": 1.0875, + "num_input_tokens_seen": 183983272, + "step": 11434 + }, + { + "epoch": 0.8010016899139282, + "grad_norm": 3.7685935497283936, + "learning_rate": 1.9975026269702272e-05, + "loss": 0.9419, + "num_input_tokens_seen": 183999656, + "step": 11435 + }, + { + "epoch": 0.8010717381596575, + "grad_norm": 4.174087047576904, + "learning_rate": 1.9968028021015764e-05, + "loss": 1.1316, + "num_input_tokens_seen": 184016040, + "step": 11436 + }, + { + "epoch": 0.8011417864053867, + "grad_norm": 4.407896995544434, + "learning_rate": 1.9961029772329243e-05, + "loss": 1.0656, + "num_input_tokens_seen": 184032216, + "step": 11437 + }, + { + "epoch": 0.801211834651116, + "grad_norm": 4.854999542236328, + "learning_rate": 1.995403152364273e-05, + "loss": 1.0466, + "num_input_tokens_seen": 184047368, + "step": 11438 + }, + { + "epoch": 0.8012818828968452, + "grad_norm": 6.143575668334961, + "learning_rate": 1.994703327495622e-05, + "loss": 1.1355, + "num_input_tokens_seen": 184063752, + "step": 11439 + }, + { + "epoch": 0.8013519311425744, + "grad_norm": 5.037444114685059, + "learning_rate": 1.9940035026269698e-05, + "loss": 1.0718, + "num_input_tokens_seen": 184079128, + "step": 11440 + }, + { + "epoch": 0.8014219793883037, + "grad_norm": 4.102870941162109, + "learning_rate": 1.993303677758319e-05, + "loss": 1.0682, + "num_input_tokens_seen": 184094640, + "step": 11441 + }, + { + "epoch": 0.8014920276340329, + "grad_norm": 5.17531156539917, + "learning_rate": 1.992603852889667e-05, + "loss": 1.1009, + "num_input_tokens_seen": 184110360, + "step": 11442 + }, + { + "epoch": 0.8015620758797621, + "grad_norm": 5.058488845825195, + "learning_rate": 1.9919040280210157e-05, + "loss": 1.028, + "num_input_tokens_seen": 184126744, + "step": 11443 + }, + { + "epoch": 0.8016321241254915, + "grad_norm": 4.1764302253723145, + "learning_rate": 1.9912042031523646e-05, + "loss": 1.1605, + "num_input_tokens_seen": 184143128, + "step": 11444 + }, + { + "epoch": 0.8017021723712207, + "grad_norm": 3.62280011177063, + "learning_rate": 1.9905043782837124e-05, + "loss": 0.9819, + "num_input_tokens_seen": 184159088, + "step": 11445 + }, + { + "epoch": 0.80177222061695, + "grad_norm": 4.713597774505615, + "learning_rate": 1.9898045534150616e-05, + "loss": 0.9735, + "num_input_tokens_seen": 184174208, + "step": 11446 + }, + { + "epoch": 0.8018422688626792, + "grad_norm": 4.068121910095215, + "learning_rate": 1.9891047285464095e-05, + "loss": 1.1687, + "num_input_tokens_seen": 184190592, + "step": 11447 + }, + { + "epoch": 0.8019123171084084, + "grad_norm": 3.899228572845459, + "learning_rate": 1.9884049036777583e-05, + "loss": 0.9901, + "num_input_tokens_seen": 184206976, + "step": 11448 + }, + { + "epoch": 0.8019823653541377, + "grad_norm": 4.076430320739746, + "learning_rate": 1.9877050788091072e-05, + "loss": 0.917, + "num_input_tokens_seen": 184222432, + "step": 11449 + }, + { + "epoch": 0.8020524135998669, + "grad_norm": 4.201887607574463, + "learning_rate": 1.987005253940455e-05, + "loss": 1.1886, + "num_input_tokens_seen": 184238816, + "step": 11450 + }, + { + "epoch": 0.8021224618455961, + "grad_norm": 4.167854309082031, + "learning_rate": 1.9863054290718042e-05, + "loss": 1.2368, + "num_input_tokens_seen": 184253704, + "step": 11451 + }, + { + "epoch": 0.8021925100913254, + "grad_norm": 4.817135810852051, + "learning_rate": 1.985605604203152e-05, + "loss": 0.984, + "num_input_tokens_seen": 184270088, + "step": 11452 + }, + { + "epoch": 0.8022625583370546, + "grad_norm": 4.3448381423950195, + "learning_rate": 1.984905779334501e-05, + "loss": 0.9543, + "num_input_tokens_seen": 184285256, + "step": 11453 + }, + { + "epoch": 0.8023326065827839, + "grad_norm": 3.9108359813690186, + "learning_rate": 1.9842059544658498e-05, + "loss": 0.9239, + "num_input_tokens_seen": 184301640, + "step": 11454 + }, + { + "epoch": 0.8024026548285131, + "grad_norm": 5.962432384490967, + "learning_rate": 1.9835061295971976e-05, + "loss": 0.939, + "num_input_tokens_seen": 184317344, + "step": 11455 + }, + { + "epoch": 0.8024727030742423, + "grad_norm": 3.2844338417053223, + "learning_rate": 1.9828063047285468e-05, + "loss": 0.938, + "num_input_tokens_seen": 184333728, + "step": 11456 + }, + { + "epoch": 0.8025427513199717, + "grad_norm": 4.240279674530029, + "learning_rate": 1.9821064798598947e-05, + "loss": 0.9471, + "num_input_tokens_seen": 184349992, + "step": 11457 + }, + { + "epoch": 0.8026127995657009, + "grad_norm": 4.206956386566162, + "learning_rate": 1.9814066549912435e-05, + "loss": 0.9991, + "num_input_tokens_seen": 184366104, + "step": 11458 + }, + { + "epoch": 0.8026828478114302, + "grad_norm": 4.9496564865112305, + "learning_rate": 1.9807068301225924e-05, + "loss": 1.27, + "num_input_tokens_seen": 184381992, + "step": 11459 + }, + { + "epoch": 0.8027528960571594, + "grad_norm": 5.337173938751221, + "learning_rate": 1.9800070052539402e-05, + "loss": 0.7588, + "num_input_tokens_seen": 184398224, + "step": 11460 + }, + { + "epoch": 0.8028229443028886, + "grad_norm": 4.094396591186523, + "learning_rate": 1.9793071803852894e-05, + "loss": 1.1127, + "num_input_tokens_seen": 184413640, + "step": 11461 + }, + { + "epoch": 0.8028929925486179, + "grad_norm": 4.155179023742676, + "learning_rate": 1.9786073555166373e-05, + "loss": 1.0206, + "num_input_tokens_seen": 184428920, + "step": 11462 + }, + { + "epoch": 0.8029630407943471, + "grad_norm": 6.654775619506836, + "learning_rate": 1.977907530647986e-05, + "loss": 1.0352, + "num_input_tokens_seen": 184445304, + "step": 11463 + }, + { + "epoch": 0.8030330890400763, + "grad_norm": 3.970658779144287, + "learning_rate": 1.977207705779334e-05, + "loss": 0.928, + "num_input_tokens_seen": 184461632, + "step": 11464 + }, + { + "epoch": 0.8031031372858056, + "grad_norm": 3.5124011039733887, + "learning_rate": 1.9765078809106828e-05, + "loss": 0.9667, + "num_input_tokens_seen": 184477096, + "step": 11465 + }, + { + "epoch": 0.8031731855315348, + "grad_norm": 3.7738683223724365, + "learning_rate": 1.975808056042032e-05, + "loss": 0.8789, + "num_input_tokens_seen": 184493128, + "step": 11466 + }, + { + "epoch": 0.8032432337772641, + "grad_norm": 4.6006245613098145, + "learning_rate": 1.97510823117338e-05, + "loss": 0.9661, + "num_input_tokens_seen": 184509512, + "step": 11467 + }, + { + "epoch": 0.8033132820229933, + "grad_norm": 5.316211700439453, + "learning_rate": 1.9744084063047287e-05, + "loss": 1.0203, + "num_input_tokens_seen": 184525896, + "step": 11468 + }, + { + "epoch": 0.8033833302687226, + "grad_norm": 3.88411021232605, + "learning_rate": 1.9737085814360766e-05, + "loss": 0.9652, + "num_input_tokens_seen": 184542280, + "step": 11469 + }, + { + "epoch": 0.8034533785144519, + "grad_norm": 4.115197658538818, + "learning_rate": 1.9730087565674254e-05, + "loss": 1.2168, + "num_input_tokens_seen": 184558664, + "step": 11470 + }, + { + "epoch": 0.8035234267601811, + "grad_norm": 3.5183017253875732, + "learning_rate": 1.9723089316987746e-05, + "loss": 0.9969, + "num_input_tokens_seen": 184575048, + "step": 11471 + }, + { + "epoch": 0.8035934750059103, + "grad_norm": 4.098026752471924, + "learning_rate": 1.9716091068301225e-05, + "loss": 0.9495, + "num_input_tokens_seen": 184591432, + "step": 11472 + }, + { + "epoch": 0.8036635232516396, + "grad_norm": 4.036252975463867, + "learning_rate": 1.9709092819614713e-05, + "loss": 0.9715, + "num_input_tokens_seen": 184607816, + "step": 11473 + }, + { + "epoch": 0.8037335714973688, + "grad_norm": 4.457969665527344, + "learning_rate": 1.970209457092819e-05, + "loss": 1.0787, + "num_input_tokens_seen": 184623200, + "step": 11474 + }, + { + "epoch": 0.8038036197430981, + "grad_norm": 4.962934494018555, + "learning_rate": 1.969509632224168e-05, + "loss": 0.8373, + "num_input_tokens_seen": 184639584, + "step": 11475 + }, + { + "epoch": 0.8038736679888273, + "grad_norm": 4.178068161010742, + "learning_rate": 1.9688098073555172e-05, + "loss": 1.0388, + "num_input_tokens_seen": 184654016, + "step": 11476 + }, + { + "epoch": 0.8039437162345565, + "grad_norm": 4.237048149108887, + "learning_rate": 1.968109982486865e-05, + "loss": 1.0932, + "num_input_tokens_seen": 184669304, + "step": 11477 + }, + { + "epoch": 0.8040137644802858, + "grad_norm": 4.317728042602539, + "learning_rate": 1.967410157618214e-05, + "loss": 0.8882, + "num_input_tokens_seen": 184685688, + "step": 11478 + }, + { + "epoch": 0.804083812726015, + "grad_norm": 3.8639659881591797, + "learning_rate": 1.9667103327495618e-05, + "loss": 1.0501, + "num_input_tokens_seen": 184701240, + "step": 11479 + }, + { + "epoch": 0.8041538609717442, + "grad_norm": 3.706894636154175, + "learning_rate": 1.9660105078809106e-05, + "loss": 0.9115, + "num_input_tokens_seen": 184717624, + "step": 11480 + }, + { + "epoch": 0.8042239092174736, + "grad_norm": 4.042561054229736, + "learning_rate": 1.9653106830122598e-05, + "loss": 0.9648, + "num_input_tokens_seen": 184733080, + "step": 11481 + }, + { + "epoch": 0.8042939574632028, + "grad_norm": 3.9040603637695312, + "learning_rate": 1.9646108581436077e-05, + "loss": 1.0351, + "num_input_tokens_seen": 184749328, + "step": 11482 + }, + { + "epoch": 0.8043640057089321, + "grad_norm": 3.5777230262756348, + "learning_rate": 1.9639110332749565e-05, + "loss": 1.0129, + "num_input_tokens_seen": 184765712, + "step": 11483 + }, + { + "epoch": 0.8044340539546613, + "grad_norm": 5.216578006744385, + "learning_rate": 1.9632112084063044e-05, + "loss": 1.0877, + "num_input_tokens_seen": 184781136, + "step": 11484 + }, + { + "epoch": 0.8045041022003905, + "grad_norm": 4.107327938079834, + "learning_rate": 1.9625113835376532e-05, + "loss": 0.9783, + "num_input_tokens_seen": 184797520, + "step": 11485 + }, + { + "epoch": 0.8045741504461198, + "grad_norm": 4.078060150146484, + "learning_rate": 1.961811558669002e-05, + "loss": 0.7735, + "num_input_tokens_seen": 184813760, + "step": 11486 + }, + { + "epoch": 0.804644198691849, + "grad_norm": 5.1282854080200195, + "learning_rate": 1.9611117338003503e-05, + "loss": 1.0743, + "num_input_tokens_seen": 184830144, + "step": 11487 + }, + { + "epoch": 0.8047142469375782, + "grad_norm": 4.975650787353516, + "learning_rate": 1.960411908931699e-05, + "loss": 1.2774, + "num_input_tokens_seen": 184844976, + "step": 11488 + }, + { + "epoch": 0.8047842951833075, + "grad_norm": 3.707805633544922, + "learning_rate": 1.959712084063047e-05, + "loss": 0.9877, + "num_input_tokens_seen": 184861360, + "step": 11489 + }, + { + "epoch": 0.8048543434290367, + "grad_norm": 4.688051223754883, + "learning_rate": 1.9590122591943958e-05, + "loss": 0.9775, + "num_input_tokens_seen": 184877744, + "step": 11490 + }, + { + "epoch": 0.804924391674766, + "grad_norm": 4.16152811050415, + "learning_rate": 1.9583124343257437e-05, + "loss": 1.3, + "num_input_tokens_seen": 184894128, + "step": 11491 + }, + { + "epoch": 0.8049944399204952, + "grad_norm": 4.152731895446777, + "learning_rate": 1.957612609457093e-05, + "loss": 0.9459, + "num_input_tokens_seen": 184910512, + "step": 11492 + }, + { + "epoch": 0.8050644881662244, + "grad_norm": 3.59941029548645, + "learning_rate": 1.9569127845884417e-05, + "loss": 1.0212, + "num_input_tokens_seen": 184926608, + "step": 11493 + }, + { + "epoch": 0.8051345364119538, + "grad_norm": 4.880187034606934, + "learning_rate": 1.9562129597197896e-05, + "loss": 1.0242, + "num_input_tokens_seen": 184942992, + "step": 11494 + }, + { + "epoch": 0.805204584657683, + "grad_norm": 4.694171905517578, + "learning_rate": 1.9555131348511384e-05, + "loss": 1.2477, + "num_input_tokens_seen": 184959376, + "step": 11495 + }, + { + "epoch": 0.8052746329034123, + "grad_norm": 4.3750691413879395, + "learning_rate": 1.9548133099824863e-05, + "loss": 1.0289, + "num_input_tokens_seen": 184975760, + "step": 11496 + }, + { + "epoch": 0.8053446811491415, + "grad_norm": 4.216646671295166, + "learning_rate": 1.9541134851138355e-05, + "loss": 0.9319, + "num_input_tokens_seen": 184992048, + "step": 11497 + }, + { + "epoch": 0.8054147293948707, + "grad_norm": 4.401334285736084, + "learning_rate": 1.9534136602451843e-05, + "loss": 1.1897, + "num_input_tokens_seen": 185008232, + "step": 11498 + }, + { + "epoch": 0.8054847776406, + "grad_norm": 4.8988800048828125, + "learning_rate": 1.952713835376532e-05, + "loss": 0.9003, + "num_input_tokens_seen": 185024256, + "step": 11499 + }, + { + "epoch": 0.8055548258863292, + "grad_norm": 4.154270172119141, + "learning_rate": 1.952014010507881e-05, + "loss": 0.8674, + "num_input_tokens_seen": 185038400, + "step": 11500 + }, + { + "epoch": 0.8056248741320584, + "grad_norm": 5.64815092086792, + "learning_rate": 1.951314185639229e-05, + "loss": 1.0587, + "num_input_tokens_seen": 185054784, + "step": 11501 + }, + { + "epoch": 0.8056949223777877, + "grad_norm": 3.357783794403076, + "learning_rate": 1.950614360770578e-05, + "loss": 0.9255, + "num_input_tokens_seen": 185071168, + "step": 11502 + }, + { + "epoch": 0.8057649706235169, + "grad_norm": 3.8376402854919434, + "learning_rate": 1.949914535901927e-05, + "loss": 1.0896, + "num_input_tokens_seen": 185087552, + "step": 11503 + }, + { + "epoch": 0.8058350188692462, + "grad_norm": 3.7782747745513916, + "learning_rate": 1.9492147110332748e-05, + "loss": 0.9993, + "num_input_tokens_seen": 185103488, + "step": 11504 + }, + { + "epoch": 0.8059050671149754, + "grad_norm": 4.2498297691345215, + "learning_rate": 1.9485148861646236e-05, + "loss": 0.9829, + "num_input_tokens_seen": 185119872, + "step": 11505 + }, + { + "epoch": 0.8059751153607047, + "grad_norm": 5.0415120124816895, + "learning_rate": 1.9478150612959715e-05, + "loss": 1.0292, + "num_input_tokens_seen": 185135232, + "step": 11506 + }, + { + "epoch": 0.806045163606434, + "grad_norm": 3.5486340522766113, + "learning_rate": 1.9471152364273203e-05, + "loss": 1.1185, + "num_input_tokens_seen": 185151616, + "step": 11507 + }, + { + "epoch": 0.8061152118521632, + "grad_norm": 3.7403719425201416, + "learning_rate": 1.9464154115586695e-05, + "loss": 0.9136, + "num_input_tokens_seen": 185168000, + "step": 11508 + }, + { + "epoch": 0.8061852600978924, + "grad_norm": 4.8431196212768555, + "learning_rate": 1.9457155866900174e-05, + "loss": 1.0521, + "num_input_tokens_seen": 185183552, + "step": 11509 + }, + { + "epoch": 0.8062553083436217, + "grad_norm": 4.695057392120361, + "learning_rate": 1.9450157618213662e-05, + "loss": 0.9651, + "num_input_tokens_seen": 185199936, + "step": 11510 + }, + { + "epoch": 0.8063253565893509, + "grad_norm": 3.9562768936157227, + "learning_rate": 1.944315936952714e-05, + "loss": 0.918, + "num_input_tokens_seen": 185216304, + "step": 11511 + }, + { + "epoch": 0.8063954048350802, + "grad_norm": 3.8799333572387695, + "learning_rate": 1.943616112084063e-05, + "loss": 1.0556, + "num_input_tokens_seen": 185232688, + "step": 11512 + }, + { + "epoch": 0.8064654530808094, + "grad_norm": 3.8644797801971436, + "learning_rate": 1.942916287215412e-05, + "loss": 0.9286, + "num_input_tokens_seen": 185248400, + "step": 11513 + }, + { + "epoch": 0.8065355013265386, + "grad_norm": 4.465839862823486, + "learning_rate": 1.94221646234676e-05, + "loss": 1.2447, + "num_input_tokens_seen": 185264784, + "step": 11514 + }, + { + "epoch": 0.8066055495722679, + "grad_norm": 4.064611911773682, + "learning_rate": 1.9415166374781088e-05, + "loss": 0.9601, + "num_input_tokens_seen": 185281032, + "step": 11515 + }, + { + "epoch": 0.8066755978179971, + "grad_norm": 4.350561618804932, + "learning_rate": 1.9408168126094567e-05, + "loss": 1.2507, + "num_input_tokens_seen": 185297352, + "step": 11516 + }, + { + "epoch": 0.8067456460637263, + "grad_norm": 4.059119701385498, + "learning_rate": 1.9401169877408055e-05, + "loss": 1.1797, + "num_input_tokens_seen": 185313440, + "step": 11517 + }, + { + "epoch": 0.8068156943094557, + "grad_norm": 4.219115734100342, + "learning_rate": 1.9394171628721537e-05, + "loss": 1.2917, + "num_input_tokens_seen": 185329656, + "step": 11518 + }, + { + "epoch": 0.8068857425551849, + "grad_norm": 3.9714906215667725, + "learning_rate": 1.9387173380035026e-05, + "loss": 1.1872, + "num_input_tokens_seen": 185345504, + "step": 11519 + }, + { + "epoch": 0.8069557908009142, + "grad_norm": 4.30182409286499, + "learning_rate": 1.9380175131348514e-05, + "loss": 1.3235, + "num_input_tokens_seen": 185361888, + "step": 11520 + }, + { + "epoch": 0.8070258390466434, + "grad_norm": 5.095610618591309, + "learning_rate": 1.9373176882661993e-05, + "loss": 1.0579, + "num_input_tokens_seen": 185377880, + "step": 11521 + }, + { + "epoch": 0.8070958872923726, + "grad_norm": 4.631853103637695, + "learning_rate": 1.936617863397548e-05, + "loss": 0.9431, + "num_input_tokens_seen": 185394264, + "step": 11522 + }, + { + "epoch": 0.8071659355381019, + "grad_norm": 4.1386823654174805, + "learning_rate": 1.9359180385288963e-05, + "loss": 1.1736, + "num_input_tokens_seen": 185409720, + "step": 11523 + }, + { + "epoch": 0.8072359837838311, + "grad_norm": 4.174017906188965, + "learning_rate": 1.9352182136602452e-05, + "loss": 1.1511, + "num_input_tokens_seen": 185425736, + "step": 11524 + }, + { + "epoch": 0.8073060320295603, + "grad_norm": 4.989072799682617, + "learning_rate": 1.934518388791594e-05, + "loss": 1.0341, + "num_input_tokens_seen": 185440880, + "step": 11525 + }, + { + "epoch": 0.8073760802752896, + "grad_norm": 4.275994300842285, + "learning_rate": 1.933818563922942e-05, + "loss": 1.0526, + "num_input_tokens_seen": 185456848, + "step": 11526 + }, + { + "epoch": 0.8074461285210188, + "grad_norm": 3.855053424835205, + "learning_rate": 1.9331187390542907e-05, + "loss": 1.1084, + "num_input_tokens_seen": 185473232, + "step": 11527 + }, + { + "epoch": 0.8075161767667481, + "grad_norm": 3.664043426513672, + "learning_rate": 1.9324189141856386e-05, + "loss": 1.0079, + "num_input_tokens_seen": 185489616, + "step": 11528 + }, + { + "epoch": 0.8075862250124773, + "grad_norm": 3.4673595428466797, + "learning_rate": 1.9317190893169878e-05, + "loss": 0.8547, + "num_input_tokens_seen": 185505888, + "step": 11529 + }, + { + "epoch": 0.8076562732582065, + "grad_norm": 3.925265073776245, + "learning_rate": 1.9310192644483366e-05, + "loss": 0.998, + "num_input_tokens_seen": 185522008, + "step": 11530 + }, + { + "epoch": 0.8077263215039359, + "grad_norm": 3.453542947769165, + "learning_rate": 1.9303194395796845e-05, + "loss": 0.8891, + "num_input_tokens_seen": 185538392, + "step": 11531 + }, + { + "epoch": 0.8077963697496651, + "grad_norm": 5.519572734832764, + "learning_rate": 1.9296196147110333e-05, + "loss": 1.0237, + "num_input_tokens_seen": 185554104, + "step": 11532 + }, + { + "epoch": 0.8078664179953944, + "grad_norm": 3.715970039367676, + "learning_rate": 1.9289197898423812e-05, + "loss": 1.1476, + "num_input_tokens_seen": 185570488, + "step": 11533 + }, + { + "epoch": 0.8079364662411236, + "grad_norm": 4.149783611297607, + "learning_rate": 1.9282199649737304e-05, + "loss": 1.0401, + "num_input_tokens_seen": 185586448, + "step": 11534 + }, + { + "epoch": 0.8080065144868528, + "grad_norm": 3.4546701908111572, + "learning_rate": 1.9275201401050792e-05, + "loss": 0.9869, + "num_input_tokens_seen": 185602304, + "step": 11535 + }, + { + "epoch": 0.8080765627325821, + "grad_norm": 3.9010703563690186, + "learning_rate": 1.926820315236427e-05, + "loss": 0.9941, + "num_input_tokens_seen": 185618568, + "step": 11536 + }, + { + "epoch": 0.8081466109783113, + "grad_norm": 4.250125885009766, + "learning_rate": 1.926120490367776e-05, + "loss": 1.21, + "num_input_tokens_seen": 185634952, + "step": 11537 + }, + { + "epoch": 0.8082166592240405, + "grad_norm": 3.5510425567626953, + "learning_rate": 1.9254206654991238e-05, + "loss": 0.9462, + "num_input_tokens_seen": 185650952, + "step": 11538 + }, + { + "epoch": 0.8082867074697698, + "grad_norm": 6.548225402832031, + "learning_rate": 1.924720840630473e-05, + "loss": 1.0745, + "num_input_tokens_seen": 185667336, + "step": 11539 + }, + { + "epoch": 0.808356755715499, + "grad_norm": 3.303722381591797, + "learning_rate": 1.924021015761822e-05, + "loss": 0.9585, + "num_input_tokens_seen": 185683720, + "step": 11540 + }, + { + "epoch": 0.8084268039612283, + "grad_norm": 5.6701741218566895, + "learning_rate": 1.9233211908931697e-05, + "loss": 1.1548, + "num_input_tokens_seen": 185700104, + "step": 11541 + }, + { + "epoch": 0.8084968522069576, + "grad_norm": 3.961703300476074, + "learning_rate": 1.9226213660245185e-05, + "loss": 1.0375, + "num_input_tokens_seen": 185716488, + "step": 11542 + }, + { + "epoch": 0.8085669004526868, + "grad_norm": 3.6580686569213867, + "learning_rate": 1.9219215411558664e-05, + "loss": 0.914, + "num_input_tokens_seen": 185732872, + "step": 11543 + }, + { + "epoch": 0.8086369486984161, + "grad_norm": 3.9027557373046875, + "learning_rate": 1.9212217162872156e-05, + "loss": 1.0027, + "num_input_tokens_seen": 185749256, + "step": 11544 + }, + { + "epoch": 0.8087069969441453, + "grad_norm": 3.9417309761047363, + "learning_rate": 1.9205218914185634e-05, + "loss": 0.9105, + "num_input_tokens_seen": 185765640, + "step": 11545 + }, + { + "epoch": 0.8087770451898745, + "grad_norm": 4.9320526123046875, + "learning_rate": 1.9198220665499123e-05, + "loss": 0.9742, + "num_input_tokens_seen": 185782024, + "step": 11546 + }, + { + "epoch": 0.8088470934356038, + "grad_norm": 4.412097454071045, + "learning_rate": 1.919122241681261e-05, + "loss": 1.0199, + "num_input_tokens_seen": 185797304, + "step": 11547 + }, + { + "epoch": 0.808917141681333, + "grad_norm": 3.272860527038574, + "learning_rate": 1.918422416812609e-05, + "loss": 0.8922, + "num_input_tokens_seen": 185813568, + "step": 11548 + }, + { + "epoch": 0.8089871899270623, + "grad_norm": 3.419759511947632, + "learning_rate": 1.9177225919439582e-05, + "loss": 0.8485, + "num_input_tokens_seen": 185829952, + "step": 11549 + }, + { + "epoch": 0.8090572381727915, + "grad_norm": 3.611455202102661, + "learning_rate": 1.917022767075306e-05, + "loss": 1.1203, + "num_input_tokens_seen": 185846336, + "step": 11550 + }, + { + "epoch": 0.8091272864185207, + "grad_norm": 3.9398574829101562, + "learning_rate": 1.916322942206655e-05, + "loss": 1.0787, + "num_input_tokens_seen": 185862160, + "step": 11551 + }, + { + "epoch": 0.80919733466425, + "grad_norm": 5.035219669342041, + "learning_rate": 1.9156231173380037e-05, + "loss": 1.1416, + "num_input_tokens_seen": 185878544, + "step": 11552 + }, + { + "epoch": 0.8092673829099792, + "grad_norm": 3.5867502689361572, + "learning_rate": 1.9149232924693516e-05, + "loss": 0.982, + "num_input_tokens_seen": 185894472, + "step": 11553 + }, + { + "epoch": 0.8093374311557084, + "grad_norm": 4.685681343078613, + "learning_rate": 1.9142234676007008e-05, + "loss": 1.3207, + "num_input_tokens_seen": 185910856, + "step": 11554 + }, + { + "epoch": 0.8094074794014378, + "grad_norm": 4.472120761871338, + "learning_rate": 1.9135236427320486e-05, + "loss": 0.9926, + "num_input_tokens_seen": 185927240, + "step": 11555 + }, + { + "epoch": 0.809477527647167, + "grad_norm": 3.576550245285034, + "learning_rate": 1.9128238178633975e-05, + "loss": 1.0098, + "num_input_tokens_seen": 185943624, + "step": 11556 + }, + { + "epoch": 0.8095475758928963, + "grad_norm": 4.344847202301025, + "learning_rate": 1.9121239929947463e-05, + "loss": 0.8569, + "num_input_tokens_seen": 185958592, + "step": 11557 + }, + { + "epoch": 0.8096176241386255, + "grad_norm": 4.8098530769348145, + "learning_rate": 1.9114241681260942e-05, + "loss": 0.8478, + "num_input_tokens_seen": 185974080, + "step": 11558 + }, + { + "epoch": 0.8096876723843547, + "grad_norm": 3.6215052604675293, + "learning_rate": 1.9107243432574434e-05, + "loss": 1.0956, + "num_input_tokens_seen": 185989976, + "step": 11559 + }, + { + "epoch": 0.809757720630084, + "grad_norm": 4.27924108505249, + "learning_rate": 1.9100245183887912e-05, + "loss": 1.169, + "num_input_tokens_seen": 186006256, + "step": 11560 + }, + { + "epoch": 0.8098277688758132, + "grad_norm": 3.782261371612549, + "learning_rate": 1.90932469352014e-05, + "loss": 1.0456, + "num_input_tokens_seen": 186022640, + "step": 11561 + }, + { + "epoch": 0.8098978171215424, + "grad_norm": 5.486629962921143, + "learning_rate": 1.908624868651489e-05, + "loss": 1.0469, + "num_input_tokens_seen": 186037512, + "step": 11562 + }, + { + "epoch": 0.8099678653672717, + "grad_norm": 3.929108142852783, + "learning_rate": 1.9079250437828368e-05, + "loss": 1.0238, + "num_input_tokens_seen": 186052680, + "step": 11563 + }, + { + "epoch": 0.8100379136130009, + "grad_norm": 4.256520748138428, + "learning_rate": 1.907225218914186e-05, + "loss": 1.2036, + "num_input_tokens_seen": 186069064, + "step": 11564 + }, + { + "epoch": 0.8101079618587302, + "grad_norm": 3.6261706352233887, + "learning_rate": 1.9065253940455338e-05, + "loss": 0.967, + "num_input_tokens_seen": 186085280, + "step": 11565 + }, + { + "epoch": 0.8101780101044594, + "grad_norm": 5.871255397796631, + "learning_rate": 1.9058255691768827e-05, + "loss": 0.9968, + "num_input_tokens_seen": 186101664, + "step": 11566 + }, + { + "epoch": 0.8102480583501886, + "grad_norm": 3.915750741958618, + "learning_rate": 1.9051257443082305e-05, + "loss": 0.9608, + "num_input_tokens_seen": 186117872, + "step": 11567 + }, + { + "epoch": 0.810318106595918, + "grad_norm": 3.4307005405426025, + "learning_rate": 1.9044259194395794e-05, + "loss": 0.954, + "num_input_tokens_seen": 186133368, + "step": 11568 + }, + { + "epoch": 0.8103881548416472, + "grad_norm": 3.975552797317505, + "learning_rate": 1.9037260945709286e-05, + "loss": 1.0483, + "num_input_tokens_seen": 186148536, + "step": 11569 + }, + { + "epoch": 0.8104582030873765, + "grad_norm": 4.064229965209961, + "learning_rate": 1.9030262697022764e-05, + "loss": 0.7718, + "num_input_tokens_seen": 186164920, + "step": 11570 + }, + { + "epoch": 0.8105282513331057, + "grad_norm": 3.793003797531128, + "learning_rate": 1.9023264448336253e-05, + "loss": 0.9202, + "num_input_tokens_seen": 186179960, + "step": 11571 + }, + { + "epoch": 0.8105982995788349, + "grad_norm": 4.839306831359863, + "learning_rate": 1.901626619964973e-05, + "loss": 1.0283, + "num_input_tokens_seen": 186196344, + "step": 11572 + }, + { + "epoch": 0.8106683478245642, + "grad_norm": 4.589587211608887, + "learning_rate": 1.900926795096322e-05, + "loss": 1.1193, + "num_input_tokens_seen": 186212080, + "step": 11573 + }, + { + "epoch": 0.8107383960702934, + "grad_norm": 4.157476902008057, + "learning_rate": 1.9002269702276712e-05, + "loss": 1.1751, + "num_input_tokens_seen": 186228464, + "step": 11574 + }, + { + "epoch": 0.8108084443160226, + "grad_norm": 3.811147689819336, + "learning_rate": 1.899527145359019e-05, + "loss": 0.9843, + "num_input_tokens_seen": 186244544, + "step": 11575 + }, + { + "epoch": 0.8108784925617519, + "grad_norm": 3.995877265930176, + "learning_rate": 1.898827320490368e-05, + "loss": 0.8695, + "num_input_tokens_seen": 186260824, + "step": 11576 + }, + { + "epoch": 0.8109485408074811, + "grad_norm": 6.229583263397217, + "learning_rate": 1.8981274956217157e-05, + "loss": 1.1362, + "num_input_tokens_seen": 186277208, + "step": 11577 + }, + { + "epoch": 0.8110185890532104, + "grad_norm": 3.8377645015716553, + "learning_rate": 1.8974276707530646e-05, + "loss": 0.9914, + "num_input_tokens_seen": 186292992, + "step": 11578 + }, + { + "epoch": 0.8110886372989397, + "grad_norm": 3.8437838554382324, + "learning_rate": 1.8967278458844138e-05, + "loss": 0.9425, + "num_input_tokens_seen": 186309376, + "step": 11579 + }, + { + "epoch": 0.8111586855446689, + "grad_norm": 4.816789150238037, + "learning_rate": 1.8960280210157616e-05, + "loss": 1.0669, + "num_input_tokens_seen": 186325760, + "step": 11580 + }, + { + "epoch": 0.8112287337903982, + "grad_norm": 3.576373815536499, + "learning_rate": 1.8953281961471105e-05, + "loss": 0.9834, + "num_input_tokens_seen": 186342144, + "step": 11581 + }, + { + "epoch": 0.8112987820361274, + "grad_norm": 5.0766143798828125, + "learning_rate": 1.8946283712784583e-05, + "loss": 1.0795, + "num_input_tokens_seen": 186358360, + "step": 11582 + }, + { + "epoch": 0.8113688302818566, + "grad_norm": 3.941556930541992, + "learning_rate": 1.8939285464098072e-05, + "loss": 1.2131, + "num_input_tokens_seen": 186374744, + "step": 11583 + }, + { + "epoch": 0.8114388785275859, + "grad_norm": 5.125598430633545, + "learning_rate": 1.8932287215411564e-05, + "loss": 1.0832, + "num_input_tokens_seen": 186390080, + "step": 11584 + }, + { + "epoch": 0.8115089267733151, + "grad_norm": 3.547121524810791, + "learning_rate": 1.8925288966725042e-05, + "loss": 1.1636, + "num_input_tokens_seen": 186406464, + "step": 11585 + }, + { + "epoch": 0.8115789750190444, + "grad_norm": 4.017105579376221, + "learning_rate": 1.891829071803853e-05, + "loss": 1.2736, + "num_input_tokens_seen": 186422648, + "step": 11586 + }, + { + "epoch": 0.8116490232647736, + "grad_norm": 3.9220118522644043, + "learning_rate": 1.891129246935201e-05, + "loss": 1.0321, + "num_input_tokens_seen": 186438760, + "step": 11587 + }, + { + "epoch": 0.8117190715105028, + "grad_norm": 4.503072738647461, + "learning_rate": 1.8904294220665498e-05, + "loss": 0.9686, + "num_input_tokens_seen": 186454160, + "step": 11588 + }, + { + "epoch": 0.8117891197562321, + "grad_norm": 5.073387145996094, + "learning_rate": 1.889729597197899e-05, + "loss": 1.03, + "num_input_tokens_seen": 186470544, + "step": 11589 + }, + { + "epoch": 0.8118591680019613, + "grad_norm": 3.558924436569214, + "learning_rate": 1.8890297723292468e-05, + "loss": 1.0681, + "num_input_tokens_seen": 186486928, + "step": 11590 + }, + { + "epoch": 0.8119292162476905, + "grad_norm": 3.542537212371826, + "learning_rate": 1.8883299474605957e-05, + "loss": 1.0214, + "num_input_tokens_seen": 186502192, + "step": 11591 + }, + { + "epoch": 0.8119992644934199, + "grad_norm": 3.936140537261963, + "learning_rate": 1.8876301225919435e-05, + "loss": 1.0707, + "num_input_tokens_seen": 186517560, + "step": 11592 + }, + { + "epoch": 0.8120693127391491, + "grad_norm": 3.787940502166748, + "learning_rate": 1.8869302977232924e-05, + "loss": 1.0093, + "num_input_tokens_seen": 186533904, + "step": 11593 + }, + { + "epoch": 0.8121393609848784, + "grad_norm": 4.023949146270752, + "learning_rate": 1.8862304728546402e-05, + "loss": 0.9266, + "num_input_tokens_seen": 186548816, + "step": 11594 + }, + { + "epoch": 0.8122094092306076, + "grad_norm": 3.503028631210327, + "learning_rate": 1.8855306479859894e-05, + "loss": 0.9323, + "num_input_tokens_seen": 186565088, + "step": 11595 + }, + { + "epoch": 0.8122794574763368, + "grad_norm": 3.906827211380005, + "learning_rate": 1.8848308231173383e-05, + "loss": 1.0543, + "num_input_tokens_seen": 186580592, + "step": 11596 + }, + { + "epoch": 0.8123495057220661, + "grad_norm": 3.77956223487854, + "learning_rate": 1.884130998248686e-05, + "loss": 1.1504, + "num_input_tokens_seen": 186596976, + "step": 11597 + }, + { + "epoch": 0.8124195539677953, + "grad_norm": 4.487522125244141, + "learning_rate": 1.883431173380035e-05, + "loss": 1.1205, + "num_input_tokens_seen": 186612224, + "step": 11598 + }, + { + "epoch": 0.8124896022135246, + "grad_norm": 4.476914405822754, + "learning_rate": 1.882731348511383e-05, + "loss": 1.1318, + "num_input_tokens_seen": 186628608, + "step": 11599 + }, + { + "epoch": 0.8125596504592538, + "grad_norm": 3.6299712657928467, + "learning_rate": 1.882031523642732e-05, + "loss": 0.9369, + "num_input_tokens_seen": 186644216, + "step": 11600 + }, + { + "epoch": 0.8125596504592538, + "eval_loss": 1.1162611246109009, + "eval_runtime": 0.1843, + "eval_samples_per_second": 5.426, + "eval_steps_per_second": 5.426, + "num_input_tokens_seen": 186644216, + "step": 11600 + }, + { + "epoch": 0.812629698704983, + "grad_norm": 3.3051514625549316, + "learning_rate": 1.881331698774081e-05, + "loss": 0.9698, + "num_input_tokens_seen": 186660408, + "step": 11601 + }, + { + "epoch": 0.8126997469507123, + "grad_norm": 4.919456958770752, + "learning_rate": 1.8806318739054287e-05, + "loss": 1.1144, + "num_input_tokens_seen": 186676792, + "step": 11602 + }, + { + "epoch": 0.8127697951964415, + "grad_norm": 3.576235055923462, + "learning_rate": 1.8799320490367776e-05, + "loss": 0.9383, + "num_input_tokens_seen": 186692408, + "step": 11603 + }, + { + "epoch": 0.8128398434421708, + "grad_norm": 3.534600257873535, + "learning_rate": 1.8792322241681254e-05, + "loss": 1.1306, + "num_input_tokens_seen": 186708600, + "step": 11604 + }, + { + "epoch": 0.8129098916879001, + "grad_norm": 3.861203908920288, + "learning_rate": 1.8785323992994746e-05, + "loss": 1.0438, + "num_input_tokens_seen": 186724984, + "step": 11605 + }, + { + "epoch": 0.8129799399336293, + "grad_norm": 3.960965633392334, + "learning_rate": 1.8778325744308235e-05, + "loss": 0.9445, + "num_input_tokens_seen": 186740616, + "step": 11606 + }, + { + "epoch": 0.8130499881793586, + "grad_norm": 6.401646614074707, + "learning_rate": 1.8771327495621713e-05, + "loss": 1.2054, + "num_input_tokens_seen": 186757000, + "step": 11607 + }, + { + "epoch": 0.8131200364250878, + "grad_norm": 3.776820182800293, + "learning_rate": 1.8764329246935202e-05, + "loss": 0.9283, + "num_input_tokens_seen": 186773000, + "step": 11608 + }, + { + "epoch": 0.813190084670817, + "grad_norm": 3.495617151260376, + "learning_rate": 1.875733099824868e-05, + "loss": 0.8386, + "num_input_tokens_seen": 186789384, + "step": 11609 + }, + { + "epoch": 0.8132601329165463, + "grad_norm": 3.96913743019104, + "learning_rate": 1.8750332749562172e-05, + "loss": 0.9881, + "num_input_tokens_seen": 186805504, + "step": 11610 + }, + { + "epoch": 0.8133301811622755, + "grad_norm": 3.623413324356079, + "learning_rate": 1.874333450087566e-05, + "loss": 0.9329, + "num_input_tokens_seen": 186821888, + "step": 11611 + }, + { + "epoch": 0.8134002294080047, + "grad_norm": 3.4724960327148438, + "learning_rate": 1.873633625218914e-05, + "loss": 0.9532, + "num_input_tokens_seen": 186838040, + "step": 11612 + }, + { + "epoch": 0.813470277653734, + "grad_norm": 4.430090427398682, + "learning_rate": 1.8729338003502628e-05, + "loss": 1.1666, + "num_input_tokens_seen": 186854424, + "step": 11613 + }, + { + "epoch": 0.8135403258994632, + "grad_norm": 3.7596726417541504, + "learning_rate": 1.8722339754816106e-05, + "loss": 0.9887, + "num_input_tokens_seen": 186870184, + "step": 11614 + }, + { + "epoch": 0.8136103741451925, + "grad_norm": 4.829859256744385, + "learning_rate": 1.87153415061296e-05, + "loss": 1.1878, + "num_input_tokens_seen": 186886472, + "step": 11615 + }, + { + "epoch": 0.8136804223909218, + "grad_norm": 3.659405469894409, + "learning_rate": 1.8708343257443087e-05, + "loss": 0.9409, + "num_input_tokens_seen": 186902368, + "step": 11616 + }, + { + "epoch": 0.813750470636651, + "grad_norm": 3.529557943344116, + "learning_rate": 1.8701345008756565e-05, + "loss": 1.0813, + "num_input_tokens_seen": 186917832, + "step": 11617 + }, + { + "epoch": 0.8138205188823803, + "grad_norm": 4.123054027557373, + "learning_rate": 1.8694346760070054e-05, + "loss": 0.934, + "num_input_tokens_seen": 186934216, + "step": 11618 + }, + { + "epoch": 0.8138905671281095, + "grad_norm": 4.424140453338623, + "learning_rate": 1.8687348511383532e-05, + "loss": 1.1555, + "num_input_tokens_seen": 186950600, + "step": 11619 + }, + { + "epoch": 0.8139606153738387, + "grad_norm": 4.446724891662598, + "learning_rate": 1.8680350262697024e-05, + "loss": 1.1396, + "num_input_tokens_seen": 186966984, + "step": 11620 + }, + { + "epoch": 0.814030663619568, + "grad_norm": 4.260793209075928, + "learning_rate": 1.8673352014010503e-05, + "loss": 1.013, + "num_input_tokens_seen": 186983368, + "step": 11621 + }, + { + "epoch": 0.8141007118652972, + "grad_norm": 5.402695178985596, + "learning_rate": 1.866635376532399e-05, + "loss": 1.0683, + "num_input_tokens_seen": 186998512, + "step": 11622 + }, + { + "epoch": 0.8141707601110265, + "grad_norm": 3.7450551986694336, + "learning_rate": 1.865935551663748e-05, + "loss": 1.0034, + "num_input_tokens_seen": 187014712, + "step": 11623 + }, + { + "epoch": 0.8142408083567557, + "grad_norm": 4.085254192352295, + "learning_rate": 1.865235726795096e-05, + "loss": 1.173, + "num_input_tokens_seen": 187030280, + "step": 11624 + }, + { + "epoch": 0.8143108566024849, + "grad_norm": 6.785838603973389, + "learning_rate": 1.864535901926445e-05, + "loss": 1.052, + "num_input_tokens_seen": 187046664, + "step": 11625 + }, + { + "epoch": 0.8143809048482142, + "grad_norm": 5.173651218414307, + "learning_rate": 1.863836077057793e-05, + "loss": 1.0256, + "num_input_tokens_seen": 187062096, + "step": 11626 + }, + { + "epoch": 0.8144509530939434, + "grad_norm": 4.474207878112793, + "learning_rate": 1.8631362521891417e-05, + "loss": 1.2382, + "num_input_tokens_seen": 187078480, + "step": 11627 + }, + { + "epoch": 0.8145210013396726, + "grad_norm": 3.548935651779175, + "learning_rate": 1.8624364273204906e-05, + "loss": 0.9708, + "num_input_tokens_seen": 187094216, + "step": 11628 + }, + { + "epoch": 0.814591049585402, + "grad_norm": 4.11301326751709, + "learning_rate": 1.8617366024518384e-05, + "loss": 1.1035, + "num_input_tokens_seen": 187110600, + "step": 11629 + }, + { + "epoch": 0.8146610978311312, + "grad_norm": 5.873447895050049, + "learning_rate": 1.8610367775831876e-05, + "loss": 0.9038, + "num_input_tokens_seen": 187126600, + "step": 11630 + }, + { + "epoch": 0.8147311460768605, + "grad_norm": 3.7859785556793213, + "learning_rate": 1.8603369527145355e-05, + "loss": 0.9204, + "num_input_tokens_seen": 187142672, + "step": 11631 + }, + { + "epoch": 0.8148011943225897, + "grad_norm": 4.169020652770996, + "learning_rate": 1.8596371278458843e-05, + "loss": 0.9332, + "num_input_tokens_seen": 187159056, + "step": 11632 + }, + { + "epoch": 0.8148712425683189, + "grad_norm": 5.8481035232543945, + "learning_rate": 1.8589373029772332e-05, + "loss": 0.9878, + "num_input_tokens_seen": 187174192, + "step": 11633 + }, + { + "epoch": 0.8149412908140482, + "grad_norm": 4.064828395843506, + "learning_rate": 1.858237478108581e-05, + "loss": 0.922, + "num_input_tokens_seen": 187190576, + "step": 11634 + }, + { + "epoch": 0.8150113390597774, + "grad_norm": 3.809098482131958, + "learning_rate": 1.8575376532399302e-05, + "loss": 1.0593, + "num_input_tokens_seen": 187206960, + "step": 11635 + }, + { + "epoch": 0.8150813873055067, + "grad_norm": 3.716153860092163, + "learning_rate": 1.856837828371278e-05, + "loss": 1.0253, + "num_input_tokens_seen": 187223344, + "step": 11636 + }, + { + "epoch": 0.8151514355512359, + "grad_norm": 4.149001598358154, + "learning_rate": 1.856138003502627e-05, + "loss": 1.0996, + "num_input_tokens_seen": 187239136, + "step": 11637 + }, + { + "epoch": 0.8152214837969651, + "grad_norm": 3.642832040786743, + "learning_rate": 1.8554381786339758e-05, + "loss": 0.9602, + "num_input_tokens_seen": 187255520, + "step": 11638 + }, + { + "epoch": 0.8152915320426944, + "grad_norm": 4.349885940551758, + "learning_rate": 1.8547383537653236e-05, + "loss": 1.0293, + "num_input_tokens_seen": 187271904, + "step": 11639 + }, + { + "epoch": 0.8153615802884236, + "grad_norm": 4.224632740020752, + "learning_rate": 1.854038528896673e-05, + "loss": 1.0961, + "num_input_tokens_seen": 187288000, + "step": 11640 + }, + { + "epoch": 0.8154316285341529, + "grad_norm": 3.580479621887207, + "learning_rate": 1.8533387040280207e-05, + "loss": 1.079, + "num_input_tokens_seen": 187304384, + "step": 11641 + }, + { + "epoch": 0.8155016767798822, + "grad_norm": 5.046919345855713, + "learning_rate": 1.8526388791593695e-05, + "loss": 1.1494, + "num_input_tokens_seen": 187318904, + "step": 11642 + }, + { + "epoch": 0.8155717250256114, + "grad_norm": 3.8281965255737305, + "learning_rate": 1.8519390542907184e-05, + "loss": 1.0504, + "num_input_tokens_seen": 187334864, + "step": 11643 + }, + { + "epoch": 0.8156417732713407, + "grad_norm": 4.526974678039551, + "learning_rate": 1.8512392294220662e-05, + "loss": 0.8494, + "num_input_tokens_seen": 187351248, + "step": 11644 + }, + { + "epoch": 0.8157118215170699, + "grad_norm": 3.8726227283477783, + "learning_rate": 1.8505394045534154e-05, + "loss": 1.1158, + "num_input_tokens_seen": 187367632, + "step": 11645 + }, + { + "epoch": 0.8157818697627991, + "grad_norm": 4.195963382720947, + "learning_rate": 1.8498395796847633e-05, + "loss": 1.0795, + "num_input_tokens_seen": 187384016, + "step": 11646 + }, + { + "epoch": 0.8158519180085284, + "grad_norm": 4.208303451538086, + "learning_rate": 1.849139754816112e-05, + "loss": 1.0054, + "num_input_tokens_seen": 187400400, + "step": 11647 + }, + { + "epoch": 0.8159219662542576, + "grad_norm": 3.7081055641174316, + "learning_rate": 1.84843992994746e-05, + "loss": 1.071, + "num_input_tokens_seen": 187416760, + "step": 11648 + }, + { + "epoch": 0.8159920144999868, + "grad_norm": 3.656008243560791, + "learning_rate": 1.847740105078809e-05, + "loss": 1.022, + "num_input_tokens_seen": 187433144, + "step": 11649 + }, + { + "epoch": 0.8160620627457161, + "grad_norm": 3.8131818771362305, + "learning_rate": 1.847040280210158e-05, + "loss": 1.0775, + "num_input_tokens_seen": 187449528, + "step": 11650 + }, + { + "epoch": 0.8161321109914453, + "grad_norm": 5.076834201812744, + "learning_rate": 1.846340455341506e-05, + "loss": 0.9972, + "num_input_tokens_seen": 187465224, + "step": 11651 + }, + { + "epoch": 0.8162021592371747, + "grad_norm": 3.4730820655822754, + "learning_rate": 1.8456406304728547e-05, + "loss": 0.952, + "num_input_tokens_seen": 187481608, + "step": 11652 + }, + { + "epoch": 0.8162722074829039, + "grad_norm": 3.8998332023620605, + "learning_rate": 1.8449408056042026e-05, + "loss": 1.0791, + "num_input_tokens_seen": 187497992, + "step": 11653 + }, + { + "epoch": 0.8163422557286331, + "grad_norm": 4.713872909545898, + "learning_rate": 1.8442409807355514e-05, + "loss": 1.0472, + "num_input_tokens_seen": 187514376, + "step": 11654 + }, + { + "epoch": 0.8164123039743624, + "grad_norm": 5.665464401245117, + "learning_rate": 1.8435411558669006e-05, + "loss": 0.9508, + "num_input_tokens_seen": 187530760, + "step": 11655 + }, + { + "epoch": 0.8164823522200916, + "grad_norm": 4.503643989562988, + "learning_rate": 1.8428413309982485e-05, + "loss": 0.8918, + "num_input_tokens_seen": 187547144, + "step": 11656 + }, + { + "epoch": 0.8165524004658208, + "grad_norm": 3.5383121967315674, + "learning_rate": 1.8421415061295973e-05, + "loss": 1.0584, + "num_input_tokens_seen": 187563528, + "step": 11657 + }, + { + "epoch": 0.8166224487115501, + "grad_norm": 6.368112564086914, + "learning_rate": 1.8414416812609452e-05, + "loss": 0.9904, + "num_input_tokens_seen": 187579912, + "step": 11658 + }, + { + "epoch": 0.8166924969572793, + "grad_norm": 4.906155586242676, + "learning_rate": 1.840741856392294e-05, + "loss": 1.1119, + "num_input_tokens_seen": 187596296, + "step": 11659 + }, + { + "epoch": 0.8167625452030086, + "grad_norm": 4.549793243408203, + "learning_rate": 1.8400420315236432e-05, + "loss": 1.2492, + "num_input_tokens_seen": 187612472, + "step": 11660 + }, + { + "epoch": 0.8168325934487378, + "grad_norm": 4.224708557128906, + "learning_rate": 1.839342206654991e-05, + "loss": 1.261, + "num_input_tokens_seen": 187628336, + "step": 11661 + }, + { + "epoch": 0.816902641694467, + "grad_norm": 3.97493052482605, + "learning_rate": 1.83864238178634e-05, + "loss": 1.2369, + "num_input_tokens_seen": 187644720, + "step": 11662 + }, + { + "epoch": 0.8169726899401963, + "grad_norm": 5.3355536460876465, + "learning_rate": 1.8379425569176878e-05, + "loss": 1.2391, + "num_input_tokens_seen": 187661104, + "step": 11663 + }, + { + "epoch": 0.8170427381859255, + "grad_norm": 3.889575958251953, + "learning_rate": 1.8372427320490366e-05, + "loss": 1.0826, + "num_input_tokens_seen": 187677488, + "step": 11664 + }, + { + "epoch": 0.8171127864316547, + "grad_norm": 4.244647979736328, + "learning_rate": 1.836542907180386e-05, + "loss": 0.9795, + "num_input_tokens_seen": 187693872, + "step": 11665 + }, + { + "epoch": 0.8171828346773841, + "grad_norm": 3.7573037147521973, + "learning_rate": 1.8358430823117337e-05, + "loss": 1.1379, + "num_input_tokens_seen": 187710032, + "step": 11666 + }, + { + "epoch": 0.8172528829231133, + "grad_norm": 4.578756809234619, + "learning_rate": 1.8351432574430825e-05, + "loss": 1.0457, + "num_input_tokens_seen": 187725424, + "step": 11667 + }, + { + "epoch": 0.8173229311688426, + "grad_norm": 3.6207306385040283, + "learning_rate": 1.8344434325744304e-05, + "loss": 0.927, + "num_input_tokens_seen": 187741808, + "step": 11668 + }, + { + "epoch": 0.8173929794145718, + "grad_norm": 3.761510133743286, + "learning_rate": 1.8337436077057793e-05, + "loss": 0.8181, + "num_input_tokens_seen": 187757232, + "step": 11669 + }, + { + "epoch": 0.817463027660301, + "grad_norm": 3.8988399505615234, + "learning_rate": 1.8330437828371284e-05, + "loss": 1.0068, + "num_input_tokens_seen": 187773496, + "step": 11670 + }, + { + "epoch": 0.8175330759060303, + "grad_norm": 4.214535713195801, + "learning_rate": 1.8323439579684763e-05, + "loss": 1.0239, + "num_input_tokens_seen": 187789840, + "step": 11671 + }, + { + "epoch": 0.8176031241517595, + "grad_norm": 4.5992207527160645, + "learning_rate": 1.831644133099825e-05, + "loss": 1.0092, + "num_input_tokens_seen": 187806224, + "step": 11672 + }, + { + "epoch": 0.8176731723974888, + "grad_norm": 3.296382427215576, + "learning_rate": 1.830944308231173e-05, + "loss": 0.9275, + "num_input_tokens_seen": 187822088, + "step": 11673 + }, + { + "epoch": 0.817743220643218, + "grad_norm": 3.646695852279663, + "learning_rate": 1.830244483362522e-05, + "loss": 1.1137, + "num_input_tokens_seen": 187838472, + "step": 11674 + }, + { + "epoch": 0.8178132688889472, + "grad_norm": 3.6973507404327393, + "learning_rate": 1.8295446584938697e-05, + "loss": 1.0335, + "num_input_tokens_seen": 187854856, + "step": 11675 + }, + { + "epoch": 0.8178833171346765, + "grad_norm": 3.730363368988037, + "learning_rate": 1.828844833625219e-05, + "loss": 1.1537, + "num_input_tokens_seen": 187871240, + "step": 11676 + }, + { + "epoch": 0.8179533653804058, + "grad_norm": 3.899176597595215, + "learning_rate": 1.8281450087565678e-05, + "loss": 0.9733, + "num_input_tokens_seen": 187887624, + "step": 11677 + }, + { + "epoch": 0.818023413626135, + "grad_norm": 4.780815601348877, + "learning_rate": 1.8274451838879156e-05, + "loss": 1.164, + "num_input_tokens_seen": 187903608, + "step": 11678 + }, + { + "epoch": 0.8180934618718643, + "grad_norm": 5.786088466644287, + "learning_rate": 1.8267453590192645e-05, + "loss": 1.016, + "num_input_tokens_seen": 187918304, + "step": 11679 + }, + { + "epoch": 0.8181635101175935, + "grad_norm": 3.888845920562744, + "learning_rate": 1.8260455341506123e-05, + "loss": 0.9738, + "num_input_tokens_seen": 187933240, + "step": 11680 + }, + { + "epoch": 0.8182335583633228, + "grad_norm": 3.774116277694702, + "learning_rate": 1.8253457092819615e-05, + "loss": 1.066, + "num_input_tokens_seen": 187949280, + "step": 11681 + }, + { + "epoch": 0.818303606609052, + "grad_norm": 9.087299346923828, + "learning_rate": 1.8246458844133104e-05, + "loss": 0.9664, + "num_input_tokens_seen": 187964224, + "step": 11682 + }, + { + "epoch": 0.8183736548547812, + "grad_norm": 3.947052001953125, + "learning_rate": 1.8239460595446582e-05, + "loss": 1.0936, + "num_input_tokens_seen": 187980048, + "step": 11683 + }, + { + "epoch": 0.8184437031005105, + "grad_norm": 4.287691593170166, + "learning_rate": 1.823246234676007e-05, + "loss": 1.128, + "num_input_tokens_seen": 187996216, + "step": 11684 + }, + { + "epoch": 0.8185137513462397, + "grad_norm": 5.179099082946777, + "learning_rate": 1.822546409807355e-05, + "loss": 1.0239, + "num_input_tokens_seen": 188012400, + "step": 11685 + }, + { + "epoch": 0.8185837995919689, + "grad_norm": 3.68485426902771, + "learning_rate": 1.821846584938704e-05, + "loss": 0.9719, + "num_input_tokens_seen": 188028784, + "step": 11686 + }, + { + "epoch": 0.8186538478376982, + "grad_norm": 4.3032002449035645, + "learning_rate": 1.821146760070053e-05, + "loss": 1.0609, + "num_input_tokens_seen": 188045168, + "step": 11687 + }, + { + "epoch": 0.8187238960834274, + "grad_norm": 3.598381757736206, + "learning_rate": 1.8204469352014008e-05, + "loss": 1.0124, + "num_input_tokens_seen": 188061552, + "step": 11688 + }, + { + "epoch": 0.8187939443291568, + "grad_norm": 3.4809765815734863, + "learning_rate": 1.8197471103327497e-05, + "loss": 0.9565, + "num_input_tokens_seen": 188077760, + "step": 11689 + }, + { + "epoch": 0.818863992574886, + "grad_norm": 4.5244059562683105, + "learning_rate": 1.8190472854640975e-05, + "loss": 1.0888, + "num_input_tokens_seen": 188093928, + "step": 11690 + }, + { + "epoch": 0.8189340408206152, + "grad_norm": 4.192951679229736, + "learning_rate": 1.8183474605954467e-05, + "loss": 1.1699, + "num_input_tokens_seen": 188109552, + "step": 11691 + }, + { + "epoch": 0.8190040890663445, + "grad_norm": 4.614601135253906, + "learning_rate": 1.8176476357267956e-05, + "loss": 1.0134, + "num_input_tokens_seen": 188125048, + "step": 11692 + }, + { + "epoch": 0.8190741373120737, + "grad_norm": 4.692627906799316, + "learning_rate": 1.8169478108581434e-05, + "loss": 1.0604, + "num_input_tokens_seen": 188141432, + "step": 11693 + }, + { + "epoch": 0.8191441855578029, + "grad_norm": 3.427537441253662, + "learning_rate": 1.8162479859894923e-05, + "loss": 1.0184, + "num_input_tokens_seen": 188157816, + "step": 11694 + }, + { + "epoch": 0.8192142338035322, + "grad_norm": 4.047211170196533, + "learning_rate": 1.81554816112084e-05, + "loss": 1.0303, + "num_input_tokens_seen": 188174200, + "step": 11695 + }, + { + "epoch": 0.8192842820492614, + "grad_norm": 4.440093994140625, + "learning_rate": 1.8148483362521893e-05, + "loss": 1.1323, + "num_input_tokens_seen": 188190584, + "step": 11696 + }, + { + "epoch": 0.8193543302949907, + "grad_norm": 3.8503477573394775, + "learning_rate": 1.814148511383538e-05, + "loss": 1.2847, + "num_input_tokens_seen": 188206792, + "step": 11697 + }, + { + "epoch": 0.8194243785407199, + "grad_norm": 3.8365819454193115, + "learning_rate": 1.813448686514886e-05, + "loss": 0.9724, + "num_input_tokens_seen": 188223176, + "step": 11698 + }, + { + "epoch": 0.8194944267864491, + "grad_norm": 3.6131432056427, + "learning_rate": 1.812748861646235e-05, + "loss": 1.0965, + "num_input_tokens_seen": 188239560, + "step": 11699 + }, + { + "epoch": 0.8195644750321784, + "grad_norm": 4.438934803009033, + "learning_rate": 1.8120490367775827e-05, + "loss": 1.0487, + "num_input_tokens_seen": 188255248, + "step": 11700 + }, + { + "epoch": 0.8196345232779076, + "grad_norm": 4.16170072555542, + "learning_rate": 1.811349211908932e-05, + "loss": 1.1821, + "num_input_tokens_seen": 188271632, + "step": 11701 + }, + { + "epoch": 0.819704571523637, + "grad_norm": 4.538398742675781, + "learning_rate": 1.8106493870402797e-05, + "loss": 0.9175, + "num_input_tokens_seen": 188287000, + "step": 11702 + }, + { + "epoch": 0.8197746197693662, + "grad_norm": 4.474297523498535, + "learning_rate": 1.8099495621716286e-05, + "loss": 1.1192, + "num_input_tokens_seen": 188302944, + "step": 11703 + }, + { + "epoch": 0.8198446680150954, + "grad_norm": 4.949738025665283, + "learning_rate": 1.8092497373029775e-05, + "loss": 0.9663, + "num_input_tokens_seen": 188319328, + "step": 11704 + }, + { + "epoch": 0.8199147162608247, + "grad_norm": 3.795344352722168, + "learning_rate": 1.8085499124343253e-05, + "loss": 0.9454, + "num_input_tokens_seen": 188335712, + "step": 11705 + }, + { + "epoch": 0.8199847645065539, + "grad_norm": 4.090040683746338, + "learning_rate": 1.8078500875656745e-05, + "loss": 0.8485, + "num_input_tokens_seen": 188351992, + "step": 11706 + }, + { + "epoch": 0.8200548127522831, + "grad_norm": 3.6649208068847656, + "learning_rate": 1.8071502626970223e-05, + "loss": 0.9743, + "num_input_tokens_seen": 188368376, + "step": 11707 + }, + { + "epoch": 0.8201248609980124, + "grad_norm": 3.797043561935425, + "learning_rate": 1.8064504378283712e-05, + "loss": 0.9721, + "num_input_tokens_seen": 188384168, + "step": 11708 + }, + { + "epoch": 0.8201949092437416, + "grad_norm": 4.092159748077393, + "learning_rate": 1.80575061295972e-05, + "loss": 1.2563, + "num_input_tokens_seen": 188399816, + "step": 11709 + }, + { + "epoch": 0.8202649574894709, + "grad_norm": 3.677422523498535, + "learning_rate": 1.805050788091068e-05, + "loss": 0.8733, + "num_input_tokens_seen": 188415888, + "step": 11710 + }, + { + "epoch": 0.8203350057352001, + "grad_norm": 3.961655616760254, + "learning_rate": 1.804350963222417e-05, + "loss": 1.0243, + "num_input_tokens_seen": 188432272, + "step": 11711 + }, + { + "epoch": 0.8204050539809293, + "grad_norm": 4.316862106323242, + "learning_rate": 1.803651138353765e-05, + "loss": 1.2255, + "num_input_tokens_seen": 188448400, + "step": 11712 + }, + { + "epoch": 0.8204751022266586, + "grad_norm": 3.4550070762634277, + "learning_rate": 1.8029513134851138e-05, + "loss": 0.9698, + "num_input_tokens_seen": 188464784, + "step": 11713 + }, + { + "epoch": 0.8205451504723879, + "grad_norm": 5.12759256362915, + "learning_rate": 1.8022514886164627e-05, + "loss": 1.2241, + "num_input_tokens_seen": 188481168, + "step": 11714 + }, + { + "epoch": 0.8206151987181171, + "grad_norm": 3.9451558589935303, + "learning_rate": 1.8015516637478105e-05, + "loss": 1.0516, + "num_input_tokens_seen": 188497552, + "step": 11715 + }, + { + "epoch": 0.8206852469638464, + "grad_norm": 3.7539796829223633, + "learning_rate": 1.8008518388791597e-05, + "loss": 0.9829, + "num_input_tokens_seen": 188513936, + "step": 11716 + }, + { + "epoch": 0.8207552952095756, + "grad_norm": 3.6421828269958496, + "learning_rate": 1.8001520140105075e-05, + "loss": 0.938, + "num_input_tokens_seen": 188530320, + "step": 11717 + }, + { + "epoch": 0.8208253434553049, + "grad_norm": 3.9875099658966064, + "learning_rate": 1.7994521891418564e-05, + "loss": 1.2077, + "num_input_tokens_seen": 188546704, + "step": 11718 + }, + { + "epoch": 0.8208953917010341, + "grad_norm": 3.8999719619750977, + "learning_rate": 1.7987523642732053e-05, + "loss": 0.9198, + "num_input_tokens_seen": 188563088, + "step": 11719 + }, + { + "epoch": 0.8209654399467633, + "grad_norm": 4.114893436431885, + "learning_rate": 1.798052539404553e-05, + "loss": 0.9734, + "num_input_tokens_seen": 188579160, + "step": 11720 + }, + { + "epoch": 0.8210354881924926, + "grad_norm": 3.896024227142334, + "learning_rate": 1.7973527145359023e-05, + "loss": 0.8833, + "num_input_tokens_seen": 188594704, + "step": 11721 + }, + { + "epoch": 0.8211055364382218, + "grad_norm": 4.675436973571777, + "learning_rate": 1.79665288966725e-05, + "loss": 1.2482, + "num_input_tokens_seen": 188610912, + "step": 11722 + }, + { + "epoch": 0.821175584683951, + "grad_norm": 3.808171033859253, + "learning_rate": 1.795953064798599e-05, + "loss": 0.9285, + "num_input_tokens_seen": 188627296, + "step": 11723 + }, + { + "epoch": 0.8212456329296803, + "grad_norm": 7.5378618240356445, + "learning_rate": 1.795253239929948e-05, + "loss": 1.0906, + "num_input_tokens_seen": 188643680, + "step": 11724 + }, + { + "epoch": 0.8213156811754095, + "grad_norm": 4.495467185974121, + "learning_rate": 1.7945534150612957e-05, + "loss": 1.1427, + "num_input_tokens_seen": 188660064, + "step": 11725 + }, + { + "epoch": 0.8213857294211389, + "grad_norm": 3.476060152053833, + "learning_rate": 1.793853590192645e-05, + "loss": 0.9287, + "num_input_tokens_seen": 188676448, + "step": 11726 + }, + { + "epoch": 0.8214557776668681, + "grad_norm": 5.000412464141846, + "learning_rate": 1.7931537653239927e-05, + "loss": 1.0403, + "num_input_tokens_seen": 188692392, + "step": 11727 + }, + { + "epoch": 0.8215258259125973, + "grad_norm": 6.9442667961120605, + "learning_rate": 1.7924539404553416e-05, + "loss": 0.9543, + "num_input_tokens_seen": 188707048, + "step": 11728 + }, + { + "epoch": 0.8215958741583266, + "grad_norm": 3.724740505218506, + "learning_rate": 1.7917541155866894e-05, + "loss": 1.1063, + "num_input_tokens_seen": 188723432, + "step": 11729 + }, + { + "epoch": 0.8216659224040558, + "grad_norm": 4.919325351715088, + "learning_rate": 1.7910542907180383e-05, + "loss": 1.0061, + "num_input_tokens_seen": 188739816, + "step": 11730 + }, + { + "epoch": 0.821735970649785, + "grad_norm": 3.4669697284698486, + "learning_rate": 1.7903544658493875e-05, + "loss": 1.0525, + "num_input_tokens_seen": 188756200, + "step": 11731 + }, + { + "epoch": 0.8218060188955143, + "grad_norm": 3.5254414081573486, + "learning_rate": 1.7896546409807353e-05, + "loss": 0.9823, + "num_input_tokens_seen": 188771640, + "step": 11732 + }, + { + "epoch": 0.8218760671412435, + "grad_norm": 3.657233476638794, + "learning_rate": 1.7889548161120842e-05, + "loss": 0.9228, + "num_input_tokens_seen": 188787720, + "step": 11733 + }, + { + "epoch": 0.8219461153869728, + "grad_norm": 3.4727237224578857, + "learning_rate": 1.788254991243432e-05, + "loss": 0.9665, + "num_input_tokens_seen": 188804104, + "step": 11734 + }, + { + "epoch": 0.822016163632702, + "grad_norm": 3.9914019107818604, + "learning_rate": 1.787555166374781e-05, + "loss": 1.0945, + "num_input_tokens_seen": 188819744, + "step": 11735 + }, + { + "epoch": 0.8220862118784312, + "grad_norm": 3.790971040725708, + "learning_rate": 1.78685534150613e-05, + "loss": 1.1075, + "num_input_tokens_seen": 188836128, + "step": 11736 + }, + { + "epoch": 0.8221562601241605, + "grad_norm": 4.110093593597412, + "learning_rate": 1.786155516637478e-05, + "loss": 0.8889, + "num_input_tokens_seen": 188851936, + "step": 11737 + }, + { + "epoch": 0.8222263083698897, + "grad_norm": 3.509882926940918, + "learning_rate": 1.7854556917688268e-05, + "loss": 1.0651, + "num_input_tokens_seen": 188868248, + "step": 11738 + }, + { + "epoch": 0.8222963566156191, + "grad_norm": 3.907679796218872, + "learning_rate": 1.7847558669001746e-05, + "loss": 1.0234, + "num_input_tokens_seen": 188884632, + "step": 11739 + }, + { + "epoch": 0.8223664048613483, + "grad_norm": 3.5460269451141357, + "learning_rate": 1.7840560420315235e-05, + "loss": 0.8945, + "num_input_tokens_seen": 188901016, + "step": 11740 + }, + { + "epoch": 0.8224364531070775, + "grad_norm": 3.404100179672241, + "learning_rate": 1.7833562171628727e-05, + "loss": 0.9763, + "num_input_tokens_seen": 188917400, + "step": 11741 + }, + { + "epoch": 0.8225065013528068, + "grad_norm": 4.25472354888916, + "learning_rate": 1.7826563922942205e-05, + "loss": 0.9349, + "num_input_tokens_seen": 188932896, + "step": 11742 + }, + { + "epoch": 0.822576549598536, + "grad_norm": 3.855407238006592, + "learning_rate": 1.7819565674255694e-05, + "loss": 0.9319, + "num_input_tokens_seen": 188949088, + "step": 11743 + }, + { + "epoch": 0.8226465978442652, + "grad_norm": 3.4372119903564453, + "learning_rate": 1.7812567425569173e-05, + "loss": 1.1262, + "num_input_tokens_seen": 188965472, + "step": 11744 + }, + { + "epoch": 0.8227166460899945, + "grad_norm": 4.730118751525879, + "learning_rate": 1.780556917688266e-05, + "loss": 1.0243, + "num_input_tokens_seen": 188980560, + "step": 11745 + }, + { + "epoch": 0.8227866943357237, + "grad_norm": 3.7299141883850098, + "learning_rate": 1.7798570928196153e-05, + "loss": 0.8506, + "num_input_tokens_seen": 188996944, + "step": 11746 + }, + { + "epoch": 0.822856742581453, + "grad_norm": 4.034008026123047, + "learning_rate": 1.779157267950963e-05, + "loss": 1.0192, + "num_input_tokens_seen": 189012904, + "step": 11747 + }, + { + "epoch": 0.8229267908271822, + "grad_norm": 3.4899730682373047, + "learning_rate": 1.778457443082312e-05, + "loss": 0.9993, + "num_input_tokens_seen": 189029288, + "step": 11748 + }, + { + "epoch": 0.8229968390729114, + "grad_norm": 5.346974849700928, + "learning_rate": 1.77775761821366e-05, + "loss": 1.1311, + "num_input_tokens_seen": 189045288, + "step": 11749 + }, + { + "epoch": 0.8230668873186407, + "grad_norm": 4.063854217529297, + "learning_rate": 1.7770577933450087e-05, + "loss": 1.2514, + "num_input_tokens_seen": 189060504, + "step": 11750 + }, + { + "epoch": 0.82313693556437, + "grad_norm": 3.9708306789398193, + "learning_rate": 1.776357968476358e-05, + "loss": 1.0316, + "num_input_tokens_seen": 189076088, + "step": 11751 + }, + { + "epoch": 0.8232069838100992, + "grad_norm": 4.934445381164551, + "learning_rate": 1.7756581436077058e-05, + "loss": 1.0554, + "num_input_tokens_seen": 189089552, + "step": 11752 + }, + { + "epoch": 0.8232770320558285, + "grad_norm": 4.177801609039307, + "learning_rate": 1.7749583187390546e-05, + "loss": 1.0397, + "num_input_tokens_seen": 189105368, + "step": 11753 + }, + { + "epoch": 0.8233470803015577, + "grad_norm": 3.9348433017730713, + "learning_rate": 1.7742584938704025e-05, + "loss": 1.0368, + "num_input_tokens_seen": 189120744, + "step": 11754 + }, + { + "epoch": 0.823417128547287, + "grad_norm": 3.964355707168579, + "learning_rate": 1.7735586690017513e-05, + "loss": 1.1091, + "num_input_tokens_seen": 189137128, + "step": 11755 + }, + { + "epoch": 0.8234871767930162, + "grad_norm": 5.3760600090026855, + "learning_rate": 1.772858844133099e-05, + "loss": 0.9241, + "num_input_tokens_seen": 189153512, + "step": 11756 + }, + { + "epoch": 0.8235572250387454, + "grad_norm": 4.6102776527404785, + "learning_rate": 1.7721590192644484e-05, + "loss": 0.9751, + "num_input_tokens_seen": 189169216, + "step": 11757 + }, + { + "epoch": 0.8236272732844747, + "grad_norm": 3.831645965576172, + "learning_rate": 1.7714591943957972e-05, + "loss": 0.9746, + "num_input_tokens_seen": 189185112, + "step": 11758 + }, + { + "epoch": 0.8236973215302039, + "grad_norm": 4.852298259735107, + "learning_rate": 1.770759369527145e-05, + "loss": 1.0264, + "num_input_tokens_seen": 189201328, + "step": 11759 + }, + { + "epoch": 0.8237673697759331, + "grad_norm": 3.928135395050049, + "learning_rate": 1.770059544658494e-05, + "loss": 0.9876, + "num_input_tokens_seen": 189217712, + "step": 11760 + }, + { + "epoch": 0.8238374180216624, + "grad_norm": 4.693716526031494, + "learning_rate": 1.7693597197898418e-05, + "loss": 1.0866, + "num_input_tokens_seen": 189234096, + "step": 11761 + }, + { + "epoch": 0.8239074662673916, + "grad_norm": 3.492230176925659, + "learning_rate": 1.768659894921191e-05, + "loss": 1.0498, + "num_input_tokens_seen": 189250480, + "step": 11762 + }, + { + "epoch": 0.823977514513121, + "grad_norm": 3.8025405406951904, + "learning_rate": 1.7679600700525398e-05, + "loss": 1.0502, + "num_input_tokens_seen": 189266368, + "step": 11763 + }, + { + "epoch": 0.8240475627588502, + "grad_norm": 3.9282734394073486, + "learning_rate": 1.7672602451838877e-05, + "loss": 1.0459, + "num_input_tokens_seen": 189282752, + "step": 11764 + }, + { + "epoch": 0.8241176110045794, + "grad_norm": 3.9255306720733643, + "learning_rate": 1.7665604203152365e-05, + "loss": 1.0313, + "num_input_tokens_seen": 189297880, + "step": 11765 + }, + { + "epoch": 0.8241876592503087, + "grad_norm": 3.966909885406494, + "learning_rate": 1.7658605954465844e-05, + "loss": 1.1931, + "num_input_tokens_seen": 189314264, + "step": 11766 + }, + { + "epoch": 0.8242577074960379, + "grad_norm": 3.4578847885131836, + "learning_rate": 1.7651607705779336e-05, + "loss": 0.932, + "num_input_tokens_seen": 189330648, + "step": 11767 + }, + { + "epoch": 0.8243277557417671, + "grad_norm": 3.394818067550659, + "learning_rate": 1.7644609457092824e-05, + "loss": 1.1306, + "num_input_tokens_seen": 189347032, + "step": 11768 + }, + { + "epoch": 0.8243978039874964, + "grad_norm": 3.581359624862671, + "learning_rate": 1.7637611208406303e-05, + "loss": 0.9943, + "num_input_tokens_seen": 189363416, + "step": 11769 + }, + { + "epoch": 0.8244678522332256, + "grad_norm": 3.766108989715576, + "learning_rate": 1.763061295971979e-05, + "loss": 0.9197, + "num_input_tokens_seen": 189379696, + "step": 11770 + }, + { + "epoch": 0.8245379004789549, + "grad_norm": 4.728603839874268, + "learning_rate": 1.762361471103327e-05, + "loss": 1.1586, + "num_input_tokens_seen": 189395856, + "step": 11771 + }, + { + "epoch": 0.8246079487246841, + "grad_norm": 6.618093967437744, + "learning_rate": 1.761661646234676e-05, + "loss": 0.8328, + "num_input_tokens_seen": 189412240, + "step": 11772 + }, + { + "epoch": 0.8246779969704133, + "grad_norm": 3.570030927658081, + "learning_rate": 1.760961821366025e-05, + "loss": 0.8832, + "num_input_tokens_seen": 189428536, + "step": 11773 + }, + { + "epoch": 0.8247480452161426, + "grad_norm": 4.6491475105285645, + "learning_rate": 1.760261996497373e-05, + "loss": 1.1903, + "num_input_tokens_seen": 189444048, + "step": 11774 + }, + { + "epoch": 0.8248180934618718, + "grad_norm": 3.6373143196105957, + "learning_rate": 1.7595621716287217e-05, + "loss": 1.0211, + "num_input_tokens_seen": 189460432, + "step": 11775 + }, + { + "epoch": 0.8248881417076012, + "grad_norm": 3.637921094894409, + "learning_rate": 1.7588623467600696e-05, + "loss": 1.1417, + "num_input_tokens_seen": 189475984, + "step": 11776 + }, + { + "epoch": 0.8249581899533304, + "grad_norm": 4.361634731292725, + "learning_rate": 1.7581625218914188e-05, + "loss": 0.9, + "num_input_tokens_seen": 189492368, + "step": 11777 + }, + { + "epoch": 0.8250282381990596, + "grad_norm": 3.674819231033325, + "learning_rate": 1.7574626970227676e-05, + "loss": 1.0065, + "num_input_tokens_seen": 189508752, + "step": 11778 + }, + { + "epoch": 0.8250982864447889, + "grad_norm": 4.118724346160889, + "learning_rate": 1.7567628721541155e-05, + "loss": 1.084, + "num_input_tokens_seen": 189524872, + "step": 11779 + }, + { + "epoch": 0.8251683346905181, + "grad_norm": 4.321122169494629, + "learning_rate": 1.7560630472854643e-05, + "loss": 1.09, + "num_input_tokens_seen": 189541256, + "step": 11780 + }, + { + "epoch": 0.8252383829362473, + "grad_norm": 4.440560340881348, + "learning_rate": 1.755363222416812e-05, + "loss": 0.9742, + "num_input_tokens_seen": 189557640, + "step": 11781 + }, + { + "epoch": 0.8253084311819766, + "grad_norm": 3.373345375061035, + "learning_rate": 1.7546633975481614e-05, + "loss": 0.992, + "num_input_tokens_seen": 189573880, + "step": 11782 + }, + { + "epoch": 0.8253784794277058, + "grad_norm": 4.856693744659424, + "learning_rate": 1.7539635726795092e-05, + "loss": 1.1316, + "num_input_tokens_seen": 189590264, + "step": 11783 + }, + { + "epoch": 0.8254485276734351, + "grad_norm": 3.9957010746002197, + "learning_rate": 1.753263747810858e-05, + "loss": 1.0903, + "num_input_tokens_seen": 189606648, + "step": 11784 + }, + { + "epoch": 0.8255185759191643, + "grad_norm": 4.480403900146484, + "learning_rate": 1.752563922942207e-05, + "loss": 1.1052, + "num_input_tokens_seen": 189622584, + "step": 11785 + }, + { + "epoch": 0.8255886241648935, + "grad_norm": 3.967893362045288, + "learning_rate": 1.7518640980735548e-05, + "loss": 1.062, + "num_input_tokens_seen": 189638504, + "step": 11786 + }, + { + "epoch": 0.8256586724106229, + "grad_norm": 6.035984992980957, + "learning_rate": 1.751164273204904e-05, + "loss": 1.0311, + "num_input_tokens_seen": 189654888, + "step": 11787 + }, + { + "epoch": 0.825728720656352, + "grad_norm": 4.154508113861084, + "learning_rate": 1.7504644483362518e-05, + "loss": 1.1558, + "num_input_tokens_seen": 189670360, + "step": 11788 + }, + { + "epoch": 0.8257987689020813, + "grad_norm": 4.111290454864502, + "learning_rate": 1.7497646234676007e-05, + "loss": 1.142, + "num_input_tokens_seen": 189685672, + "step": 11789 + }, + { + "epoch": 0.8258688171478106, + "grad_norm": 3.739572048187256, + "learning_rate": 1.7490647985989495e-05, + "loss": 0.9276, + "num_input_tokens_seen": 189701160, + "step": 11790 + }, + { + "epoch": 0.8259388653935398, + "grad_norm": 3.7378427982330322, + "learning_rate": 1.7483649737302974e-05, + "loss": 0.965, + "num_input_tokens_seen": 189717544, + "step": 11791 + }, + { + "epoch": 0.8260089136392691, + "grad_norm": 4.162009239196777, + "learning_rate": 1.7476651488616462e-05, + "loss": 1.1047, + "num_input_tokens_seen": 189733672, + "step": 11792 + }, + { + "epoch": 0.8260789618849983, + "grad_norm": 3.6458466053009033, + "learning_rate": 1.7469653239929944e-05, + "loss": 0.9218, + "num_input_tokens_seen": 189749672, + "step": 11793 + }, + { + "epoch": 0.8261490101307275, + "grad_norm": 3.7767248153686523, + "learning_rate": 1.7462654991243433e-05, + "loss": 0.848, + "num_input_tokens_seen": 189765816, + "step": 11794 + }, + { + "epoch": 0.8262190583764568, + "grad_norm": 4.214595794677734, + "learning_rate": 1.745565674255692e-05, + "loss": 1.0948, + "num_input_tokens_seen": 189781032, + "step": 11795 + }, + { + "epoch": 0.826289106622186, + "grad_norm": 3.745243549346924, + "learning_rate": 1.74486584938704e-05, + "loss": 1.0294, + "num_input_tokens_seen": 189797416, + "step": 11796 + }, + { + "epoch": 0.8263591548679152, + "grad_norm": 4.123645305633545, + "learning_rate": 1.7441660245183888e-05, + "loss": 1.126, + "num_input_tokens_seen": 189812464, + "step": 11797 + }, + { + "epoch": 0.8264292031136445, + "grad_norm": 4.5345072746276855, + "learning_rate": 1.743466199649737e-05, + "loss": 1.0128, + "num_input_tokens_seen": 189828848, + "step": 11798 + }, + { + "epoch": 0.8264992513593737, + "grad_norm": 5.456040382385254, + "learning_rate": 1.742766374781086e-05, + "loss": 0.9579, + "num_input_tokens_seen": 189844440, + "step": 11799 + }, + { + "epoch": 0.8265692996051031, + "grad_norm": 4.341781139373779, + "learning_rate": 1.7420665499124347e-05, + "loss": 1.1847, + "num_input_tokens_seen": 189860824, + "step": 11800 + }, + { + "epoch": 0.8265692996051031, + "eval_loss": 1.1157857179641724, + "eval_runtime": 0.1891, + "eval_samples_per_second": 5.288, + "eval_steps_per_second": 5.288, + "num_input_tokens_seen": 189860824, + "step": 11800 + }, + { + "epoch": 0.8266393478508323, + "grad_norm": 4.163001537322998, + "learning_rate": 1.7413667250437826e-05, + "loss": 0.9421, + "num_input_tokens_seen": 189876144, + "step": 11801 + }, + { + "epoch": 0.8267093960965615, + "grad_norm": 4.134149551391602, + "learning_rate": 1.7406669001751314e-05, + "loss": 1.128, + "num_input_tokens_seen": 189892528, + "step": 11802 + }, + { + "epoch": 0.8267794443422908, + "grad_norm": 4.333460330963135, + "learning_rate": 1.7399670753064796e-05, + "loss": 1.0665, + "num_input_tokens_seen": 189908912, + "step": 11803 + }, + { + "epoch": 0.82684949258802, + "grad_norm": 3.534644842147827, + "learning_rate": 1.7392672504378285e-05, + "loss": 0.9782, + "num_input_tokens_seen": 189925296, + "step": 11804 + }, + { + "epoch": 0.8269195408337493, + "grad_norm": 4.410394668579102, + "learning_rate": 1.7385674255691773e-05, + "loss": 0.997, + "num_input_tokens_seen": 189941016, + "step": 11805 + }, + { + "epoch": 0.8269895890794785, + "grad_norm": 4.069926738739014, + "learning_rate": 1.737867600700525e-05, + "loss": 1.2103, + "num_input_tokens_seen": 189957400, + "step": 11806 + }, + { + "epoch": 0.8270596373252077, + "grad_norm": 5.1703691482543945, + "learning_rate": 1.737167775831874e-05, + "loss": 1.1266, + "num_input_tokens_seen": 189973784, + "step": 11807 + }, + { + "epoch": 0.827129685570937, + "grad_norm": 4.0717267990112305, + "learning_rate": 1.7364679509632222e-05, + "loss": 0.8164, + "num_input_tokens_seen": 189989376, + "step": 11808 + }, + { + "epoch": 0.8271997338166662, + "grad_norm": 3.415815830230713, + "learning_rate": 1.735768126094571e-05, + "loss": 1.001, + "num_input_tokens_seen": 190005312, + "step": 11809 + }, + { + "epoch": 0.8272697820623954, + "grad_norm": 3.861854314804077, + "learning_rate": 1.735068301225919e-05, + "loss": 1.0616, + "num_input_tokens_seen": 190021696, + "step": 11810 + }, + { + "epoch": 0.8273398303081247, + "grad_norm": 3.945122241973877, + "learning_rate": 1.7343684763572678e-05, + "loss": 1.1037, + "num_input_tokens_seen": 190037024, + "step": 11811 + }, + { + "epoch": 0.827409878553854, + "grad_norm": 3.894895315170288, + "learning_rate": 1.7336686514886166e-05, + "loss": 1.1286, + "num_input_tokens_seen": 190053408, + "step": 11812 + }, + { + "epoch": 0.8274799267995833, + "grad_norm": 5.957433223724365, + "learning_rate": 1.7329688266199645e-05, + "loss": 1.138, + "num_input_tokens_seen": 190069792, + "step": 11813 + }, + { + "epoch": 0.8275499750453125, + "grad_norm": 3.622366189956665, + "learning_rate": 1.7322690017513137e-05, + "loss": 0.8744, + "num_input_tokens_seen": 190085216, + "step": 11814 + }, + { + "epoch": 0.8276200232910417, + "grad_norm": 4.047665596008301, + "learning_rate": 1.7315691768826615e-05, + "loss": 1.0097, + "num_input_tokens_seen": 190100744, + "step": 11815 + }, + { + "epoch": 0.827690071536771, + "grad_norm": 3.881692409515381, + "learning_rate": 1.7308693520140104e-05, + "loss": 1.1932, + "num_input_tokens_seen": 190117128, + "step": 11816 + }, + { + "epoch": 0.8277601197825002, + "grad_norm": 4.211220741271973, + "learning_rate": 1.7301695271453592e-05, + "loss": 1.0426, + "num_input_tokens_seen": 190132928, + "step": 11817 + }, + { + "epoch": 0.8278301680282294, + "grad_norm": 5.26561975479126, + "learning_rate": 1.729469702276707e-05, + "loss": 0.9566, + "num_input_tokens_seen": 190149312, + "step": 11818 + }, + { + "epoch": 0.8279002162739587, + "grad_norm": 4.029213905334473, + "learning_rate": 1.7287698774080563e-05, + "loss": 1.1134, + "num_input_tokens_seen": 190165696, + "step": 11819 + }, + { + "epoch": 0.8279702645196879, + "grad_norm": 3.9490396976470947, + "learning_rate": 1.728070052539404e-05, + "loss": 1.118, + "num_input_tokens_seen": 190182080, + "step": 11820 + }, + { + "epoch": 0.8280403127654172, + "grad_norm": 3.631500244140625, + "learning_rate": 1.727370227670753e-05, + "loss": 0.9387, + "num_input_tokens_seen": 190198464, + "step": 11821 + }, + { + "epoch": 0.8281103610111464, + "grad_norm": 3.789973735809326, + "learning_rate": 1.7266704028021018e-05, + "loss": 1.1848, + "num_input_tokens_seen": 190214408, + "step": 11822 + }, + { + "epoch": 0.8281804092568756, + "grad_norm": 4.24061393737793, + "learning_rate": 1.7259705779334497e-05, + "loss": 1.0826, + "num_input_tokens_seen": 190230792, + "step": 11823 + }, + { + "epoch": 0.828250457502605, + "grad_norm": 5.237750053405762, + "learning_rate": 1.725270753064799e-05, + "loss": 1.0896, + "num_input_tokens_seen": 190247176, + "step": 11824 + }, + { + "epoch": 0.8283205057483342, + "grad_norm": 3.7056257724761963, + "learning_rate": 1.7245709281961467e-05, + "loss": 1.0724, + "num_input_tokens_seen": 190262992, + "step": 11825 + }, + { + "epoch": 0.8283905539940634, + "grad_norm": 5.951148986816406, + "learning_rate": 1.7238711033274956e-05, + "loss": 1.0172, + "num_input_tokens_seen": 190279376, + "step": 11826 + }, + { + "epoch": 0.8284606022397927, + "grad_norm": 3.7844555377960205, + "learning_rate": 1.7231712784588444e-05, + "loss": 1.1782, + "num_input_tokens_seen": 190295760, + "step": 11827 + }, + { + "epoch": 0.8285306504855219, + "grad_norm": 5.806246757507324, + "learning_rate": 1.7224714535901923e-05, + "loss": 1.1323, + "num_input_tokens_seen": 190312144, + "step": 11828 + }, + { + "epoch": 0.8286006987312512, + "grad_norm": 4.325758457183838, + "learning_rate": 1.7217716287215415e-05, + "loss": 0.8239, + "num_input_tokens_seen": 190328488, + "step": 11829 + }, + { + "epoch": 0.8286707469769804, + "grad_norm": 4.933779716491699, + "learning_rate": 1.7210718038528893e-05, + "loss": 1.1221, + "num_input_tokens_seen": 190344592, + "step": 11830 + }, + { + "epoch": 0.8287407952227096, + "grad_norm": 3.6514499187469482, + "learning_rate": 1.7203719789842382e-05, + "loss": 0.9988, + "num_input_tokens_seen": 190360848, + "step": 11831 + }, + { + "epoch": 0.8288108434684389, + "grad_norm": 3.534811019897461, + "learning_rate": 1.719672154115587e-05, + "loss": 0.9542, + "num_input_tokens_seen": 190377232, + "step": 11832 + }, + { + "epoch": 0.8288808917141681, + "grad_norm": 4.5435662269592285, + "learning_rate": 1.718972329246935e-05, + "loss": 0.9228, + "num_input_tokens_seen": 190392608, + "step": 11833 + }, + { + "epoch": 0.8289509399598973, + "grad_norm": 4.517431735992432, + "learning_rate": 1.718272504378284e-05, + "loss": 1.0626, + "num_input_tokens_seen": 190408992, + "step": 11834 + }, + { + "epoch": 0.8290209882056266, + "grad_norm": 4.556105136871338, + "learning_rate": 1.717572679509632e-05, + "loss": 0.9313, + "num_input_tokens_seen": 190425376, + "step": 11835 + }, + { + "epoch": 0.8290910364513558, + "grad_norm": 3.7025258541107178, + "learning_rate": 1.7168728546409808e-05, + "loss": 1.0339, + "num_input_tokens_seen": 190439712, + "step": 11836 + }, + { + "epoch": 0.8291610846970852, + "grad_norm": 4.716762065887451, + "learning_rate": 1.7161730297723286e-05, + "loss": 0.9715, + "num_input_tokens_seen": 190454864, + "step": 11837 + }, + { + "epoch": 0.8292311329428144, + "grad_norm": 4.8925395011901855, + "learning_rate": 1.7154732049036775e-05, + "loss": 1.0983, + "num_input_tokens_seen": 190470936, + "step": 11838 + }, + { + "epoch": 0.8293011811885436, + "grad_norm": 3.9075071811676025, + "learning_rate": 1.7147733800350267e-05, + "loss": 1.0738, + "num_input_tokens_seen": 190486912, + "step": 11839 + }, + { + "epoch": 0.8293712294342729, + "grad_norm": 3.6200056076049805, + "learning_rate": 1.7140735551663745e-05, + "loss": 0.9534, + "num_input_tokens_seen": 190502712, + "step": 11840 + }, + { + "epoch": 0.8294412776800021, + "grad_norm": 4.101872444152832, + "learning_rate": 1.7133737302977234e-05, + "loss": 0.9276, + "num_input_tokens_seen": 190518272, + "step": 11841 + }, + { + "epoch": 0.8295113259257314, + "grad_norm": 4.07480525970459, + "learning_rate": 1.7126739054290712e-05, + "loss": 1.1312, + "num_input_tokens_seen": 190534096, + "step": 11842 + }, + { + "epoch": 0.8295813741714606, + "grad_norm": 3.9376211166381836, + "learning_rate": 1.71197408056042e-05, + "loss": 1.1576, + "num_input_tokens_seen": 190550480, + "step": 11843 + }, + { + "epoch": 0.8296514224171898, + "grad_norm": 4.380791664123535, + "learning_rate": 1.7112742556917693e-05, + "loss": 0.99, + "num_input_tokens_seen": 190566864, + "step": 11844 + }, + { + "epoch": 0.8297214706629191, + "grad_norm": 4.331662178039551, + "learning_rate": 1.710574430823117e-05, + "loss": 1.108, + "num_input_tokens_seen": 190582968, + "step": 11845 + }, + { + "epoch": 0.8297915189086483, + "grad_norm": 4.201968193054199, + "learning_rate": 1.709874605954466e-05, + "loss": 1.2065, + "num_input_tokens_seen": 190597944, + "step": 11846 + }, + { + "epoch": 0.8298615671543775, + "grad_norm": 4.108932971954346, + "learning_rate": 1.7091747810858138e-05, + "loss": 1.1471, + "num_input_tokens_seen": 190614152, + "step": 11847 + }, + { + "epoch": 0.8299316154001068, + "grad_norm": 3.5581209659576416, + "learning_rate": 1.7084749562171627e-05, + "loss": 0.9977, + "num_input_tokens_seen": 190630536, + "step": 11848 + }, + { + "epoch": 0.830001663645836, + "grad_norm": 5.358382701873779, + "learning_rate": 1.707775131348512e-05, + "loss": 0.9266, + "num_input_tokens_seen": 190646920, + "step": 11849 + }, + { + "epoch": 0.8300717118915654, + "grad_norm": 4.135157585144043, + "learning_rate": 1.7070753064798597e-05, + "loss": 1.0384, + "num_input_tokens_seen": 190663304, + "step": 11850 + }, + { + "epoch": 0.8301417601372946, + "grad_norm": 4.339188098907471, + "learning_rate": 1.7063754816112086e-05, + "loss": 1.0533, + "num_input_tokens_seen": 190679688, + "step": 11851 + }, + { + "epoch": 0.8302118083830238, + "grad_norm": 3.79980731010437, + "learning_rate": 1.7056756567425564e-05, + "loss": 1.04, + "num_input_tokens_seen": 190695712, + "step": 11852 + }, + { + "epoch": 0.8302818566287531, + "grad_norm": 3.7791082859039307, + "learning_rate": 1.7049758318739053e-05, + "loss": 0.9212, + "num_input_tokens_seen": 190711536, + "step": 11853 + }, + { + "epoch": 0.8303519048744823, + "grad_norm": 3.7648520469665527, + "learning_rate": 1.7042760070052545e-05, + "loss": 1.1054, + "num_input_tokens_seen": 190727640, + "step": 11854 + }, + { + "epoch": 0.8304219531202115, + "grad_norm": 4.625076770782471, + "learning_rate": 1.7035761821366023e-05, + "loss": 1.0993, + "num_input_tokens_seen": 190744024, + "step": 11855 + }, + { + "epoch": 0.8304920013659408, + "grad_norm": 3.398932695388794, + "learning_rate": 1.7028763572679512e-05, + "loss": 0.9067, + "num_input_tokens_seen": 190759832, + "step": 11856 + }, + { + "epoch": 0.83056204961167, + "grad_norm": 3.7646045684814453, + "learning_rate": 1.702176532399299e-05, + "loss": 1.0928, + "num_input_tokens_seen": 190776216, + "step": 11857 + }, + { + "epoch": 0.8306320978573993, + "grad_norm": 3.655564785003662, + "learning_rate": 1.701476707530648e-05, + "loss": 1.1267, + "num_input_tokens_seen": 190792600, + "step": 11858 + }, + { + "epoch": 0.8307021461031285, + "grad_norm": 4.77268123626709, + "learning_rate": 1.700776882661997e-05, + "loss": 1.0325, + "num_input_tokens_seen": 190808280, + "step": 11859 + }, + { + "epoch": 0.8307721943488577, + "grad_norm": 4.021299839019775, + "learning_rate": 1.700077057793345e-05, + "loss": 1.0728, + "num_input_tokens_seen": 190824664, + "step": 11860 + }, + { + "epoch": 0.830842242594587, + "grad_norm": 3.994454860687256, + "learning_rate": 1.6993772329246938e-05, + "loss": 1.1064, + "num_input_tokens_seen": 190840632, + "step": 11861 + }, + { + "epoch": 0.8309122908403163, + "grad_norm": 4.713461399078369, + "learning_rate": 1.6986774080560416e-05, + "loss": 1.0687, + "num_input_tokens_seen": 190857016, + "step": 11862 + }, + { + "epoch": 0.8309823390860455, + "grad_norm": 4.2996296882629395, + "learning_rate": 1.6979775831873905e-05, + "loss": 1.1259, + "num_input_tokens_seen": 190873312, + "step": 11863 + }, + { + "epoch": 0.8310523873317748, + "grad_norm": 4.389613151550293, + "learning_rate": 1.6972777583187383e-05, + "loss": 1.1359, + "num_input_tokens_seen": 190888800, + "step": 11864 + }, + { + "epoch": 0.831122435577504, + "grad_norm": 3.573570728302002, + "learning_rate": 1.6965779334500875e-05, + "loss": 1.0881, + "num_input_tokens_seen": 190905184, + "step": 11865 + }, + { + "epoch": 0.8311924838232333, + "grad_norm": 4.785638809204102, + "learning_rate": 1.6958781085814364e-05, + "loss": 1.0838, + "num_input_tokens_seen": 190921184, + "step": 11866 + }, + { + "epoch": 0.8312625320689625, + "grad_norm": 4.226380825042725, + "learning_rate": 1.6951782837127842e-05, + "loss": 1.0933, + "num_input_tokens_seen": 190937568, + "step": 11867 + }, + { + "epoch": 0.8313325803146917, + "grad_norm": 3.8026623725891113, + "learning_rate": 1.694478458844133e-05, + "loss": 1.2868, + "num_input_tokens_seen": 190953952, + "step": 11868 + }, + { + "epoch": 0.831402628560421, + "grad_norm": 4.257472038269043, + "learning_rate": 1.693778633975481e-05, + "loss": 1.1094, + "num_input_tokens_seen": 190970336, + "step": 11869 + }, + { + "epoch": 0.8314726768061502, + "grad_norm": 3.8169426918029785, + "learning_rate": 1.69307880910683e-05, + "loss": 1.1584, + "num_input_tokens_seen": 190986720, + "step": 11870 + }, + { + "epoch": 0.8315427250518794, + "grad_norm": 3.6224069595336914, + "learning_rate": 1.692378984238179e-05, + "loss": 1.0221, + "num_input_tokens_seen": 191003104, + "step": 11871 + }, + { + "epoch": 0.8316127732976087, + "grad_norm": 4.295750141143799, + "learning_rate": 1.6916791593695268e-05, + "loss": 0.9722, + "num_input_tokens_seen": 191019488, + "step": 11872 + }, + { + "epoch": 0.831682821543338, + "grad_norm": 3.3229777812957764, + "learning_rate": 1.6909793345008757e-05, + "loss": 0.9197, + "num_input_tokens_seen": 191035808, + "step": 11873 + }, + { + "epoch": 0.8317528697890673, + "grad_norm": 3.4732608795166016, + "learning_rate": 1.6902795096322235e-05, + "loss": 0.9892, + "num_input_tokens_seen": 191052192, + "step": 11874 + }, + { + "epoch": 0.8318229180347965, + "grad_norm": 3.8648483753204346, + "learning_rate": 1.6895796847635727e-05, + "loss": 1.0852, + "num_input_tokens_seen": 191068576, + "step": 11875 + }, + { + "epoch": 0.8318929662805257, + "grad_norm": 4.464422225952148, + "learning_rate": 1.6888798598949216e-05, + "loss": 1.0994, + "num_input_tokens_seen": 191084640, + "step": 11876 + }, + { + "epoch": 0.831963014526255, + "grad_norm": 4.2510833740234375, + "learning_rate": 1.6881800350262694e-05, + "loss": 1.1349, + "num_input_tokens_seen": 191100208, + "step": 11877 + }, + { + "epoch": 0.8320330627719842, + "grad_norm": 4.267856121063232, + "learning_rate": 1.6874802101576183e-05, + "loss": 1.0792, + "num_input_tokens_seen": 191116592, + "step": 11878 + }, + { + "epoch": 0.8321031110177135, + "grad_norm": 4.798216819763184, + "learning_rate": 1.686780385288966e-05, + "loss": 1.0767, + "num_input_tokens_seen": 191131576, + "step": 11879 + }, + { + "epoch": 0.8321731592634427, + "grad_norm": 4.711989402770996, + "learning_rate": 1.6860805604203153e-05, + "loss": 1.1802, + "num_input_tokens_seen": 191147960, + "step": 11880 + }, + { + "epoch": 0.8322432075091719, + "grad_norm": 4.011876106262207, + "learning_rate": 1.6853807355516642e-05, + "loss": 0.9491, + "num_input_tokens_seen": 191163256, + "step": 11881 + }, + { + "epoch": 0.8323132557549012, + "grad_norm": 3.5829570293426514, + "learning_rate": 1.684680910683012e-05, + "loss": 0.9466, + "num_input_tokens_seen": 191178776, + "step": 11882 + }, + { + "epoch": 0.8323833040006304, + "grad_norm": 3.6252686977386475, + "learning_rate": 1.683981085814361e-05, + "loss": 0.9244, + "num_input_tokens_seen": 191194944, + "step": 11883 + }, + { + "epoch": 0.8324533522463596, + "grad_norm": 4.171043872833252, + "learning_rate": 1.6832812609457087e-05, + "loss": 0.9782, + "num_input_tokens_seen": 191211072, + "step": 11884 + }, + { + "epoch": 0.832523400492089, + "grad_norm": 5.789127349853516, + "learning_rate": 1.682581436077058e-05, + "loss": 1.1373, + "num_input_tokens_seen": 191227456, + "step": 11885 + }, + { + "epoch": 0.8325934487378182, + "grad_norm": 4.0675883293151855, + "learning_rate": 1.6818816112084068e-05, + "loss": 0.9094, + "num_input_tokens_seen": 191243840, + "step": 11886 + }, + { + "epoch": 0.8326634969835475, + "grad_norm": 3.966773271560669, + "learning_rate": 1.6811817863397546e-05, + "loss": 1.0299, + "num_input_tokens_seen": 191260224, + "step": 11887 + }, + { + "epoch": 0.8327335452292767, + "grad_norm": 4.898385047912598, + "learning_rate": 1.6804819614711035e-05, + "loss": 1.1629, + "num_input_tokens_seen": 191276312, + "step": 11888 + }, + { + "epoch": 0.8328035934750059, + "grad_norm": 6.780879974365234, + "learning_rate": 1.6797821366024513e-05, + "loss": 0.9329, + "num_input_tokens_seen": 191292696, + "step": 11889 + }, + { + "epoch": 0.8328736417207352, + "grad_norm": 3.5352230072021484, + "learning_rate": 1.6790823117338005e-05, + "loss": 1.0389, + "num_input_tokens_seen": 191308528, + "step": 11890 + }, + { + "epoch": 0.8329436899664644, + "grad_norm": 4.030395984649658, + "learning_rate": 1.6783824868651484e-05, + "loss": 1.1713, + "num_input_tokens_seen": 191324912, + "step": 11891 + }, + { + "epoch": 0.8330137382121936, + "grad_norm": 3.980278968811035, + "learning_rate": 1.6776826619964972e-05, + "loss": 1.0571, + "num_input_tokens_seen": 191341296, + "step": 11892 + }, + { + "epoch": 0.8330837864579229, + "grad_norm": 4.058806419372559, + "learning_rate": 1.676982837127846e-05, + "loss": 1.2319, + "num_input_tokens_seen": 191357680, + "step": 11893 + }, + { + "epoch": 0.8331538347036521, + "grad_norm": 4.196229457855225, + "learning_rate": 1.676283012259194e-05, + "loss": 1.067, + "num_input_tokens_seen": 191374064, + "step": 11894 + }, + { + "epoch": 0.8332238829493814, + "grad_norm": 4.1781392097473145, + "learning_rate": 1.675583187390543e-05, + "loss": 1.1441, + "num_input_tokens_seen": 191390448, + "step": 11895 + }, + { + "epoch": 0.8332939311951106, + "grad_norm": 3.493803024291992, + "learning_rate": 1.674883362521891e-05, + "loss": 1.0041, + "num_input_tokens_seen": 191406832, + "step": 11896 + }, + { + "epoch": 0.8333639794408398, + "grad_norm": 3.4466605186462402, + "learning_rate": 1.6741835376532398e-05, + "loss": 0.9509, + "num_input_tokens_seen": 191423200, + "step": 11897 + }, + { + "epoch": 0.8334340276865692, + "grad_norm": 3.561582326889038, + "learning_rate": 1.6734837127845887e-05, + "loss": 0.9644, + "num_input_tokens_seen": 191439504, + "step": 11898 + }, + { + "epoch": 0.8335040759322984, + "grad_norm": 3.5769901275634766, + "learning_rate": 1.6727838879159365e-05, + "loss": 1.0968, + "num_input_tokens_seen": 191455888, + "step": 11899 + }, + { + "epoch": 0.8335741241780276, + "grad_norm": 3.334141254425049, + "learning_rate": 1.6720840630472857e-05, + "loss": 1.0564, + "num_input_tokens_seen": 191472272, + "step": 11900 + }, + { + "epoch": 0.8336441724237569, + "grad_norm": 3.8814949989318848, + "learning_rate": 1.6713842381786336e-05, + "loss": 1.1509, + "num_input_tokens_seen": 191488656, + "step": 11901 + }, + { + "epoch": 0.8337142206694861, + "grad_norm": 5.075226306915283, + "learning_rate": 1.6706844133099824e-05, + "loss": 0.9553, + "num_input_tokens_seen": 191505016, + "step": 11902 + }, + { + "epoch": 0.8337842689152154, + "grad_norm": 4.490326404571533, + "learning_rate": 1.6699845884413313e-05, + "loss": 1.2465, + "num_input_tokens_seen": 191520784, + "step": 11903 + }, + { + "epoch": 0.8338543171609446, + "grad_norm": 4.249811172485352, + "learning_rate": 1.669284763572679e-05, + "loss": 1.0229, + "num_input_tokens_seen": 191537168, + "step": 11904 + }, + { + "epoch": 0.8339243654066738, + "grad_norm": 4.171118259429932, + "learning_rate": 1.6685849387040283e-05, + "loss": 1.1459, + "num_input_tokens_seen": 191553552, + "step": 11905 + }, + { + "epoch": 0.8339944136524031, + "grad_norm": 3.8373475074768066, + "learning_rate": 1.6678851138353762e-05, + "loss": 1.0764, + "num_input_tokens_seen": 191569936, + "step": 11906 + }, + { + "epoch": 0.8340644618981323, + "grad_norm": 5.880360126495361, + "learning_rate": 1.667185288966725e-05, + "loss": 1.0339, + "num_input_tokens_seen": 191583040, + "step": 11907 + }, + { + "epoch": 0.8341345101438615, + "grad_norm": 5.5472846031188965, + "learning_rate": 1.666485464098074e-05, + "loss": 0.887, + "num_input_tokens_seen": 191599424, + "step": 11908 + }, + { + "epoch": 0.8342045583895908, + "grad_norm": 4.756584644317627, + "learning_rate": 1.6657856392294217e-05, + "loss": 1.0014, + "num_input_tokens_seen": 191615480, + "step": 11909 + }, + { + "epoch": 0.83427460663532, + "grad_norm": 4.074771881103516, + "learning_rate": 1.665085814360771e-05, + "loss": 1.1624, + "num_input_tokens_seen": 191631816, + "step": 11910 + }, + { + "epoch": 0.8343446548810494, + "grad_norm": 6.545849323272705, + "learning_rate": 1.6643859894921188e-05, + "loss": 0.9629, + "num_input_tokens_seen": 191648200, + "step": 11911 + }, + { + "epoch": 0.8344147031267786, + "grad_norm": 3.936283588409424, + "learning_rate": 1.6636861646234676e-05, + "loss": 1.0098, + "num_input_tokens_seen": 191664584, + "step": 11912 + }, + { + "epoch": 0.8344847513725078, + "grad_norm": 5.883980751037598, + "learning_rate": 1.6629863397548165e-05, + "loss": 1.1557, + "num_input_tokens_seen": 191679216, + "step": 11913 + }, + { + "epoch": 0.8345547996182371, + "grad_norm": 4.500554084777832, + "learning_rate": 1.6622865148861643e-05, + "loss": 0.9994, + "num_input_tokens_seen": 191695600, + "step": 11914 + }, + { + "epoch": 0.8346248478639663, + "grad_norm": 5.838204860687256, + "learning_rate": 1.6615866900175135e-05, + "loss": 1.2861, + "num_input_tokens_seen": 191711984, + "step": 11915 + }, + { + "epoch": 0.8346948961096956, + "grad_norm": 3.7973411083221436, + "learning_rate": 1.6608868651488614e-05, + "loss": 0.9482, + "num_input_tokens_seen": 191728368, + "step": 11916 + }, + { + "epoch": 0.8347649443554248, + "grad_norm": 3.4707980155944824, + "learning_rate": 1.6601870402802102e-05, + "loss": 1.0501, + "num_input_tokens_seen": 191744752, + "step": 11917 + }, + { + "epoch": 0.834834992601154, + "grad_norm": 4.274454593658447, + "learning_rate": 1.659487215411558e-05, + "loss": 1.0964, + "num_input_tokens_seen": 191760320, + "step": 11918 + }, + { + "epoch": 0.8349050408468833, + "grad_norm": 3.5171470642089844, + "learning_rate": 1.658787390542907e-05, + "loss": 0.8961, + "num_input_tokens_seen": 191776704, + "step": 11919 + }, + { + "epoch": 0.8349750890926125, + "grad_norm": 3.8805525302886963, + "learning_rate": 1.658087565674256e-05, + "loss": 1.044, + "num_input_tokens_seen": 191792768, + "step": 11920 + }, + { + "epoch": 0.8350451373383417, + "grad_norm": 3.9300646781921387, + "learning_rate": 1.657387740805604e-05, + "loss": 1.0734, + "num_input_tokens_seen": 191808704, + "step": 11921 + }, + { + "epoch": 0.835115185584071, + "grad_norm": 4.972050666809082, + "learning_rate": 1.656687915936953e-05, + "loss": 0.9866, + "num_input_tokens_seen": 191824728, + "step": 11922 + }, + { + "epoch": 0.8351852338298003, + "grad_norm": 4.511004447937012, + "learning_rate": 1.6559880910683007e-05, + "loss": 0.829, + "num_input_tokens_seen": 191841112, + "step": 11923 + }, + { + "epoch": 0.8352552820755296, + "grad_norm": 4.488559722900391, + "learning_rate": 1.6552882661996495e-05, + "loss": 1.0066, + "num_input_tokens_seen": 191857496, + "step": 11924 + }, + { + "epoch": 0.8353253303212588, + "grad_norm": 3.7446281909942627, + "learning_rate": 1.6545884413309987e-05, + "loss": 1.0875, + "num_input_tokens_seen": 191873232, + "step": 11925 + }, + { + "epoch": 0.835395378566988, + "grad_norm": 3.9745285511016846, + "learning_rate": 1.6538886164623466e-05, + "loss": 0.9048, + "num_input_tokens_seen": 191889616, + "step": 11926 + }, + { + "epoch": 0.8354654268127173, + "grad_norm": 4.4531331062316895, + "learning_rate": 1.6531887915936954e-05, + "loss": 1.2057, + "num_input_tokens_seen": 191905376, + "step": 11927 + }, + { + "epoch": 0.8355354750584465, + "grad_norm": 3.3426671028137207, + "learning_rate": 1.6524889667250433e-05, + "loss": 0.7893, + "num_input_tokens_seen": 191921760, + "step": 11928 + }, + { + "epoch": 0.8356055233041757, + "grad_norm": 3.5311334133148193, + "learning_rate": 1.651789141856392e-05, + "loss": 0.9343, + "num_input_tokens_seen": 191938144, + "step": 11929 + }, + { + "epoch": 0.835675571549905, + "grad_norm": 3.8315582275390625, + "learning_rate": 1.6510893169877413e-05, + "loss": 1.0148, + "num_input_tokens_seen": 191953648, + "step": 11930 + }, + { + "epoch": 0.8357456197956342, + "grad_norm": 4.489852428436279, + "learning_rate": 1.6503894921190892e-05, + "loss": 1.0099, + "num_input_tokens_seen": 191970032, + "step": 11931 + }, + { + "epoch": 0.8358156680413635, + "grad_norm": 4.0962233543396, + "learning_rate": 1.649689667250438e-05, + "loss": 1.1013, + "num_input_tokens_seen": 191986416, + "step": 11932 + }, + { + "epoch": 0.8358857162870927, + "grad_norm": 4.872864246368408, + "learning_rate": 1.648989842381786e-05, + "loss": 0.9041, + "num_input_tokens_seen": 192001760, + "step": 11933 + }, + { + "epoch": 0.8359557645328219, + "grad_norm": 3.787332057952881, + "learning_rate": 1.6482900175131347e-05, + "loss": 1.0723, + "num_input_tokens_seen": 192018144, + "step": 11934 + }, + { + "epoch": 0.8360258127785513, + "grad_norm": 4.9776611328125, + "learning_rate": 1.647590192644484e-05, + "loss": 1.04, + "num_input_tokens_seen": 192034528, + "step": 11935 + }, + { + "epoch": 0.8360958610242805, + "grad_norm": 3.441718578338623, + "learning_rate": 1.6468903677758318e-05, + "loss": 0.9994, + "num_input_tokens_seen": 192050912, + "step": 11936 + }, + { + "epoch": 0.8361659092700097, + "grad_norm": 3.3847341537475586, + "learning_rate": 1.6461905429071806e-05, + "loss": 0.9932, + "num_input_tokens_seen": 192067296, + "step": 11937 + }, + { + "epoch": 0.836235957515739, + "grad_norm": 4.921849727630615, + "learning_rate": 1.6454907180385285e-05, + "loss": 0.9243, + "num_input_tokens_seen": 192083384, + "step": 11938 + }, + { + "epoch": 0.8363060057614682, + "grad_norm": 3.956958293914795, + "learning_rate": 1.6447908931698773e-05, + "loss": 0.9234, + "num_input_tokens_seen": 192099768, + "step": 11939 + }, + { + "epoch": 0.8363760540071975, + "grad_norm": 5.2705159187316895, + "learning_rate": 1.6440910683012265e-05, + "loss": 0.9994, + "num_input_tokens_seen": 192114120, + "step": 11940 + }, + { + "epoch": 0.8364461022529267, + "grad_norm": 3.4167838096618652, + "learning_rate": 1.6433912434325744e-05, + "loss": 0.974, + "num_input_tokens_seen": 192130336, + "step": 11941 + }, + { + "epoch": 0.8365161504986559, + "grad_norm": 3.9222419261932373, + "learning_rate": 1.6426914185639232e-05, + "loss": 0.9788, + "num_input_tokens_seen": 192146240, + "step": 11942 + }, + { + "epoch": 0.8365861987443852, + "grad_norm": 3.7621796131134033, + "learning_rate": 1.641991593695271e-05, + "loss": 0.9912, + "num_input_tokens_seen": 192162336, + "step": 11943 + }, + { + "epoch": 0.8366562469901144, + "grad_norm": 3.9339025020599365, + "learning_rate": 1.64129176882662e-05, + "loss": 0.9133, + "num_input_tokens_seen": 192178648, + "step": 11944 + }, + { + "epoch": 0.8367262952358437, + "grad_norm": 4.394367694854736, + "learning_rate": 1.6405919439579678e-05, + "loss": 1.0183, + "num_input_tokens_seen": 192194352, + "step": 11945 + }, + { + "epoch": 0.836796343481573, + "grad_norm": 3.9944801330566406, + "learning_rate": 1.639892119089317e-05, + "loss": 0.9616, + "num_input_tokens_seen": 192210736, + "step": 11946 + }, + { + "epoch": 0.8368663917273022, + "grad_norm": 5.175609111785889, + "learning_rate": 1.639192294220666e-05, + "loss": 0.9657, + "num_input_tokens_seen": 192227120, + "step": 11947 + }, + { + "epoch": 0.8369364399730315, + "grad_norm": 3.525728702545166, + "learning_rate": 1.6384924693520137e-05, + "loss": 1.0994, + "num_input_tokens_seen": 192243504, + "step": 11948 + }, + { + "epoch": 0.8370064882187607, + "grad_norm": 3.887847661972046, + "learning_rate": 1.6377926444833625e-05, + "loss": 1.0785, + "num_input_tokens_seen": 192259888, + "step": 11949 + }, + { + "epoch": 0.8370765364644899, + "grad_norm": 3.8701891899108887, + "learning_rate": 1.6370928196147104e-05, + "loss": 1.0396, + "num_input_tokens_seen": 192276088, + "step": 11950 + }, + { + "epoch": 0.8371465847102192, + "grad_norm": 3.749453067779541, + "learning_rate": 1.6363929947460596e-05, + "loss": 0.8971, + "num_input_tokens_seen": 192291816, + "step": 11951 + }, + { + "epoch": 0.8372166329559484, + "grad_norm": 3.5484535694122314, + "learning_rate": 1.6356931698774084e-05, + "loss": 0.9667, + "num_input_tokens_seen": 192308200, + "step": 11952 + }, + { + "epoch": 0.8372866812016777, + "grad_norm": 3.794985055923462, + "learning_rate": 1.6349933450087563e-05, + "loss": 1.1358, + "num_input_tokens_seen": 192323976, + "step": 11953 + }, + { + "epoch": 0.8373567294474069, + "grad_norm": 3.6207921504974365, + "learning_rate": 1.634293520140105e-05, + "loss": 0.9774, + "num_input_tokens_seen": 192340168, + "step": 11954 + }, + { + "epoch": 0.8374267776931361, + "grad_norm": 3.5162854194641113, + "learning_rate": 1.633593695271453e-05, + "loss": 0.9865, + "num_input_tokens_seen": 192355008, + "step": 11955 + }, + { + "epoch": 0.8374968259388654, + "grad_norm": 4.084841251373291, + "learning_rate": 1.6328938704028022e-05, + "loss": 1.0957, + "num_input_tokens_seen": 192371136, + "step": 11956 + }, + { + "epoch": 0.8375668741845946, + "grad_norm": 4.632619380950928, + "learning_rate": 1.632194045534151e-05, + "loss": 1.2732, + "num_input_tokens_seen": 192387208, + "step": 11957 + }, + { + "epoch": 0.8376369224303238, + "grad_norm": 3.657517194747925, + "learning_rate": 1.631494220665499e-05, + "loss": 1.0023, + "num_input_tokens_seen": 192403592, + "step": 11958 + }, + { + "epoch": 0.8377069706760532, + "grad_norm": 3.312901258468628, + "learning_rate": 1.6307943957968477e-05, + "loss": 0.8848, + "num_input_tokens_seen": 192419976, + "step": 11959 + }, + { + "epoch": 0.8377770189217824, + "grad_norm": 5.255041122436523, + "learning_rate": 1.6300945709281956e-05, + "loss": 1.1365, + "num_input_tokens_seen": 192436360, + "step": 11960 + }, + { + "epoch": 0.8378470671675117, + "grad_norm": 4.0180816650390625, + "learning_rate": 1.6293947460595448e-05, + "loss": 1.1832, + "num_input_tokens_seen": 192452712, + "step": 11961 + }, + { + "epoch": 0.8379171154132409, + "grad_norm": 4.9609375, + "learning_rate": 1.6286949211908936e-05, + "loss": 1.062, + "num_input_tokens_seen": 192469096, + "step": 11962 + }, + { + "epoch": 0.8379871636589701, + "grad_norm": 4.360490322113037, + "learning_rate": 1.6279950963222415e-05, + "loss": 1.0455, + "num_input_tokens_seen": 192484880, + "step": 11963 + }, + { + "epoch": 0.8380572119046994, + "grad_norm": 3.758460521697998, + "learning_rate": 1.6272952714535903e-05, + "loss": 1.1217, + "num_input_tokens_seen": 192501264, + "step": 11964 + }, + { + "epoch": 0.8381272601504286, + "grad_norm": 5.65224552154541, + "learning_rate": 1.6265954465849382e-05, + "loss": 1.0149, + "num_input_tokens_seen": 192517648, + "step": 11965 + }, + { + "epoch": 0.8381973083961578, + "grad_norm": 4.073123931884766, + "learning_rate": 1.6258956217162874e-05, + "loss": 1.1151, + "num_input_tokens_seen": 192533904, + "step": 11966 + }, + { + "epoch": 0.8382673566418871, + "grad_norm": 3.949498176574707, + "learning_rate": 1.6251957968476352e-05, + "loss": 1.1474, + "num_input_tokens_seen": 192549672, + "step": 11967 + }, + { + "epoch": 0.8383374048876163, + "grad_norm": 3.4220855236053467, + "learning_rate": 1.624495971978984e-05, + "loss": 0.9255, + "num_input_tokens_seen": 192565608, + "step": 11968 + }, + { + "epoch": 0.8384074531333456, + "grad_norm": 4.09205961227417, + "learning_rate": 1.623796147110333e-05, + "loss": 1.1847, + "num_input_tokens_seen": 192581992, + "step": 11969 + }, + { + "epoch": 0.8384775013790748, + "grad_norm": 4.031548023223877, + "learning_rate": 1.6230963222416808e-05, + "loss": 1.2322, + "num_input_tokens_seen": 192598248, + "step": 11970 + }, + { + "epoch": 0.838547549624804, + "grad_norm": 4.288369655609131, + "learning_rate": 1.62239649737303e-05, + "loss": 0.892, + "num_input_tokens_seen": 192614632, + "step": 11971 + }, + { + "epoch": 0.8386175978705334, + "grad_norm": 3.4618611335754395, + "learning_rate": 1.6216966725043778e-05, + "loss": 1.0448, + "num_input_tokens_seen": 192630528, + "step": 11972 + }, + { + "epoch": 0.8386876461162626, + "grad_norm": 3.985739231109619, + "learning_rate": 1.6209968476357267e-05, + "loss": 1.1417, + "num_input_tokens_seen": 192646912, + "step": 11973 + }, + { + "epoch": 0.8387576943619918, + "grad_norm": 3.323279857635498, + "learning_rate": 1.6202970227670755e-05, + "loss": 0.8119, + "num_input_tokens_seen": 192663296, + "step": 11974 + }, + { + "epoch": 0.8388277426077211, + "grad_norm": 3.9376602172851562, + "learning_rate": 1.6195971978984234e-05, + "loss": 1.1254, + "num_input_tokens_seen": 192679680, + "step": 11975 + }, + { + "epoch": 0.8388977908534503, + "grad_norm": 4.5849738121032715, + "learning_rate": 1.6188973730297726e-05, + "loss": 1.0972, + "num_input_tokens_seen": 192696048, + "step": 11976 + }, + { + "epoch": 0.8389678390991796, + "grad_norm": 6.64498233795166, + "learning_rate": 1.6181975481611204e-05, + "loss": 1.307, + "num_input_tokens_seen": 192712432, + "step": 11977 + }, + { + "epoch": 0.8390378873449088, + "grad_norm": 3.698000192642212, + "learning_rate": 1.6174977232924693e-05, + "loss": 0.9925, + "num_input_tokens_seen": 192728816, + "step": 11978 + }, + { + "epoch": 0.839107935590638, + "grad_norm": 4.3604350090026855, + "learning_rate": 1.616797898423818e-05, + "loss": 1.0756, + "num_input_tokens_seen": 192745200, + "step": 11979 + }, + { + "epoch": 0.8391779838363673, + "grad_norm": 3.8895559310913086, + "learning_rate": 1.616098073555166e-05, + "loss": 0.9677, + "num_input_tokens_seen": 192761584, + "step": 11980 + }, + { + "epoch": 0.8392480320820965, + "grad_norm": 4.303154945373535, + "learning_rate": 1.6153982486865152e-05, + "loss": 1.1094, + "num_input_tokens_seen": 192777968, + "step": 11981 + }, + { + "epoch": 0.8393180803278258, + "grad_norm": 5.070140838623047, + "learning_rate": 1.614698423817863e-05, + "loss": 1.152, + "num_input_tokens_seen": 192794352, + "step": 11982 + }, + { + "epoch": 0.839388128573555, + "grad_norm": 5.493104934692383, + "learning_rate": 1.613998598949212e-05, + "loss": 1.0327, + "num_input_tokens_seen": 192810328, + "step": 11983 + }, + { + "epoch": 0.8394581768192843, + "grad_norm": 3.751399278640747, + "learning_rate": 1.6132987740805607e-05, + "loss": 1.1757, + "num_input_tokens_seen": 192826712, + "step": 11984 + }, + { + "epoch": 0.8395282250650136, + "grad_norm": 3.95241379737854, + "learning_rate": 1.6125989492119086e-05, + "loss": 0.8862, + "num_input_tokens_seen": 192842776, + "step": 11985 + }, + { + "epoch": 0.8395982733107428, + "grad_norm": 4.287566661834717, + "learning_rate": 1.6118991243432578e-05, + "loss": 0.9324, + "num_input_tokens_seen": 192858000, + "step": 11986 + }, + { + "epoch": 0.839668321556472, + "grad_norm": 3.7315032482147217, + "learning_rate": 1.6111992994746056e-05, + "loss": 1.0701, + "num_input_tokens_seen": 192873744, + "step": 11987 + }, + { + "epoch": 0.8397383698022013, + "grad_norm": 3.70556640625, + "learning_rate": 1.6104994746059545e-05, + "loss": 1.1685, + "num_input_tokens_seen": 192889912, + "step": 11988 + }, + { + "epoch": 0.8398084180479305, + "grad_norm": 3.425881862640381, + "learning_rate": 1.6097996497373033e-05, + "loss": 1.0134, + "num_input_tokens_seen": 192906296, + "step": 11989 + }, + { + "epoch": 0.8398784662936598, + "grad_norm": 3.900747776031494, + "learning_rate": 1.6090998248686512e-05, + "loss": 1.1288, + "num_input_tokens_seen": 192922680, + "step": 11990 + }, + { + "epoch": 0.839948514539389, + "grad_norm": 6.073937892913818, + "learning_rate": 1.6084000000000004e-05, + "loss": 1.2558, + "num_input_tokens_seen": 192937080, + "step": 11991 + }, + { + "epoch": 0.8400185627851182, + "grad_norm": 4.0814690589904785, + "learning_rate": 1.6077001751313482e-05, + "loss": 1.1392, + "num_input_tokens_seen": 192953464, + "step": 11992 + }, + { + "epoch": 0.8400886110308475, + "grad_norm": 4.272843360900879, + "learning_rate": 1.607000350262697e-05, + "loss": 0.9855, + "num_input_tokens_seen": 192969848, + "step": 11993 + }, + { + "epoch": 0.8401586592765767, + "grad_norm": 6.016204357147217, + "learning_rate": 1.606300525394045e-05, + "loss": 0.9126, + "num_input_tokens_seen": 192986232, + "step": 11994 + }, + { + "epoch": 0.8402287075223059, + "grad_norm": 3.928853750228882, + "learning_rate": 1.6056007005253938e-05, + "loss": 1.018, + "num_input_tokens_seen": 193001712, + "step": 11995 + }, + { + "epoch": 0.8402987557680353, + "grad_norm": 4.795254707336426, + "learning_rate": 1.604900875656743e-05, + "loss": 0.9967, + "num_input_tokens_seen": 193018096, + "step": 11996 + }, + { + "epoch": 0.8403688040137645, + "grad_norm": 5.224395751953125, + "learning_rate": 1.604201050788091e-05, + "loss": 0.9583, + "num_input_tokens_seen": 193034480, + "step": 11997 + }, + { + "epoch": 0.8404388522594938, + "grad_norm": 6.673128604888916, + "learning_rate": 1.6035012259194397e-05, + "loss": 1.0931, + "num_input_tokens_seen": 193050864, + "step": 11998 + }, + { + "epoch": 0.840508900505223, + "grad_norm": 4.845270156860352, + "learning_rate": 1.6028014010507875e-05, + "loss": 1.0684, + "num_input_tokens_seen": 193065888, + "step": 11999 + }, + { + "epoch": 0.8405789487509522, + "grad_norm": 4.275314807891846, + "learning_rate": 1.6021015761821364e-05, + "loss": 1.0739, + "num_input_tokens_seen": 193082272, + "step": 12000 + }, + { + "epoch": 0.8405789487509522, + "eval_loss": 1.1170098781585693, + "eval_runtime": 0.1868, + "eval_samples_per_second": 5.354, + "eval_steps_per_second": 5.354, + "num_input_tokens_seen": 193082272, + "step": 12000 + }, + { + "epoch": 0.8406489969966815, + "grad_norm": 4.874648571014404, + "learning_rate": 1.6014017513134856e-05, + "loss": 1.0993, + "num_input_tokens_seen": 193098032, + "step": 12001 + }, + { + "epoch": 0.8407190452424107, + "grad_norm": 3.820533037185669, + "learning_rate": 1.6007019264448334e-05, + "loss": 0.9764, + "num_input_tokens_seen": 193114416, + "step": 12002 + }, + { + "epoch": 0.8407890934881399, + "grad_norm": 3.682685136795044, + "learning_rate": 1.6000021015761823e-05, + "loss": 1.066, + "num_input_tokens_seen": 193130800, + "step": 12003 + }, + { + "epoch": 0.8408591417338692, + "grad_norm": 3.3958754539489746, + "learning_rate": 1.59930227670753e-05, + "loss": 0.8914, + "num_input_tokens_seen": 193147184, + "step": 12004 + }, + { + "epoch": 0.8409291899795984, + "grad_norm": 3.5782742500305176, + "learning_rate": 1.598602451838879e-05, + "loss": 0.9994, + "num_input_tokens_seen": 193162480, + "step": 12005 + }, + { + "epoch": 0.8409992382253277, + "grad_norm": 7.2911224365234375, + "learning_rate": 1.5979026269702282e-05, + "loss": 1.134, + "num_input_tokens_seen": 193176280, + "step": 12006 + }, + { + "epoch": 0.8410692864710569, + "grad_norm": 4.041016101837158, + "learning_rate": 1.597202802101576e-05, + "loss": 0.8704, + "num_input_tokens_seen": 193192592, + "step": 12007 + }, + { + "epoch": 0.8411393347167861, + "grad_norm": 5.453434944152832, + "learning_rate": 1.596502977232925e-05, + "loss": 0.9554, + "num_input_tokens_seen": 193208464, + "step": 12008 + }, + { + "epoch": 0.8412093829625155, + "grad_norm": 4.322493076324463, + "learning_rate": 1.5958031523642727e-05, + "loss": 1.3755, + "num_input_tokens_seen": 193224848, + "step": 12009 + }, + { + "epoch": 0.8412794312082447, + "grad_norm": 4.137533187866211, + "learning_rate": 1.5951033274956216e-05, + "loss": 1.2935, + "num_input_tokens_seen": 193240472, + "step": 12010 + }, + { + "epoch": 0.8413494794539739, + "grad_norm": 3.9863550662994385, + "learning_rate": 1.5944035026269708e-05, + "loss": 0.9197, + "num_input_tokens_seen": 193255840, + "step": 12011 + }, + { + "epoch": 0.8414195276997032, + "grad_norm": 4.608818531036377, + "learning_rate": 1.5937036777583186e-05, + "loss": 1.0325, + "num_input_tokens_seen": 193271968, + "step": 12012 + }, + { + "epoch": 0.8414895759454324, + "grad_norm": 3.9822158813476562, + "learning_rate": 1.5930038528896675e-05, + "loss": 1.1793, + "num_input_tokens_seen": 193288256, + "step": 12013 + }, + { + "epoch": 0.8415596241911617, + "grad_norm": 6.067972660064697, + "learning_rate": 1.5923040280210153e-05, + "loss": 1.2544, + "num_input_tokens_seen": 193303824, + "step": 12014 + }, + { + "epoch": 0.8416296724368909, + "grad_norm": 4.0312724113464355, + "learning_rate": 1.5916042031523642e-05, + "loss": 1.0363, + "num_input_tokens_seen": 193319728, + "step": 12015 + }, + { + "epoch": 0.8416997206826201, + "grad_norm": 4.513323783874512, + "learning_rate": 1.5909043782837134e-05, + "loss": 1.0231, + "num_input_tokens_seen": 193335552, + "step": 12016 + }, + { + "epoch": 0.8417697689283494, + "grad_norm": 5.092079162597656, + "learning_rate": 1.5902045534150612e-05, + "loss": 1.0354, + "num_input_tokens_seen": 193351936, + "step": 12017 + }, + { + "epoch": 0.8418398171740786, + "grad_norm": 3.187577486038208, + "learning_rate": 1.58950472854641e-05, + "loss": 0.8578, + "num_input_tokens_seen": 193368000, + "step": 12018 + }, + { + "epoch": 0.841909865419808, + "grad_norm": 4.527098655700684, + "learning_rate": 1.588804903677758e-05, + "loss": 1.2553, + "num_input_tokens_seen": 193384384, + "step": 12019 + }, + { + "epoch": 0.8419799136655371, + "grad_norm": 3.6829335689544678, + "learning_rate": 1.5881050788091068e-05, + "loss": 0.9057, + "num_input_tokens_seen": 193400768, + "step": 12020 + }, + { + "epoch": 0.8420499619112664, + "grad_norm": 4.401511192321777, + "learning_rate": 1.5874052539404546e-05, + "loss": 1.0288, + "num_input_tokens_seen": 193417040, + "step": 12021 + }, + { + "epoch": 0.8421200101569957, + "grad_norm": 4.0875749588012695, + "learning_rate": 1.586705429071804e-05, + "loss": 1.1955, + "num_input_tokens_seen": 193433368, + "step": 12022 + }, + { + "epoch": 0.8421900584027249, + "grad_norm": 4.661693572998047, + "learning_rate": 1.5860056042031527e-05, + "loss": 1.1627, + "num_input_tokens_seen": 193449752, + "step": 12023 + }, + { + "epoch": 0.8422601066484541, + "grad_norm": 6.44402551651001, + "learning_rate": 1.5853057793345005e-05, + "loss": 1.1977, + "num_input_tokens_seen": 193465512, + "step": 12024 + }, + { + "epoch": 0.8423301548941834, + "grad_norm": 3.4406261444091797, + "learning_rate": 1.5846059544658494e-05, + "loss": 0.858, + "num_input_tokens_seen": 193481896, + "step": 12025 + }, + { + "epoch": 0.8424002031399126, + "grad_norm": 3.676628828048706, + "learning_rate": 1.5839061295971972e-05, + "loss": 1.1051, + "num_input_tokens_seen": 193498280, + "step": 12026 + }, + { + "epoch": 0.8424702513856419, + "grad_norm": 4.189825057983398, + "learning_rate": 1.5832063047285464e-05, + "loss": 0.9588, + "num_input_tokens_seen": 193514664, + "step": 12027 + }, + { + "epoch": 0.8425402996313711, + "grad_norm": 4.845769882202148, + "learning_rate": 1.5825064798598953e-05, + "loss": 1.1135, + "num_input_tokens_seen": 193530624, + "step": 12028 + }, + { + "epoch": 0.8426103478771003, + "grad_norm": 4.196181297302246, + "learning_rate": 1.581806654991243e-05, + "loss": 0.904, + "num_input_tokens_seen": 193546560, + "step": 12029 + }, + { + "epoch": 0.8426803961228296, + "grad_norm": 4.761411666870117, + "learning_rate": 1.581106830122592e-05, + "loss": 1.1166, + "num_input_tokens_seen": 193561240, + "step": 12030 + }, + { + "epoch": 0.8427504443685588, + "grad_norm": 3.7880547046661377, + "learning_rate": 1.58040700525394e-05, + "loss": 1.246, + "num_input_tokens_seen": 193577624, + "step": 12031 + }, + { + "epoch": 0.842820492614288, + "grad_norm": 3.6379289627075195, + "learning_rate": 1.579707180385289e-05, + "loss": 1.0783, + "num_input_tokens_seen": 193593800, + "step": 12032 + }, + { + "epoch": 0.8428905408600174, + "grad_norm": 4.195937156677246, + "learning_rate": 1.579007355516638e-05, + "loss": 1.2092, + "num_input_tokens_seen": 193610184, + "step": 12033 + }, + { + "epoch": 0.8429605891057466, + "grad_norm": 4.512848854064941, + "learning_rate": 1.5783075306479857e-05, + "loss": 1.0531, + "num_input_tokens_seen": 193626568, + "step": 12034 + }, + { + "epoch": 0.8430306373514759, + "grad_norm": 3.400192975997925, + "learning_rate": 1.5776077057793346e-05, + "loss": 0.7725, + "num_input_tokens_seen": 193642952, + "step": 12035 + }, + { + "epoch": 0.8431006855972051, + "grad_norm": 3.611207962036133, + "learning_rate": 1.5769078809106824e-05, + "loss": 0.9955, + "num_input_tokens_seen": 193659336, + "step": 12036 + }, + { + "epoch": 0.8431707338429343, + "grad_norm": 4.1401495933532715, + "learning_rate": 1.5762080560420316e-05, + "loss": 1.1379, + "num_input_tokens_seen": 193675720, + "step": 12037 + }, + { + "epoch": 0.8432407820886636, + "grad_norm": 6.437751770019531, + "learning_rate": 1.5755082311733805e-05, + "loss": 0.984, + "num_input_tokens_seen": 193691528, + "step": 12038 + }, + { + "epoch": 0.8433108303343928, + "grad_norm": 3.9966866970062256, + "learning_rate": 1.5748084063047283e-05, + "loss": 1.221, + "num_input_tokens_seen": 193707912, + "step": 12039 + }, + { + "epoch": 0.843380878580122, + "grad_norm": 3.518955707550049, + "learning_rate": 1.5741085814360772e-05, + "loss": 0.8782, + "num_input_tokens_seen": 193723760, + "step": 12040 + }, + { + "epoch": 0.8434509268258513, + "grad_norm": 3.6474783420562744, + "learning_rate": 1.573408756567425e-05, + "loss": 1.013, + "num_input_tokens_seen": 193739856, + "step": 12041 + }, + { + "epoch": 0.8435209750715805, + "grad_norm": 4.614313125610352, + "learning_rate": 1.5727089316987742e-05, + "loss": 0.9218, + "num_input_tokens_seen": 193753864, + "step": 12042 + }, + { + "epoch": 0.8435910233173098, + "grad_norm": 3.778559684753418, + "learning_rate": 1.572009106830123e-05, + "loss": 1.2125, + "num_input_tokens_seen": 193770248, + "step": 12043 + }, + { + "epoch": 0.843661071563039, + "grad_norm": 3.4948506355285645, + "learning_rate": 1.571309281961471e-05, + "loss": 0.8957, + "num_input_tokens_seen": 193786632, + "step": 12044 + }, + { + "epoch": 0.8437311198087682, + "grad_norm": 3.710836887359619, + "learning_rate": 1.5706094570928198e-05, + "loss": 1.1372, + "num_input_tokens_seen": 193802056, + "step": 12045 + }, + { + "epoch": 0.8438011680544976, + "grad_norm": 4.412574291229248, + "learning_rate": 1.5699096322241676e-05, + "loss": 0.9202, + "num_input_tokens_seen": 193818440, + "step": 12046 + }, + { + "epoch": 0.8438712163002268, + "grad_norm": 4.510186672210693, + "learning_rate": 1.569209807355517e-05, + "loss": 1.2191, + "num_input_tokens_seen": 193834392, + "step": 12047 + }, + { + "epoch": 0.8439412645459561, + "grad_norm": 3.928220748901367, + "learning_rate": 1.5685099824868647e-05, + "loss": 1.0116, + "num_input_tokens_seen": 193850776, + "step": 12048 + }, + { + "epoch": 0.8440113127916853, + "grad_norm": 4.514281749725342, + "learning_rate": 1.5678101576182135e-05, + "loss": 1.3785, + "num_input_tokens_seen": 193866552, + "step": 12049 + }, + { + "epoch": 0.8440813610374145, + "grad_norm": 3.6730780601501465, + "learning_rate": 1.5671103327495624e-05, + "loss": 1.1029, + "num_input_tokens_seen": 193882936, + "step": 12050 + }, + { + "epoch": 0.8441514092831438, + "grad_norm": 4.554709434509277, + "learning_rate": 1.5664105078809102e-05, + "loss": 1.0638, + "num_input_tokens_seen": 193899320, + "step": 12051 + }, + { + "epoch": 0.844221457528873, + "grad_norm": 3.493704080581665, + "learning_rate": 1.5657106830122594e-05, + "loss": 0.9809, + "num_input_tokens_seen": 193915616, + "step": 12052 + }, + { + "epoch": 0.8442915057746022, + "grad_norm": 4.491965293884277, + "learning_rate": 1.5650108581436073e-05, + "loss": 1.012, + "num_input_tokens_seen": 193932000, + "step": 12053 + }, + { + "epoch": 0.8443615540203315, + "grad_norm": 3.449021816253662, + "learning_rate": 1.564311033274956e-05, + "loss": 0.946, + "num_input_tokens_seen": 193947848, + "step": 12054 + }, + { + "epoch": 0.8444316022660607, + "grad_norm": 3.8101770877838135, + "learning_rate": 1.563611208406305e-05, + "loss": 0.8982, + "num_input_tokens_seen": 193964232, + "step": 12055 + }, + { + "epoch": 0.84450165051179, + "grad_norm": 3.480774164199829, + "learning_rate": 1.562911383537653e-05, + "loss": 0.7605, + "num_input_tokens_seen": 193980616, + "step": 12056 + }, + { + "epoch": 0.8445716987575193, + "grad_norm": 3.954725742340088, + "learning_rate": 1.562211558669002e-05, + "loss": 1.1141, + "num_input_tokens_seen": 193997000, + "step": 12057 + }, + { + "epoch": 0.8446417470032485, + "grad_norm": 3.814922332763672, + "learning_rate": 1.56151173380035e-05, + "loss": 0.9685, + "num_input_tokens_seen": 194012296, + "step": 12058 + }, + { + "epoch": 0.8447117952489778, + "grad_norm": 4.4602251052856445, + "learning_rate": 1.5608119089316987e-05, + "loss": 1.1335, + "num_input_tokens_seen": 194028184, + "step": 12059 + }, + { + "epoch": 0.844781843494707, + "grad_norm": 3.3978233337402344, + "learning_rate": 1.5601120840630476e-05, + "loss": 0.7906, + "num_input_tokens_seen": 194044568, + "step": 12060 + }, + { + "epoch": 0.8448518917404362, + "grad_norm": 4.242612361907959, + "learning_rate": 1.5594122591943954e-05, + "loss": 0.9523, + "num_input_tokens_seen": 194060952, + "step": 12061 + }, + { + "epoch": 0.8449219399861655, + "grad_norm": 4.309764385223389, + "learning_rate": 1.5587124343257446e-05, + "loss": 1.232, + "num_input_tokens_seen": 194076464, + "step": 12062 + }, + { + "epoch": 0.8449919882318947, + "grad_norm": 3.8207156658172607, + "learning_rate": 1.5580126094570925e-05, + "loss": 1.1488, + "num_input_tokens_seen": 194092792, + "step": 12063 + }, + { + "epoch": 0.845062036477624, + "grad_norm": 3.7894468307495117, + "learning_rate": 1.5573127845884413e-05, + "loss": 1.0523, + "num_input_tokens_seen": 194108800, + "step": 12064 + }, + { + "epoch": 0.8451320847233532, + "grad_norm": 4.23434591293335, + "learning_rate": 1.5566129597197902e-05, + "loss": 1.1992, + "num_input_tokens_seen": 194124760, + "step": 12065 + }, + { + "epoch": 0.8452021329690824, + "grad_norm": 4.168613433837891, + "learning_rate": 1.555913134851138e-05, + "loss": 1.0458, + "num_input_tokens_seen": 194140840, + "step": 12066 + }, + { + "epoch": 0.8452721812148117, + "grad_norm": 3.7115471363067627, + "learning_rate": 1.5552133099824872e-05, + "loss": 1.1023, + "num_input_tokens_seen": 194157224, + "step": 12067 + }, + { + "epoch": 0.8453422294605409, + "grad_norm": 3.447483539581299, + "learning_rate": 1.554513485113835e-05, + "loss": 1.0047, + "num_input_tokens_seen": 194173608, + "step": 12068 + }, + { + "epoch": 0.8454122777062701, + "grad_norm": 4.164474964141846, + "learning_rate": 1.553813660245184e-05, + "loss": 0.7798, + "num_input_tokens_seen": 194189728, + "step": 12069 + }, + { + "epoch": 0.8454823259519995, + "grad_norm": 4.6112236976623535, + "learning_rate": 1.5531138353765328e-05, + "loss": 1.1556, + "num_input_tokens_seen": 194205440, + "step": 12070 + }, + { + "epoch": 0.8455523741977287, + "grad_norm": 4.205389499664307, + "learning_rate": 1.5524140105078807e-05, + "loss": 1.1568, + "num_input_tokens_seen": 194221824, + "step": 12071 + }, + { + "epoch": 0.845622422443458, + "grad_norm": 5.151251316070557, + "learning_rate": 1.55171418563923e-05, + "loss": 1.0378, + "num_input_tokens_seen": 194237976, + "step": 12072 + }, + { + "epoch": 0.8456924706891872, + "grad_norm": 4.48797082901001, + "learning_rate": 1.5510143607705777e-05, + "loss": 1.344, + "num_input_tokens_seen": 194254360, + "step": 12073 + }, + { + "epoch": 0.8457625189349164, + "grad_norm": 5.361583232879639, + "learning_rate": 1.5503145359019266e-05, + "loss": 1.1283, + "num_input_tokens_seen": 194270744, + "step": 12074 + }, + { + "epoch": 0.8458325671806457, + "grad_norm": 5.913809299468994, + "learning_rate": 1.5496147110332744e-05, + "loss": 1.1146, + "num_input_tokens_seen": 194286952, + "step": 12075 + }, + { + "epoch": 0.8459026154263749, + "grad_norm": 6.253215312957764, + "learning_rate": 1.5489148861646233e-05, + "loss": 1.0014, + "num_input_tokens_seen": 194303336, + "step": 12076 + }, + { + "epoch": 0.8459726636721041, + "grad_norm": 4.153392314910889, + "learning_rate": 1.548215061295972e-05, + "loss": 1.1396, + "num_input_tokens_seen": 194319720, + "step": 12077 + }, + { + "epoch": 0.8460427119178334, + "grad_norm": 4.02133846282959, + "learning_rate": 1.5475152364273203e-05, + "loss": 1.2252, + "num_input_tokens_seen": 194336104, + "step": 12078 + }, + { + "epoch": 0.8461127601635626, + "grad_norm": 3.883892297744751, + "learning_rate": 1.546815411558669e-05, + "loss": 1.1779, + "num_input_tokens_seen": 194352488, + "step": 12079 + }, + { + "epoch": 0.8461828084092919, + "grad_norm": 3.8820319175720215, + "learning_rate": 1.546115586690017e-05, + "loss": 1.1007, + "num_input_tokens_seen": 194368088, + "step": 12080 + }, + { + "epoch": 0.8462528566550211, + "grad_norm": 3.4272522926330566, + "learning_rate": 1.545415761821366e-05, + "loss": 0.8426, + "num_input_tokens_seen": 194384472, + "step": 12081 + }, + { + "epoch": 0.8463229049007504, + "grad_norm": 3.539595603942871, + "learning_rate": 1.5447159369527147e-05, + "loss": 0.9267, + "num_input_tokens_seen": 194400856, + "step": 12082 + }, + { + "epoch": 0.8463929531464797, + "grad_norm": 3.765079975128174, + "learning_rate": 1.544016112084063e-05, + "loss": 0.9278, + "num_input_tokens_seen": 194415704, + "step": 12083 + }, + { + "epoch": 0.8464630013922089, + "grad_norm": 3.6053740978240967, + "learning_rate": 1.5433162872154118e-05, + "loss": 1.0176, + "num_input_tokens_seen": 194431264, + "step": 12084 + }, + { + "epoch": 0.8465330496379382, + "grad_norm": 4.253146648406982, + "learning_rate": 1.5426164623467596e-05, + "loss": 1.2418, + "num_input_tokens_seen": 194447416, + "step": 12085 + }, + { + "epoch": 0.8466030978836674, + "grad_norm": 5.376433372497559, + "learning_rate": 1.5419166374781085e-05, + "loss": 1.1494, + "num_input_tokens_seen": 194463208, + "step": 12086 + }, + { + "epoch": 0.8466731461293966, + "grad_norm": 4.156835556030273, + "learning_rate": 1.5412168126094573e-05, + "loss": 0.9351, + "num_input_tokens_seen": 194479592, + "step": 12087 + }, + { + "epoch": 0.8467431943751259, + "grad_norm": 4.1842145919799805, + "learning_rate": 1.5405169877408055e-05, + "loss": 0.9753, + "num_input_tokens_seen": 194495976, + "step": 12088 + }, + { + "epoch": 0.8468132426208551, + "grad_norm": 4.8296308517456055, + "learning_rate": 1.5398171628721544e-05, + "loss": 1.0778, + "num_input_tokens_seen": 194510480, + "step": 12089 + }, + { + "epoch": 0.8468832908665843, + "grad_norm": 3.724938154220581, + "learning_rate": 1.5391173380035022e-05, + "loss": 1.127, + "num_input_tokens_seen": 194525624, + "step": 12090 + }, + { + "epoch": 0.8469533391123136, + "grad_norm": 3.81607985496521, + "learning_rate": 1.538417513134851e-05, + "loss": 0.978, + "num_input_tokens_seen": 194541648, + "step": 12091 + }, + { + "epoch": 0.8470233873580428, + "grad_norm": 4.193657875061035, + "learning_rate": 1.5377176882662e-05, + "loss": 1.2898, + "num_input_tokens_seen": 194557912, + "step": 12092 + }, + { + "epoch": 0.8470934356037721, + "grad_norm": 3.599743366241455, + "learning_rate": 1.537017863397548e-05, + "loss": 1.0182, + "num_input_tokens_seen": 194573584, + "step": 12093 + }, + { + "epoch": 0.8471634838495014, + "grad_norm": 3.405317544937134, + "learning_rate": 1.536318038528897e-05, + "loss": 1.0192, + "num_input_tokens_seen": 194589968, + "step": 12094 + }, + { + "epoch": 0.8472335320952306, + "grad_norm": 3.4924840927124023, + "learning_rate": 1.5356182136602448e-05, + "loss": 0.8593, + "num_input_tokens_seen": 194606352, + "step": 12095 + }, + { + "epoch": 0.8473035803409599, + "grad_norm": 5.599239349365234, + "learning_rate": 1.5349183887915937e-05, + "loss": 1.0632, + "num_input_tokens_seen": 194622736, + "step": 12096 + }, + { + "epoch": 0.8473736285866891, + "grad_norm": 4.667746543884277, + "learning_rate": 1.534218563922943e-05, + "loss": 1.0779, + "num_input_tokens_seen": 194639120, + "step": 12097 + }, + { + "epoch": 0.8474436768324183, + "grad_norm": 3.836945056915283, + "learning_rate": 1.5335187390542907e-05, + "loss": 0.942, + "num_input_tokens_seen": 194655504, + "step": 12098 + }, + { + "epoch": 0.8475137250781476, + "grad_norm": 3.948596477508545, + "learning_rate": 1.5328189141856396e-05, + "loss": 0.9226, + "num_input_tokens_seen": 194670744, + "step": 12099 + }, + { + "epoch": 0.8475837733238768, + "grad_norm": 3.88057017326355, + "learning_rate": 1.5321190893169874e-05, + "loss": 0.9096, + "num_input_tokens_seen": 194686520, + "step": 12100 + }, + { + "epoch": 0.8476538215696061, + "grad_norm": 3.883319854736328, + "learning_rate": 1.5314192644483363e-05, + "loss": 0.8304, + "num_input_tokens_seen": 194702360, + "step": 12101 + }, + { + "epoch": 0.8477238698153353, + "grad_norm": 3.7630467414855957, + "learning_rate": 1.530719439579684e-05, + "loss": 1.0092, + "num_input_tokens_seen": 194718744, + "step": 12102 + }, + { + "epoch": 0.8477939180610645, + "grad_norm": 4.748619079589844, + "learning_rate": 1.5300196147110333e-05, + "loss": 1.0075, + "num_input_tokens_seen": 194735128, + "step": 12103 + }, + { + "epoch": 0.8478639663067938, + "grad_norm": 3.6898598670959473, + "learning_rate": 1.529319789842382e-05, + "loss": 0.8899, + "num_input_tokens_seen": 194751280, + "step": 12104 + }, + { + "epoch": 0.847934014552523, + "grad_norm": 3.9355263710021973, + "learning_rate": 1.52861996497373e-05, + "loss": 1.2754, + "num_input_tokens_seen": 194766952, + "step": 12105 + }, + { + "epoch": 0.8480040627982522, + "grad_norm": 3.87125825881958, + "learning_rate": 1.527920140105079e-05, + "loss": 1.0591, + "num_input_tokens_seen": 194782224, + "step": 12106 + }, + { + "epoch": 0.8480741110439816, + "grad_norm": 3.6940572261810303, + "learning_rate": 1.5272203152364267e-05, + "loss": 1.0524, + "num_input_tokens_seen": 194798608, + "step": 12107 + }, + { + "epoch": 0.8481441592897108, + "grad_norm": 5.229637145996094, + "learning_rate": 1.526520490367776e-05, + "loss": 1.5138, + "num_input_tokens_seen": 194814648, + "step": 12108 + }, + { + "epoch": 0.8482142075354401, + "grad_norm": 3.936601161956787, + "learning_rate": 1.525820665499125e-05, + "loss": 1.1033, + "num_input_tokens_seen": 194831032, + "step": 12109 + }, + { + "epoch": 0.8482842557811693, + "grad_norm": 4.891690731048584, + "learning_rate": 1.525120840630473e-05, + "loss": 0.9392, + "num_input_tokens_seen": 194847416, + "step": 12110 + }, + { + "epoch": 0.8483543040268985, + "grad_norm": 3.5713164806365967, + "learning_rate": 1.5244210157618216e-05, + "loss": 1.1037, + "num_input_tokens_seen": 194863800, + "step": 12111 + }, + { + "epoch": 0.8484243522726278, + "grad_norm": 3.3758444786071777, + "learning_rate": 1.5237211908931695e-05, + "loss": 0.9943, + "num_input_tokens_seen": 194880184, + "step": 12112 + }, + { + "epoch": 0.848494400518357, + "grad_norm": 4.755926609039307, + "learning_rate": 1.5230213660245185e-05, + "loss": 1.0546, + "num_input_tokens_seen": 194896120, + "step": 12113 + }, + { + "epoch": 0.8485644487640862, + "grad_norm": 5.808522701263428, + "learning_rate": 1.5223215411558675e-05, + "loss": 1.2515, + "num_input_tokens_seen": 194909920, + "step": 12114 + }, + { + "epoch": 0.8486344970098155, + "grad_norm": 4.118279457092285, + "learning_rate": 1.5216217162872154e-05, + "loss": 1.001, + "num_input_tokens_seen": 194926120, + "step": 12115 + }, + { + "epoch": 0.8487045452555447, + "grad_norm": 3.8041601181030273, + "learning_rate": 1.520921891418564e-05, + "loss": 0.9628, + "num_input_tokens_seen": 194941696, + "step": 12116 + }, + { + "epoch": 0.848774593501274, + "grad_norm": 4.610496520996094, + "learning_rate": 1.520222066549912e-05, + "loss": 1.0791, + "num_input_tokens_seen": 194956064, + "step": 12117 + }, + { + "epoch": 0.8488446417470032, + "grad_norm": 4.996589660644531, + "learning_rate": 1.519522241681261e-05, + "loss": 1.0579, + "num_input_tokens_seen": 194972448, + "step": 12118 + }, + { + "epoch": 0.8489146899927325, + "grad_norm": 3.7913153171539307, + "learning_rate": 1.51882241681261e-05, + "loss": 0.948, + "num_input_tokens_seen": 194988136, + "step": 12119 + }, + { + "epoch": 0.8489847382384618, + "grad_norm": 3.5915844440460205, + "learning_rate": 1.518122591943958e-05, + "loss": 0.9078, + "num_input_tokens_seen": 195004520, + "step": 12120 + }, + { + "epoch": 0.849054786484191, + "grad_norm": 3.573150873184204, + "learning_rate": 1.5174227670753068e-05, + "loss": 1.0403, + "num_input_tokens_seen": 195020904, + "step": 12121 + }, + { + "epoch": 0.8491248347299203, + "grad_norm": 4.482332706451416, + "learning_rate": 1.5167229422066548e-05, + "loss": 0.9826, + "num_input_tokens_seen": 195037288, + "step": 12122 + }, + { + "epoch": 0.8491948829756495, + "grad_norm": 4.119875431060791, + "learning_rate": 1.5160231173380037e-05, + "loss": 1.1246, + "num_input_tokens_seen": 195053592, + "step": 12123 + }, + { + "epoch": 0.8492649312213787, + "grad_norm": 3.9764111042022705, + "learning_rate": 1.5153232924693527e-05, + "loss": 1.0534, + "num_input_tokens_seen": 195069976, + "step": 12124 + }, + { + "epoch": 0.849334979467108, + "grad_norm": 4.519789695739746, + "learning_rate": 1.5146234676007007e-05, + "loss": 1.0975, + "num_input_tokens_seen": 195086080, + "step": 12125 + }, + { + "epoch": 0.8494050277128372, + "grad_norm": 3.5267138481140137, + "learning_rate": 1.5139236427320494e-05, + "loss": 0.7535, + "num_input_tokens_seen": 195102392, + "step": 12126 + }, + { + "epoch": 0.8494750759585664, + "grad_norm": 3.8588201999664307, + "learning_rate": 1.5132238178633973e-05, + "loss": 1.0814, + "num_input_tokens_seen": 195118696, + "step": 12127 + }, + { + "epoch": 0.8495451242042957, + "grad_norm": 3.6477701663970947, + "learning_rate": 1.5125239929947463e-05, + "loss": 1.085, + "num_input_tokens_seen": 195135080, + "step": 12128 + }, + { + "epoch": 0.8496151724500249, + "grad_norm": 3.499056339263916, + "learning_rate": 1.5118241681260941e-05, + "loss": 0.9353, + "num_input_tokens_seen": 195151464, + "step": 12129 + }, + { + "epoch": 0.8496852206957543, + "grad_norm": 3.617955446243286, + "learning_rate": 1.5111243432574432e-05, + "loss": 0.8895, + "num_input_tokens_seen": 195167296, + "step": 12130 + }, + { + "epoch": 0.8497552689414835, + "grad_norm": 5.047911643981934, + "learning_rate": 1.5104245183887919e-05, + "loss": 1.0272, + "num_input_tokens_seen": 195183680, + "step": 12131 + }, + { + "epoch": 0.8498253171872127, + "grad_norm": 3.200286865234375, + "learning_rate": 1.5097246935201397e-05, + "loss": 0.9936, + "num_input_tokens_seen": 195199584, + "step": 12132 + }, + { + "epoch": 0.849895365432942, + "grad_norm": 3.8377163410186768, + "learning_rate": 1.5090248686514887e-05, + "loss": 1.099, + "num_input_tokens_seen": 195215968, + "step": 12133 + }, + { + "epoch": 0.8499654136786712, + "grad_norm": 3.689974784851074, + "learning_rate": 1.5083250437828367e-05, + "loss": 0.9951, + "num_input_tokens_seen": 195232352, + "step": 12134 + }, + { + "epoch": 0.8500354619244004, + "grad_norm": 3.8295516967773438, + "learning_rate": 1.5076252189141856e-05, + "loss": 1.0135, + "num_input_tokens_seen": 195248736, + "step": 12135 + }, + { + "epoch": 0.8501055101701297, + "grad_norm": 3.398880958557129, + "learning_rate": 1.5069253940455346e-05, + "loss": 1.0082, + "num_input_tokens_seen": 195265120, + "step": 12136 + }, + { + "epoch": 0.8501755584158589, + "grad_norm": 3.7320990562438965, + "learning_rate": 1.5062255691768826e-05, + "loss": 0.9868, + "num_input_tokens_seen": 195281504, + "step": 12137 + }, + { + "epoch": 0.8502456066615882, + "grad_norm": 3.9108645915985107, + "learning_rate": 1.5055257443082315e-05, + "loss": 1.2336, + "num_input_tokens_seen": 195297888, + "step": 12138 + }, + { + "epoch": 0.8503156549073174, + "grad_norm": 4.6531195640563965, + "learning_rate": 1.5048259194395795e-05, + "loss": 1.1608, + "num_input_tokens_seen": 195314272, + "step": 12139 + }, + { + "epoch": 0.8503857031530466, + "grad_norm": 4.298128128051758, + "learning_rate": 1.5041260945709284e-05, + "loss": 1.1676, + "num_input_tokens_seen": 195330216, + "step": 12140 + }, + { + "epoch": 0.8504557513987759, + "grad_norm": 3.442682981491089, + "learning_rate": 1.5034262697022772e-05, + "loss": 1.066, + "num_input_tokens_seen": 195346600, + "step": 12141 + }, + { + "epoch": 0.8505257996445051, + "grad_norm": 3.7144229412078857, + "learning_rate": 1.502726444833625e-05, + "loss": 1.1384, + "num_input_tokens_seen": 195362808, + "step": 12142 + }, + { + "epoch": 0.8505958478902343, + "grad_norm": 3.6948165893554688, + "learning_rate": 1.5020266199649741e-05, + "loss": 1.0429, + "num_input_tokens_seen": 195379192, + "step": 12143 + }, + { + "epoch": 0.8506658961359637, + "grad_norm": 3.605437994003296, + "learning_rate": 1.501326795096322e-05, + "loss": 1.1438, + "num_input_tokens_seen": 195395480, + "step": 12144 + }, + { + "epoch": 0.8507359443816929, + "grad_norm": 4.07830810546875, + "learning_rate": 1.500626970227671e-05, + "loss": 1.1201, + "num_input_tokens_seen": 195411864, + "step": 12145 + }, + { + "epoch": 0.8508059926274222, + "grad_norm": 4.108992099761963, + "learning_rate": 1.49992714535902e-05, + "loss": 1.0336, + "num_input_tokens_seen": 195428248, + "step": 12146 + }, + { + "epoch": 0.8508760408731514, + "grad_norm": 3.28821063041687, + "learning_rate": 1.4992273204903675e-05, + "loss": 0.9515, + "num_input_tokens_seen": 195444632, + "step": 12147 + }, + { + "epoch": 0.8509460891188806, + "grad_norm": 4.842494487762451, + "learning_rate": 1.4985274956217165e-05, + "loss": 0.923, + "num_input_tokens_seen": 195460416, + "step": 12148 + }, + { + "epoch": 0.8510161373646099, + "grad_norm": 3.6901016235351562, + "learning_rate": 1.4978276707530644e-05, + "loss": 0.9458, + "num_input_tokens_seen": 195476800, + "step": 12149 + }, + { + "epoch": 0.8510861856103391, + "grad_norm": 4.727509021759033, + "learning_rate": 1.4971278458844134e-05, + "loss": 1.0369, + "num_input_tokens_seen": 195493184, + "step": 12150 + }, + { + "epoch": 0.8511562338560683, + "grad_norm": 5.172854900360107, + "learning_rate": 1.4964280210157624e-05, + "loss": 1.2652, + "num_input_tokens_seen": 195508424, + "step": 12151 + }, + { + "epoch": 0.8512262821017976, + "grad_norm": 6.034160137176514, + "learning_rate": 1.4957281961471103e-05, + "loss": 1.1074, + "num_input_tokens_seen": 195524296, + "step": 12152 + }, + { + "epoch": 0.8512963303475268, + "grad_norm": 3.9685332775115967, + "learning_rate": 1.4950283712784593e-05, + "loss": 0.9542, + "num_input_tokens_seen": 195540000, + "step": 12153 + }, + { + "epoch": 0.8513663785932561, + "grad_norm": 4.82825231552124, + "learning_rate": 1.4943285464098073e-05, + "loss": 1.0483, + "num_input_tokens_seen": 195555432, + "step": 12154 + }, + { + "epoch": 0.8514364268389853, + "grad_norm": 3.818240165710449, + "learning_rate": 1.4936287215411562e-05, + "loss": 1.0108, + "num_input_tokens_seen": 195571816, + "step": 12155 + }, + { + "epoch": 0.8515064750847146, + "grad_norm": 3.7787468433380127, + "learning_rate": 1.4929288966725042e-05, + "loss": 1.0192, + "num_input_tokens_seen": 195587736, + "step": 12156 + }, + { + "epoch": 0.8515765233304439, + "grad_norm": 4.150100231170654, + "learning_rate": 1.4922290718038529e-05, + "loss": 0.9811, + "num_input_tokens_seen": 195602816, + "step": 12157 + }, + { + "epoch": 0.8516465715761731, + "grad_norm": 4.366484642028809, + "learning_rate": 1.4915292469352019e-05, + "loss": 1.1145, + "num_input_tokens_seen": 195617864, + "step": 12158 + }, + { + "epoch": 0.8517166198219024, + "grad_norm": 3.6240880489349365, + "learning_rate": 1.4908294220665498e-05, + "loss": 1.0606, + "num_input_tokens_seen": 195634248, + "step": 12159 + }, + { + "epoch": 0.8517866680676316, + "grad_norm": 3.6645634174346924, + "learning_rate": 1.4901295971978988e-05, + "loss": 0.8817, + "num_input_tokens_seen": 195650160, + "step": 12160 + }, + { + "epoch": 0.8518567163133608, + "grad_norm": 3.898866891860962, + "learning_rate": 1.4894297723292466e-05, + "loss": 0.857, + "num_input_tokens_seen": 195665664, + "step": 12161 + }, + { + "epoch": 0.8519267645590901, + "grad_norm": 6.77559232711792, + "learning_rate": 1.4887299474605953e-05, + "loss": 1.068, + "num_input_tokens_seen": 195682048, + "step": 12162 + }, + { + "epoch": 0.8519968128048193, + "grad_norm": 5.019065856933594, + "learning_rate": 1.4880301225919447e-05, + "loss": 1.1271, + "num_input_tokens_seen": 195698432, + "step": 12163 + }, + { + "epoch": 0.8520668610505485, + "grad_norm": 7.717326641082764, + "learning_rate": 1.4873302977232922e-05, + "loss": 1.3472, + "num_input_tokens_seen": 195714816, + "step": 12164 + }, + { + "epoch": 0.8521369092962778, + "grad_norm": 4.829509258270264, + "learning_rate": 1.4866304728546412e-05, + "loss": 1.0293, + "num_input_tokens_seen": 195730608, + "step": 12165 + }, + { + "epoch": 0.852206957542007, + "grad_norm": 4.767049312591553, + "learning_rate": 1.485930647985989e-05, + "loss": 1.1888, + "num_input_tokens_seen": 195746112, + "step": 12166 + }, + { + "epoch": 0.8522770057877364, + "grad_norm": 4.737157821655273, + "learning_rate": 1.485230823117338e-05, + "loss": 1.0307, + "num_input_tokens_seen": 195762032, + "step": 12167 + }, + { + "epoch": 0.8523470540334656, + "grad_norm": 4.816120624542236, + "learning_rate": 1.4845309982486871e-05, + "loss": 0.9283, + "num_input_tokens_seen": 195778416, + "step": 12168 + }, + { + "epoch": 0.8524171022791948, + "grad_norm": 3.637639284133911, + "learning_rate": 1.483831173380035e-05, + "loss": 0.9608, + "num_input_tokens_seen": 195794656, + "step": 12169 + }, + { + "epoch": 0.8524871505249241, + "grad_norm": 3.682736873626709, + "learning_rate": 1.4831313485113838e-05, + "loss": 0.9742, + "num_input_tokens_seen": 195810560, + "step": 12170 + }, + { + "epoch": 0.8525571987706533, + "grad_norm": 4.753480434417725, + "learning_rate": 1.482431523642732e-05, + "loss": 1.037, + "num_input_tokens_seen": 195826136, + "step": 12171 + }, + { + "epoch": 0.8526272470163825, + "grad_norm": 4.471009254455566, + "learning_rate": 1.4817316987740807e-05, + "loss": 0.9728, + "num_input_tokens_seen": 195842504, + "step": 12172 + }, + { + "epoch": 0.8526972952621118, + "grad_norm": 5.294859886169434, + "learning_rate": 1.4810318739054297e-05, + "loss": 1.0611, + "num_input_tokens_seen": 195858888, + "step": 12173 + }, + { + "epoch": 0.852767343507841, + "grad_norm": 3.4464609622955322, + "learning_rate": 1.4803320490367776e-05, + "loss": 0.9663, + "num_input_tokens_seen": 195875272, + "step": 12174 + }, + { + "epoch": 0.8528373917535703, + "grad_norm": 3.803563356399536, + "learning_rate": 1.4796322241681266e-05, + "loss": 1.0965, + "num_input_tokens_seen": 195891656, + "step": 12175 + }, + { + "epoch": 0.8529074399992995, + "grad_norm": 3.900587320327759, + "learning_rate": 1.4789323992994744e-05, + "loss": 1.0287, + "num_input_tokens_seen": 195908040, + "step": 12176 + }, + { + "epoch": 0.8529774882450287, + "grad_norm": 3.732893705368042, + "learning_rate": 1.4782325744308235e-05, + "loss": 1.0211, + "num_input_tokens_seen": 195924424, + "step": 12177 + }, + { + "epoch": 0.853047536490758, + "grad_norm": 3.786726236343384, + "learning_rate": 1.4775327495621725e-05, + "loss": 1.0154, + "num_input_tokens_seen": 195940808, + "step": 12178 + }, + { + "epoch": 0.8531175847364872, + "grad_norm": 3.8202338218688965, + "learning_rate": 1.47683292469352e-05, + "loss": 1.074, + "num_input_tokens_seen": 195957192, + "step": 12179 + }, + { + "epoch": 0.8531876329822164, + "grad_norm": 3.867497205734253, + "learning_rate": 1.476133099824869e-05, + "loss": 1.1185, + "num_input_tokens_seen": 195973576, + "step": 12180 + }, + { + "epoch": 0.8532576812279458, + "grad_norm": 4.039933681488037, + "learning_rate": 1.4754332749562169e-05, + "loss": 1.2435, + "num_input_tokens_seen": 195989960, + "step": 12181 + }, + { + "epoch": 0.853327729473675, + "grad_norm": 4.168961524963379, + "learning_rate": 1.4747334500875659e-05, + "loss": 1.1434, + "num_input_tokens_seen": 196006344, + "step": 12182 + }, + { + "epoch": 0.8533977777194043, + "grad_norm": 4.885080814361572, + "learning_rate": 1.4740336252189137e-05, + "loss": 0.9876, + "num_input_tokens_seen": 196021472, + "step": 12183 + }, + { + "epoch": 0.8534678259651335, + "grad_norm": 4.192296028137207, + "learning_rate": 1.4733338003502628e-05, + "loss": 1.0329, + "num_input_tokens_seen": 196037856, + "step": 12184 + }, + { + "epoch": 0.8535378742108627, + "grad_norm": 3.7160255908966064, + "learning_rate": 1.4726339754816114e-05, + "loss": 1.1324, + "num_input_tokens_seen": 196054240, + "step": 12185 + }, + { + "epoch": 0.853607922456592, + "grad_norm": 4.594933032989502, + "learning_rate": 1.4719341506129595e-05, + "loss": 1.1736, + "num_input_tokens_seen": 196069872, + "step": 12186 + }, + { + "epoch": 0.8536779707023212, + "grad_norm": 4.2073798179626465, + "learning_rate": 1.4712343257443085e-05, + "loss": 1.1342, + "num_input_tokens_seen": 196086256, + "step": 12187 + }, + { + "epoch": 0.8537480189480505, + "grad_norm": 5.0398478507995605, + "learning_rate": 1.4705345008756563e-05, + "loss": 1.1443, + "num_input_tokens_seen": 196102288, + "step": 12188 + }, + { + "epoch": 0.8538180671937797, + "grad_norm": 4.346383094787598, + "learning_rate": 1.4698346760070054e-05, + "loss": 1.3081, + "num_input_tokens_seen": 196118672, + "step": 12189 + }, + { + "epoch": 0.8538881154395089, + "grad_norm": 4.210121154785156, + "learning_rate": 1.4691348511383544e-05, + "loss": 1.0923, + "num_input_tokens_seen": 196134384, + "step": 12190 + }, + { + "epoch": 0.8539581636852382, + "grad_norm": 3.8330979347229004, + "learning_rate": 1.4684350262697022e-05, + "loss": 1.1481, + "num_input_tokens_seen": 196150360, + "step": 12191 + }, + { + "epoch": 0.8540282119309675, + "grad_norm": 6.525990009307861, + "learning_rate": 1.4677352014010513e-05, + "loss": 0.9305, + "num_input_tokens_seen": 196165344, + "step": 12192 + }, + { + "epoch": 0.8540982601766967, + "grad_norm": 4.303135871887207, + "learning_rate": 1.4670353765323988e-05, + "loss": 1.1595, + "num_input_tokens_seen": 196179872, + "step": 12193 + }, + { + "epoch": 0.854168308422426, + "grad_norm": 4.225890636444092, + "learning_rate": 1.4663355516637481e-05, + "loss": 1.1641, + "num_input_tokens_seen": 196195472, + "step": 12194 + }, + { + "epoch": 0.8542383566681552, + "grad_norm": 3.7457046508789062, + "learning_rate": 1.4656357267950968e-05, + "loss": 1.1002, + "num_input_tokens_seen": 196211768, + "step": 12195 + }, + { + "epoch": 0.8543084049138845, + "grad_norm": 3.3433685302734375, + "learning_rate": 1.4649359019264447e-05, + "loss": 0.8146, + "num_input_tokens_seen": 196228152, + "step": 12196 + }, + { + "epoch": 0.8543784531596137, + "grad_norm": 4.311893463134766, + "learning_rate": 1.4642360770577937e-05, + "loss": 1.0603, + "num_input_tokens_seen": 196244536, + "step": 12197 + }, + { + "epoch": 0.8544485014053429, + "grad_norm": 4.185448169708252, + "learning_rate": 1.4635362521891415e-05, + "loss": 1.0644, + "num_input_tokens_seen": 196260880, + "step": 12198 + }, + { + "epoch": 0.8545185496510722, + "grad_norm": 3.5842652320861816, + "learning_rate": 1.4628364273204906e-05, + "loss": 0.8133, + "num_input_tokens_seen": 196276176, + "step": 12199 + }, + { + "epoch": 0.8545885978968014, + "grad_norm": 4.0277533531188965, + "learning_rate": 1.4621366024518393e-05, + "loss": 0.863, + "num_input_tokens_seen": 196291976, + "step": 12200 + }, + { + "epoch": 0.8545885978968014, + "eval_loss": 1.1167926788330078, + "eval_runtime": 0.1913, + "eval_samples_per_second": 5.228, + "eval_steps_per_second": 5.228, + "num_input_tokens_seen": 196291976, + "step": 12200 + }, + { + "epoch": 0.8546586461425306, + "grad_norm": 4.720709323883057, + "learning_rate": 1.4614367775831873e-05, + "loss": 1.0326, + "num_input_tokens_seen": 196308360, + "step": 12201 + }, + { + "epoch": 0.8547286943882599, + "grad_norm": 3.761216163635254, + "learning_rate": 1.4607369527145361e-05, + "loss": 1.0746, + "num_input_tokens_seen": 196324408, + "step": 12202 + }, + { + "epoch": 0.8547987426339891, + "grad_norm": 4.258065223693848, + "learning_rate": 1.4600371278458841e-05, + "loss": 1.2327, + "num_input_tokens_seen": 196340792, + "step": 12203 + }, + { + "epoch": 0.8548687908797185, + "grad_norm": 4.706881046295166, + "learning_rate": 1.4593373029772332e-05, + "loss": 1.0301, + "num_input_tokens_seen": 196357176, + "step": 12204 + }, + { + "epoch": 0.8549388391254477, + "grad_norm": 3.9805383682250977, + "learning_rate": 1.458637478108582e-05, + "loss": 0.9925, + "num_input_tokens_seen": 196372832, + "step": 12205 + }, + { + "epoch": 0.8550088873711769, + "grad_norm": 4.8118462562561035, + "learning_rate": 1.45793765323993e-05, + "loss": 0.9806, + "num_input_tokens_seen": 196389216, + "step": 12206 + }, + { + "epoch": 0.8550789356169062, + "grad_norm": 4.129264831542969, + "learning_rate": 1.457237828371279e-05, + "loss": 0.8616, + "num_input_tokens_seen": 196405600, + "step": 12207 + }, + { + "epoch": 0.8551489838626354, + "grad_norm": 3.761479377746582, + "learning_rate": 1.4565380035026269e-05, + "loss": 1.0943, + "num_input_tokens_seen": 196421984, + "step": 12208 + }, + { + "epoch": 0.8552190321083646, + "grad_norm": 3.5886919498443604, + "learning_rate": 1.455838178633976e-05, + "loss": 1.0591, + "num_input_tokens_seen": 196438368, + "step": 12209 + }, + { + "epoch": 0.8552890803540939, + "grad_norm": 3.925198554992676, + "learning_rate": 1.4551383537653234e-05, + "loss": 1.0782, + "num_input_tokens_seen": 196453448, + "step": 12210 + }, + { + "epoch": 0.8553591285998231, + "grad_norm": 4.267056941986084, + "learning_rate": 1.4544385288966725e-05, + "loss": 1.0866, + "num_input_tokens_seen": 196469648, + "step": 12211 + }, + { + "epoch": 0.8554291768455524, + "grad_norm": 4.8296613693237305, + "learning_rate": 1.4537387040280215e-05, + "loss": 1.3747, + "num_input_tokens_seen": 196485784, + "step": 12212 + }, + { + "epoch": 0.8554992250912816, + "grad_norm": 3.6368753910064697, + "learning_rate": 1.4530388791593693e-05, + "loss": 0.9684, + "num_input_tokens_seen": 196501448, + "step": 12213 + }, + { + "epoch": 0.8555692733370108, + "grad_norm": 3.8658010959625244, + "learning_rate": 1.4523390542907184e-05, + "loss": 1.0927, + "num_input_tokens_seen": 196517832, + "step": 12214 + }, + { + "epoch": 0.8556393215827401, + "grad_norm": 5.381229400634766, + "learning_rate": 1.4516392294220662e-05, + "loss": 1.0376, + "num_input_tokens_seen": 196533632, + "step": 12215 + }, + { + "epoch": 0.8557093698284693, + "grad_norm": 4.44445276260376, + "learning_rate": 1.450939404553415e-05, + "loss": 1.1013, + "num_input_tokens_seen": 196550016, + "step": 12216 + }, + { + "epoch": 0.8557794180741986, + "grad_norm": 4.765165328979492, + "learning_rate": 1.450239579684764e-05, + "loss": 1.0449, + "num_input_tokens_seen": 196563984, + "step": 12217 + }, + { + "epoch": 0.8558494663199279, + "grad_norm": 3.8898394107818604, + "learning_rate": 1.449539754816112e-05, + "loss": 1.223, + "num_input_tokens_seen": 196580368, + "step": 12218 + }, + { + "epoch": 0.8559195145656571, + "grad_norm": 3.7699127197265625, + "learning_rate": 1.4488399299474608e-05, + "loss": 1.0103, + "num_input_tokens_seen": 196596752, + "step": 12219 + }, + { + "epoch": 0.8559895628113864, + "grad_norm": 3.8447883129119873, + "learning_rate": 1.4481401050788088e-05, + "loss": 1.0769, + "num_input_tokens_seen": 196612232, + "step": 12220 + }, + { + "epoch": 0.8560596110571156, + "grad_norm": 4.8102312088012695, + "learning_rate": 1.4474402802101578e-05, + "loss": 0.9685, + "num_input_tokens_seen": 196628616, + "step": 12221 + }, + { + "epoch": 0.8561296593028448, + "grad_norm": 4.415116786956787, + "learning_rate": 1.4467404553415067e-05, + "loss": 0.8643, + "num_input_tokens_seen": 196644352, + "step": 12222 + }, + { + "epoch": 0.8561997075485741, + "grad_norm": 4.085516452789307, + "learning_rate": 1.4460406304728547e-05, + "loss": 1.0171, + "num_input_tokens_seen": 196660608, + "step": 12223 + }, + { + "epoch": 0.8562697557943033, + "grad_norm": 4.242407321929932, + "learning_rate": 1.4453408056042036e-05, + "loss": 0.9662, + "num_input_tokens_seen": 196676608, + "step": 12224 + }, + { + "epoch": 0.8563398040400326, + "grad_norm": 3.9178318977355957, + "learning_rate": 1.4446409807355516e-05, + "loss": 0.8499, + "num_input_tokens_seen": 196692968, + "step": 12225 + }, + { + "epoch": 0.8564098522857618, + "grad_norm": 3.7654545307159424, + "learning_rate": 1.4439411558669003e-05, + "loss": 1.0207, + "num_input_tokens_seen": 196709352, + "step": 12226 + }, + { + "epoch": 0.856479900531491, + "grad_norm": 4.777754306793213, + "learning_rate": 1.4432413309982493e-05, + "loss": 0.9543, + "num_input_tokens_seen": 196725456, + "step": 12227 + }, + { + "epoch": 0.8565499487772203, + "grad_norm": 4.113293647766113, + "learning_rate": 1.4425415061295971e-05, + "loss": 1.0144, + "num_input_tokens_seen": 196741840, + "step": 12228 + }, + { + "epoch": 0.8566199970229496, + "grad_norm": 4.161285400390625, + "learning_rate": 1.4418416812609462e-05, + "loss": 1.0435, + "num_input_tokens_seen": 196758224, + "step": 12229 + }, + { + "epoch": 0.8566900452686788, + "grad_norm": 6.041318893432617, + "learning_rate": 1.441141856392294e-05, + "loss": 1.1259, + "num_input_tokens_seen": 196773688, + "step": 12230 + }, + { + "epoch": 0.8567600935144081, + "grad_norm": 3.5007405281066895, + "learning_rate": 1.4404420315236427e-05, + "loss": 0.9697, + "num_input_tokens_seen": 196789696, + "step": 12231 + }, + { + "epoch": 0.8568301417601373, + "grad_norm": 5.033860683441162, + "learning_rate": 1.4397422066549917e-05, + "loss": 1.048, + "num_input_tokens_seen": 196806080, + "step": 12232 + }, + { + "epoch": 0.8569001900058666, + "grad_norm": 5.296143531799316, + "learning_rate": 1.4390423817863396e-05, + "loss": 1.0841, + "num_input_tokens_seen": 196822464, + "step": 12233 + }, + { + "epoch": 0.8569702382515958, + "grad_norm": 4.199211120605469, + "learning_rate": 1.4383425569176886e-05, + "loss": 1.0657, + "num_input_tokens_seen": 196838848, + "step": 12234 + }, + { + "epoch": 0.857040286497325, + "grad_norm": 3.5562124252319336, + "learning_rate": 1.4376427320490366e-05, + "loss": 0.9934, + "num_input_tokens_seen": 196854712, + "step": 12235 + }, + { + "epoch": 0.8571103347430543, + "grad_norm": 3.508512258529663, + "learning_rate": 1.4369429071803855e-05, + "loss": 0.9408, + "num_input_tokens_seen": 196870480, + "step": 12236 + }, + { + "epoch": 0.8571803829887835, + "grad_norm": 3.8756065368652344, + "learning_rate": 1.4362430823117335e-05, + "loss": 1.0775, + "num_input_tokens_seen": 196886448, + "step": 12237 + }, + { + "epoch": 0.8572504312345127, + "grad_norm": 4.070326805114746, + "learning_rate": 1.4355432574430825e-05, + "loss": 1.0409, + "num_input_tokens_seen": 196902832, + "step": 12238 + }, + { + "epoch": 0.857320479480242, + "grad_norm": 3.3013839721679688, + "learning_rate": 1.4348434325744314e-05, + "loss": 0.8603, + "num_input_tokens_seen": 196918592, + "step": 12239 + }, + { + "epoch": 0.8573905277259712, + "grad_norm": 3.5716793537139893, + "learning_rate": 1.4341436077057794e-05, + "loss": 1.0718, + "num_input_tokens_seen": 196934800, + "step": 12240 + }, + { + "epoch": 0.8574605759717006, + "grad_norm": 4.033993244171143, + "learning_rate": 1.433443782837128e-05, + "loss": 0.8768, + "num_input_tokens_seen": 196951184, + "step": 12241 + }, + { + "epoch": 0.8575306242174298, + "grad_norm": 4.118512153625488, + "learning_rate": 1.4327439579684759e-05, + "loss": 0.9593, + "num_input_tokens_seen": 196966952, + "step": 12242 + }, + { + "epoch": 0.857600672463159, + "grad_norm": 3.3278896808624268, + "learning_rate": 1.432044133099825e-05, + "loss": 0.9791, + "num_input_tokens_seen": 196983336, + "step": 12243 + }, + { + "epoch": 0.8576707207088883, + "grad_norm": 5.154607772827148, + "learning_rate": 1.431344308231174e-05, + "loss": 0.9441, + "num_input_tokens_seen": 196999688, + "step": 12244 + }, + { + "epoch": 0.8577407689546175, + "grad_norm": 3.7285592555999756, + "learning_rate": 1.4306444833625218e-05, + "loss": 0.9681, + "num_input_tokens_seen": 197015384, + "step": 12245 + }, + { + "epoch": 0.8578108172003467, + "grad_norm": 3.716254472732544, + "learning_rate": 1.4299446584938705e-05, + "loss": 1.0063, + "num_input_tokens_seen": 197031768, + "step": 12246 + }, + { + "epoch": 0.857880865446076, + "grad_norm": 3.9763600826263428, + "learning_rate": 1.4292448336252185e-05, + "loss": 0.9336, + "num_input_tokens_seen": 197048152, + "step": 12247 + }, + { + "epoch": 0.8579509136918052, + "grad_norm": 3.3923416137695312, + "learning_rate": 1.4285450087565674e-05, + "loss": 0.7177, + "num_input_tokens_seen": 197064536, + "step": 12248 + }, + { + "epoch": 0.8580209619375345, + "grad_norm": 4.150362968444824, + "learning_rate": 1.4278451838879164e-05, + "loss": 1.0859, + "num_input_tokens_seen": 197079776, + "step": 12249 + }, + { + "epoch": 0.8580910101832637, + "grad_norm": 4.753663063049316, + "learning_rate": 1.4271453590192642e-05, + "loss": 1.1584, + "num_input_tokens_seen": 197096160, + "step": 12250 + }, + { + "epoch": 0.8581610584289929, + "grad_norm": 5.312755584716797, + "learning_rate": 1.4264455341506133e-05, + "loss": 1.1578, + "num_input_tokens_seen": 197111712, + "step": 12251 + }, + { + "epoch": 0.8582311066747222, + "grad_norm": 4.376021862030029, + "learning_rate": 1.4257457092819613e-05, + "loss": 0.9497, + "num_input_tokens_seen": 197128096, + "step": 12252 + }, + { + "epoch": 0.8583011549204514, + "grad_norm": 3.71584415435791, + "learning_rate": 1.4250458844133101e-05, + "loss": 1.0863, + "num_input_tokens_seen": 197144248, + "step": 12253 + }, + { + "epoch": 0.8583712031661807, + "grad_norm": 4.142114639282227, + "learning_rate": 1.4243460595446592e-05, + "loss": 0.9962, + "num_input_tokens_seen": 197160632, + "step": 12254 + }, + { + "epoch": 0.85844125141191, + "grad_norm": 4.162306785583496, + "learning_rate": 1.4236462346760072e-05, + "loss": 1.1986, + "num_input_tokens_seen": 197176864, + "step": 12255 + }, + { + "epoch": 0.8585112996576392, + "grad_norm": 3.3798508644104004, + "learning_rate": 1.4229464098073559e-05, + "loss": 1.0166, + "num_input_tokens_seen": 197193184, + "step": 12256 + }, + { + "epoch": 0.8585813479033685, + "grad_norm": 4.5630106925964355, + "learning_rate": 1.4222465849387037e-05, + "loss": 0.8932, + "num_input_tokens_seen": 197209160, + "step": 12257 + }, + { + "epoch": 0.8586513961490977, + "grad_norm": 3.7401790618896484, + "learning_rate": 1.4215467600700527e-05, + "loss": 1.0438, + "num_input_tokens_seen": 197225544, + "step": 12258 + }, + { + "epoch": 0.8587214443948269, + "grad_norm": 3.7096500396728516, + "learning_rate": 1.4208469352014018e-05, + "loss": 1.0781, + "num_input_tokens_seen": 197241800, + "step": 12259 + }, + { + "epoch": 0.8587914926405562, + "grad_norm": 4.454107761383057, + "learning_rate": 1.4201471103327496e-05, + "loss": 1.2278, + "num_input_tokens_seen": 197258184, + "step": 12260 + }, + { + "epoch": 0.8588615408862854, + "grad_norm": 4.291355609893799, + "learning_rate": 1.4194472854640986e-05, + "loss": 1.0768, + "num_input_tokens_seen": 197274568, + "step": 12261 + }, + { + "epoch": 0.8589315891320147, + "grad_norm": 4.1522088050842285, + "learning_rate": 1.4187474605954462e-05, + "loss": 1.1596, + "num_input_tokens_seen": 197290760, + "step": 12262 + }, + { + "epoch": 0.8590016373777439, + "grad_norm": 3.620926856994629, + "learning_rate": 1.4180476357267952e-05, + "loss": 1.0191, + "num_input_tokens_seen": 197307040, + "step": 12263 + }, + { + "epoch": 0.8590716856234731, + "grad_norm": 4.048181056976318, + "learning_rate": 1.4173478108581432e-05, + "loss": 1.1166, + "num_input_tokens_seen": 197323248, + "step": 12264 + }, + { + "epoch": 0.8591417338692025, + "grad_norm": 3.457897424697876, + "learning_rate": 1.416647985989492e-05, + "loss": 0.911, + "num_input_tokens_seen": 197339632, + "step": 12265 + }, + { + "epoch": 0.8592117821149317, + "grad_norm": 3.8750553131103516, + "learning_rate": 1.415948161120841e-05, + "loss": 1.1613, + "num_input_tokens_seen": 197356016, + "step": 12266 + }, + { + "epoch": 0.8592818303606609, + "grad_norm": 3.508734941482544, + "learning_rate": 1.415248336252189e-05, + "loss": 0.9422, + "num_input_tokens_seen": 197372400, + "step": 12267 + }, + { + "epoch": 0.8593518786063902, + "grad_norm": 4.101958751678467, + "learning_rate": 1.414548511383538e-05, + "loss": 0.9687, + "num_input_tokens_seen": 197388072, + "step": 12268 + }, + { + "epoch": 0.8594219268521194, + "grad_norm": 4.295889377593994, + "learning_rate": 1.413848686514886e-05, + "loss": 0.9989, + "num_input_tokens_seen": 197403688, + "step": 12269 + }, + { + "epoch": 0.8594919750978487, + "grad_norm": 5.125593662261963, + "learning_rate": 1.4131488616462348e-05, + "loss": 0.99, + "num_input_tokens_seen": 197419984, + "step": 12270 + }, + { + "epoch": 0.8595620233435779, + "grad_norm": 3.786207675933838, + "learning_rate": 1.4124490367775837e-05, + "loss": 1.0454, + "num_input_tokens_seen": 197435664, + "step": 12271 + }, + { + "epoch": 0.8596320715893071, + "grad_norm": 4.563023090362549, + "learning_rate": 1.4117492119089315e-05, + "loss": 1.1864, + "num_input_tokens_seen": 197452048, + "step": 12272 + }, + { + "epoch": 0.8597021198350364, + "grad_norm": 3.580601453781128, + "learning_rate": 1.4110493870402806e-05, + "loss": 1.0675, + "num_input_tokens_seen": 197468432, + "step": 12273 + }, + { + "epoch": 0.8597721680807656, + "grad_norm": 4.257338047027588, + "learning_rate": 1.4103495621716284e-05, + "loss": 1.0252, + "num_input_tokens_seen": 197484816, + "step": 12274 + }, + { + "epoch": 0.8598422163264948, + "grad_norm": 4.438417434692383, + "learning_rate": 1.4096497373029774e-05, + "loss": 1.0728, + "num_input_tokens_seen": 197501200, + "step": 12275 + }, + { + "epoch": 0.8599122645722241, + "grad_norm": 3.836141347885132, + "learning_rate": 1.4089499124343264e-05, + "loss": 1.1154, + "num_input_tokens_seen": 197517584, + "step": 12276 + }, + { + "epoch": 0.8599823128179533, + "grad_norm": 4.739768981933594, + "learning_rate": 1.408250087565674e-05, + "loss": 1.1165, + "num_input_tokens_seen": 197533232, + "step": 12277 + }, + { + "epoch": 0.8600523610636827, + "grad_norm": 5.457389831542969, + "learning_rate": 1.4075502626970233e-05, + "loss": 1.1621, + "num_input_tokens_seen": 197548496, + "step": 12278 + }, + { + "epoch": 0.8601224093094119, + "grad_norm": 3.6782705783843994, + "learning_rate": 1.4068504378283708e-05, + "loss": 1.0648, + "num_input_tokens_seen": 197564880, + "step": 12279 + }, + { + "epoch": 0.8601924575551411, + "grad_norm": 3.7545571327209473, + "learning_rate": 1.4061506129597199e-05, + "loss": 0.9853, + "num_input_tokens_seen": 197581264, + "step": 12280 + }, + { + "epoch": 0.8602625058008704, + "grad_norm": 3.8391709327697754, + "learning_rate": 1.4054507880910689e-05, + "loss": 1.136, + "num_input_tokens_seen": 197597624, + "step": 12281 + }, + { + "epoch": 0.8603325540465996, + "grad_norm": 3.721649646759033, + "learning_rate": 1.4047509632224167e-05, + "loss": 1.0438, + "num_input_tokens_seen": 197613896, + "step": 12282 + }, + { + "epoch": 0.8604026022923288, + "grad_norm": 4.638535976409912, + "learning_rate": 1.4040511383537658e-05, + "loss": 1.3678, + "num_input_tokens_seen": 197629968, + "step": 12283 + }, + { + "epoch": 0.8604726505380581, + "grad_norm": 3.6670308113098145, + "learning_rate": 1.4033513134851136e-05, + "loss": 0.9941, + "num_input_tokens_seen": 197645848, + "step": 12284 + }, + { + "epoch": 0.8605426987837873, + "grad_norm": 4.284358501434326, + "learning_rate": 1.4026514886164626e-05, + "loss": 0.9786, + "num_input_tokens_seen": 197662232, + "step": 12285 + }, + { + "epoch": 0.8606127470295166, + "grad_norm": 3.6369667053222656, + "learning_rate": 1.4019516637478113e-05, + "loss": 1.0523, + "num_input_tokens_seen": 197678616, + "step": 12286 + }, + { + "epoch": 0.8606827952752458, + "grad_norm": 4.541594505310059, + "learning_rate": 1.4012518388791593e-05, + "loss": 1.1341, + "num_input_tokens_seen": 197695000, + "step": 12287 + }, + { + "epoch": 0.860752843520975, + "grad_norm": 5.799292087554932, + "learning_rate": 1.4005520140105084e-05, + "loss": 0.9359, + "num_input_tokens_seen": 197711320, + "step": 12288 + }, + { + "epoch": 0.8608228917667043, + "grad_norm": 5.724205017089844, + "learning_rate": 1.3998521891418562e-05, + "loss": 1.1552, + "num_input_tokens_seen": 197726944, + "step": 12289 + }, + { + "epoch": 0.8608929400124335, + "grad_norm": 4.192360877990723, + "learning_rate": 1.3991523642732052e-05, + "loss": 0.9338, + "num_input_tokens_seen": 197742680, + "step": 12290 + }, + { + "epoch": 0.8609629882581629, + "grad_norm": 4.258605480194092, + "learning_rate": 1.398452539404553e-05, + "loss": 1.0385, + "num_input_tokens_seen": 197759064, + "step": 12291 + }, + { + "epoch": 0.8610330365038921, + "grad_norm": 3.6070141792297363, + "learning_rate": 1.3977527145359021e-05, + "loss": 1.0332, + "num_input_tokens_seen": 197775096, + "step": 12292 + }, + { + "epoch": 0.8611030847496213, + "grad_norm": 3.695129871368408, + "learning_rate": 1.3970528896672511e-05, + "loss": 0.9522, + "num_input_tokens_seen": 197791400, + "step": 12293 + }, + { + "epoch": 0.8611731329953506, + "grad_norm": 3.917037010192871, + "learning_rate": 1.3963530647985986e-05, + "loss": 1.1603, + "num_input_tokens_seen": 197807144, + "step": 12294 + }, + { + "epoch": 0.8612431812410798, + "grad_norm": 4.174408912658691, + "learning_rate": 1.395653239929948e-05, + "loss": 1.0011, + "num_input_tokens_seen": 197822752, + "step": 12295 + }, + { + "epoch": 0.861313229486809, + "grad_norm": 3.692047595977783, + "learning_rate": 1.3949534150612955e-05, + "loss": 1.0239, + "num_input_tokens_seen": 197838224, + "step": 12296 + }, + { + "epoch": 0.8613832777325383, + "grad_norm": 3.6707775592803955, + "learning_rate": 1.3942535901926445e-05, + "loss": 0.9931, + "num_input_tokens_seen": 197854328, + "step": 12297 + }, + { + "epoch": 0.8614533259782675, + "grad_norm": 3.528449058532715, + "learning_rate": 1.3935537653239936e-05, + "loss": 1.119, + "num_input_tokens_seen": 197869976, + "step": 12298 + }, + { + "epoch": 0.8615233742239968, + "grad_norm": 3.622828722000122, + "learning_rate": 1.3928539404553414e-05, + "loss": 0.9369, + "num_input_tokens_seen": 197885528, + "step": 12299 + }, + { + "epoch": 0.861593422469726, + "grad_norm": 4.029307842254639, + "learning_rate": 1.3921541155866903e-05, + "loss": 1.029, + "num_input_tokens_seen": 197901912, + "step": 12300 + }, + { + "epoch": 0.8616634707154552, + "grad_norm": 4.297327041625977, + "learning_rate": 1.3914542907180383e-05, + "loss": 0.9805, + "num_input_tokens_seen": 197918296, + "step": 12301 + }, + { + "epoch": 0.8617335189611846, + "grad_norm": 4.278064727783203, + "learning_rate": 1.3907544658493871e-05, + "loss": 1.0332, + "num_input_tokens_seen": 197934400, + "step": 12302 + }, + { + "epoch": 0.8618035672069138, + "grad_norm": 3.7779815196990967, + "learning_rate": 1.390054640980736e-05, + "loss": 0.9879, + "num_input_tokens_seen": 197949824, + "step": 12303 + }, + { + "epoch": 0.861873615452643, + "grad_norm": 4.907235622406006, + "learning_rate": 1.389354816112084e-05, + "loss": 1.1843, + "num_input_tokens_seen": 197966208, + "step": 12304 + }, + { + "epoch": 0.8619436636983723, + "grad_norm": 4.1908278465271, + "learning_rate": 1.388654991243433e-05, + "loss": 1.1141, + "num_input_tokens_seen": 197982592, + "step": 12305 + }, + { + "epoch": 0.8620137119441015, + "grad_norm": 4.79370641708374, + "learning_rate": 1.3879551663747809e-05, + "loss": 1.2971, + "num_input_tokens_seen": 197998328, + "step": 12306 + }, + { + "epoch": 0.8620837601898308, + "grad_norm": 5.116518974304199, + "learning_rate": 1.3872553415061299e-05, + "loss": 1.0723, + "num_input_tokens_seen": 198014624, + "step": 12307 + }, + { + "epoch": 0.86215380843556, + "grad_norm": 3.495544910430908, + "learning_rate": 1.386555516637479e-05, + "loss": 1.0378, + "num_input_tokens_seen": 198031008, + "step": 12308 + }, + { + "epoch": 0.8622238566812892, + "grad_norm": 3.6589462757110596, + "learning_rate": 1.3858556917688268e-05, + "loss": 0.9857, + "num_input_tokens_seen": 198046792, + "step": 12309 + }, + { + "epoch": 0.8622939049270185, + "grad_norm": 3.648871421813965, + "learning_rate": 1.3851558669001755e-05, + "loss": 1.0253, + "num_input_tokens_seen": 198063048, + "step": 12310 + }, + { + "epoch": 0.8623639531727477, + "grad_norm": 3.626164436340332, + "learning_rate": 1.3844560420315233e-05, + "loss": 1.0044, + "num_input_tokens_seen": 198079200, + "step": 12311 + }, + { + "epoch": 0.8624340014184769, + "grad_norm": 3.667262077331543, + "learning_rate": 1.3837562171628723e-05, + "loss": 0.899, + "num_input_tokens_seen": 198095584, + "step": 12312 + }, + { + "epoch": 0.8625040496642062, + "grad_norm": 3.907071828842163, + "learning_rate": 1.3830563922942214e-05, + "loss": 0.785, + "num_input_tokens_seen": 198110896, + "step": 12313 + }, + { + "epoch": 0.8625740979099354, + "grad_norm": 4.752641677856445, + "learning_rate": 1.3823565674255692e-05, + "loss": 1.1288, + "num_input_tokens_seen": 198127280, + "step": 12314 + }, + { + "epoch": 0.8626441461556648, + "grad_norm": 4.4694318771362305, + "learning_rate": 1.3816567425569179e-05, + "loss": 1.14, + "num_input_tokens_seen": 198143512, + "step": 12315 + }, + { + "epoch": 0.862714194401394, + "grad_norm": 3.6671674251556396, + "learning_rate": 1.380956917688266e-05, + "loss": 0.9471, + "num_input_tokens_seen": 198159896, + "step": 12316 + }, + { + "epoch": 0.8627842426471232, + "grad_norm": 3.3959665298461914, + "learning_rate": 1.380257092819615e-05, + "loss": 0.9142, + "num_input_tokens_seen": 198176280, + "step": 12317 + }, + { + "epoch": 0.8628542908928525, + "grad_norm": 3.6770408153533936, + "learning_rate": 1.3795572679509628e-05, + "loss": 0.9213, + "num_input_tokens_seen": 198192664, + "step": 12318 + }, + { + "epoch": 0.8629243391385817, + "grad_norm": 4.6343536376953125, + "learning_rate": 1.3788574430823118e-05, + "loss": 1.0726, + "num_input_tokens_seen": 198207496, + "step": 12319 + }, + { + "epoch": 0.8629943873843109, + "grad_norm": 4.799023628234863, + "learning_rate": 1.3781576182136607e-05, + "loss": 1.2065, + "num_input_tokens_seen": 198222440, + "step": 12320 + }, + { + "epoch": 0.8630644356300402, + "grad_norm": 3.789288282394409, + "learning_rate": 1.3774577933450087e-05, + "loss": 0.9994, + "num_input_tokens_seen": 198238824, + "step": 12321 + }, + { + "epoch": 0.8631344838757694, + "grad_norm": 3.7651724815368652, + "learning_rate": 1.3767579684763577e-05, + "loss": 0.869, + "num_input_tokens_seen": 198255008, + "step": 12322 + }, + { + "epoch": 0.8632045321214987, + "grad_norm": 4.822908401489258, + "learning_rate": 1.3760581436077052e-05, + "loss": 1.2589, + "num_input_tokens_seen": 198271392, + "step": 12323 + }, + { + "epoch": 0.8632745803672279, + "grad_norm": 4.3799943923950195, + "learning_rate": 1.3753583187390546e-05, + "loss": 1.2876, + "num_input_tokens_seen": 198287776, + "step": 12324 + }, + { + "epoch": 0.8633446286129571, + "grad_norm": 4.067901611328125, + "learning_rate": 1.3746584938704033e-05, + "loss": 1.3378, + "num_input_tokens_seen": 198303960, + "step": 12325 + }, + { + "epoch": 0.8634146768586864, + "grad_norm": 3.575632333755493, + "learning_rate": 1.3739586690017511e-05, + "loss": 0.9544, + "num_input_tokens_seen": 198320344, + "step": 12326 + }, + { + "epoch": 0.8634847251044157, + "grad_norm": 3.743607997894287, + "learning_rate": 1.3732588441331001e-05, + "loss": 1.0506, + "num_input_tokens_seen": 198336728, + "step": 12327 + }, + { + "epoch": 0.863554773350145, + "grad_norm": 3.822737455368042, + "learning_rate": 1.372559019264448e-05, + "loss": 1.0843, + "num_input_tokens_seen": 198353112, + "step": 12328 + }, + { + "epoch": 0.8636248215958742, + "grad_norm": 4.078860282897949, + "learning_rate": 1.371859194395797e-05, + "loss": 1.1691, + "num_input_tokens_seen": 198369496, + "step": 12329 + }, + { + "epoch": 0.8636948698416034, + "grad_norm": 4.305440902709961, + "learning_rate": 1.3711593695271457e-05, + "loss": 1.0685, + "num_input_tokens_seen": 198385608, + "step": 12330 + }, + { + "epoch": 0.8637649180873327, + "grad_norm": 4.594203472137451, + "learning_rate": 1.3704595446584937e-05, + "loss": 0.9671, + "num_input_tokens_seen": 198401008, + "step": 12331 + }, + { + "epoch": 0.8638349663330619, + "grad_norm": 4.861415386199951, + "learning_rate": 1.3697597197898426e-05, + "loss": 1.0387, + "num_input_tokens_seen": 198417392, + "step": 12332 + }, + { + "epoch": 0.8639050145787911, + "grad_norm": 4.010055065155029, + "learning_rate": 1.3690598949211906e-05, + "loss": 1.2078, + "num_input_tokens_seen": 198433776, + "step": 12333 + }, + { + "epoch": 0.8639750628245204, + "grad_norm": 4.298172950744629, + "learning_rate": 1.3683600700525396e-05, + "loss": 1.1976, + "num_input_tokens_seen": 198450160, + "step": 12334 + }, + { + "epoch": 0.8640451110702496, + "grad_norm": 3.922172784805298, + "learning_rate": 1.3676602451838885e-05, + "loss": 0.9933, + "num_input_tokens_seen": 198466288, + "step": 12335 + }, + { + "epoch": 0.8641151593159789, + "grad_norm": 3.7139413356781006, + "learning_rate": 1.3669604203152365e-05, + "loss": 1.0066, + "num_input_tokens_seen": 198482672, + "step": 12336 + }, + { + "epoch": 0.8641852075617081, + "grad_norm": 4.424309253692627, + "learning_rate": 1.3662605954465853e-05, + "loss": 1.0156, + "num_input_tokens_seen": 198498648, + "step": 12337 + }, + { + "epoch": 0.8642552558074373, + "grad_norm": 3.3239121437072754, + "learning_rate": 1.3655607705779333e-05, + "loss": 0.9029, + "num_input_tokens_seen": 198515032, + "step": 12338 + }, + { + "epoch": 0.8643253040531667, + "grad_norm": 3.6236252784729004, + "learning_rate": 1.3648609457092824e-05, + "loss": 0.9104, + "num_input_tokens_seen": 198531416, + "step": 12339 + }, + { + "epoch": 0.8643953522988959, + "grad_norm": 5.288454532623291, + "learning_rate": 1.364161120840631e-05, + "loss": 1.1697, + "num_input_tokens_seen": 198547800, + "step": 12340 + }, + { + "epoch": 0.8644654005446251, + "grad_norm": 4.480827808380127, + "learning_rate": 1.3634612959719789e-05, + "loss": 1.129, + "num_input_tokens_seen": 198563680, + "step": 12341 + }, + { + "epoch": 0.8645354487903544, + "grad_norm": 3.7555031776428223, + "learning_rate": 1.362761471103328e-05, + "loss": 0.982, + "num_input_tokens_seen": 198580064, + "step": 12342 + }, + { + "epoch": 0.8646054970360836, + "grad_norm": 4.276052474975586, + "learning_rate": 1.3620616462346758e-05, + "loss": 0.9055, + "num_input_tokens_seen": 198596448, + "step": 12343 + }, + { + "epoch": 0.8646755452818129, + "grad_norm": 3.91904354095459, + "learning_rate": 1.3613618213660248e-05, + "loss": 0.9158, + "num_input_tokens_seen": 198612832, + "step": 12344 + }, + { + "epoch": 0.8647455935275421, + "grad_norm": 4.059247970581055, + "learning_rate": 1.3606619964973727e-05, + "loss": 0.9841, + "num_input_tokens_seen": 198627944, + "step": 12345 + }, + { + "epoch": 0.8648156417732713, + "grad_norm": 3.7444424629211426, + "learning_rate": 1.3599621716287213e-05, + "loss": 1.0201, + "num_input_tokens_seen": 198644328, + "step": 12346 + }, + { + "epoch": 0.8648856900190006, + "grad_norm": 4.7748332023620605, + "learning_rate": 1.3592623467600704e-05, + "loss": 1.1637, + "num_input_tokens_seen": 198660088, + "step": 12347 + }, + { + "epoch": 0.8649557382647298, + "grad_norm": 3.6600682735443115, + "learning_rate": 1.3585625218914184e-05, + "loss": 0.9232, + "num_input_tokens_seen": 198675192, + "step": 12348 + }, + { + "epoch": 0.865025786510459, + "grad_norm": 4.359855651855469, + "learning_rate": 1.3578626970227672e-05, + "loss": 1.097, + "num_input_tokens_seen": 198691296, + "step": 12349 + }, + { + "epoch": 0.8650958347561883, + "grad_norm": 5.317723274230957, + "learning_rate": 1.3571628721541153e-05, + "loss": 1.02, + "num_input_tokens_seen": 198707680, + "step": 12350 + }, + { + "epoch": 0.8651658830019175, + "grad_norm": 4.9107465744018555, + "learning_rate": 1.3564630472854641e-05, + "loss": 1.2071, + "num_input_tokens_seen": 198724008, + "step": 12351 + }, + { + "epoch": 0.8652359312476469, + "grad_norm": 4.282744884490967, + "learning_rate": 1.3557632224168131e-05, + "loss": 0.8882, + "num_input_tokens_seen": 198740392, + "step": 12352 + }, + { + "epoch": 0.8653059794933761, + "grad_norm": 3.816134214401245, + "learning_rate": 1.3550633975481612e-05, + "loss": 0.8893, + "num_input_tokens_seen": 198756528, + "step": 12353 + }, + { + "epoch": 0.8653760277391053, + "grad_norm": 4.813945293426514, + "learning_rate": 1.35436357267951e-05, + "loss": 1.043, + "num_input_tokens_seen": 198771728, + "step": 12354 + }, + { + "epoch": 0.8654460759848346, + "grad_norm": 3.6743712425231934, + "learning_rate": 1.353663747810858e-05, + "loss": 1.0718, + "num_input_tokens_seen": 198788112, + "step": 12355 + }, + { + "epoch": 0.8655161242305638, + "grad_norm": 4.40257453918457, + "learning_rate": 1.3529639229422067e-05, + "loss": 1.1178, + "num_input_tokens_seen": 198804176, + "step": 12356 + }, + { + "epoch": 0.865586172476293, + "grad_norm": 3.5438432693481445, + "learning_rate": 1.3522640980735557e-05, + "loss": 1.1435, + "num_input_tokens_seen": 198820560, + "step": 12357 + }, + { + "epoch": 0.8656562207220223, + "grad_norm": 4.067643642425537, + "learning_rate": 1.3515642732049036e-05, + "loss": 1.1902, + "num_input_tokens_seen": 198836512, + "step": 12358 + }, + { + "epoch": 0.8657262689677515, + "grad_norm": 4.801469802856445, + "learning_rate": 1.3508644483362526e-05, + "loss": 0.9927, + "num_input_tokens_seen": 198852888, + "step": 12359 + }, + { + "epoch": 0.8657963172134808, + "grad_norm": 5.0618743896484375, + "learning_rate": 1.3501646234676005e-05, + "loss": 0.9171, + "num_input_tokens_seen": 198869272, + "step": 12360 + }, + { + "epoch": 0.86586636545921, + "grad_norm": 4.080538749694824, + "learning_rate": 1.3494647985989491e-05, + "loss": 1.1752, + "num_input_tokens_seen": 198885656, + "step": 12361 + }, + { + "epoch": 0.8659364137049392, + "grad_norm": 3.986765146255493, + "learning_rate": 1.3487649737302985e-05, + "loss": 1.0643, + "num_input_tokens_seen": 198902040, + "step": 12362 + }, + { + "epoch": 0.8660064619506685, + "grad_norm": 4.09193754196167, + "learning_rate": 1.348065148861646e-05, + "loss": 0.9475, + "num_input_tokens_seen": 198918424, + "step": 12363 + }, + { + "epoch": 0.8660765101963978, + "grad_norm": 3.819871425628662, + "learning_rate": 1.347365323992995e-05, + "loss": 1.1814, + "num_input_tokens_seen": 198934808, + "step": 12364 + }, + { + "epoch": 0.8661465584421271, + "grad_norm": 3.821725845336914, + "learning_rate": 1.346665499124343e-05, + "loss": 1.1176, + "num_input_tokens_seen": 198951192, + "step": 12365 + }, + { + "epoch": 0.8662166066878563, + "grad_norm": 4.877285480499268, + "learning_rate": 1.3459656742556919e-05, + "loss": 1.2689, + "num_input_tokens_seen": 198966816, + "step": 12366 + }, + { + "epoch": 0.8662866549335855, + "grad_norm": 4.730076789855957, + "learning_rate": 1.345265849387041e-05, + "loss": 1.0963, + "num_input_tokens_seen": 198983200, + "step": 12367 + }, + { + "epoch": 0.8663567031793148, + "grad_norm": 3.612898349761963, + "learning_rate": 1.3445660245183888e-05, + "loss": 0.996, + "num_input_tokens_seen": 198999584, + "step": 12368 + }, + { + "epoch": 0.866426751425044, + "grad_norm": 3.900376081466675, + "learning_rate": 1.3438661996497378e-05, + "loss": 0.9358, + "num_input_tokens_seen": 199015336, + "step": 12369 + }, + { + "epoch": 0.8664967996707732, + "grad_norm": 3.7482104301452637, + "learning_rate": 1.3431663747810858e-05, + "loss": 1.1019, + "num_input_tokens_seen": 199031720, + "step": 12370 + }, + { + "epoch": 0.8665668479165025, + "grad_norm": 4.847263336181641, + "learning_rate": 1.3424665499124345e-05, + "loss": 1.1836, + "num_input_tokens_seen": 199048104, + "step": 12371 + }, + { + "epoch": 0.8666368961622317, + "grad_norm": 4.493433475494385, + "learning_rate": 1.3417667250437824e-05, + "loss": 1.0613, + "num_input_tokens_seen": 199064488, + "step": 12372 + }, + { + "epoch": 0.866706944407961, + "grad_norm": 3.698686361312866, + "learning_rate": 1.3410669001751314e-05, + "loss": 0.7707, + "num_input_tokens_seen": 199079424, + "step": 12373 + }, + { + "epoch": 0.8667769926536902, + "grad_norm": 3.7090280055999756, + "learning_rate": 1.3403670753064804e-05, + "loss": 1.149, + "num_input_tokens_seen": 199095800, + "step": 12374 + }, + { + "epoch": 0.8668470408994194, + "grad_norm": 3.4420876502990723, + "learning_rate": 1.3396672504378283e-05, + "loss": 1.1093, + "num_input_tokens_seen": 199112184, + "step": 12375 + }, + { + "epoch": 0.8669170891451488, + "grad_norm": 3.7763211727142334, + "learning_rate": 1.338967425569177e-05, + "loss": 1.1539, + "num_input_tokens_seen": 199128568, + "step": 12376 + }, + { + "epoch": 0.866987137390878, + "grad_norm": 4.041818618774414, + "learning_rate": 1.3382676007005248e-05, + "loss": 1.0639, + "num_input_tokens_seen": 199144368, + "step": 12377 + }, + { + "epoch": 0.8670571856366072, + "grad_norm": 3.7480762004852295, + "learning_rate": 1.3375677758318738e-05, + "loss": 1.0199, + "num_input_tokens_seen": 199160752, + "step": 12378 + }, + { + "epoch": 0.8671272338823365, + "grad_norm": 4.339356422424316, + "learning_rate": 1.3368679509632232e-05, + "loss": 1.1513, + "num_input_tokens_seen": 199176728, + "step": 12379 + }, + { + "epoch": 0.8671972821280657, + "grad_norm": 6.06949520111084, + "learning_rate": 1.3361681260945707e-05, + "loss": 1.1411, + "num_input_tokens_seen": 199193112, + "step": 12380 + }, + { + "epoch": 0.867267330373795, + "grad_norm": 4.416282653808594, + "learning_rate": 1.3354683012259197e-05, + "loss": 1.0028, + "num_input_tokens_seen": 199209496, + "step": 12381 + }, + { + "epoch": 0.8673373786195242, + "grad_norm": 4.038880825042725, + "learning_rate": 1.3347684763572677e-05, + "loss": 1.0767, + "num_input_tokens_seen": 199225336, + "step": 12382 + }, + { + "epoch": 0.8674074268652534, + "grad_norm": 4.171569347381592, + "learning_rate": 1.3340686514886166e-05, + "loss": 0.9644, + "num_input_tokens_seen": 199240776, + "step": 12383 + }, + { + "epoch": 0.8674774751109827, + "grad_norm": 3.708641529083252, + "learning_rate": 1.3333688266199656e-05, + "loss": 0.9452, + "num_input_tokens_seen": 199257160, + "step": 12384 + }, + { + "epoch": 0.8675475233567119, + "grad_norm": 4.1407904624938965, + "learning_rate": 1.3326690017513135e-05, + "loss": 1.0846, + "num_input_tokens_seen": 199272736, + "step": 12385 + }, + { + "epoch": 0.8676175716024411, + "grad_norm": 3.66418719291687, + "learning_rate": 1.3319691768826623e-05, + "loss": 1.0027, + "num_input_tokens_seen": 199289120, + "step": 12386 + }, + { + "epoch": 0.8676876198481704, + "grad_norm": 4.472699165344238, + "learning_rate": 1.3312693520140102e-05, + "loss": 1.187, + "num_input_tokens_seen": 199305504, + "step": 12387 + }, + { + "epoch": 0.8677576680938996, + "grad_norm": 3.9275479316711426, + "learning_rate": 1.3305695271453592e-05, + "loss": 0.9246, + "num_input_tokens_seen": 199321760, + "step": 12388 + }, + { + "epoch": 0.867827716339629, + "grad_norm": 3.7369916439056396, + "learning_rate": 1.3298697022767082e-05, + "loss": 1.1456, + "num_input_tokens_seen": 199337536, + "step": 12389 + }, + { + "epoch": 0.8678977645853582, + "grad_norm": 5.484729290008545, + "learning_rate": 1.329169877408056e-05, + "loss": 1.2703, + "num_input_tokens_seen": 199352416, + "step": 12390 + }, + { + "epoch": 0.8679678128310874, + "grad_norm": 3.7572593688964844, + "learning_rate": 1.3284700525394051e-05, + "loss": 1.0297, + "num_input_tokens_seen": 199368560, + "step": 12391 + }, + { + "epoch": 0.8680378610768167, + "grad_norm": 3.707432985305786, + "learning_rate": 1.3277702276707526e-05, + "loss": 1.1472, + "num_input_tokens_seen": 199384784, + "step": 12392 + }, + { + "epoch": 0.8681079093225459, + "grad_norm": 4.56617546081543, + "learning_rate": 1.3270704028021016e-05, + "loss": 1.0961, + "num_input_tokens_seen": 199401168, + "step": 12393 + }, + { + "epoch": 0.8681779575682752, + "grad_norm": 3.8135783672332764, + "learning_rate": 1.3263705779334495e-05, + "loss": 1.0872, + "num_input_tokens_seen": 199417552, + "step": 12394 + }, + { + "epoch": 0.8682480058140044, + "grad_norm": 4.204775333404541, + "learning_rate": 1.3256707530647985e-05, + "loss": 1.0113, + "num_input_tokens_seen": 199433936, + "step": 12395 + }, + { + "epoch": 0.8683180540597336, + "grad_norm": 4.007627487182617, + "learning_rate": 1.3249709281961475e-05, + "loss": 1.0393, + "num_input_tokens_seen": 199450320, + "step": 12396 + }, + { + "epoch": 0.8683881023054629, + "grad_norm": 4.929315567016602, + "learning_rate": 1.3242711033274954e-05, + "loss": 1.2357, + "num_input_tokens_seen": 199465824, + "step": 12397 + }, + { + "epoch": 0.8684581505511921, + "grad_norm": 3.1852951049804688, + "learning_rate": 1.3235712784588444e-05, + "loss": 0.8188, + "num_input_tokens_seen": 199482208, + "step": 12398 + }, + { + "epoch": 0.8685281987969213, + "grad_norm": 3.758207321166992, + "learning_rate": 1.3228714535901924e-05, + "loss": 1.0947, + "num_input_tokens_seen": 199498592, + "step": 12399 + }, + { + "epoch": 0.8685982470426507, + "grad_norm": 3.978482961654663, + "learning_rate": 1.3221716287215413e-05, + "loss": 1.1147, + "num_input_tokens_seen": 199514328, + "step": 12400 + }, + { + "epoch": 0.8685982470426507, + "eval_loss": 1.1151013374328613, + "eval_runtime": 0.185, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 5.405, + "num_input_tokens_seen": 199514328, + "step": 12400 + }, + { + "epoch": 0.8686682952883799, + "grad_norm": 3.77974009513855, + "learning_rate": 1.3214718038528901e-05, + "loss": 1.044, + "num_input_tokens_seen": 199530712, + "step": 12401 + }, + { + "epoch": 0.8687383435341092, + "grad_norm": 3.390530824661255, + "learning_rate": 1.320771978984238e-05, + "loss": 1.0135, + "num_input_tokens_seen": 199546704, + "step": 12402 + }, + { + "epoch": 0.8688083917798384, + "grad_norm": 4.058717250823975, + "learning_rate": 1.320072154115587e-05, + "loss": 1.2552, + "num_input_tokens_seen": 199563088, + "step": 12403 + }, + { + "epoch": 0.8688784400255676, + "grad_norm": 3.4471309185028076, + "learning_rate": 1.3193723292469348e-05, + "loss": 0.922, + "num_input_tokens_seen": 199579232, + "step": 12404 + }, + { + "epoch": 0.8689484882712969, + "grad_norm": 4.3581366539001465, + "learning_rate": 1.3186725043782839e-05, + "loss": 1.1629, + "num_input_tokens_seen": 199595000, + "step": 12405 + }, + { + "epoch": 0.8690185365170261, + "grad_norm": 3.396852731704712, + "learning_rate": 1.3179726795096329e-05, + "loss": 1.0485, + "num_input_tokens_seen": 199611384, + "step": 12406 + }, + { + "epoch": 0.8690885847627553, + "grad_norm": 4.192478179931641, + "learning_rate": 1.3172728546409804e-05, + "loss": 1.3121, + "num_input_tokens_seen": 199627768, + "step": 12407 + }, + { + "epoch": 0.8691586330084846, + "grad_norm": 5.933086395263672, + "learning_rate": 1.3165730297723298e-05, + "loss": 1.0763, + "num_input_tokens_seen": 199643448, + "step": 12408 + }, + { + "epoch": 0.8692286812542138, + "grad_norm": 3.439328908920288, + "learning_rate": 1.3158732049036773e-05, + "loss": 0.9905, + "num_input_tokens_seen": 199659832, + "step": 12409 + }, + { + "epoch": 0.8692987294999431, + "grad_norm": 3.4219257831573486, + "learning_rate": 1.3151733800350266e-05, + "loss": 0.9319, + "num_input_tokens_seen": 199676216, + "step": 12410 + }, + { + "epoch": 0.8693687777456723, + "grad_norm": 3.5988125801086426, + "learning_rate": 1.3144735551663753e-05, + "loss": 1.0535, + "num_input_tokens_seen": 199692224, + "step": 12411 + }, + { + "epoch": 0.8694388259914015, + "grad_norm": 4.739302158355713, + "learning_rate": 1.3137737302977232e-05, + "loss": 0.9894, + "num_input_tokens_seen": 199707712, + "step": 12412 + }, + { + "epoch": 0.8695088742371309, + "grad_norm": 4.107836723327637, + "learning_rate": 1.3130739054290722e-05, + "loss": 1.1665, + "num_input_tokens_seen": 199724032, + "step": 12413 + }, + { + "epoch": 0.8695789224828601, + "grad_norm": 4.067647457122803, + "learning_rate": 1.31237408056042e-05, + "loss": 1.1882, + "num_input_tokens_seen": 199739992, + "step": 12414 + }, + { + "epoch": 0.8696489707285893, + "grad_norm": 4.295608043670654, + "learning_rate": 1.311674255691769e-05, + "loss": 1.1792, + "num_input_tokens_seen": 199756168, + "step": 12415 + }, + { + "epoch": 0.8697190189743186, + "grad_norm": 4.761004447937012, + "learning_rate": 1.3109744308231178e-05, + "loss": 0.9088, + "num_input_tokens_seen": 199772552, + "step": 12416 + }, + { + "epoch": 0.8697890672200478, + "grad_norm": 5.052656173706055, + "learning_rate": 1.3102746059544658e-05, + "loss": 0.9194, + "num_input_tokens_seen": 199788936, + "step": 12417 + }, + { + "epoch": 0.8698591154657771, + "grad_norm": 3.763441801071167, + "learning_rate": 1.3095747810858148e-05, + "loss": 0.9436, + "num_input_tokens_seen": 199805320, + "step": 12418 + }, + { + "epoch": 0.8699291637115063, + "grad_norm": 4.034101963043213, + "learning_rate": 1.3088749562171626e-05, + "loss": 1.1828, + "num_input_tokens_seen": 199820624, + "step": 12419 + }, + { + "epoch": 0.8699992119572355, + "grad_norm": 4.4746928215026855, + "learning_rate": 1.3081751313485117e-05, + "loss": 1.0457, + "num_input_tokens_seen": 199835144, + "step": 12420 + }, + { + "epoch": 0.8700692602029648, + "grad_norm": 3.680745840072632, + "learning_rate": 1.3074753064798595e-05, + "loss": 0.99, + "num_input_tokens_seen": 199851528, + "step": 12421 + }, + { + "epoch": 0.870139308448694, + "grad_norm": 4.099174976348877, + "learning_rate": 1.3067754816112085e-05, + "loss": 1.0666, + "num_input_tokens_seen": 199867912, + "step": 12422 + }, + { + "epoch": 0.8702093566944232, + "grad_norm": 4.232349872589111, + "learning_rate": 1.3060756567425576e-05, + "loss": 1.2063, + "num_input_tokens_seen": 199884296, + "step": 12423 + }, + { + "epoch": 0.8702794049401525, + "grad_norm": 4.020661354064941, + "learning_rate": 1.305375831873905e-05, + "loss": 1.1305, + "num_input_tokens_seen": 199900680, + "step": 12424 + }, + { + "epoch": 0.8703494531858817, + "grad_norm": 3.448404550552368, + "learning_rate": 1.3046760070052544e-05, + "loss": 0.831, + "num_input_tokens_seen": 199917064, + "step": 12425 + }, + { + "epoch": 0.8704195014316111, + "grad_norm": 3.580139636993408, + "learning_rate": 1.303976182136602e-05, + "loss": 1.1035, + "num_input_tokens_seen": 199932840, + "step": 12426 + }, + { + "epoch": 0.8704895496773403, + "grad_norm": 3.8546457290649414, + "learning_rate": 1.303276357267951e-05, + "loss": 1.1588, + "num_input_tokens_seen": 199949224, + "step": 12427 + }, + { + "epoch": 0.8705595979230695, + "grad_norm": 3.690006732940674, + "learning_rate": 1.3025765323993e-05, + "loss": 0.9228, + "num_input_tokens_seen": 199964720, + "step": 12428 + }, + { + "epoch": 0.8706296461687988, + "grad_norm": 3.78037691116333, + "learning_rate": 1.3018767075306478e-05, + "loss": 0.9105, + "num_input_tokens_seen": 199980120, + "step": 12429 + }, + { + "epoch": 0.870699694414528, + "grad_norm": 3.3280282020568848, + "learning_rate": 1.3011768826619969e-05, + "loss": 0.8458, + "num_input_tokens_seen": 199996488, + "step": 12430 + }, + { + "epoch": 0.8707697426602573, + "grad_norm": 5.226138591766357, + "learning_rate": 1.3004770577933447e-05, + "loss": 0.9628, + "num_input_tokens_seen": 200012568, + "step": 12431 + }, + { + "epoch": 0.8708397909059865, + "grad_norm": 4.096982955932617, + "learning_rate": 1.2997772329246936e-05, + "loss": 1.054, + "num_input_tokens_seen": 200028280, + "step": 12432 + }, + { + "epoch": 0.8709098391517157, + "grad_norm": 3.648414134979248, + "learning_rate": 1.2990774080560424e-05, + "loss": 0.9294, + "num_input_tokens_seen": 200044560, + "step": 12433 + }, + { + "epoch": 0.870979887397445, + "grad_norm": 3.68387508392334, + "learning_rate": 1.2983775831873904e-05, + "loss": 1.0197, + "num_input_tokens_seen": 200060944, + "step": 12434 + }, + { + "epoch": 0.8710499356431742, + "grad_norm": 5.014534950256348, + "learning_rate": 1.2976777583187395e-05, + "loss": 1.1098, + "num_input_tokens_seen": 200077328, + "step": 12435 + }, + { + "epoch": 0.8711199838889034, + "grad_norm": 4.2858076095581055, + "learning_rate": 1.2969779334500873e-05, + "loss": 1.1681, + "num_input_tokens_seen": 200093712, + "step": 12436 + }, + { + "epoch": 0.8711900321346328, + "grad_norm": 3.9795968532562256, + "learning_rate": 1.2962781085814363e-05, + "loss": 1.0369, + "num_input_tokens_seen": 200109832, + "step": 12437 + }, + { + "epoch": 0.871260080380362, + "grad_norm": 3.372939109802246, + "learning_rate": 1.2955782837127852e-05, + "loss": 1.0202, + "num_input_tokens_seen": 200126192, + "step": 12438 + }, + { + "epoch": 0.8713301286260913, + "grad_norm": 7.123353958129883, + "learning_rate": 1.2948784588441332e-05, + "loss": 1.1212, + "num_input_tokens_seen": 200141184, + "step": 12439 + }, + { + "epoch": 0.8714001768718205, + "grad_norm": 3.7220115661621094, + "learning_rate": 1.2941786339754822e-05, + "loss": 1.1856, + "num_input_tokens_seen": 200157408, + "step": 12440 + }, + { + "epoch": 0.8714702251175497, + "grad_norm": 4.475198268890381, + "learning_rate": 1.2934788091068297e-05, + "loss": 1.1733, + "num_input_tokens_seen": 200173792, + "step": 12441 + }, + { + "epoch": 0.871540273363279, + "grad_norm": 5.178131580352783, + "learning_rate": 1.2927789842381788e-05, + "loss": 1.1413, + "num_input_tokens_seen": 200188968, + "step": 12442 + }, + { + "epoch": 0.8716103216090082, + "grad_norm": 4.35125207901001, + "learning_rate": 1.2920791593695278e-05, + "loss": 0.9647, + "num_input_tokens_seen": 200205080, + "step": 12443 + }, + { + "epoch": 0.8716803698547374, + "grad_norm": 3.541158437728882, + "learning_rate": 1.2913793345008756e-05, + "loss": 1.0135, + "num_input_tokens_seen": 200220896, + "step": 12444 + }, + { + "epoch": 0.8717504181004667, + "grad_norm": 4.7379889488220215, + "learning_rate": 1.2906795096322243e-05, + "loss": 1.0727, + "num_input_tokens_seen": 200236864, + "step": 12445 + }, + { + "epoch": 0.8718204663461959, + "grad_norm": 3.855501651763916, + "learning_rate": 1.2899796847635725e-05, + "loss": 1.0139, + "num_input_tokens_seen": 200253136, + "step": 12446 + }, + { + "epoch": 0.8718905145919252, + "grad_norm": 4.1641411781311035, + "learning_rate": 1.2892798598949212e-05, + "loss": 1.0488, + "num_input_tokens_seen": 200269520, + "step": 12447 + }, + { + "epoch": 0.8719605628376544, + "grad_norm": 3.7989954948425293, + "learning_rate": 1.2885800350262692e-05, + "loss": 1.0231, + "num_input_tokens_seen": 200285904, + "step": 12448 + }, + { + "epoch": 0.8720306110833836, + "grad_norm": 3.493483543395996, + "learning_rate": 1.2878802101576182e-05, + "loss": 1.1218, + "num_input_tokens_seen": 200302288, + "step": 12449 + }, + { + "epoch": 0.872100659329113, + "grad_norm": 4.151757717132568, + "learning_rate": 1.2871803852889671e-05, + "loss": 1.0894, + "num_input_tokens_seen": 200318672, + "step": 12450 + }, + { + "epoch": 0.8721707075748422, + "grad_norm": 4.130611419677734, + "learning_rate": 1.2864805604203151e-05, + "loss": 0.9142, + "num_input_tokens_seen": 200335056, + "step": 12451 + }, + { + "epoch": 0.8722407558205714, + "grad_norm": 4.006992340087891, + "learning_rate": 1.2857807355516641e-05, + "loss": 0.8411, + "num_input_tokens_seen": 200350424, + "step": 12452 + }, + { + "epoch": 0.8723108040663007, + "grad_norm": 3.470613718032837, + "learning_rate": 1.285080910683012e-05, + "loss": 0.9141, + "num_input_tokens_seen": 200366632, + "step": 12453 + }, + { + "epoch": 0.8723808523120299, + "grad_norm": 4.114629745483398, + "learning_rate": 1.284381085814361e-05, + "loss": 1.0733, + "num_input_tokens_seen": 200382464, + "step": 12454 + }, + { + "epoch": 0.8724509005577592, + "grad_norm": 5.641251087188721, + "learning_rate": 1.2836812609457097e-05, + "loss": 1.2095, + "num_input_tokens_seen": 200398600, + "step": 12455 + }, + { + "epoch": 0.8725209488034884, + "grad_norm": 6.9227447509765625, + "learning_rate": 1.2829814360770579e-05, + "loss": 1.1948, + "num_input_tokens_seen": 200413928, + "step": 12456 + }, + { + "epoch": 0.8725909970492176, + "grad_norm": 3.924851417541504, + "learning_rate": 1.2822816112084066e-05, + "loss": 1.2043, + "num_input_tokens_seen": 200430312, + "step": 12457 + }, + { + "epoch": 0.8726610452949469, + "grad_norm": 4.105025291442871, + "learning_rate": 1.2815817863397544e-05, + "loss": 1.029, + "num_input_tokens_seen": 200446696, + "step": 12458 + }, + { + "epoch": 0.8727310935406761, + "grad_norm": 4.124265670776367, + "learning_rate": 1.2808819614711034e-05, + "loss": 1.1109, + "num_input_tokens_seen": 200461512, + "step": 12459 + }, + { + "epoch": 0.8728011417864053, + "grad_norm": 3.708024501800537, + "learning_rate": 1.2801821366024521e-05, + "loss": 0.989, + "num_input_tokens_seen": 200477248, + "step": 12460 + }, + { + "epoch": 0.8728711900321346, + "grad_norm": 4.170856475830078, + "learning_rate": 1.2794823117338003e-05, + "loss": 1.0944, + "num_input_tokens_seen": 200493448, + "step": 12461 + }, + { + "epoch": 0.8729412382778639, + "grad_norm": 3.9636075496673584, + "learning_rate": 1.278782486865149e-05, + "loss": 1.064, + "num_input_tokens_seen": 200509544, + "step": 12462 + }, + { + "epoch": 0.8730112865235932, + "grad_norm": 3.721010208129883, + "learning_rate": 1.278082661996497e-05, + "loss": 0.9376, + "num_input_tokens_seen": 200525840, + "step": 12463 + }, + { + "epoch": 0.8730813347693224, + "grad_norm": 3.500476598739624, + "learning_rate": 1.2773828371278459e-05, + "loss": 1.0983, + "num_input_tokens_seen": 200541632, + "step": 12464 + }, + { + "epoch": 0.8731513830150516, + "grad_norm": 3.475104331970215, + "learning_rate": 1.2766830122591949e-05, + "loss": 1.0465, + "num_input_tokens_seen": 200558016, + "step": 12465 + }, + { + "epoch": 0.8732214312607809, + "grad_norm": 4.194629669189453, + "learning_rate": 1.275983187390543e-05, + "loss": 0.9851, + "num_input_tokens_seen": 200574216, + "step": 12466 + }, + { + "epoch": 0.8732914795065101, + "grad_norm": 3.788418769836426, + "learning_rate": 1.2752833625218918e-05, + "loss": 0.9748, + "num_input_tokens_seen": 200590600, + "step": 12467 + }, + { + "epoch": 0.8733615277522394, + "grad_norm": 4.6982831954956055, + "learning_rate": 1.2745835376532398e-05, + "loss": 0.9904, + "num_input_tokens_seen": 200606272, + "step": 12468 + }, + { + "epoch": 0.8734315759979686, + "grad_norm": 4.186924457550049, + "learning_rate": 1.2738837127845886e-05, + "loss": 1.036, + "num_input_tokens_seen": 200622656, + "step": 12469 + }, + { + "epoch": 0.8735016242436978, + "grad_norm": 4.259232997894287, + "learning_rate": 1.2731838879159375e-05, + "loss": 0.7612, + "num_input_tokens_seen": 200639040, + "step": 12470 + }, + { + "epoch": 0.8735716724894271, + "grad_norm": 3.586690664291382, + "learning_rate": 1.2724840630472854e-05, + "loss": 0.945, + "num_input_tokens_seen": 200655424, + "step": 12471 + }, + { + "epoch": 0.8736417207351563, + "grad_norm": 4.360635757446289, + "learning_rate": 1.2717842381786344e-05, + "loss": 1.1584, + "num_input_tokens_seen": 200671632, + "step": 12472 + }, + { + "epoch": 0.8737117689808855, + "grad_norm": 3.4643614292144775, + "learning_rate": 1.2710844133099822e-05, + "loss": 0.9835, + "num_input_tokens_seen": 200687304, + "step": 12473 + }, + { + "epoch": 0.8737818172266149, + "grad_norm": 5.900325775146484, + "learning_rate": 1.2703845884413313e-05, + "loss": 0.8753, + "num_input_tokens_seen": 200703688, + "step": 12474 + }, + { + "epoch": 0.8738518654723441, + "grad_norm": 4.221566677093506, + "learning_rate": 1.2696847635726791e-05, + "loss": 1.0182, + "num_input_tokens_seen": 200719640, + "step": 12475 + }, + { + "epoch": 0.8739219137180734, + "grad_norm": 3.7173123359680176, + "learning_rate": 1.2689849387040278e-05, + "loss": 1.0216, + "num_input_tokens_seen": 200736024, + "step": 12476 + }, + { + "epoch": 0.8739919619638026, + "grad_norm": 3.9783029556274414, + "learning_rate": 1.2682851138353768e-05, + "loss": 1.2195, + "num_input_tokens_seen": 200752056, + "step": 12477 + }, + { + "epoch": 0.8740620102095318, + "grad_norm": 4.928297519683838, + "learning_rate": 1.2675852889667247e-05, + "loss": 1.1405, + "num_input_tokens_seen": 200768440, + "step": 12478 + }, + { + "epoch": 0.8741320584552611, + "grad_norm": 4.694872856140137, + "learning_rate": 1.2668854640980737e-05, + "loss": 1.1059, + "num_input_tokens_seen": 200784736, + "step": 12479 + }, + { + "epoch": 0.8742021067009903, + "grad_norm": 3.6363539695739746, + "learning_rate": 1.2661856392294217e-05, + "loss": 0.9914, + "num_input_tokens_seen": 200800584, + "step": 12480 + }, + { + "epoch": 0.8742721549467195, + "grad_norm": 4.06942892074585, + "learning_rate": 1.2654858143607706e-05, + "loss": 1.2255, + "num_input_tokens_seen": 200816968, + "step": 12481 + }, + { + "epoch": 0.8743422031924488, + "grad_norm": 3.4292290210723877, + "learning_rate": 1.2647859894921196e-05, + "loss": 1.0313, + "num_input_tokens_seen": 200833352, + "step": 12482 + }, + { + "epoch": 0.874412251438178, + "grad_norm": 3.4635426998138428, + "learning_rate": 1.2640861646234676e-05, + "loss": 0.8802, + "num_input_tokens_seen": 200849632, + "step": 12483 + }, + { + "epoch": 0.8744822996839073, + "grad_norm": 4.515510559082031, + "learning_rate": 1.2633863397548165e-05, + "loss": 1.2028, + "num_input_tokens_seen": 200865256, + "step": 12484 + }, + { + "epoch": 0.8745523479296365, + "grad_norm": 3.555277109146118, + "learning_rate": 1.2626865148861645e-05, + "loss": 1.0005, + "num_input_tokens_seen": 200881552, + "step": 12485 + }, + { + "epoch": 0.8746223961753657, + "grad_norm": 3.8341867923736572, + "learning_rate": 1.2619866900175132e-05, + "loss": 0.912, + "num_input_tokens_seen": 200897800, + "step": 12486 + }, + { + "epoch": 0.8746924444210951, + "grad_norm": 3.6899831295013428, + "learning_rate": 1.2612868651488622e-05, + "loss": 0.9737, + "num_input_tokens_seen": 200914000, + "step": 12487 + }, + { + "epoch": 0.8747624926668243, + "grad_norm": 3.9059953689575195, + "learning_rate": 1.26058704028021e-05, + "loss": 1.0746, + "num_input_tokens_seen": 200930384, + "step": 12488 + }, + { + "epoch": 0.8748325409125535, + "grad_norm": 5.538366794586182, + "learning_rate": 1.259887215411559e-05, + "loss": 1.0509, + "num_input_tokens_seen": 200945896, + "step": 12489 + }, + { + "epoch": 0.8749025891582828, + "grad_norm": 3.9518158435821533, + "learning_rate": 1.2591873905429069e-05, + "loss": 0.8957, + "num_input_tokens_seen": 200961448, + "step": 12490 + }, + { + "epoch": 0.874972637404012, + "grad_norm": 3.735319137573242, + "learning_rate": 1.2584875656742556e-05, + "loss": 0.8591, + "num_input_tokens_seen": 200977368, + "step": 12491 + }, + { + "epoch": 0.8750426856497413, + "grad_norm": 4.429387092590332, + "learning_rate": 1.257787740805605e-05, + "loss": 0.9027, + "num_input_tokens_seen": 200993752, + "step": 12492 + }, + { + "epoch": 0.8751127338954705, + "grad_norm": 3.5525283813476562, + "learning_rate": 1.2570879159369525e-05, + "loss": 0.9781, + "num_input_tokens_seen": 201010136, + "step": 12493 + }, + { + "epoch": 0.8751827821411997, + "grad_norm": 5.135288238525391, + "learning_rate": 1.2563880910683015e-05, + "loss": 1.3299, + "num_input_tokens_seen": 201026496, + "step": 12494 + }, + { + "epoch": 0.875252830386929, + "grad_norm": 3.783479690551758, + "learning_rate": 1.2556882661996493e-05, + "loss": 0.9514, + "num_input_tokens_seen": 201042880, + "step": 12495 + }, + { + "epoch": 0.8753228786326582, + "grad_norm": 3.584134578704834, + "learning_rate": 1.2549884413309984e-05, + "loss": 0.8974, + "num_input_tokens_seen": 201059264, + "step": 12496 + }, + { + "epoch": 0.8753929268783874, + "grad_norm": 3.902175188064575, + "learning_rate": 1.2542886164623474e-05, + "loss": 1.1517, + "num_input_tokens_seen": 201075648, + "step": 12497 + }, + { + "epoch": 0.8754629751241167, + "grad_norm": 3.7207841873168945, + "learning_rate": 1.2535887915936952e-05, + "loss": 1.032, + "num_input_tokens_seen": 201092032, + "step": 12498 + }, + { + "epoch": 0.875533023369846, + "grad_norm": 5.619358539581299, + "learning_rate": 1.2528889667250443e-05, + "loss": 0.9665, + "num_input_tokens_seen": 201108320, + "step": 12499 + }, + { + "epoch": 0.8756030716155753, + "grad_norm": 3.563180446624756, + "learning_rate": 1.2521891418563923e-05, + "loss": 1.0401, + "num_input_tokens_seen": 201124704, + "step": 12500 + }, + { + "epoch": 0.8756731198613045, + "grad_norm": 4.258616924285889, + "learning_rate": 1.251489316987741e-05, + "loss": 0.9011, + "num_input_tokens_seen": 201140872, + "step": 12501 + }, + { + "epoch": 0.8757431681070337, + "grad_norm": 3.92475962638855, + "learning_rate": 1.2507894921190888e-05, + "loss": 0.9539, + "num_input_tokens_seen": 201157256, + "step": 12502 + }, + { + "epoch": 0.875813216352763, + "grad_norm": 3.9077460765838623, + "learning_rate": 1.2500896672504378e-05, + "loss": 0.9787, + "num_input_tokens_seen": 201172928, + "step": 12503 + }, + { + "epoch": 0.8758832645984922, + "grad_norm": 5.061121463775635, + "learning_rate": 1.2493898423817869e-05, + "loss": 1.0909, + "num_input_tokens_seen": 201188632, + "step": 12504 + }, + { + "epoch": 0.8759533128442215, + "grad_norm": 3.860954523086548, + "learning_rate": 1.2486900175131347e-05, + "loss": 1.1872, + "num_input_tokens_seen": 201205016, + "step": 12505 + }, + { + "epoch": 0.8760233610899507, + "grad_norm": 3.831061840057373, + "learning_rate": 1.2479901926444836e-05, + "loss": 0.9477, + "num_input_tokens_seen": 201221400, + "step": 12506 + }, + { + "epoch": 0.8760934093356799, + "grad_norm": 3.494060754776001, + "learning_rate": 1.2472903677758314e-05, + "loss": 0.8806, + "num_input_tokens_seen": 201237784, + "step": 12507 + }, + { + "epoch": 0.8761634575814092, + "grad_norm": 4.006968975067139, + "learning_rate": 1.2465905429071804e-05, + "loss": 1.2963, + "num_input_tokens_seen": 201253792, + "step": 12508 + }, + { + "epoch": 0.8762335058271384, + "grad_norm": 6.7606401443481445, + "learning_rate": 1.2458907180385295e-05, + "loss": 1.1392, + "num_input_tokens_seen": 201269848, + "step": 12509 + }, + { + "epoch": 0.8763035540728676, + "grad_norm": 6.270375728607178, + "learning_rate": 1.2451908931698773e-05, + "loss": 1.2755, + "num_input_tokens_seen": 201285432, + "step": 12510 + }, + { + "epoch": 0.876373602318597, + "grad_norm": 4.672218322753906, + "learning_rate": 1.2444910683012262e-05, + "loss": 1.16, + "num_input_tokens_seen": 201301488, + "step": 12511 + }, + { + "epoch": 0.8764436505643262, + "grad_norm": 4.022189140319824, + "learning_rate": 1.243791243432574e-05, + "loss": 1.0827, + "num_input_tokens_seen": 201317872, + "step": 12512 + }, + { + "epoch": 0.8765136988100555, + "grad_norm": 5.916778087615967, + "learning_rate": 1.243091418563923e-05, + "loss": 1.3817, + "num_input_tokens_seen": 201334240, + "step": 12513 + }, + { + "epoch": 0.8765837470557847, + "grad_norm": 6.069848537445068, + "learning_rate": 1.242391593695272e-05, + "loss": 0.7956, + "num_input_tokens_seen": 201349576, + "step": 12514 + }, + { + "epoch": 0.8766537953015139, + "grad_norm": 3.574882984161377, + "learning_rate": 1.2416917688266199e-05, + "loss": 0.9827, + "num_input_tokens_seen": 201365960, + "step": 12515 + }, + { + "epoch": 0.8767238435472432, + "grad_norm": 4.096477031707764, + "learning_rate": 1.2409919439579688e-05, + "loss": 1.133, + "num_input_tokens_seen": 201380464, + "step": 12516 + }, + { + "epoch": 0.8767938917929724, + "grad_norm": 4.192535400390625, + "learning_rate": 1.2402921190893166e-05, + "loss": 1.0019, + "num_input_tokens_seen": 201396592, + "step": 12517 + }, + { + "epoch": 0.8768639400387016, + "grad_norm": 8.660335540771484, + "learning_rate": 1.2395922942206656e-05, + "loss": 1.0866, + "num_input_tokens_seen": 201412640, + "step": 12518 + }, + { + "epoch": 0.8769339882844309, + "grad_norm": 5.47048282623291, + "learning_rate": 1.2388924693520147e-05, + "loss": 1.2014, + "num_input_tokens_seen": 201428280, + "step": 12519 + }, + { + "epoch": 0.8770040365301601, + "grad_norm": 4.640336990356445, + "learning_rate": 1.2381926444833625e-05, + "loss": 1.398, + "num_input_tokens_seen": 201443400, + "step": 12520 + }, + { + "epoch": 0.8770740847758894, + "grad_norm": 3.7966647148132324, + "learning_rate": 1.2374928196147114e-05, + "loss": 1.0577, + "num_input_tokens_seen": 201459784, + "step": 12521 + }, + { + "epoch": 0.8771441330216186, + "grad_norm": 3.995532989501953, + "learning_rate": 1.2367929947460592e-05, + "loss": 0.9673, + "num_input_tokens_seen": 201475368, + "step": 12522 + }, + { + "epoch": 0.8772141812673478, + "grad_norm": 5.584611415863037, + "learning_rate": 1.2360931698774082e-05, + "loss": 1.0679, + "num_input_tokens_seen": 201491176, + "step": 12523 + }, + { + "epoch": 0.8772842295130772, + "grad_norm": 4.329719543457031, + "learning_rate": 1.2353933450087573e-05, + "loss": 1.2215, + "num_input_tokens_seen": 201507560, + "step": 12524 + }, + { + "epoch": 0.8773542777588064, + "grad_norm": 5.203586578369141, + "learning_rate": 1.2346935201401051e-05, + "loss": 1.1033, + "num_input_tokens_seen": 201523944, + "step": 12525 + }, + { + "epoch": 0.8774243260045356, + "grad_norm": 3.5449650287628174, + "learning_rate": 1.233993695271454e-05, + "loss": 0.9107, + "num_input_tokens_seen": 201540320, + "step": 12526 + }, + { + "epoch": 0.8774943742502649, + "grad_norm": 3.3628153800964355, + "learning_rate": 1.2332938704028018e-05, + "loss": 0.9714, + "num_input_tokens_seen": 201556704, + "step": 12527 + }, + { + "epoch": 0.8775644224959941, + "grad_norm": 5.353857040405273, + "learning_rate": 1.2325940455341508e-05, + "loss": 1.048, + "num_input_tokens_seen": 201573088, + "step": 12528 + }, + { + "epoch": 0.8776344707417234, + "grad_norm": 3.7415788173675537, + "learning_rate": 1.2318942206654987e-05, + "loss": 0.9305, + "num_input_tokens_seen": 201589472, + "step": 12529 + }, + { + "epoch": 0.8777045189874526, + "grad_norm": 3.674448013305664, + "learning_rate": 1.2311943957968477e-05, + "loss": 1.0883, + "num_input_tokens_seen": 201605736, + "step": 12530 + }, + { + "epoch": 0.8777745672331818, + "grad_norm": 3.797348976135254, + "learning_rate": 1.2304945709281966e-05, + "loss": 1.1998, + "num_input_tokens_seen": 201622120, + "step": 12531 + }, + { + "epoch": 0.8778446154789111, + "grad_norm": 4.789517402648926, + "learning_rate": 1.2297947460595444e-05, + "loss": 1.1824, + "num_input_tokens_seen": 201638056, + "step": 12532 + }, + { + "epoch": 0.8779146637246403, + "grad_norm": 4.846160888671875, + "learning_rate": 1.2290949211908934e-05, + "loss": 1.0575, + "num_input_tokens_seen": 201654440, + "step": 12533 + }, + { + "epoch": 0.8779847119703696, + "grad_norm": 4.066547393798828, + "learning_rate": 1.2283950963222413e-05, + "loss": 1.0053, + "num_input_tokens_seen": 201670824, + "step": 12534 + }, + { + "epoch": 0.8780547602160989, + "grad_norm": 4.036361217498779, + "learning_rate": 1.2276952714535903e-05, + "loss": 1.079, + "num_input_tokens_seen": 201687208, + "step": 12535 + }, + { + "epoch": 0.878124808461828, + "grad_norm": 3.7096452713012695, + "learning_rate": 1.2269954465849392e-05, + "loss": 1.2021, + "num_input_tokens_seen": 201703592, + "step": 12536 + }, + { + "epoch": 0.8781948567075574, + "grad_norm": 5.5861334800720215, + "learning_rate": 1.226295621716287e-05, + "loss": 1.0288, + "num_input_tokens_seen": 201719336, + "step": 12537 + }, + { + "epoch": 0.8782649049532866, + "grad_norm": 4.975471019744873, + "learning_rate": 1.225595796847636e-05, + "loss": 1.1122, + "num_input_tokens_seen": 201734896, + "step": 12538 + }, + { + "epoch": 0.8783349531990158, + "grad_norm": 4.835661888122559, + "learning_rate": 1.2248959719789839e-05, + "loss": 1.0797, + "num_input_tokens_seen": 201751280, + "step": 12539 + }, + { + "epoch": 0.8784050014447451, + "grad_norm": 3.7144432067871094, + "learning_rate": 1.2241961471103329e-05, + "loss": 0.9571, + "num_input_tokens_seen": 201767664, + "step": 12540 + }, + { + "epoch": 0.8784750496904743, + "grad_norm": 5.518250942230225, + "learning_rate": 1.2234963222416818e-05, + "loss": 1.0512, + "num_input_tokens_seen": 201784048, + "step": 12541 + }, + { + "epoch": 0.8785450979362036, + "grad_norm": 3.951824188232422, + "learning_rate": 1.2227964973730296e-05, + "loss": 0.9065, + "num_input_tokens_seen": 201799968, + "step": 12542 + }, + { + "epoch": 0.8786151461819328, + "grad_norm": 3.9954850673675537, + "learning_rate": 1.2220966725043786e-05, + "loss": 1.0503, + "num_input_tokens_seen": 201814504, + "step": 12543 + }, + { + "epoch": 0.878685194427662, + "grad_norm": 3.3838460445404053, + "learning_rate": 1.2213968476357265e-05, + "loss": 0.8911, + "num_input_tokens_seen": 201830192, + "step": 12544 + }, + { + "epoch": 0.8787552426733913, + "grad_norm": 3.478578805923462, + "learning_rate": 1.2206970227670755e-05, + "loss": 0.9808, + "num_input_tokens_seen": 201846576, + "step": 12545 + }, + { + "epoch": 0.8788252909191205, + "grad_norm": 4.194384574890137, + "learning_rate": 1.2199971978984244e-05, + "loss": 0.9029, + "num_input_tokens_seen": 201862960, + "step": 12546 + }, + { + "epoch": 0.8788953391648497, + "grad_norm": 4.147796630859375, + "learning_rate": 1.2192973730297722e-05, + "loss": 1.2221, + "num_input_tokens_seen": 201879128, + "step": 12547 + }, + { + "epoch": 0.8789653874105791, + "grad_norm": 4.386658191680908, + "learning_rate": 1.2185975481611212e-05, + "loss": 1.0141, + "num_input_tokens_seen": 201895512, + "step": 12548 + }, + { + "epoch": 0.8790354356563083, + "grad_norm": 4.554989814758301, + "learning_rate": 1.217897723292469e-05, + "loss": 0.9077, + "num_input_tokens_seen": 201911560, + "step": 12549 + }, + { + "epoch": 0.8791054839020376, + "grad_norm": 3.909036636352539, + "learning_rate": 1.2171978984238181e-05, + "loss": 0.8683, + "num_input_tokens_seen": 201927856, + "step": 12550 + }, + { + "epoch": 0.8791755321477668, + "grad_norm": 3.74406099319458, + "learning_rate": 1.216498073555167e-05, + "loss": 0.9973, + "num_input_tokens_seen": 201944240, + "step": 12551 + }, + { + "epoch": 0.879245580393496, + "grad_norm": 4.547219753265381, + "learning_rate": 1.2157982486865148e-05, + "loss": 1.0793, + "num_input_tokens_seen": 201960624, + "step": 12552 + }, + { + "epoch": 0.8793156286392253, + "grad_norm": 3.9297232627868652, + "learning_rate": 1.2150984238178638e-05, + "loss": 0.9191, + "num_input_tokens_seen": 201977008, + "step": 12553 + }, + { + "epoch": 0.8793856768849545, + "grad_norm": 3.757464647293091, + "learning_rate": 1.2143985989492117e-05, + "loss": 1.0359, + "num_input_tokens_seen": 201993392, + "step": 12554 + }, + { + "epoch": 0.8794557251306837, + "grad_norm": 3.714686155319214, + "learning_rate": 1.2136987740805607e-05, + "loss": 1.0288, + "num_input_tokens_seen": 202009600, + "step": 12555 + }, + { + "epoch": 0.879525773376413, + "grad_norm": 4.367456912994385, + "learning_rate": 1.2129989492119086e-05, + "loss": 0.9295, + "num_input_tokens_seen": 202025200, + "step": 12556 + }, + { + "epoch": 0.8795958216221422, + "grad_norm": 3.9171440601348877, + "learning_rate": 1.2122991243432574e-05, + "loss": 1.0031, + "num_input_tokens_seen": 202041328, + "step": 12557 + }, + { + "epoch": 0.8796658698678715, + "grad_norm": 3.8222570419311523, + "learning_rate": 1.2115992994746064e-05, + "loss": 0.9203, + "num_input_tokens_seen": 202057160, + "step": 12558 + }, + { + "epoch": 0.8797359181136007, + "grad_norm": 3.4387218952178955, + "learning_rate": 1.2108994746059543e-05, + "loss": 1.0436, + "num_input_tokens_seen": 202073544, + "step": 12559 + }, + { + "epoch": 0.87980596635933, + "grad_norm": 3.6614480018615723, + "learning_rate": 1.2101996497373033e-05, + "loss": 1.0492, + "num_input_tokens_seen": 202089792, + "step": 12560 + }, + { + "epoch": 0.8798760146050593, + "grad_norm": 4.620304584503174, + "learning_rate": 1.2094998248686512e-05, + "loss": 1.1139, + "num_input_tokens_seen": 202106176, + "step": 12561 + }, + { + "epoch": 0.8799460628507885, + "grad_norm": 3.3833374977111816, + "learning_rate": 1.2088e-05, + "loss": 0.9222, + "num_input_tokens_seen": 202122560, + "step": 12562 + }, + { + "epoch": 0.8800161110965177, + "grad_norm": 3.734360933303833, + "learning_rate": 1.208100175131349e-05, + "loss": 0.9659, + "num_input_tokens_seen": 202138944, + "step": 12563 + }, + { + "epoch": 0.880086159342247, + "grad_norm": 4.0044660568237305, + "learning_rate": 1.2074003502626969e-05, + "loss": 1.1575, + "num_input_tokens_seen": 202155328, + "step": 12564 + }, + { + "epoch": 0.8801562075879762, + "grad_norm": 3.7718546390533447, + "learning_rate": 1.2067005253940459e-05, + "loss": 1.0374, + "num_input_tokens_seen": 202171712, + "step": 12565 + }, + { + "epoch": 0.8802262558337055, + "grad_norm": 5.317460536956787, + "learning_rate": 1.2060007005253938e-05, + "loss": 1.17, + "num_input_tokens_seen": 202187416, + "step": 12566 + }, + { + "epoch": 0.8802963040794347, + "grad_norm": 4.103085517883301, + "learning_rate": 1.2053008756567426e-05, + "loss": 1.0852, + "num_input_tokens_seen": 202203800, + "step": 12567 + }, + { + "epoch": 0.8803663523251639, + "grad_norm": 3.439845561981201, + "learning_rate": 1.2046010507880916e-05, + "loss": 0.8537, + "num_input_tokens_seen": 202220184, + "step": 12568 + }, + { + "epoch": 0.8804364005708932, + "grad_norm": 4.826170444488525, + "learning_rate": 1.2039012259194395e-05, + "loss": 1.0311, + "num_input_tokens_seen": 202236288, + "step": 12569 + }, + { + "epoch": 0.8805064488166224, + "grad_norm": 4.193214416503906, + "learning_rate": 1.2032014010507885e-05, + "loss": 1.224, + "num_input_tokens_seen": 202252144, + "step": 12570 + }, + { + "epoch": 0.8805764970623517, + "grad_norm": 4.195227146148682, + "learning_rate": 1.2025015761821364e-05, + "loss": 1.0741, + "num_input_tokens_seen": 202268464, + "step": 12571 + }, + { + "epoch": 0.880646545308081, + "grad_norm": 3.8489561080932617, + "learning_rate": 1.2018017513134852e-05, + "loss": 1.2819, + "num_input_tokens_seen": 202284312, + "step": 12572 + }, + { + "epoch": 0.8807165935538102, + "grad_norm": 4.529621601104736, + "learning_rate": 1.2011019264448342e-05, + "loss": 1.3116, + "num_input_tokens_seen": 202300240, + "step": 12573 + }, + { + "epoch": 0.8807866417995395, + "grad_norm": 4.118465900421143, + "learning_rate": 1.2004021015761821e-05, + "loss": 1.0889, + "num_input_tokens_seen": 202316624, + "step": 12574 + }, + { + "epoch": 0.8808566900452687, + "grad_norm": 3.400087833404541, + "learning_rate": 1.1997022767075311e-05, + "loss": 0.9007, + "num_input_tokens_seen": 202333008, + "step": 12575 + }, + { + "epoch": 0.8809267382909979, + "grad_norm": 3.652501344680786, + "learning_rate": 1.199002451838879e-05, + "loss": 1.0249, + "num_input_tokens_seen": 202348928, + "step": 12576 + }, + { + "epoch": 0.8809967865367272, + "grad_norm": 3.7035112380981445, + "learning_rate": 1.1983026269702278e-05, + "loss": 0.8291, + "num_input_tokens_seen": 202365312, + "step": 12577 + }, + { + "epoch": 0.8810668347824564, + "grad_norm": 4.34960412979126, + "learning_rate": 1.1976028021015768e-05, + "loss": 0.9926, + "num_input_tokens_seen": 202380792, + "step": 12578 + }, + { + "epoch": 0.8811368830281857, + "grad_norm": 4.948240280151367, + "learning_rate": 1.1969029772329247e-05, + "loss": 1.1294, + "num_input_tokens_seen": 202396904, + "step": 12579 + }, + { + "epoch": 0.8812069312739149, + "grad_norm": 6.397356033325195, + "learning_rate": 1.1962031523642737e-05, + "loss": 0.8803, + "num_input_tokens_seen": 202413040, + "step": 12580 + }, + { + "epoch": 0.8812769795196441, + "grad_norm": 4.319753646850586, + "learning_rate": 1.1955033274956216e-05, + "loss": 1.231, + "num_input_tokens_seen": 202429424, + "step": 12581 + }, + { + "epoch": 0.8813470277653734, + "grad_norm": 5.086960792541504, + "learning_rate": 1.1948035026269704e-05, + "loss": 1.1347, + "num_input_tokens_seen": 202444840, + "step": 12582 + }, + { + "epoch": 0.8814170760111026, + "grad_norm": 3.547149896621704, + "learning_rate": 1.1941036777583183e-05, + "loss": 0.8604, + "num_input_tokens_seen": 202460920, + "step": 12583 + }, + { + "epoch": 0.8814871242568318, + "grad_norm": 4.047225475311279, + "learning_rate": 1.1934038528896673e-05, + "loss": 1.2209, + "num_input_tokens_seen": 202476840, + "step": 12584 + }, + { + "epoch": 0.8815571725025612, + "grad_norm": 3.473348617553711, + "learning_rate": 1.1927040280210161e-05, + "loss": 0.9913, + "num_input_tokens_seen": 202492864, + "step": 12585 + }, + { + "epoch": 0.8816272207482904, + "grad_norm": 3.9858241081237793, + "learning_rate": 1.1920042031523642e-05, + "loss": 1.1035, + "num_input_tokens_seen": 202508416, + "step": 12586 + }, + { + "epoch": 0.8816972689940197, + "grad_norm": 4.850385665893555, + "learning_rate": 1.191304378283713e-05, + "loss": 1.2204, + "num_input_tokens_seen": 202523888, + "step": 12587 + }, + { + "epoch": 0.8817673172397489, + "grad_norm": 5.069236755371094, + "learning_rate": 1.1906045534150609e-05, + "loss": 1.1134, + "num_input_tokens_seen": 202540064, + "step": 12588 + }, + { + "epoch": 0.8818373654854781, + "grad_norm": 4.104405403137207, + "learning_rate": 1.1899047285464099e-05, + "loss": 1.1278, + "num_input_tokens_seen": 202555896, + "step": 12589 + }, + { + "epoch": 0.8819074137312074, + "grad_norm": 3.819866895675659, + "learning_rate": 1.1892049036777587e-05, + "loss": 1.0808, + "num_input_tokens_seen": 202572280, + "step": 12590 + }, + { + "epoch": 0.8819774619769366, + "grad_norm": 3.585385799407959, + "learning_rate": 1.1885050788091068e-05, + "loss": 0.9783, + "num_input_tokens_seen": 202588152, + "step": 12591 + }, + { + "epoch": 0.8820475102226658, + "grad_norm": 4.98047399520874, + "learning_rate": 1.1878052539404556e-05, + "loss": 1.0887, + "num_input_tokens_seen": 202604536, + "step": 12592 + }, + { + "epoch": 0.8821175584683951, + "grad_norm": 3.7895700931549072, + "learning_rate": 1.1871054290718035e-05, + "loss": 1.183, + "num_input_tokens_seen": 202620920, + "step": 12593 + }, + { + "epoch": 0.8821876067141243, + "grad_norm": 4.028841972351074, + "learning_rate": 1.1864056042031525e-05, + "loss": 1.0744, + "num_input_tokens_seen": 202637016, + "step": 12594 + }, + { + "epoch": 0.8822576549598536, + "grad_norm": 3.751007556915283, + "learning_rate": 1.1857057793345014e-05, + "loss": 1.0787, + "num_input_tokens_seen": 202652544, + "step": 12595 + }, + { + "epoch": 0.8823277032055828, + "grad_norm": 3.411586284637451, + "learning_rate": 1.1850059544658494e-05, + "loss": 1.0261, + "num_input_tokens_seen": 202668928, + "step": 12596 + }, + { + "epoch": 0.882397751451312, + "grad_norm": 3.6525516510009766, + "learning_rate": 1.1843061295971982e-05, + "loss": 1.0598, + "num_input_tokens_seen": 202685168, + "step": 12597 + }, + { + "epoch": 0.8824677996970414, + "grad_norm": 5.606570720672607, + "learning_rate": 1.183606304728546e-05, + "loss": 0.9219, + "num_input_tokens_seen": 202701184, + "step": 12598 + }, + { + "epoch": 0.8825378479427706, + "grad_norm": 3.914767265319824, + "learning_rate": 1.1829064798598951e-05, + "loss": 1.1533, + "num_input_tokens_seen": 202717240, + "step": 12599 + }, + { + "epoch": 0.8826078961884998, + "grad_norm": 3.837833881378174, + "learning_rate": 1.182206654991244e-05, + "loss": 0.9077, + "num_input_tokens_seen": 202733624, + "step": 12600 + }, + { + "epoch": 0.8826078961884998, + "eval_loss": 1.1154844760894775, + "eval_runtime": 0.747, + "eval_samples_per_second": 1.339, + "eval_steps_per_second": 1.339, + "num_input_tokens_seen": 202733624, + "step": 12600 + }, + { + "epoch": 0.8826779444342291, + "grad_norm": 3.680651903152466, + "learning_rate": 1.181506830122592e-05, + "loss": 0.8793, + "num_input_tokens_seen": 202749976, + "step": 12601 + }, + { + "epoch": 0.8827479926799583, + "grad_norm": 3.879056930541992, + "learning_rate": 1.1808070052539408e-05, + "loss": 1.0997, + "num_input_tokens_seen": 202766360, + "step": 12602 + }, + { + "epoch": 0.8828180409256876, + "grad_norm": 3.578338384628296, + "learning_rate": 1.1801071803852887e-05, + "loss": 0.8773, + "num_input_tokens_seen": 202782560, + "step": 12603 + }, + { + "epoch": 0.8828880891714168, + "grad_norm": 5.370104789733887, + "learning_rate": 1.1794073555166377e-05, + "loss": 1.0421, + "num_input_tokens_seen": 202798944, + "step": 12604 + }, + { + "epoch": 0.882958137417146, + "grad_norm": 4.748933792114258, + "learning_rate": 1.1787075306479866e-05, + "loss": 1.2043, + "num_input_tokens_seen": 202814664, + "step": 12605 + }, + { + "epoch": 0.8830281856628753, + "grad_norm": 3.4428064823150635, + "learning_rate": 1.1780077057793344e-05, + "loss": 0.9542, + "num_input_tokens_seen": 202831048, + "step": 12606 + }, + { + "epoch": 0.8830982339086045, + "grad_norm": 5.423092365264893, + "learning_rate": 1.1773078809106834e-05, + "loss": 0.8927, + "num_input_tokens_seen": 202846080, + "step": 12607 + }, + { + "epoch": 0.8831682821543339, + "grad_norm": 3.899013042449951, + "learning_rate": 1.1766080560420313e-05, + "loss": 0.868, + "num_input_tokens_seen": 202862464, + "step": 12608 + }, + { + "epoch": 0.883238330400063, + "grad_norm": 4.950531482696533, + "learning_rate": 1.1759082311733803e-05, + "loss": 0.9725, + "num_input_tokens_seen": 202878848, + "step": 12609 + }, + { + "epoch": 0.8833083786457923, + "grad_norm": 3.714561939239502, + "learning_rate": 1.1752084063047281e-05, + "loss": 1.1705, + "num_input_tokens_seen": 202895232, + "step": 12610 + }, + { + "epoch": 0.8833784268915216, + "grad_norm": 3.4974570274353027, + "learning_rate": 1.174508581436077e-05, + "loss": 1.1086, + "num_input_tokens_seen": 202911616, + "step": 12611 + }, + { + "epoch": 0.8834484751372508, + "grad_norm": 3.231931686401367, + "learning_rate": 1.173808756567426e-05, + "loss": 0.9444, + "num_input_tokens_seen": 202927632, + "step": 12612 + }, + { + "epoch": 0.88351852338298, + "grad_norm": 3.7434840202331543, + "learning_rate": 1.1731089316987739e-05, + "loss": 1.1063, + "num_input_tokens_seen": 202944016, + "step": 12613 + }, + { + "epoch": 0.8835885716287093, + "grad_norm": 5.416693687438965, + "learning_rate": 1.1724091068301229e-05, + "loss": 1.1561, + "num_input_tokens_seen": 202959800, + "step": 12614 + }, + { + "epoch": 0.8836586198744385, + "grad_norm": 3.6579625606536865, + "learning_rate": 1.1717092819614707e-05, + "loss": 0.9564, + "num_input_tokens_seen": 202976184, + "step": 12615 + }, + { + "epoch": 0.8837286681201678, + "grad_norm": 4.083743572235107, + "learning_rate": 1.1710094570928196e-05, + "loss": 1.0985, + "num_input_tokens_seen": 202992568, + "step": 12616 + }, + { + "epoch": 0.883798716365897, + "grad_norm": 3.624979257583618, + "learning_rate": 1.1703096322241686e-05, + "loss": 0.9773, + "num_input_tokens_seen": 203008840, + "step": 12617 + }, + { + "epoch": 0.8838687646116262, + "grad_norm": 4.003345012664795, + "learning_rate": 1.1696098073555165e-05, + "loss": 1.0163, + "num_input_tokens_seen": 203025176, + "step": 12618 + }, + { + "epoch": 0.8839388128573555, + "grad_norm": 4.2342529296875, + "learning_rate": 1.1689099824868655e-05, + "loss": 0.9518, + "num_input_tokens_seen": 203040968, + "step": 12619 + }, + { + "epoch": 0.8840088611030847, + "grad_norm": 4.295236587524414, + "learning_rate": 1.1682101576182133e-05, + "loss": 1.1464, + "num_input_tokens_seen": 203057352, + "step": 12620 + }, + { + "epoch": 0.884078909348814, + "grad_norm": 4.334270000457764, + "learning_rate": 1.1675103327495622e-05, + "loss": 1.0265, + "num_input_tokens_seen": 203073736, + "step": 12621 + }, + { + "epoch": 0.8841489575945433, + "grad_norm": 3.9355452060699463, + "learning_rate": 1.1668105078809112e-05, + "loss": 1.1383, + "num_input_tokens_seen": 203089424, + "step": 12622 + }, + { + "epoch": 0.8842190058402725, + "grad_norm": 4.40088415145874, + "learning_rate": 1.166110683012259e-05, + "loss": 1.0976, + "num_input_tokens_seen": 203105808, + "step": 12623 + }, + { + "epoch": 0.8842890540860018, + "grad_norm": 4.909037113189697, + "learning_rate": 1.1654108581436081e-05, + "loss": 1.0478, + "num_input_tokens_seen": 203122192, + "step": 12624 + }, + { + "epoch": 0.884359102331731, + "grad_norm": 4.394054889678955, + "learning_rate": 1.164711033274956e-05, + "loss": 1.1584, + "num_input_tokens_seen": 203138104, + "step": 12625 + }, + { + "epoch": 0.8844291505774602, + "grad_norm": 3.797147274017334, + "learning_rate": 1.1640112084063048e-05, + "loss": 1.0426, + "num_input_tokens_seen": 203154080, + "step": 12626 + }, + { + "epoch": 0.8844991988231895, + "grad_norm": 6.216647624969482, + "learning_rate": 1.1633113835376538e-05, + "loss": 1.382, + "num_input_tokens_seen": 203170272, + "step": 12627 + }, + { + "epoch": 0.8845692470689187, + "grad_norm": 3.751054048538208, + "learning_rate": 1.1626115586690017e-05, + "loss": 1.0968, + "num_input_tokens_seen": 203186656, + "step": 12628 + }, + { + "epoch": 0.8846392953146479, + "grad_norm": 3.904749631881714, + "learning_rate": 1.1619117338003507e-05, + "loss": 1.0297, + "num_input_tokens_seen": 203203040, + "step": 12629 + }, + { + "epoch": 0.8847093435603772, + "grad_norm": 3.7687580585479736, + "learning_rate": 1.1612119089316985e-05, + "loss": 1.0633, + "num_input_tokens_seen": 203219424, + "step": 12630 + }, + { + "epoch": 0.8847793918061064, + "grad_norm": 3.593787670135498, + "learning_rate": 1.1605120840630474e-05, + "loss": 1.1324, + "num_input_tokens_seen": 203235808, + "step": 12631 + }, + { + "epoch": 0.8848494400518357, + "grad_norm": 3.4688005447387695, + "learning_rate": 1.1598122591943964e-05, + "loss": 0.877, + "num_input_tokens_seen": 203252176, + "step": 12632 + }, + { + "epoch": 0.884919488297565, + "grad_norm": 3.81750226020813, + "learning_rate": 1.1591124343257443e-05, + "loss": 1.0479, + "num_input_tokens_seen": 203267960, + "step": 12633 + }, + { + "epoch": 0.8849895365432942, + "grad_norm": 5.334212779998779, + "learning_rate": 1.1584126094570933e-05, + "loss": 0.9283, + "num_input_tokens_seen": 203284344, + "step": 12634 + }, + { + "epoch": 0.8850595847890235, + "grad_norm": 3.9472601413726807, + "learning_rate": 1.1577127845884411e-05, + "loss": 1.0645, + "num_input_tokens_seen": 203300344, + "step": 12635 + }, + { + "epoch": 0.8851296330347527, + "grad_norm": 5.129589557647705, + "learning_rate": 1.15701295971979e-05, + "loss": 1.2578, + "num_input_tokens_seen": 203316728, + "step": 12636 + }, + { + "epoch": 0.885199681280482, + "grad_norm": 3.6468305587768555, + "learning_rate": 1.1563131348511378e-05, + "loss": 0.9779, + "num_input_tokens_seen": 203333112, + "step": 12637 + }, + { + "epoch": 0.8852697295262112, + "grad_norm": 3.619377851486206, + "learning_rate": 1.1556133099824869e-05, + "loss": 0.9734, + "num_input_tokens_seen": 203349496, + "step": 12638 + }, + { + "epoch": 0.8853397777719404, + "grad_norm": 4.821696758270264, + "learning_rate": 1.1549134851138359e-05, + "loss": 0.8822, + "num_input_tokens_seen": 203365032, + "step": 12639 + }, + { + "epoch": 0.8854098260176697, + "grad_norm": 3.578596353530884, + "learning_rate": 1.1542136602451837e-05, + "loss": 0.9265, + "num_input_tokens_seen": 203381416, + "step": 12640 + }, + { + "epoch": 0.8854798742633989, + "grad_norm": 4.534537315368652, + "learning_rate": 1.1535138353765326e-05, + "loss": 1.0165, + "num_input_tokens_seen": 203397768, + "step": 12641 + }, + { + "epoch": 0.8855499225091281, + "grad_norm": 4.63020133972168, + "learning_rate": 1.1528140105078804e-05, + "loss": 1.2249, + "num_input_tokens_seen": 203414152, + "step": 12642 + }, + { + "epoch": 0.8856199707548574, + "grad_norm": 6.061887264251709, + "learning_rate": 1.1521141856392295e-05, + "loss": 1.2341, + "num_input_tokens_seen": 203430536, + "step": 12643 + }, + { + "epoch": 0.8856900190005866, + "grad_norm": 3.6352615356445312, + "learning_rate": 1.1514143607705785e-05, + "loss": 1.0435, + "num_input_tokens_seen": 203446912, + "step": 12644 + }, + { + "epoch": 0.885760067246316, + "grad_norm": 5.083562850952148, + "learning_rate": 1.1507145359019263e-05, + "loss": 1.1478, + "num_input_tokens_seen": 203463296, + "step": 12645 + }, + { + "epoch": 0.8858301154920452, + "grad_norm": 3.8505465984344482, + "learning_rate": 1.1500147110332752e-05, + "loss": 0.9334, + "num_input_tokens_seen": 203478472, + "step": 12646 + }, + { + "epoch": 0.8859001637377744, + "grad_norm": 5.8323774337768555, + "learning_rate": 1.149314886164623e-05, + "loss": 1.0819, + "num_input_tokens_seen": 203494856, + "step": 12647 + }, + { + "epoch": 0.8859702119835037, + "grad_norm": 4.194119930267334, + "learning_rate": 1.148615061295972e-05, + "loss": 1.204, + "num_input_tokens_seen": 203510656, + "step": 12648 + }, + { + "epoch": 0.8860402602292329, + "grad_norm": 4.123297214508057, + "learning_rate": 1.1479152364273211e-05, + "loss": 0.9454, + "num_input_tokens_seen": 203527040, + "step": 12649 + }, + { + "epoch": 0.8861103084749621, + "grad_norm": 4.585346221923828, + "learning_rate": 1.147215411558669e-05, + "loss": 1.2653, + "num_input_tokens_seen": 203542448, + "step": 12650 + }, + { + "epoch": 0.8861803567206914, + "grad_norm": 3.7528719902038574, + "learning_rate": 1.1465155866900178e-05, + "loss": 1.0771, + "num_input_tokens_seen": 203558832, + "step": 12651 + }, + { + "epoch": 0.8862504049664206, + "grad_norm": 3.5222363471984863, + "learning_rate": 1.1458157618213656e-05, + "loss": 0.874, + "num_input_tokens_seen": 203575216, + "step": 12652 + }, + { + "epoch": 0.8863204532121499, + "grad_norm": 3.902620315551758, + "learning_rate": 1.1451159369527147e-05, + "loss": 1.1561, + "num_input_tokens_seen": 203591600, + "step": 12653 + }, + { + "epoch": 0.8863905014578791, + "grad_norm": 5.071498394012451, + "learning_rate": 1.1444161120840637e-05, + "loss": 1.1509, + "num_input_tokens_seen": 203607608, + "step": 12654 + }, + { + "epoch": 0.8864605497036083, + "grad_norm": 3.502537488937378, + "learning_rate": 1.1437162872154115e-05, + "loss": 0.9235, + "num_input_tokens_seen": 203623992, + "step": 12655 + }, + { + "epoch": 0.8865305979493376, + "grad_norm": 4.573894500732422, + "learning_rate": 1.1430164623467604e-05, + "loss": 1.1158, + "num_input_tokens_seen": 203640376, + "step": 12656 + }, + { + "epoch": 0.8866006461950668, + "grad_norm": 3.442065477371216, + "learning_rate": 1.1423166374781082e-05, + "loss": 0.856, + "num_input_tokens_seen": 203656760, + "step": 12657 + }, + { + "epoch": 0.886670694440796, + "grad_norm": 5.290163040161133, + "learning_rate": 1.1416168126094573e-05, + "loss": 1.1494, + "num_input_tokens_seen": 203673144, + "step": 12658 + }, + { + "epoch": 0.8867407426865254, + "grad_norm": 4.824940204620361, + "learning_rate": 1.1409169877408063e-05, + "loss": 1.017, + "num_input_tokens_seen": 203688912, + "step": 12659 + }, + { + "epoch": 0.8868107909322546, + "grad_norm": 4.397287845611572, + "learning_rate": 1.1402171628721541e-05, + "loss": 1.2192, + "num_input_tokens_seen": 203705296, + "step": 12660 + }, + { + "epoch": 0.8868808391779839, + "grad_norm": 3.67203688621521, + "learning_rate": 1.139517338003503e-05, + "loss": 1.1086, + "num_input_tokens_seen": 203721680, + "step": 12661 + }, + { + "epoch": 0.8869508874237131, + "grad_norm": 4.464781284332275, + "learning_rate": 1.1388175131348509e-05, + "loss": 1.1222, + "num_input_tokens_seen": 203737696, + "step": 12662 + }, + { + "epoch": 0.8870209356694423, + "grad_norm": 3.650658369064331, + "learning_rate": 1.1381176882661999e-05, + "loss": 0.9418, + "num_input_tokens_seen": 203754080, + "step": 12663 + }, + { + "epoch": 0.8870909839151716, + "grad_norm": 3.5596141815185547, + "learning_rate": 1.1374178633975477e-05, + "loss": 0.9055, + "num_input_tokens_seen": 203770464, + "step": 12664 + }, + { + "epoch": 0.8871610321609008, + "grad_norm": 6.462835788726807, + "learning_rate": 1.1367180385288967e-05, + "loss": 1.0438, + "num_input_tokens_seen": 203786848, + "step": 12665 + }, + { + "epoch": 0.88723108040663, + "grad_norm": 4.41941499710083, + "learning_rate": 1.1360182136602456e-05, + "loss": 1.0578, + "num_input_tokens_seen": 203801776, + "step": 12666 + }, + { + "epoch": 0.8873011286523593, + "grad_norm": 5.5245184898376465, + "learning_rate": 1.1353183887915935e-05, + "loss": 1.1967, + "num_input_tokens_seen": 203818160, + "step": 12667 + }, + { + "epoch": 0.8873711768980885, + "grad_norm": 4.376595497131348, + "learning_rate": 1.1346185639229425e-05, + "loss": 0.8817, + "num_input_tokens_seen": 203833584, + "step": 12668 + }, + { + "epoch": 0.8874412251438178, + "grad_norm": 4.4320783615112305, + "learning_rate": 1.1339187390542903e-05, + "loss": 1.0249, + "num_input_tokens_seen": 203849968, + "step": 12669 + }, + { + "epoch": 0.887511273389547, + "grad_norm": 4.340517520904541, + "learning_rate": 1.1332189141856394e-05, + "loss": 1.0156, + "num_input_tokens_seen": 203866352, + "step": 12670 + }, + { + "epoch": 0.8875813216352763, + "grad_norm": 3.749523878097534, + "learning_rate": 1.1325190893169882e-05, + "loss": 1.1399, + "num_input_tokens_seen": 203882736, + "step": 12671 + }, + { + "epoch": 0.8876513698810056, + "grad_norm": 3.80598521232605, + "learning_rate": 1.131819264448336e-05, + "loss": 1.1726, + "num_input_tokens_seen": 203899120, + "step": 12672 + }, + { + "epoch": 0.8877214181267348, + "grad_norm": 3.564436197280884, + "learning_rate": 1.131119439579685e-05, + "loss": 0.9397, + "num_input_tokens_seen": 203915208, + "step": 12673 + }, + { + "epoch": 0.8877914663724641, + "grad_norm": 3.9914798736572266, + "learning_rate": 1.130419614711033e-05, + "loss": 0.9577, + "num_input_tokens_seen": 203931280, + "step": 12674 + }, + { + "epoch": 0.8878615146181933, + "grad_norm": 4.691357135772705, + "learning_rate": 1.129719789842382e-05, + "loss": 1.1601, + "num_input_tokens_seen": 203947664, + "step": 12675 + }, + { + "epoch": 0.8879315628639225, + "grad_norm": 3.812379837036133, + "learning_rate": 1.1290199649737308e-05, + "loss": 1.0871, + "num_input_tokens_seen": 203963832, + "step": 12676 + }, + { + "epoch": 0.8880016111096518, + "grad_norm": 4.895635604858398, + "learning_rate": 1.1283201401050787e-05, + "loss": 1.0851, + "num_input_tokens_seen": 203980216, + "step": 12677 + }, + { + "epoch": 0.888071659355381, + "grad_norm": 4.791111469268799, + "learning_rate": 1.1276203152364277e-05, + "loss": 1.0208, + "num_input_tokens_seen": 203996600, + "step": 12678 + }, + { + "epoch": 0.8881417076011102, + "grad_norm": 4.284975528717041, + "learning_rate": 1.1269204903677755e-05, + "loss": 0.7626, + "num_input_tokens_seen": 204012248, + "step": 12679 + }, + { + "epoch": 0.8882117558468395, + "grad_norm": 4.01584529876709, + "learning_rate": 1.1262206654991246e-05, + "loss": 0.9328, + "num_input_tokens_seen": 204028352, + "step": 12680 + }, + { + "epoch": 0.8882818040925687, + "grad_norm": 4.6868977546691895, + "learning_rate": 1.1255208406304734e-05, + "loss": 1.0679, + "num_input_tokens_seen": 204044232, + "step": 12681 + }, + { + "epoch": 0.888351852338298, + "grad_norm": 5.224238872528076, + "learning_rate": 1.1248210157618213e-05, + "loss": 0.9131, + "num_input_tokens_seen": 204060616, + "step": 12682 + }, + { + "epoch": 0.8884219005840273, + "grad_norm": 5.436654567718506, + "learning_rate": 1.1241211908931703e-05, + "loss": 0.9787, + "num_input_tokens_seen": 204076120, + "step": 12683 + }, + { + "epoch": 0.8884919488297565, + "grad_norm": 5.744635105133057, + "learning_rate": 1.1234213660245181e-05, + "loss": 1.5788, + "num_input_tokens_seen": 204092504, + "step": 12684 + }, + { + "epoch": 0.8885619970754858, + "grad_norm": 4.575104713439941, + "learning_rate": 1.1227215411558672e-05, + "loss": 1.0955, + "num_input_tokens_seen": 204108888, + "step": 12685 + }, + { + "epoch": 0.888632045321215, + "grad_norm": 4.090554237365723, + "learning_rate": 1.122021716287216e-05, + "loss": 1.3329, + "num_input_tokens_seen": 204125272, + "step": 12686 + }, + { + "epoch": 0.8887020935669442, + "grad_norm": 4.415184020996094, + "learning_rate": 1.1213218914185639e-05, + "loss": 1.0163, + "num_input_tokens_seen": 204141656, + "step": 12687 + }, + { + "epoch": 0.8887721418126735, + "grad_norm": 6.465402603149414, + "learning_rate": 1.1206220665499129e-05, + "loss": 1.165, + "num_input_tokens_seen": 204156480, + "step": 12688 + }, + { + "epoch": 0.8888421900584027, + "grad_norm": 4.015601634979248, + "learning_rate": 1.1199222416812607e-05, + "loss": 1.1579, + "num_input_tokens_seen": 204171752, + "step": 12689 + }, + { + "epoch": 0.888912238304132, + "grad_norm": 4.026886940002441, + "learning_rate": 1.1192224168126098e-05, + "loss": 1.1049, + "num_input_tokens_seen": 204187032, + "step": 12690 + }, + { + "epoch": 0.8889822865498612, + "grad_norm": 4.066308498382568, + "learning_rate": 1.1185225919439576e-05, + "loss": 1.0676, + "num_input_tokens_seen": 204203416, + "step": 12691 + }, + { + "epoch": 0.8890523347955904, + "grad_norm": 4.137761116027832, + "learning_rate": 1.1178227670753065e-05, + "loss": 1.1582, + "num_input_tokens_seen": 204219800, + "step": 12692 + }, + { + "epoch": 0.8891223830413197, + "grad_norm": 4.114308834075928, + "learning_rate": 1.1171229422066555e-05, + "loss": 1.1551, + "num_input_tokens_seen": 204236168, + "step": 12693 + }, + { + "epoch": 0.889192431287049, + "grad_norm": 4.309042930603027, + "learning_rate": 1.1164231173380033e-05, + "loss": 1.1515, + "num_input_tokens_seen": 204252320, + "step": 12694 + }, + { + "epoch": 0.8892624795327781, + "grad_norm": 4.22834587097168, + "learning_rate": 1.1157232924693524e-05, + "loss": 1.0496, + "num_input_tokens_seen": 204268704, + "step": 12695 + }, + { + "epoch": 0.8893325277785075, + "grad_norm": 4.161382675170898, + "learning_rate": 1.1150234676007002e-05, + "loss": 1.019, + "num_input_tokens_seen": 204285088, + "step": 12696 + }, + { + "epoch": 0.8894025760242367, + "grad_norm": 4.120362758636475, + "learning_rate": 1.114323642732049e-05, + "loss": 1.1767, + "num_input_tokens_seen": 204301320, + "step": 12697 + }, + { + "epoch": 0.889472624269966, + "grad_norm": 3.753422498703003, + "learning_rate": 1.1136238178633981e-05, + "loss": 1.1704, + "num_input_tokens_seen": 204317632, + "step": 12698 + }, + { + "epoch": 0.8895426725156952, + "grad_norm": 4.732109546661377, + "learning_rate": 1.112923992994746e-05, + "loss": 1.0424, + "num_input_tokens_seen": 204333760, + "step": 12699 + }, + { + "epoch": 0.8896127207614244, + "grad_norm": 4.9418721199035645, + "learning_rate": 1.112224168126095e-05, + "loss": 0.9856, + "num_input_tokens_seen": 204349216, + "step": 12700 + }, + { + "epoch": 0.8896827690071537, + "grad_norm": 3.732372522354126, + "learning_rate": 1.1115243432574428e-05, + "loss": 1.2365, + "num_input_tokens_seen": 204365480, + "step": 12701 + }, + { + "epoch": 0.8897528172528829, + "grad_norm": 3.6092376708984375, + "learning_rate": 1.1108245183887917e-05, + "loss": 0.9191, + "num_input_tokens_seen": 204380952, + "step": 12702 + }, + { + "epoch": 0.8898228654986121, + "grad_norm": 5.024523735046387, + "learning_rate": 1.1101246935201407e-05, + "loss": 1.0431, + "num_input_tokens_seen": 204396312, + "step": 12703 + }, + { + "epoch": 0.8898929137443414, + "grad_norm": 4.15120267868042, + "learning_rate": 1.1094248686514885e-05, + "loss": 0.9837, + "num_input_tokens_seen": 204412144, + "step": 12704 + }, + { + "epoch": 0.8899629619900706, + "grad_norm": 3.4581830501556396, + "learning_rate": 1.1087250437828376e-05, + "loss": 1.0186, + "num_input_tokens_seen": 204428352, + "step": 12705 + }, + { + "epoch": 0.8900330102358, + "grad_norm": 3.712510824203491, + "learning_rate": 1.1080252189141854e-05, + "loss": 0.9363, + "num_input_tokens_seen": 204443744, + "step": 12706 + }, + { + "epoch": 0.8901030584815292, + "grad_norm": 4.2328948974609375, + "learning_rate": 1.1073253940455343e-05, + "loss": 1.2155, + "num_input_tokens_seen": 204459544, + "step": 12707 + }, + { + "epoch": 0.8901731067272584, + "grad_norm": 5.145882606506348, + "learning_rate": 1.1066255691768833e-05, + "loss": 1.0897, + "num_input_tokens_seen": 204475872, + "step": 12708 + }, + { + "epoch": 0.8902431549729877, + "grad_norm": 3.9659552574157715, + "learning_rate": 1.1059257443082311e-05, + "loss": 1.1289, + "num_input_tokens_seen": 204492256, + "step": 12709 + }, + { + "epoch": 0.8903132032187169, + "grad_norm": 3.703108072280884, + "learning_rate": 1.1052259194395802e-05, + "loss": 0.9464, + "num_input_tokens_seen": 204507448, + "step": 12710 + }, + { + "epoch": 0.8903832514644462, + "grad_norm": 4.181830406188965, + "learning_rate": 1.104526094570928e-05, + "loss": 1.1199, + "num_input_tokens_seen": 204523824, + "step": 12711 + }, + { + "epoch": 0.8904532997101754, + "grad_norm": 3.704587459564209, + "learning_rate": 1.1038262697022769e-05, + "loss": 1.0868, + "num_input_tokens_seen": 204539384, + "step": 12712 + }, + { + "epoch": 0.8905233479559046, + "grad_norm": 4.851490497589111, + "learning_rate": 1.1031264448336259e-05, + "loss": 1.1661, + "num_input_tokens_seen": 204554520, + "step": 12713 + }, + { + "epoch": 0.8905933962016339, + "grad_norm": 3.9408321380615234, + "learning_rate": 1.1024266199649737e-05, + "loss": 1.0458, + "num_input_tokens_seen": 204570904, + "step": 12714 + }, + { + "epoch": 0.8906634444473631, + "grad_norm": 4.01575231552124, + "learning_rate": 1.1017267950963228e-05, + "loss": 1.0005, + "num_input_tokens_seen": 204587288, + "step": 12715 + }, + { + "epoch": 0.8907334926930923, + "grad_norm": 4.693532943725586, + "learning_rate": 1.1010269702276706e-05, + "loss": 1.0998, + "num_input_tokens_seen": 204603672, + "step": 12716 + }, + { + "epoch": 0.8908035409388216, + "grad_norm": 4.866438388824463, + "learning_rate": 1.1003271453590195e-05, + "loss": 1.1457, + "num_input_tokens_seen": 204620056, + "step": 12717 + }, + { + "epoch": 0.8908735891845508, + "grad_norm": 4.008555889129639, + "learning_rate": 1.0996273204903673e-05, + "loss": 1.08, + "num_input_tokens_seen": 204636440, + "step": 12718 + }, + { + "epoch": 0.8909436374302802, + "grad_norm": 3.888995885848999, + "learning_rate": 1.0989274956217163e-05, + "loss": 1.011, + "num_input_tokens_seen": 204651712, + "step": 12719 + }, + { + "epoch": 0.8910136856760094, + "grad_norm": 3.5560851097106934, + "learning_rate": 1.0982276707530654e-05, + "loss": 0.9322, + "num_input_tokens_seen": 204668096, + "step": 12720 + }, + { + "epoch": 0.8910837339217386, + "grad_norm": 4.6521382331848145, + "learning_rate": 1.0975278458844132e-05, + "loss": 1.2343, + "num_input_tokens_seen": 204684400, + "step": 12721 + }, + { + "epoch": 0.8911537821674679, + "grad_norm": 4.952293872833252, + "learning_rate": 1.096828021015762e-05, + "loss": 1.1847, + "num_input_tokens_seen": 204700408, + "step": 12722 + }, + { + "epoch": 0.8912238304131971, + "grad_norm": 3.7201311588287354, + "learning_rate": 1.0961281961471099e-05, + "loss": 1.1462, + "num_input_tokens_seen": 204716792, + "step": 12723 + }, + { + "epoch": 0.8912938786589263, + "grad_norm": 4.006890773773193, + "learning_rate": 1.095428371278459e-05, + "loss": 1.1118, + "num_input_tokens_seen": 204733176, + "step": 12724 + }, + { + "epoch": 0.8913639269046556, + "grad_norm": 4.14346981048584, + "learning_rate": 1.094728546409808e-05, + "loss": 1.0813, + "num_input_tokens_seen": 204749560, + "step": 12725 + }, + { + "epoch": 0.8914339751503848, + "grad_norm": 4.3975677490234375, + "learning_rate": 1.0940287215411558e-05, + "loss": 1.0794, + "num_input_tokens_seen": 204765944, + "step": 12726 + }, + { + "epoch": 0.8915040233961141, + "grad_norm": 4.7937188148498535, + "learning_rate": 1.0933288966725047e-05, + "loss": 0.9626, + "num_input_tokens_seen": 204782328, + "step": 12727 + }, + { + "epoch": 0.8915740716418433, + "grad_norm": 4.681787014007568, + "learning_rate": 1.0926290718038525e-05, + "loss": 1.033, + "num_input_tokens_seen": 204798712, + "step": 12728 + }, + { + "epoch": 0.8916441198875725, + "grad_norm": 4.040367126464844, + "learning_rate": 1.0919292469352015e-05, + "loss": 0.9591, + "num_input_tokens_seen": 204815096, + "step": 12729 + }, + { + "epoch": 0.8917141681333018, + "grad_norm": 3.9239797592163086, + "learning_rate": 1.0912294220665504e-05, + "loss": 0.8398, + "num_input_tokens_seen": 204831272, + "step": 12730 + }, + { + "epoch": 0.891784216379031, + "grad_norm": 3.8803927898406982, + "learning_rate": 1.0905295971978984e-05, + "loss": 1.11, + "num_input_tokens_seen": 204847656, + "step": 12731 + }, + { + "epoch": 0.8918542646247603, + "grad_norm": 3.8660476207733154, + "learning_rate": 1.0898297723292473e-05, + "loss": 0.9715, + "num_input_tokens_seen": 204863176, + "step": 12732 + }, + { + "epoch": 0.8919243128704896, + "grad_norm": 4.107539653778076, + "learning_rate": 1.0891299474605951e-05, + "loss": 1.074, + "num_input_tokens_seen": 204878672, + "step": 12733 + }, + { + "epoch": 0.8919943611162188, + "grad_norm": 3.814284324645996, + "learning_rate": 1.0884301225919441e-05, + "loss": 1.1186, + "num_input_tokens_seen": 204894272, + "step": 12734 + }, + { + "epoch": 0.8920644093619481, + "grad_norm": 3.5680246353149414, + "learning_rate": 1.087730297723293e-05, + "loss": 0.9046, + "num_input_tokens_seen": 204910112, + "step": 12735 + }, + { + "epoch": 0.8921344576076773, + "grad_norm": 3.9398746490478516, + "learning_rate": 1.087030472854641e-05, + "loss": 1.0408, + "num_input_tokens_seen": 204926496, + "step": 12736 + }, + { + "epoch": 0.8922045058534065, + "grad_norm": 3.9033074378967285, + "learning_rate": 1.0863306479859899e-05, + "loss": 1.0145, + "num_input_tokens_seen": 204942736, + "step": 12737 + }, + { + "epoch": 0.8922745540991358, + "grad_norm": 3.7678627967834473, + "learning_rate": 1.0856308231173377e-05, + "loss": 1.0677, + "num_input_tokens_seen": 204958336, + "step": 12738 + }, + { + "epoch": 0.892344602344865, + "grad_norm": 4.389559268951416, + "learning_rate": 1.0849309982486867e-05, + "loss": 1.4174, + "num_input_tokens_seen": 204974400, + "step": 12739 + }, + { + "epoch": 0.8924146505905943, + "grad_norm": 4.158446311950684, + "learning_rate": 1.0842311733800356e-05, + "loss": 1.0213, + "num_input_tokens_seen": 204990584, + "step": 12740 + }, + { + "epoch": 0.8924846988363235, + "grad_norm": 4.261350631713867, + "learning_rate": 1.0835313485113836e-05, + "loss": 1.1207, + "num_input_tokens_seen": 205006368, + "step": 12741 + }, + { + "epoch": 0.8925547470820527, + "grad_norm": 4.63979434967041, + "learning_rate": 1.0828315236427325e-05, + "loss": 1.0442, + "num_input_tokens_seen": 205022752, + "step": 12742 + }, + { + "epoch": 0.892624795327782, + "grad_norm": 4.248417377471924, + "learning_rate": 1.0821316987740803e-05, + "loss": 1.1443, + "num_input_tokens_seen": 205039136, + "step": 12743 + }, + { + "epoch": 0.8926948435735113, + "grad_norm": 4.217923641204834, + "learning_rate": 1.0814318739054293e-05, + "loss": 0.9622, + "num_input_tokens_seen": 205055520, + "step": 12744 + }, + { + "epoch": 0.8927648918192405, + "grad_norm": 3.6576955318450928, + "learning_rate": 1.0807320490367772e-05, + "loss": 1.0269, + "num_input_tokens_seen": 205071048, + "step": 12745 + }, + { + "epoch": 0.8928349400649698, + "grad_norm": 4.138359546661377, + "learning_rate": 1.0800322241681262e-05, + "loss": 1.2812, + "num_input_tokens_seen": 205086784, + "step": 12746 + }, + { + "epoch": 0.892904988310699, + "grad_norm": 3.865264415740967, + "learning_rate": 1.079332399299475e-05, + "loss": 0.9663, + "num_input_tokens_seen": 205103168, + "step": 12747 + }, + { + "epoch": 0.8929750365564283, + "grad_norm": 4.385140419006348, + "learning_rate": 1.0786325744308229e-05, + "loss": 0.8329, + "num_input_tokens_seen": 205119552, + "step": 12748 + }, + { + "epoch": 0.8930450848021575, + "grad_norm": 4.502419471740723, + "learning_rate": 1.077932749562172e-05, + "loss": 1.0377, + "num_input_tokens_seen": 205134840, + "step": 12749 + }, + { + "epoch": 0.8931151330478867, + "grad_norm": 3.6818301677703857, + "learning_rate": 1.0772329246935198e-05, + "loss": 0.8962, + "num_input_tokens_seen": 205151224, + "step": 12750 + }, + { + "epoch": 0.893185181293616, + "grad_norm": 4.132830619812012, + "learning_rate": 1.0765330998248686e-05, + "loss": 1.0371, + "num_input_tokens_seen": 205167312, + "step": 12751 + }, + { + "epoch": 0.8932552295393452, + "grad_norm": 3.7152721881866455, + "learning_rate": 1.0758332749562177e-05, + "loss": 0.9206, + "num_input_tokens_seen": 205182840, + "step": 12752 + }, + { + "epoch": 0.8933252777850744, + "grad_norm": 4.311387062072754, + "learning_rate": 1.0751334500875655e-05, + "loss": 1.0534, + "num_input_tokens_seen": 205199208, + "step": 12753 + }, + { + "epoch": 0.8933953260308037, + "grad_norm": 3.597762107849121, + "learning_rate": 1.0744336252189145e-05, + "loss": 0.8838, + "num_input_tokens_seen": 205214864, + "step": 12754 + }, + { + "epoch": 0.8934653742765329, + "grad_norm": 4.119356632232666, + "learning_rate": 1.0737338003502624e-05, + "loss": 1.1051, + "num_input_tokens_seen": 205231248, + "step": 12755 + }, + { + "epoch": 0.8935354225222623, + "grad_norm": 3.7663848400115967, + "learning_rate": 1.0730339754816112e-05, + "loss": 0.9277, + "num_input_tokens_seen": 205247224, + "step": 12756 + }, + { + "epoch": 0.8936054707679915, + "grad_norm": 3.887713670730591, + "learning_rate": 1.0723341506129603e-05, + "loss": 0.9445, + "num_input_tokens_seen": 205263608, + "step": 12757 + }, + { + "epoch": 0.8936755190137207, + "grad_norm": 4.153920650482178, + "learning_rate": 1.0716343257443081e-05, + "loss": 0.9679, + "num_input_tokens_seen": 205279456, + "step": 12758 + }, + { + "epoch": 0.89374556725945, + "grad_norm": 4.783162593841553, + "learning_rate": 1.0709345008756571e-05, + "loss": 1.1365, + "num_input_tokens_seen": 205294824, + "step": 12759 + }, + { + "epoch": 0.8938156155051792, + "grad_norm": 4.095372200012207, + "learning_rate": 1.070234676007005e-05, + "loss": 1.157, + "num_input_tokens_seen": 205310368, + "step": 12760 + }, + { + "epoch": 0.8938856637509084, + "grad_norm": 3.767780303955078, + "learning_rate": 1.0695348511383538e-05, + "loss": 1.1082, + "num_input_tokens_seen": 205326752, + "step": 12761 + }, + { + "epoch": 0.8939557119966377, + "grad_norm": 3.6640048027038574, + "learning_rate": 1.0688350262697029e-05, + "loss": 1.068, + "num_input_tokens_seen": 205342224, + "step": 12762 + }, + { + "epoch": 0.8940257602423669, + "grad_norm": 4.246589660644531, + "learning_rate": 1.0681352014010507e-05, + "loss": 1.1632, + "num_input_tokens_seen": 205358032, + "step": 12763 + }, + { + "epoch": 0.8940958084880962, + "grad_norm": 4.523597240447998, + "learning_rate": 1.0674353765323997e-05, + "loss": 1.2278, + "num_input_tokens_seen": 205374416, + "step": 12764 + }, + { + "epoch": 0.8941658567338254, + "grad_norm": 4.507352352142334, + "learning_rate": 1.0667355516637476e-05, + "loss": 1.1026, + "num_input_tokens_seen": 205390440, + "step": 12765 + }, + { + "epoch": 0.8942359049795546, + "grad_norm": 4.757911682128906, + "learning_rate": 1.0660357267950964e-05, + "loss": 1.3588, + "num_input_tokens_seen": 205406640, + "step": 12766 + }, + { + "epoch": 0.894305953225284, + "grad_norm": 7.188485145568848, + "learning_rate": 1.0653359019264455e-05, + "loss": 0.916, + "num_input_tokens_seen": 205421928, + "step": 12767 + }, + { + "epoch": 0.8943760014710131, + "grad_norm": 5.692143440246582, + "learning_rate": 1.0646360770577933e-05, + "loss": 0.968, + "num_input_tokens_seen": 205438312, + "step": 12768 + }, + { + "epoch": 0.8944460497167424, + "grad_norm": 3.678093433380127, + "learning_rate": 1.0639362521891423e-05, + "loss": 1.0251, + "num_input_tokens_seen": 205454696, + "step": 12769 + }, + { + "epoch": 0.8945160979624717, + "grad_norm": 3.581860303878784, + "learning_rate": 1.0632364273204902e-05, + "loss": 1.0009, + "num_input_tokens_seen": 205470832, + "step": 12770 + }, + { + "epoch": 0.8945861462082009, + "grad_norm": 4.782081127166748, + "learning_rate": 1.062536602451839e-05, + "loss": 1.1111, + "num_input_tokens_seen": 205487216, + "step": 12771 + }, + { + "epoch": 0.8946561944539302, + "grad_norm": 4.475836277008057, + "learning_rate": 1.0618367775831869e-05, + "loss": 1.2437, + "num_input_tokens_seen": 205502512, + "step": 12772 + }, + { + "epoch": 0.8947262426996594, + "grad_norm": 4.174612522125244, + "learning_rate": 1.061136952714536e-05, + "loss": 0.9565, + "num_input_tokens_seen": 205518896, + "step": 12773 + }, + { + "epoch": 0.8947962909453886, + "grad_norm": 4.19733190536499, + "learning_rate": 1.060437127845885e-05, + "loss": 1.0273, + "num_input_tokens_seen": 205535280, + "step": 12774 + }, + { + "epoch": 0.8948663391911179, + "grad_norm": 4.5620622634887695, + "learning_rate": 1.0597373029772328e-05, + "loss": 1.107, + "num_input_tokens_seen": 205551664, + "step": 12775 + }, + { + "epoch": 0.8949363874368471, + "grad_norm": 4.051815509796143, + "learning_rate": 1.0590374781085816e-05, + "loss": 0.9932, + "num_input_tokens_seen": 205567648, + "step": 12776 + }, + { + "epoch": 0.8950064356825764, + "grad_norm": 4.6475138664245605, + "learning_rate": 1.0583376532399295e-05, + "loss": 1.0543, + "num_input_tokens_seen": 205583560, + "step": 12777 + }, + { + "epoch": 0.8950764839283056, + "grad_norm": 3.940277099609375, + "learning_rate": 1.0576378283712785e-05, + "loss": 1.0869, + "num_input_tokens_seen": 205599232, + "step": 12778 + }, + { + "epoch": 0.8951465321740348, + "grad_norm": 5.537580966949463, + "learning_rate": 1.0569380035026275e-05, + "loss": 1.0076, + "num_input_tokens_seen": 205615616, + "step": 12779 + }, + { + "epoch": 0.8952165804197642, + "grad_norm": 3.9762701988220215, + "learning_rate": 1.0562381786339754e-05, + "loss": 1.1049, + "num_input_tokens_seen": 205632000, + "step": 12780 + }, + { + "epoch": 0.8952866286654934, + "grad_norm": 3.8344132900238037, + "learning_rate": 1.0555383537653242e-05, + "loss": 0.9496, + "num_input_tokens_seen": 205648384, + "step": 12781 + }, + { + "epoch": 0.8953566769112226, + "grad_norm": 4.1706438064575195, + "learning_rate": 1.0548385288966721e-05, + "loss": 1.0368, + "num_input_tokens_seen": 205664768, + "step": 12782 + }, + { + "epoch": 0.8954267251569519, + "grad_norm": 4.091454982757568, + "learning_rate": 1.0541387040280211e-05, + "loss": 1.1276, + "num_input_tokens_seen": 205681104, + "step": 12783 + }, + { + "epoch": 0.8954967734026811, + "grad_norm": 4.253333568572998, + "learning_rate": 1.0534388791593701e-05, + "loss": 1.2046, + "num_input_tokens_seen": 205697000, + "step": 12784 + }, + { + "epoch": 0.8955668216484104, + "grad_norm": 4.834908485412598, + "learning_rate": 1.052739054290718e-05, + "loss": 1.1556, + "num_input_tokens_seen": 205712352, + "step": 12785 + }, + { + "epoch": 0.8956368698941396, + "grad_norm": 4.8462324142456055, + "learning_rate": 1.0520392294220668e-05, + "loss": 1.1693, + "num_input_tokens_seen": 205728448, + "step": 12786 + }, + { + "epoch": 0.8957069181398688, + "grad_norm": 5.827909469604492, + "learning_rate": 1.0513394045534147e-05, + "loss": 1.1524, + "num_input_tokens_seen": 205743456, + "step": 12787 + }, + { + "epoch": 0.8957769663855981, + "grad_norm": 3.643212080001831, + "learning_rate": 1.0506395796847637e-05, + "loss": 0.8341, + "num_input_tokens_seen": 205758864, + "step": 12788 + }, + { + "epoch": 0.8958470146313273, + "grad_norm": 5.922754764556885, + "learning_rate": 1.0499397548161127e-05, + "loss": 1.1758, + "num_input_tokens_seen": 205775248, + "step": 12789 + }, + { + "epoch": 0.8959170628770565, + "grad_norm": 3.5493719577789307, + "learning_rate": 1.0492399299474606e-05, + "loss": 0.9376, + "num_input_tokens_seen": 205790312, + "step": 12790 + }, + { + "epoch": 0.8959871111227858, + "grad_norm": 3.7190961837768555, + "learning_rate": 1.0485401050788095e-05, + "loss": 1.142, + "num_input_tokens_seen": 205806696, + "step": 12791 + }, + { + "epoch": 0.896057159368515, + "grad_norm": 4.968666076660156, + "learning_rate": 1.0478402802101573e-05, + "loss": 1.1428, + "num_input_tokens_seen": 205822168, + "step": 12792 + }, + { + "epoch": 0.8961272076142444, + "grad_norm": 3.7896578311920166, + "learning_rate": 1.0471404553415063e-05, + "loss": 0.9177, + "num_input_tokens_seen": 205838536, + "step": 12793 + }, + { + "epoch": 0.8961972558599736, + "grad_norm": 4.737802505493164, + "learning_rate": 1.0464406304728553e-05, + "loss": 1.0385, + "num_input_tokens_seen": 205854920, + "step": 12794 + }, + { + "epoch": 0.8962673041057028, + "grad_norm": 3.4700424671173096, + "learning_rate": 1.0457408056042032e-05, + "loss": 1.077, + "num_input_tokens_seen": 205871096, + "step": 12795 + }, + { + "epoch": 0.8963373523514321, + "grad_norm": 3.5950992107391357, + "learning_rate": 1.045040980735552e-05, + "loss": 0.9393, + "num_input_tokens_seen": 205887480, + "step": 12796 + }, + { + "epoch": 0.8964074005971613, + "grad_norm": 3.926687717437744, + "learning_rate": 1.0443411558668999e-05, + "loss": 1.0693, + "num_input_tokens_seen": 205902880, + "step": 12797 + }, + { + "epoch": 0.8964774488428905, + "grad_norm": 4.453563213348389, + "learning_rate": 1.043641330998249e-05, + "loss": 1.0519, + "num_input_tokens_seen": 205919264, + "step": 12798 + }, + { + "epoch": 0.8965474970886198, + "grad_norm": 3.6000733375549316, + "learning_rate": 1.0429415061295968e-05, + "loss": 0.951, + "num_input_tokens_seen": 205935648, + "step": 12799 + }, + { + "epoch": 0.896617545334349, + "grad_norm": 3.7590701580047607, + "learning_rate": 1.0422416812609458e-05, + "loss": 1.096, + "num_input_tokens_seen": 205952032, + "step": 12800 + }, + { + "epoch": 0.896617545334349, + "eval_loss": 1.1163299083709717, + "eval_runtime": 0.8319, + "eval_samples_per_second": 1.202, + "eval_steps_per_second": 1.202, + "num_input_tokens_seen": 205952032, + "step": 12800 + }, + { + "epoch": 0.8966875935800783, + "grad_norm": 4.134552955627441, + "learning_rate": 1.0415418563922947e-05, + "loss": 0.9709, + "num_input_tokens_seen": 205968224, + "step": 12801 + }, + { + "epoch": 0.8967576418258075, + "grad_norm": 3.9842371940612793, + "learning_rate": 1.0408420315236425e-05, + "loss": 1.1384, + "num_input_tokens_seen": 205984424, + "step": 12802 + }, + { + "epoch": 0.8968276900715367, + "grad_norm": 6.153411865234375, + "learning_rate": 1.0401422066549915e-05, + "loss": 1.0533, + "num_input_tokens_seen": 205999536, + "step": 12803 + }, + { + "epoch": 0.896897738317266, + "grad_norm": 5.525405406951904, + "learning_rate": 1.0394423817863394e-05, + "loss": 1.0807, + "num_input_tokens_seen": 206015920, + "step": 12804 + }, + { + "epoch": 0.8969677865629953, + "grad_norm": 3.806974172592163, + "learning_rate": 1.0387425569176884e-05, + "loss": 1.0534, + "num_input_tokens_seen": 206031304, + "step": 12805 + }, + { + "epoch": 0.8970378348087245, + "grad_norm": 6.930634021759033, + "learning_rate": 1.0380427320490373e-05, + "loss": 1.0804, + "num_input_tokens_seen": 206047688, + "step": 12806 + }, + { + "epoch": 0.8971078830544538, + "grad_norm": 3.6126344203948975, + "learning_rate": 1.0373429071803851e-05, + "loss": 1.0122, + "num_input_tokens_seen": 206064072, + "step": 12807 + }, + { + "epoch": 0.897177931300183, + "grad_norm": 3.422776699066162, + "learning_rate": 1.0366430823117341e-05, + "loss": 0.95, + "num_input_tokens_seen": 206080456, + "step": 12808 + }, + { + "epoch": 0.8972479795459123, + "grad_norm": 3.5997068881988525, + "learning_rate": 1.035943257443082e-05, + "loss": 0.9557, + "num_input_tokens_seen": 206096840, + "step": 12809 + }, + { + "epoch": 0.8973180277916415, + "grad_norm": 3.91896653175354, + "learning_rate": 1.035243432574431e-05, + "loss": 0.9414, + "num_input_tokens_seen": 206113224, + "step": 12810 + }, + { + "epoch": 0.8973880760373707, + "grad_norm": 3.794443368911743, + "learning_rate": 1.0345436077057799e-05, + "loss": 0.941, + "num_input_tokens_seen": 206129384, + "step": 12811 + }, + { + "epoch": 0.8974581242831, + "grad_norm": 3.7103588581085205, + "learning_rate": 1.0338437828371277e-05, + "loss": 1.1203, + "num_input_tokens_seen": 206145464, + "step": 12812 + }, + { + "epoch": 0.8975281725288292, + "grad_norm": 4.205925941467285, + "learning_rate": 1.0331439579684767e-05, + "loss": 0.9824, + "num_input_tokens_seen": 206161624, + "step": 12813 + }, + { + "epoch": 0.8975982207745585, + "grad_norm": 4.226700305938721, + "learning_rate": 1.0324441330998246e-05, + "loss": 1.0674, + "num_input_tokens_seen": 206177448, + "step": 12814 + }, + { + "epoch": 0.8976682690202877, + "grad_norm": 4.8231306076049805, + "learning_rate": 1.0317443082311736e-05, + "loss": 0.9937, + "num_input_tokens_seen": 206193832, + "step": 12815 + }, + { + "epoch": 0.8977383172660169, + "grad_norm": 3.8949713706970215, + "learning_rate": 1.0310444833625225e-05, + "loss": 1.0797, + "num_input_tokens_seen": 206210216, + "step": 12816 + }, + { + "epoch": 0.8978083655117463, + "grad_norm": 4.604421615600586, + "learning_rate": 1.0303446584938703e-05, + "loss": 1.0736, + "num_input_tokens_seen": 206225440, + "step": 12817 + }, + { + "epoch": 0.8978784137574755, + "grad_norm": 3.706749439239502, + "learning_rate": 1.0296448336252193e-05, + "loss": 1.1199, + "num_input_tokens_seen": 206241824, + "step": 12818 + }, + { + "epoch": 0.8979484620032047, + "grad_norm": 3.769963264465332, + "learning_rate": 1.0289450087565672e-05, + "loss": 1.1352, + "num_input_tokens_seen": 206258152, + "step": 12819 + }, + { + "epoch": 0.898018510248934, + "grad_norm": 3.607473134994507, + "learning_rate": 1.0282451838879162e-05, + "loss": 1.0334, + "num_input_tokens_seen": 206274536, + "step": 12820 + }, + { + "epoch": 0.8980885584946632, + "grad_norm": 3.7783496379852295, + "learning_rate": 1.027545359019264e-05, + "loss": 1.093, + "num_input_tokens_seen": 206290920, + "step": 12821 + }, + { + "epoch": 0.8981586067403925, + "grad_norm": 3.680699348449707, + "learning_rate": 1.0268455341506129e-05, + "loss": 1.1595, + "num_input_tokens_seen": 206307152, + "step": 12822 + }, + { + "epoch": 0.8982286549861217, + "grad_norm": 3.40242338180542, + "learning_rate": 1.026145709281962e-05, + "loss": 0.8342, + "num_input_tokens_seen": 206322832, + "step": 12823 + }, + { + "epoch": 0.8982987032318509, + "grad_norm": 4.114190101623535, + "learning_rate": 1.0254458844133098e-05, + "loss": 1.0805, + "num_input_tokens_seen": 206337656, + "step": 12824 + }, + { + "epoch": 0.8983687514775802, + "grad_norm": 3.5946953296661377, + "learning_rate": 1.0247460595446588e-05, + "loss": 1.1029, + "num_input_tokens_seen": 206353552, + "step": 12825 + }, + { + "epoch": 0.8984387997233094, + "grad_norm": 5.311532497406006, + "learning_rate": 1.0240462346760066e-05, + "loss": 1.104, + "num_input_tokens_seen": 206369936, + "step": 12826 + }, + { + "epoch": 0.8985088479690386, + "grad_norm": 4.307740211486816, + "learning_rate": 1.0233464098073555e-05, + "loss": 1.1524, + "num_input_tokens_seen": 206384728, + "step": 12827 + }, + { + "epoch": 0.8985788962147679, + "grad_norm": 4.263360500335693, + "learning_rate": 1.0226465849387045e-05, + "loss": 1.0065, + "num_input_tokens_seen": 206400696, + "step": 12828 + }, + { + "epoch": 0.8986489444604971, + "grad_norm": 4.757111072540283, + "learning_rate": 1.0219467600700524e-05, + "loss": 1.1887, + "num_input_tokens_seen": 206417080, + "step": 12829 + }, + { + "epoch": 0.8987189927062265, + "grad_norm": 4.438943862915039, + "learning_rate": 1.0212469352014014e-05, + "loss": 1.0879, + "num_input_tokens_seen": 206433464, + "step": 12830 + }, + { + "epoch": 0.8987890409519557, + "grad_norm": 6.071875095367432, + "learning_rate": 1.0205471103327492e-05, + "loss": 1.0637, + "num_input_tokens_seen": 206449400, + "step": 12831 + }, + { + "epoch": 0.8988590891976849, + "grad_norm": 3.467252492904663, + "learning_rate": 1.0198472854640981e-05, + "loss": 0.9811, + "num_input_tokens_seen": 206465472, + "step": 12832 + }, + { + "epoch": 0.8989291374434142, + "grad_norm": 3.7486941814422607, + "learning_rate": 1.0191474605954471e-05, + "loss": 1.0259, + "num_input_tokens_seen": 206481448, + "step": 12833 + }, + { + "epoch": 0.8989991856891434, + "grad_norm": 4.869624614715576, + "learning_rate": 1.018447635726795e-05, + "loss": 1.0102, + "num_input_tokens_seen": 206497152, + "step": 12834 + }, + { + "epoch": 0.8990692339348726, + "grad_norm": 4.445520877838135, + "learning_rate": 1.017747810858144e-05, + "loss": 1.0115, + "num_input_tokens_seen": 206513536, + "step": 12835 + }, + { + "epoch": 0.8991392821806019, + "grad_norm": 4.345690727233887, + "learning_rate": 1.0170479859894918e-05, + "loss": 1.183, + "num_input_tokens_seen": 206529120, + "step": 12836 + }, + { + "epoch": 0.8992093304263311, + "grad_norm": 4.9699907302856445, + "learning_rate": 1.0163481611208407e-05, + "loss": 1.1341, + "num_input_tokens_seen": 206544936, + "step": 12837 + }, + { + "epoch": 0.8992793786720604, + "grad_norm": 3.7695329189300537, + "learning_rate": 1.0156483362521897e-05, + "loss": 1.0233, + "num_input_tokens_seen": 206561320, + "step": 12838 + }, + { + "epoch": 0.8993494269177896, + "grad_norm": 4.013484477996826, + "learning_rate": 1.0149485113835376e-05, + "loss": 1.2189, + "num_input_tokens_seen": 206577688, + "step": 12839 + }, + { + "epoch": 0.8994194751635188, + "grad_norm": 3.9423582553863525, + "learning_rate": 1.0142486865148866e-05, + "loss": 0.9282, + "num_input_tokens_seen": 206594072, + "step": 12840 + }, + { + "epoch": 0.8994895234092481, + "grad_norm": 7.315654277801514, + "learning_rate": 1.0135488616462344e-05, + "loss": 1.0355, + "num_input_tokens_seen": 206609416, + "step": 12841 + }, + { + "epoch": 0.8995595716549774, + "grad_norm": 3.9201135635375977, + "learning_rate": 1.0128490367775833e-05, + "loss": 1.0161, + "num_input_tokens_seen": 206625800, + "step": 12842 + }, + { + "epoch": 0.8996296199007066, + "grad_norm": 3.767376184463501, + "learning_rate": 1.0121492119089323e-05, + "loss": 0.8986, + "num_input_tokens_seen": 206641264, + "step": 12843 + }, + { + "epoch": 0.8996996681464359, + "grad_norm": 3.7077198028564453, + "learning_rate": 1.0114493870402802e-05, + "loss": 0.9127, + "num_input_tokens_seen": 206657648, + "step": 12844 + }, + { + "epoch": 0.8997697163921651, + "grad_norm": 4.8657708168029785, + "learning_rate": 1.0107495621716292e-05, + "loss": 1.1832, + "num_input_tokens_seen": 206673968, + "step": 12845 + }, + { + "epoch": 0.8998397646378944, + "grad_norm": 3.4755969047546387, + "learning_rate": 1.010049737302977e-05, + "loss": 1.0008, + "num_input_tokens_seen": 206690352, + "step": 12846 + }, + { + "epoch": 0.8999098128836236, + "grad_norm": 3.5904476642608643, + "learning_rate": 1.0093499124343259e-05, + "loss": 0.9705, + "num_input_tokens_seen": 206706736, + "step": 12847 + }, + { + "epoch": 0.8999798611293528, + "grad_norm": 3.971721887588501, + "learning_rate": 1.0086500875656737e-05, + "loss": 0.9187, + "num_input_tokens_seen": 206722584, + "step": 12848 + }, + { + "epoch": 0.9000499093750821, + "grad_norm": 3.729898452758789, + "learning_rate": 1.0079502626970228e-05, + "loss": 1.0715, + "num_input_tokens_seen": 206738944, + "step": 12849 + }, + { + "epoch": 0.9001199576208113, + "grad_norm": 3.8826589584350586, + "learning_rate": 1.0072504378283718e-05, + "loss": 1.1047, + "num_input_tokens_seen": 206754488, + "step": 12850 + }, + { + "epoch": 0.9001900058665406, + "grad_norm": 3.4438259601593018, + "learning_rate": 1.0065506129597196e-05, + "loss": 0.86, + "num_input_tokens_seen": 206770632, + "step": 12851 + }, + { + "epoch": 0.9002600541122698, + "grad_norm": 4.990970611572266, + "learning_rate": 1.0058507880910685e-05, + "loss": 1.2025, + "num_input_tokens_seen": 206787016, + "step": 12852 + }, + { + "epoch": 0.900330102357999, + "grad_norm": 5.725255012512207, + "learning_rate": 1.0051509632224163e-05, + "loss": 1.1916, + "num_input_tokens_seen": 206803400, + "step": 12853 + }, + { + "epoch": 0.9004001506037284, + "grad_norm": 6.1324005126953125, + "learning_rate": 1.0044511383537654e-05, + "loss": 1.1785, + "num_input_tokens_seen": 206819608, + "step": 12854 + }, + { + "epoch": 0.9004701988494576, + "grad_norm": 3.997680187225342, + "learning_rate": 1.0037513134851144e-05, + "loss": 1.1274, + "num_input_tokens_seen": 206835688, + "step": 12855 + }, + { + "epoch": 0.9005402470951868, + "grad_norm": 3.545053005218506, + "learning_rate": 1.0030514886164622e-05, + "loss": 1.1581, + "num_input_tokens_seen": 206851448, + "step": 12856 + }, + { + "epoch": 0.9006102953409161, + "grad_norm": 3.549908399581909, + "learning_rate": 1.0023516637478111e-05, + "loss": 1.0181, + "num_input_tokens_seen": 206866928, + "step": 12857 + }, + { + "epoch": 0.9006803435866453, + "grad_norm": 3.916532278060913, + "learning_rate": 1.001651838879159e-05, + "loss": 0.9762, + "num_input_tokens_seen": 206883176, + "step": 12858 + }, + { + "epoch": 0.9007503918323746, + "grad_norm": 4.589010715484619, + "learning_rate": 1.000952014010508e-05, + "loss": 0.9739, + "num_input_tokens_seen": 206899376, + "step": 12859 + }, + { + "epoch": 0.9008204400781038, + "grad_norm": 4.356659412384033, + "learning_rate": 1.000252189141857e-05, + "loss": 1.0057, + "num_input_tokens_seen": 206915216, + "step": 12860 + }, + { + "epoch": 0.900890488323833, + "grad_norm": 3.9849157333374023, + "learning_rate": 9.995523642732048e-06, + "loss": 0.8728, + "num_input_tokens_seen": 206931600, + "step": 12861 + }, + { + "epoch": 0.9009605365695623, + "grad_norm": 4.146997451782227, + "learning_rate": 9.988525394045537e-06, + "loss": 1.2281, + "num_input_tokens_seen": 206947984, + "step": 12862 + }, + { + "epoch": 0.9010305848152915, + "grad_norm": 3.7092323303222656, + "learning_rate": 9.981527145359016e-06, + "loss": 1.0893, + "num_input_tokens_seen": 206963856, + "step": 12863 + }, + { + "epoch": 0.9011006330610207, + "grad_norm": 4.339881896972656, + "learning_rate": 9.974528896672506e-06, + "loss": 1.0622, + "num_input_tokens_seen": 206978912, + "step": 12864 + }, + { + "epoch": 0.90117068130675, + "grad_norm": 4.774752616882324, + "learning_rate": 9.967530647985996e-06, + "loss": 1.1552, + "num_input_tokens_seen": 206994864, + "step": 12865 + }, + { + "epoch": 0.9012407295524792, + "grad_norm": 4.005700588226318, + "learning_rate": 9.960532399299475e-06, + "loss": 1.145, + "num_input_tokens_seen": 207011104, + "step": 12866 + }, + { + "epoch": 0.9013107777982086, + "grad_norm": 3.887990713119507, + "learning_rate": 9.953534150612963e-06, + "loss": 1.0645, + "num_input_tokens_seen": 207027320, + "step": 12867 + }, + { + "epoch": 0.9013808260439378, + "grad_norm": 5.5870561599731445, + "learning_rate": 9.946535901926442e-06, + "loss": 0.9663, + "num_input_tokens_seen": 207043704, + "step": 12868 + }, + { + "epoch": 0.901450874289667, + "grad_norm": 4.3734331130981445, + "learning_rate": 9.939537653239932e-06, + "loss": 1.0164, + "num_input_tokens_seen": 207059928, + "step": 12869 + }, + { + "epoch": 0.9015209225353963, + "grad_norm": 6.498212814331055, + "learning_rate": 9.93253940455342e-06, + "loss": 1.0738, + "num_input_tokens_seen": 207075672, + "step": 12870 + }, + { + "epoch": 0.9015909707811255, + "grad_norm": 3.744643449783325, + "learning_rate": 9.9255411558669e-06, + "loss": 1.1182, + "num_input_tokens_seen": 207092056, + "step": 12871 + }, + { + "epoch": 0.9016610190268547, + "grad_norm": 4.745702743530273, + "learning_rate": 9.918542907180389e-06, + "loss": 0.9721, + "num_input_tokens_seen": 207107560, + "step": 12872 + }, + { + "epoch": 0.901731067272584, + "grad_norm": 3.9844791889190674, + "learning_rate": 9.911544658493868e-06, + "loss": 1.0528, + "num_input_tokens_seen": 207123160, + "step": 12873 + }, + { + "epoch": 0.9018011155183132, + "grad_norm": 4.183824062347412, + "learning_rate": 9.904546409807358e-06, + "loss": 0.9625, + "num_input_tokens_seen": 207139120, + "step": 12874 + }, + { + "epoch": 0.9018711637640425, + "grad_norm": 3.60221791267395, + "learning_rate": 9.897548161120836e-06, + "loss": 1.1905, + "num_input_tokens_seen": 207155504, + "step": 12875 + }, + { + "epoch": 0.9019412120097717, + "grad_norm": 4.188063621520996, + "learning_rate": 9.890549912434327e-06, + "loss": 1.1405, + "num_input_tokens_seen": 207171888, + "step": 12876 + }, + { + "epoch": 0.9020112602555009, + "grad_norm": 3.7000772953033447, + "learning_rate": 9.883551663747815e-06, + "loss": 0.9895, + "num_input_tokens_seen": 207188272, + "step": 12877 + }, + { + "epoch": 0.9020813085012303, + "grad_norm": 3.5896668434143066, + "learning_rate": 9.876553415061294e-06, + "loss": 1.112, + "num_input_tokens_seen": 207204440, + "step": 12878 + }, + { + "epoch": 0.9021513567469595, + "grad_norm": 4.9455437660217285, + "learning_rate": 9.869555166374784e-06, + "loss": 1.2105, + "num_input_tokens_seen": 207220824, + "step": 12879 + }, + { + "epoch": 0.9022214049926888, + "grad_norm": 4.934106826782227, + "learning_rate": 9.862556917688262e-06, + "loss": 0.9813, + "num_input_tokens_seen": 207236040, + "step": 12880 + }, + { + "epoch": 0.902291453238418, + "grad_norm": 5.254598617553711, + "learning_rate": 9.855558669001753e-06, + "loss": 1.024, + "num_input_tokens_seen": 207252016, + "step": 12881 + }, + { + "epoch": 0.9023615014841472, + "grad_norm": 4.385216236114502, + "learning_rate": 9.848560420315241e-06, + "loss": 0.9644, + "num_input_tokens_seen": 207268400, + "step": 12882 + }, + { + "epoch": 0.9024315497298765, + "grad_norm": 4.5486955642700195, + "learning_rate": 9.84156217162872e-06, + "loss": 1.1238, + "num_input_tokens_seen": 207284784, + "step": 12883 + }, + { + "epoch": 0.9025015979756057, + "grad_norm": 3.856724262237549, + "learning_rate": 9.83456392294221e-06, + "loss": 0.9656, + "num_input_tokens_seen": 207300952, + "step": 12884 + }, + { + "epoch": 0.9025716462213349, + "grad_norm": 3.9005239009857178, + "learning_rate": 9.827565674255688e-06, + "loss": 1.0196, + "num_input_tokens_seen": 207316984, + "step": 12885 + }, + { + "epoch": 0.9026416944670642, + "grad_norm": 4.938909530639648, + "learning_rate": 9.820567425569179e-06, + "loss": 1.0051, + "num_input_tokens_seen": 207333200, + "step": 12886 + }, + { + "epoch": 0.9027117427127934, + "grad_norm": 4.389106750488281, + "learning_rate": 9.813569176882667e-06, + "loss": 1.0022, + "num_input_tokens_seen": 207349352, + "step": 12887 + }, + { + "epoch": 0.9027817909585227, + "grad_norm": 5.383081436157227, + "learning_rate": 9.806570928196146e-06, + "loss": 1.1617, + "num_input_tokens_seen": 207365736, + "step": 12888 + }, + { + "epoch": 0.9028518392042519, + "grad_norm": 3.531245231628418, + "learning_rate": 9.799572679509636e-06, + "loss": 0.9769, + "num_input_tokens_seen": 207381400, + "step": 12889 + }, + { + "epoch": 0.9029218874499811, + "grad_norm": 3.7782986164093018, + "learning_rate": 9.792574430823114e-06, + "loss": 1.2207, + "num_input_tokens_seen": 207397456, + "step": 12890 + }, + { + "epoch": 0.9029919356957105, + "grad_norm": 5.043989181518555, + "learning_rate": 9.785576182136603e-06, + "loss": 1.1014, + "num_input_tokens_seen": 207413840, + "step": 12891 + }, + { + "epoch": 0.9030619839414397, + "grad_norm": 3.80126953125, + "learning_rate": 9.778577933450093e-06, + "loss": 1.1354, + "num_input_tokens_seen": 207430224, + "step": 12892 + }, + { + "epoch": 0.9031320321871689, + "grad_norm": 4.785402774810791, + "learning_rate": 9.771579684763572e-06, + "loss": 1.2119, + "num_input_tokens_seen": 207446064, + "step": 12893 + }, + { + "epoch": 0.9032020804328982, + "grad_norm": 5.528894901275635, + "learning_rate": 9.764581436077062e-06, + "loss": 1.0321, + "num_input_tokens_seen": 207461296, + "step": 12894 + }, + { + "epoch": 0.9032721286786274, + "grad_norm": 3.7744951248168945, + "learning_rate": 9.75758318739054e-06, + "loss": 1.0906, + "num_input_tokens_seen": 207476480, + "step": 12895 + }, + { + "epoch": 0.9033421769243567, + "grad_norm": 3.7924485206604004, + "learning_rate": 9.750584938704029e-06, + "loss": 0.992, + "num_input_tokens_seen": 207492752, + "step": 12896 + }, + { + "epoch": 0.9034122251700859, + "grad_norm": 3.502990961074829, + "learning_rate": 9.74358669001752e-06, + "loss": 0.8829, + "num_input_tokens_seen": 207507952, + "step": 12897 + }, + { + "epoch": 0.9034822734158151, + "grad_norm": 3.559575080871582, + "learning_rate": 9.736588441330998e-06, + "loss": 1.0204, + "num_input_tokens_seen": 207524336, + "step": 12898 + }, + { + "epoch": 0.9035523216615444, + "grad_norm": 5.148831367492676, + "learning_rate": 9.729590192644488e-06, + "loss": 0.9467, + "num_input_tokens_seen": 207539608, + "step": 12899 + }, + { + "epoch": 0.9036223699072736, + "grad_norm": 4.7795515060424805, + "learning_rate": 9.722591943957966e-06, + "loss": 1.2959, + "num_input_tokens_seen": 207555992, + "step": 12900 + }, + { + "epoch": 0.9036924181530028, + "grad_norm": 3.7832281589508057, + "learning_rate": 9.715593695271455e-06, + "loss": 0.8448, + "num_input_tokens_seen": 207572376, + "step": 12901 + }, + { + "epoch": 0.9037624663987321, + "grad_norm": 4.564687728881836, + "learning_rate": 9.708595446584935e-06, + "loss": 1.0792, + "num_input_tokens_seen": 207588472, + "step": 12902 + }, + { + "epoch": 0.9038325146444613, + "grad_norm": 3.408050298690796, + "learning_rate": 9.701597197898424e-06, + "loss": 0.9636, + "num_input_tokens_seen": 207603888, + "step": 12903 + }, + { + "epoch": 0.9039025628901907, + "grad_norm": 3.8292696475982666, + "learning_rate": 9.694598949211914e-06, + "loss": 1.0513, + "num_input_tokens_seen": 207620144, + "step": 12904 + }, + { + "epoch": 0.9039726111359199, + "grad_norm": 3.599189519882202, + "learning_rate": 9.687600700525392e-06, + "loss": 0.9273, + "num_input_tokens_seen": 207636528, + "step": 12905 + }, + { + "epoch": 0.9040426593816491, + "grad_norm": 3.8321776390075684, + "learning_rate": 9.680602451838881e-06, + "loss": 1.2282, + "num_input_tokens_seen": 207652912, + "step": 12906 + }, + { + "epoch": 0.9041127076273784, + "grad_norm": 4.6585001945495605, + "learning_rate": 9.673604203152361e-06, + "loss": 0.8681, + "num_input_tokens_seen": 207669296, + "step": 12907 + }, + { + "epoch": 0.9041827558731076, + "grad_norm": 3.820298910140991, + "learning_rate": 9.66660595446585e-06, + "loss": 0.884, + "num_input_tokens_seen": 207685680, + "step": 12908 + }, + { + "epoch": 0.9042528041188368, + "grad_norm": 3.412508487701416, + "learning_rate": 9.65960770577934e-06, + "loss": 0.7913, + "num_input_tokens_seen": 207701848, + "step": 12909 + }, + { + "epoch": 0.9043228523645661, + "grad_norm": 4.576141834259033, + "learning_rate": 9.652609457092818e-06, + "loss": 1.1244, + "num_input_tokens_seen": 207717824, + "step": 12910 + }, + { + "epoch": 0.9043929006102953, + "grad_norm": 3.5337913036346436, + "learning_rate": 9.645611208406307e-06, + "loss": 1.0258, + "num_input_tokens_seen": 207734208, + "step": 12911 + }, + { + "epoch": 0.9044629488560246, + "grad_norm": 3.891900062561035, + "learning_rate": 9.638612959719787e-06, + "loss": 1.1433, + "num_input_tokens_seen": 207750592, + "step": 12912 + }, + { + "epoch": 0.9045329971017538, + "grad_norm": 4.04062557220459, + "learning_rate": 9.631614711033276e-06, + "loss": 0.9849, + "num_input_tokens_seen": 207766720, + "step": 12913 + }, + { + "epoch": 0.904603045347483, + "grad_norm": 4.240694046020508, + "learning_rate": 9.624616462346766e-06, + "loss": 1.101, + "num_input_tokens_seen": 207783104, + "step": 12914 + }, + { + "epoch": 0.9046730935932124, + "grad_norm": 3.7026684284210205, + "learning_rate": 9.617618213660244e-06, + "loss": 0.9006, + "num_input_tokens_seen": 207799096, + "step": 12915 + }, + { + "epoch": 0.9047431418389416, + "grad_norm": 5.79006290435791, + "learning_rate": 9.610619964973733e-06, + "loss": 1.0798, + "num_input_tokens_seen": 207815336, + "step": 12916 + }, + { + "epoch": 0.9048131900846709, + "grad_norm": 3.928856611251831, + "learning_rate": 9.603621716287211e-06, + "loss": 1.1805, + "num_input_tokens_seen": 207831720, + "step": 12917 + }, + { + "epoch": 0.9048832383304001, + "grad_norm": 3.9499402046203613, + "learning_rate": 9.596623467600702e-06, + "loss": 0.9822, + "num_input_tokens_seen": 207848104, + "step": 12918 + }, + { + "epoch": 0.9049532865761293, + "grad_norm": 5.155144691467285, + "learning_rate": 9.589625218914192e-06, + "loss": 1.233, + "num_input_tokens_seen": 207863840, + "step": 12919 + }, + { + "epoch": 0.9050233348218586, + "grad_norm": 3.737583875656128, + "learning_rate": 9.58262697022767e-06, + "loss": 1.0339, + "num_input_tokens_seen": 207879600, + "step": 12920 + }, + { + "epoch": 0.9050933830675878, + "grad_norm": 4.049322605133057, + "learning_rate": 9.575628721541159e-06, + "loss": 1.0517, + "num_input_tokens_seen": 207895984, + "step": 12921 + }, + { + "epoch": 0.905163431313317, + "grad_norm": 3.8788633346557617, + "learning_rate": 9.568630472854637e-06, + "loss": 1.0178, + "num_input_tokens_seen": 207912216, + "step": 12922 + }, + { + "epoch": 0.9052334795590463, + "grad_norm": 3.9737536907196045, + "learning_rate": 9.561632224168128e-06, + "loss": 0.9527, + "num_input_tokens_seen": 207928600, + "step": 12923 + }, + { + "epoch": 0.9053035278047755, + "grad_norm": 4.204390525817871, + "learning_rate": 9.554633975481618e-06, + "loss": 0.8983, + "num_input_tokens_seen": 207944984, + "step": 12924 + }, + { + "epoch": 0.9053735760505048, + "grad_norm": 3.5716028213500977, + "learning_rate": 9.547635726795096e-06, + "loss": 0.932, + "num_input_tokens_seen": 207961368, + "step": 12925 + }, + { + "epoch": 0.905443624296234, + "grad_norm": 3.730551242828369, + "learning_rate": 9.540637478108585e-06, + "loss": 1.0941, + "num_input_tokens_seen": 207977720, + "step": 12926 + }, + { + "epoch": 0.9055136725419632, + "grad_norm": 4.1951775550842285, + "learning_rate": 9.533639229422063e-06, + "loss": 1.1867, + "num_input_tokens_seen": 207993272, + "step": 12927 + }, + { + "epoch": 0.9055837207876926, + "grad_norm": 3.4958338737487793, + "learning_rate": 9.526640980735554e-06, + "loss": 0.9963, + "num_input_tokens_seen": 208009568, + "step": 12928 + }, + { + "epoch": 0.9056537690334218, + "grad_norm": 3.9609909057617188, + "learning_rate": 9.519642732049032e-06, + "loss": 0.9696, + "num_input_tokens_seen": 208025904, + "step": 12929 + }, + { + "epoch": 0.905723817279151, + "grad_norm": 4.9373273849487305, + "learning_rate": 9.512644483362522e-06, + "loss": 1.0089, + "num_input_tokens_seen": 208042112, + "step": 12930 + }, + { + "epoch": 0.9057938655248803, + "grad_norm": 3.934352397918701, + "learning_rate": 9.505646234676011e-06, + "loss": 1.044, + "num_input_tokens_seen": 208058496, + "step": 12931 + }, + { + "epoch": 0.9058639137706095, + "grad_norm": 4.120293140411377, + "learning_rate": 9.49864798598949e-06, + "loss": 1.0543, + "num_input_tokens_seen": 208074256, + "step": 12932 + }, + { + "epoch": 0.9059339620163388, + "grad_norm": 4.89858341217041, + "learning_rate": 9.49164973730298e-06, + "loss": 1.1108, + "num_input_tokens_seen": 208090640, + "step": 12933 + }, + { + "epoch": 0.906004010262068, + "grad_norm": 3.784271478652954, + "learning_rate": 9.484651488616458e-06, + "loss": 0.8875, + "num_input_tokens_seen": 208106048, + "step": 12934 + }, + { + "epoch": 0.9060740585077972, + "grad_norm": 3.4337575435638428, + "learning_rate": 9.477653239929948e-06, + "loss": 0.887, + "num_input_tokens_seen": 208122432, + "step": 12935 + }, + { + "epoch": 0.9061441067535265, + "grad_norm": 3.8159854412078857, + "learning_rate": 9.470654991243437e-06, + "loss": 0.9702, + "num_input_tokens_seen": 208138720, + "step": 12936 + }, + { + "epoch": 0.9062141549992557, + "grad_norm": 4.035446643829346, + "learning_rate": 9.463656742556915e-06, + "loss": 0.8276, + "num_input_tokens_seen": 208155104, + "step": 12937 + }, + { + "epoch": 0.9062842032449849, + "grad_norm": 5.713715076446533, + "learning_rate": 9.456658493870406e-06, + "loss": 1.0195, + "num_input_tokens_seen": 208170816, + "step": 12938 + }, + { + "epoch": 0.9063542514907142, + "grad_norm": 4.1760125160217285, + "learning_rate": 9.449660245183884e-06, + "loss": 0.9686, + "num_input_tokens_seen": 208186752, + "step": 12939 + }, + { + "epoch": 0.9064242997364435, + "grad_norm": 4.099884986877441, + "learning_rate": 9.442661996497374e-06, + "loss": 1.0185, + "num_input_tokens_seen": 208203136, + "step": 12940 + }, + { + "epoch": 0.9064943479821728, + "grad_norm": 3.4834136962890625, + "learning_rate": 9.435663747810863e-06, + "loss": 0.8478, + "num_input_tokens_seen": 208219024, + "step": 12941 + }, + { + "epoch": 0.906564396227902, + "grad_norm": 5.997000217437744, + "learning_rate": 9.428665499124341e-06, + "loss": 1.3272, + "num_input_tokens_seen": 208235408, + "step": 12942 + }, + { + "epoch": 0.9066344444736312, + "grad_norm": 4.685998439788818, + "learning_rate": 9.421667250437832e-06, + "loss": 1.0121, + "num_input_tokens_seen": 208251792, + "step": 12943 + }, + { + "epoch": 0.9067044927193605, + "grad_norm": 6.710029125213623, + "learning_rate": 9.41466900175131e-06, + "loss": 1.1854, + "num_input_tokens_seen": 208268176, + "step": 12944 + }, + { + "epoch": 0.9067745409650897, + "grad_norm": 4.690121650695801, + "learning_rate": 9.4076707530648e-06, + "loss": 1.1663, + "num_input_tokens_seen": 208284152, + "step": 12945 + }, + { + "epoch": 0.9068445892108189, + "grad_norm": 5.085326194763184, + "learning_rate": 9.400672504378289e-06, + "loss": 1.1137, + "num_input_tokens_seen": 208299840, + "step": 12946 + }, + { + "epoch": 0.9069146374565482, + "grad_norm": 3.9999020099639893, + "learning_rate": 9.393674255691767e-06, + "loss": 0.9963, + "num_input_tokens_seen": 208316224, + "step": 12947 + }, + { + "epoch": 0.9069846857022774, + "grad_norm": 4.7660231590271, + "learning_rate": 9.386676007005258e-06, + "loss": 1.025, + "num_input_tokens_seen": 208332224, + "step": 12948 + }, + { + "epoch": 0.9070547339480067, + "grad_norm": 3.5870227813720703, + "learning_rate": 9.379677758318736e-06, + "loss": 0.9802, + "num_input_tokens_seen": 208348272, + "step": 12949 + }, + { + "epoch": 0.9071247821937359, + "grad_norm": 3.6911203861236572, + "learning_rate": 9.372679509632226e-06, + "loss": 1.0651, + "num_input_tokens_seen": 208364656, + "step": 12950 + }, + { + "epoch": 0.9071948304394651, + "grad_norm": 4.179163932800293, + "learning_rate": 9.365681260945715e-06, + "loss": 1.1331, + "num_input_tokens_seen": 208381040, + "step": 12951 + }, + { + "epoch": 0.9072648786851945, + "grad_norm": 3.7565526962280273, + "learning_rate": 9.358683012259193e-06, + "loss": 0.9194, + "num_input_tokens_seen": 208396608, + "step": 12952 + }, + { + "epoch": 0.9073349269309237, + "grad_norm": 3.5818684101104736, + "learning_rate": 9.351684763572684e-06, + "loss": 0.9273, + "num_input_tokens_seen": 208412992, + "step": 12953 + }, + { + "epoch": 0.907404975176653, + "grad_norm": 4.4392290115356445, + "learning_rate": 9.344686514886162e-06, + "loss": 0.9803, + "num_input_tokens_seen": 208429176, + "step": 12954 + }, + { + "epoch": 0.9074750234223822, + "grad_norm": 3.5877010822296143, + "learning_rate": 9.337688266199652e-06, + "loss": 0.9334, + "num_input_tokens_seen": 208445288, + "step": 12955 + }, + { + "epoch": 0.9075450716681114, + "grad_norm": 3.8117401599884033, + "learning_rate": 9.33069001751313e-06, + "loss": 1.0093, + "num_input_tokens_seen": 208460008, + "step": 12956 + }, + { + "epoch": 0.9076151199138407, + "grad_norm": 3.4710583686828613, + "learning_rate": 9.32369176882662e-06, + "loss": 0.9396, + "num_input_tokens_seen": 208476088, + "step": 12957 + }, + { + "epoch": 0.9076851681595699, + "grad_norm": 3.6367835998535156, + "learning_rate": 9.31669352014011e-06, + "loss": 1.0144, + "num_input_tokens_seen": 208492472, + "step": 12958 + }, + { + "epoch": 0.9077552164052991, + "grad_norm": 4.401979446411133, + "learning_rate": 9.309695271453588e-06, + "loss": 0.8367, + "num_input_tokens_seen": 208508856, + "step": 12959 + }, + { + "epoch": 0.9078252646510284, + "grad_norm": 4.070242881774902, + "learning_rate": 9.302697022767078e-06, + "loss": 0.9757, + "num_input_tokens_seen": 208524872, + "step": 12960 + }, + { + "epoch": 0.9078953128967576, + "grad_norm": 4.285597324371338, + "learning_rate": 9.295698774080557e-06, + "loss": 1.0803, + "num_input_tokens_seen": 208540424, + "step": 12961 + }, + { + "epoch": 0.9079653611424869, + "grad_norm": 4.453644752502441, + "learning_rate": 9.288700525394045e-06, + "loss": 1.0071, + "num_input_tokens_seen": 208556808, + "step": 12962 + }, + { + "epoch": 0.9080354093882161, + "grad_norm": 3.803657293319702, + "learning_rate": 9.281702276707536e-06, + "loss": 0.9938, + "num_input_tokens_seen": 208573136, + "step": 12963 + }, + { + "epoch": 0.9081054576339453, + "grad_norm": 3.7270936965942383, + "learning_rate": 9.274704028021014e-06, + "loss": 0.9517, + "num_input_tokens_seen": 208589520, + "step": 12964 + }, + { + "epoch": 0.9081755058796747, + "grad_norm": 4.087285995483398, + "learning_rate": 9.267705779334504e-06, + "loss": 0.9659, + "num_input_tokens_seen": 208605576, + "step": 12965 + }, + { + "epoch": 0.9082455541254039, + "grad_norm": 3.7018704414367676, + "learning_rate": 9.260707530647983e-06, + "loss": 0.9758, + "num_input_tokens_seen": 208621528, + "step": 12966 + }, + { + "epoch": 0.9083156023711331, + "grad_norm": 3.8596179485321045, + "learning_rate": 9.253709281961471e-06, + "loss": 1.0206, + "num_input_tokens_seen": 208637912, + "step": 12967 + }, + { + "epoch": 0.9083856506168624, + "grad_norm": 4.573639869689941, + "learning_rate": 9.246711033274962e-06, + "loss": 0.8317, + "num_input_tokens_seen": 208654264, + "step": 12968 + }, + { + "epoch": 0.9084556988625916, + "grad_norm": 3.8285508155822754, + "learning_rate": 9.23971278458844e-06, + "loss": 1.0729, + "num_input_tokens_seen": 208670280, + "step": 12969 + }, + { + "epoch": 0.9085257471083209, + "grad_norm": 4.242320537567139, + "learning_rate": 9.23271453590193e-06, + "loss": 1.1249, + "num_input_tokens_seen": 208686664, + "step": 12970 + }, + { + "epoch": 0.9085957953540501, + "grad_norm": 3.671973466873169, + "learning_rate": 9.225716287215409e-06, + "loss": 1.012, + "num_input_tokens_seen": 208702984, + "step": 12971 + }, + { + "epoch": 0.9086658435997793, + "grad_norm": 3.515763521194458, + "learning_rate": 9.218718038528897e-06, + "loss": 0.9475, + "num_input_tokens_seen": 208719368, + "step": 12972 + }, + { + "epoch": 0.9087358918455086, + "grad_norm": 5.696224212646484, + "learning_rate": 9.211719789842388e-06, + "loss": 1.131, + "num_input_tokens_seen": 208734272, + "step": 12973 + }, + { + "epoch": 0.9088059400912378, + "grad_norm": 4.085351943969727, + "learning_rate": 9.204721541155866e-06, + "loss": 1.1803, + "num_input_tokens_seen": 208750592, + "step": 12974 + }, + { + "epoch": 0.908875988336967, + "grad_norm": 4.466163158416748, + "learning_rate": 9.197723292469356e-06, + "loss": 1.2309, + "num_input_tokens_seen": 208766976, + "step": 12975 + }, + { + "epoch": 0.9089460365826963, + "grad_norm": 3.543994188308716, + "learning_rate": 9.190725043782835e-06, + "loss": 0.8953, + "num_input_tokens_seen": 208783360, + "step": 12976 + }, + { + "epoch": 0.9090160848284256, + "grad_norm": 4.2074360847473145, + "learning_rate": 9.183726795096323e-06, + "loss": 0.9629, + "num_input_tokens_seen": 208799744, + "step": 12977 + }, + { + "epoch": 0.9090861330741549, + "grad_norm": 3.967421054840088, + "learning_rate": 9.176728546409814e-06, + "loss": 1.0965, + "num_input_tokens_seen": 208816128, + "step": 12978 + }, + { + "epoch": 0.9091561813198841, + "grad_norm": 4.6681718826293945, + "learning_rate": 9.169730297723292e-06, + "loss": 1.0284, + "num_input_tokens_seen": 208832512, + "step": 12979 + }, + { + "epoch": 0.9092262295656133, + "grad_norm": 4.545192718505859, + "learning_rate": 9.162732049036782e-06, + "loss": 1.0723, + "num_input_tokens_seen": 208848376, + "step": 12980 + }, + { + "epoch": 0.9092962778113426, + "grad_norm": 5.8451828956604, + "learning_rate": 9.155733800350261e-06, + "loss": 0.9283, + "num_input_tokens_seen": 208864184, + "step": 12981 + }, + { + "epoch": 0.9093663260570718, + "grad_norm": 5.042548179626465, + "learning_rate": 9.14873555166375e-06, + "loss": 1.0531, + "num_input_tokens_seen": 208880016, + "step": 12982 + }, + { + "epoch": 0.9094363743028011, + "grad_norm": 5.4026875495910645, + "learning_rate": 9.141737302977228e-06, + "loss": 1.0983, + "num_input_tokens_seen": 208896400, + "step": 12983 + }, + { + "epoch": 0.9095064225485303, + "grad_norm": 7.124366283416748, + "learning_rate": 9.134739054290718e-06, + "loss": 1.1499, + "num_input_tokens_seen": 208911944, + "step": 12984 + }, + { + "epoch": 0.9095764707942595, + "grad_norm": 5.680197715759277, + "learning_rate": 9.127740805604208e-06, + "loss": 1.0182, + "num_input_tokens_seen": 208928144, + "step": 12985 + }, + { + "epoch": 0.9096465190399888, + "grad_norm": 4.048745632171631, + "learning_rate": 9.120742556917687e-06, + "loss": 1.0929, + "num_input_tokens_seen": 208944528, + "step": 12986 + }, + { + "epoch": 0.909716567285718, + "grad_norm": 3.6971688270568848, + "learning_rate": 9.113744308231175e-06, + "loss": 1.0414, + "num_input_tokens_seen": 208960912, + "step": 12987 + }, + { + "epoch": 0.9097866155314472, + "grad_norm": 3.8971190452575684, + "learning_rate": 9.106746059544654e-06, + "loss": 0.9865, + "num_input_tokens_seen": 208977296, + "step": 12988 + }, + { + "epoch": 0.9098566637771766, + "grad_norm": 3.86953067779541, + "learning_rate": 9.099747810858144e-06, + "loss": 1.07, + "num_input_tokens_seen": 208993680, + "step": 12989 + }, + { + "epoch": 0.9099267120229058, + "grad_norm": 4.077042102813721, + "learning_rate": 9.092749562171634e-06, + "loss": 0.921, + "num_input_tokens_seen": 209009672, + "step": 12990 + }, + { + "epoch": 0.9099967602686351, + "grad_norm": 6.2340545654296875, + "learning_rate": 9.085751313485113e-06, + "loss": 1.141, + "num_input_tokens_seen": 209026056, + "step": 12991 + }, + { + "epoch": 0.9100668085143643, + "grad_norm": 5.116674900054932, + "learning_rate": 9.078753064798602e-06, + "loss": 0.9583, + "num_input_tokens_seen": 209041904, + "step": 12992 + }, + { + "epoch": 0.9101368567600935, + "grad_norm": 4.686500549316406, + "learning_rate": 9.07175481611208e-06, + "loss": 1.2027, + "num_input_tokens_seen": 209057528, + "step": 12993 + }, + { + "epoch": 0.9102069050058228, + "grad_norm": 3.880321502685547, + "learning_rate": 9.06475656742557e-06, + "loss": 1.1909, + "num_input_tokens_seen": 209072344, + "step": 12994 + }, + { + "epoch": 0.910276953251552, + "grad_norm": 3.456702470779419, + "learning_rate": 9.05775831873906e-06, + "loss": 0.9914, + "num_input_tokens_seen": 209088728, + "step": 12995 + }, + { + "epoch": 0.9103470014972812, + "grad_norm": 4.203364372253418, + "learning_rate": 9.050760070052539e-06, + "loss": 0.9899, + "num_input_tokens_seen": 209104576, + "step": 12996 + }, + { + "epoch": 0.9104170497430105, + "grad_norm": 3.851496458053589, + "learning_rate": 9.043761821366028e-06, + "loss": 1.1551, + "num_input_tokens_seen": 209120960, + "step": 12997 + }, + { + "epoch": 0.9104870979887397, + "grad_norm": 4.076294422149658, + "learning_rate": 9.036763572679506e-06, + "loss": 0.9997, + "num_input_tokens_seen": 209137344, + "step": 12998 + }, + { + "epoch": 0.910557146234469, + "grad_norm": 4.082998275756836, + "learning_rate": 9.029765323992996e-06, + "loss": 1.119, + "num_input_tokens_seen": 209153352, + "step": 12999 + }, + { + "epoch": 0.9106271944801982, + "grad_norm": 5.754385471343994, + "learning_rate": 9.022767075306487e-06, + "loss": 1.0946, + "num_input_tokens_seen": 209169736, + "step": 13000 + }, + { + "epoch": 0.9106271944801982, + "eval_loss": 1.1150249242782593, + "eval_runtime": 0.203, + "eval_samples_per_second": 4.926, + "eval_steps_per_second": 4.926, + "num_input_tokens_seen": 209169736, + "step": 13000 + }, + { + "epoch": 0.9106271944801982, + "num_input_tokens_seen": 209169736, + "step": 13000, + "total_flos": 4.491687216116306e+17, + "train_loss": 0.0, + "train_runtime": 0.037, + "train_samples_per_second": 2778031.466, + "train_steps_per_second": 347243.798 + } + ], + "logging_steps": 1, + "max_steps": 12848, + "num_input_tokens_seen": 209169736, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.491687216116306e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}