diff --git "a/checkpoint-1800/trainer_state.json" "b/checkpoint-1800/trainer_state.json" deleted file mode 100644--- "a/checkpoint-1800/trainer_state.json" +++ /dev/null @@ -1,14514 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.12608684231264283, - "eval_steps": 200, - "global_step": 1800, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 7.004824572924602e-05, - "grad_norm": 6.222772121429443, - "learning_rate": 9.99930017513135e-05, - "loss": 1.1076, - "num_input_tokens_seen": 16384, - "step": 1 - }, - { - "epoch": 0.00014009649145849205, - "grad_norm": 6.042057037353516, - "learning_rate": 9.998600350262697e-05, - "loss": 1.1086, - "num_input_tokens_seen": 32768, - "step": 2 - }, - { - "epoch": 0.00021014473718773804, - "grad_norm": 7.119229316711426, - "learning_rate": 9.997900525394046e-05, - "loss": 1.4047, - "num_input_tokens_seen": 49152, - "step": 3 - }, - { - "epoch": 0.0002801929829169841, - "grad_norm": 7.133191108703613, - "learning_rate": 9.997200700525395e-05, - "loss": 1.3921, - "num_input_tokens_seen": 65536, - "step": 4 - }, - { - "epoch": 0.0003502412286462301, - "grad_norm": 6.1078338623046875, - "learning_rate": 9.996500875656743e-05, - "loss": 1.3171, - "num_input_tokens_seen": 81920, - "step": 5 - }, - { - "epoch": 0.0004202894743754761, - "grad_norm": 6.466420650482178, - "learning_rate": 9.995801050788092e-05, - "loss": 1.0732, - "num_input_tokens_seen": 97344, - "step": 6 - }, - { - "epoch": 0.0004903377201047221, - "grad_norm": 5.578189849853516, - "learning_rate": 9.99510122591944e-05, - "loss": 0.9929, - "num_input_tokens_seen": 113728, - "step": 7 - }, - { - "epoch": 0.0005603859658339682, - "grad_norm": 7.197720527648926, - "learning_rate": 9.994401401050789e-05, - "loss": 1.2512, - "num_input_tokens_seen": 129528, - "step": 8 - }, - { - "epoch": 0.0006304342115632141, - "grad_norm": 6.618913650512695, - "learning_rate": 9.993701576182136e-05, - "loss": 1.3495, - "num_input_tokens_seen": 145704, - "step": 9 - }, - { - "epoch": 0.0007004824572924602, - "grad_norm": 6.955508232116699, - "learning_rate": 9.993001751313485e-05, - "loss": 1.1823, - "num_input_tokens_seen": 161664, - "step": 10 - }, - { - "epoch": 0.0007705307030217062, - "grad_norm": 6.6807074546813965, - "learning_rate": 9.992301926444835e-05, - "loss": 1.1693, - "num_input_tokens_seen": 177960, - "step": 11 - }, - { - "epoch": 0.0008405789487509522, - "grad_norm": 6.784447193145752, - "learning_rate": 9.991602101576183e-05, - "loss": 1.3744, - "num_input_tokens_seen": 194344, - "step": 12 - }, - { - "epoch": 0.0009106271944801982, - "grad_norm": 6.7418437004089355, - "learning_rate": 9.990902276707532e-05, - "loss": 1.22, - "num_input_tokens_seen": 210728, - "step": 13 - }, - { - "epoch": 0.0009806754402094443, - "grad_norm": 6.43395471572876, - "learning_rate": 9.990202451838879e-05, - "loss": 1.1772, - "num_input_tokens_seen": 227112, - "step": 14 - }, - { - "epoch": 0.0010507236859386903, - "grad_norm": 6.09422492980957, - "learning_rate": 9.989502626970228e-05, - "loss": 1.195, - "num_input_tokens_seen": 243496, - "step": 15 - }, - { - "epoch": 0.0011207719316679364, - "grad_norm": 6.238271236419678, - "learning_rate": 9.988802802101577e-05, - "loss": 1.2623, - "num_input_tokens_seen": 259744, - "step": 16 - }, - { - "epoch": 0.0011908201773971822, - "grad_norm": 6.56187629699707, - "learning_rate": 9.988102977232926e-05, - "loss": 1.2721, - "num_input_tokens_seen": 276128, - "step": 17 - }, - { - "epoch": 0.0012608684231264283, - "grad_norm": 6.818358898162842, - "learning_rate": 9.987403152364275e-05, - "loss": 1.2649, - "num_input_tokens_seen": 292512, - "step": 18 - }, - { - "epoch": 0.0013309166688556743, - "grad_norm": 5.950352191925049, - "learning_rate": 9.986703327495622e-05, - "loss": 1.0024, - "num_input_tokens_seen": 308632, - "step": 19 - }, - { - "epoch": 0.0014009649145849204, - "grad_norm": 6.387479305267334, - "learning_rate": 9.986003502626971e-05, - "loss": 1.2783, - "num_input_tokens_seen": 325016, - "step": 20 - }, - { - "epoch": 0.0014710131603141664, - "grad_norm": 6.187346458435059, - "learning_rate": 9.985303677758318e-05, - "loss": 1.1701, - "num_input_tokens_seen": 341384, - "step": 21 - }, - { - "epoch": 0.0015410614060434125, - "grad_norm": 5.371951103210449, - "learning_rate": 9.984603852889667e-05, - "loss": 1.0483, - "num_input_tokens_seen": 357768, - "step": 22 - }, - { - "epoch": 0.0016111096517726585, - "grad_norm": 6.2206807136535645, - "learning_rate": 9.983904028021016e-05, - "loss": 1.2516, - "num_input_tokens_seen": 374152, - "step": 23 - }, - { - "epoch": 0.0016811578975019044, - "grad_norm": 6.121264457702637, - "learning_rate": 9.983204203152365e-05, - "loss": 1.1506, - "num_input_tokens_seen": 390536, - "step": 24 - }, - { - "epoch": 0.0017512061432311504, - "grad_norm": 6.353756904602051, - "learning_rate": 9.982504378283714e-05, - "loss": 1.3118, - "num_input_tokens_seen": 406920, - "step": 25 - }, - { - "epoch": 0.0018212543889603965, - "grad_norm": 6.270686149597168, - "learning_rate": 9.981804553415061e-05, - "loss": 1.0883, - "num_input_tokens_seen": 422728, - "step": 26 - }, - { - "epoch": 0.0018913026346896425, - "grad_norm": 6.117632865905762, - "learning_rate": 9.98110472854641e-05, - "loss": 1.3346, - "num_input_tokens_seen": 439112, - "step": 27 - }, - { - "epoch": 0.0019613508804188886, - "grad_norm": 6.429015159606934, - "learning_rate": 9.980404903677759e-05, - "loss": 1.2494, - "num_input_tokens_seen": 455144, - "step": 28 - }, - { - "epoch": 0.0020313991261481346, - "grad_norm": 6.4467620849609375, - "learning_rate": 9.979705078809107e-05, - "loss": 1.3335, - "num_input_tokens_seen": 470360, - "step": 29 - }, - { - "epoch": 0.0021014473718773807, - "grad_norm": 6.57926082611084, - "learning_rate": 9.979005253940455e-05, - "loss": 1.2126, - "num_input_tokens_seen": 486120, - "step": 30 - }, - { - "epoch": 0.0021714956176066267, - "grad_norm": 5.650569915771484, - "learning_rate": 9.978305429071804e-05, - "loss": 1.1363, - "num_input_tokens_seen": 501896, - "step": 31 - }, - { - "epoch": 0.0022415438633358728, - "grad_norm": 6.380292892456055, - "learning_rate": 9.977605604203153e-05, - "loss": 1.2251, - "num_input_tokens_seen": 517752, - "step": 32 - }, - { - "epoch": 0.002311592109065119, - "grad_norm": 5.704173564910889, - "learning_rate": 9.976905779334502e-05, - "loss": 1.1685, - "num_input_tokens_seen": 534136, - "step": 33 - }, - { - "epoch": 0.0023816403547943644, - "grad_norm": 5.342978000640869, - "learning_rate": 9.97620595446585e-05, - "loss": 1.2012, - "num_input_tokens_seen": 550216, - "step": 34 - }, - { - "epoch": 0.0024516886005236105, - "grad_norm": 5.7014241218566895, - "learning_rate": 9.975506129597198e-05, - "loss": 1.2342, - "num_input_tokens_seen": 566600, - "step": 35 - }, - { - "epoch": 0.0025217368462528565, - "grad_norm": 6.26229190826416, - "learning_rate": 9.974806304728546e-05, - "loss": 1.2041, - "num_input_tokens_seen": 582984, - "step": 36 - }, - { - "epoch": 0.0025917850919821026, - "grad_norm": 6.583463191986084, - "learning_rate": 9.974106479859896e-05, - "loss": 1.3021, - "num_input_tokens_seen": 598968, - "step": 37 - }, - { - "epoch": 0.0026618333377113486, - "grad_norm": 5.58498477935791, - "learning_rate": 9.973406654991245e-05, - "loss": 1.1622, - "num_input_tokens_seen": 614840, - "step": 38 - }, - { - "epoch": 0.0027318815834405947, - "grad_norm": 5.906906604766846, - "learning_rate": 9.972706830122592e-05, - "loss": 1.1971, - "num_input_tokens_seen": 631224, - "step": 39 - }, - { - "epoch": 0.0028019298291698407, - "grad_norm": 5.962359428405762, - "learning_rate": 9.972007005253941e-05, - "loss": 1.1326, - "num_input_tokens_seen": 647000, - "step": 40 - }, - { - "epoch": 0.002871978074899087, - "grad_norm": 6.447500705718994, - "learning_rate": 9.971307180385289e-05, - "loss": 1.0905, - "num_input_tokens_seen": 662480, - "step": 41 - }, - { - "epoch": 0.002942026320628333, - "grad_norm": 5.7290520668029785, - "learning_rate": 9.970607355516638e-05, - "loss": 1.3585, - "num_input_tokens_seen": 678480, - "step": 42 - }, - { - "epoch": 0.003012074566357579, - "grad_norm": 6.063445568084717, - "learning_rate": 9.969907530647987e-05, - "loss": 1.2841, - "num_input_tokens_seen": 694256, - "step": 43 - }, - { - "epoch": 0.003082122812086825, - "grad_norm": 5.302809238433838, - "learning_rate": 9.969207705779335e-05, - "loss": 1.1168, - "num_input_tokens_seen": 710152, - "step": 44 - }, - { - "epoch": 0.003152171057816071, - "grad_norm": 5.634128093719482, - "learning_rate": 9.968507880910684e-05, - "loss": 1.0609, - "num_input_tokens_seen": 726184, - "step": 45 - }, - { - "epoch": 0.003222219303545317, - "grad_norm": 5.652642726898193, - "learning_rate": 9.967808056042032e-05, - "loss": 1.2228, - "num_input_tokens_seen": 742520, - "step": 46 - }, - { - "epoch": 0.0032922675492745627, - "grad_norm": 5.340751647949219, - "learning_rate": 9.96710823117338e-05, - "loss": 1.0595, - "num_input_tokens_seen": 758904, - "step": 47 - }, - { - "epoch": 0.0033623157950038087, - "grad_norm": 5.422239780426025, - "learning_rate": 9.966408406304728e-05, - "loss": 1.1161, - "num_input_tokens_seen": 775040, - "step": 48 - }, - { - "epoch": 0.0034323640407330548, - "grad_norm": 5.29241418838501, - "learning_rate": 9.965708581436077e-05, - "loss": 1.0255, - "num_input_tokens_seen": 790856, - "step": 49 - }, - { - "epoch": 0.003502412286462301, - "grad_norm": 5.146270275115967, - "learning_rate": 9.965008756567426e-05, - "loss": 0.9762, - "num_input_tokens_seen": 807064, - "step": 50 - }, - { - "epoch": 0.003572460532191547, - "grad_norm": 5.825758457183838, - "learning_rate": 9.964308931698775e-05, - "loss": 1.2108, - "num_input_tokens_seen": 823448, - "step": 51 - }, - { - "epoch": 0.003642508777920793, - "grad_norm": 6.179538726806641, - "learning_rate": 9.963609106830124e-05, - "loss": 1.322, - "num_input_tokens_seen": 838888, - "step": 52 - }, - { - "epoch": 0.003712557023650039, - "grad_norm": 6.464454174041748, - "learning_rate": 9.962909281961471e-05, - "loss": 1.5077, - "num_input_tokens_seen": 855272, - "step": 53 - }, - { - "epoch": 0.003782605269379285, - "grad_norm": 5.4227294921875, - "learning_rate": 9.96220945709282e-05, - "loss": 1.2679, - "num_input_tokens_seen": 871656, - "step": 54 - }, - { - "epoch": 0.003852653515108531, - "grad_norm": 5.949041366577148, - "learning_rate": 9.961509632224169e-05, - "loss": 1.3618, - "num_input_tokens_seen": 888040, - "step": 55 - }, - { - "epoch": 0.003922701760837777, - "grad_norm": 6.050904750823975, - "learning_rate": 9.960809807355516e-05, - "loss": 1.3155, - "num_input_tokens_seen": 904400, - "step": 56 - }, - { - "epoch": 0.003992750006567023, - "grad_norm": 6.048308849334717, - "learning_rate": 9.960109982486866e-05, - "loss": 1.3131, - "num_input_tokens_seen": 919952, - "step": 57 - }, - { - "epoch": 0.004062798252296269, - "grad_norm": 5.683863162994385, - "learning_rate": 9.959410157618214e-05, - "loss": 1.1692, - "num_input_tokens_seen": 936336, - "step": 58 - }, - { - "epoch": 0.004132846498025515, - "grad_norm": 5.449287414550781, - "learning_rate": 9.958710332749563e-05, - "loss": 1.0613, - "num_input_tokens_seen": 952152, - "step": 59 - }, - { - "epoch": 0.004202894743754761, - "grad_norm": 5.31496524810791, - "learning_rate": 9.958010507880912e-05, - "loss": 0.9605, - "num_input_tokens_seen": 967824, - "step": 60 - }, - { - "epoch": 0.004272942989484007, - "grad_norm": 5.57105016708374, - "learning_rate": 9.957310683012259e-05, - "loss": 1.1701, - "num_input_tokens_seen": 983864, - "step": 61 - }, - { - "epoch": 0.004342991235213253, - "grad_norm": 5.3456830978393555, - "learning_rate": 9.956610858143608e-05, - "loss": 1.0995, - "num_input_tokens_seen": 1000248, - "step": 62 - }, - { - "epoch": 0.004413039480942499, - "grad_norm": 5.453295707702637, - "learning_rate": 9.955911033274957e-05, - "loss": 1.2413, - "num_input_tokens_seen": 1016632, - "step": 63 - }, - { - "epoch": 0.0044830877266717455, - "grad_norm": 4.975449562072754, - "learning_rate": 9.955211208406306e-05, - "loss": 1.0961, - "num_input_tokens_seen": 1033016, - "step": 64 - }, - { - "epoch": 0.004553135972400991, - "grad_norm": 5.542137145996094, - "learning_rate": 9.954511383537655e-05, - "loss": 1.1171, - "num_input_tokens_seen": 1049400, - "step": 65 - }, - { - "epoch": 0.004623184218130238, - "grad_norm": 5.213950157165527, - "learning_rate": 9.953811558669002e-05, - "loss": 1.2228, - "num_input_tokens_seen": 1065784, - "step": 66 - }, - { - "epoch": 0.004693232463859483, - "grad_norm": 5.496099948883057, - "learning_rate": 9.953111733800351e-05, - "loss": 1.1529, - "num_input_tokens_seen": 1082168, - "step": 67 - }, - { - "epoch": 0.004763280709588729, - "grad_norm": 5.64145565032959, - "learning_rate": 9.952411908931698e-05, - "loss": 1.2301, - "num_input_tokens_seen": 1098024, - "step": 68 - }, - { - "epoch": 0.004833328955317975, - "grad_norm": 5.566709995269775, - "learning_rate": 9.951712084063047e-05, - "loss": 1.2679, - "num_input_tokens_seen": 1114408, - "step": 69 - }, - { - "epoch": 0.004903377201047221, - "grad_norm": 6.443673133850098, - "learning_rate": 9.951012259194396e-05, - "loss": 1.2313, - "num_input_tokens_seen": 1130792, - "step": 70 - }, - { - "epoch": 0.0049734254467764675, - "grad_norm": 5.882962226867676, - "learning_rate": 9.950312434325745e-05, - "loss": 1.4304, - "num_input_tokens_seen": 1147176, - "step": 71 - }, - { - "epoch": 0.005043473692505713, - "grad_norm": 6.0052666664123535, - "learning_rate": 9.949612609457094e-05, - "loss": 1.3027, - "num_input_tokens_seen": 1160968, - "step": 72 - }, - { - "epoch": 0.0051135219382349596, - "grad_norm": 5.260256767272949, - "learning_rate": 9.948912784588441e-05, - "loss": 1.1526, - "num_input_tokens_seen": 1177352, - "step": 73 - }, - { - "epoch": 0.005183570183964205, - "grad_norm": 5.641814708709717, - "learning_rate": 9.94821295971979e-05, - "loss": 1.0666, - "num_input_tokens_seen": 1193032, - "step": 74 - }, - { - "epoch": 0.005253618429693452, - "grad_norm": 5.121115207672119, - "learning_rate": 9.947513134851138e-05, - "loss": 1.2404, - "num_input_tokens_seen": 1208952, - "step": 75 - }, - { - "epoch": 0.005323666675422697, - "grad_norm": 5.63930082321167, - "learning_rate": 9.946813309982487e-05, - "loss": 1.5127, - "num_input_tokens_seen": 1225000, - "step": 76 - }, - { - "epoch": 0.005393714921151944, - "grad_norm": 4.880716800689697, - "learning_rate": 9.946113485113837e-05, - "loss": 1.1484, - "num_input_tokens_seen": 1241384, - "step": 77 - }, - { - "epoch": 0.005463763166881189, - "grad_norm": 5.59611177444458, - "learning_rate": 9.945413660245184e-05, - "loss": 1.1678, - "num_input_tokens_seen": 1257680, - "step": 78 - }, - { - "epoch": 0.005533811412610436, - "grad_norm": 5.052026271820068, - "learning_rate": 9.944713835376533e-05, - "loss": 1.2207, - "num_input_tokens_seen": 1274064, - "step": 79 - }, - { - "epoch": 0.0056038596583396815, - "grad_norm": 5.285096168518066, - "learning_rate": 9.944014010507881e-05, - "loss": 1.1457, - "num_input_tokens_seen": 1290448, - "step": 80 - }, - { - "epoch": 0.005673907904068927, - "grad_norm": 5.4286580085754395, - "learning_rate": 9.94331418563923e-05, - "loss": 1.3047, - "num_input_tokens_seen": 1306832, - "step": 81 - }, - { - "epoch": 0.005743956149798174, - "grad_norm": 5.937953472137451, - "learning_rate": 9.942614360770578e-05, - "loss": 1.4353, - "num_input_tokens_seen": 1323216, - "step": 82 - }, - { - "epoch": 0.005814004395527419, - "grad_norm": 5.129006385803223, - "learning_rate": 9.941914535901927e-05, - "loss": 1.1434, - "num_input_tokens_seen": 1339408, - "step": 83 - }, - { - "epoch": 0.005884052641256666, - "grad_norm": 5.179675102233887, - "learning_rate": 9.941214711033276e-05, - "loss": 1.2452, - "num_input_tokens_seen": 1355792, - "step": 84 - }, - { - "epoch": 0.005954100886985911, - "grad_norm": 4.912832736968994, - "learning_rate": 9.940514886164624e-05, - "loss": 1.1255, - "num_input_tokens_seen": 1372176, - "step": 85 - }, - { - "epoch": 0.006024149132715158, - "grad_norm": 5.190899848937988, - "learning_rate": 9.939815061295973e-05, - "loss": 1.2543, - "num_input_tokens_seen": 1388560, - "step": 86 - }, - { - "epoch": 0.006094197378444403, - "grad_norm": 5.1751275062561035, - "learning_rate": 9.939115236427321e-05, - "loss": 1.3145, - "num_input_tokens_seen": 1404944, - "step": 87 - }, - { - "epoch": 0.00616424562417365, - "grad_norm": 5.450705528259277, - "learning_rate": 9.938415411558669e-05, - "loss": 1.2844, - "num_input_tokens_seen": 1421328, - "step": 88 - }, - { - "epoch": 0.0062342938699028955, - "grad_norm": 5.593935012817383, - "learning_rate": 9.937715586690018e-05, - "loss": 1.3284, - "num_input_tokens_seen": 1437464, - "step": 89 - }, - { - "epoch": 0.006304342115632142, - "grad_norm": 5.156428813934326, - "learning_rate": 9.937015761821367e-05, - "loss": 1.1682, - "num_input_tokens_seen": 1452952, - "step": 90 - }, - { - "epoch": 0.006374390361361388, - "grad_norm": 4.673638820648193, - "learning_rate": 9.936315936952715e-05, - "loss": 1.004, - "num_input_tokens_seen": 1469336, - "step": 91 - }, - { - "epoch": 0.006444438607090634, - "grad_norm": 4.996700763702393, - "learning_rate": 9.935616112084064e-05, - "loss": 1.087, - "num_input_tokens_seen": 1485448, - "step": 92 - }, - { - "epoch": 0.00651448685281988, - "grad_norm": 4.817474365234375, - "learning_rate": 9.934916287215412e-05, - "loss": 1.151, - "num_input_tokens_seen": 1501472, - "step": 93 - }, - { - "epoch": 0.006584535098549125, - "grad_norm": 5.400479316711426, - "learning_rate": 9.934216462346761e-05, - "loss": 1.3144, - "num_input_tokens_seen": 1516424, - "step": 94 - }, - { - "epoch": 0.006654583344278372, - "grad_norm": 5.232216835021973, - "learning_rate": 9.933516637478108e-05, - "loss": 1.0019, - "num_input_tokens_seen": 1532792, - "step": 95 - }, - { - "epoch": 0.006724631590007617, - "grad_norm": 5.392521381378174, - "learning_rate": 9.932816812609457e-05, - "loss": 1.3195, - "num_input_tokens_seen": 1548600, - "step": 96 - }, - { - "epoch": 0.006794679835736864, - "grad_norm": 5.5280866622924805, - "learning_rate": 9.932116987740806e-05, - "loss": 1.283, - "num_input_tokens_seen": 1564088, - "step": 97 - }, - { - "epoch": 0.0068647280814661095, - "grad_norm": 4.963179588317871, - "learning_rate": 9.931417162872155e-05, - "loss": 1.2716, - "num_input_tokens_seen": 1580040, - "step": 98 - }, - { - "epoch": 0.006934776327195356, - "grad_norm": 4.920302391052246, - "learning_rate": 9.930717338003504e-05, - "loss": 1.088, - "num_input_tokens_seen": 1595880, - "step": 99 - }, - { - "epoch": 0.007004824572924602, - "grad_norm": 4.935486793518066, - "learning_rate": 9.930017513134851e-05, - "loss": 1.0122, - "num_input_tokens_seen": 1611864, - "step": 100 - }, - { - "epoch": 0.007074872818653848, - "grad_norm": 5.099087238311768, - "learning_rate": 9.9293176882662e-05, - "loss": 1.1605, - "num_input_tokens_seen": 1627472, - "step": 101 - }, - { - "epoch": 0.007144921064383094, - "grad_norm": 5.3764328956604, - "learning_rate": 9.928617863397548e-05, - "loss": 1.2225, - "num_input_tokens_seen": 1643856, - "step": 102 - }, - { - "epoch": 0.00721496931011234, - "grad_norm": 5.281564712524414, - "learning_rate": 9.927918038528898e-05, - "loss": 1.1483, - "num_input_tokens_seen": 1660240, - "step": 103 - }, - { - "epoch": 0.007285017555841586, - "grad_norm": 5.395167827606201, - "learning_rate": 9.927218213660247e-05, - "loss": 1.6014, - "num_input_tokens_seen": 1676624, - "step": 104 - }, - { - "epoch": 0.007355065801570832, - "grad_norm": 5.322319507598877, - "learning_rate": 9.926518388791594e-05, - "loss": 1.0933, - "num_input_tokens_seen": 1693008, - "step": 105 - }, - { - "epoch": 0.007425114047300078, - "grad_norm": 5.301229953765869, - "learning_rate": 9.925818563922943e-05, - "loss": 1.1998, - "num_input_tokens_seen": 1708424, - "step": 106 - }, - { - "epoch": 0.0074951622930293236, - "grad_norm": 4.958597183227539, - "learning_rate": 9.92511873905429e-05, - "loss": 1.3285, - "num_input_tokens_seen": 1724808, - "step": 107 - }, - { - "epoch": 0.00756521053875857, - "grad_norm": 4.3913960456848145, - "learning_rate": 9.924418914185639e-05, - "loss": 0.9017, - "num_input_tokens_seen": 1740752, - "step": 108 - }, - { - "epoch": 0.007635258784487816, - "grad_norm": 5.401021480560303, - "learning_rate": 9.923719089316988e-05, - "loss": 1.3646, - "num_input_tokens_seen": 1755176, - "step": 109 - }, - { - "epoch": 0.007705307030217062, - "grad_norm": 4.894444942474365, - "learning_rate": 9.923019264448337e-05, - "loss": 0.9955, - "num_input_tokens_seen": 1771560, - "step": 110 - }, - { - "epoch": 0.007775355275946308, - "grad_norm": 4.878688335418701, - "learning_rate": 9.922319439579686e-05, - "loss": 1.1766, - "num_input_tokens_seen": 1787944, - "step": 111 - }, - { - "epoch": 0.007845403521675554, - "grad_norm": 4.9379777908325195, - "learning_rate": 9.921619614711033e-05, - "loss": 1.1631, - "num_input_tokens_seen": 1803568, - "step": 112 - }, - { - "epoch": 0.0079154517674048, - "grad_norm": 5.101811408996582, - "learning_rate": 9.920919789842382e-05, - "loss": 1.2165, - "num_input_tokens_seen": 1819952, - "step": 113 - }, - { - "epoch": 0.007985500013134045, - "grad_norm": 5.32574987411499, - "learning_rate": 9.920219964973731e-05, - "loss": 1.3012, - "num_input_tokens_seen": 1835296, - "step": 114 - }, - { - "epoch": 0.008055548258863293, - "grad_norm": 5.2391180992126465, - "learning_rate": 9.919520140105079e-05, - "loss": 1.2451, - "num_input_tokens_seen": 1851224, - "step": 115 - }, - { - "epoch": 0.008125596504592538, - "grad_norm": 4.865017890930176, - "learning_rate": 9.918820315236427e-05, - "loss": 1.1683, - "num_input_tokens_seen": 1867608, - "step": 116 - }, - { - "epoch": 0.008195644750321784, - "grad_norm": 4.943136215209961, - "learning_rate": 9.918120490367776e-05, - "loss": 1.31, - "num_input_tokens_seen": 1883696, - "step": 117 - }, - { - "epoch": 0.00826569299605103, - "grad_norm": 4.769871711730957, - "learning_rate": 9.917420665499125e-05, - "loss": 1.1212, - "num_input_tokens_seen": 1900080, - "step": 118 - }, - { - "epoch": 0.008335741241780275, - "grad_norm": 4.785780429840088, - "learning_rate": 9.916720840630474e-05, - "loss": 1.2415, - "num_input_tokens_seen": 1916464, - "step": 119 - }, - { - "epoch": 0.008405789487509523, - "grad_norm": 4.802333831787109, - "learning_rate": 9.916021015761822e-05, - "loss": 1.0513, - "num_input_tokens_seen": 1932848, - "step": 120 - }, - { - "epoch": 0.008475837733238768, - "grad_norm": 5.22212553024292, - "learning_rate": 9.91532119089317e-05, - "loss": 1.2574, - "num_input_tokens_seen": 1949232, - "step": 121 - }, - { - "epoch": 0.008545885978968014, - "grad_norm": 5.104204177856445, - "learning_rate": 9.914621366024518e-05, - "loss": 1.0436, - "num_input_tokens_seen": 1964184, - "step": 122 - }, - { - "epoch": 0.00861593422469726, - "grad_norm": 5.11055326461792, - "learning_rate": 9.913921541155868e-05, - "loss": 1.1939, - "num_input_tokens_seen": 1980568, - "step": 123 - }, - { - "epoch": 0.008685982470426507, - "grad_norm": 4.784866809844971, - "learning_rate": 9.913221716287216e-05, - "loss": 1.2056, - "num_input_tokens_seen": 1996952, - "step": 124 - }, - { - "epoch": 0.008756030716155752, - "grad_norm": 4.763037204742432, - "learning_rate": 9.912521891418564e-05, - "loss": 1.1403, - "num_input_tokens_seen": 2013336, - "step": 125 - }, - { - "epoch": 0.008826078961884998, - "grad_norm": 4.813408851623535, - "learning_rate": 9.911822066549913e-05, - "loss": 1.1897, - "num_input_tokens_seen": 2029720, - "step": 126 - }, - { - "epoch": 0.008896127207614244, - "grad_norm": 4.79008674621582, - "learning_rate": 9.911122241681261e-05, - "loss": 1.2315, - "num_input_tokens_seen": 2046104, - "step": 127 - }, - { - "epoch": 0.008966175453343491, - "grad_norm": 4.843508720397949, - "learning_rate": 9.91042241681261e-05, - "loss": 1.0883, - "num_input_tokens_seen": 2061592, - "step": 128 - }, - { - "epoch": 0.009036223699072737, - "grad_norm": 4.917592525482178, - "learning_rate": 9.909722591943959e-05, - "loss": 1.2512, - "num_input_tokens_seen": 2077792, - "step": 129 - }, - { - "epoch": 0.009106271944801982, - "grad_norm": 4.9154133796691895, - "learning_rate": 9.909022767075307e-05, - "loss": 1.3284, - "num_input_tokens_seen": 2094176, - "step": 130 - }, - { - "epoch": 0.009176320190531228, - "grad_norm": 5.2125420570373535, - "learning_rate": 9.908322942206656e-05, - "loss": 1.3469, - "num_input_tokens_seen": 2110480, - "step": 131 - }, - { - "epoch": 0.009246368436260475, - "grad_norm": 4.715712547302246, - "learning_rate": 9.907623117338004e-05, - "loss": 1.0844, - "num_input_tokens_seen": 2126864, - "step": 132 - }, - { - "epoch": 0.009316416681989721, - "grad_norm": 4.805694580078125, - "learning_rate": 9.906923292469353e-05, - "loss": 1.069, - "num_input_tokens_seen": 2142848, - "step": 133 - }, - { - "epoch": 0.009386464927718966, - "grad_norm": 4.961355209350586, - "learning_rate": 9.9062234676007e-05, - "loss": 1.3387, - "num_input_tokens_seen": 2159232, - "step": 134 - }, - { - "epoch": 0.009456513173448212, - "grad_norm": 4.582219123840332, - "learning_rate": 9.905523642732049e-05, - "loss": 1.2013, - "num_input_tokens_seen": 2175616, - "step": 135 - }, - { - "epoch": 0.009526561419177458, - "grad_norm": 5.195998191833496, - "learning_rate": 9.904823817863398e-05, - "loss": 1.2552, - "num_input_tokens_seen": 2191872, - "step": 136 - }, - { - "epoch": 0.009596609664906705, - "grad_norm": 4.934189319610596, - "learning_rate": 9.904123992994747e-05, - "loss": 1.2961, - "num_input_tokens_seen": 2208208, - "step": 137 - }, - { - "epoch": 0.00966665791063595, - "grad_norm": 4.981037616729736, - "learning_rate": 9.903424168126096e-05, - "loss": 1.1546, - "num_input_tokens_seen": 2224592, - "step": 138 - }, - { - "epoch": 0.009736706156365196, - "grad_norm": 5.469496250152588, - "learning_rate": 9.902724343257443e-05, - "loss": 1.3833, - "num_input_tokens_seen": 2240976, - "step": 139 - }, - { - "epoch": 0.009806754402094442, - "grad_norm": 4.889583587646484, - "learning_rate": 9.902024518388792e-05, - "loss": 1.2095, - "num_input_tokens_seen": 2257360, - "step": 140 - }, - { - "epoch": 0.00987680264782369, - "grad_norm": 4.532052516937256, - "learning_rate": 9.901324693520141e-05, - "loss": 1.143, - "num_input_tokens_seen": 2272848, - "step": 141 - }, - { - "epoch": 0.009946850893552935, - "grad_norm": 5.278079032897949, - "learning_rate": 9.900624868651488e-05, - "loss": 1.2849, - "num_input_tokens_seen": 2289232, - "step": 142 - }, - { - "epoch": 0.01001689913928218, - "grad_norm": 4.549891948699951, - "learning_rate": 9.899925043782839e-05, - "loss": 1.0482, - "num_input_tokens_seen": 2305424, - "step": 143 - }, - { - "epoch": 0.010086947385011426, - "grad_norm": 4.7777180671691895, - "learning_rate": 9.899225218914186e-05, - "loss": 1.1926, - "num_input_tokens_seen": 2320968, - "step": 144 - }, - { - "epoch": 0.010156995630740673, - "grad_norm": 4.320313453674316, - "learning_rate": 9.898525394045535e-05, - "loss": 1.0468, - "num_input_tokens_seen": 2337352, - "step": 145 - }, - { - "epoch": 0.010227043876469919, - "grad_norm": 4.915202617645264, - "learning_rate": 9.897825569176882e-05, - "loss": 1.1326, - "num_input_tokens_seen": 2353064, - "step": 146 - }, - { - "epoch": 0.010297092122199165, - "grad_norm": 4.569783687591553, - "learning_rate": 9.897125744308231e-05, - "loss": 0.8586, - "num_input_tokens_seen": 2369128, - "step": 147 - }, - { - "epoch": 0.01036714036792841, - "grad_norm": 4.591664791107178, - "learning_rate": 9.89642591943958e-05, - "loss": 1.1369, - "num_input_tokens_seen": 2385512, - "step": 148 - }, - { - "epoch": 0.010437188613657656, - "grad_norm": 4.913016319274902, - "learning_rate": 9.895726094570929e-05, - "loss": 1.1564, - "num_input_tokens_seen": 2401208, - "step": 149 - }, - { - "epoch": 0.010507236859386903, - "grad_norm": 4.908018112182617, - "learning_rate": 9.895026269702278e-05, - "loss": 1.1247, - "num_input_tokens_seen": 2417592, - "step": 150 - }, - { - "epoch": 0.010577285105116149, - "grad_norm": 4.536910057067871, - "learning_rate": 9.894326444833625e-05, - "loss": 1.014, - "num_input_tokens_seen": 2433976, - "step": 151 - }, - { - "epoch": 0.010647333350845395, - "grad_norm": 4.899227142333984, - "learning_rate": 9.893626619964974e-05, - "loss": 1.0418, - "num_input_tokens_seen": 2448072, - "step": 152 - }, - { - "epoch": 0.01071738159657464, - "grad_norm": 4.600861072540283, - "learning_rate": 9.892926795096323e-05, - "loss": 1.0459, - "num_input_tokens_seen": 2464240, - "step": 153 - }, - { - "epoch": 0.010787429842303888, - "grad_norm": 4.707681179046631, - "learning_rate": 9.89222697022767e-05, - "loss": 1.0859, - "num_input_tokens_seen": 2480624, - "step": 154 - }, - { - "epoch": 0.010857478088033133, - "grad_norm": 4.748518466949463, - "learning_rate": 9.89152714535902e-05, - "loss": 1.0608, - "num_input_tokens_seen": 2497008, - "step": 155 - }, - { - "epoch": 0.010927526333762379, - "grad_norm": 4.794179439544678, - "learning_rate": 9.890827320490368e-05, - "loss": 1.2243, - "num_input_tokens_seen": 2513392, - "step": 156 - }, - { - "epoch": 0.010997574579491624, - "grad_norm": 4.593925476074219, - "learning_rate": 9.890127495621717e-05, - "loss": 1.1002, - "num_input_tokens_seen": 2529776, - "step": 157 - }, - { - "epoch": 0.011067622825220872, - "grad_norm": 4.318257808685303, - "learning_rate": 9.889427670753066e-05, - "loss": 0.9561, - "num_input_tokens_seen": 2546160, - "step": 158 - }, - { - "epoch": 0.011137671070950117, - "grad_norm": 4.631777286529541, - "learning_rate": 9.888727845884414e-05, - "loss": 1.1553, - "num_input_tokens_seen": 2562544, - "step": 159 - }, - { - "epoch": 0.011207719316679363, - "grad_norm": 4.896609783172607, - "learning_rate": 9.888028021015762e-05, - "loss": 1.1779, - "num_input_tokens_seen": 2578088, - "step": 160 - }, - { - "epoch": 0.011277767562408609, - "grad_norm": 4.3978681564331055, - "learning_rate": 9.88732819614711e-05, - "loss": 1.1778, - "num_input_tokens_seen": 2594416, - "step": 161 - }, - { - "epoch": 0.011347815808137854, - "grad_norm": 4.82927942276001, - "learning_rate": 9.886628371278459e-05, - "loss": 1.0339, - "num_input_tokens_seen": 2609776, - "step": 162 - }, - { - "epoch": 0.011417864053867102, - "grad_norm": 4.413319110870361, - "learning_rate": 9.885928546409809e-05, - "loss": 1.0992, - "num_input_tokens_seen": 2626160, - "step": 163 - }, - { - "epoch": 0.011487912299596347, - "grad_norm": 4.626354694366455, - "learning_rate": 9.885228721541156e-05, - "loss": 1.1948, - "num_input_tokens_seen": 2642464, - "step": 164 - }, - { - "epoch": 0.011557960545325593, - "grad_norm": 4.328434467315674, - "learning_rate": 9.884528896672505e-05, - "loss": 1.1493, - "num_input_tokens_seen": 2658528, - "step": 165 - }, - { - "epoch": 0.011628008791054838, - "grad_norm": 4.57839822769165, - "learning_rate": 9.883829071803853e-05, - "loss": 1.0775, - "num_input_tokens_seen": 2674912, - "step": 166 - }, - { - "epoch": 0.011698057036784086, - "grad_norm": 5.103973865509033, - "learning_rate": 9.883129246935202e-05, - "loss": 1.2458, - "num_input_tokens_seen": 2690792, - "step": 167 - }, - { - "epoch": 0.011768105282513331, - "grad_norm": 4.558016300201416, - "learning_rate": 9.88242942206655e-05, - "loss": 1.0122, - "num_input_tokens_seen": 2705616, - "step": 168 - }, - { - "epoch": 0.011838153528242577, - "grad_norm": 4.811260223388672, - "learning_rate": 9.8817295971979e-05, - "loss": 1.2989, - "num_input_tokens_seen": 2721704, - "step": 169 - }, - { - "epoch": 0.011908201773971823, - "grad_norm": 4.726966857910156, - "learning_rate": 9.881029772329248e-05, - "loss": 1.176, - "num_input_tokens_seen": 2738088, - "step": 170 - }, - { - "epoch": 0.01197825001970107, - "grad_norm": 4.874902725219727, - "learning_rate": 9.880329947460596e-05, - "loss": 1.2586, - "num_input_tokens_seen": 2754040, - "step": 171 - }, - { - "epoch": 0.012048298265430316, - "grad_norm": 4.379549980163574, - "learning_rate": 9.879630122591945e-05, - "loss": 1.1771, - "num_input_tokens_seen": 2770424, - "step": 172 - }, - { - "epoch": 0.012118346511159561, - "grad_norm": 4.455331802368164, - "learning_rate": 9.878930297723292e-05, - "loss": 1.0714, - "num_input_tokens_seen": 2786808, - "step": 173 - }, - { - "epoch": 0.012188394756888807, - "grad_norm": 4.42273473739624, - "learning_rate": 9.878230472854641e-05, - "loss": 1.1798, - "num_input_tokens_seen": 2803176, - "step": 174 - }, - { - "epoch": 0.012258443002618052, - "grad_norm": 4.4078874588012695, - "learning_rate": 9.87753064798599e-05, - "loss": 1.1672, - "num_input_tokens_seen": 2819448, - "step": 175 - }, - { - "epoch": 0.0123284912483473, - "grad_norm": 4.79048490524292, - "learning_rate": 9.876830823117339e-05, - "loss": 1.3331, - "num_input_tokens_seen": 2835832, - "step": 176 - }, - { - "epoch": 0.012398539494076545, - "grad_norm": 4.212133884429932, - "learning_rate": 9.876130998248688e-05, - "loss": 1.0007, - "num_input_tokens_seen": 2851776, - "step": 177 - }, - { - "epoch": 0.012468587739805791, - "grad_norm": 5.7587738037109375, - "learning_rate": 9.875431173380035e-05, - "loss": 1.4729, - "num_input_tokens_seen": 2867896, - "step": 178 - }, - { - "epoch": 0.012538635985535037, - "grad_norm": 4.3469462394714355, - "learning_rate": 9.874731348511384e-05, - "loss": 0.957, - "num_input_tokens_seen": 2884280, - "step": 179 - }, - { - "epoch": 0.012608684231264284, - "grad_norm": 4.584625244140625, - "learning_rate": 9.874031523642733e-05, - "loss": 1.0753, - "num_input_tokens_seen": 2899208, - "step": 180 - }, - { - "epoch": 0.01267873247699353, - "grad_norm": 4.544627666473389, - "learning_rate": 9.87333169877408e-05, - "loss": 1.1706, - "num_input_tokens_seen": 2915416, - "step": 181 - }, - { - "epoch": 0.012748780722722775, - "grad_norm": 4.8749237060546875, - "learning_rate": 9.872631873905429e-05, - "loss": 1.3382, - "num_input_tokens_seen": 2931360, - "step": 182 - }, - { - "epoch": 0.01281882896845202, - "grad_norm": 4.593903541564941, - "learning_rate": 9.871932049036778e-05, - "loss": 1.1588, - "num_input_tokens_seen": 2947744, - "step": 183 - }, - { - "epoch": 0.012888877214181268, - "grad_norm": 4.478219509124756, - "learning_rate": 9.871232224168127e-05, - "loss": 1.1013, - "num_input_tokens_seen": 2963664, - "step": 184 - }, - { - "epoch": 0.012958925459910514, - "grad_norm": 5.028106212615967, - "learning_rate": 9.870532399299476e-05, - "loss": 1.3223, - "num_input_tokens_seen": 2980048, - "step": 185 - }, - { - "epoch": 0.01302897370563976, - "grad_norm": 4.866946697235107, - "learning_rate": 9.869832574430823e-05, - "loss": 1.2376, - "num_input_tokens_seen": 2995992, - "step": 186 - }, - { - "epoch": 0.013099021951369005, - "grad_norm": 4.421341419219971, - "learning_rate": 9.869132749562172e-05, - "loss": 1.2252, - "num_input_tokens_seen": 3012000, - "step": 187 - }, - { - "epoch": 0.01316907019709825, - "grad_norm": 4.88083028793335, - "learning_rate": 9.86843292469352e-05, - "loss": 1.2951, - "num_input_tokens_seen": 3028384, - "step": 188 - }, - { - "epoch": 0.013239118442827498, - "grad_norm": 4.654318809509277, - "learning_rate": 9.86773309982487e-05, - "loss": 1.2839, - "num_input_tokens_seen": 3044768, - "step": 189 - }, - { - "epoch": 0.013309166688556744, - "grad_norm": 4.626763820648193, - "learning_rate": 9.867033274956219e-05, - "loss": 1.2389, - "num_input_tokens_seen": 3061152, - "step": 190 - }, - { - "epoch": 0.01337921493428599, - "grad_norm": 4.178484916687012, - "learning_rate": 9.866333450087566e-05, - "loss": 1.1186, - "num_input_tokens_seen": 3077056, - "step": 191 - }, - { - "epoch": 0.013449263180015235, - "grad_norm": 4.755034923553467, - "learning_rate": 9.865633625218915e-05, - "loss": 1.0594, - "num_input_tokens_seen": 3093400, - "step": 192 - }, - { - "epoch": 0.013519311425744482, - "grad_norm": 4.437506198883057, - "learning_rate": 9.864933800350263e-05, - "loss": 1.2078, - "num_input_tokens_seen": 3109784, - "step": 193 - }, - { - "epoch": 0.013589359671473728, - "grad_norm": 5.140488624572754, - "learning_rate": 9.864233975481611e-05, - "loss": 1.4312, - "num_input_tokens_seen": 3124976, - "step": 194 - }, - { - "epoch": 0.013659407917202973, - "grad_norm": 4.72155237197876, - "learning_rate": 9.86353415061296e-05, - "loss": 1.1752, - "num_input_tokens_seen": 3140632, - "step": 195 - }, - { - "epoch": 0.013729456162932219, - "grad_norm": 4.914645671844482, - "learning_rate": 9.862834325744309e-05, - "loss": 1.2464, - "num_input_tokens_seen": 3156616, - "step": 196 - }, - { - "epoch": 0.013799504408661466, - "grad_norm": 4.23387336730957, - "learning_rate": 9.862134500875658e-05, - "loss": 0.9722, - "num_input_tokens_seen": 3172840, - "step": 197 - }, - { - "epoch": 0.013869552654390712, - "grad_norm": 4.659370422363281, - "learning_rate": 9.861434676007005e-05, - "loss": 1.1981, - "num_input_tokens_seen": 3188584, - "step": 198 - }, - { - "epoch": 0.013939600900119958, - "grad_norm": 4.580902576446533, - "learning_rate": 9.860734851138354e-05, - "loss": 1.1913, - "num_input_tokens_seen": 3204432, - "step": 199 - }, - { - "epoch": 0.014009649145849203, - "grad_norm": 4.208237648010254, - "learning_rate": 9.860035026269702e-05, - "loss": 1.2056, - "num_input_tokens_seen": 3220816, - "step": 200 - }, - { - "epoch": 0.014009649145849203, - "eval_loss": 1.2226407527923584, - "eval_runtime": 0.3992, - "eval_samples_per_second": 2.505, - "eval_steps_per_second": 2.505, - "num_input_tokens_seen": 3220816, - "step": 200 - }, - { - "epoch": 0.014079697391578449, - "grad_norm": 4.526260852813721, - "learning_rate": 9.85933520140105e-05, - "loss": 1.0488, - "num_input_tokens_seen": 3237200, - "step": 201 - }, - { - "epoch": 0.014149745637307696, - "grad_norm": 4.46895170211792, - "learning_rate": 9.8586353765324e-05, - "loss": 1.1101, - "num_input_tokens_seen": 3253336, - "step": 202 - }, - { - "epoch": 0.014219793883036942, - "grad_norm": 4.367347717285156, - "learning_rate": 9.857935551663748e-05, - "loss": 1.0425, - "num_input_tokens_seen": 3269632, - "step": 203 - }, - { - "epoch": 0.014289842128766187, - "grad_norm": 4.860860347747803, - "learning_rate": 9.857235726795097e-05, - "loss": 1.4068, - "num_input_tokens_seen": 3285432, - "step": 204 - }, - { - "epoch": 0.014359890374495433, - "grad_norm": 4.336480617523193, - "learning_rate": 9.856535901926445e-05, - "loss": 1.2579, - "num_input_tokens_seen": 3301632, - "step": 205 - }, - { - "epoch": 0.01442993862022468, - "grad_norm": 4.587873458862305, - "learning_rate": 9.855836077057794e-05, - "loss": 1.1508, - "num_input_tokens_seen": 3318016, - "step": 206 - }, - { - "epoch": 0.014499986865953926, - "grad_norm": 4.719262599945068, - "learning_rate": 9.855136252189142e-05, - "loss": 1.0208, - "num_input_tokens_seen": 3333168, - "step": 207 - }, - { - "epoch": 0.014570035111683172, - "grad_norm": 4.419138431549072, - "learning_rate": 9.85443642732049e-05, - "loss": 1.2576, - "num_input_tokens_seen": 3349384, - "step": 208 - }, - { - "epoch": 0.014640083357412417, - "grad_norm": 4.3150835037231445, - "learning_rate": 9.85373660245184e-05, - "loss": 1.1786, - "num_input_tokens_seen": 3365768, - "step": 209 - }, - { - "epoch": 0.014710131603141665, - "grad_norm": 4.5917649269104, - "learning_rate": 9.853036777583188e-05, - "loss": 1.2821, - "num_input_tokens_seen": 3382152, - "step": 210 - }, - { - "epoch": 0.01478017984887091, - "grad_norm": 4.9094343185424805, - "learning_rate": 9.852336952714537e-05, - "loss": 1.2415, - "num_input_tokens_seen": 3397896, - "step": 211 - }, - { - "epoch": 0.014850228094600156, - "grad_norm": 4.394861698150635, - "learning_rate": 9.851637127845885e-05, - "loss": 1.1776, - "num_input_tokens_seen": 3414280, - "step": 212 - }, - { - "epoch": 0.014920276340329401, - "grad_norm": 4.196374416351318, - "learning_rate": 9.850937302977233e-05, - "loss": 1.065, - "num_input_tokens_seen": 3430584, - "step": 213 - }, - { - "epoch": 0.014990324586058647, - "grad_norm": 4.728682518005371, - "learning_rate": 9.850237478108582e-05, - "loss": 1.2686, - "num_input_tokens_seen": 3446968, - "step": 214 - }, - { - "epoch": 0.015060372831787894, - "grad_norm": 4.291411876678467, - "learning_rate": 9.84953765323993e-05, - "loss": 1.1877, - "num_input_tokens_seen": 3462568, - "step": 215 - }, - { - "epoch": 0.01513042107751714, - "grad_norm": 4.405060768127441, - "learning_rate": 9.84883782837128e-05, - "loss": 1.2873, - "num_input_tokens_seen": 3478952, - "step": 216 - }, - { - "epoch": 0.015200469323246386, - "grad_norm": 4.254365921020508, - "learning_rate": 9.848138003502628e-05, - "loss": 1.1062, - "num_input_tokens_seen": 3495304, - "step": 217 - }, - { - "epoch": 0.015270517568975631, - "grad_norm": 4.741672039031982, - "learning_rate": 9.847438178633976e-05, - "loss": 1.1983, - "num_input_tokens_seen": 3511688, - "step": 218 - }, - { - "epoch": 0.015340565814704879, - "grad_norm": 4.352742671966553, - "learning_rate": 9.846738353765325e-05, - "loss": 1.2028, - "num_input_tokens_seen": 3528072, - "step": 219 - }, - { - "epoch": 0.015410614060434124, - "grad_norm": 4.996603488922119, - "learning_rate": 9.846038528896672e-05, - "loss": 1.1561, - "num_input_tokens_seen": 3542904, - "step": 220 - }, - { - "epoch": 0.01548066230616337, - "grad_norm": 4.911815166473389, - "learning_rate": 9.845338704028021e-05, - "loss": 1.3375, - "num_input_tokens_seen": 3558352, - "step": 221 - }, - { - "epoch": 0.015550710551892616, - "grad_norm": 4.638419151306152, - "learning_rate": 9.84463887915937e-05, - "loss": 1.1963, - "num_input_tokens_seen": 3574736, - "step": 222 - }, - { - "epoch": 0.015620758797621863, - "grad_norm": 4.323521614074707, - "learning_rate": 9.843939054290719e-05, - "loss": 1.1224, - "num_input_tokens_seen": 3591120, - "step": 223 - }, - { - "epoch": 0.01569080704335111, - "grad_norm": 4.466544151306152, - "learning_rate": 9.843239229422068e-05, - "loss": 1.3988, - "num_input_tokens_seen": 3607392, - "step": 224 - }, - { - "epoch": 0.015760855289080354, - "grad_norm": 4.476973533630371, - "learning_rate": 9.842539404553415e-05, - "loss": 1.184, - "num_input_tokens_seen": 3623776, - "step": 225 - }, - { - "epoch": 0.0158309035348096, - "grad_norm": 4.648625373840332, - "learning_rate": 9.841839579684764e-05, - "loss": 1.1768, - "num_input_tokens_seen": 3640008, - "step": 226 - }, - { - "epoch": 0.015900951780538845, - "grad_norm": 4.364476203918457, - "learning_rate": 9.841139754816112e-05, - "loss": 1.0208, - "num_input_tokens_seen": 3656392, - "step": 227 - }, - { - "epoch": 0.01597100002626809, - "grad_norm": 4.3054633140563965, - "learning_rate": 9.84043992994746e-05, - "loss": 1.1215, - "num_input_tokens_seen": 3672392, - "step": 228 - }, - { - "epoch": 0.016041048271997337, - "grad_norm": 4.83436918258667, - "learning_rate": 9.83974010507881e-05, - "loss": 1.2284, - "num_input_tokens_seen": 3688776, - "step": 229 - }, - { - "epoch": 0.016111096517726586, - "grad_norm": 4.447519779205322, - "learning_rate": 9.839040280210158e-05, - "loss": 1.1765, - "num_input_tokens_seen": 3705080, - "step": 230 - }, - { - "epoch": 0.01618114476345583, - "grad_norm": 4.269217491149902, - "learning_rate": 9.838340455341507e-05, - "loss": 1.0466, - "num_input_tokens_seen": 3721464, - "step": 231 - }, - { - "epoch": 0.016251193009185077, - "grad_norm": 4.41223669052124, - "learning_rate": 9.837640630472854e-05, - "loss": 1.2098, - "num_input_tokens_seen": 3737184, - "step": 232 - }, - { - "epoch": 0.016321241254914323, - "grad_norm": 4.632737159729004, - "learning_rate": 9.836940805604203e-05, - "loss": 1.1562, - "num_input_tokens_seen": 3753192, - "step": 233 - }, - { - "epoch": 0.016391289500643568, - "grad_norm": 4.379425525665283, - "learning_rate": 9.836240980735552e-05, - "loss": 1.1219, - "num_input_tokens_seen": 3767976, - "step": 234 - }, - { - "epoch": 0.016461337746372814, - "grad_norm": 4.28551721572876, - "learning_rate": 9.835541155866901e-05, - "loss": 1.0259, - "num_input_tokens_seen": 3784008, - "step": 235 - }, - { - "epoch": 0.01653138599210206, - "grad_norm": 4.642453670501709, - "learning_rate": 9.83484133099825e-05, - "loss": 1.1684, - "num_input_tokens_seen": 3800000, - "step": 236 - }, - { - "epoch": 0.016601434237831305, - "grad_norm": 4.367178440093994, - "learning_rate": 9.834141506129597e-05, - "loss": 1.2877, - "num_input_tokens_seen": 3816384, - "step": 237 - }, - { - "epoch": 0.01667148248356055, - "grad_norm": 4.5724005699157715, - "learning_rate": 9.833441681260946e-05, - "loss": 1.1814, - "num_input_tokens_seen": 3830328, - "step": 238 - }, - { - "epoch": 0.0167415307292898, - "grad_norm": 4.318159580230713, - "learning_rate": 9.832741856392295e-05, - "loss": 1.1143, - "num_input_tokens_seen": 3846712, - "step": 239 - }, - { - "epoch": 0.016811578975019045, - "grad_norm": 4.408501625061035, - "learning_rate": 9.832042031523643e-05, - "loss": 1.1508, - "num_input_tokens_seen": 3861776, - "step": 240 - }, - { - "epoch": 0.01688162722074829, - "grad_norm": 4.20060920715332, - "learning_rate": 9.831342206654991e-05, - "loss": 1.209, - "num_input_tokens_seen": 3877736, - "step": 241 - }, - { - "epoch": 0.016951675466477537, - "grad_norm": 4.431649208068848, - "learning_rate": 9.83064238178634e-05, - "loss": 1.2458, - "num_input_tokens_seen": 3893320, - "step": 242 - }, - { - "epoch": 0.017021723712206782, - "grad_norm": 4.000490188598633, - "learning_rate": 9.829942556917689e-05, - "loss": 1.0274, - "num_input_tokens_seen": 3909704, - "step": 243 - }, - { - "epoch": 0.017091771957936028, - "grad_norm": 4.703495025634766, - "learning_rate": 9.829242732049038e-05, - "loss": 1.1711, - "num_input_tokens_seen": 3925808, - "step": 244 - }, - { - "epoch": 0.017161820203665273, - "grad_norm": 4.639338970184326, - "learning_rate": 9.828542907180386e-05, - "loss": 1.3046, - "num_input_tokens_seen": 3942192, - "step": 245 - }, - { - "epoch": 0.01723186844939452, - "grad_norm": 4.414276599884033, - "learning_rate": 9.827843082311734e-05, - "loss": 1.271, - "num_input_tokens_seen": 3958528, - "step": 246 - }, - { - "epoch": 0.017301916695123768, - "grad_norm": 4.404853820800781, - "learning_rate": 9.827143257443082e-05, - "loss": 1.0693, - "num_input_tokens_seen": 3974912, - "step": 247 - }, - { - "epoch": 0.017371964940853014, - "grad_norm": 4.519491195678711, - "learning_rate": 9.826443432574431e-05, - "loss": 1.2894, - "num_input_tokens_seen": 3991296, - "step": 248 - }, - { - "epoch": 0.01744201318658226, - "grad_norm": 4.261727809906006, - "learning_rate": 9.825743607705781e-05, - "loss": 1.2059, - "num_input_tokens_seen": 4006544, - "step": 249 - }, - { - "epoch": 0.017512061432311505, - "grad_norm": 4.102485656738281, - "learning_rate": 9.825043782837129e-05, - "loss": 0.9365, - "num_input_tokens_seen": 4022320, - "step": 250 - }, - { - "epoch": 0.01758210967804075, - "grad_norm": 4.804764270782471, - "learning_rate": 9.824343957968477e-05, - "loss": 1.3344, - "num_input_tokens_seen": 4037048, - "step": 251 - }, - { - "epoch": 0.017652157923769996, - "grad_norm": 4.130600452423096, - "learning_rate": 9.823644133099825e-05, - "loss": 1.2349, - "num_input_tokens_seen": 4053432, - "step": 252 - }, - { - "epoch": 0.017722206169499242, - "grad_norm": 4.234742641448975, - "learning_rate": 9.822944308231174e-05, - "loss": 1.1371, - "num_input_tokens_seen": 4069816, - "step": 253 - }, - { - "epoch": 0.017792254415228487, - "grad_norm": 4.754928112030029, - "learning_rate": 9.822244483362521e-05, - "loss": 1.5168, - "num_input_tokens_seen": 4085864, - "step": 254 - }, - { - "epoch": 0.017862302660957733, - "grad_norm": 4.542768478393555, - "learning_rate": 9.821544658493871e-05, - "loss": 1.1943, - "num_input_tokens_seen": 4102240, - "step": 255 - }, - { - "epoch": 0.017932350906686982, - "grad_norm": 4.411310195922852, - "learning_rate": 9.82084483362522e-05, - "loss": 1.2694, - "num_input_tokens_seen": 4118544, - "step": 256 - }, - { - "epoch": 0.018002399152416228, - "grad_norm": 4.205377101898193, - "learning_rate": 9.820145008756568e-05, - "loss": 1.1581, - "num_input_tokens_seen": 4134928, - "step": 257 - }, - { - "epoch": 0.018072447398145473, - "grad_norm": 4.451165199279785, - "learning_rate": 9.819445183887917e-05, - "loss": 1.089, - "num_input_tokens_seen": 4150848, - "step": 258 - }, - { - "epoch": 0.01814249564387472, - "grad_norm": 4.366336822509766, - "learning_rate": 9.818745359019264e-05, - "loss": 1.1767, - "num_input_tokens_seen": 4167184, - "step": 259 - }, - { - "epoch": 0.018212543889603965, - "grad_norm": 4.394649982452393, - "learning_rate": 9.818045534150613e-05, - "loss": 1.0741, - "num_input_tokens_seen": 4183376, - "step": 260 - }, - { - "epoch": 0.01828259213533321, - "grad_norm": 4.344518184661865, - "learning_rate": 9.817345709281962e-05, - "loss": 1.2282, - "num_input_tokens_seen": 4199760, - "step": 261 - }, - { - "epoch": 0.018352640381062456, - "grad_norm": 4.403041362762451, - "learning_rate": 9.816645884413311e-05, - "loss": 1.2317, - "num_input_tokens_seen": 4215816, - "step": 262 - }, - { - "epoch": 0.0184226886267917, - "grad_norm": 4.715320110321045, - "learning_rate": 9.81594605954466e-05, - "loss": 1.3074, - "num_input_tokens_seen": 4231504, - "step": 263 - }, - { - "epoch": 0.01849273687252095, - "grad_norm": 4.5754265785217285, - "learning_rate": 9.815246234676007e-05, - "loss": 1.253, - "num_input_tokens_seen": 4247888, - "step": 264 - }, - { - "epoch": 0.018562785118250196, - "grad_norm": 4.2346930503845215, - "learning_rate": 9.814546409807356e-05, - "loss": 1.1727, - "num_input_tokens_seen": 4264248, - "step": 265 - }, - { - "epoch": 0.018632833363979442, - "grad_norm": 4.186713218688965, - "learning_rate": 9.813846584938705e-05, - "loss": 1.2693, - "num_input_tokens_seen": 4280632, - "step": 266 - }, - { - "epoch": 0.018702881609708687, - "grad_norm": 4.6356706619262695, - "learning_rate": 9.813146760070052e-05, - "loss": 1.3755, - "num_input_tokens_seen": 4296648, - "step": 267 - }, - { - "epoch": 0.018772929855437933, - "grad_norm": 4.466466903686523, - "learning_rate": 9.812446935201401e-05, - "loss": 1.283, - "num_input_tokens_seen": 4311408, - "step": 268 - }, - { - "epoch": 0.01884297810116718, - "grad_norm": 4.3369140625, - "learning_rate": 9.81174711033275e-05, - "loss": 1.1555, - "num_input_tokens_seen": 4326736, - "step": 269 - }, - { - "epoch": 0.018913026346896424, - "grad_norm": 4.434782028198242, - "learning_rate": 9.811047285464099e-05, - "loss": 1.2859, - "num_input_tokens_seen": 4343120, - "step": 270 - }, - { - "epoch": 0.01898307459262567, - "grad_norm": 4.346708297729492, - "learning_rate": 9.810347460595448e-05, - "loss": 1.1421, - "num_input_tokens_seen": 4359504, - "step": 271 - }, - { - "epoch": 0.019053122838354915, - "grad_norm": 4.529878616333008, - "learning_rate": 9.809647635726795e-05, - "loss": 1.2654, - "num_input_tokens_seen": 4375888, - "step": 272 - }, - { - "epoch": 0.019123171084084165, - "grad_norm": 4.051745891571045, - "learning_rate": 9.808947810858144e-05, - "loss": 1.1469, - "num_input_tokens_seen": 4392224, - "step": 273 - }, - { - "epoch": 0.01919321932981341, - "grad_norm": 4.403522491455078, - "learning_rate": 9.808247985989492e-05, - "loss": 1.233, - "num_input_tokens_seen": 4408608, - "step": 274 - }, - { - "epoch": 0.019263267575542656, - "grad_norm": 4.166261196136475, - "learning_rate": 9.807548161120842e-05, - "loss": 1.1697, - "num_input_tokens_seen": 4424992, - "step": 275 - }, - { - "epoch": 0.0193333158212719, - "grad_norm": 4.29187536239624, - "learning_rate": 9.806848336252191e-05, - "loss": 1.0503, - "num_input_tokens_seen": 4441376, - "step": 276 - }, - { - "epoch": 0.019403364067001147, - "grad_norm": 4.4056172370910645, - "learning_rate": 9.806148511383538e-05, - "loss": 1.1965, - "num_input_tokens_seen": 4457760, - "step": 277 - }, - { - "epoch": 0.019473412312730393, - "grad_norm": 4.355875015258789, - "learning_rate": 9.805448686514887e-05, - "loss": 1.1024, - "num_input_tokens_seen": 4474144, - "step": 278 - }, - { - "epoch": 0.019543460558459638, - "grad_norm": 4.46420955657959, - "learning_rate": 9.804748861646235e-05, - "loss": 1.203, - "num_input_tokens_seen": 4488912, - "step": 279 - }, - { - "epoch": 0.019613508804188884, - "grad_norm": 4.48052453994751, - "learning_rate": 9.804049036777583e-05, - "loss": 1.2089, - "num_input_tokens_seen": 4505296, - "step": 280 - }, - { - "epoch": 0.01968355704991813, - "grad_norm": 4.458749294281006, - "learning_rate": 9.803349211908932e-05, - "loss": 1.1557, - "num_input_tokens_seen": 4520576, - "step": 281 - }, - { - "epoch": 0.01975360529564738, - "grad_norm": 4.551771640777588, - "learning_rate": 9.802649387040281e-05, - "loss": 1.1671, - "num_input_tokens_seen": 4536960, - "step": 282 - }, - { - "epoch": 0.019823653541376624, - "grad_norm": 4.038064956665039, - "learning_rate": 9.80194956217163e-05, - "loss": 1.1562, - "num_input_tokens_seen": 4553344, - "step": 283 - }, - { - "epoch": 0.01989370178710587, - "grad_norm": 4.647075653076172, - "learning_rate": 9.801249737302978e-05, - "loss": 1.3069, - "num_input_tokens_seen": 4568928, - "step": 284 - }, - { - "epoch": 0.019963750032835115, - "grad_norm": 4.258941650390625, - "learning_rate": 9.800549912434326e-05, - "loss": 1.0349, - "num_input_tokens_seen": 4585312, - "step": 285 - }, - { - "epoch": 0.02003379827856436, - "grad_norm": 4.348769664764404, - "learning_rate": 9.799850087565674e-05, - "loss": 1.1163, - "num_input_tokens_seen": 4601696, - "step": 286 - }, - { - "epoch": 0.020103846524293607, - "grad_norm": 4.105901718139648, - "learning_rate": 9.799150262697023e-05, - "loss": 1.0313, - "num_input_tokens_seen": 4617312, - "step": 287 - }, - { - "epoch": 0.020173894770022852, - "grad_norm": 4.079495429992676, - "learning_rate": 9.798450437828372e-05, - "loss": 1.0828, - "num_input_tokens_seen": 4633696, - "step": 288 - }, - { - "epoch": 0.020243943015752098, - "grad_norm": 4.03472375869751, - "learning_rate": 9.79775061295972e-05, - "loss": 0.9475, - "num_input_tokens_seen": 4650080, - "step": 289 - }, - { - "epoch": 0.020313991261481347, - "grad_norm": 4.077049732208252, - "learning_rate": 9.797050788091069e-05, - "loss": 1.1323, - "num_input_tokens_seen": 4666328, - "step": 290 - }, - { - "epoch": 0.020384039507210593, - "grad_norm": 4.086606025695801, - "learning_rate": 9.796350963222417e-05, - "loss": 1.1218, - "num_input_tokens_seen": 4682256, - "step": 291 - }, - { - "epoch": 0.020454087752939838, - "grad_norm": 4.296900749206543, - "learning_rate": 9.795651138353766e-05, - "loss": 1.2964, - "num_input_tokens_seen": 4698640, - "step": 292 - }, - { - "epoch": 0.020524135998669084, - "grad_norm": 4.040759086608887, - "learning_rate": 9.794951313485115e-05, - "loss": 1.1077, - "num_input_tokens_seen": 4714928, - "step": 293 - }, - { - "epoch": 0.02059418424439833, - "grad_norm": 3.8260273933410645, - "learning_rate": 9.794251488616462e-05, - "loss": 0.9667, - "num_input_tokens_seen": 4731312, - "step": 294 - }, - { - "epoch": 0.020664232490127575, - "grad_norm": 4.294517993927002, - "learning_rate": 9.793551663747811e-05, - "loss": 1.2704, - "num_input_tokens_seen": 4747544, - "step": 295 - }, - { - "epoch": 0.02073428073585682, - "grad_norm": 4.206037521362305, - "learning_rate": 9.79285183887916e-05, - "loss": 1.1593, - "num_input_tokens_seen": 4763928, - "step": 296 - }, - { - "epoch": 0.020804328981586066, - "grad_norm": 4.147867202758789, - "learning_rate": 9.792152014010509e-05, - "loss": 1.1256, - "num_input_tokens_seen": 4780312, - "step": 297 - }, - { - "epoch": 0.020874377227315312, - "grad_norm": 4.23718786239624, - "learning_rate": 9.791452189141857e-05, - "loss": 1.2353, - "num_input_tokens_seen": 4796384, - "step": 298 - }, - { - "epoch": 0.02094442547304456, - "grad_norm": 4.172685146331787, - "learning_rate": 9.790752364273205e-05, - "loss": 1.1868, - "num_input_tokens_seen": 4812768, - "step": 299 - }, - { - "epoch": 0.021014473718773807, - "grad_norm": 4.167289733886719, - "learning_rate": 9.790052539404554e-05, - "loss": 1.0606, - "num_input_tokens_seen": 4829152, - "step": 300 - }, - { - "epoch": 0.021084521964503052, - "grad_norm": 4.096963882446289, - "learning_rate": 9.789352714535903e-05, - "loss": 1.0557, - "num_input_tokens_seen": 4845384, - "step": 301 - }, - { - "epoch": 0.021154570210232298, - "grad_norm": 4.223779678344727, - "learning_rate": 9.788652889667252e-05, - "loss": 1.1485, - "num_input_tokens_seen": 4861768, - "step": 302 - }, - { - "epoch": 0.021224618455961543, - "grad_norm": 3.8243472576141357, - "learning_rate": 9.7879530647986e-05, - "loss": 1.004, - "num_input_tokens_seen": 4878152, - "step": 303 - }, - { - "epoch": 0.02129466670169079, - "grad_norm": 4.092590808868408, - "learning_rate": 9.787253239929948e-05, - "loss": 1.0211, - "num_input_tokens_seen": 4894536, - "step": 304 - }, - { - "epoch": 0.021364714947420035, - "grad_norm": 4.42412805557251, - "learning_rate": 9.786553415061297e-05, - "loss": 0.9915, - "num_input_tokens_seen": 4910320, - "step": 305 - }, - { - "epoch": 0.02143476319314928, - "grad_norm": 4.488316535949707, - "learning_rate": 9.785853590192644e-05, - "loss": 1.1782, - "num_input_tokens_seen": 4926704, - "step": 306 - }, - { - "epoch": 0.021504811438878526, - "grad_norm": 4.110256195068359, - "learning_rate": 9.785153765323993e-05, - "loss": 1.102, - "num_input_tokens_seen": 4943088, - "step": 307 - }, - { - "epoch": 0.021574859684607775, - "grad_norm": 4.246950149536133, - "learning_rate": 9.784453940455342e-05, - "loss": 1.067, - "num_input_tokens_seen": 4958736, - "step": 308 - }, - { - "epoch": 0.02164490793033702, - "grad_norm": 4.175214767456055, - "learning_rate": 9.783754115586691e-05, - "loss": 1.0638, - "num_input_tokens_seen": 4975120, - "step": 309 - }, - { - "epoch": 0.021714956176066266, - "grad_norm": 4.427795886993408, - "learning_rate": 9.78305429071804e-05, - "loss": 1.1347, - "num_input_tokens_seen": 4991504, - "step": 310 - }, - { - "epoch": 0.021785004421795512, - "grad_norm": 4.158191204071045, - "learning_rate": 9.782354465849387e-05, - "loss": 1.1662, - "num_input_tokens_seen": 5007152, - "step": 311 - }, - { - "epoch": 0.021855052667524758, - "grad_norm": 4.184347629547119, - "learning_rate": 9.781654640980736e-05, - "loss": 1.0791, - "num_input_tokens_seen": 5023536, - "step": 312 - }, - { - "epoch": 0.021925100913254003, - "grad_norm": 3.8506295680999756, - "learning_rate": 9.780954816112084e-05, - "loss": 1.0615, - "num_input_tokens_seen": 5039728, - "step": 313 - }, - { - "epoch": 0.02199514915898325, - "grad_norm": 4.310062408447266, - "learning_rate": 9.780254991243432e-05, - "loss": 1.1363, - "num_input_tokens_seen": 5056008, - "step": 314 - }, - { - "epoch": 0.022065197404712494, - "grad_norm": 4.215006351470947, - "learning_rate": 9.779555166374781e-05, - "loss": 1.1715, - "num_input_tokens_seen": 5072096, - "step": 315 - }, - { - "epoch": 0.022135245650441743, - "grad_norm": 4.219073295593262, - "learning_rate": 9.77885534150613e-05, - "loss": 1.219, - "num_input_tokens_seen": 5088432, - "step": 316 - }, - { - "epoch": 0.02220529389617099, - "grad_norm": 4.319522857666016, - "learning_rate": 9.778155516637479e-05, - "loss": 1.3085, - "num_input_tokens_seen": 5104240, - "step": 317 - }, - { - "epoch": 0.022275342141900235, - "grad_norm": 4.118961334228516, - "learning_rate": 9.777455691768827e-05, - "loss": 1.0926, - "num_input_tokens_seen": 5120624, - "step": 318 - }, - { - "epoch": 0.02234539038762948, - "grad_norm": 4.195051193237305, - "learning_rate": 9.776755866900175e-05, - "loss": 1.0894, - "num_input_tokens_seen": 5137008, - "step": 319 - }, - { - "epoch": 0.022415438633358726, - "grad_norm": 4.114197254180908, - "learning_rate": 9.776056042031524e-05, - "loss": 1.1897, - "num_input_tokens_seen": 5153272, - "step": 320 - }, - { - "epoch": 0.02248548687908797, - "grad_norm": 4.014908313751221, - "learning_rate": 9.775356217162872e-05, - "loss": 1.0932, - "num_input_tokens_seen": 5169472, - "step": 321 - }, - { - "epoch": 0.022555535124817217, - "grad_norm": 4.190642356872559, - "learning_rate": 9.774656392294222e-05, - "loss": 1.1413, - "num_input_tokens_seen": 5185856, - "step": 322 - }, - { - "epoch": 0.022625583370546463, - "grad_norm": 4.562993049621582, - "learning_rate": 9.77395656742557e-05, - "loss": 1.2865, - "num_input_tokens_seen": 5202240, - "step": 323 - }, - { - "epoch": 0.02269563161627571, - "grad_norm": 4.607022762298584, - "learning_rate": 9.773256742556918e-05, - "loss": 1.1465, - "num_input_tokens_seen": 5218168, - "step": 324 - }, - { - "epoch": 0.022765679862004957, - "grad_norm": 3.956439256668091, - "learning_rate": 9.772556917688267e-05, - "loss": 1.028, - "num_input_tokens_seen": 5234368, - "step": 325 - }, - { - "epoch": 0.022835728107734203, - "grad_norm": 4.20713472366333, - "learning_rate": 9.771857092819615e-05, - "loss": 1.2332, - "num_input_tokens_seen": 5249808, - "step": 326 - }, - { - "epoch": 0.02290577635346345, - "grad_norm": 4.4092864990234375, - "learning_rate": 9.771157267950964e-05, - "loss": 1.104, - "num_input_tokens_seen": 5266120, - "step": 327 - }, - { - "epoch": 0.022975824599192694, - "grad_norm": 4.529845237731934, - "learning_rate": 9.770457443082312e-05, - "loss": 1.3475, - "num_input_tokens_seen": 5282504, - "step": 328 - }, - { - "epoch": 0.02304587284492194, - "grad_norm": 4.221986293792725, - "learning_rate": 9.769757618213661e-05, - "loss": 1.4115, - "num_input_tokens_seen": 5298344, - "step": 329 - }, - { - "epoch": 0.023115921090651186, - "grad_norm": 4.29000186920166, - "learning_rate": 9.76905779334501e-05, - "loss": 1.2855, - "num_input_tokens_seen": 5314728, - "step": 330 - }, - { - "epoch": 0.02318596933638043, - "grad_norm": 4.426812648773193, - "learning_rate": 9.768357968476358e-05, - "loss": 1.514, - "num_input_tokens_seen": 5330816, - "step": 331 - }, - { - "epoch": 0.023256017582109677, - "grad_norm": 4.210752964019775, - "learning_rate": 9.767658143607706e-05, - "loss": 1.0854, - "num_input_tokens_seen": 5346552, - "step": 332 - }, - { - "epoch": 0.023326065827838922, - "grad_norm": 4.216427326202393, - "learning_rate": 9.766958318739054e-05, - "loss": 1.1573, - "num_input_tokens_seen": 5362936, - "step": 333 - }, - { - "epoch": 0.02339611407356817, - "grad_norm": 4.132325649261475, - "learning_rate": 9.766258493870403e-05, - "loss": 1.0942, - "num_input_tokens_seen": 5379320, - "step": 334 - }, - { - "epoch": 0.023466162319297417, - "grad_norm": 4.277027130126953, - "learning_rate": 9.765558669001752e-05, - "loss": 1.1227, - "num_input_tokens_seen": 5395704, - "step": 335 - }, - { - "epoch": 0.023536210565026663, - "grad_norm": 4.228096961975098, - "learning_rate": 9.7648588441331e-05, - "loss": 1.1094, - "num_input_tokens_seen": 5412088, - "step": 336 - }, - { - "epoch": 0.02360625881075591, - "grad_norm": 4.194522380828857, - "learning_rate": 9.76415901926445e-05, - "loss": 1.2066, - "num_input_tokens_seen": 5428472, - "step": 337 - }, - { - "epoch": 0.023676307056485154, - "grad_norm": 4.336326599121094, - "learning_rate": 9.763459194395797e-05, - "loss": 1.2251, - "num_input_tokens_seen": 5444856, - "step": 338 - }, - { - "epoch": 0.0237463553022144, - "grad_norm": 4.2723307609558105, - "learning_rate": 9.762759369527146e-05, - "loss": 1.0927, - "num_input_tokens_seen": 5460304, - "step": 339 - }, - { - "epoch": 0.023816403547943645, - "grad_norm": 4.190036773681641, - "learning_rate": 9.762059544658493e-05, - "loss": 1.2036, - "num_input_tokens_seen": 5476688, - "step": 340 - }, - { - "epoch": 0.02388645179367289, - "grad_norm": 4.477560043334961, - "learning_rate": 9.761359719789842e-05, - "loss": 1.362, - "num_input_tokens_seen": 5493072, - "step": 341 - }, - { - "epoch": 0.02395650003940214, - "grad_norm": 4.160232067108154, - "learning_rate": 9.760659894921192e-05, - "loss": 1.1602, - "num_input_tokens_seen": 5509456, - "step": 342 - }, - { - "epoch": 0.024026548285131386, - "grad_norm": 3.857335090637207, - "learning_rate": 9.75996007005254e-05, - "loss": 1.0963, - "num_input_tokens_seen": 5525840, - "step": 343 - }, - { - "epoch": 0.02409659653086063, - "grad_norm": 4.141246318817139, - "learning_rate": 9.759260245183889e-05, - "loss": 1.2009, - "num_input_tokens_seen": 5541888, - "step": 344 - }, - { - "epoch": 0.024166644776589877, - "grad_norm": 4.50364875793457, - "learning_rate": 9.758560420315236e-05, - "loss": 1.1483, - "num_input_tokens_seen": 5557848, - "step": 345 - }, - { - "epoch": 0.024236693022319122, - "grad_norm": 4.3343353271484375, - "learning_rate": 9.757860595446585e-05, - "loss": 1.3594, - "num_input_tokens_seen": 5573504, - "step": 346 - }, - { - "epoch": 0.024306741268048368, - "grad_norm": 4.050408363342285, - "learning_rate": 9.757160770577934e-05, - "loss": 1.0563, - "num_input_tokens_seen": 5589544, - "step": 347 - }, - { - "epoch": 0.024376789513777614, - "grad_norm": 4.051811695098877, - "learning_rate": 9.756460945709283e-05, - "loss": 1.0288, - "num_input_tokens_seen": 5605368, - "step": 348 - }, - { - "epoch": 0.02444683775950686, - "grad_norm": 4.365113258361816, - "learning_rate": 9.755761120840632e-05, - "loss": 1.3054, - "num_input_tokens_seen": 5621752, - "step": 349 - }, - { - "epoch": 0.024516886005236105, - "grad_norm": 4.0057501792907715, - "learning_rate": 9.755061295971979e-05, - "loss": 1.1302, - "num_input_tokens_seen": 5638136, - "step": 350 - }, - { - "epoch": 0.024586934250965354, - "grad_norm": 4.254896640777588, - "learning_rate": 9.754361471103328e-05, - "loss": 1.0495, - "num_input_tokens_seen": 5653168, - "step": 351 - }, - { - "epoch": 0.0246569824966946, - "grad_norm": 3.8119771480560303, - "learning_rate": 9.753661646234677e-05, - "loss": 1.0349, - "num_input_tokens_seen": 5669504, - "step": 352 - }, - { - "epoch": 0.024727030742423845, - "grad_norm": 4.5082621574401855, - "learning_rate": 9.752961821366024e-05, - "loss": 1.2537, - "num_input_tokens_seen": 5685168, - "step": 353 - }, - { - "epoch": 0.02479707898815309, - "grad_norm": 4.392731189727783, - "learning_rate": 9.752261996497373e-05, - "loss": 1.2534, - "num_input_tokens_seen": 5701240, - "step": 354 - }, - { - "epoch": 0.024867127233882336, - "grad_norm": 4.293395519256592, - "learning_rate": 9.751562171628722e-05, - "loss": 1.2774, - "num_input_tokens_seen": 5717624, - "step": 355 - }, - { - "epoch": 0.024937175479611582, - "grad_norm": 4.64813756942749, - "learning_rate": 9.750862346760071e-05, - "loss": 1.2795, - "num_input_tokens_seen": 5733104, - "step": 356 - }, - { - "epoch": 0.025007223725340828, - "grad_norm": 4.5166778564453125, - "learning_rate": 9.75016252189142e-05, - "loss": 1.1301, - "num_input_tokens_seen": 5749488, - "step": 357 - }, - { - "epoch": 0.025077271971070073, - "grad_norm": 3.894291400909424, - "learning_rate": 9.749462697022767e-05, - "loss": 0.901, - "num_input_tokens_seen": 5765872, - "step": 358 - }, - { - "epoch": 0.02514732021679932, - "grad_norm": 4.10056209564209, - "learning_rate": 9.748762872154116e-05, - "loss": 1.0529, - "num_input_tokens_seen": 5780856, - "step": 359 - }, - { - "epoch": 0.025217368462528568, - "grad_norm": 4.6277666091918945, - "learning_rate": 9.748063047285464e-05, - "loss": 1.3649, - "num_input_tokens_seen": 5796856, - "step": 360 - }, - { - "epoch": 0.025287416708257814, - "grad_norm": 4.029720306396484, - "learning_rate": 9.747363222416813e-05, - "loss": 0.8863, - "num_input_tokens_seen": 5812176, - "step": 361 - }, - { - "epoch": 0.02535746495398706, - "grad_norm": 3.7772202491760254, - "learning_rate": 9.746663397548161e-05, - "loss": 1.0448, - "num_input_tokens_seen": 5828064, - "step": 362 - }, - { - "epoch": 0.025427513199716305, - "grad_norm": 4.379861354827881, - "learning_rate": 9.74596357267951e-05, - "loss": 1.3274, - "num_input_tokens_seen": 5843680, - "step": 363 - }, - { - "epoch": 0.02549756144544555, - "grad_norm": 4.254587173461914, - "learning_rate": 9.745263747810859e-05, - "loss": 1.1502, - "num_input_tokens_seen": 5859024, - "step": 364 - }, - { - "epoch": 0.025567609691174796, - "grad_norm": 4.271276473999023, - "learning_rate": 9.744563922942207e-05, - "loss": 1.2785, - "num_input_tokens_seen": 5874320, - "step": 365 - }, - { - "epoch": 0.02563765793690404, - "grad_norm": 4.224324703216553, - "learning_rate": 9.743864098073555e-05, - "loss": 1.0926, - "num_input_tokens_seen": 5890704, - "step": 366 - }, - { - "epoch": 0.025707706182633287, - "grad_norm": 4.289444446563721, - "learning_rate": 9.743164273204903e-05, - "loss": 1.1913, - "num_input_tokens_seen": 5906016, - "step": 367 - }, - { - "epoch": 0.025777754428362536, - "grad_norm": 4.280707359313965, - "learning_rate": 9.742464448336253e-05, - "loss": 1.2238, - "num_input_tokens_seen": 5921784, - "step": 368 - }, - { - "epoch": 0.025847802674091782, - "grad_norm": 4.554803848266602, - "learning_rate": 9.741764623467602e-05, - "loss": 1.2491, - "num_input_tokens_seen": 5938072, - "step": 369 - }, - { - "epoch": 0.025917850919821028, - "grad_norm": 4.677784442901611, - "learning_rate": 9.74106479859895e-05, - "loss": 1.2387, - "num_input_tokens_seen": 5954456, - "step": 370 - }, - { - "epoch": 0.025987899165550273, - "grad_norm": 4.268225193023682, - "learning_rate": 9.740364973730298e-05, - "loss": 1.2983, - "num_input_tokens_seen": 5970664, - "step": 371 - }, - { - "epoch": 0.02605794741127952, - "grad_norm": 4.361818790435791, - "learning_rate": 9.739665148861646e-05, - "loss": 1.199, - "num_input_tokens_seen": 5987048, - "step": 372 - }, - { - "epoch": 0.026127995657008764, - "grad_norm": 3.9990735054016113, - "learning_rate": 9.738965323992995e-05, - "loss": 1.0777, - "num_input_tokens_seen": 6003432, - "step": 373 - }, - { - "epoch": 0.02619804390273801, - "grad_norm": 3.992142915725708, - "learning_rate": 9.738265499124344e-05, - "loss": 1.0443, - "num_input_tokens_seen": 6019816, - "step": 374 - }, - { - "epoch": 0.026268092148467256, - "grad_norm": 4.270167827606201, - "learning_rate": 9.737565674255693e-05, - "loss": 1.1764, - "num_input_tokens_seen": 6036200, - "step": 375 - }, - { - "epoch": 0.0263381403941965, - "grad_norm": 4.362086296081543, - "learning_rate": 9.736865849387041e-05, - "loss": 1.2735, - "num_input_tokens_seen": 6052120, - "step": 376 - }, - { - "epoch": 0.02640818863992575, - "grad_norm": 3.6900475025177, - "learning_rate": 9.736166024518389e-05, - "loss": 0.8729, - "num_input_tokens_seen": 6068264, - "step": 377 - }, - { - "epoch": 0.026478236885654996, - "grad_norm": 3.8281285762786865, - "learning_rate": 9.735466199649738e-05, - "loss": 1.1096, - "num_input_tokens_seen": 6084504, - "step": 378 - }, - { - "epoch": 0.02654828513138424, - "grad_norm": 3.9335553646087646, - "learning_rate": 9.734766374781087e-05, - "loss": 1.0763, - "num_input_tokens_seen": 6100592, - "step": 379 - }, - { - "epoch": 0.026618333377113487, - "grad_norm": 4.332645416259766, - "learning_rate": 9.734066549912434e-05, - "loss": 1.1751, - "num_input_tokens_seen": 6116976, - "step": 380 - }, - { - "epoch": 0.026688381622842733, - "grad_norm": 4.160863399505615, - "learning_rate": 9.733366725043783e-05, - "loss": 1.0778, - "num_input_tokens_seen": 6133360, - "step": 381 - }, - { - "epoch": 0.02675842986857198, - "grad_norm": 4.388178825378418, - "learning_rate": 9.732666900175132e-05, - "loss": 1.2214, - "num_input_tokens_seen": 6149744, - "step": 382 - }, - { - "epoch": 0.026828478114301224, - "grad_norm": 4.354910373687744, - "learning_rate": 9.73196707530648e-05, - "loss": 1.4115, - "num_input_tokens_seen": 6166048, - "step": 383 - }, - { - "epoch": 0.02689852636003047, - "grad_norm": 4.058071613311768, - "learning_rate": 9.73126725043783e-05, - "loss": 1.0934, - "num_input_tokens_seen": 6181840, - "step": 384 - }, - { - "epoch": 0.026968574605759715, - "grad_norm": 4.060855865478516, - "learning_rate": 9.730567425569177e-05, - "loss": 1.1395, - "num_input_tokens_seen": 6198224, - "step": 385 - }, - { - "epoch": 0.027038622851488964, - "grad_norm": 4.316681385040283, - "learning_rate": 9.729867600700526e-05, - "loss": 1.1052, - "num_input_tokens_seen": 6214608, - "step": 386 - }, - { - "epoch": 0.02710867109721821, - "grad_norm": 4.322516918182373, - "learning_rate": 9.729167775831873e-05, - "loss": 1.2512, - "num_input_tokens_seen": 6230992, - "step": 387 - }, - { - "epoch": 0.027178719342947456, - "grad_norm": 4.090857028961182, - "learning_rate": 9.728467950963224e-05, - "loss": 1.0772, - "num_input_tokens_seen": 6246760, - "step": 388 - }, - { - "epoch": 0.0272487675886767, - "grad_norm": 4.0143961906433105, - "learning_rate": 9.727768126094571e-05, - "loss": 1.0578, - "num_input_tokens_seen": 6261968, - "step": 389 - }, - { - "epoch": 0.027318815834405947, - "grad_norm": 4.911194324493408, - "learning_rate": 9.72706830122592e-05, - "loss": 1.3016, - "num_input_tokens_seen": 6276664, - "step": 390 - }, - { - "epoch": 0.027388864080135192, - "grad_norm": 4.057498931884766, - "learning_rate": 9.726368476357269e-05, - "loss": 1.026, - "num_input_tokens_seen": 6293048, - "step": 391 - }, - { - "epoch": 0.027458912325864438, - "grad_norm": 3.9827401638031006, - "learning_rate": 9.725668651488616e-05, - "loss": 1.136, - "num_input_tokens_seen": 6309432, - "step": 392 - }, - { - "epoch": 0.027528960571593684, - "grad_norm": 4.640822887420654, - "learning_rate": 9.724968826619965e-05, - "loss": 1.2823, - "num_input_tokens_seen": 6325568, - "step": 393 - }, - { - "epoch": 0.027599008817322933, - "grad_norm": 4.372538089752197, - "learning_rate": 9.724269001751314e-05, - "loss": 1.0354, - "num_input_tokens_seen": 6341952, - "step": 394 - }, - { - "epoch": 0.02766905706305218, - "grad_norm": 4.018289566040039, - "learning_rate": 9.723569176882663e-05, - "loss": 1.029, - "num_input_tokens_seen": 6358336, - "step": 395 - }, - { - "epoch": 0.027739105308781424, - "grad_norm": 4.440858364105225, - "learning_rate": 9.722869352014012e-05, - "loss": 1.2272, - "num_input_tokens_seen": 6374680, - "step": 396 - }, - { - "epoch": 0.02780915355451067, - "grad_norm": 4.246788024902344, - "learning_rate": 9.722169527145359e-05, - "loss": 1.0161, - "num_input_tokens_seen": 6390672, - "step": 397 - }, - { - "epoch": 0.027879201800239915, - "grad_norm": 4.27274751663208, - "learning_rate": 9.721469702276708e-05, - "loss": 1.293, - "num_input_tokens_seen": 6407056, - "step": 398 - }, - { - "epoch": 0.02794925004596916, - "grad_norm": 4.171760559082031, - "learning_rate": 9.720769877408056e-05, - "loss": 1.2766, - "num_input_tokens_seen": 6423440, - "step": 399 - }, - { - "epoch": 0.028019298291698407, - "grad_norm": 4.174622535705566, - "learning_rate": 9.720070052539405e-05, - "loss": 1.049, - "num_input_tokens_seen": 6439824, - "step": 400 - }, - { - "epoch": 0.028019298291698407, - "eval_loss": 1.1994441747665405, - "eval_runtime": 0.2131, - "eval_samples_per_second": 4.693, - "eval_steps_per_second": 4.693, - "num_input_tokens_seen": 6439824, - "step": 400 - }, - { - "epoch": 0.028089346537427652, - "grad_norm": 4.199150562286377, - "learning_rate": 9.719370227670753e-05, - "loss": 1.3432, - "num_input_tokens_seen": 6456208, - "step": 401 - }, - { - "epoch": 0.028159394783156898, - "grad_norm": 3.9011733531951904, - "learning_rate": 9.718670402802102e-05, - "loss": 1.0895, - "num_input_tokens_seen": 6472592, - "step": 402 - }, - { - "epoch": 0.028229443028886147, - "grad_norm": 4.142306327819824, - "learning_rate": 9.717970577933451e-05, - "loss": 0.9031, - "num_input_tokens_seen": 6488976, - "step": 403 - }, - { - "epoch": 0.028299491274615392, - "grad_norm": 3.9745633602142334, - "learning_rate": 9.717270753064799e-05, - "loss": 0.9951, - "num_input_tokens_seen": 6505360, - "step": 404 - }, - { - "epoch": 0.028369539520344638, - "grad_norm": 3.838865280151367, - "learning_rate": 9.716570928196147e-05, - "loss": 0.809, - "num_input_tokens_seen": 6521744, - "step": 405 - }, - { - "epoch": 0.028439587766073884, - "grad_norm": 4.48146390914917, - "learning_rate": 9.715871103327496e-05, - "loss": 1.4985, - "num_input_tokens_seen": 6538128, - "step": 406 - }, - { - "epoch": 0.02850963601180313, - "grad_norm": 4.393556594848633, - "learning_rate": 9.715171278458844e-05, - "loss": 1.2355, - "num_input_tokens_seen": 6554512, - "step": 407 - }, - { - "epoch": 0.028579684257532375, - "grad_norm": 3.970860004425049, - "learning_rate": 9.714471453590194e-05, - "loss": 1.1513, - "num_input_tokens_seen": 6570896, - "step": 408 - }, - { - "epoch": 0.02864973250326162, - "grad_norm": 4.166610240936279, - "learning_rate": 9.713771628721542e-05, - "loss": 1.108, - "num_input_tokens_seen": 6587216, - "step": 409 - }, - { - "epoch": 0.028719780748990866, - "grad_norm": 3.9887096881866455, - "learning_rate": 9.71307180385289e-05, - "loss": 1.1639, - "num_input_tokens_seen": 6603600, - "step": 410 - }, - { - "epoch": 0.028789828994720112, - "grad_norm": 4.195802211761475, - "learning_rate": 9.712371978984239e-05, - "loss": 1.1478, - "num_input_tokens_seen": 6619984, - "step": 411 - }, - { - "epoch": 0.02885987724044936, - "grad_norm": 4.011331081390381, - "learning_rate": 9.711672154115587e-05, - "loss": 0.9554, - "num_input_tokens_seen": 6635904, - "step": 412 - }, - { - "epoch": 0.028929925486178606, - "grad_norm": 4.4170026779174805, - "learning_rate": 9.710972329246936e-05, - "loss": 1.1452, - "num_input_tokens_seen": 6651944, - "step": 413 - }, - { - "epoch": 0.028999973731907852, - "grad_norm": 4.073450088500977, - "learning_rate": 9.710272504378284e-05, - "loss": 1.1187, - "num_input_tokens_seen": 6668096, - "step": 414 - }, - { - "epoch": 0.029070021977637098, - "grad_norm": 4.161722183227539, - "learning_rate": 9.709572679509633e-05, - "loss": 1.1603, - "num_input_tokens_seen": 6684480, - "step": 415 - }, - { - "epoch": 0.029140070223366343, - "grad_norm": 4.540097713470459, - "learning_rate": 9.708872854640981e-05, - "loss": 1.2143, - "num_input_tokens_seen": 6700536, - "step": 416 - }, - { - "epoch": 0.02921011846909559, - "grad_norm": 4.030871868133545, - "learning_rate": 9.70817302977233e-05, - "loss": 0.9791, - "num_input_tokens_seen": 6716920, - "step": 417 - }, - { - "epoch": 0.029280166714824835, - "grad_norm": 4.1743268966674805, - "learning_rate": 9.707473204903679e-05, - "loss": 0.9818, - "num_input_tokens_seen": 6733304, - "step": 418 - }, - { - "epoch": 0.02935021496055408, - "grad_norm": 4.227272987365723, - "learning_rate": 9.706773380035026e-05, - "loss": 1.0945, - "num_input_tokens_seen": 6749688, - "step": 419 - }, - { - "epoch": 0.02942026320628333, - "grad_norm": 4.406428813934326, - "learning_rate": 9.706073555166375e-05, - "loss": 1.0302, - "num_input_tokens_seen": 6766072, - "step": 420 - }, - { - "epoch": 0.029490311452012575, - "grad_norm": 4.17899227142334, - "learning_rate": 9.705373730297724e-05, - "loss": 1.1048, - "num_input_tokens_seen": 6782456, - "step": 421 - }, - { - "epoch": 0.02956035969774182, - "grad_norm": 4.034752368927002, - "learning_rate": 9.704673905429073e-05, - "loss": 1.2639, - "num_input_tokens_seen": 6798840, - "step": 422 - }, - { - "epoch": 0.029630407943471066, - "grad_norm": 4.795727729797363, - "learning_rate": 9.703974080560421e-05, - "loss": 1.2448, - "num_input_tokens_seen": 6814912, - "step": 423 - }, - { - "epoch": 0.029700456189200312, - "grad_norm": 4.509056568145752, - "learning_rate": 9.703274255691769e-05, - "loss": 1.2157, - "num_input_tokens_seen": 6830720, - "step": 424 - }, - { - "epoch": 0.029770504434929557, - "grad_norm": 4.064620494842529, - "learning_rate": 9.702574430823118e-05, - "loss": 1.2042, - "num_input_tokens_seen": 6847104, - "step": 425 - }, - { - "epoch": 0.029840552680658803, - "grad_norm": 3.9060182571411133, - "learning_rate": 9.701874605954465e-05, - "loss": 0.9116, - "num_input_tokens_seen": 6862952, - "step": 426 - }, - { - "epoch": 0.02991060092638805, - "grad_norm": 3.9900951385498047, - "learning_rate": 9.701174781085814e-05, - "loss": 1.1621, - "num_input_tokens_seen": 6879336, - "step": 427 - }, - { - "epoch": 0.029980649172117294, - "grad_norm": 4.371436595916748, - "learning_rate": 9.700474956217164e-05, - "loss": 1.2731, - "num_input_tokens_seen": 6895720, - "step": 428 - }, - { - "epoch": 0.030050697417846543, - "grad_norm": 3.9422085285186768, - "learning_rate": 9.699775131348512e-05, - "loss": 0.9636, - "num_input_tokens_seen": 6912104, - "step": 429 - }, - { - "epoch": 0.03012074566357579, - "grad_norm": 4.080913543701172, - "learning_rate": 9.699075306479861e-05, - "loss": 1.1507, - "num_input_tokens_seen": 6928488, - "step": 430 - }, - { - "epoch": 0.030190793909305035, - "grad_norm": 4.493942737579346, - "learning_rate": 9.698375481611208e-05, - "loss": 1.2274, - "num_input_tokens_seen": 6944664, - "step": 431 - }, - { - "epoch": 0.03026084215503428, - "grad_norm": 4.073723793029785, - "learning_rate": 9.697675656742557e-05, - "loss": 1.0498, - "num_input_tokens_seen": 6960344, - "step": 432 - }, - { - "epoch": 0.030330890400763526, - "grad_norm": 3.9672274589538574, - "learning_rate": 9.696975831873906e-05, - "loss": 1.007, - "num_input_tokens_seen": 6976720, - "step": 433 - }, - { - "epoch": 0.03040093864649277, - "grad_norm": 4.497872829437256, - "learning_rate": 9.696276007005255e-05, - "loss": 1.1339, - "num_input_tokens_seen": 6992552, - "step": 434 - }, - { - "epoch": 0.030470986892222017, - "grad_norm": 4.422168731689453, - "learning_rate": 9.695576182136604e-05, - "loss": 1.34, - "num_input_tokens_seen": 7008936, - "step": 435 - }, - { - "epoch": 0.030541035137951263, - "grad_norm": 4.3009138107299805, - "learning_rate": 9.694876357267951e-05, - "loss": 1.2479, - "num_input_tokens_seen": 7024512, - "step": 436 - }, - { - "epoch": 0.030611083383680508, - "grad_norm": 4.04030704498291, - "learning_rate": 9.6941765323993e-05, - "loss": 1.097, - "num_input_tokens_seen": 7040896, - "step": 437 - }, - { - "epoch": 0.030681131629409757, - "grad_norm": 3.877417802810669, - "learning_rate": 9.693476707530649e-05, - "loss": 1.1363, - "num_input_tokens_seen": 7057280, - "step": 438 - }, - { - "epoch": 0.030751179875139003, - "grad_norm": 3.8185505867004395, - "learning_rate": 9.692776882661996e-05, - "loss": 0.9067, - "num_input_tokens_seen": 7072544, - "step": 439 - }, - { - "epoch": 0.03082122812086825, - "grad_norm": 4.028950214385986, - "learning_rate": 9.692077057793345e-05, - "loss": 1.1195, - "num_input_tokens_seen": 7088928, - "step": 440 - }, - { - "epoch": 0.030891276366597494, - "grad_norm": 4.2786431312561035, - "learning_rate": 9.691377232924694e-05, - "loss": 1.1199, - "num_input_tokens_seen": 7105248, - "step": 441 - }, - { - "epoch": 0.03096132461232674, - "grad_norm": 4.193462371826172, - "learning_rate": 9.690677408056043e-05, - "loss": 1.1812, - "num_input_tokens_seen": 7121008, - "step": 442 - }, - { - "epoch": 0.031031372858055985, - "grad_norm": 3.93597412109375, - "learning_rate": 9.68997758318739e-05, - "loss": 1.0677, - "num_input_tokens_seen": 7136944, - "step": 443 - }, - { - "epoch": 0.03110142110378523, - "grad_norm": 4.3208537101745605, - "learning_rate": 9.68927775831874e-05, - "loss": 1.1358, - "num_input_tokens_seen": 7152928, - "step": 444 - }, - { - "epoch": 0.031171469349514477, - "grad_norm": 3.9743378162384033, - "learning_rate": 9.688577933450088e-05, - "loss": 1.094, - "num_input_tokens_seen": 7169312, - "step": 445 - }, - { - "epoch": 0.031241517595243726, - "grad_norm": 4.226114273071289, - "learning_rate": 9.687878108581436e-05, - "loss": 1.1752, - "num_input_tokens_seen": 7185696, - "step": 446 - }, - { - "epoch": 0.03131156584097297, - "grad_norm": 4.210222244262695, - "learning_rate": 9.687178283712785e-05, - "loss": 1.1262, - "num_input_tokens_seen": 7201784, - "step": 447 - }, - { - "epoch": 0.03138161408670222, - "grad_norm": 4.311635971069336, - "learning_rate": 9.686478458844133e-05, - "loss": 1.2491, - "num_input_tokens_seen": 7218168, - "step": 448 - }, - { - "epoch": 0.03145166233243146, - "grad_norm": 4.56603479385376, - "learning_rate": 9.685778633975482e-05, - "loss": 1.3512, - "num_input_tokens_seen": 7233360, - "step": 449 - }, - { - "epoch": 0.03152171057816071, - "grad_norm": 4.232856750488281, - "learning_rate": 9.685078809106831e-05, - "loss": 0.9387, - "num_input_tokens_seen": 7248280, - "step": 450 - }, - { - "epoch": 0.031591758823889954, - "grad_norm": 4.512947082519531, - "learning_rate": 9.684378984238179e-05, - "loss": 1.1988, - "num_input_tokens_seen": 7264664, - "step": 451 - }, - { - "epoch": 0.0316618070696192, - "grad_norm": 4.273897171020508, - "learning_rate": 9.683679159369528e-05, - "loss": 1.2523, - "num_input_tokens_seen": 7281048, - "step": 452 - }, - { - "epoch": 0.031731855315348445, - "grad_norm": 4.288438320159912, - "learning_rate": 9.682979334500875e-05, - "loss": 1.1692, - "num_input_tokens_seen": 7297424, - "step": 453 - }, - { - "epoch": 0.03180190356107769, - "grad_norm": 4.27367639541626, - "learning_rate": 9.682279509632225e-05, - "loss": 1.1868, - "num_input_tokens_seen": 7312792, - "step": 454 - }, - { - "epoch": 0.031871951806806936, - "grad_norm": 3.978926181793213, - "learning_rate": 9.681579684763574e-05, - "loss": 1.0382, - "num_input_tokens_seen": 7329176, - "step": 455 - }, - { - "epoch": 0.03194200005253618, - "grad_norm": 4.4399919509887695, - "learning_rate": 9.680879859894922e-05, - "loss": 1.2072, - "num_input_tokens_seen": 7345560, - "step": 456 - }, - { - "epoch": 0.03201204829826543, - "grad_norm": 3.9786529541015625, - "learning_rate": 9.68018003502627e-05, - "loss": 1.1704, - "num_input_tokens_seen": 7361944, - "step": 457 - }, - { - "epoch": 0.03208209654399467, - "grad_norm": 4.171195030212402, - "learning_rate": 9.679480210157618e-05, - "loss": 1.1307, - "num_input_tokens_seen": 7378328, - "step": 458 - }, - { - "epoch": 0.032152144789723926, - "grad_norm": 3.9415268898010254, - "learning_rate": 9.678780385288967e-05, - "loss": 0.9971, - "num_input_tokens_seen": 7394208, - "step": 459 - }, - { - "epoch": 0.03222219303545317, - "grad_norm": 4.066036224365234, - "learning_rate": 9.678080560420316e-05, - "loss": 1.1227, - "num_input_tokens_seen": 7410328, - "step": 460 - }, - { - "epoch": 0.03229224128118242, - "grad_norm": 4.22513484954834, - "learning_rate": 9.677380735551665e-05, - "loss": 1.0883, - "num_input_tokens_seen": 7426712, - "step": 461 - }, - { - "epoch": 0.03236228952691166, - "grad_norm": 4.310954570770264, - "learning_rate": 9.676680910683013e-05, - "loss": 1.1695, - "num_input_tokens_seen": 7442736, - "step": 462 - }, - { - "epoch": 0.03243233777264091, - "grad_norm": 4.2868828773498535, - "learning_rate": 9.675981085814361e-05, - "loss": 1.0594, - "num_input_tokens_seen": 7458560, - "step": 463 - }, - { - "epoch": 0.032502386018370154, - "grad_norm": 4.318186283111572, - "learning_rate": 9.67528126094571e-05, - "loss": 1.1791, - "num_input_tokens_seen": 7474944, - "step": 464 - }, - { - "epoch": 0.0325724342640994, - "grad_norm": 4.040421009063721, - "learning_rate": 9.674581436077059e-05, - "loss": 1.0649, - "num_input_tokens_seen": 7490344, - "step": 465 - }, - { - "epoch": 0.032642482509828645, - "grad_norm": 3.914815902709961, - "learning_rate": 9.673881611208406e-05, - "loss": 1.1381, - "num_input_tokens_seen": 7506728, - "step": 466 - }, - { - "epoch": 0.03271253075555789, - "grad_norm": 4.054527282714844, - "learning_rate": 9.673181786339755e-05, - "loss": 1.2264, - "num_input_tokens_seen": 7522912, - "step": 467 - }, - { - "epoch": 0.032782579001287136, - "grad_norm": 4.295147895812988, - "learning_rate": 9.672481961471104e-05, - "loss": 1.1369, - "num_input_tokens_seen": 7539040, - "step": 468 - }, - { - "epoch": 0.03285262724701638, - "grad_norm": 4.109183311462402, - "learning_rate": 9.671782136602453e-05, - "loss": 1.1676, - "num_input_tokens_seen": 7555424, - "step": 469 - }, - { - "epoch": 0.03292267549274563, - "grad_norm": 4.131369590759277, - "learning_rate": 9.6710823117338e-05, - "loss": 1.1188, - "num_input_tokens_seen": 7571808, - "step": 470 - }, - { - "epoch": 0.03299272373847487, - "grad_norm": 3.998414993286133, - "learning_rate": 9.670382486865149e-05, - "loss": 1.0201, - "num_input_tokens_seen": 7587528, - "step": 471 - }, - { - "epoch": 0.03306277198420412, - "grad_norm": 4.1235551834106445, - "learning_rate": 9.669682661996498e-05, - "loss": 1.1265, - "num_input_tokens_seen": 7603912, - "step": 472 - }, - { - "epoch": 0.033132820229933364, - "grad_norm": 4.800798416137695, - "learning_rate": 9.668982837127845e-05, - "loss": 1.3634, - "num_input_tokens_seen": 7617512, - "step": 473 - }, - { - "epoch": 0.03320286847566261, - "grad_norm": 4.068000316619873, - "learning_rate": 9.668283012259196e-05, - "loss": 1.1427, - "num_input_tokens_seen": 7633040, - "step": 474 - }, - { - "epoch": 0.033272916721391856, - "grad_norm": 4.0715484619140625, - "learning_rate": 9.667583187390543e-05, - "loss": 1.0633, - "num_input_tokens_seen": 7648416, - "step": 475 - }, - { - "epoch": 0.0333429649671211, - "grad_norm": 3.937807321548462, - "learning_rate": 9.666883362521892e-05, - "loss": 1.1393, - "num_input_tokens_seen": 7664624, - "step": 476 - }, - { - "epoch": 0.033413013212850354, - "grad_norm": 4.195656776428223, - "learning_rate": 9.666183537653241e-05, - "loss": 1.1801, - "num_input_tokens_seen": 7680480, - "step": 477 - }, - { - "epoch": 0.0334830614585796, - "grad_norm": 4.227575778961182, - "learning_rate": 9.665483712784588e-05, - "loss": 1.0453, - "num_input_tokens_seen": 7696632, - "step": 478 - }, - { - "epoch": 0.033553109704308845, - "grad_norm": 4.328822135925293, - "learning_rate": 9.664783887915937e-05, - "loss": 1.221, - "num_input_tokens_seen": 7713016, - "step": 479 - }, - { - "epoch": 0.03362315795003809, - "grad_norm": 4.086736679077148, - "learning_rate": 9.664084063047286e-05, - "loss": 1.2817, - "num_input_tokens_seen": 7729400, - "step": 480 - }, - { - "epoch": 0.033693206195767336, - "grad_norm": 4.555233955383301, - "learning_rate": 9.663384238178635e-05, - "loss": 1.483, - "num_input_tokens_seen": 7745784, - "step": 481 - }, - { - "epoch": 0.03376325444149658, - "grad_norm": 4.118983745574951, - "learning_rate": 9.662684413309984e-05, - "loss": 0.9139, - "num_input_tokens_seen": 7762168, - "step": 482 - }, - { - "epoch": 0.03383330268722583, - "grad_norm": 4.232059001922607, - "learning_rate": 9.661984588441331e-05, - "loss": 1.1269, - "num_input_tokens_seen": 7777920, - "step": 483 - }, - { - "epoch": 0.03390335093295507, - "grad_norm": 6.288865089416504, - "learning_rate": 9.66128476357268e-05, - "loss": 1.0642, - "num_input_tokens_seen": 7794304, - "step": 484 - }, - { - "epoch": 0.03397339917868432, - "grad_norm": 4.133046627044678, - "learning_rate": 9.660584938704028e-05, - "loss": 1.2067, - "num_input_tokens_seen": 7810200, - "step": 485 - }, - { - "epoch": 0.034043447424413564, - "grad_norm": 4.147965431213379, - "learning_rate": 9.659885113835377e-05, - "loss": 1.0367, - "num_input_tokens_seen": 7826384, - "step": 486 - }, - { - "epoch": 0.03411349567014281, - "grad_norm": 4.1191020011901855, - "learning_rate": 9.659185288966725e-05, - "loss": 1.0972, - "num_input_tokens_seen": 7841704, - "step": 487 - }, - { - "epoch": 0.034183543915872056, - "grad_norm": 4.518441677093506, - "learning_rate": 9.658485464098074e-05, - "loss": 1.263, - "num_input_tokens_seen": 7858088, - "step": 488 - }, - { - "epoch": 0.0342535921616013, - "grad_norm": 4.321181297302246, - "learning_rate": 9.657785639229423e-05, - "loss": 1.1378, - "num_input_tokens_seen": 7874472, - "step": 489 - }, - { - "epoch": 0.03432364040733055, - "grad_norm": 4.366185665130615, - "learning_rate": 9.65708581436077e-05, - "loss": 1.1636, - "num_input_tokens_seen": 7890856, - "step": 490 - }, - { - "epoch": 0.03439368865305979, - "grad_norm": 4.042731761932373, - "learning_rate": 9.65638598949212e-05, - "loss": 1.0601, - "num_input_tokens_seen": 7906776, - "step": 491 - }, - { - "epoch": 0.03446373689878904, - "grad_norm": 3.743668556213379, - "learning_rate": 9.655686164623468e-05, - "loss": 1.0441, - "num_input_tokens_seen": 7923160, - "step": 492 - }, - { - "epoch": 0.034533785144518284, - "grad_norm": 3.8547139167785645, - "learning_rate": 9.654986339754816e-05, - "loss": 1.0842, - "num_input_tokens_seen": 7939296, - "step": 493 - }, - { - "epoch": 0.034603833390247536, - "grad_norm": 4.238414287567139, - "learning_rate": 9.654286514886166e-05, - "loss": 1.2498, - "num_input_tokens_seen": 7955504, - "step": 494 - }, - { - "epoch": 0.03467388163597678, - "grad_norm": 4.134857177734375, - "learning_rate": 9.653586690017514e-05, - "loss": 1.1241, - "num_input_tokens_seen": 7971888, - "step": 495 - }, - { - "epoch": 0.03474392988170603, - "grad_norm": 4.2501983642578125, - "learning_rate": 9.652886865148862e-05, - "loss": 1.1829, - "num_input_tokens_seen": 7988272, - "step": 496 - }, - { - "epoch": 0.03481397812743527, - "grad_norm": 7.4397053718566895, - "learning_rate": 9.65218704028021e-05, - "loss": 0.9952, - "num_input_tokens_seen": 8003744, - "step": 497 - }, - { - "epoch": 0.03488402637316452, - "grad_norm": 4.2750959396362305, - "learning_rate": 9.651487215411559e-05, - "loss": 1.2387, - "num_input_tokens_seen": 8019184, - "step": 498 - }, - { - "epoch": 0.034954074618893764, - "grad_norm": 4.156162261962891, - "learning_rate": 9.650787390542908e-05, - "loss": 1.1201, - "num_input_tokens_seen": 8035176, - "step": 499 - }, - { - "epoch": 0.03502412286462301, - "grad_norm": 4.178225040435791, - "learning_rate": 9.650087565674257e-05, - "loss": 1.2026, - "num_input_tokens_seen": 8051560, - "step": 500 - }, - { - "epoch": 0.035094171110352256, - "grad_norm": 4.147096157073975, - "learning_rate": 9.649387740805605e-05, - "loss": 1.2465, - "num_input_tokens_seen": 8067944, - "step": 501 - }, - { - "epoch": 0.0351642193560815, - "grad_norm": 4.329249858856201, - "learning_rate": 9.648687915936953e-05, - "loss": 1.2742, - "num_input_tokens_seen": 8083824, - "step": 502 - }, - { - "epoch": 0.03523426760181075, - "grad_norm": 4.404232978820801, - "learning_rate": 9.647988091068302e-05, - "loss": 1.1511, - "num_input_tokens_seen": 8100208, - "step": 503 - }, - { - "epoch": 0.03530431584753999, - "grad_norm": 4.190586090087891, - "learning_rate": 9.64728826619965e-05, - "loss": 0.9884, - "num_input_tokens_seen": 8116048, - "step": 504 - }, - { - "epoch": 0.03537436409326924, - "grad_norm": 4.262845516204834, - "learning_rate": 9.646588441330998e-05, - "loss": 1.1321, - "num_input_tokens_seen": 8132432, - "step": 505 - }, - { - "epoch": 0.035444412338998484, - "grad_norm": 4.452746391296387, - "learning_rate": 9.645888616462347e-05, - "loss": 1.1667, - "num_input_tokens_seen": 8148816, - "step": 506 - }, - { - "epoch": 0.03551446058472773, - "grad_norm": 4.111443042755127, - "learning_rate": 9.645188791593696e-05, - "loss": 1.0049, - "num_input_tokens_seen": 8164856, - "step": 507 - }, - { - "epoch": 0.035584508830456975, - "grad_norm": 4.292227268218994, - "learning_rate": 9.644488966725045e-05, - "loss": 1.1535, - "num_input_tokens_seen": 8181240, - "step": 508 - }, - { - "epoch": 0.03565455707618622, - "grad_norm": 4.295238971710205, - "learning_rate": 9.643789141856394e-05, - "loss": 1.236, - "num_input_tokens_seen": 8197624, - "step": 509 - }, - { - "epoch": 0.035724605321915466, - "grad_norm": 3.930659294128418, - "learning_rate": 9.643089316987741e-05, - "loss": 0.9195, - "num_input_tokens_seen": 8213816, - "step": 510 - }, - { - "epoch": 0.03579465356764472, - "grad_norm": 4.092316150665283, - "learning_rate": 9.64238949211909e-05, - "loss": 1.0799, - "num_input_tokens_seen": 8229632, - "step": 511 - }, - { - "epoch": 0.035864701813373964, - "grad_norm": 4.2939252853393555, - "learning_rate": 9.641689667250437e-05, - "loss": 1.111, - "num_input_tokens_seen": 8245232, - "step": 512 - }, - { - "epoch": 0.03593475005910321, - "grad_norm": 4.191503524780273, - "learning_rate": 9.640989842381786e-05, - "loss": 0.9399, - "num_input_tokens_seen": 8260912, - "step": 513 - }, - { - "epoch": 0.036004798304832455, - "grad_norm": 4.141485214233398, - "learning_rate": 9.640290017513136e-05, - "loss": 1.1334, - "num_input_tokens_seen": 8276864, - "step": 514 - }, - { - "epoch": 0.0360748465505617, - "grad_norm": 3.890547752380371, - "learning_rate": 9.639590192644484e-05, - "loss": 1.0055, - "num_input_tokens_seen": 8292720, - "step": 515 - }, - { - "epoch": 0.03614489479629095, - "grad_norm": 4.405922889709473, - "learning_rate": 9.638890367775833e-05, - "loss": 1.2238, - "num_input_tokens_seen": 8309104, - "step": 516 - }, - { - "epoch": 0.03621494304202019, - "grad_norm": 4.207942485809326, - "learning_rate": 9.63819054290718e-05, - "loss": 1.0688, - "num_input_tokens_seen": 8325304, - "step": 517 - }, - { - "epoch": 0.03628499128774944, - "grad_norm": 4.174366474151611, - "learning_rate": 9.637490718038529e-05, - "loss": 1.2303, - "num_input_tokens_seen": 8341688, - "step": 518 - }, - { - "epoch": 0.036355039533478684, - "grad_norm": 3.9641714096069336, - "learning_rate": 9.636790893169878e-05, - "loss": 1.2244, - "num_input_tokens_seen": 8357760, - "step": 519 - }, - { - "epoch": 0.03642508777920793, - "grad_norm": 5.832678318023682, - "learning_rate": 9.636091068301227e-05, - "loss": 1.0645, - "num_input_tokens_seen": 8372712, - "step": 520 - }, - { - "epoch": 0.036495136024937175, - "grad_norm": 3.7905161380767822, - "learning_rate": 9.635391243432576e-05, - "loss": 1.0551, - "num_input_tokens_seen": 8389096, - "step": 521 - }, - { - "epoch": 0.03656518427066642, - "grad_norm": 3.6744072437286377, - "learning_rate": 9.634691418563923e-05, - "loss": 1.0687, - "num_input_tokens_seen": 8405216, - "step": 522 - }, - { - "epoch": 0.036635232516395666, - "grad_norm": 4.897486209869385, - "learning_rate": 9.633991593695272e-05, - "loss": 1.1968, - "num_input_tokens_seen": 8421600, - "step": 523 - }, - { - "epoch": 0.03670528076212491, - "grad_norm": 3.821457862854004, - "learning_rate": 9.63329176882662e-05, - "loss": 1.0473, - "num_input_tokens_seen": 8437984, - "step": 524 - }, - { - "epoch": 0.03677532900785416, - "grad_norm": 3.873832941055298, - "learning_rate": 9.632591943957969e-05, - "loss": 0.9656, - "num_input_tokens_seen": 8453760, - "step": 525 - }, - { - "epoch": 0.0368453772535834, - "grad_norm": 4.139901161193848, - "learning_rate": 9.631892119089317e-05, - "loss": 1.0881, - "num_input_tokens_seen": 8470144, - "step": 526 - }, - { - "epoch": 0.03691542549931265, - "grad_norm": 3.9512782096862793, - "learning_rate": 9.631192294220666e-05, - "loss": 1.1093, - "num_input_tokens_seen": 8486528, - "step": 527 - }, - { - "epoch": 0.0369854737450419, - "grad_norm": 3.8937103748321533, - "learning_rate": 9.630492469352015e-05, - "loss": 0.9722, - "num_input_tokens_seen": 8502912, - "step": 528 - }, - { - "epoch": 0.03705552199077115, - "grad_norm": 4.482640743255615, - "learning_rate": 9.629792644483363e-05, - "loss": 1.056, - "num_input_tokens_seen": 8519296, - "step": 529 - }, - { - "epoch": 0.03712557023650039, - "grad_norm": 4.127941131591797, - "learning_rate": 9.629092819614711e-05, - "loss": 1.0285, - "num_input_tokens_seen": 8535160, - "step": 530 - }, - { - "epoch": 0.03719561848222964, - "grad_norm": 3.973585844039917, - "learning_rate": 9.62839299474606e-05, - "loss": 1.0356, - "num_input_tokens_seen": 8551256, - "step": 531 - }, - { - "epoch": 0.037265666727958884, - "grad_norm": 4.22855281829834, - "learning_rate": 9.627693169877408e-05, - "loss": 1.134, - "num_input_tokens_seen": 8567640, - "step": 532 - }, - { - "epoch": 0.03733571497368813, - "grad_norm": 4.144021511077881, - "learning_rate": 9.626993345008757e-05, - "loss": 1.0963, - "num_input_tokens_seen": 8583504, - "step": 533 - }, - { - "epoch": 0.037405763219417375, - "grad_norm": 3.8666226863861084, - "learning_rate": 9.626293520140106e-05, - "loss": 0.912, - "num_input_tokens_seen": 8599888, - "step": 534 - }, - { - "epoch": 0.03747581146514662, - "grad_norm": 4.215412616729736, - "learning_rate": 9.625593695271454e-05, - "loss": 1.1055, - "num_input_tokens_seen": 8616256, - "step": 535 - }, - { - "epoch": 0.037545859710875866, - "grad_norm": 4.353022575378418, - "learning_rate": 9.624893870402803e-05, - "loss": 1.0379, - "num_input_tokens_seen": 8632640, - "step": 536 - }, - { - "epoch": 0.03761590795660511, - "grad_norm": 3.778947591781616, - "learning_rate": 9.624194045534151e-05, - "loss": 1.0547, - "num_input_tokens_seen": 8648624, - "step": 537 - }, - { - "epoch": 0.03768595620233436, - "grad_norm": 4.481568336486816, - "learning_rate": 9.6234942206655e-05, - "loss": 1.3407, - "num_input_tokens_seen": 8664200, - "step": 538 - }, - { - "epoch": 0.0377560044480636, - "grad_norm": 4.066302299499512, - "learning_rate": 9.622794395796847e-05, - "loss": 0.995, - "num_input_tokens_seen": 8680584, - "step": 539 - }, - { - "epoch": 0.03782605269379285, - "grad_norm": 4.262768268585205, - "learning_rate": 9.622094570928197e-05, - "loss": 1.3054, - "num_input_tokens_seen": 8696968, - "step": 540 - }, - { - "epoch": 0.037896100939522094, - "grad_norm": 3.777597665786743, - "learning_rate": 9.621394746059546e-05, - "loss": 0.9831, - "num_input_tokens_seen": 8713352, - "step": 541 - }, - { - "epoch": 0.03796614918525134, - "grad_norm": 3.9732742309570312, - "learning_rate": 9.620694921190894e-05, - "loss": 1.0699, - "num_input_tokens_seen": 8729048, - "step": 542 - }, - { - "epoch": 0.038036197430980585, - "grad_norm": 4.543329238891602, - "learning_rate": 9.619995096322243e-05, - "loss": 1.1546, - "num_input_tokens_seen": 8745432, - "step": 543 - }, - { - "epoch": 0.03810624567670983, - "grad_norm": 4.903865814208984, - "learning_rate": 9.61929527145359e-05, - "loss": 1.1548, - "num_input_tokens_seen": 8760296, - "step": 544 - }, - { - "epoch": 0.03817629392243908, - "grad_norm": 4.197691917419434, - "learning_rate": 9.618595446584939e-05, - "loss": 1.1616, - "num_input_tokens_seen": 8776680, - "step": 545 - }, - { - "epoch": 0.03824634216816833, - "grad_norm": 3.912689208984375, - "learning_rate": 9.617895621716288e-05, - "loss": 0.9926, - "num_input_tokens_seen": 8793064, - "step": 546 - }, - { - "epoch": 0.038316390413897575, - "grad_norm": 4.291840076446533, - "learning_rate": 9.617195796847637e-05, - "loss": 1.1943, - "num_input_tokens_seen": 8809448, - "step": 547 - }, - { - "epoch": 0.03838643865962682, - "grad_norm": 3.9053072929382324, - "learning_rate": 9.616495971978985e-05, - "loss": 1.2437, - "num_input_tokens_seen": 8825536, - "step": 548 - }, - { - "epoch": 0.038456486905356066, - "grad_norm": 4.860696315765381, - "learning_rate": 9.615796147110333e-05, - "loss": 1.3045, - "num_input_tokens_seen": 8841920, - "step": 549 - }, - { - "epoch": 0.03852653515108531, - "grad_norm": 3.9394373893737793, - "learning_rate": 9.615096322241682e-05, - "loss": 1.1367, - "num_input_tokens_seen": 8858304, - "step": 550 - }, - { - "epoch": 0.03859658339681456, - "grad_norm": 3.8160409927368164, - "learning_rate": 9.61439649737303e-05, - "loss": 1.0864, - "num_input_tokens_seen": 8874688, - "step": 551 - }, - { - "epoch": 0.0386666316425438, - "grad_norm": 4.3792805671691895, - "learning_rate": 9.613696672504378e-05, - "loss": 1.2516, - "num_input_tokens_seen": 8891072, - "step": 552 - }, - { - "epoch": 0.03873667988827305, - "grad_norm": 4.103452682495117, - "learning_rate": 9.612996847635727e-05, - "loss": 0.9737, - "num_input_tokens_seen": 8907456, - "step": 553 - }, - { - "epoch": 0.038806728134002294, - "grad_norm": 4.117603302001953, - "learning_rate": 9.612297022767076e-05, - "loss": 1.096, - "num_input_tokens_seen": 8923816, - "step": 554 - }, - { - "epoch": 0.03887677637973154, - "grad_norm": 4.272468566894531, - "learning_rate": 9.611597197898425e-05, - "loss": 1.161, - "num_input_tokens_seen": 8939344, - "step": 555 - }, - { - "epoch": 0.038946824625460785, - "grad_norm": 4.323635578155518, - "learning_rate": 9.610897373029772e-05, - "loss": 1.1922, - "num_input_tokens_seen": 8954920, - "step": 556 - }, - { - "epoch": 0.03901687287119003, - "grad_norm": 3.783510684967041, - "learning_rate": 9.610197548161121e-05, - "loss": 1.0658, - "num_input_tokens_seen": 8971304, - "step": 557 - }, - { - "epoch": 0.039086921116919277, - "grad_norm": 4.3757548332214355, - "learning_rate": 9.60949772329247e-05, - "loss": 1.3186, - "num_input_tokens_seen": 8987672, - "step": 558 - }, - { - "epoch": 0.03915696936264852, - "grad_norm": 4.048824787139893, - "learning_rate": 9.608797898423818e-05, - "loss": 1.1452, - "num_input_tokens_seen": 9003896, - "step": 559 - }, - { - "epoch": 0.03922701760837777, - "grad_norm": 4.06865930557251, - "learning_rate": 9.608098073555168e-05, - "loss": 0.9861, - "num_input_tokens_seen": 9020280, - "step": 560 - }, - { - "epoch": 0.03929706585410701, - "grad_norm": 3.966737747192383, - "learning_rate": 9.607398248686515e-05, - "loss": 1.0323, - "num_input_tokens_seen": 9036280, - "step": 561 - }, - { - "epoch": 0.03936711409983626, - "grad_norm": 4.466656684875488, - "learning_rate": 9.606698423817864e-05, - "loss": 1.2462, - "num_input_tokens_seen": 9052664, - "step": 562 - }, - { - "epoch": 0.03943716234556551, - "grad_norm": 4.312132358551025, - "learning_rate": 9.605998598949213e-05, - "loss": 1.2133, - "num_input_tokens_seen": 9068832, - "step": 563 - }, - { - "epoch": 0.03950721059129476, - "grad_norm": 3.9202895164489746, - "learning_rate": 9.60529877408056e-05, - "loss": 1.0723, - "num_input_tokens_seen": 9084680, - "step": 564 - }, - { - "epoch": 0.039577258837024, - "grad_norm": 5.139899730682373, - "learning_rate": 9.604598949211909e-05, - "loss": 1.1165, - "num_input_tokens_seen": 9099792, - "step": 565 - }, - { - "epoch": 0.03964730708275325, - "grad_norm": 4.398557186126709, - "learning_rate": 9.603899124343258e-05, - "loss": 1.1737, - "num_input_tokens_seen": 9116136, - "step": 566 - }, - { - "epoch": 0.039717355328482494, - "grad_norm": 4.350982666015625, - "learning_rate": 9.603199299474607e-05, - "loss": 1.2174, - "num_input_tokens_seen": 9132520, - "step": 567 - }, - { - "epoch": 0.03978740357421174, - "grad_norm": 3.787644386291504, - "learning_rate": 9.602499474605956e-05, - "loss": 0.9914, - "num_input_tokens_seen": 9148856, - "step": 568 - }, - { - "epoch": 0.039857451819940985, - "grad_norm": 4.630245685577393, - "learning_rate": 9.601799649737303e-05, - "loss": 1.4135, - "num_input_tokens_seen": 9164888, - "step": 569 - }, - { - "epoch": 0.03992750006567023, - "grad_norm": 4.063969135284424, - "learning_rate": 9.601099824868652e-05, - "loss": 1.1312, - "num_input_tokens_seen": 9181272, - "step": 570 - }, - { - "epoch": 0.039997548311399476, - "grad_norm": 4.2443413734436035, - "learning_rate": 9.6004e-05, - "loss": 1.1627, - "num_input_tokens_seen": 9197344, - "step": 571 - }, - { - "epoch": 0.04006759655712872, - "grad_norm": 4.396352767944336, - "learning_rate": 9.599700175131349e-05, - "loss": 1.1222, - "num_input_tokens_seen": 9212312, - "step": 572 - }, - { - "epoch": 0.04013764480285797, - "grad_norm": 4.364585876464844, - "learning_rate": 9.599000350262697e-05, - "loss": 1.0522, - "num_input_tokens_seen": 9228696, - "step": 573 - }, - { - "epoch": 0.04020769304858721, - "grad_norm": 3.9348409175872803, - "learning_rate": 9.598300525394046e-05, - "loss": 1.1375, - "num_input_tokens_seen": 9245080, - "step": 574 - }, - { - "epoch": 0.04027774129431646, - "grad_norm": 4.051416873931885, - "learning_rate": 9.597600700525395e-05, - "loss": 1.0265, - "num_input_tokens_seen": 9260752, - "step": 575 - }, - { - "epoch": 0.040347789540045705, - "grad_norm": 4.661770820617676, - "learning_rate": 9.596900875656743e-05, - "loss": 1.192, - "num_input_tokens_seen": 9276792, - "step": 576 - }, - { - "epoch": 0.04041783778577495, - "grad_norm": 4.378422260284424, - "learning_rate": 9.596201050788092e-05, - "loss": 1.0497, - "num_input_tokens_seen": 9292768, - "step": 577 - }, - { - "epoch": 0.040487886031504196, - "grad_norm": 4.4690399169921875, - "learning_rate": 9.595501225919439e-05, - "loss": 1.2398, - "num_input_tokens_seen": 9309152, - "step": 578 - }, - { - "epoch": 0.04055793427723344, - "grad_norm": 4.1711273193359375, - "learning_rate": 9.594801401050788e-05, - "loss": 1.097, - "num_input_tokens_seen": 9325536, - "step": 579 - }, - { - "epoch": 0.040627982522962694, - "grad_norm": 3.8115949630737305, - "learning_rate": 9.594101576182137e-05, - "loss": 1.0317, - "num_input_tokens_seen": 9341920, - "step": 580 - }, - { - "epoch": 0.04069803076869194, - "grad_norm": 4.072190284729004, - "learning_rate": 9.593401751313486e-05, - "loss": 1.0649, - "num_input_tokens_seen": 9357904, - "step": 581 - }, - { - "epoch": 0.040768079014421185, - "grad_norm": 3.895766258239746, - "learning_rate": 9.592701926444835e-05, - "loss": 1.1906, - "num_input_tokens_seen": 9373496, - "step": 582 - }, - { - "epoch": 0.04083812726015043, - "grad_norm": 4.026490688323975, - "learning_rate": 9.592002101576182e-05, - "loss": 0.9913, - "num_input_tokens_seen": 9389824, - "step": 583 - }, - { - "epoch": 0.040908175505879676, - "grad_norm": 3.612987518310547, - "learning_rate": 9.591302276707531e-05, - "loss": 0.9376, - "num_input_tokens_seen": 9406208, - "step": 584 - }, - { - "epoch": 0.04097822375160892, - "grad_norm": 4.4619646072387695, - "learning_rate": 9.59060245183888e-05, - "loss": 1.2198, - "num_input_tokens_seen": 9422592, - "step": 585 - }, - { - "epoch": 0.04104827199733817, - "grad_norm": 3.990372896194458, - "learning_rate": 9.589902626970229e-05, - "loss": 1.082, - "num_input_tokens_seen": 9438816, - "step": 586 - }, - { - "epoch": 0.04111832024306741, - "grad_norm": 3.7697947025299072, - "learning_rate": 9.589202802101577e-05, - "loss": 1.0173, - "num_input_tokens_seen": 9455200, - "step": 587 - }, - { - "epoch": 0.04118836848879666, - "grad_norm": 4.066056728363037, - "learning_rate": 9.588502977232925e-05, - "loss": 1.124, - "num_input_tokens_seen": 9471320, - "step": 588 - }, - { - "epoch": 0.041258416734525905, - "grad_norm": 3.913506507873535, - "learning_rate": 9.587803152364274e-05, - "loss": 1.0501, - "num_input_tokens_seen": 9487304, - "step": 589 - }, - { - "epoch": 0.04132846498025515, - "grad_norm": 3.9049429893493652, - "learning_rate": 9.587103327495623e-05, - "loss": 1.0563, - "num_input_tokens_seen": 9503688, - "step": 590 - }, - { - "epoch": 0.041398513225984396, - "grad_norm": 4.316978454589844, - "learning_rate": 9.58640350262697e-05, - "loss": 1.1333, - "num_input_tokens_seen": 9519488, - "step": 591 - }, - { - "epoch": 0.04146856147171364, - "grad_norm": 3.7818517684936523, - "learning_rate": 9.585703677758319e-05, - "loss": 1.0537, - "num_input_tokens_seen": 9535872, - "step": 592 - }, - { - "epoch": 0.04153860971744289, - "grad_norm": 3.8751401901245117, - "learning_rate": 9.585003852889668e-05, - "loss": 1.1745, - "num_input_tokens_seen": 9551928, - "step": 593 - }, - { - "epoch": 0.04160865796317213, - "grad_norm": 4.357265949249268, - "learning_rate": 9.584304028021017e-05, - "loss": 1.1154, - "num_input_tokens_seen": 9568312, - "step": 594 - }, - { - "epoch": 0.04167870620890138, - "grad_norm": 4.184159755706787, - "learning_rate": 9.583604203152366e-05, - "loss": 1.125, - "num_input_tokens_seen": 9583968, - "step": 595 - }, - { - "epoch": 0.041748754454630624, - "grad_norm": 3.9540369510650635, - "learning_rate": 9.582904378283713e-05, - "loss": 1.2032, - "num_input_tokens_seen": 9600152, - "step": 596 - }, - { - "epoch": 0.04181880270035987, - "grad_norm": 4.401122093200684, - "learning_rate": 9.582204553415062e-05, - "loss": 1.4808, - "num_input_tokens_seen": 9615632, - "step": 597 - }, - { - "epoch": 0.04188885094608912, - "grad_norm": 4.418131351470947, - "learning_rate": 9.58150472854641e-05, - "loss": 1.0077, - "num_input_tokens_seen": 9631712, - "step": 598 - }, - { - "epoch": 0.04195889919181837, - "grad_norm": 4.362226963043213, - "learning_rate": 9.580804903677758e-05, - "loss": 1.1614, - "num_input_tokens_seen": 9648096, - "step": 599 - }, - { - "epoch": 0.04202894743754761, - "grad_norm": 4.051177024841309, - "learning_rate": 9.580105078809107e-05, - "loss": 1.0718, - "num_input_tokens_seen": 9663792, - "step": 600 - }, - { - "epoch": 0.04202894743754761, - "eval_loss": 1.1809133291244507, - "eval_runtime": 0.2062, - "eval_samples_per_second": 4.849, - "eval_steps_per_second": 4.849, - "num_input_tokens_seen": 9663792, - "step": 600 - }, - { - "epoch": 0.04209899568327686, - "grad_norm": 4.478739261627197, - "learning_rate": 9.579405253940456e-05, - "loss": 1.1963, - "num_input_tokens_seen": 9680176, - "step": 601 - }, - { - "epoch": 0.042169043929006104, - "grad_norm": 4.05004358291626, - "learning_rate": 9.578705429071805e-05, - "loss": 1.1005, - "num_input_tokens_seen": 9696560, - "step": 602 - }, - { - "epoch": 0.04223909217473535, - "grad_norm": 4.092396259307861, - "learning_rate": 9.578005604203152e-05, - "loss": 1.1796, - "num_input_tokens_seen": 9712944, - "step": 603 - }, - { - "epoch": 0.042309140420464596, - "grad_norm": 4.428014278411865, - "learning_rate": 9.577305779334501e-05, - "loss": 0.9734, - "num_input_tokens_seen": 9729096, - "step": 604 - }, - { - "epoch": 0.04237918866619384, - "grad_norm": 4.202315807342529, - "learning_rate": 9.576605954465849e-05, - "loss": 1.0502, - "num_input_tokens_seen": 9745480, - "step": 605 - }, - { - "epoch": 0.04244923691192309, - "grad_norm": 3.7633514404296875, - "learning_rate": 9.575906129597198e-05, - "loss": 0.9218, - "num_input_tokens_seen": 9761272, - "step": 606 - }, - { - "epoch": 0.04251928515765233, - "grad_norm": 4.170671463012695, - "learning_rate": 9.575206304728548e-05, - "loss": 1.1196, - "num_input_tokens_seen": 9777656, - "step": 607 - }, - { - "epoch": 0.04258933340338158, - "grad_norm": 4.20021915435791, - "learning_rate": 9.574506479859895e-05, - "loss": 1.1146, - "num_input_tokens_seen": 9794032, - "step": 608 - }, - { - "epoch": 0.042659381649110824, - "grad_norm": 4.437755107879639, - "learning_rate": 9.573806654991244e-05, - "loss": 1.0911, - "num_input_tokens_seen": 9809936, - "step": 609 - }, - { - "epoch": 0.04272942989484007, - "grad_norm": 4.417452335357666, - "learning_rate": 9.573106830122592e-05, - "loss": 1.2079, - "num_input_tokens_seen": 9825232, - "step": 610 - }, - { - "epoch": 0.042799478140569315, - "grad_norm": 4.144030570983887, - "learning_rate": 9.57240700525394e-05, - "loss": 1.1229, - "num_input_tokens_seen": 9840648, - "step": 611 - }, - { - "epoch": 0.04286952638629856, - "grad_norm": 3.991605043411255, - "learning_rate": 9.57170718038529e-05, - "loss": 1.0762, - "num_input_tokens_seen": 9857032, - "step": 612 - }, - { - "epoch": 0.042939574632027806, - "grad_norm": 4.516556262969971, - "learning_rate": 9.571007355516638e-05, - "loss": 1.3056, - "num_input_tokens_seen": 9872328, - "step": 613 - }, - { - "epoch": 0.04300962287775705, - "grad_norm": 4.030200481414795, - "learning_rate": 9.570307530647987e-05, - "loss": 0.9493, - "num_input_tokens_seen": 9887832, - "step": 614 - }, - { - "epoch": 0.043079671123486304, - "grad_norm": 4.345893859863281, - "learning_rate": 9.569607705779335e-05, - "loss": 1.2707, - "num_input_tokens_seen": 9904216, - "step": 615 - }, - { - "epoch": 0.04314971936921555, - "grad_norm": 4.158145427703857, - "learning_rate": 9.568907880910684e-05, - "loss": 1.0377, - "num_input_tokens_seen": 9920072, - "step": 616 - }, - { - "epoch": 0.043219767614944796, - "grad_norm": 4.155702590942383, - "learning_rate": 9.568208056042032e-05, - "loss": 1.091, - "num_input_tokens_seen": 9936416, - "step": 617 - }, - { - "epoch": 0.04328981586067404, - "grad_norm": 3.76328444480896, - "learning_rate": 9.56750823117338e-05, - "loss": 1.1011, - "num_input_tokens_seen": 9952456, - "step": 618 - }, - { - "epoch": 0.04335986410640329, - "grad_norm": 4.252495765686035, - "learning_rate": 9.566808406304729e-05, - "loss": 1.0616, - "num_input_tokens_seen": 9968608, - "step": 619 - }, - { - "epoch": 0.04342991235213253, - "grad_norm": 9.254091262817383, - "learning_rate": 9.566108581436078e-05, - "loss": 1.0315, - "num_input_tokens_seen": 9983016, - "step": 620 - }, - { - "epoch": 0.04349996059786178, - "grad_norm": 4.028343200683594, - "learning_rate": 9.565408756567426e-05, - "loss": 1.0667, - "num_input_tokens_seen": 9999400, - "step": 621 - }, - { - "epoch": 0.043570008843591024, - "grad_norm": 4.051328659057617, - "learning_rate": 9.564708931698775e-05, - "loss": 1.1375, - "num_input_tokens_seen": 10015384, - "step": 622 - }, - { - "epoch": 0.04364005708932027, - "grad_norm": 4.495016098022461, - "learning_rate": 9.564009106830123e-05, - "loss": 1.0691, - "num_input_tokens_seen": 10031152, - "step": 623 - }, - { - "epoch": 0.043710105335049515, - "grad_norm": 4.876840114593506, - "learning_rate": 9.563309281961472e-05, - "loss": 1.17, - "num_input_tokens_seen": 10047536, - "step": 624 - }, - { - "epoch": 0.04378015358077876, - "grad_norm": 4.407329559326172, - "learning_rate": 9.562609457092819e-05, - "loss": 1.2381, - "num_input_tokens_seen": 10063920, - "step": 625 - }, - { - "epoch": 0.043850201826508006, - "grad_norm": 4.161394119262695, - "learning_rate": 9.561909632224168e-05, - "loss": 1.0903, - "num_input_tokens_seen": 10079024, - "step": 626 - }, - { - "epoch": 0.04392025007223725, - "grad_norm": 4.382974624633789, - "learning_rate": 9.561209807355518e-05, - "loss": 1.3156, - "num_input_tokens_seen": 10095408, - "step": 627 - }, - { - "epoch": 0.0439902983179665, - "grad_norm": 4.004157543182373, - "learning_rate": 9.560509982486866e-05, - "loss": 1.1333, - "num_input_tokens_seen": 10111792, - "step": 628 - }, - { - "epoch": 0.04406034656369574, - "grad_norm": 3.9019265174865723, - "learning_rate": 9.559810157618215e-05, - "loss": 1.0948, - "num_input_tokens_seen": 10128144, - "step": 629 - }, - { - "epoch": 0.04413039480942499, - "grad_norm": 4.410470485687256, - "learning_rate": 9.559110332749562e-05, - "loss": 1.3219, - "num_input_tokens_seen": 10144288, - "step": 630 - }, - { - "epoch": 0.044200443055154234, - "grad_norm": 4.233544826507568, - "learning_rate": 9.558410507880911e-05, - "loss": 0.999, - "num_input_tokens_seen": 10160296, - "step": 631 - }, - { - "epoch": 0.04427049130088349, - "grad_norm": 4.120091438293457, - "learning_rate": 9.557710683012258e-05, - "loss": 1.0166, - "num_input_tokens_seen": 10176680, - "step": 632 - }, - { - "epoch": 0.04434053954661273, - "grad_norm": 5.061972618103027, - "learning_rate": 9.557010858143609e-05, - "loss": 1.251, - "num_input_tokens_seen": 10192088, - "step": 633 - }, - { - "epoch": 0.04441058779234198, - "grad_norm": 4.3690948486328125, - "learning_rate": 9.556311033274958e-05, - "loss": 1.2113, - "num_input_tokens_seen": 10208472, - "step": 634 - }, - { - "epoch": 0.044480636038071224, - "grad_norm": 3.798710346221924, - "learning_rate": 9.555611208406305e-05, - "loss": 1.0286, - "num_input_tokens_seen": 10224856, - "step": 635 - }, - { - "epoch": 0.04455068428380047, - "grad_norm": 4.41818380355835, - "learning_rate": 9.554911383537654e-05, - "loss": 1.14, - "num_input_tokens_seen": 10241200, - "step": 636 - }, - { - "epoch": 0.044620732529529715, - "grad_norm": 4.256262302398682, - "learning_rate": 9.554211558669001e-05, - "loss": 1.3103, - "num_input_tokens_seen": 10257584, - "step": 637 - }, - { - "epoch": 0.04469078077525896, - "grad_norm": 4.176064968109131, - "learning_rate": 9.55351173380035e-05, - "loss": 1.1985, - "num_input_tokens_seen": 10273760, - "step": 638 - }, - { - "epoch": 0.044760829020988206, - "grad_norm": 3.9971530437469482, - "learning_rate": 9.552811908931699e-05, - "loss": 1.1579, - "num_input_tokens_seen": 10290144, - "step": 639 - }, - { - "epoch": 0.04483087726671745, - "grad_norm": 4.150514602661133, - "learning_rate": 9.552112084063048e-05, - "loss": 1.1144, - "num_input_tokens_seen": 10306528, - "step": 640 - }, - { - "epoch": 0.0449009255124467, - "grad_norm": 4.1868367195129395, - "learning_rate": 9.551412259194397e-05, - "loss": 1.0099, - "num_input_tokens_seen": 10322480, - "step": 641 - }, - { - "epoch": 0.04497097375817594, - "grad_norm": 4.409821510314941, - "learning_rate": 9.550712434325744e-05, - "loss": 1.2574, - "num_input_tokens_seen": 10338864, - "step": 642 - }, - { - "epoch": 0.04504102200390519, - "grad_norm": 4.500023365020752, - "learning_rate": 9.550012609457093e-05, - "loss": 1.35, - "num_input_tokens_seen": 10355072, - "step": 643 - }, - { - "epoch": 0.045111070249634434, - "grad_norm": 10.278129577636719, - "learning_rate": 9.549312784588442e-05, - "loss": 1.0618, - "num_input_tokens_seen": 10371456, - "step": 644 - }, - { - "epoch": 0.04518111849536368, - "grad_norm": 3.9800543785095215, - "learning_rate": 9.54861295971979e-05, - "loss": 1.0341, - "num_input_tokens_seen": 10387720, - "step": 645 - }, - { - "epoch": 0.045251166741092926, - "grad_norm": 3.855720281600952, - "learning_rate": 9.547913134851138e-05, - "loss": 1.1323, - "num_input_tokens_seen": 10403936, - "step": 646 - }, - { - "epoch": 0.04532121498682217, - "grad_norm": 4.719264984130859, - "learning_rate": 9.547213309982487e-05, - "loss": 1.1407, - "num_input_tokens_seen": 10420320, - "step": 647 - }, - { - "epoch": 0.04539126323255142, - "grad_norm": 4.6528167724609375, - "learning_rate": 9.546513485113836e-05, - "loss": 1.1014, - "num_input_tokens_seen": 10436704, - "step": 648 - }, - { - "epoch": 0.04546131147828066, - "grad_norm": 4.0597028732299805, - "learning_rate": 9.545813660245185e-05, - "loss": 1.116, - "num_input_tokens_seen": 10452592, - "step": 649 - }, - { - "epoch": 0.045531359724009915, - "grad_norm": 4.161896705627441, - "learning_rate": 9.545113835376533e-05, - "loss": 1.1373, - "num_input_tokens_seen": 10468976, - "step": 650 - }, - { - "epoch": 0.04560140796973916, - "grad_norm": 4.125041961669922, - "learning_rate": 9.544414010507881e-05, - "loss": 1.0947, - "num_input_tokens_seen": 10484584, - "step": 651 - }, - { - "epoch": 0.045671456215468406, - "grad_norm": 4.278462886810303, - "learning_rate": 9.543714185639229e-05, - "loss": 1.1369, - "num_input_tokens_seen": 10500504, - "step": 652 - }, - { - "epoch": 0.04574150446119765, - "grad_norm": 4.766538619995117, - "learning_rate": 9.543014360770579e-05, - "loss": 1.1876, - "num_input_tokens_seen": 10516472, - "step": 653 - }, - { - "epoch": 0.0458115527069269, - "grad_norm": 4.457921504974365, - "learning_rate": 9.542314535901928e-05, - "loss": 1.0788, - "num_input_tokens_seen": 10532272, - "step": 654 - }, - { - "epoch": 0.04588160095265614, - "grad_norm": 5.021823883056641, - "learning_rate": 9.541614711033275e-05, - "loss": 1.1152, - "num_input_tokens_seen": 10547696, - "step": 655 - }, - { - "epoch": 0.04595164919838539, - "grad_norm": 4.407228469848633, - "learning_rate": 9.540914886164624e-05, - "loss": 1.0863, - "num_input_tokens_seen": 10564080, - "step": 656 - }, - { - "epoch": 0.046021697444114634, - "grad_norm": 3.9986062049865723, - "learning_rate": 9.540215061295972e-05, - "loss": 1.1624, - "num_input_tokens_seen": 10580464, - "step": 657 - }, - { - "epoch": 0.04609174568984388, - "grad_norm": 7.9165191650390625, - "learning_rate": 9.539515236427321e-05, - "loss": 1.0809, - "num_input_tokens_seen": 10595336, - "step": 658 - }, - { - "epoch": 0.046161793935573125, - "grad_norm": 4.357856273651123, - "learning_rate": 9.53881541155867e-05, - "loss": 1.0324, - "num_input_tokens_seen": 10611720, - "step": 659 - }, - { - "epoch": 0.04623184218130237, - "grad_norm": 3.8115761280059814, - "learning_rate": 9.538115586690018e-05, - "loss": 1.1499, - "num_input_tokens_seen": 10628104, - "step": 660 - }, - { - "epoch": 0.04630189042703162, - "grad_norm": 3.879671096801758, - "learning_rate": 9.537415761821367e-05, - "loss": 1.0474, - "num_input_tokens_seen": 10644096, - "step": 661 - }, - { - "epoch": 0.04637193867276086, - "grad_norm": 4.324586391448975, - "learning_rate": 9.536715936952715e-05, - "loss": 1.1904, - "num_input_tokens_seen": 10659408, - "step": 662 - }, - { - "epoch": 0.04644198691849011, - "grad_norm": 4.020029067993164, - "learning_rate": 9.536016112084064e-05, - "loss": 1.0848, - "num_input_tokens_seen": 10675792, - "step": 663 - }, - { - "epoch": 0.046512035164219354, - "grad_norm": 4.563455581665039, - "learning_rate": 9.535316287215411e-05, - "loss": 1.1735, - "num_input_tokens_seen": 10691632, - "step": 664 - }, - { - "epoch": 0.0465820834099486, - "grad_norm": 4.444424629211426, - "learning_rate": 9.53461646234676e-05, - "loss": 1.258, - "num_input_tokens_seen": 10708016, - "step": 665 - }, - { - "epoch": 0.046652131655677845, - "grad_norm": 3.9864089488983154, - "learning_rate": 9.533916637478109e-05, - "loss": 1.1315, - "num_input_tokens_seen": 10724176, - "step": 666 - }, - { - "epoch": 0.0467221799014071, - "grad_norm": 4.860849857330322, - "learning_rate": 9.533216812609458e-05, - "loss": 1.2276, - "num_input_tokens_seen": 10740560, - "step": 667 - }, - { - "epoch": 0.04679222814713634, - "grad_norm": 3.9701120853424072, - "learning_rate": 9.532516987740807e-05, - "loss": 1.1406, - "num_input_tokens_seen": 10756864, - "step": 668 - }, - { - "epoch": 0.04686227639286559, - "grad_norm": 3.660257577896118, - "learning_rate": 9.531817162872154e-05, - "loss": 1.0182, - "num_input_tokens_seen": 10773248, - "step": 669 - }, - { - "epoch": 0.046932324638594834, - "grad_norm": 3.888510227203369, - "learning_rate": 9.531117338003503e-05, - "loss": 1.0223, - "num_input_tokens_seen": 10789632, - "step": 670 - }, - { - "epoch": 0.04700237288432408, - "grad_norm": 4.794105052947998, - "learning_rate": 9.530417513134852e-05, - "loss": 1.0565, - "num_input_tokens_seen": 10804496, - "step": 671 - }, - { - "epoch": 0.047072421130053325, - "grad_norm": 4.293116092681885, - "learning_rate": 9.529717688266199e-05, - "loss": 1.2509, - "num_input_tokens_seen": 10819976, - "step": 672 - }, - { - "epoch": 0.04714246937578257, - "grad_norm": 5.112069129943848, - "learning_rate": 9.52901786339755e-05, - "loss": 1.0964, - "num_input_tokens_seen": 10836360, - "step": 673 - }, - { - "epoch": 0.04721251762151182, - "grad_norm": 3.9091360569000244, - "learning_rate": 9.528318038528897e-05, - "loss": 1.0647, - "num_input_tokens_seen": 10852744, - "step": 674 - }, - { - "epoch": 0.04728256586724106, - "grad_norm": 4.032161235809326, - "learning_rate": 9.527618213660246e-05, - "loss": 1.2362, - "num_input_tokens_seen": 10868928, - "step": 675 - }, - { - "epoch": 0.04735261411297031, - "grad_norm": 3.931156635284424, - "learning_rate": 9.526918388791595e-05, - "loss": 1.0571, - "num_input_tokens_seen": 10884776, - "step": 676 - }, - { - "epoch": 0.047422662358699554, - "grad_norm": 3.9511048793792725, - "learning_rate": 9.526218563922942e-05, - "loss": 1.0249, - "num_input_tokens_seen": 10901160, - "step": 677 - }, - { - "epoch": 0.0474927106044288, - "grad_norm": 4.199029445648193, - "learning_rate": 9.525518739054291e-05, - "loss": 1.2813, - "num_input_tokens_seen": 10917544, - "step": 678 - }, - { - "epoch": 0.047562758850158045, - "grad_norm": 3.8590247631073, - "learning_rate": 9.52481891418564e-05, - "loss": 1.02, - "num_input_tokens_seen": 10933928, - "step": 679 - }, - { - "epoch": 0.04763280709588729, - "grad_norm": 5.530341625213623, - "learning_rate": 9.524119089316989e-05, - "loss": 1.2316, - "num_input_tokens_seen": 10949600, - "step": 680 - }, - { - "epoch": 0.047702855341616536, - "grad_norm": 4.17647123336792, - "learning_rate": 9.523419264448338e-05, - "loss": 1.2985, - "num_input_tokens_seen": 10965984, - "step": 681 - }, - { - "epoch": 0.04777290358734578, - "grad_norm": 4.250451564788818, - "learning_rate": 9.522719439579685e-05, - "loss": 1.1638, - "num_input_tokens_seen": 10982368, - "step": 682 - }, - { - "epoch": 0.04784295183307503, - "grad_norm": 4.132594108581543, - "learning_rate": 9.522019614711034e-05, - "loss": 0.9638, - "num_input_tokens_seen": 10998752, - "step": 683 - }, - { - "epoch": 0.04791300007880428, - "grad_norm": 5.863363265991211, - "learning_rate": 9.521319789842382e-05, - "loss": 1.0736, - "num_input_tokens_seen": 11014376, - "step": 684 - }, - { - "epoch": 0.047983048324533525, - "grad_norm": 3.740323543548584, - "learning_rate": 9.52061996497373e-05, - "loss": 0.9958, - "num_input_tokens_seen": 11030440, - "step": 685 - }, - { - "epoch": 0.04805309657026277, - "grad_norm": 4.927120685577393, - "learning_rate": 9.519920140105079e-05, - "loss": 1.156, - "num_input_tokens_seen": 11046824, - "step": 686 - }, - { - "epoch": 0.04812314481599202, - "grad_norm": 4.708818435668945, - "learning_rate": 9.519220315236428e-05, - "loss": 1.2139, - "num_input_tokens_seen": 11063208, - "step": 687 - }, - { - "epoch": 0.04819319306172126, - "grad_norm": 3.7547767162323, - "learning_rate": 9.518520490367777e-05, - "loss": 0.9557, - "num_input_tokens_seen": 11079592, - "step": 688 - }, - { - "epoch": 0.04826324130745051, - "grad_norm": 4.038534641265869, - "learning_rate": 9.517820665499124e-05, - "loss": 1.1124, - "num_input_tokens_seen": 11095976, - "step": 689 - }, - { - "epoch": 0.048333289553179754, - "grad_norm": 4.159554481506348, - "learning_rate": 9.517120840630473e-05, - "loss": 1.0043, - "num_input_tokens_seen": 11112360, - "step": 690 - }, - { - "epoch": 0.048403337798909, - "grad_norm": 7.104836463928223, - "learning_rate": 9.516421015761821e-05, - "loss": 0.9736, - "num_input_tokens_seen": 11127800, - "step": 691 - }, - { - "epoch": 0.048473386044638245, - "grad_norm": 4.073885917663574, - "learning_rate": 9.51572119089317e-05, - "loss": 1.1249, - "num_input_tokens_seen": 11144184, - "step": 692 - }, - { - "epoch": 0.04854343429036749, - "grad_norm": 3.7190351486206055, - "learning_rate": 9.51502136602452e-05, - "loss": 1.1035, - "num_input_tokens_seen": 11160568, - "step": 693 - }, - { - "epoch": 0.048613482536096736, - "grad_norm": 4.252142429351807, - "learning_rate": 9.514321541155867e-05, - "loss": 1.1588, - "num_input_tokens_seen": 11176952, - "step": 694 - }, - { - "epoch": 0.04868353078182598, - "grad_norm": 4.418105125427246, - "learning_rate": 9.513621716287216e-05, - "loss": 1.2496, - "num_input_tokens_seen": 11193336, - "step": 695 - }, - { - "epoch": 0.04875357902755523, - "grad_norm": 4.195918560028076, - "learning_rate": 9.512921891418564e-05, - "loss": 1.0193, - "num_input_tokens_seen": 11209720, - "step": 696 - }, - { - "epoch": 0.04882362727328447, - "grad_norm": 5.138080596923828, - "learning_rate": 9.512222066549913e-05, - "loss": 1.1861, - "num_input_tokens_seen": 11225888, - "step": 697 - }, - { - "epoch": 0.04889367551901372, - "grad_norm": 4.489223003387451, - "learning_rate": 9.511522241681261e-05, - "loss": 1.1497, - "num_input_tokens_seen": 11241744, - "step": 698 - }, - { - "epoch": 0.048963723764742964, - "grad_norm": 3.972590208053589, - "learning_rate": 9.51082241681261e-05, - "loss": 1.2765, - "num_input_tokens_seen": 11257768, - "step": 699 - }, - { - "epoch": 0.04903377201047221, - "grad_norm": 13.274886131286621, - "learning_rate": 9.510122591943959e-05, - "loss": 1.1124, - "num_input_tokens_seen": 11273216, - "step": 700 - }, - { - "epoch": 0.049103820256201455, - "grad_norm": 3.7899255752563477, - "learning_rate": 9.509422767075307e-05, - "loss": 1.0445, - "num_input_tokens_seen": 11289600, - "step": 701 - }, - { - "epoch": 0.04917386850193071, - "grad_norm": 4.226947784423828, - "learning_rate": 9.508722942206656e-05, - "loss": 1.4313, - "num_input_tokens_seen": 11305920, - "step": 702 - }, - { - "epoch": 0.049243916747659953, - "grad_norm": 4.098162651062012, - "learning_rate": 9.508023117338003e-05, - "loss": 0.952, - "num_input_tokens_seen": 11322304, - "step": 703 - }, - { - "epoch": 0.0493139649933892, - "grad_norm": 3.9205965995788574, - "learning_rate": 9.507323292469352e-05, - "loss": 1.1648, - "num_input_tokens_seen": 11338688, - "step": 704 - }, - { - "epoch": 0.049384013239118445, - "grad_norm": 4.06537389755249, - "learning_rate": 9.506623467600701e-05, - "loss": 1.1295, - "num_input_tokens_seen": 11353544, - "step": 705 - }, - { - "epoch": 0.04945406148484769, - "grad_norm": 4.309032440185547, - "learning_rate": 9.50592364273205e-05, - "loss": 1.1475, - "num_input_tokens_seen": 11369928, - "step": 706 - }, - { - "epoch": 0.049524109730576936, - "grad_norm": 4.320526599884033, - "learning_rate": 9.505223817863399e-05, - "loss": 1.0102, - "num_input_tokens_seen": 11386312, - "step": 707 - }, - { - "epoch": 0.04959415797630618, - "grad_norm": 5.025510787963867, - "learning_rate": 9.504523992994747e-05, - "loss": 1.1182, - "num_input_tokens_seen": 11402696, - "step": 708 - }, - { - "epoch": 0.04966420622203543, - "grad_norm": 3.9406464099884033, - "learning_rate": 9.503824168126095e-05, - "loss": 1.068, - "num_input_tokens_seen": 11419080, - "step": 709 - }, - { - "epoch": 0.04973425446776467, - "grad_norm": 3.9148502349853516, - "learning_rate": 9.503124343257444e-05, - "loss": 1.1062, - "num_input_tokens_seen": 11435464, - "step": 710 - }, - { - "epoch": 0.04980430271349392, - "grad_norm": 3.9386026859283447, - "learning_rate": 9.502424518388791e-05, - "loss": 0.9516, - "num_input_tokens_seen": 11451848, - "step": 711 - }, - { - "epoch": 0.049874350959223164, - "grad_norm": 3.9537665843963623, - "learning_rate": 9.50172469352014e-05, - "loss": 1.1372, - "num_input_tokens_seen": 11468216, - "step": 712 - }, - { - "epoch": 0.04994439920495241, - "grad_norm": 3.97929310798645, - "learning_rate": 9.501024868651489e-05, - "loss": 1.0705, - "num_input_tokens_seen": 11484192, - "step": 713 - }, - { - "epoch": 0.050014447450681655, - "grad_norm": 3.9326419830322266, - "learning_rate": 9.500325043782838e-05, - "loss": 1.0986, - "num_input_tokens_seen": 11500576, - "step": 714 - }, - { - "epoch": 0.0500844956964109, - "grad_norm": 3.769347667694092, - "learning_rate": 9.499625218914187e-05, - "loss": 0.9265, - "num_input_tokens_seen": 11516960, - "step": 715 - }, - { - "epoch": 0.050154543942140146, - "grad_norm": 4.264547348022461, - "learning_rate": 9.498925394045534e-05, - "loss": 1.3166, - "num_input_tokens_seen": 11532616, - "step": 716 - }, - { - "epoch": 0.05022459218786939, - "grad_norm": 4.885791778564453, - "learning_rate": 9.498225569176883e-05, - "loss": 1.0669, - "num_input_tokens_seen": 11548552, - "step": 717 - }, - { - "epoch": 0.05029464043359864, - "grad_norm": 5.4089741706848145, - "learning_rate": 9.49752574430823e-05, - "loss": 1.3986, - "num_input_tokens_seen": 11564936, - "step": 718 - }, - { - "epoch": 0.05036468867932789, - "grad_norm": 4.503393173217773, - "learning_rate": 9.496825919439581e-05, - "loss": 0.9947, - "num_input_tokens_seen": 11580720, - "step": 719 - }, - { - "epoch": 0.050434736925057136, - "grad_norm": 4.364518165588379, - "learning_rate": 9.49612609457093e-05, - "loss": 1.12, - "num_input_tokens_seen": 11597104, - "step": 720 - }, - { - "epoch": 0.05050478517078638, - "grad_norm": 4.229926109313965, - "learning_rate": 9.495426269702277e-05, - "loss": 1.098, - "num_input_tokens_seen": 11612120, - "step": 721 - }, - { - "epoch": 0.05057483341651563, - "grad_norm": 4.477171897888184, - "learning_rate": 9.494726444833626e-05, - "loss": 1.1565, - "num_input_tokens_seen": 11627000, - "step": 722 - }, - { - "epoch": 0.05064488166224487, - "grad_norm": 4.071736812591553, - "learning_rate": 9.494026619964973e-05, - "loss": 1.2951, - "num_input_tokens_seen": 11643256, - "step": 723 - }, - { - "epoch": 0.05071492990797412, - "grad_norm": 4.219758033752441, - "learning_rate": 9.493326795096322e-05, - "loss": 1.1408, - "num_input_tokens_seen": 11659424, - "step": 724 - }, - { - "epoch": 0.050784978153703364, - "grad_norm": 4.108195781707764, - "learning_rate": 9.492626970227671e-05, - "loss": 0.9847, - "num_input_tokens_seen": 11675808, - "step": 725 - }, - { - "epoch": 0.05085502639943261, - "grad_norm": 3.964359760284424, - "learning_rate": 9.49192714535902e-05, - "loss": 1.0935, - "num_input_tokens_seen": 11691760, - "step": 726 - }, - { - "epoch": 0.050925074645161855, - "grad_norm": 4.585779190063477, - "learning_rate": 9.491227320490369e-05, - "loss": 1.1561, - "num_input_tokens_seen": 11706600, - "step": 727 - }, - { - "epoch": 0.0509951228908911, - "grad_norm": 3.8540141582489014, - "learning_rate": 9.490527495621716e-05, - "loss": 1.0163, - "num_input_tokens_seen": 11722984, - "step": 728 - }, - { - "epoch": 0.051065171136620346, - "grad_norm": 4.138955593109131, - "learning_rate": 9.489827670753065e-05, - "loss": 1.2842, - "num_input_tokens_seen": 11738968, - "step": 729 - }, - { - "epoch": 0.05113521938234959, - "grad_norm": 4.138274192810059, - "learning_rate": 9.489127845884413e-05, - "loss": 1.1452, - "num_input_tokens_seen": 11754952, - "step": 730 - }, - { - "epoch": 0.05120526762807884, - "grad_norm": 4.374305248260498, - "learning_rate": 9.488428021015762e-05, - "loss": 1.3622, - "num_input_tokens_seen": 11770832, - "step": 731 - }, - { - "epoch": 0.05127531587380808, - "grad_norm": 4.242674350738525, - "learning_rate": 9.48772819614711e-05, - "loss": 1.1914, - "num_input_tokens_seen": 11786872, - "step": 732 - }, - { - "epoch": 0.05134536411953733, - "grad_norm": 4.173389911651611, - "learning_rate": 9.48702837127846e-05, - "loss": 1.1853, - "num_input_tokens_seen": 11803256, - "step": 733 - }, - { - "epoch": 0.051415412365266575, - "grad_norm": 4.014588356018066, - "learning_rate": 9.486328546409808e-05, - "loss": 1.0436, - "num_input_tokens_seen": 11819608, - "step": 734 - }, - { - "epoch": 0.05148546061099582, - "grad_norm": 4.759418964385986, - "learning_rate": 9.485628721541157e-05, - "loss": 1.1605, - "num_input_tokens_seen": 11834296, - "step": 735 - }, - { - "epoch": 0.05155550885672507, - "grad_norm": 4.258687973022461, - "learning_rate": 9.484928896672505e-05, - "loss": 1.2993, - "num_input_tokens_seen": 11849728, - "step": 736 - }, - { - "epoch": 0.05162555710245432, - "grad_norm": 4.690395832061768, - "learning_rate": 9.484229071803853e-05, - "loss": 1.0655, - "num_input_tokens_seen": 11866112, - "step": 737 - }, - { - "epoch": 0.051695605348183564, - "grad_norm": 4.373327255249023, - "learning_rate": 9.483529246935201e-05, - "loss": 1.1364, - "num_input_tokens_seen": 11881960, - "step": 738 - }, - { - "epoch": 0.05176565359391281, - "grad_norm": 4.008789539337158, - "learning_rate": 9.482829422066551e-05, - "loss": 1.1174, - "num_input_tokens_seen": 11897936, - "step": 739 - }, - { - "epoch": 0.051835701839642055, - "grad_norm": 4.391345977783203, - "learning_rate": 9.482129597197899e-05, - "loss": 1.2045, - "num_input_tokens_seen": 11914320, - "step": 740 - }, - { - "epoch": 0.0519057500853713, - "grad_norm": 4.119503021240234, - "learning_rate": 9.481429772329248e-05, - "loss": 0.927, - "num_input_tokens_seen": 11930440, - "step": 741 - }, - { - "epoch": 0.051975798331100546, - "grad_norm": 4.186014175415039, - "learning_rate": 9.480729947460596e-05, - "loss": 1.1583, - "num_input_tokens_seen": 11946720, - "step": 742 - }, - { - "epoch": 0.05204584657682979, - "grad_norm": 4.119131088256836, - "learning_rate": 9.480030122591944e-05, - "loss": 1.0792, - "num_input_tokens_seen": 11962360, - "step": 743 - }, - { - "epoch": 0.05211589482255904, - "grad_norm": 3.921030044555664, - "learning_rate": 9.479330297723293e-05, - "loss": 0.9966, - "num_input_tokens_seen": 11978744, - "step": 744 - }, - { - "epoch": 0.05218594306828828, - "grad_norm": 3.806251049041748, - "learning_rate": 9.478630472854642e-05, - "loss": 1.1207, - "num_input_tokens_seen": 11994912, - "step": 745 - }, - { - "epoch": 0.05225599131401753, - "grad_norm": 4.508687973022461, - "learning_rate": 9.47793064798599e-05, - "loss": 1.1038, - "num_input_tokens_seen": 12011296, - "step": 746 - }, - { - "epoch": 0.052326039559746775, - "grad_norm": 4.458346843719482, - "learning_rate": 9.47723082311734e-05, - "loss": 1.2878, - "num_input_tokens_seen": 12027408, - "step": 747 - }, - { - "epoch": 0.05239608780547602, - "grad_norm": 5.779678821563721, - "learning_rate": 9.476530998248687e-05, - "loss": 1.2722, - "num_input_tokens_seen": 12043792, - "step": 748 - }, - { - "epoch": 0.052466136051205266, - "grad_norm": 4.621145725250244, - "learning_rate": 9.475831173380036e-05, - "loss": 1.2636, - "num_input_tokens_seen": 12059856, - "step": 749 - }, - { - "epoch": 0.05253618429693451, - "grad_norm": 4.276626110076904, - "learning_rate": 9.475131348511383e-05, - "loss": 1.3378, - "num_input_tokens_seen": 12076240, - "step": 750 - }, - { - "epoch": 0.05260623254266376, - "grad_norm": 4.533468246459961, - "learning_rate": 9.474431523642732e-05, - "loss": 0.921, - "num_input_tokens_seen": 12092416, - "step": 751 - }, - { - "epoch": 0.052676280788393, - "grad_norm": 4.626596927642822, - "learning_rate": 9.473731698774081e-05, - "loss": 1.2807, - "num_input_tokens_seen": 12108664, - "step": 752 - }, - { - "epoch": 0.052746329034122255, - "grad_norm": 4.3372907638549805, - "learning_rate": 9.47303187390543e-05, - "loss": 1.2754, - "num_input_tokens_seen": 12125048, - "step": 753 - }, - { - "epoch": 0.0528163772798515, - "grad_norm": 3.6576266288757324, - "learning_rate": 9.472332049036779e-05, - "loss": 0.8487, - "num_input_tokens_seen": 12141296, - "step": 754 - }, - { - "epoch": 0.052886425525580746, - "grad_norm": 3.8973164558410645, - "learning_rate": 9.471632224168126e-05, - "loss": 1.1211, - "num_input_tokens_seen": 12157544, - "step": 755 - }, - { - "epoch": 0.05295647377130999, - "grad_norm": 3.9059019088745117, - "learning_rate": 9.470932399299475e-05, - "loss": 1.2484, - "num_input_tokens_seen": 12173928, - "step": 756 - }, - { - "epoch": 0.05302652201703924, - "grad_norm": 4.133029937744141, - "learning_rate": 9.470232574430822e-05, - "loss": 1.0762, - "num_input_tokens_seen": 12189864, - "step": 757 - }, - { - "epoch": 0.05309657026276848, - "grad_norm": 3.8380961418151855, - "learning_rate": 9.469532749562171e-05, - "loss": 0.9938, - "num_input_tokens_seen": 12206248, - "step": 758 - }, - { - "epoch": 0.05316661850849773, - "grad_norm": 4.753637790679932, - "learning_rate": 9.468832924693522e-05, - "loss": 1.1272, - "num_input_tokens_seen": 12222632, - "step": 759 - }, - { - "epoch": 0.053236666754226974, - "grad_norm": 4.704193592071533, - "learning_rate": 9.468133099824869e-05, - "loss": 1.2276, - "num_input_tokens_seen": 12239016, - "step": 760 - }, - { - "epoch": 0.05330671499995622, - "grad_norm": 3.870870351791382, - "learning_rate": 9.467433274956218e-05, - "loss": 0.916, - "num_input_tokens_seen": 12254784, - "step": 761 - }, - { - "epoch": 0.053376763245685466, - "grad_norm": 3.8597328662872314, - "learning_rate": 9.466733450087567e-05, - "loss": 0.9871, - "num_input_tokens_seen": 12271160, - "step": 762 - }, - { - "epoch": 0.05344681149141471, - "grad_norm": 3.7109553813934326, - "learning_rate": 9.466033625218914e-05, - "loss": 1.1248, - "num_input_tokens_seen": 12286944, - "step": 763 - }, - { - "epoch": 0.05351685973714396, - "grad_norm": 3.985595464706421, - "learning_rate": 9.465333800350263e-05, - "loss": 1.0524, - "num_input_tokens_seen": 12303312, - "step": 764 - }, - { - "epoch": 0.0535869079828732, - "grad_norm": 3.797247886657715, - "learning_rate": 9.464633975481612e-05, - "loss": 1.0799, - "num_input_tokens_seen": 12319696, - "step": 765 - }, - { - "epoch": 0.05365695622860245, - "grad_norm": 4.88303279876709, - "learning_rate": 9.463934150612961e-05, - "loss": 1.2865, - "num_input_tokens_seen": 12335448, - "step": 766 - }, - { - "epoch": 0.053727004474331694, - "grad_norm": 4.273831367492676, - "learning_rate": 9.463234325744308e-05, - "loss": 1.1724, - "num_input_tokens_seen": 12351720, - "step": 767 - }, - { - "epoch": 0.05379705272006094, - "grad_norm": 3.9505984783172607, - "learning_rate": 9.462534500875657e-05, - "loss": 1.1478, - "num_input_tokens_seen": 12368104, - "step": 768 - }, - { - "epoch": 0.053867100965790185, - "grad_norm": 4.20963191986084, - "learning_rate": 9.461834676007006e-05, - "loss": 1.1018, - "num_input_tokens_seen": 12384488, - "step": 769 - }, - { - "epoch": 0.05393714921151943, - "grad_norm": 4.106869220733643, - "learning_rate": 9.461134851138354e-05, - "loss": 1.1097, - "num_input_tokens_seen": 12400128, - "step": 770 - }, - { - "epoch": 0.05400719745724868, - "grad_norm": 4.28592586517334, - "learning_rate": 9.460435026269702e-05, - "loss": 1.036, - "num_input_tokens_seen": 12416512, - "step": 771 - }, - { - "epoch": 0.05407724570297793, - "grad_norm": 3.821927070617676, - "learning_rate": 9.459735201401051e-05, - "loss": 1.1215, - "num_input_tokens_seen": 12432896, - "step": 772 - }, - { - "epoch": 0.054147293948707174, - "grad_norm": 4.14424467086792, - "learning_rate": 9.4590353765324e-05, - "loss": 1.0092, - "num_input_tokens_seen": 12449208, - "step": 773 - }, - { - "epoch": 0.05421734219443642, - "grad_norm": 4.610694885253906, - "learning_rate": 9.458335551663749e-05, - "loss": 1.2265, - "num_input_tokens_seen": 12464128, - "step": 774 - }, - { - "epoch": 0.054287390440165666, - "grad_norm": 4.410182952880859, - "learning_rate": 9.457635726795097e-05, - "loss": 1.1904, - "num_input_tokens_seen": 12479728, - "step": 775 - }, - { - "epoch": 0.05435743868589491, - "grad_norm": 4.096780300140381, - "learning_rate": 9.456935901926445e-05, - "loss": 1.2317, - "num_input_tokens_seen": 12495720, - "step": 776 - }, - { - "epoch": 0.05442748693162416, - "grad_norm": 4.028350830078125, - "learning_rate": 9.456236077057793e-05, - "loss": 1.1825, - "num_input_tokens_seen": 12511480, - "step": 777 - }, - { - "epoch": 0.0544975351773534, - "grad_norm": 5.264276504516602, - "learning_rate": 9.455536252189142e-05, - "loss": 1.057, - "num_input_tokens_seen": 12527864, - "step": 778 - }, - { - "epoch": 0.05456758342308265, - "grad_norm": 4.371725082397461, - "learning_rate": 9.454836427320492e-05, - "loss": 1.1625, - "num_input_tokens_seen": 12544168, - "step": 779 - }, - { - "epoch": 0.054637631668811894, - "grad_norm": 4.692862510681152, - "learning_rate": 9.45413660245184e-05, - "loss": 1.2211, - "num_input_tokens_seen": 12560552, - "step": 780 - }, - { - "epoch": 0.05470767991454114, - "grad_norm": 3.7462823390960693, - "learning_rate": 9.453436777583188e-05, - "loss": 1.0815, - "num_input_tokens_seen": 12576936, - "step": 781 - }, - { - "epoch": 0.054777728160270385, - "grad_norm": 4.161571025848389, - "learning_rate": 9.452736952714536e-05, - "loss": 0.9788, - "num_input_tokens_seen": 12593040, - "step": 782 - }, - { - "epoch": 0.05484777640599963, - "grad_norm": 3.96793532371521, - "learning_rate": 9.452037127845885e-05, - "loss": 1.1396, - "num_input_tokens_seen": 12609424, - "step": 783 - }, - { - "epoch": 0.054917824651728876, - "grad_norm": 4.183755874633789, - "learning_rate": 9.451337302977232e-05, - "loss": 1.0868, - "num_input_tokens_seen": 12625312, - "step": 784 - }, - { - "epoch": 0.05498787289745812, - "grad_norm": 4.506673336029053, - "learning_rate": 9.450637478108582e-05, - "loss": 1.1112, - "num_input_tokens_seen": 12641696, - "step": 785 - }, - { - "epoch": 0.05505792114318737, - "grad_norm": 3.8601651191711426, - "learning_rate": 9.449937653239931e-05, - "loss": 1.2149, - "num_input_tokens_seen": 12658080, - "step": 786 - }, - { - "epoch": 0.05512796938891661, - "grad_norm": 5.190856456756592, - "learning_rate": 9.449237828371279e-05, - "loss": 1.2661, - "num_input_tokens_seen": 12673032, - "step": 787 - }, - { - "epoch": 0.055198017634645866, - "grad_norm": 4.323099136352539, - "learning_rate": 9.448538003502628e-05, - "loss": 1.139, - "num_input_tokens_seen": 12689064, - "step": 788 - }, - { - "epoch": 0.05526806588037511, - "grad_norm": 4.271193981170654, - "learning_rate": 9.447838178633976e-05, - "loss": 1.037, - "num_input_tokens_seen": 12705448, - "step": 789 - }, - { - "epoch": 0.05533811412610436, - "grad_norm": 3.793525218963623, - "learning_rate": 9.447138353765324e-05, - "loss": 1.0265, - "num_input_tokens_seen": 12721832, - "step": 790 - }, - { - "epoch": 0.0554081623718336, - "grad_norm": 3.747575283050537, - "learning_rate": 9.446438528896673e-05, - "loss": 0.9567, - "num_input_tokens_seen": 12738216, - "step": 791 - }, - { - "epoch": 0.05547821061756285, - "grad_norm": 4.222849369049072, - "learning_rate": 9.445738704028022e-05, - "loss": 1.1859, - "num_input_tokens_seen": 12754600, - "step": 792 - }, - { - "epoch": 0.055548258863292094, - "grad_norm": 9.102783203125, - "learning_rate": 9.44503887915937e-05, - "loss": 1.0361, - "num_input_tokens_seen": 12770568, - "step": 793 - }, - { - "epoch": 0.05561830710902134, - "grad_norm": 4.4447808265686035, - "learning_rate": 9.444339054290718e-05, - "loss": 1.2908, - "num_input_tokens_seen": 12785768, - "step": 794 - }, - { - "epoch": 0.055688355354750585, - "grad_norm": 4.038604259490967, - "learning_rate": 9.443639229422067e-05, - "loss": 0.9294, - "num_input_tokens_seen": 12801704, - "step": 795 - }, - { - "epoch": 0.05575840360047983, - "grad_norm": 4.492194652557373, - "learning_rate": 9.442939404553416e-05, - "loss": 1.0466, - "num_input_tokens_seen": 12818088, - "step": 796 - }, - { - "epoch": 0.055828451846209076, - "grad_norm": 3.978029489517212, - "learning_rate": 9.442239579684763e-05, - "loss": 1.1719, - "num_input_tokens_seen": 12834432, - "step": 797 - }, - { - "epoch": 0.05589850009193832, - "grad_norm": 4.014431476593018, - "learning_rate": 9.441539754816112e-05, - "loss": 1.1222, - "num_input_tokens_seen": 12850816, - "step": 798 - }, - { - "epoch": 0.05596854833766757, - "grad_norm": 4.0948638916015625, - "learning_rate": 9.440839929947461e-05, - "loss": 1.2013, - "num_input_tokens_seen": 12867200, - "step": 799 - }, - { - "epoch": 0.05603859658339681, - "grad_norm": 4.18120813369751, - "learning_rate": 9.44014010507881e-05, - "loss": 0.9403, - "num_input_tokens_seen": 12883072, - "step": 800 - }, - { - "epoch": 0.05603859658339681, - "eval_loss": 1.1718552112579346, - "eval_runtime": 0.2039, - "eval_samples_per_second": 4.905, - "eval_steps_per_second": 4.905, - "num_input_tokens_seen": 12883072, - "step": 800 - }, - { - "epoch": 0.05610864482912606, - "grad_norm": 4.425891399383545, - "learning_rate": 9.439440280210159e-05, - "loss": 1.0435, - "num_input_tokens_seen": 12899456, - "step": 801 - }, - { - "epoch": 0.056178693074855304, - "grad_norm": 4.319190979003906, - "learning_rate": 9.438740455341506e-05, - "loss": 1.2612, - "num_input_tokens_seen": 12915840, - "step": 802 - }, - { - "epoch": 0.05624874132058455, - "grad_norm": 4.28010892868042, - "learning_rate": 9.438040630472855e-05, - "loss": 1.0853, - "num_input_tokens_seen": 12932096, - "step": 803 - }, - { - "epoch": 0.056318789566313796, - "grad_norm": 3.9454870223999023, - "learning_rate": 9.437340805604203e-05, - "loss": 1.055, - "num_input_tokens_seen": 12948208, - "step": 804 - }, - { - "epoch": 0.05638883781204305, - "grad_norm": 4.009400367736816, - "learning_rate": 9.436640980735553e-05, - "loss": 1.0681, - "num_input_tokens_seen": 12964096, - "step": 805 - }, - { - "epoch": 0.056458886057772294, - "grad_norm": 3.7949161529541016, - "learning_rate": 9.435941155866902e-05, - "loss": 1.0787, - "num_input_tokens_seen": 12980480, - "step": 806 - }, - { - "epoch": 0.05652893430350154, - "grad_norm": 3.910456418991089, - "learning_rate": 9.435241330998249e-05, - "loss": 0.9212, - "num_input_tokens_seen": 12996864, - "step": 807 - }, - { - "epoch": 0.056598982549230785, - "grad_norm": 4.744706630706787, - "learning_rate": 9.434541506129598e-05, - "loss": 1.0582, - "num_input_tokens_seen": 13013248, - "step": 808 - }, - { - "epoch": 0.05666903079496003, - "grad_norm": 4.4282732009887695, - "learning_rate": 9.433841681260946e-05, - "loss": 1.1353, - "num_input_tokens_seen": 13029632, - "step": 809 - }, - { - "epoch": 0.056739079040689276, - "grad_norm": 3.8422467708587646, - "learning_rate": 9.433141856392294e-05, - "loss": 0.9881, - "num_input_tokens_seen": 13046016, - "step": 810 - }, - { - "epoch": 0.05680912728641852, - "grad_norm": 4.1764445304870605, - "learning_rate": 9.432442031523643e-05, - "loss": 1.183, - "num_input_tokens_seen": 13062400, - "step": 811 - }, - { - "epoch": 0.05687917553214777, - "grad_norm": 4.713895320892334, - "learning_rate": 9.431742206654992e-05, - "loss": 1.0752, - "num_input_tokens_seen": 13078584, - "step": 812 - }, - { - "epoch": 0.05694922377787701, - "grad_norm": 4.265610694885254, - "learning_rate": 9.431042381786341e-05, - "loss": 0.9469, - "num_input_tokens_seen": 13094968, - "step": 813 - }, - { - "epoch": 0.05701927202360626, - "grad_norm": 3.9274330139160156, - "learning_rate": 9.430342556917688e-05, - "loss": 1.1765, - "num_input_tokens_seen": 13111304, - "step": 814 - }, - { - "epoch": 0.057089320269335504, - "grad_norm": 4.44935941696167, - "learning_rate": 9.429642732049037e-05, - "loss": 1.1014, - "num_input_tokens_seen": 13127304, - "step": 815 - }, - { - "epoch": 0.05715936851506475, - "grad_norm": 5.019375801086426, - "learning_rate": 9.428942907180386e-05, - "loss": 1.0535, - "num_input_tokens_seen": 13143688, - "step": 816 - }, - { - "epoch": 0.057229416760793995, - "grad_norm": 4.743424892425537, - "learning_rate": 9.428243082311734e-05, - "loss": 1.3912, - "num_input_tokens_seen": 13160072, - "step": 817 - }, - { - "epoch": 0.05729946500652324, - "grad_norm": 3.921475887298584, - "learning_rate": 9.427543257443083e-05, - "loss": 1.1116, - "num_input_tokens_seen": 13176456, - "step": 818 - }, - { - "epoch": 0.05736951325225249, - "grad_norm": 4.106019020080566, - "learning_rate": 9.426843432574431e-05, - "loss": 0.9, - "num_input_tokens_seen": 13192840, - "step": 819 - }, - { - "epoch": 0.05743956149798173, - "grad_norm": 4.298704147338867, - "learning_rate": 9.42614360770578e-05, - "loss": 1.281, - "num_input_tokens_seen": 13209144, - "step": 820 - }, - { - "epoch": 0.05750960974371098, - "grad_norm": 4.29774284362793, - "learning_rate": 9.425443782837128e-05, - "loss": 1.2703, - "num_input_tokens_seen": 13224752, - "step": 821 - }, - { - "epoch": 0.057579657989440224, - "grad_norm": 4.6176838874816895, - "learning_rate": 9.424743957968477e-05, - "loss": 1.232, - "num_input_tokens_seen": 13240856, - "step": 822 - }, - { - "epoch": 0.057649706235169476, - "grad_norm": 4.450786590576172, - "learning_rate": 9.424044133099826e-05, - "loss": 1.1369, - "num_input_tokens_seen": 13256800, - "step": 823 - }, - { - "epoch": 0.05771975448089872, - "grad_norm": 3.8302414417266846, - "learning_rate": 9.423344308231173e-05, - "loss": 0.9985, - "num_input_tokens_seen": 13273032, - "step": 824 - }, - { - "epoch": 0.05778980272662797, - "grad_norm": 4.641941070556641, - "learning_rate": 9.422644483362523e-05, - "loss": 1.2238, - "num_input_tokens_seen": 13289104, - "step": 825 - }, - { - "epoch": 0.05785985097235721, - "grad_norm": 4.369805335998535, - "learning_rate": 9.421944658493871e-05, - "loss": 1.2047, - "num_input_tokens_seen": 13304752, - "step": 826 - }, - { - "epoch": 0.05792989921808646, - "grad_norm": 3.863507032394409, - "learning_rate": 9.42124483362522e-05, - "loss": 1.1098, - "num_input_tokens_seen": 13321088, - "step": 827 - }, - { - "epoch": 0.057999947463815704, - "grad_norm": 5.323369979858398, - "learning_rate": 9.420545008756568e-05, - "loss": 1.1722, - "num_input_tokens_seen": 13336912, - "step": 828 - }, - { - "epoch": 0.05806999570954495, - "grad_norm": 4.006597995758057, - "learning_rate": 9.419845183887916e-05, - "loss": 1.0382, - "num_input_tokens_seen": 13353280, - "step": 829 - }, - { - "epoch": 0.058140043955274195, - "grad_norm": 4.1039886474609375, - "learning_rate": 9.419145359019265e-05, - "loss": 1.2037, - "num_input_tokens_seen": 13369664, - "step": 830 - }, - { - "epoch": 0.05821009220100344, - "grad_norm": 3.903517007827759, - "learning_rate": 9.418445534150614e-05, - "loss": 1.2185, - "num_input_tokens_seen": 13386048, - "step": 831 - }, - { - "epoch": 0.05828014044673269, - "grad_norm": 4.434885025024414, - "learning_rate": 9.417745709281963e-05, - "loss": 1.2444, - "num_input_tokens_seen": 13402432, - "step": 832 - }, - { - "epoch": 0.05835018869246193, - "grad_norm": 4.6121296882629395, - "learning_rate": 9.417045884413311e-05, - "loss": 1.2831, - "num_input_tokens_seen": 13418816, - "step": 833 - }, - { - "epoch": 0.05842023693819118, - "grad_norm": 3.6966841220855713, - "learning_rate": 9.416346059544659e-05, - "loss": 1.0751, - "num_input_tokens_seen": 13435200, - "step": 834 - }, - { - "epoch": 0.058490285183920424, - "grad_norm": 4.292221546173096, - "learning_rate": 9.415646234676008e-05, - "loss": 1.2068, - "num_input_tokens_seen": 13451584, - "step": 835 - }, - { - "epoch": 0.05856033342964967, - "grad_norm": 4.053999900817871, - "learning_rate": 9.414946409807355e-05, - "loss": 1.1735, - "num_input_tokens_seen": 13467824, - "step": 836 - }, - { - "epoch": 0.058630381675378915, - "grad_norm": 4.4411234855651855, - "learning_rate": 9.414246584938704e-05, - "loss": 1.0647, - "num_input_tokens_seen": 13483200, - "step": 837 - }, - { - "epoch": 0.05870042992110816, - "grad_norm": 3.956787347793579, - "learning_rate": 9.413546760070053e-05, - "loss": 0.9813, - "num_input_tokens_seen": 13499584, - "step": 838 - }, - { - "epoch": 0.058770478166837406, - "grad_norm": 5.050291061401367, - "learning_rate": 9.412846935201402e-05, - "loss": 1.1193, - "num_input_tokens_seen": 13515448, - "step": 839 - }, - { - "epoch": 0.05884052641256666, - "grad_norm": 3.8736393451690674, - "learning_rate": 9.412147110332751e-05, - "loss": 1.0294, - "num_input_tokens_seen": 13531200, - "step": 840 - }, - { - "epoch": 0.058910574658295904, - "grad_norm": 6.07747745513916, - "learning_rate": 9.411447285464098e-05, - "loss": 0.9684, - "num_input_tokens_seen": 13547584, - "step": 841 - }, - { - "epoch": 0.05898062290402515, - "grad_norm": 4.606445789337158, - "learning_rate": 9.410747460595447e-05, - "loss": 1.2119, - "num_input_tokens_seen": 13563528, - "step": 842 - }, - { - "epoch": 0.059050671149754395, - "grad_norm": 4.3981709480285645, - "learning_rate": 9.410047635726796e-05, - "loss": 1.3313, - "num_input_tokens_seen": 13579912, - "step": 843 - }, - { - "epoch": 0.05912071939548364, - "grad_norm": 3.64546799659729, - "learning_rate": 9.409347810858143e-05, - "loss": 0.8892, - "num_input_tokens_seen": 13596296, - "step": 844 - }, - { - "epoch": 0.05919076764121289, - "grad_norm": 4.15845251083374, - "learning_rate": 9.408647985989494e-05, - "loss": 1.1464, - "num_input_tokens_seen": 13612680, - "step": 845 - }, - { - "epoch": 0.05926081588694213, - "grad_norm": 6.049203872680664, - "learning_rate": 9.407948161120841e-05, - "loss": 1.1907, - "num_input_tokens_seen": 13627832, - "step": 846 - }, - { - "epoch": 0.05933086413267138, - "grad_norm": 3.7192461490631104, - "learning_rate": 9.40724833625219e-05, - "loss": 1.165, - "num_input_tokens_seen": 13643824, - "step": 847 - }, - { - "epoch": 0.059400912378400623, - "grad_norm": 4.183239936828613, - "learning_rate": 9.406548511383537e-05, - "loss": 1.1697, - "num_input_tokens_seen": 13660208, - "step": 848 - }, - { - "epoch": 0.05947096062412987, - "grad_norm": 4.126212120056152, - "learning_rate": 9.405848686514886e-05, - "loss": 1.0532, - "num_input_tokens_seen": 13676592, - "step": 849 - }, - { - "epoch": 0.059541008869859115, - "grad_norm": 4.033525466918945, - "learning_rate": 9.405148861646235e-05, - "loss": 1.1497, - "num_input_tokens_seen": 13692600, - "step": 850 - }, - { - "epoch": 0.05961105711558836, - "grad_norm": 4.162797451019287, - "learning_rate": 9.404449036777584e-05, - "loss": 1.162, - "num_input_tokens_seen": 13708984, - "step": 851 - }, - { - "epoch": 0.059681105361317606, - "grad_norm": 4.057224750518799, - "learning_rate": 9.403749211908933e-05, - "loss": 1.2166, - "num_input_tokens_seen": 13724656, - "step": 852 - }, - { - "epoch": 0.05975115360704685, - "grad_norm": 4.201955318450928, - "learning_rate": 9.40304938704028e-05, - "loss": 1.2195, - "num_input_tokens_seen": 13741040, - "step": 853 - }, - { - "epoch": 0.0598212018527761, - "grad_norm": 3.8704352378845215, - "learning_rate": 9.402349562171629e-05, - "loss": 0.8946, - "num_input_tokens_seen": 13757424, - "step": 854 - }, - { - "epoch": 0.05989125009850534, - "grad_norm": 6.010958671569824, - "learning_rate": 9.401649737302978e-05, - "loss": 1.2095, - "num_input_tokens_seen": 13773808, - "step": 855 - }, - { - "epoch": 0.05996129834423459, - "grad_norm": 4.975742816925049, - "learning_rate": 9.400949912434326e-05, - "loss": 1.1064, - "num_input_tokens_seen": 13789704, - "step": 856 - }, - { - "epoch": 0.06003134658996384, - "grad_norm": 4.021739959716797, - "learning_rate": 9.400250087565675e-05, - "loss": 1.2036, - "num_input_tokens_seen": 13806088, - "step": 857 - }, - { - "epoch": 0.06010139483569309, - "grad_norm": 4.262394905090332, - "learning_rate": 9.399550262697023e-05, - "loss": 1.1053, - "num_input_tokens_seen": 13821928, - "step": 858 - }, - { - "epoch": 0.06017144308142233, - "grad_norm": 4.3033671379089355, - "learning_rate": 9.398850437828372e-05, - "loss": 1.0213, - "num_input_tokens_seen": 13838232, - "step": 859 - }, - { - "epoch": 0.06024149132715158, - "grad_norm": 4.066610336303711, - "learning_rate": 9.398150612959721e-05, - "loss": 1.0579, - "num_input_tokens_seen": 13853912, - "step": 860 - }, - { - "epoch": 0.06031153957288082, - "grad_norm": 4.308155059814453, - "learning_rate": 9.397450788091069e-05, - "loss": 1.3624, - "num_input_tokens_seen": 13870224, - "step": 861 - }, - { - "epoch": 0.06038158781861007, - "grad_norm": 4.307553291320801, - "learning_rate": 9.396750963222417e-05, - "loss": 1.0942, - "num_input_tokens_seen": 13886608, - "step": 862 - }, - { - "epoch": 0.060451636064339315, - "grad_norm": 3.8107142448425293, - "learning_rate": 9.396051138353765e-05, - "loss": 1.1285, - "num_input_tokens_seen": 13902992, - "step": 863 - }, - { - "epoch": 0.06052168431006856, - "grad_norm": 4.530765533447266, - "learning_rate": 9.395351313485114e-05, - "loss": 1.2028, - "num_input_tokens_seen": 13919376, - "step": 864 - }, - { - "epoch": 0.060591732555797806, - "grad_norm": 4.035069465637207, - "learning_rate": 9.394651488616463e-05, - "loss": 1.0291, - "num_input_tokens_seen": 13935664, - "step": 865 - }, - { - "epoch": 0.06066178080152705, - "grad_norm": 4.028316497802734, - "learning_rate": 9.393951663747812e-05, - "loss": 1.21, - "num_input_tokens_seen": 13951096, - "step": 866 - }, - { - "epoch": 0.0607318290472563, - "grad_norm": 4.039167881011963, - "learning_rate": 9.39325183887916e-05, - "loss": 0.929, - "num_input_tokens_seen": 13966272, - "step": 867 - }, - { - "epoch": 0.06080187729298554, - "grad_norm": 4.139703273773193, - "learning_rate": 9.392552014010508e-05, - "loss": 1.2575, - "num_input_tokens_seen": 13981848, - "step": 868 - }, - { - "epoch": 0.06087192553871479, - "grad_norm": 4.222180366516113, - "learning_rate": 9.391852189141857e-05, - "loss": 1.2067, - "num_input_tokens_seen": 13997920, - "step": 869 - }, - { - "epoch": 0.060941973784444034, - "grad_norm": 3.7993030548095703, - "learning_rate": 9.391152364273206e-05, - "loss": 1.0865, - "num_input_tokens_seen": 14014304, - "step": 870 - }, - { - "epoch": 0.06101202203017328, - "grad_norm": 4.811493396759033, - "learning_rate": 9.390452539404554e-05, - "loss": 1.1331, - "num_input_tokens_seen": 14030688, - "step": 871 - }, - { - "epoch": 0.061082070275902525, - "grad_norm": 13.88792610168457, - "learning_rate": 9.389752714535903e-05, - "loss": 1.1368, - "num_input_tokens_seen": 14045584, - "step": 872 - }, - { - "epoch": 0.06115211852163177, - "grad_norm": 3.7678709030151367, - "learning_rate": 9.389052889667251e-05, - "loss": 1.1012, - "num_input_tokens_seen": 14061968, - "step": 873 - }, - { - "epoch": 0.061222166767361016, - "grad_norm": 4.252075672149658, - "learning_rate": 9.3883530647986e-05, - "loss": 1.0472, - "num_input_tokens_seen": 14077584, - "step": 874 - }, - { - "epoch": 0.06129221501309027, - "grad_norm": 3.555629253387451, - "learning_rate": 9.387653239929947e-05, - "loss": 0.8653, - "num_input_tokens_seen": 14093704, - "step": 875 - }, - { - "epoch": 0.061362263258819515, - "grad_norm": 4.122331619262695, - "learning_rate": 9.386953415061296e-05, - "loss": 1.0395, - "num_input_tokens_seen": 14109624, - "step": 876 - }, - { - "epoch": 0.06143231150454876, - "grad_norm": 3.6772518157958984, - "learning_rate": 9.386253590192645e-05, - "loss": 0.8842, - "num_input_tokens_seen": 14126008, - "step": 877 - }, - { - "epoch": 0.061502359750278006, - "grad_norm": 3.791351079940796, - "learning_rate": 9.385553765323994e-05, - "loss": 1.1118, - "num_input_tokens_seen": 14142392, - "step": 878 - }, - { - "epoch": 0.06157240799600725, - "grad_norm": 3.781759738922119, - "learning_rate": 9.384853940455343e-05, - "loss": 1.0577, - "num_input_tokens_seen": 14158776, - "step": 879 - }, - { - "epoch": 0.0616424562417365, - "grad_norm": 4.2420830726623535, - "learning_rate": 9.38415411558669e-05, - "loss": 1.268, - "num_input_tokens_seen": 14173920, - "step": 880 - }, - { - "epoch": 0.06171250448746574, - "grad_norm": 4.000860214233398, - "learning_rate": 9.383454290718039e-05, - "loss": 1.1626, - "num_input_tokens_seen": 14190032, - "step": 881 - }, - { - "epoch": 0.06178255273319499, - "grad_norm": 3.760969877243042, - "learning_rate": 9.382754465849388e-05, - "loss": 0.9684, - "num_input_tokens_seen": 14206416, - "step": 882 - }, - { - "epoch": 0.061852600978924234, - "grad_norm": 4.81919002532959, - "learning_rate": 9.382054640980735e-05, - "loss": 1.1056, - "num_input_tokens_seen": 14222408, - "step": 883 - }, - { - "epoch": 0.06192264922465348, - "grad_norm": 4.951950550079346, - "learning_rate": 9.381354816112084e-05, - "loss": 1.0334, - "num_input_tokens_seen": 14238616, - "step": 884 - }, - { - "epoch": 0.061992697470382725, - "grad_norm": 4.15132999420166, - "learning_rate": 9.380654991243433e-05, - "loss": 1.3171, - "num_input_tokens_seen": 14254968, - "step": 885 - }, - { - "epoch": 0.06206274571611197, - "grad_norm": 5.100244998931885, - "learning_rate": 9.379955166374782e-05, - "loss": 1.1684, - "num_input_tokens_seen": 14271352, - "step": 886 - }, - { - "epoch": 0.062132793961841216, - "grad_norm": 5.999105453491211, - "learning_rate": 9.379255341506131e-05, - "loss": 0.9824, - "num_input_tokens_seen": 14287496, - "step": 887 - }, - { - "epoch": 0.06220284220757046, - "grad_norm": 3.8826348781585693, - "learning_rate": 9.378555516637478e-05, - "loss": 1.0829, - "num_input_tokens_seen": 14303880, - "step": 888 - }, - { - "epoch": 0.06227289045329971, - "grad_norm": 5.308819770812988, - "learning_rate": 9.377855691768827e-05, - "loss": 1.1377, - "num_input_tokens_seen": 14320264, - "step": 889 - }, - { - "epoch": 0.06234293869902895, - "grad_norm": 4.383331775665283, - "learning_rate": 9.377155866900175e-05, - "loss": 1.0147, - "num_input_tokens_seen": 14336232, - "step": 890 - }, - { - "epoch": 0.0624129869447582, - "grad_norm": 4.335045337677002, - "learning_rate": 9.376456042031524e-05, - "loss": 0.9807, - "num_input_tokens_seen": 14351704, - "step": 891 - }, - { - "epoch": 0.06248303519048745, - "grad_norm": 3.6901326179504395, - "learning_rate": 9.375756217162872e-05, - "loss": 1.0494, - "num_input_tokens_seen": 14368088, - "step": 892 - }, - { - "epoch": 0.0625530834362167, - "grad_norm": 3.912727117538452, - "learning_rate": 9.375056392294221e-05, - "loss": 1.1191, - "num_input_tokens_seen": 14383904, - "step": 893 - }, - { - "epoch": 0.06262313168194594, - "grad_norm": 3.5688252449035645, - "learning_rate": 9.37435656742557e-05, - "loss": 0.833, - "num_input_tokens_seen": 14399648, - "step": 894 - }, - { - "epoch": 0.06269317992767519, - "grad_norm": 4.6460137367248535, - "learning_rate": 9.373656742556918e-05, - "loss": 1.2523, - "num_input_tokens_seen": 14415640, - "step": 895 - }, - { - "epoch": 0.06276322817340443, - "grad_norm": 3.8113012313842773, - "learning_rate": 9.372956917688266e-05, - "loss": 1.1789, - "num_input_tokens_seen": 14432024, - "step": 896 - }, - { - "epoch": 0.06283327641913368, - "grad_norm": 3.8755953311920166, - "learning_rate": 9.372257092819615e-05, - "loss": 1.1506, - "num_input_tokens_seen": 14448152, - "step": 897 - }, - { - "epoch": 0.06290332466486293, - "grad_norm": 4.225901126861572, - "learning_rate": 9.371557267950964e-05, - "loss": 1.0754, - "num_input_tokens_seen": 14464536, - "step": 898 - }, - { - "epoch": 0.06297337291059217, - "grad_norm": 3.9437992572784424, - "learning_rate": 9.370857443082313e-05, - "loss": 1.049, - "num_input_tokens_seen": 14480072, - "step": 899 - }, - { - "epoch": 0.06304342115632142, - "grad_norm": 3.8961846828460693, - "learning_rate": 9.37015761821366e-05, - "loss": 1.1925, - "num_input_tokens_seen": 14496456, - "step": 900 - }, - { - "epoch": 0.06311346940205066, - "grad_norm": 4.844581604003906, - "learning_rate": 9.36945779334501e-05, - "loss": 1.0867, - "num_input_tokens_seen": 14512520, - "step": 901 - }, - { - "epoch": 0.06318351764777991, - "grad_norm": 4.89027214050293, - "learning_rate": 9.368757968476357e-05, - "loss": 1.0997, - "num_input_tokens_seen": 14528904, - "step": 902 - }, - { - "epoch": 0.06325356589350915, - "grad_norm": 4.303073883056641, - "learning_rate": 9.368058143607706e-05, - "loss": 1.0626, - "num_input_tokens_seen": 14545288, - "step": 903 - }, - { - "epoch": 0.0633236141392384, - "grad_norm": 5.145171165466309, - "learning_rate": 9.367358318739055e-05, - "loss": 1.3597, - "num_input_tokens_seen": 14561672, - "step": 904 - }, - { - "epoch": 0.06339366238496764, - "grad_norm": 5.7905964851379395, - "learning_rate": 9.366658493870403e-05, - "loss": 1.1075, - "num_input_tokens_seen": 14575896, - "step": 905 - }, - { - "epoch": 0.06346371063069689, - "grad_norm": 3.7394728660583496, - "learning_rate": 9.365958669001752e-05, - "loss": 0.9347, - "num_input_tokens_seen": 14592280, - "step": 906 - }, - { - "epoch": 0.06353375887642614, - "grad_norm": 3.916626453399658, - "learning_rate": 9.3652588441331e-05, - "loss": 1.0793, - "num_input_tokens_seen": 14608072, - "step": 907 - }, - { - "epoch": 0.06360380712215538, - "grad_norm": 5.088227272033691, - "learning_rate": 9.364559019264449e-05, - "loss": 1.158, - "num_input_tokens_seen": 14624360, - "step": 908 - }, - { - "epoch": 0.06367385536788463, - "grad_norm": 3.8519606590270996, - "learning_rate": 9.363859194395798e-05, - "loss": 1.1235, - "num_input_tokens_seen": 14640744, - "step": 909 - }, - { - "epoch": 0.06374390361361387, - "grad_norm": 4.450200080871582, - "learning_rate": 9.363159369527145e-05, - "loss": 1.0145, - "num_input_tokens_seen": 14657128, - "step": 910 - }, - { - "epoch": 0.06381395185934312, - "grad_norm": 4.188115119934082, - "learning_rate": 9.362459544658494e-05, - "loss": 1.1457, - "num_input_tokens_seen": 14673128, - "step": 911 - }, - { - "epoch": 0.06388400010507236, - "grad_norm": 4.67346715927124, - "learning_rate": 9.361759719789843e-05, - "loss": 1.2841, - "num_input_tokens_seen": 14689512, - "step": 912 - }, - { - "epoch": 0.06395404835080161, - "grad_norm": 3.737790822982788, - "learning_rate": 9.361059894921192e-05, - "loss": 1.0114, - "num_input_tokens_seen": 14705872, - "step": 913 - }, - { - "epoch": 0.06402409659653086, - "grad_norm": 4.2486653327941895, - "learning_rate": 9.36036007005254e-05, - "loss": 1.1526, - "num_input_tokens_seen": 14721816, - "step": 914 - }, - { - "epoch": 0.0640941448422601, - "grad_norm": 4.120566368103027, - "learning_rate": 9.359660245183888e-05, - "loss": 1.1045, - "num_input_tokens_seen": 14738200, - "step": 915 - }, - { - "epoch": 0.06416419308798935, - "grad_norm": 5.259902477264404, - "learning_rate": 9.358960420315237e-05, - "loss": 1.3544, - "num_input_tokens_seen": 14753920, - "step": 916 - }, - { - "epoch": 0.06423424133371859, - "grad_norm": 3.900827646255493, - "learning_rate": 9.358260595446584e-05, - "loss": 1.1079, - "num_input_tokens_seen": 14769640, - "step": 917 - }, - { - "epoch": 0.06430428957944785, - "grad_norm": 4.103065490722656, - "learning_rate": 9.357560770577935e-05, - "loss": 0.963, - "num_input_tokens_seen": 14786024, - "step": 918 - }, - { - "epoch": 0.0643743378251771, - "grad_norm": 3.9913623332977295, - "learning_rate": 9.356860945709282e-05, - "loss": 1.0959, - "num_input_tokens_seen": 14802408, - "step": 919 - }, - { - "epoch": 0.06444438607090634, - "grad_norm": 3.7369885444641113, - "learning_rate": 9.356161120840631e-05, - "loss": 1.131, - "num_input_tokens_seen": 14818792, - "step": 920 - }, - { - "epoch": 0.06451443431663559, - "grad_norm": 4.029351711273193, - "learning_rate": 9.35546129597198e-05, - "loss": 1.0378, - "num_input_tokens_seen": 14833792, - "step": 921 - }, - { - "epoch": 0.06458448256236483, - "grad_norm": 4.043665885925293, - "learning_rate": 9.354761471103327e-05, - "loss": 1.179, - "num_input_tokens_seen": 14850176, - "step": 922 - }, - { - "epoch": 0.06465453080809408, - "grad_norm": 3.7803280353546143, - "learning_rate": 9.354061646234676e-05, - "loss": 0.9886, - "num_input_tokens_seen": 14866096, - "step": 923 - }, - { - "epoch": 0.06472457905382333, - "grad_norm": 5.537375450134277, - "learning_rate": 9.353361821366025e-05, - "loss": 1.2519, - "num_input_tokens_seen": 14882480, - "step": 924 - }, - { - "epoch": 0.06479462729955257, - "grad_norm": 4.944652557373047, - "learning_rate": 9.352661996497374e-05, - "loss": 1.1963, - "num_input_tokens_seen": 14898864, - "step": 925 - }, - { - "epoch": 0.06486467554528182, - "grad_norm": 4.3231611251831055, - "learning_rate": 9.351962171628723e-05, - "loss": 1.1858, - "num_input_tokens_seen": 14913856, - "step": 926 - }, - { - "epoch": 0.06493472379101106, - "grad_norm": 4.386692523956299, - "learning_rate": 9.35126234676007e-05, - "loss": 1.0464, - "num_input_tokens_seen": 14929816, - "step": 927 - }, - { - "epoch": 0.06500477203674031, - "grad_norm": 4.607088088989258, - "learning_rate": 9.350562521891419e-05, - "loss": 1.2197, - "num_input_tokens_seen": 14946200, - "step": 928 - }, - { - "epoch": 0.06507482028246955, - "grad_norm": 4.7108001708984375, - "learning_rate": 9.349862697022767e-05, - "loss": 1.2335, - "num_input_tokens_seen": 14961816, - "step": 929 - }, - { - "epoch": 0.0651448685281988, - "grad_norm": 3.844571352005005, - "learning_rate": 9.349162872154115e-05, - "loss": 1.2745, - "num_input_tokens_seen": 14978200, - "step": 930 - }, - { - "epoch": 0.06521491677392804, - "grad_norm": 4.078561782836914, - "learning_rate": 9.348463047285464e-05, - "loss": 1.1737, - "num_input_tokens_seen": 14994440, - "step": 931 - }, - { - "epoch": 0.06528496501965729, - "grad_norm": 4.317986011505127, - "learning_rate": 9.347763222416813e-05, - "loss": 1.3046, - "num_input_tokens_seen": 15010824, - "step": 932 - }, - { - "epoch": 0.06535501326538654, - "grad_norm": 4.459141254425049, - "learning_rate": 9.347063397548162e-05, - "loss": 1.2893, - "num_input_tokens_seen": 15026608, - "step": 933 - }, - { - "epoch": 0.06542506151111578, - "grad_norm": 4.251399993896484, - "learning_rate": 9.34636357267951e-05, - "loss": 1.2346, - "num_input_tokens_seen": 15042328, - "step": 934 - }, - { - "epoch": 0.06549510975684503, - "grad_norm": 4.568341255187988, - "learning_rate": 9.345663747810858e-05, - "loss": 1.4343, - "num_input_tokens_seen": 15058712, - "step": 935 - }, - { - "epoch": 0.06556515800257427, - "grad_norm": 4.7616424560546875, - "learning_rate": 9.344963922942207e-05, - "loss": 1.0925, - "num_input_tokens_seen": 15075096, - "step": 936 - }, - { - "epoch": 0.06563520624830352, - "grad_norm": 3.8224191665649414, - "learning_rate": 9.344264098073555e-05, - "loss": 1.0958, - "num_input_tokens_seen": 15091480, - "step": 937 - }, - { - "epoch": 0.06570525449403276, - "grad_norm": 4.985624313354492, - "learning_rate": 9.343564273204905e-05, - "loss": 1.233, - "num_input_tokens_seen": 15107864, - "step": 938 - }, - { - "epoch": 0.06577530273976201, - "grad_norm": 4.3780975341796875, - "learning_rate": 9.342864448336252e-05, - "loss": 1.1819, - "num_input_tokens_seen": 15123656, - "step": 939 - }, - { - "epoch": 0.06584535098549125, - "grad_norm": 4.435183525085449, - "learning_rate": 9.342164623467601e-05, - "loss": 1.1107, - "num_input_tokens_seen": 15140040, - "step": 940 - }, - { - "epoch": 0.0659153992312205, - "grad_norm": 4.560804843902588, - "learning_rate": 9.34146479859895e-05, - "loss": 1.1274, - "num_input_tokens_seen": 15156424, - "step": 941 - }, - { - "epoch": 0.06598544747694975, - "grad_norm": 5.184841156005859, - "learning_rate": 9.340764973730298e-05, - "loss": 1.3124, - "num_input_tokens_seen": 15172504, - "step": 942 - }, - { - "epoch": 0.06605549572267899, - "grad_norm": 3.5243096351623535, - "learning_rate": 9.340065148861647e-05, - "loss": 0.8203, - "num_input_tokens_seen": 15188888, - "step": 943 - }, - { - "epoch": 0.06612554396840824, - "grad_norm": 4.041544437408447, - "learning_rate": 9.339365323992995e-05, - "loss": 1.0602, - "num_input_tokens_seen": 15204672, - "step": 944 - }, - { - "epoch": 0.06619559221413748, - "grad_norm": 3.720906972885132, - "learning_rate": 9.338665499124344e-05, - "loss": 1.0722, - "num_input_tokens_seen": 15220688, - "step": 945 - }, - { - "epoch": 0.06626564045986673, - "grad_norm": 3.9778380393981934, - "learning_rate": 9.337965674255692e-05, - "loss": 1.2653, - "num_input_tokens_seen": 15236856, - "step": 946 - }, - { - "epoch": 0.06633568870559597, - "grad_norm": 4.486488342285156, - "learning_rate": 9.33726584938704e-05, - "loss": 1.2408, - "num_input_tokens_seen": 15253240, - "step": 947 - }, - { - "epoch": 0.06640573695132522, - "grad_norm": 8.369994163513184, - "learning_rate": 9.33656602451839e-05, - "loss": 1.4841, - "num_input_tokens_seen": 15267728, - "step": 948 - }, - { - "epoch": 0.06647578519705447, - "grad_norm": 4.2056732177734375, - "learning_rate": 9.335866199649737e-05, - "loss": 1.4258, - "num_input_tokens_seen": 15284112, - "step": 949 - }, - { - "epoch": 0.06654583344278371, - "grad_norm": 4.396723747253418, - "learning_rate": 9.335166374781086e-05, - "loss": 1.1578, - "num_input_tokens_seen": 15300496, - "step": 950 - }, - { - "epoch": 0.06661588168851296, - "grad_norm": 3.7177491188049316, - "learning_rate": 9.334466549912435e-05, - "loss": 1.0664, - "num_input_tokens_seen": 15316608, - "step": 951 - }, - { - "epoch": 0.0666859299342422, - "grad_norm": 4.080933094024658, - "learning_rate": 9.333766725043784e-05, - "loss": 1.1282, - "num_input_tokens_seen": 15332976, - "step": 952 - }, - { - "epoch": 0.06675597817997146, - "grad_norm": 5.188856601715088, - "learning_rate": 9.333066900175132e-05, - "loss": 1.2079, - "num_input_tokens_seen": 15349080, - "step": 953 - }, - { - "epoch": 0.06682602642570071, - "grad_norm": 4.583539962768555, - "learning_rate": 9.33236707530648e-05, - "loss": 0.9047, - "num_input_tokens_seen": 15365256, - "step": 954 - }, - { - "epoch": 0.06689607467142995, - "grad_norm": 3.873830795288086, - "learning_rate": 9.331667250437829e-05, - "loss": 1.159, - "num_input_tokens_seen": 15381640, - "step": 955 - }, - { - "epoch": 0.0669661229171592, - "grad_norm": 3.9574460983276367, - "learning_rate": 9.330967425569176e-05, - "loss": 1.0696, - "num_input_tokens_seen": 15397800, - "step": 956 - }, - { - "epoch": 0.06703617116288844, - "grad_norm": 3.8933448791503906, - "learning_rate": 9.330267600700525e-05, - "loss": 0.9844, - "num_input_tokens_seen": 15414112, - "step": 957 - }, - { - "epoch": 0.06710621940861769, - "grad_norm": 4.748478412628174, - "learning_rate": 9.329567775831875e-05, - "loss": 1.1308, - "num_input_tokens_seen": 15430496, - "step": 958 - }, - { - "epoch": 0.06717626765434694, - "grad_norm": 6.755379676818848, - "learning_rate": 9.328867950963223e-05, - "loss": 1.206, - "num_input_tokens_seen": 15445072, - "step": 959 - }, - { - "epoch": 0.06724631590007618, - "grad_norm": 4.382065773010254, - "learning_rate": 9.328168126094572e-05, - "loss": 1.0753, - "num_input_tokens_seen": 15460336, - "step": 960 - }, - { - "epoch": 0.06731636414580543, - "grad_norm": 5.037116527557373, - "learning_rate": 9.327468301225919e-05, - "loss": 1.0562, - "num_input_tokens_seen": 15474752, - "step": 961 - }, - { - "epoch": 0.06738641239153467, - "grad_norm": 5.838945388793945, - "learning_rate": 9.326768476357268e-05, - "loss": 1.314, - "num_input_tokens_seen": 15491136, - "step": 962 - }, - { - "epoch": 0.06745646063726392, - "grad_norm": 3.690436840057373, - "learning_rate": 9.326068651488617e-05, - "loss": 0.996, - "num_input_tokens_seen": 15507520, - "step": 963 - }, - { - "epoch": 0.06752650888299316, - "grad_norm": 4.1123247146606445, - "learning_rate": 9.325368826619966e-05, - "loss": 1.2031, - "num_input_tokens_seen": 15523904, - "step": 964 - }, - { - "epoch": 0.06759655712872241, - "grad_norm": 4.120308876037598, - "learning_rate": 9.324669001751315e-05, - "loss": 0.9671, - "num_input_tokens_seen": 15540136, - "step": 965 - }, - { - "epoch": 0.06766660537445165, - "grad_norm": 3.9849514961242676, - "learning_rate": 9.323969176882662e-05, - "loss": 1.1669, - "num_input_tokens_seen": 15556312, - "step": 966 - }, - { - "epoch": 0.0677366536201809, - "grad_norm": 3.9164884090423584, - "learning_rate": 9.323269352014011e-05, - "loss": 1.0883, - "num_input_tokens_seen": 15571864, - "step": 967 - }, - { - "epoch": 0.06780670186591015, - "grad_norm": 4.282434940338135, - "learning_rate": 9.32256952714536e-05, - "loss": 1.241, - "num_input_tokens_seen": 15587800, - "step": 968 - }, - { - "epoch": 0.06787675011163939, - "grad_norm": 4.118724346160889, - "learning_rate": 9.321869702276707e-05, - "loss": 1.0905, - "num_input_tokens_seen": 15603128, - "step": 969 - }, - { - "epoch": 0.06794679835736864, - "grad_norm": 4.233770847320557, - "learning_rate": 9.321169877408056e-05, - "loss": 1.0618, - "num_input_tokens_seen": 15617864, - "step": 970 - }, - { - "epoch": 0.06801684660309788, - "grad_norm": 3.933587074279785, - "learning_rate": 9.320470052539405e-05, - "loss": 0.982, - "num_input_tokens_seen": 15634248, - "step": 971 - }, - { - "epoch": 0.06808689484882713, - "grad_norm": 4.641788482666016, - "learning_rate": 9.319770227670754e-05, - "loss": 0.9793, - "num_input_tokens_seen": 15650304, - "step": 972 - }, - { - "epoch": 0.06815694309455637, - "grad_norm": 4.138880729675293, - "learning_rate": 9.319070402802102e-05, - "loss": 1.1991, - "num_input_tokens_seen": 15666688, - "step": 973 - }, - { - "epoch": 0.06822699134028562, - "grad_norm": 4.823685169219971, - "learning_rate": 9.31837057793345e-05, - "loss": 0.9162, - "num_input_tokens_seen": 15682936, - "step": 974 - }, - { - "epoch": 0.06829703958601487, - "grad_norm": 4.432481288909912, - "learning_rate": 9.317670753064799e-05, - "loss": 0.9626, - "num_input_tokens_seen": 15699320, - "step": 975 - }, - { - "epoch": 0.06836708783174411, - "grad_norm": 4.115868091583252, - "learning_rate": 9.316970928196147e-05, - "loss": 1.105, - "num_input_tokens_seen": 15715296, - "step": 976 - }, - { - "epoch": 0.06843713607747336, - "grad_norm": 3.964905023574829, - "learning_rate": 9.316271103327496e-05, - "loss": 1.0064, - "num_input_tokens_seen": 15731680, - "step": 977 - }, - { - "epoch": 0.0685071843232026, - "grad_norm": 3.686522960662842, - "learning_rate": 9.315571278458846e-05, - "loss": 0.9924, - "num_input_tokens_seen": 15747808, - "step": 978 - }, - { - "epoch": 0.06857723256893185, - "grad_norm": 4.0614423751831055, - "learning_rate": 9.314871453590193e-05, - "loss": 1.0425, - "num_input_tokens_seen": 15764168, - "step": 979 - }, - { - "epoch": 0.0686472808146611, - "grad_norm": 3.756350517272949, - "learning_rate": 9.314171628721542e-05, - "loss": 1.0757, - "num_input_tokens_seen": 15780176, - "step": 980 - }, - { - "epoch": 0.06871732906039034, - "grad_norm": 4.30344820022583, - "learning_rate": 9.31347180385289e-05, - "loss": 0.9496, - "num_input_tokens_seen": 15795720, - "step": 981 - }, - { - "epoch": 0.06878737730611958, - "grad_norm": 4.055768013000488, - "learning_rate": 9.312771978984239e-05, - "loss": 1.0189, - "num_input_tokens_seen": 15811528, - "step": 982 - }, - { - "epoch": 0.06885742555184883, - "grad_norm": 3.8779115676879883, - "learning_rate": 9.312072154115586e-05, - "loss": 1.0516, - "num_input_tokens_seen": 15827392, - "step": 983 - }, - { - "epoch": 0.06892747379757808, - "grad_norm": 5.014206886291504, - "learning_rate": 9.311372329246936e-05, - "loss": 1.3421, - "num_input_tokens_seen": 15843776, - "step": 984 - }, - { - "epoch": 0.06899752204330732, - "grad_norm": 4.548489570617676, - "learning_rate": 9.310672504378285e-05, - "loss": 1.1652, - "num_input_tokens_seen": 15858880, - "step": 985 - }, - { - "epoch": 0.06906757028903657, - "grad_norm": 4.312918186187744, - "learning_rate": 9.309972679509633e-05, - "loss": 1.2728, - "num_input_tokens_seen": 15874840, - "step": 986 - }, - { - "epoch": 0.06913761853476583, - "grad_norm": 3.9783735275268555, - "learning_rate": 9.309272854640981e-05, - "loss": 0.9377, - "num_input_tokens_seen": 15890568, - "step": 987 - }, - { - "epoch": 0.06920766678049507, - "grad_norm": 4.155986309051514, - "learning_rate": 9.308573029772329e-05, - "loss": 1.0278, - "num_input_tokens_seen": 15906952, - "step": 988 - }, - { - "epoch": 0.06927771502622432, - "grad_norm": 3.633018732070923, - "learning_rate": 9.307873204903678e-05, - "loss": 1.1276, - "num_input_tokens_seen": 15923336, - "step": 989 - }, - { - "epoch": 0.06934776327195356, - "grad_norm": 3.9513449668884277, - "learning_rate": 9.307173380035027e-05, - "loss": 0.9076, - "num_input_tokens_seen": 15939720, - "step": 990 - }, - { - "epoch": 0.06941781151768281, - "grad_norm": 4.296191692352295, - "learning_rate": 9.306473555166376e-05, - "loss": 1.0375, - "num_input_tokens_seen": 15956104, - "step": 991 - }, - { - "epoch": 0.06948785976341205, - "grad_norm": 5.266847133636475, - "learning_rate": 9.305773730297724e-05, - "loss": 1.1645, - "num_input_tokens_seen": 15972488, - "step": 992 - }, - { - "epoch": 0.0695579080091413, - "grad_norm": 4.321287155151367, - "learning_rate": 9.305073905429072e-05, - "loss": 1.046, - "num_input_tokens_seen": 15988408, - "step": 993 - }, - { - "epoch": 0.06962795625487055, - "grad_norm": 4.1421613693237305, - "learning_rate": 9.304374080560421e-05, - "loss": 1.0639, - "num_input_tokens_seen": 16002904, - "step": 994 - }, - { - "epoch": 0.06969800450059979, - "grad_norm": 6.811270713806152, - "learning_rate": 9.30367425569177e-05, - "loss": 1.1012, - "num_input_tokens_seen": 16017424, - "step": 995 - }, - { - "epoch": 0.06976805274632904, - "grad_norm": 4.968684196472168, - "learning_rate": 9.302974430823117e-05, - "loss": 1.0935, - "num_input_tokens_seen": 16033808, - "step": 996 - }, - { - "epoch": 0.06983810099205828, - "grad_norm": 4.592737197875977, - "learning_rate": 9.302274605954466e-05, - "loss": 0.9698, - "num_input_tokens_seen": 16050192, - "step": 997 - }, - { - "epoch": 0.06990814923778753, - "grad_norm": 3.7984917163848877, - "learning_rate": 9.301574781085815e-05, - "loss": 1.0976, - "num_input_tokens_seen": 16066192, - "step": 998 - }, - { - "epoch": 0.06997819748351677, - "grad_norm": 4.594212055206299, - "learning_rate": 9.300874956217164e-05, - "loss": 1.3718, - "num_input_tokens_seen": 16082576, - "step": 999 - }, - { - "epoch": 0.07004824572924602, - "grad_norm": 5.062666893005371, - "learning_rate": 9.300175131348511e-05, - "loss": 1.3139, - "num_input_tokens_seen": 16098960, - "step": 1000 - }, - { - "epoch": 0.07004824572924602, - "eval_loss": 1.1650840044021606, - "eval_runtime": 0.192, - "eval_samples_per_second": 5.208, - "eval_steps_per_second": 5.208, - "num_input_tokens_seen": 16098960, - "step": 1000 - }, - { - "epoch": 0.07011829397497527, - "grad_norm": 4.100902557373047, - "learning_rate": 9.29947530647986e-05, - "loss": 1.2711, - "num_input_tokens_seen": 16115216, - "step": 1001 - }, - { - "epoch": 0.07018834222070451, - "grad_norm": 4.24728536605835, - "learning_rate": 9.298775481611209e-05, - "loss": 0.9946, - "num_input_tokens_seen": 16130080, - "step": 1002 - }, - { - "epoch": 0.07025839046643376, - "grad_norm": 3.4653356075286865, - "learning_rate": 9.298075656742556e-05, - "loss": 0.8736, - "num_input_tokens_seen": 16146400, - "step": 1003 - }, - { - "epoch": 0.070328438712163, - "grad_norm": 5.548775672912598, - "learning_rate": 9.297375831873907e-05, - "loss": 0.9841, - "num_input_tokens_seen": 16162784, - "step": 1004 - }, - { - "epoch": 0.07039848695789225, - "grad_norm": 4.11661958694458, - "learning_rate": 9.296676007005256e-05, - "loss": 0.9857, - "num_input_tokens_seen": 16179024, - "step": 1005 - }, - { - "epoch": 0.0704685352036215, - "grad_norm": 4.006300449371338, - "learning_rate": 9.295976182136603e-05, - "loss": 1.0587, - "num_input_tokens_seen": 16195408, - "step": 1006 - }, - { - "epoch": 0.07053858344935074, - "grad_norm": 4.418802261352539, - "learning_rate": 9.295276357267952e-05, - "loss": 1.3845, - "num_input_tokens_seen": 16211792, - "step": 1007 - }, - { - "epoch": 0.07060863169507998, - "grad_norm": 5.625720024108887, - "learning_rate": 9.2945765323993e-05, - "loss": 1.2198, - "num_input_tokens_seen": 16226584, - "step": 1008 - }, - { - "epoch": 0.07067867994080923, - "grad_norm": 4.209630489349365, - "learning_rate": 9.293876707530648e-05, - "loss": 0.9387, - "num_input_tokens_seen": 16242256, - "step": 1009 - }, - { - "epoch": 0.07074872818653848, - "grad_norm": 4.0324788093566895, - "learning_rate": 9.293176882661997e-05, - "loss": 1.0713, - "num_input_tokens_seen": 16258640, - "step": 1010 - }, - { - "epoch": 0.07081877643226772, - "grad_norm": 4.0557684898376465, - "learning_rate": 9.292477057793346e-05, - "loss": 1.2831, - "num_input_tokens_seen": 16275024, - "step": 1011 - }, - { - "epoch": 0.07088882467799697, - "grad_norm": 4.511384010314941, - "learning_rate": 9.291777232924695e-05, - "loss": 1.1949, - "num_input_tokens_seen": 16291112, - "step": 1012 - }, - { - "epoch": 0.07095887292372621, - "grad_norm": 3.8120172023773193, - "learning_rate": 9.291077408056042e-05, - "loss": 1.013, - "num_input_tokens_seen": 16307496, - "step": 1013 - }, - { - "epoch": 0.07102892116945546, - "grad_norm": 4.039558410644531, - "learning_rate": 9.290377583187391e-05, - "loss": 1.1575, - "num_input_tokens_seen": 16323880, - "step": 1014 - }, - { - "epoch": 0.0710989694151847, - "grad_norm": 3.9076366424560547, - "learning_rate": 9.289677758318739e-05, - "loss": 1.1776, - "num_input_tokens_seen": 16339624, - "step": 1015 - }, - { - "epoch": 0.07116901766091395, - "grad_norm": 3.8083527088165283, - "learning_rate": 9.288977933450088e-05, - "loss": 0.965, - "num_input_tokens_seen": 16356008, - "step": 1016 - }, - { - "epoch": 0.0712390659066432, - "grad_norm": 4.5387282371521, - "learning_rate": 9.288278108581436e-05, - "loss": 1.1113, - "num_input_tokens_seen": 16372392, - "step": 1017 - }, - { - "epoch": 0.07130911415237244, - "grad_norm": 3.9228522777557373, - "learning_rate": 9.287578283712785e-05, - "loss": 1.1609, - "num_input_tokens_seen": 16388776, - "step": 1018 - }, - { - "epoch": 0.07137916239810169, - "grad_norm": 4.170912742614746, - "learning_rate": 9.286878458844134e-05, - "loss": 1.1324, - "num_input_tokens_seen": 16405160, - "step": 1019 - }, - { - "epoch": 0.07144921064383093, - "grad_norm": 4.426759719848633, - "learning_rate": 9.286178633975482e-05, - "loss": 1.2825, - "num_input_tokens_seen": 16421544, - "step": 1020 - }, - { - "epoch": 0.07151925888956018, - "grad_norm": 3.8606133460998535, - "learning_rate": 9.28547880910683e-05, - "loss": 1.1734, - "num_input_tokens_seen": 16437736, - "step": 1021 - }, - { - "epoch": 0.07158930713528944, - "grad_norm": 4.040006637573242, - "learning_rate": 9.28477898423818e-05, - "loss": 1.0824, - "num_input_tokens_seen": 16453776, - "step": 1022 - }, - { - "epoch": 0.07165935538101868, - "grad_norm": 3.7698042392730713, - "learning_rate": 9.284079159369527e-05, - "loss": 1.0951, - "num_input_tokens_seen": 16470160, - "step": 1023 - }, - { - "epoch": 0.07172940362674793, - "grad_norm": 4.180328369140625, - "learning_rate": 9.283379334500877e-05, - "loss": 1.0087, - "num_input_tokens_seen": 16486280, - "step": 1024 - }, - { - "epoch": 0.07179945187247717, - "grad_norm": 6.02299690246582, - "learning_rate": 9.282679509632225e-05, - "loss": 0.9788, - "num_input_tokens_seen": 16501784, - "step": 1025 - }, - { - "epoch": 0.07186950011820642, - "grad_norm": 4.239454746246338, - "learning_rate": 9.281979684763573e-05, - "loss": 1.3031, - "num_input_tokens_seen": 16518096, - "step": 1026 - }, - { - "epoch": 0.07193954836393567, - "grad_norm": 3.446030616760254, - "learning_rate": 9.281279859894921e-05, - "loss": 0.9523, - "num_input_tokens_seen": 16534480, - "step": 1027 - }, - { - "epoch": 0.07200959660966491, - "grad_norm": 4.2813568115234375, - "learning_rate": 9.28058003502627e-05, - "loss": 1.1041, - "num_input_tokens_seen": 16550864, - "step": 1028 - }, - { - "epoch": 0.07207964485539416, - "grad_norm": 5.289443016052246, - "learning_rate": 9.279880210157619e-05, - "loss": 1.3036, - "num_input_tokens_seen": 16567248, - "step": 1029 - }, - { - "epoch": 0.0721496931011234, - "grad_norm": 3.680283308029175, - "learning_rate": 9.279180385288967e-05, - "loss": 1.1434, - "num_input_tokens_seen": 16583632, - "step": 1030 - }, - { - "epoch": 0.07221974134685265, - "grad_norm": 4.283925533294678, - "learning_rate": 9.278480560420316e-05, - "loss": 1.1569, - "num_input_tokens_seen": 16600016, - "step": 1031 - }, - { - "epoch": 0.0722897895925819, - "grad_norm": 4.913532733917236, - "learning_rate": 9.277780735551665e-05, - "loss": 1.218, - "num_input_tokens_seen": 16616400, - "step": 1032 - }, - { - "epoch": 0.07235983783831114, - "grad_norm": 4.344277381896973, - "learning_rate": 9.277080910683013e-05, - "loss": 1.1495, - "num_input_tokens_seen": 16632024, - "step": 1033 - }, - { - "epoch": 0.07242988608404038, - "grad_norm": 3.9231889247894287, - "learning_rate": 9.276381085814362e-05, - "loss": 1.0492, - "num_input_tokens_seen": 16648408, - "step": 1034 - }, - { - "epoch": 0.07249993432976963, - "grad_norm": 4.062288284301758, - "learning_rate": 9.275681260945709e-05, - "loss": 0.927, - "num_input_tokens_seen": 16664792, - "step": 1035 - }, - { - "epoch": 0.07256998257549888, - "grad_norm": 4.163131237030029, - "learning_rate": 9.274981436077058e-05, - "loss": 1.0782, - "num_input_tokens_seen": 16680216, - "step": 1036 - }, - { - "epoch": 0.07264003082122812, - "grad_norm": 5.220231056213379, - "learning_rate": 9.274281611208407e-05, - "loss": 1.125, - "num_input_tokens_seen": 16696160, - "step": 1037 - }, - { - "epoch": 0.07271007906695737, - "grad_norm": 3.63785457611084, - "learning_rate": 9.273581786339756e-05, - "loss": 1.0229, - "num_input_tokens_seen": 16712544, - "step": 1038 - }, - { - "epoch": 0.07278012731268661, - "grad_norm": 4.612295627593994, - "learning_rate": 9.272881961471105e-05, - "loss": 1.3076, - "num_input_tokens_seen": 16728928, - "step": 1039 - }, - { - "epoch": 0.07285017555841586, - "grad_norm": 5.278262615203857, - "learning_rate": 9.272182136602452e-05, - "loss": 1.2682, - "num_input_tokens_seen": 16744184, - "step": 1040 - }, - { - "epoch": 0.0729202238041451, - "grad_norm": 4.3274455070495605, - "learning_rate": 9.271482311733801e-05, - "loss": 1.3517, - "num_input_tokens_seen": 16760056, - "step": 1041 - }, - { - "epoch": 0.07299027204987435, - "grad_norm": 4.1077375411987305, - "learning_rate": 9.270782486865148e-05, - "loss": 1.175, - "num_input_tokens_seen": 16776280, - "step": 1042 - }, - { - "epoch": 0.0730603202956036, - "grad_norm": 3.954604148864746, - "learning_rate": 9.270082661996497e-05, - "loss": 1.189, - "num_input_tokens_seen": 16792456, - "step": 1043 - }, - { - "epoch": 0.07313036854133284, - "grad_norm": 4.111297607421875, - "learning_rate": 9.269382837127847e-05, - "loss": 1.0265, - "num_input_tokens_seen": 16808840, - "step": 1044 - }, - { - "epoch": 0.07320041678706209, - "grad_norm": 3.56953501701355, - "learning_rate": 9.268683012259195e-05, - "loss": 1.0114, - "num_input_tokens_seen": 16824720, - "step": 1045 - }, - { - "epoch": 0.07327046503279133, - "grad_norm": 4.962648868560791, - "learning_rate": 9.267983187390544e-05, - "loss": 1.1714, - "num_input_tokens_seen": 16841104, - "step": 1046 - }, - { - "epoch": 0.07334051327852058, - "grad_norm": 3.7930710315704346, - "learning_rate": 9.267283362521891e-05, - "loss": 1.0903, - "num_input_tokens_seen": 16857488, - "step": 1047 - }, - { - "epoch": 0.07341056152424982, - "grad_norm": 4.158027172088623, - "learning_rate": 9.26658353765324e-05, - "loss": 1.1823, - "num_input_tokens_seen": 16873856, - "step": 1048 - }, - { - "epoch": 0.07348060976997907, - "grad_norm": 4.1571197509765625, - "learning_rate": 9.265883712784589e-05, - "loss": 1.2572, - "num_input_tokens_seen": 16890240, - "step": 1049 - }, - { - "epoch": 0.07355065801570831, - "grad_norm": 4.330874443054199, - "learning_rate": 9.265183887915938e-05, - "loss": 1.194, - "num_input_tokens_seen": 16906624, - "step": 1050 - }, - { - "epoch": 0.07362070626143756, - "grad_norm": 6.105716705322266, - "learning_rate": 9.264484063047287e-05, - "loss": 1.0685, - "num_input_tokens_seen": 16922864, - "step": 1051 - }, - { - "epoch": 0.0736907545071668, - "grad_norm": 4.8344407081604, - "learning_rate": 9.263784238178634e-05, - "loss": 1.1992, - "num_input_tokens_seen": 16939200, - "step": 1052 - }, - { - "epoch": 0.07376080275289605, - "grad_norm": 3.553568124771118, - "learning_rate": 9.263084413309983e-05, - "loss": 0.7907, - "num_input_tokens_seen": 16955584, - "step": 1053 - }, - { - "epoch": 0.0738308509986253, - "grad_norm": 3.8178694248199463, - "learning_rate": 9.26238458844133e-05, - "loss": 1.2031, - "num_input_tokens_seen": 16971968, - "step": 1054 - }, - { - "epoch": 0.07390089924435454, - "grad_norm": 3.5509321689605713, - "learning_rate": 9.26168476357268e-05, - "loss": 1.1189, - "num_input_tokens_seen": 16988352, - "step": 1055 - }, - { - "epoch": 0.0739709474900838, - "grad_norm": 3.870811939239502, - "learning_rate": 9.260984938704028e-05, - "loss": 1.0205, - "num_input_tokens_seen": 17004736, - "step": 1056 - }, - { - "epoch": 0.07404099573581305, - "grad_norm": 11.86201286315918, - "learning_rate": 9.260285113835377e-05, - "loss": 1.037, - "num_input_tokens_seen": 17020544, - "step": 1057 - }, - { - "epoch": 0.0741110439815423, - "grad_norm": 5.2176127433776855, - "learning_rate": 9.259585288966726e-05, - "loss": 1.0797, - "num_input_tokens_seen": 17036472, - "step": 1058 - }, - { - "epoch": 0.07418109222727154, - "grad_norm": 3.72566819190979, - "learning_rate": 9.258885464098075e-05, - "loss": 0.9307, - "num_input_tokens_seen": 17052360, - "step": 1059 - }, - { - "epoch": 0.07425114047300078, - "grad_norm": 4.323361396789551, - "learning_rate": 9.258185639229422e-05, - "loss": 1.0783, - "num_input_tokens_seen": 17067672, - "step": 1060 - }, - { - "epoch": 0.07432118871873003, - "grad_norm": 4.01705265045166, - "learning_rate": 9.257485814360771e-05, - "loss": 1.0402, - "num_input_tokens_seen": 17084056, - "step": 1061 - }, - { - "epoch": 0.07439123696445928, - "grad_norm": 4.4460039138793945, - "learning_rate": 9.256785989492119e-05, - "loss": 1.2294, - "num_input_tokens_seen": 17100096, - "step": 1062 - }, - { - "epoch": 0.07446128521018852, - "grad_norm": 4.634500503540039, - "learning_rate": 9.256086164623468e-05, - "loss": 1.1479, - "num_input_tokens_seen": 17116440, - "step": 1063 - }, - { - "epoch": 0.07453133345591777, - "grad_norm": 4.146971702575684, - "learning_rate": 9.255386339754817e-05, - "loss": 0.9052, - "num_input_tokens_seen": 17132592, - "step": 1064 - }, - { - "epoch": 0.07460138170164701, - "grad_norm": 6.171874523162842, - "learning_rate": 9.254686514886165e-05, - "loss": 1.1135, - "num_input_tokens_seen": 17148704, - "step": 1065 - }, - { - "epoch": 0.07467142994737626, - "grad_norm": 6.25461483001709, - "learning_rate": 9.253986690017514e-05, - "loss": 1.0003, - "num_input_tokens_seen": 17164920, - "step": 1066 - }, - { - "epoch": 0.0747414781931055, - "grad_norm": 3.886582851409912, - "learning_rate": 9.253286865148862e-05, - "loss": 1.1917, - "num_input_tokens_seen": 17181304, - "step": 1067 - }, - { - "epoch": 0.07481152643883475, - "grad_norm": 5.067885398864746, - "learning_rate": 9.25258704028021e-05, - "loss": 1.4475, - "num_input_tokens_seen": 17197208, - "step": 1068 - }, - { - "epoch": 0.074881574684564, - "grad_norm": 4.186190128326416, - "learning_rate": 9.251887215411558e-05, - "loss": 1.1255, - "num_input_tokens_seen": 17212680, - "step": 1069 - }, - { - "epoch": 0.07495162293029324, - "grad_norm": 4.059047698974609, - "learning_rate": 9.251187390542908e-05, - "loss": 1.1467, - "num_input_tokens_seen": 17229064, - "step": 1070 - }, - { - "epoch": 0.07502167117602249, - "grad_norm": 4.154530048370361, - "learning_rate": 9.250487565674257e-05, - "loss": 1.0811, - "num_input_tokens_seen": 17245448, - "step": 1071 - }, - { - "epoch": 0.07509171942175173, - "grad_norm": 3.760453701019287, - "learning_rate": 9.249787740805605e-05, - "loss": 1.1493, - "num_input_tokens_seen": 17261832, - "step": 1072 - }, - { - "epoch": 0.07516176766748098, - "grad_norm": 3.8155417442321777, - "learning_rate": 9.249087915936954e-05, - "loss": 1.0934, - "num_input_tokens_seen": 17278216, - "step": 1073 - }, - { - "epoch": 0.07523181591321022, - "grad_norm": 4.807973384857178, - "learning_rate": 9.248388091068301e-05, - "loss": 1.0704, - "num_input_tokens_seen": 17294600, - "step": 1074 - }, - { - "epoch": 0.07530186415893947, - "grad_norm": 11.421661376953125, - "learning_rate": 9.24768826619965e-05, - "loss": 0.9472, - "num_input_tokens_seen": 17308960, - "step": 1075 - }, - { - "epoch": 0.07537191240466871, - "grad_norm": 3.7491819858551025, - "learning_rate": 9.246988441330999e-05, - "loss": 1.1395, - "num_input_tokens_seen": 17324536, - "step": 1076 - }, - { - "epoch": 0.07544196065039796, - "grad_norm": 3.6289992332458496, - "learning_rate": 9.246288616462348e-05, - "loss": 0.9375, - "num_input_tokens_seen": 17340920, - "step": 1077 - }, - { - "epoch": 0.0755120088961272, - "grad_norm": 5.741896629333496, - "learning_rate": 9.245588791593696e-05, - "loss": 1.1656, - "num_input_tokens_seen": 17357304, - "step": 1078 - }, - { - "epoch": 0.07558205714185645, - "grad_norm": 3.5879697799682617, - "learning_rate": 9.244888966725044e-05, - "loss": 0.9421, - "num_input_tokens_seen": 17373592, - "step": 1079 - }, - { - "epoch": 0.0756521053875857, - "grad_norm": 7.3384504318237305, - "learning_rate": 9.244189141856393e-05, - "loss": 1.1358, - "num_input_tokens_seen": 17387872, - "step": 1080 - }, - { - "epoch": 0.07572215363331494, - "grad_norm": 3.6677255630493164, - "learning_rate": 9.24348931698774e-05, - "loss": 0.892, - "num_input_tokens_seen": 17403088, - "step": 1081 - }, - { - "epoch": 0.07579220187904419, - "grad_norm": 3.953216075897217, - "learning_rate": 9.242789492119089e-05, - "loss": 0.9757, - "num_input_tokens_seen": 17419392, - "step": 1082 - }, - { - "epoch": 0.07586225012477343, - "grad_norm": 4.827987194061279, - "learning_rate": 9.242089667250438e-05, - "loss": 1.1493, - "num_input_tokens_seen": 17435776, - "step": 1083 - }, - { - "epoch": 0.07593229837050268, - "grad_norm": 4.416223526000977, - "learning_rate": 9.241389842381787e-05, - "loss": 0.9913, - "num_input_tokens_seen": 17452080, - "step": 1084 - }, - { - "epoch": 0.07600234661623193, - "grad_norm": 3.7776753902435303, - "learning_rate": 9.240690017513136e-05, - "loss": 1.0589, - "num_input_tokens_seen": 17468160, - "step": 1085 - }, - { - "epoch": 0.07607239486196117, - "grad_norm": 4.139477252960205, - "learning_rate": 9.239990192644485e-05, - "loss": 0.9475, - "num_input_tokens_seen": 17484544, - "step": 1086 - }, - { - "epoch": 0.07614244310769042, - "grad_norm": 5.218942642211914, - "learning_rate": 9.239290367775832e-05, - "loss": 1.1626, - "num_input_tokens_seen": 17500928, - "step": 1087 - }, - { - "epoch": 0.07621249135341966, - "grad_norm": 4.773080348968506, - "learning_rate": 9.238590542907181e-05, - "loss": 1.154, - "num_input_tokens_seen": 17517312, - "step": 1088 - }, - { - "epoch": 0.07628253959914891, - "grad_norm": 3.840151309967041, - "learning_rate": 9.237890718038528e-05, - "loss": 1.0862, - "num_input_tokens_seen": 17533696, - "step": 1089 - }, - { - "epoch": 0.07635258784487815, - "grad_norm": 4.201962471008301, - "learning_rate": 9.237190893169879e-05, - "loss": 1.0945, - "num_input_tokens_seen": 17549512, - "step": 1090 - }, - { - "epoch": 0.07642263609060741, - "grad_norm": 4.4583001136779785, - "learning_rate": 9.236491068301226e-05, - "loss": 1.074, - "num_input_tokens_seen": 17565896, - "step": 1091 - }, - { - "epoch": 0.07649268433633666, - "grad_norm": 4.013672351837158, - "learning_rate": 9.235791243432575e-05, - "loss": 1.2545, - "num_input_tokens_seen": 17582264, - "step": 1092 - }, - { - "epoch": 0.0765627325820659, - "grad_norm": 3.69555926322937, - "learning_rate": 9.235091418563924e-05, - "loss": 1.1615, - "num_input_tokens_seen": 17597888, - "step": 1093 - }, - { - "epoch": 0.07663278082779515, - "grad_norm": 4.341784954071045, - "learning_rate": 9.234391593695271e-05, - "loss": 1.0369, - "num_input_tokens_seen": 17613392, - "step": 1094 - }, - { - "epoch": 0.0767028290735244, - "grad_norm": 4.043522357940674, - "learning_rate": 9.23369176882662e-05, - "loss": 1.0509, - "num_input_tokens_seen": 17629216, - "step": 1095 - }, - { - "epoch": 0.07677287731925364, - "grad_norm": 4.330739498138428, - "learning_rate": 9.232991943957969e-05, - "loss": 1.2208, - "num_input_tokens_seen": 17645600, - "step": 1096 - }, - { - "epoch": 0.07684292556498289, - "grad_norm": 4.8433122634887695, - "learning_rate": 9.232292119089318e-05, - "loss": 0.9492, - "num_input_tokens_seen": 17660952, - "step": 1097 - }, - { - "epoch": 0.07691297381071213, - "grad_norm": 3.9039859771728516, - "learning_rate": 9.231592294220667e-05, - "loss": 1.0601, - "num_input_tokens_seen": 17677336, - "step": 1098 - }, - { - "epoch": 0.07698302205644138, - "grad_norm": 3.814103126525879, - "learning_rate": 9.230892469352014e-05, - "loss": 0.9902, - "num_input_tokens_seen": 17693720, - "step": 1099 - }, - { - "epoch": 0.07705307030217062, - "grad_norm": 3.9864039421081543, - "learning_rate": 9.230192644483363e-05, - "loss": 1.1622, - "num_input_tokens_seen": 17710104, - "step": 1100 - }, - { - "epoch": 0.07712311854789987, - "grad_norm": 4.469820499420166, - "learning_rate": 9.229492819614711e-05, - "loss": 1.044, - "num_input_tokens_seen": 17726488, - "step": 1101 - }, - { - "epoch": 0.07719316679362911, - "grad_norm": 3.8044216632843018, - "learning_rate": 9.22879299474606e-05, - "loss": 1.1283, - "num_input_tokens_seen": 17742648, - "step": 1102 - }, - { - "epoch": 0.07726321503935836, - "grad_norm": 4.859435558319092, - "learning_rate": 9.228093169877408e-05, - "loss": 1.0995, - "num_input_tokens_seen": 17759032, - "step": 1103 - }, - { - "epoch": 0.0773332632850876, - "grad_norm": 3.830214023590088, - "learning_rate": 9.227393345008757e-05, - "loss": 1.1731, - "num_input_tokens_seen": 17774872, - "step": 1104 - }, - { - "epoch": 0.07740331153081685, - "grad_norm": 4.196676254272461, - "learning_rate": 9.226693520140106e-05, - "loss": 1.2055, - "num_input_tokens_seen": 17790832, - "step": 1105 - }, - { - "epoch": 0.0774733597765461, - "grad_norm": 4.50007438659668, - "learning_rate": 9.225993695271454e-05, - "loss": 0.952, - "num_input_tokens_seen": 17805024, - "step": 1106 - }, - { - "epoch": 0.07754340802227534, - "grad_norm": 4.392070293426514, - "learning_rate": 9.225293870402803e-05, - "loss": 1.1548, - "num_input_tokens_seen": 17820008, - "step": 1107 - }, - { - "epoch": 0.07761345626800459, - "grad_norm": 4.09447717666626, - "learning_rate": 9.22459404553415e-05, - "loss": 1.1233, - "num_input_tokens_seen": 17836392, - "step": 1108 - }, - { - "epoch": 0.07768350451373383, - "grad_norm": 4.591554641723633, - "learning_rate": 9.223894220665499e-05, - "loss": 1.2772, - "num_input_tokens_seen": 17852776, - "step": 1109 - }, - { - "epoch": 0.07775355275946308, - "grad_norm": 5.629931926727295, - "learning_rate": 9.223194395796849e-05, - "loss": 1.1453, - "num_input_tokens_seen": 17869160, - "step": 1110 - }, - { - "epoch": 0.07782360100519232, - "grad_norm": 4.307553768157959, - "learning_rate": 9.222494570928197e-05, - "loss": 1.1479, - "num_input_tokens_seen": 17885544, - "step": 1111 - }, - { - "epoch": 0.07789364925092157, - "grad_norm": 4.599300384521484, - "learning_rate": 9.221794746059545e-05, - "loss": 1.1304, - "num_input_tokens_seen": 17901848, - "step": 1112 - }, - { - "epoch": 0.07796369749665082, - "grad_norm": 4.217408657073975, - "learning_rate": 9.221094921190894e-05, - "loss": 1.1611, - "num_input_tokens_seen": 17918232, - "step": 1113 - }, - { - "epoch": 0.07803374574238006, - "grad_norm": 3.885847568511963, - "learning_rate": 9.220395096322242e-05, - "loss": 0.968, - "num_input_tokens_seen": 17934504, - "step": 1114 - }, - { - "epoch": 0.07810379398810931, - "grad_norm": 4.280134677886963, - "learning_rate": 9.219695271453591e-05, - "loss": 1.0944, - "num_input_tokens_seen": 17950888, - "step": 1115 - }, - { - "epoch": 0.07817384223383855, - "grad_norm": 4.081259727478027, - "learning_rate": 9.21899544658494e-05, - "loss": 1.0872, - "num_input_tokens_seen": 17967088, - "step": 1116 - }, - { - "epoch": 0.0782438904795678, - "grad_norm": 4.206293106079102, - "learning_rate": 9.218295621716288e-05, - "loss": 1.2013, - "num_input_tokens_seen": 17983312, - "step": 1117 - }, - { - "epoch": 0.07831393872529704, - "grad_norm": 4.837226390838623, - "learning_rate": 9.217595796847636e-05, - "loss": 1.2628, - "num_input_tokens_seen": 17998768, - "step": 1118 - }, - { - "epoch": 0.07838398697102629, - "grad_norm": 4.344440460205078, - "learning_rate": 9.216895971978985e-05, - "loss": 1.0389, - "num_input_tokens_seen": 18014840, - "step": 1119 - }, - { - "epoch": 0.07845403521675554, - "grad_norm": 4.357896327972412, - "learning_rate": 9.216196147110334e-05, - "loss": 1.2444, - "num_input_tokens_seen": 18030696, - "step": 1120 - }, - { - "epoch": 0.07852408346248478, - "grad_norm": 3.6449878215789795, - "learning_rate": 9.215496322241681e-05, - "loss": 1.0622, - "num_input_tokens_seen": 18047024, - "step": 1121 - }, - { - "epoch": 0.07859413170821403, - "grad_norm": 4.154385566711426, - "learning_rate": 9.21479649737303e-05, - "loss": 1.1551, - "num_input_tokens_seen": 18063408, - "step": 1122 - }, - { - "epoch": 0.07866417995394327, - "grad_norm": 3.5929031372070312, - "learning_rate": 9.214096672504379e-05, - "loss": 0.9682, - "num_input_tokens_seen": 18079280, - "step": 1123 - }, - { - "epoch": 0.07873422819967252, - "grad_norm": 3.5724170207977295, - "learning_rate": 9.213396847635728e-05, - "loss": 0.8952, - "num_input_tokens_seen": 18094488, - "step": 1124 - }, - { - "epoch": 0.07880427644540176, - "grad_norm": 4.100067615509033, - "learning_rate": 9.212697022767077e-05, - "loss": 0.9066, - "num_input_tokens_seen": 18110872, - "step": 1125 - }, - { - "epoch": 0.07887432469113102, - "grad_norm": 4.431338787078857, - "learning_rate": 9.211997197898424e-05, - "loss": 1.0116, - "num_input_tokens_seen": 18127256, - "step": 1126 - }, - { - "epoch": 0.07894437293686027, - "grad_norm": 3.9577043056488037, - "learning_rate": 9.211297373029773e-05, - "loss": 1.1299, - "num_input_tokens_seen": 18143208, - "step": 1127 - }, - { - "epoch": 0.07901442118258951, - "grad_norm": 4.753921985626221, - "learning_rate": 9.21059754816112e-05, - "loss": 1.0686, - "num_input_tokens_seen": 18158888, - "step": 1128 - }, - { - "epoch": 0.07908446942831876, - "grad_norm": 3.763982057571411, - "learning_rate": 9.209897723292469e-05, - "loss": 1.0467, - "num_input_tokens_seen": 18175192, - "step": 1129 - }, - { - "epoch": 0.079154517674048, - "grad_norm": 3.729553699493408, - "learning_rate": 9.20919789842382e-05, - "loss": 1.1152, - "num_input_tokens_seen": 18191384, - "step": 1130 - }, - { - "epoch": 0.07922456591977725, - "grad_norm": 3.7760956287384033, - "learning_rate": 9.208498073555167e-05, - "loss": 1.0994, - "num_input_tokens_seen": 18207768, - "step": 1131 - }, - { - "epoch": 0.0792946141655065, - "grad_norm": 4.64035177230835, - "learning_rate": 9.207798248686516e-05, - "loss": 1.1037, - "num_input_tokens_seen": 18224152, - "step": 1132 - }, - { - "epoch": 0.07936466241123574, - "grad_norm": 4.1443352699279785, - "learning_rate": 9.207098423817863e-05, - "loss": 1.2329, - "num_input_tokens_seen": 18240536, - "step": 1133 - }, - { - "epoch": 0.07943471065696499, - "grad_norm": 5.332706451416016, - "learning_rate": 9.206398598949212e-05, - "loss": 1.1303, - "num_input_tokens_seen": 18255528, - "step": 1134 - }, - { - "epoch": 0.07950475890269423, - "grad_norm": 3.914705514907837, - "learning_rate": 9.20569877408056e-05, - "loss": 1.1182, - "num_input_tokens_seen": 18271768, - "step": 1135 - }, - { - "epoch": 0.07957480714842348, - "grad_norm": 4.994162559509277, - "learning_rate": 9.20499894921191e-05, - "loss": 1.175, - "num_input_tokens_seen": 18288152, - "step": 1136 - }, - { - "epoch": 0.07964485539415272, - "grad_norm": 4.132298946380615, - "learning_rate": 9.204299124343259e-05, - "loss": 0.9402, - "num_input_tokens_seen": 18303784, - "step": 1137 - }, - { - "epoch": 0.07971490363988197, - "grad_norm": 3.9048449993133545, - "learning_rate": 9.203599299474606e-05, - "loss": 1.1283, - "num_input_tokens_seen": 18319968, - "step": 1138 - }, - { - "epoch": 0.07978495188561122, - "grad_norm": 3.981844425201416, - "learning_rate": 9.202899474605955e-05, - "loss": 1.0472, - "num_input_tokens_seen": 18335976, - "step": 1139 - }, - { - "epoch": 0.07985500013134046, - "grad_norm": 4.491240501403809, - "learning_rate": 9.202199649737304e-05, - "loss": 1.1022, - "num_input_tokens_seen": 18352360, - "step": 1140 - }, - { - "epoch": 0.07992504837706971, - "grad_norm": 4.152430534362793, - "learning_rate": 9.201499824868652e-05, - "loss": 1.0688, - "num_input_tokens_seen": 18368736, - "step": 1141 - }, - { - "epoch": 0.07999509662279895, - "grad_norm": 4.337832450866699, - "learning_rate": 9.2008e-05, - "loss": 1.0397, - "num_input_tokens_seen": 18385120, - "step": 1142 - }, - { - "epoch": 0.0800651448685282, - "grad_norm": 4.865042209625244, - "learning_rate": 9.200100175131349e-05, - "loss": 0.9616, - "num_input_tokens_seen": 18401504, - "step": 1143 - }, - { - "epoch": 0.08013519311425744, - "grad_norm": 3.783113479614258, - "learning_rate": 9.199400350262698e-05, - "loss": 1.0001, - "num_input_tokens_seen": 18417176, - "step": 1144 - }, - { - "epoch": 0.08020524135998669, - "grad_norm": 4.98455286026001, - "learning_rate": 9.198700525394046e-05, - "loss": 1.2139, - "num_input_tokens_seen": 18432584, - "step": 1145 - }, - { - "epoch": 0.08027528960571594, - "grad_norm": 4.1859517097473145, - "learning_rate": 9.198000700525394e-05, - "loss": 1.1333, - "num_input_tokens_seen": 18448968, - "step": 1146 - }, - { - "epoch": 0.08034533785144518, - "grad_norm": 3.7193386554718018, - "learning_rate": 9.197300875656743e-05, - "loss": 1.0055, - "num_input_tokens_seen": 18465352, - "step": 1147 - }, - { - "epoch": 0.08041538609717443, - "grad_norm": 4.280893325805664, - "learning_rate": 9.196601050788091e-05, - "loss": 1.1261, - "num_input_tokens_seen": 18481736, - "step": 1148 - }, - { - "epoch": 0.08048543434290367, - "grad_norm": 3.9979352951049805, - "learning_rate": 9.19590122591944e-05, - "loss": 1.025, - "num_input_tokens_seen": 18498120, - "step": 1149 - }, - { - "epoch": 0.08055548258863292, - "grad_norm": 5.594225883483887, - "learning_rate": 9.195201401050789e-05, - "loss": 1.0527, - "num_input_tokens_seen": 18513944, - "step": 1150 - }, - { - "epoch": 0.08062553083436216, - "grad_norm": 4.758842468261719, - "learning_rate": 9.194501576182137e-05, - "loss": 1.0915, - "num_input_tokens_seen": 18530328, - "step": 1151 - }, - { - "epoch": 0.08069557908009141, - "grad_norm": 5.597489356994629, - "learning_rate": 9.193801751313486e-05, - "loss": 1.0673, - "num_input_tokens_seen": 18546632, - "step": 1152 - }, - { - "epoch": 0.08076562732582065, - "grad_norm": 5.279472827911377, - "learning_rate": 9.193101926444834e-05, - "loss": 1.2897, - "num_input_tokens_seen": 18561856, - "step": 1153 - }, - { - "epoch": 0.0808356755715499, - "grad_norm": 4.672069072723389, - "learning_rate": 9.192402101576183e-05, - "loss": 1.0298, - "num_input_tokens_seen": 18577944, - "step": 1154 - }, - { - "epoch": 0.08090572381727915, - "grad_norm": 3.65533447265625, - "learning_rate": 9.19170227670753e-05, - "loss": 0.933, - "num_input_tokens_seen": 18593720, - "step": 1155 - }, - { - "epoch": 0.08097577206300839, - "grad_norm": 4.212414741516113, - "learning_rate": 9.19100245183888e-05, - "loss": 1.0496, - "num_input_tokens_seen": 18609864, - "step": 1156 - }, - { - "epoch": 0.08104582030873764, - "grad_norm": 4.471503734588623, - "learning_rate": 9.190302626970229e-05, - "loss": 1.2261, - "num_input_tokens_seen": 18626248, - "step": 1157 - }, - { - "epoch": 0.08111586855446688, - "grad_norm": 4.952723979949951, - "learning_rate": 9.189602802101577e-05, - "loss": 1.056, - "num_input_tokens_seen": 18642632, - "step": 1158 - }, - { - "epoch": 0.08118591680019613, - "grad_norm": 3.921449661254883, - "learning_rate": 9.188902977232926e-05, - "loss": 1.1617, - "num_input_tokens_seen": 18659016, - "step": 1159 - }, - { - "epoch": 0.08125596504592539, - "grad_norm": 3.728752374649048, - "learning_rate": 9.188203152364273e-05, - "loss": 1.1217, - "num_input_tokens_seen": 18675400, - "step": 1160 - }, - { - "epoch": 0.08132601329165463, - "grad_norm": 3.8742613792419434, - "learning_rate": 9.187503327495622e-05, - "loss": 1.1538, - "num_input_tokens_seen": 18691232, - "step": 1161 - }, - { - "epoch": 0.08139606153738388, - "grad_norm": 3.827157735824585, - "learning_rate": 9.186803502626971e-05, - "loss": 1.1457, - "num_input_tokens_seen": 18707616, - "step": 1162 - }, - { - "epoch": 0.08146610978311312, - "grad_norm": 3.8507778644561768, - "learning_rate": 9.18610367775832e-05, - "loss": 1.0317, - "num_input_tokens_seen": 18724000, - "step": 1163 - }, - { - "epoch": 0.08153615802884237, - "grad_norm": 5.328095436096191, - "learning_rate": 9.185403852889669e-05, - "loss": 1.0921, - "num_input_tokens_seen": 18740384, - "step": 1164 - }, - { - "epoch": 0.08160620627457162, - "grad_norm": 4.8900322914123535, - "learning_rate": 9.184704028021016e-05, - "loss": 1.1308, - "num_input_tokens_seen": 18756768, - "step": 1165 - }, - { - "epoch": 0.08167625452030086, - "grad_norm": 3.810084104537964, - "learning_rate": 9.184004203152365e-05, - "loss": 1.1244, - "num_input_tokens_seen": 18772632, - "step": 1166 - }, - { - "epoch": 0.08174630276603011, - "grad_norm": 4.318419456481934, - "learning_rate": 9.183304378283714e-05, - "loss": 1.0372, - "num_input_tokens_seen": 18788272, - "step": 1167 - }, - { - "epoch": 0.08181635101175935, - "grad_norm": 4.093379020690918, - "learning_rate": 9.182604553415061e-05, - "loss": 1.18, - "num_input_tokens_seen": 18803672, - "step": 1168 - }, - { - "epoch": 0.0818863992574886, - "grad_norm": 4.630450248718262, - "learning_rate": 9.18190472854641e-05, - "loss": 1.1439, - "num_input_tokens_seen": 18820056, - "step": 1169 - }, - { - "epoch": 0.08195644750321784, - "grad_norm": 4.388457775115967, - "learning_rate": 9.181204903677759e-05, - "loss": 1.0971, - "num_input_tokens_seen": 18836440, - "step": 1170 - }, - { - "epoch": 0.08202649574894709, - "grad_norm": 3.6942262649536133, - "learning_rate": 9.180505078809108e-05, - "loss": 1.1594, - "num_input_tokens_seen": 18852824, - "step": 1171 - }, - { - "epoch": 0.08209654399467634, - "grad_norm": 3.937696933746338, - "learning_rate": 9.179805253940455e-05, - "loss": 1.1841, - "num_input_tokens_seen": 18869208, - "step": 1172 - }, - { - "epoch": 0.08216659224040558, - "grad_norm": 4.062703609466553, - "learning_rate": 9.179105429071804e-05, - "loss": 1.083, - "num_input_tokens_seen": 18885320, - "step": 1173 - }, - { - "epoch": 0.08223664048613483, - "grad_norm": 7.794081211090088, - "learning_rate": 9.178405604203153e-05, - "loss": 1.2287, - "num_input_tokens_seen": 18900224, - "step": 1174 - }, - { - "epoch": 0.08230668873186407, - "grad_norm": 4.429391860961914, - "learning_rate": 9.1777057793345e-05, - "loss": 1.0504, - "num_input_tokens_seen": 18916456, - "step": 1175 - }, - { - "epoch": 0.08237673697759332, - "grad_norm": 3.954869508743286, - "learning_rate": 9.17700595446585e-05, - "loss": 1.1558, - "num_input_tokens_seen": 18932840, - "step": 1176 - }, - { - "epoch": 0.08244678522332256, - "grad_norm": 5.555337429046631, - "learning_rate": 9.176306129597198e-05, - "loss": 1.3628, - "num_input_tokens_seen": 18949224, - "step": 1177 - }, - { - "epoch": 0.08251683346905181, - "grad_norm": 3.575295925140381, - "learning_rate": 9.175606304728547e-05, - "loss": 1.0651, - "num_input_tokens_seen": 18965552, - "step": 1178 - }, - { - "epoch": 0.08258688171478105, - "grad_norm": 5.927703380584717, - "learning_rate": 9.174906479859896e-05, - "loss": 1.0582, - "num_input_tokens_seen": 18981496, - "step": 1179 - }, - { - "epoch": 0.0826569299605103, - "grad_norm": 6.553986549377441, - "learning_rate": 9.174206654991243e-05, - "loss": 1.4058, - "num_input_tokens_seen": 18996808, - "step": 1180 - }, - { - "epoch": 0.08272697820623955, - "grad_norm": 4.315832138061523, - "learning_rate": 9.173506830122592e-05, - "loss": 1.1166, - "num_input_tokens_seen": 19013192, - "step": 1181 - }, - { - "epoch": 0.08279702645196879, - "grad_norm": 3.818033218383789, - "learning_rate": 9.172807005253941e-05, - "loss": 1.0744, - "num_input_tokens_seen": 19029464, - "step": 1182 - }, - { - "epoch": 0.08286707469769804, - "grad_norm": 3.4207711219787598, - "learning_rate": 9.17210718038529e-05, - "loss": 0.8952, - "num_input_tokens_seen": 19045592, - "step": 1183 - }, - { - "epoch": 0.08293712294342728, - "grad_norm": 4.3305864334106445, - "learning_rate": 9.171407355516639e-05, - "loss": 0.9617, - "num_input_tokens_seen": 19061864, - "step": 1184 - }, - { - "epoch": 0.08300717118915653, - "grad_norm": 5.365218162536621, - "learning_rate": 9.170707530647986e-05, - "loss": 1.1669, - "num_input_tokens_seen": 19075448, - "step": 1185 - }, - { - "epoch": 0.08307721943488577, - "grad_norm": 3.9939708709716797, - "learning_rate": 9.170007705779335e-05, - "loss": 1.1325, - "num_input_tokens_seen": 19091832, - "step": 1186 - }, - { - "epoch": 0.08314726768061502, - "grad_norm": 3.8088884353637695, - "learning_rate": 9.169307880910683e-05, - "loss": 1.0132, - "num_input_tokens_seen": 19107920, - "step": 1187 - }, - { - "epoch": 0.08321731592634427, - "grad_norm": 3.858799457550049, - "learning_rate": 9.168608056042032e-05, - "loss": 0.9805, - "num_input_tokens_seen": 19123776, - "step": 1188 - }, - { - "epoch": 0.08328736417207351, - "grad_norm": 4.042770862579346, - "learning_rate": 9.16790823117338e-05, - "loss": 1.1668, - "num_input_tokens_seen": 19139752, - "step": 1189 - }, - { - "epoch": 0.08335741241780276, - "grad_norm": 4.2054762840271, - "learning_rate": 9.16720840630473e-05, - "loss": 1.0702, - "num_input_tokens_seen": 19156136, - "step": 1190 - }, - { - "epoch": 0.083427460663532, - "grad_norm": 4.450238227844238, - "learning_rate": 9.166508581436078e-05, - "loss": 1.0751, - "num_input_tokens_seen": 19172240, - "step": 1191 - }, - { - "epoch": 0.08349750890926125, - "grad_norm": 4.126129627227783, - "learning_rate": 9.165808756567426e-05, - "loss": 0.9957, - "num_input_tokens_seen": 19188624, - "step": 1192 - }, - { - "epoch": 0.0835675571549905, - "grad_norm": 4.131893157958984, - "learning_rate": 9.165108931698775e-05, - "loss": 1.2004, - "num_input_tokens_seen": 19205008, - "step": 1193 - }, - { - "epoch": 0.08363760540071974, - "grad_norm": 4.25187873840332, - "learning_rate": 9.164409106830123e-05, - "loss": 1.3571, - "num_input_tokens_seen": 19220856, - "step": 1194 - }, - { - "epoch": 0.083707653646449, - "grad_norm": 3.842498302459717, - "learning_rate": 9.163709281961471e-05, - "loss": 1.0963, - "num_input_tokens_seen": 19237208, - "step": 1195 - }, - { - "epoch": 0.08377770189217824, - "grad_norm": 3.694279432296753, - "learning_rate": 9.16300945709282e-05, - "loss": 1.1177, - "num_input_tokens_seen": 19253592, - "step": 1196 - }, - { - "epoch": 0.08384775013790749, - "grad_norm": 4.382254123687744, - "learning_rate": 9.162309632224169e-05, - "loss": 1.0344, - "num_input_tokens_seen": 19269976, - "step": 1197 - }, - { - "epoch": 0.08391779838363674, - "grad_norm": 4.267289161682129, - "learning_rate": 9.161609807355518e-05, - "loss": 1.1211, - "num_input_tokens_seen": 19286360, - "step": 1198 - }, - { - "epoch": 0.08398784662936598, - "grad_norm": 5.554534435272217, - "learning_rate": 9.160909982486865e-05, - "loss": 0.9674, - "num_input_tokens_seen": 19301800, - "step": 1199 - }, - { - "epoch": 0.08405789487509523, - "grad_norm": 4.1479668617248535, - "learning_rate": 9.160210157618214e-05, - "loss": 1.2334, - "num_input_tokens_seen": 19317392, - "step": 1200 - }, - { - "epoch": 0.08405789487509523, - "eval_loss": 1.1600490808486938, - "eval_runtime": 0.2015, - "eval_samples_per_second": 4.962, - "eval_steps_per_second": 4.962, - "num_input_tokens_seen": 19317392, - "step": 1200 - }, - { - "epoch": 0.08412794312082447, - "grad_norm": 4.1876349449157715, - "learning_rate": 9.159510332749563e-05, - "loss": 1.2036, - "num_input_tokens_seen": 19333776, - "step": 1201 - }, - { - "epoch": 0.08419799136655372, - "grad_norm": 4.031203746795654, - "learning_rate": 9.15881050788091e-05, - "loss": 1.2127, - "num_input_tokens_seen": 19349616, - "step": 1202 - }, - { - "epoch": 0.08426803961228296, - "grad_norm": 4.013350963592529, - "learning_rate": 9.15811068301226e-05, - "loss": 1.2147, - "num_input_tokens_seen": 19366000, - "step": 1203 - }, - { - "epoch": 0.08433808785801221, - "grad_norm": 4.509790897369385, - "learning_rate": 9.157410858143608e-05, - "loss": 1.3484, - "num_input_tokens_seen": 19381904, - "step": 1204 - }, - { - "epoch": 0.08440813610374145, - "grad_norm": 4.630336761474609, - "learning_rate": 9.156711033274957e-05, - "loss": 1.0246, - "num_input_tokens_seen": 19398288, - "step": 1205 - }, - { - "epoch": 0.0844781843494707, - "grad_norm": 3.819884777069092, - "learning_rate": 9.156011208406304e-05, - "loss": 1.1242, - "num_input_tokens_seen": 19414248, - "step": 1206 - }, - { - "epoch": 0.08454823259519995, - "grad_norm": 3.7933132648468018, - "learning_rate": 9.155311383537653e-05, - "loss": 1.0766, - "num_input_tokens_seen": 19430632, - "step": 1207 - }, - { - "epoch": 0.08461828084092919, - "grad_norm": 5.7384934425354, - "learning_rate": 9.154611558669002e-05, - "loss": 1.0691, - "num_input_tokens_seen": 19446248, - "step": 1208 - }, - { - "epoch": 0.08468832908665844, - "grad_norm": 3.9594175815582275, - "learning_rate": 9.153911733800351e-05, - "loss": 1.2029, - "num_input_tokens_seen": 19462632, - "step": 1209 - }, - { - "epoch": 0.08475837733238768, - "grad_norm": 3.8251891136169434, - "learning_rate": 9.1532119089317e-05, - "loss": 0.9994, - "num_input_tokens_seen": 19479016, - "step": 1210 - }, - { - "epoch": 0.08482842557811693, - "grad_norm": 3.9750332832336426, - "learning_rate": 9.152512084063049e-05, - "loss": 1.1737, - "num_input_tokens_seen": 19495112, - "step": 1211 - }, - { - "epoch": 0.08489847382384617, - "grad_norm": 3.986170530319214, - "learning_rate": 9.151812259194396e-05, - "loss": 1.1441, - "num_input_tokens_seen": 19511216, - "step": 1212 - }, - { - "epoch": 0.08496852206957542, - "grad_norm": 3.914065361022949, - "learning_rate": 9.151112434325745e-05, - "loss": 1.2233, - "num_input_tokens_seen": 19527600, - "step": 1213 - }, - { - "epoch": 0.08503857031530467, - "grad_norm": 4.328094482421875, - "learning_rate": 9.150412609457093e-05, - "loss": 1.2076, - "num_input_tokens_seen": 19543984, - "step": 1214 - }, - { - "epoch": 0.08510861856103391, - "grad_norm": 4.112467288970947, - "learning_rate": 9.149712784588441e-05, - "loss": 1.1732, - "num_input_tokens_seen": 19560368, - "step": 1215 - }, - { - "epoch": 0.08517866680676316, - "grad_norm": 4.680009365081787, - "learning_rate": 9.14901295971979e-05, - "loss": 0.985, - "num_input_tokens_seen": 19575616, - "step": 1216 - }, - { - "epoch": 0.0852487150524924, - "grad_norm": 4.4872660636901855, - "learning_rate": 9.148313134851139e-05, - "loss": 1.1799, - "num_input_tokens_seen": 19592000, - "step": 1217 - }, - { - "epoch": 0.08531876329822165, - "grad_norm": 3.7546637058258057, - "learning_rate": 9.147613309982488e-05, - "loss": 1.1989, - "num_input_tokens_seen": 19608384, - "step": 1218 - }, - { - "epoch": 0.0853888115439509, - "grad_norm": 5.590888500213623, - "learning_rate": 9.146913485113835e-05, - "loss": 1.1411, - "num_input_tokens_seen": 19624768, - "step": 1219 - }, - { - "epoch": 0.08545885978968014, - "grad_norm": 3.958021640777588, - "learning_rate": 9.146213660245184e-05, - "loss": 0.9309, - "num_input_tokens_seen": 19641152, - "step": 1220 - }, - { - "epoch": 0.08552890803540938, - "grad_norm": 3.7641196250915527, - "learning_rate": 9.145513835376533e-05, - "loss": 1.0299, - "num_input_tokens_seen": 19657536, - "step": 1221 - }, - { - "epoch": 0.08559895628113863, - "grad_norm": 4.395461559295654, - "learning_rate": 9.14481401050788e-05, - "loss": 1.1404, - "num_input_tokens_seen": 19673712, - "step": 1222 - }, - { - "epoch": 0.08566900452686788, - "grad_norm": 3.8162319660186768, - "learning_rate": 9.144114185639231e-05, - "loss": 1.1638, - "num_input_tokens_seen": 19689336, - "step": 1223 - }, - { - "epoch": 0.08573905277259712, - "grad_norm": 3.7025444507598877, - "learning_rate": 9.143414360770578e-05, - "loss": 0.9995, - "num_input_tokens_seen": 19705464, - "step": 1224 - }, - { - "epoch": 0.08580910101832637, - "grad_norm": 3.8621439933776855, - "learning_rate": 9.142714535901927e-05, - "loss": 1.1639, - "num_input_tokens_seen": 19721848, - "step": 1225 - }, - { - "epoch": 0.08587914926405561, - "grad_norm": 4.243250846862793, - "learning_rate": 9.142014711033275e-05, - "loss": 1.0104, - "num_input_tokens_seen": 19738072, - "step": 1226 - }, - { - "epoch": 0.08594919750978486, - "grad_norm": 4.05800724029541, - "learning_rate": 9.141314886164624e-05, - "loss": 1.0257, - "num_input_tokens_seen": 19754456, - "step": 1227 - }, - { - "epoch": 0.0860192457555141, - "grad_norm": 4.0894455909729, - "learning_rate": 9.140615061295972e-05, - "loss": 1.254, - "num_input_tokens_seen": 19770840, - "step": 1228 - }, - { - "epoch": 0.08608929400124336, - "grad_norm": 4.296894073486328, - "learning_rate": 9.139915236427321e-05, - "loss": 1.1298, - "num_input_tokens_seen": 19786864, - "step": 1229 - }, - { - "epoch": 0.08615934224697261, - "grad_norm": 4.0352888107299805, - "learning_rate": 9.13921541155867e-05, - "loss": 1.0611, - "num_input_tokens_seen": 19801800, - "step": 1230 - }, - { - "epoch": 0.08622939049270185, - "grad_norm": 4.087375640869141, - "learning_rate": 9.138515586690018e-05, - "loss": 0.9686, - "num_input_tokens_seen": 19818184, - "step": 1231 - }, - { - "epoch": 0.0862994387384311, - "grad_norm": 4.045078754425049, - "learning_rate": 9.137815761821367e-05, - "loss": 1.0915, - "num_input_tokens_seen": 19833016, - "step": 1232 - }, - { - "epoch": 0.08636948698416035, - "grad_norm": 4.399363040924072, - "learning_rate": 9.137115936952714e-05, - "loss": 1.1875, - "num_input_tokens_seen": 19848912, - "step": 1233 - }, - { - "epoch": 0.08643953522988959, - "grad_norm": 4.420406818389893, - "learning_rate": 9.136416112084063e-05, - "loss": 1.0534, - "num_input_tokens_seen": 19865296, - "step": 1234 - }, - { - "epoch": 0.08650958347561884, - "grad_norm": 4.131808280944824, - "learning_rate": 9.135716287215412e-05, - "loss": 1.1865, - "num_input_tokens_seen": 19881376, - "step": 1235 - }, - { - "epoch": 0.08657963172134808, - "grad_norm": 3.8256850242614746, - "learning_rate": 9.13501646234676e-05, - "loss": 1.0539, - "num_input_tokens_seen": 19897704, - "step": 1236 - }, - { - "epoch": 0.08664967996707733, - "grad_norm": 4.3497233390808105, - "learning_rate": 9.13431663747811e-05, - "loss": 1.191, - "num_input_tokens_seen": 19914088, - "step": 1237 - }, - { - "epoch": 0.08671972821280657, - "grad_norm": 4.18136739730835, - "learning_rate": 9.133616812609458e-05, - "loss": 1.0539, - "num_input_tokens_seen": 19930128, - "step": 1238 - }, - { - "epoch": 0.08678977645853582, - "grad_norm": 4.782970905303955, - "learning_rate": 9.132916987740806e-05, - "loss": 1.1992, - "num_input_tokens_seen": 19946512, - "step": 1239 - }, - { - "epoch": 0.08685982470426507, - "grad_norm": 4.16589879989624, - "learning_rate": 9.132217162872155e-05, - "loss": 1.1463, - "num_input_tokens_seen": 19962488, - "step": 1240 - }, - { - "epoch": 0.08692987294999431, - "grad_norm": 3.73541522026062, - "learning_rate": 9.131517338003502e-05, - "loss": 1.0272, - "num_input_tokens_seen": 19978584, - "step": 1241 - }, - { - "epoch": 0.08699992119572356, - "grad_norm": 4.225815773010254, - "learning_rate": 9.130817513134851e-05, - "loss": 1.177, - "num_input_tokens_seen": 19994816, - "step": 1242 - }, - { - "epoch": 0.0870699694414528, - "grad_norm": 7.807470321655273, - "learning_rate": 9.1301176882662e-05, - "loss": 1.1635, - "num_input_tokens_seen": 20010576, - "step": 1243 - }, - { - "epoch": 0.08714001768718205, - "grad_norm": 4.818174839019775, - "learning_rate": 9.129417863397549e-05, - "loss": 1.1892, - "num_input_tokens_seen": 20025712, - "step": 1244 - }, - { - "epoch": 0.0872100659329113, - "grad_norm": 3.8367979526519775, - "learning_rate": 9.128718038528898e-05, - "loss": 1.0096, - "num_input_tokens_seen": 20041904, - "step": 1245 - }, - { - "epoch": 0.08728011417864054, - "grad_norm": 3.9912586212158203, - "learning_rate": 9.128018213660245e-05, - "loss": 1.097, - "num_input_tokens_seen": 20058288, - "step": 1246 - }, - { - "epoch": 0.08735016242436978, - "grad_norm": 4.842557907104492, - "learning_rate": 9.127318388791594e-05, - "loss": 1.2012, - "num_input_tokens_seen": 20074672, - "step": 1247 - }, - { - "epoch": 0.08742021067009903, - "grad_norm": 3.816938877105713, - "learning_rate": 9.126618563922943e-05, - "loss": 1.1683, - "num_input_tokens_seen": 20090664, - "step": 1248 - }, - { - "epoch": 0.08749025891582828, - "grad_norm": 3.712480306625366, - "learning_rate": 9.125918739054292e-05, - "loss": 1.1978, - "num_input_tokens_seen": 20107048, - "step": 1249 - }, - { - "epoch": 0.08756030716155752, - "grad_norm": 4.185492515563965, - "learning_rate": 9.12521891418564e-05, - "loss": 1.2042, - "num_input_tokens_seen": 20123432, - "step": 1250 - }, - { - "epoch": 0.08763035540728677, - "grad_norm": 5.510714530944824, - "learning_rate": 9.124519089316988e-05, - "loss": 0.9757, - "num_input_tokens_seen": 20139112, - "step": 1251 - }, - { - "epoch": 0.08770040365301601, - "grad_norm": 3.9170289039611816, - "learning_rate": 9.123819264448337e-05, - "loss": 1.0213, - "num_input_tokens_seen": 20155496, - "step": 1252 - }, - { - "epoch": 0.08777045189874526, - "grad_norm": 3.738008975982666, - "learning_rate": 9.123119439579684e-05, - "loss": 0.9446, - "num_input_tokens_seen": 20171760, - "step": 1253 - }, - { - "epoch": 0.0878405001444745, - "grad_norm": 4.845873832702637, - "learning_rate": 9.122419614711033e-05, - "loss": 1.2135, - "num_input_tokens_seen": 20188056, - "step": 1254 - }, - { - "epoch": 0.08791054839020375, - "grad_norm": 4.166906356811523, - "learning_rate": 9.121719789842382e-05, - "loss": 1.1558, - "num_input_tokens_seen": 20204440, - "step": 1255 - }, - { - "epoch": 0.087980596635933, - "grad_norm": 4.039194107055664, - "learning_rate": 9.121019964973731e-05, - "loss": 1.0297, - "num_input_tokens_seen": 20220824, - "step": 1256 - }, - { - "epoch": 0.08805064488166224, - "grad_norm": 3.545482635498047, - "learning_rate": 9.12032014010508e-05, - "loss": 0.9757, - "num_input_tokens_seen": 20236888, - "step": 1257 - }, - { - "epoch": 0.08812069312739149, - "grad_norm": 3.82114839553833, - "learning_rate": 9.119620315236427e-05, - "loss": 1.1637, - "num_input_tokens_seen": 20253272, - "step": 1258 - }, - { - "epoch": 0.08819074137312073, - "grad_norm": 4.770678997039795, - "learning_rate": 9.118920490367776e-05, - "loss": 1.1421, - "num_input_tokens_seen": 20269656, - "step": 1259 - }, - { - "epoch": 0.08826078961884998, - "grad_norm": 4.4319539070129395, - "learning_rate": 9.118220665499124e-05, - "loss": 1.1565, - "num_input_tokens_seen": 20285456, - "step": 1260 - }, - { - "epoch": 0.08833083786457922, - "grad_norm": 4.0923357009887695, - "learning_rate": 9.117520840630473e-05, - "loss": 1.2328, - "num_input_tokens_seen": 20301232, - "step": 1261 - }, - { - "epoch": 0.08840088611030847, - "grad_norm": 5.8347344398498535, - "learning_rate": 9.116821015761821e-05, - "loss": 0.8824, - "num_input_tokens_seen": 20317224, - "step": 1262 - }, - { - "epoch": 0.08847093435603771, - "grad_norm": 4.525367259979248, - "learning_rate": 9.11612119089317e-05, - "loss": 1.1554, - "num_input_tokens_seen": 20332616, - "step": 1263 - }, - { - "epoch": 0.08854098260176697, - "grad_norm": 3.9754436016082764, - "learning_rate": 9.115421366024519e-05, - "loss": 1.0423, - "num_input_tokens_seen": 20348336, - "step": 1264 - }, - { - "epoch": 0.08861103084749622, - "grad_norm": 4.40745735168457, - "learning_rate": 9.114721541155868e-05, - "loss": 1.0485, - "num_input_tokens_seen": 20364312, - "step": 1265 - }, - { - "epoch": 0.08868107909322547, - "grad_norm": 7.126221179962158, - "learning_rate": 9.114021716287216e-05, - "loss": 1.2035, - "num_input_tokens_seen": 20380696, - "step": 1266 - }, - { - "epoch": 0.08875112733895471, - "grad_norm": 4.306386947631836, - "learning_rate": 9.113321891418564e-05, - "loss": 1.0399, - "num_input_tokens_seen": 20397080, - "step": 1267 - }, - { - "epoch": 0.08882117558468396, - "grad_norm": 3.566943407058716, - "learning_rate": 9.112622066549912e-05, - "loss": 1.0463, - "num_input_tokens_seen": 20413464, - "step": 1268 - }, - { - "epoch": 0.0888912238304132, - "grad_norm": 3.975228786468506, - "learning_rate": 9.111922241681262e-05, - "loss": 1.2576, - "num_input_tokens_seen": 20429848, - "step": 1269 - }, - { - "epoch": 0.08896127207614245, - "grad_norm": 4.928854465484619, - "learning_rate": 9.11122241681261e-05, - "loss": 1.1555, - "num_input_tokens_seen": 20446192, - "step": 1270 - }, - { - "epoch": 0.0890313203218717, - "grad_norm": 4.288821697235107, - "learning_rate": 9.110522591943958e-05, - "loss": 1.2559, - "num_input_tokens_seen": 20462576, - "step": 1271 - }, - { - "epoch": 0.08910136856760094, - "grad_norm": 3.9346396923065186, - "learning_rate": 9.109822767075307e-05, - "loss": 1.1479, - "num_input_tokens_seen": 20478520, - "step": 1272 - }, - { - "epoch": 0.08917141681333018, - "grad_norm": 3.7976620197296143, - "learning_rate": 9.109122942206655e-05, - "loss": 0.9903, - "num_input_tokens_seen": 20494408, - "step": 1273 - }, - { - "epoch": 0.08924146505905943, - "grad_norm": 5.373577117919922, - "learning_rate": 9.108423117338004e-05, - "loss": 0.8863, - "num_input_tokens_seen": 20510792, - "step": 1274 - }, - { - "epoch": 0.08931151330478868, - "grad_norm": 4.248324394226074, - "learning_rate": 9.107723292469353e-05, - "loss": 1.3492, - "num_input_tokens_seen": 20527064, - "step": 1275 - }, - { - "epoch": 0.08938156155051792, - "grad_norm": 4.453672885894775, - "learning_rate": 9.107023467600701e-05, - "loss": 0.9763, - "num_input_tokens_seen": 20543448, - "step": 1276 - }, - { - "epoch": 0.08945160979624717, - "grad_norm": 4.8721184730529785, - "learning_rate": 9.10632364273205e-05, - "loss": 0.9455, - "num_input_tokens_seen": 20559832, - "step": 1277 - }, - { - "epoch": 0.08952165804197641, - "grad_norm": 5.0173540115356445, - "learning_rate": 9.105623817863398e-05, - "loss": 1.0303, - "num_input_tokens_seen": 20576216, - "step": 1278 - }, - { - "epoch": 0.08959170628770566, - "grad_norm": 5.00100040435791, - "learning_rate": 9.104923992994747e-05, - "loss": 1.0393, - "num_input_tokens_seen": 20592600, - "step": 1279 - }, - { - "epoch": 0.0896617545334349, - "grad_norm": 4.271099090576172, - "learning_rate": 9.104224168126094e-05, - "loss": 1.2307, - "num_input_tokens_seen": 20608632, - "step": 1280 - }, - { - "epoch": 0.08973180277916415, - "grad_norm": 4.246976852416992, - "learning_rate": 9.103524343257443e-05, - "loss": 1.1405, - "num_input_tokens_seen": 20625016, - "step": 1281 - }, - { - "epoch": 0.0898018510248934, - "grad_norm": 5.033923149108887, - "learning_rate": 9.102824518388792e-05, - "loss": 1.0849, - "num_input_tokens_seen": 20641400, - "step": 1282 - }, - { - "epoch": 0.08987189927062264, - "grad_norm": 4.4118571281433105, - "learning_rate": 9.102124693520141e-05, - "loss": 1.118, - "num_input_tokens_seen": 20657448, - "step": 1283 - }, - { - "epoch": 0.08994194751635189, - "grad_norm": 4.150144577026367, - "learning_rate": 9.10142486865149e-05, - "loss": 1.0676, - "num_input_tokens_seen": 20673080, - "step": 1284 - }, - { - "epoch": 0.09001199576208113, - "grad_norm": 3.767683744430542, - "learning_rate": 9.100725043782837e-05, - "loss": 0.8968, - "num_input_tokens_seen": 20689464, - "step": 1285 - }, - { - "epoch": 0.09008204400781038, - "grad_norm": 4.816582202911377, - "learning_rate": 9.100025218914186e-05, - "loss": 1.0039, - "num_input_tokens_seen": 20703896, - "step": 1286 - }, - { - "epoch": 0.09015209225353962, - "grad_norm": 3.8913414478302, - "learning_rate": 9.099325394045533e-05, - "loss": 1.0077, - "num_input_tokens_seen": 20720280, - "step": 1287 - }, - { - "epoch": 0.09022214049926887, - "grad_norm": 4.305298328399658, - "learning_rate": 9.098625569176882e-05, - "loss": 1.1555, - "num_input_tokens_seen": 20735944, - "step": 1288 - }, - { - "epoch": 0.09029218874499811, - "grad_norm": 3.3120992183685303, - "learning_rate": 9.097925744308233e-05, - "loss": 0.8591, - "num_input_tokens_seen": 20752128, - "step": 1289 - }, - { - "epoch": 0.09036223699072736, - "grad_norm": 4.705013751983643, - "learning_rate": 9.09722591943958e-05, - "loss": 1.4579, - "num_input_tokens_seen": 20768512, - "step": 1290 - }, - { - "epoch": 0.0904322852364566, - "grad_norm": 5.08630895614624, - "learning_rate": 9.096526094570929e-05, - "loss": 1.1049, - "num_input_tokens_seen": 20783976, - "step": 1291 - }, - { - "epoch": 0.09050233348218585, - "grad_norm": 3.634686231613159, - "learning_rate": 9.095826269702278e-05, - "loss": 1.0344, - "num_input_tokens_seen": 20800360, - "step": 1292 - }, - { - "epoch": 0.0905723817279151, - "grad_norm": 4.220744609832764, - "learning_rate": 9.095126444833625e-05, - "loss": 1.1843, - "num_input_tokens_seen": 20816744, - "step": 1293 - }, - { - "epoch": 0.09064242997364434, - "grad_norm": 4.724472522735596, - "learning_rate": 9.094426619964974e-05, - "loss": 1.1365, - "num_input_tokens_seen": 20833128, - "step": 1294 - }, - { - "epoch": 0.09071247821937359, - "grad_norm": 3.9398090839385986, - "learning_rate": 9.093726795096323e-05, - "loss": 1.0703, - "num_input_tokens_seen": 20849448, - "step": 1295 - }, - { - "epoch": 0.09078252646510283, - "grad_norm": 4.260062217712402, - "learning_rate": 9.093026970227672e-05, - "loss": 1.0968, - "num_input_tokens_seen": 20865832, - "step": 1296 - }, - { - "epoch": 0.09085257471083208, - "grad_norm": 4.383310317993164, - "learning_rate": 9.09232714535902e-05, - "loss": 1.2542, - "num_input_tokens_seen": 20881288, - "step": 1297 - }, - { - "epoch": 0.09092262295656132, - "grad_norm": 4.479433059692383, - "learning_rate": 9.091627320490368e-05, - "loss": 0.9533, - "num_input_tokens_seen": 20897328, - "step": 1298 - }, - { - "epoch": 0.09099267120229058, - "grad_norm": 4.911858081817627, - "learning_rate": 9.090927495621717e-05, - "loss": 1.3399, - "num_input_tokens_seen": 20913712, - "step": 1299 - }, - { - "epoch": 0.09106271944801983, - "grad_norm": 4.015485763549805, - "learning_rate": 9.090227670753065e-05, - "loss": 1.1156, - "num_input_tokens_seen": 20929984, - "step": 1300 - }, - { - "epoch": 0.09113276769374908, - "grad_norm": 3.8690338134765625, - "learning_rate": 9.089527845884413e-05, - "loss": 1.0634, - "num_input_tokens_seen": 20946368, - "step": 1301 - }, - { - "epoch": 0.09120281593947832, - "grad_norm": 5.142012596130371, - "learning_rate": 9.088828021015762e-05, - "loss": 1.0579, - "num_input_tokens_seen": 20962752, - "step": 1302 - }, - { - "epoch": 0.09127286418520757, - "grad_norm": 3.954049587249756, - "learning_rate": 9.088128196147111e-05, - "loss": 1.0862, - "num_input_tokens_seen": 20979136, - "step": 1303 - }, - { - "epoch": 0.09134291243093681, - "grad_norm": 4.13312292098999, - "learning_rate": 9.08742837127846e-05, - "loss": 1.0548, - "num_input_tokens_seen": 20995520, - "step": 1304 - }, - { - "epoch": 0.09141296067666606, - "grad_norm": 4.24699592590332, - "learning_rate": 9.086728546409808e-05, - "loss": 1.0126, - "num_input_tokens_seen": 21011904, - "step": 1305 - }, - { - "epoch": 0.0914830089223953, - "grad_norm": 4.847048759460449, - "learning_rate": 9.086028721541156e-05, - "loss": 0.9973, - "num_input_tokens_seen": 21027664, - "step": 1306 - }, - { - "epoch": 0.09155305716812455, - "grad_norm": 4.573661804199219, - "learning_rate": 9.085328896672504e-05, - "loss": 1.005, - "num_input_tokens_seen": 21044048, - "step": 1307 - }, - { - "epoch": 0.0916231054138538, - "grad_norm": 4.13530158996582, - "learning_rate": 9.084629071803853e-05, - "loss": 1.1033, - "num_input_tokens_seen": 21060432, - "step": 1308 - }, - { - "epoch": 0.09169315365958304, - "grad_norm": 4.017937183380127, - "learning_rate": 9.083929246935203e-05, - "loss": 1.1971, - "num_input_tokens_seen": 21076816, - "step": 1309 - }, - { - "epoch": 0.09176320190531229, - "grad_norm": 5.928586483001709, - "learning_rate": 9.08322942206655e-05, - "loss": 1.0547, - "num_input_tokens_seen": 21093200, - "step": 1310 - }, - { - "epoch": 0.09183325015104153, - "grad_norm": 4.2442169189453125, - "learning_rate": 9.082529597197899e-05, - "loss": 1.2794, - "num_input_tokens_seen": 21109256, - "step": 1311 - }, - { - "epoch": 0.09190329839677078, - "grad_norm": 4.891444683074951, - "learning_rate": 9.081829772329247e-05, - "loss": 1.1833, - "num_input_tokens_seen": 21124848, - "step": 1312 - }, - { - "epoch": 0.09197334664250002, - "grad_norm": 4.323850154876709, - "learning_rate": 9.081129947460596e-05, - "loss": 1.1683, - "num_input_tokens_seen": 21141176, - "step": 1313 - }, - { - "epoch": 0.09204339488822927, - "grad_norm": 4.239765644073486, - "learning_rate": 9.080430122591943e-05, - "loss": 1.1073, - "num_input_tokens_seen": 21157240, - "step": 1314 - }, - { - "epoch": 0.09211344313395851, - "grad_norm": 4.12881326675415, - "learning_rate": 9.079730297723293e-05, - "loss": 1.2522, - "num_input_tokens_seen": 21173624, - "step": 1315 - }, - { - "epoch": 0.09218349137968776, - "grad_norm": 4.238161087036133, - "learning_rate": 9.079030472854642e-05, - "loss": 1.1828, - "num_input_tokens_seen": 21190008, - "step": 1316 - }, - { - "epoch": 0.092253539625417, - "grad_norm": 4.124176502227783, - "learning_rate": 9.07833064798599e-05, - "loss": 1.1388, - "num_input_tokens_seen": 21206392, - "step": 1317 - }, - { - "epoch": 0.09232358787114625, - "grad_norm": 3.772136926651001, - "learning_rate": 9.077630823117339e-05, - "loss": 1.068, - "num_input_tokens_seen": 21222776, - "step": 1318 - }, - { - "epoch": 0.0923936361168755, - "grad_norm": 4.628321170806885, - "learning_rate": 9.076930998248687e-05, - "loss": 1.2363, - "num_input_tokens_seen": 21239160, - "step": 1319 - }, - { - "epoch": 0.09246368436260474, - "grad_norm": 5.3034348487854, - "learning_rate": 9.076231173380035e-05, - "loss": 1.0638, - "num_input_tokens_seen": 21255544, - "step": 1320 - }, - { - "epoch": 0.09253373260833399, - "grad_norm": 3.6543760299682617, - "learning_rate": 9.075531348511384e-05, - "loss": 1.0071, - "num_input_tokens_seen": 21271928, - "step": 1321 - }, - { - "epoch": 0.09260378085406323, - "grad_norm": 4.1335062980651855, - "learning_rate": 9.074831523642733e-05, - "loss": 1.084, - "num_input_tokens_seen": 21288312, - "step": 1322 - }, - { - "epoch": 0.09267382909979248, - "grad_norm": 3.6392204761505127, - "learning_rate": 9.074131698774082e-05, - "loss": 1.1146, - "num_input_tokens_seen": 21304696, - "step": 1323 - }, - { - "epoch": 0.09274387734552172, - "grad_norm": 4.035269737243652, - "learning_rate": 9.073431873905429e-05, - "loss": 0.9578, - "num_input_tokens_seen": 21321080, - "step": 1324 - }, - { - "epoch": 0.09281392559125097, - "grad_norm": 4.650269508361816, - "learning_rate": 9.072732049036778e-05, - "loss": 1.0242, - "num_input_tokens_seen": 21337464, - "step": 1325 - }, - { - "epoch": 0.09288397383698022, - "grad_norm": 5.850543022155762, - "learning_rate": 9.072032224168127e-05, - "loss": 1.1196, - "num_input_tokens_seen": 21352968, - "step": 1326 - }, - { - "epoch": 0.09295402208270946, - "grad_norm": 4.177901744842529, - "learning_rate": 9.071332399299474e-05, - "loss": 1.1351, - "num_input_tokens_seen": 21368968, - "step": 1327 - }, - { - "epoch": 0.09302407032843871, - "grad_norm": 4.582173824310303, - "learning_rate": 9.070632574430823e-05, - "loss": 0.9115, - "num_input_tokens_seen": 21385352, - "step": 1328 - }, - { - "epoch": 0.09309411857416795, - "grad_norm": 4.7911787033081055, - "learning_rate": 9.069932749562173e-05, - "loss": 1.0413, - "num_input_tokens_seen": 21401144, - "step": 1329 - }, - { - "epoch": 0.0931641668198972, - "grad_norm": 4.058457374572754, - "learning_rate": 9.069232924693521e-05, - "loss": 1.0611, - "num_input_tokens_seen": 21416640, - "step": 1330 - }, - { - "epoch": 0.09323421506562644, - "grad_norm": 4.972208499908447, - "learning_rate": 9.06853309982487e-05, - "loss": 1.016, - "num_input_tokens_seen": 21433024, - "step": 1331 - }, - { - "epoch": 0.09330426331135569, - "grad_norm": 4.0875091552734375, - "learning_rate": 9.067833274956217e-05, - "loss": 1.089, - "num_input_tokens_seen": 21448888, - "step": 1332 - }, - { - "epoch": 0.09337431155708495, - "grad_norm": 3.923112154006958, - "learning_rate": 9.067133450087566e-05, - "loss": 0.9824, - "num_input_tokens_seen": 21465272, - "step": 1333 - }, - { - "epoch": 0.0934443598028142, - "grad_norm": 4.067697525024414, - "learning_rate": 9.066433625218914e-05, - "loss": 1.0492, - "num_input_tokens_seen": 21481656, - "step": 1334 - }, - { - "epoch": 0.09351440804854344, - "grad_norm": 4.185417652130127, - "learning_rate": 9.065733800350264e-05, - "loss": 1.1073, - "num_input_tokens_seen": 21498040, - "step": 1335 - }, - { - "epoch": 0.09358445629427269, - "grad_norm": 7.31542444229126, - "learning_rate": 9.065033975481613e-05, - "loss": 1.4322, - "num_input_tokens_seen": 21514088, - "step": 1336 - }, - { - "epoch": 0.09365450454000193, - "grad_norm": 4.754745006561279, - "learning_rate": 9.06433415061296e-05, - "loss": 0.9953, - "num_input_tokens_seen": 21530472, - "step": 1337 - }, - { - "epoch": 0.09372455278573118, - "grad_norm": 5.81265926361084, - "learning_rate": 9.063634325744309e-05, - "loss": 1.1434, - "num_input_tokens_seen": 21545728, - "step": 1338 - }, - { - "epoch": 0.09379460103146042, - "grad_norm": 5.586238861083984, - "learning_rate": 9.062934500875657e-05, - "loss": 0.9818, - "num_input_tokens_seen": 21562112, - "step": 1339 - }, - { - "epoch": 0.09386464927718967, - "grad_norm": 4.096534729003906, - "learning_rate": 9.062234676007005e-05, - "loss": 1.1856, - "num_input_tokens_seen": 21578496, - "step": 1340 - }, - { - "epoch": 0.09393469752291891, - "grad_norm": 4.913814544677734, - "learning_rate": 9.061534851138354e-05, - "loss": 1.041, - "num_input_tokens_seen": 21594792, - "step": 1341 - }, - { - "epoch": 0.09400474576864816, - "grad_norm": 3.8853912353515625, - "learning_rate": 9.060835026269703e-05, - "loss": 1.1651, - "num_input_tokens_seen": 21611176, - "step": 1342 - }, - { - "epoch": 0.0940747940143774, - "grad_norm": 4.187959671020508, - "learning_rate": 9.060135201401052e-05, - "loss": 1.1757, - "num_input_tokens_seen": 21627560, - "step": 1343 - }, - { - "epoch": 0.09414484226010665, - "grad_norm": 4.128627777099609, - "learning_rate": 9.0594353765324e-05, - "loss": 0.9243, - "num_input_tokens_seen": 21643576, - "step": 1344 - }, - { - "epoch": 0.0942148905058359, - "grad_norm": 4.7016825675964355, - "learning_rate": 9.058735551663748e-05, - "loss": 1.2425, - "num_input_tokens_seen": 21658600, - "step": 1345 - }, - { - "epoch": 0.09428493875156514, - "grad_norm": 3.970548391342163, - "learning_rate": 9.058035726795097e-05, - "loss": 1.0495, - "num_input_tokens_seen": 21674264, - "step": 1346 - }, - { - "epoch": 0.09435498699729439, - "grad_norm": 3.812196731567383, - "learning_rate": 9.057335901926445e-05, - "loss": 0.9558, - "num_input_tokens_seen": 21690112, - "step": 1347 - }, - { - "epoch": 0.09442503524302363, - "grad_norm": 3.6845176219940186, - "learning_rate": 9.056636077057794e-05, - "loss": 0.9758, - "num_input_tokens_seen": 21705744, - "step": 1348 - }, - { - "epoch": 0.09449508348875288, - "grad_norm": 4.119202136993408, - "learning_rate": 9.055936252189142e-05, - "loss": 1.0948, - "num_input_tokens_seen": 21721776, - "step": 1349 - }, - { - "epoch": 0.09456513173448212, - "grad_norm": 4.176985740661621, - "learning_rate": 9.055236427320491e-05, - "loss": 0.9475, - "num_input_tokens_seen": 21737912, - "step": 1350 - }, - { - "epoch": 0.09463517998021137, - "grad_norm": 4.057264804840088, - "learning_rate": 9.054536602451839e-05, - "loss": 1.1746, - "num_input_tokens_seen": 21754296, - "step": 1351 - }, - { - "epoch": 0.09470522822594062, - "grad_norm": 4.5631914138793945, - "learning_rate": 9.053836777583188e-05, - "loss": 1.0894, - "num_input_tokens_seen": 21770680, - "step": 1352 - }, - { - "epoch": 0.09477527647166986, - "grad_norm": 4.854849815368652, - "learning_rate": 9.053136952714536e-05, - "loss": 1.0686, - "num_input_tokens_seen": 21787064, - "step": 1353 - }, - { - "epoch": 0.09484532471739911, - "grad_norm": 5.326946258544922, - "learning_rate": 9.052437127845884e-05, - "loss": 0.872, - "num_input_tokens_seen": 21803448, - "step": 1354 - }, - { - "epoch": 0.09491537296312835, - "grad_norm": 4.283742904663086, - "learning_rate": 9.051737302977234e-05, - "loss": 1.2683, - "num_input_tokens_seen": 21819832, - "step": 1355 - }, - { - "epoch": 0.0949854212088576, - "grad_norm": 4.165935039520264, - "learning_rate": 9.051037478108582e-05, - "loss": 0.977, - "num_input_tokens_seen": 21836216, - "step": 1356 - }, - { - "epoch": 0.09505546945458684, - "grad_norm": 4.502480983734131, - "learning_rate": 9.05033765323993e-05, - "loss": 1.2854, - "num_input_tokens_seen": 21852600, - "step": 1357 - }, - { - "epoch": 0.09512551770031609, - "grad_norm": 4.185445308685303, - "learning_rate": 9.04963782837128e-05, - "loss": 1.2225, - "num_input_tokens_seen": 21868984, - "step": 1358 - }, - { - "epoch": 0.09519556594604534, - "grad_norm": 7.288909435272217, - "learning_rate": 9.048938003502627e-05, - "loss": 1.154, - "num_input_tokens_seen": 21884648, - "step": 1359 - }, - { - "epoch": 0.09526561419177458, - "grad_norm": 4.038896560668945, - "learning_rate": 9.048238178633976e-05, - "loss": 1.1437, - "num_input_tokens_seen": 21900704, - "step": 1360 - }, - { - "epoch": 0.09533566243750383, - "grad_norm": 4.216241836547852, - "learning_rate": 9.047538353765325e-05, - "loss": 1.1379, - "num_input_tokens_seen": 21916520, - "step": 1361 - }, - { - "epoch": 0.09540571068323307, - "grad_norm": 4.2549147605896, - "learning_rate": 9.046838528896673e-05, - "loss": 1.2578, - "num_input_tokens_seen": 21932904, - "step": 1362 - }, - { - "epoch": 0.09547575892896232, - "grad_norm": 3.6919445991516113, - "learning_rate": 9.046138704028022e-05, - "loss": 0.9876, - "num_input_tokens_seen": 21949288, - "step": 1363 - }, - { - "epoch": 0.09554580717469156, - "grad_norm": 5.467876434326172, - "learning_rate": 9.04543887915937e-05, - "loss": 0.9735, - "num_input_tokens_seen": 21965672, - "step": 1364 - }, - { - "epoch": 0.09561585542042081, - "grad_norm": 4.036736011505127, - "learning_rate": 9.044739054290719e-05, - "loss": 1.0712, - "num_input_tokens_seen": 21980792, - "step": 1365 - }, - { - "epoch": 0.09568590366615005, - "grad_norm": 4.083346843719482, - "learning_rate": 9.044039229422066e-05, - "loss": 1.0883, - "num_input_tokens_seen": 21996888, - "step": 1366 - }, - { - "epoch": 0.0957559519118793, - "grad_norm": 3.553262948989868, - "learning_rate": 9.043339404553415e-05, - "loss": 1.0116, - "num_input_tokens_seen": 22013160, - "step": 1367 - }, - { - "epoch": 0.09582600015760856, - "grad_norm": 4.787721633911133, - "learning_rate": 9.042639579684764e-05, - "loss": 1.1444, - "num_input_tokens_seen": 22029544, - "step": 1368 - }, - { - "epoch": 0.0958960484033378, - "grad_norm": 3.8053700923919678, - "learning_rate": 9.041939754816113e-05, - "loss": 1.1654, - "num_input_tokens_seen": 22045888, - "step": 1369 - }, - { - "epoch": 0.09596609664906705, - "grad_norm": 3.7679660320281982, - "learning_rate": 9.041239929947462e-05, - "loss": 1.1753, - "num_input_tokens_seen": 22062272, - "step": 1370 - }, - { - "epoch": 0.0960361448947963, - "grad_norm": 5.086554527282715, - "learning_rate": 9.040540105078809e-05, - "loss": 0.9579, - "num_input_tokens_seen": 22078080, - "step": 1371 - }, - { - "epoch": 0.09610619314052554, - "grad_norm": 4.255527496337891, - "learning_rate": 9.039840280210158e-05, - "loss": 1.0953, - "num_input_tokens_seen": 22093808, - "step": 1372 - }, - { - "epoch": 0.09617624138625479, - "grad_norm": 6.081700325012207, - "learning_rate": 9.039140455341507e-05, - "loss": 1.0363, - "num_input_tokens_seen": 22110192, - "step": 1373 - }, - { - "epoch": 0.09624628963198403, - "grad_norm": 4.376565456390381, - "learning_rate": 9.038440630472854e-05, - "loss": 1.1737, - "num_input_tokens_seen": 22126576, - "step": 1374 - }, - { - "epoch": 0.09631633787771328, - "grad_norm": 4.051114559173584, - "learning_rate": 9.037740805604205e-05, - "loss": 1.1921, - "num_input_tokens_seen": 22142768, - "step": 1375 - }, - { - "epoch": 0.09638638612344252, - "grad_norm": 4.46164083480835, - "learning_rate": 9.037040980735552e-05, - "loss": 1.1541, - "num_input_tokens_seen": 22158600, - "step": 1376 - }, - { - "epoch": 0.09645643436917177, - "grad_norm": 4.242503643035889, - "learning_rate": 9.036341155866901e-05, - "loss": 1.1314, - "num_input_tokens_seen": 22174984, - "step": 1377 - }, - { - "epoch": 0.09652648261490102, - "grad_norm": 3.6338908672332764, - "learning_rate": 9.035641330998248e-05, - "loss": 0.9257, - "num_input_tokens_seen": 22190880, - "step": 1378 - }, - { - "epoch": 0.09659653086063026, - "grad_norm": 4.73402738571167, - "learning_rate": 9.034941506129597e-05, - "loss": 1.1981, - "num_input_tokens_seen": 22206632, - "step": 1379 - }, - { - "epoch": 0.09666657910635951, - "grad_norm": 4.450289726257324, - "learning_rate": 9.034241681260946e-05, - "loss": 1.0851, - "num_input_tokens_seen": 22222896, - "step": 1380 - }, - { - "epoch": 0.09673662735208875, - "grad_norm": 5.578179359436035, - "learning_rate": 9.033541856392295e-05, - "loss": 1.2856, - "num_input_tokens_seen": 22238280, - "step": 1381 - }, - { - "epoch": 0.096806675597818, - "grad_norm": 3.8745546340942383, - "learning_rate": 9.032842031523644e-05, - "loss": 0.9841, - "num_input_tokens_seen": 22254664, - "step": 1382 - }, - { - "epoch": 0.09687672384354724, - "grad_norm": 5.7268548011779785, - "learning_rate": 9.032142206654991e-05, - "loss": 1.2024, - "num_input_tokens_seen": 22270000, - "step": 1383 - }, - { - "epoch": 0.09694677208927649, - "grad_norm": 4.380898952484131, - "learning_rate": 9.03144238178634e-05, - "loss": 1.0589, - "num_input_tokens_seen": 22286384, - "step": 1384 - }, - { - "epoch": 0.09701682033500574, - "grad_norm": 5.762500762939453, - "learning_rate": 9.030742556917689e-05, - "loss": 1.2061, - "num_input_tokens_seen": 22302272, - "step": 1385 - }, - { - "epoch": 0.09708686858073498, - "grad_norm": 3.739488363265991, - "learning_rate": 9.030042732049037e-05, - "loss": 0.9867, - "num_input_tokens_seen": 22318656, - "step": 1386 - }, - { - "epoch": 0.09715691682646423, - "grad_norm": 4.584897994995117, - "learning_rate": 9.029342907180385e-05, - "loss": 1.1934, - "num_input_tokens_seen": 22334704, - "step": 1387 - }, - { - "epoch": 0.09722696507219347, - "grad_norm": 4.161139488220215, - "learning_rate": 9.028643082311734e-05, - "loss": 1.1638, - "num_input_tokens_seen": 22349800, - "step": 1388 - }, - { - "epoch": 0.09729701331792272, - "grad_norm": 4.115293979644775, - "learning_rate": 9.027943257443083e-05, - "loss": 1.0181, - "num_input_tokens_seen": 22366184, - "step": 1389 - }, - { - "epoch": 0.09736706156365196, - "grad_norm": 3.7355988025665283, - "learning_rate": 9.027243432574432e-05, - "loss": 1.1182, - "num_input_tokens_seen": 22382568, - "step": 1390 - }, - { - "epoch": 0.09743710980938121, - "grad_norm": 4.15507173538208, - "learning_rate": 9.02654360770578e-05, - "loss": 1.0272, - "num_input_tokens_seen": 22398480, - "step": 1391 - }, - { - "epoch": 0.09750715805511045, - "grad_norm": 3.770918607711792, - "learning_rate": 9.025843782837128e-05, - "loss": 0.9834, - "num_input_tokens_seen": 22414864, - "step": 1392 - }, - { - "epoch": 0.0975772063008397, - "grad_norm": 4.214321136474609, - "learning_rate": 9.025143957968476e-05, - "loss": 1.1738, - "num_input_tokens_seen": 22429752, - "step": 1393 - }, - { - "epoch": 0.09764725454656895, - "grad_norm": 3.9854986667633057, - "learning_rate": 9.024444133099825e-05, - "loss": 1.2832, - "num_input_tokens_seen": 22446136, - "step": 1394 - }, - { - "epoch": 0.09771730279229819, - "grad_norm": 4.996057510375977, - "learning_rate": 9.023744308231174e-05, - "loss": 1.1691, - "num_input_tokens_seen": 22461160, - "step": 1395 - }, - { - "epoch": 0.09778735103802744, - "grad_norm": 3.682765007019043, - "learning_rate": 9.023044483362523e-05, - "loss": 0.9548, - "num_input_tokens_seen": 22477336, - "step": 1396 - }, - { - "epoch": 0.09785739928375668, - "grad_norm": 4.367272853851318, - "learning_rate": 9.022344658493871e-05, - "loss": 1.0512, - "num_input_tokens_seen": 22492952, - "step": 1397 - }, - { - "epoch": 0.09792744752948593, - "grad_norm": 3.9716336727142334, - "learning_rate": 9.021644833625219e-05, - "loss": 1.103, - "num_input_tokens_seen": 22509336, - "step": 1398 - }, - { - "epoch": 0.09799749577521517, - "grad_norm": 4.043631553649902, - "learning_rate": 9.020945008756568e-05, - "loss": 1.1439, - "num_input_tokens_seen": 22525568, - "step": 1399 - }, - { - "epoch": 0.09806754402094442, - "grad_norm": 4.343166351318359, - "learning_rate": 9.020245183887917e-05, - "loss": 1.1948, - "num_input_tokens_seen": 22541328, - "step": 1400 - }, - { - "epoch": 0.09806754402094442, - "eval_loss": 1.1561514139175415, - "eval_runtime": 0.1977, - "eval_samples_per_second": 5.058, - "eval_steps_per_second": 5.058, - "num_input_tokens_seen": 22541328, - "step": 1400 - }, - { - "epoch": 0.09813759226667366, - "grad_norm": 4.709417819976807, - "learning_rate": 9.019545359019265e-05, - "loss": 1.1398, - "num_input_tokens_seen": 22557304, - "step": 1401 - }, - { - "epoch": 0.09820764051240291, - "grad_norm": 7.022638320922852, - "learning_rate": 9.018845534150614e-05, - "loss": 1.0342, - "num_input_tokens_seen": 22573688, - "step": 1402 - }, - { - "epoch": 0.09827768875813217, - "grad_norm": 3.7976694107055664, - "learning_rate": 9.018145709281962e-05, - "loss": 0.9829, - "num_input_tokens_seen": 22589848, - "step": 1403 - }, - { - "epoch": 0.09834773700386142, - "grad_norm": 3.70877742767334, - "learning_rate": 9.01744588441331e-05, - "loss": 0.9707, - "num_input_tokens_seen": 22606232, - "step": 1404 - }, - { - "epoch": 0.09841778524959066, - "grad_norm": 7.724960803985596, - "learning_rate": 9.016746059544658e-05, - "loss": 0.9602, - "num_input_tokens_seen": 22621912, - "step": 1405 - }, - { - "epoch": 0.09848783349531991, - "grad_norm": 3.9619522094726562, - "learning_rate": 9.016046234676007e-05, - "loss": 0.998, - "num_input_tokens_seen": 22638296, - "step": 1406 - }, - { - "epoch": 0.09855788174104915, - "grad_norm": 3.8303041458129883, - "learning_rate": 9.015346409807356e-05, - "loss": 1.0762, - "num_input_tokens_seen": 22654496, - "step": 1407 - }, - { - "epoch": 0.0986279299867784, - "grad_norm": 4.029507637023926, - "learning_rate": 9.014646584938705e-05, - "loss": 1.2072, - "num_input_tokens_seen": 22670544, - "step": 1408 - }, - { - "epoch": 0.09869797823250764, - "grad_norm": 3.8487346172332764, - "learning_rate": 9.013946760070054e-05, - "loss": 1.1834, - "num_input_tokens_seen": 22686592, - "step": 1409 - }, - { - "epoch": 0.09876802647823689, - "grad_norm": 3.700751543045044, - "learning_rate": 9.013246935201401e-05, - "loss": 0.8698, - "num_input_tokens_seen": 22702976, - "step": 1410 - }, - { - "epoch": 0.09883807472396614, - "grad_norm": 3.686884641647339, - "learning_rate": 9.01254711033275e-05, - "loss": 0.9591, - "num_input_tokens_seen": 22719360, - "step": 1411 - }, - { - "epoch": 0.09890812296969538, - "grad_norm": 4.176409721374512, - "learning_rate": 9.011847285464099e-05, - "loss": 1.1578, - "num_input_tokens_seen": 22735744, - "step": 1412 - }, - { - "epoch": 0.09897817121542463, - "grad_norm": 4.331852912902832, - "learning_rate": 9.011147460595446e-05, - "loss": 0.9769, - "num_input_tokens_seen": 22752128, - "step": 1413 - }, - { - "epoch": 0.09904821946115387, - "grad_norm": 3.8534255027770996, - "learning_rate": 9.010447635726795e-05, - "loss": 1.1536, - "num_input_tokens_seen": 22768512, - "step": 1414 - }, - { - "epoch": 0.09911826770688312, - "grad_norm": 4.066548824310303, - "learning_rate": 9.009747810858144e-05, - "loss": 1.1199, - "num_input_tokens_seen": 22784760, - "step": 1415 - }, - { - "epoch": 0.09918831595261236, - "grad_norm": 4.076517581939697, - "learning_rate": 9.009047985989493e-05, - "loss": 1.1132, - "num_input_tokens_seen": 22801144, - "step": 1416 - }, - { - "epoch": 0.09925836419834161, - "grad_norm": 3.8858346939086914, - "learning_rate": 9.008348161120842e-05, - "loss": 0.9509, - "num_input_tokens_seen": 22817320, - "step": 1417 - }, - { - "epoch": 0.09932841244407085, - "grad_norm": 6.4605584144592285, - "learning_rate": 9.007648336252189e-05, - "loss": 1.2701, - "num_input_tokens_seen": 22833704, - "step": 1418 - }, - { - "epoch": 0.0993984606898001, - "grad_norm": 4.157481670379639, - "learning_rate": 9.006948511383538e-05, - "loss": 1.0169, - "num_input_tokens_seen": 22850088, - "step": 1419 - }, - { - "epoch": 0.09946850893552935, - "grad_norm": 3.725755214691162, - "learning_rate": 9.006248686514886e-05, - "loss": 1.0183, - "num_input_tokens_seen": 22866472, - "step": 1420 - }, - { - "epoch": 0.09953855718125859, - "grad_norm": 4.012838363647461, - "learning_rate": 9.005548861646236e-05, - "loss": 0.8425, - "num_input_tokens_seen": 22882856, - "step": 1421 - }, - { - "epoch": 0.09960860542698784, - "grad_norm": 3.8754239082336426, - "learning_rate": 9.004849036777583e-05, - "loss": 1.1375, - "num_input_tokens_seen": 22899240, - "step": 1422 - }, - { - "epoch": 0.09967865367271708, - "grad_norm": 3.90873384475708, - "learning_rate": 9.004149211908932e-05, - "loss": 1.0574, - "num_input_tokens_seen": 22915160, - "step": 1423 - }, - { - "epoch": 0.09974870191844633, - "grad_norm": 5.698948860168457, - "learning_rate": 9.003449387040281e-05, - "loss": 1.1338, - "num_input_tokens_seen": 22930592, - "step": 1424 - }, - { - "epoch": 0.09981875016417557, - "grad_norm": 4.103662014007568, - "learning_rate": 9.002749562171629e-05, - "loss": 1.2384, - "num_input_tokens_seen": 22946976, - "step": 1425 - }, - { - "epoch": 0.09988879840990482, - "grad_norm": 4.404048442840576, - "learning_rate": 9.002049737302977e-05, - "loss": 1.3855, - "num_input_tokens_seen": 22963360, - "step": 1426 - }, - { - "epoch": 0.09995884665563406, - "grad_norm": 4.043710708618164, - "learning_rate": 9.001349912434326e-05, - "loss": 1.2713, - "num_input_tokens_seen": 22979544, - "step": 1427 - }, - { - "epoch": 0.10002889490136331, - "grad_norm": 4.169802188873291, - "learning_rate": 9.000650087565675e-05, - "loss": 1.0777, - "num_input_tokens_seen": 22995072, - "step": 1428 - }, - { - "epoch": 0.10009894314709256, - "grad_norm": 4.010350227355957, - "learning_rate": 8.999950262697024e-05, - "loss": 1.1245, - "num_input_tokens_seen": 23010904, - "step": 1429 - }, - { - "epoch": 0.1001689913928218, - "grad_norm": 4.496591567993164, - "learning_rate": 8.999250437828372e-05, - "loss": 1.3372, - "num_input_tokens_seen": 23027288, - "step": 1430 - }, - { - "epoch": 0.10023903963855105, - "grad_norm": 4.2428765296936035, - "learning_rate": 8.99855061295972e-05, - "loss": 1.0258, - "num_input_tokens_seen": 23043352, - "step": 1431 - }, - { - "epoch": 0.10030908788428029, - "grad_norm": 4.083342552185059, - "learning_rate": 8.997850788091068e-05, - "loss": 1.227, - "num_input_tokens_seen": 23059736, - "step": 1432 - }, - { - "epoch": 0.10037913613000954, - "grad_norm": 3.860734462738037, - "learning_rate": 8.997150963222417e-05, - "loss": 1.0791, - "num_input_tokens_seen": 23075400, - "step": 1433 - }, - { - "epoch": 0.10044918437573878, - "grad_norm": 3.985151767730713, - "learning_rate": 8.996451138353766e-05, - "loss": 1.0486, - "num_input_tokens_seen": 23091704, - "step": 1434 - }, - { - "epoch": 0.10051923262146803, - "grad_norm": 4.039731502532959, - "learning_rate": 8.995751313485114e-05, - "loss": 0.9793, - "num_input_tokens_seen": 23108088, - "step": 1435 - }, - { - "epoch": 0.10058928086719728, - "grad_norm": 6.1780619621276855, - "learning_rate": 8.995051488616463e-05, - "loss": 1.0645, - "num_input_tokens_seen": 23123128, - "step": 1436 - }, - { - "epoch": 0.10065932911292653, - "grad_norm": 4.5783586502075195, - "learning_rate": 8.994351663747811e-05, - "loss": 1.1634, - "num_input_tokens_seen": 23139168, - "step": 1437 - }, - { - "epoch": 0.10072937735865578, - "grad_norm": 3.889927864074707, - "learning_rate": 8.99365183887916e-05, - "loss": 0.97, - "num_input_tokens_seen": 23154952, - "step": 1438 - }, - { - "epoch": 0.10079942560438503, - "grad_norm": 3.927945852279663, - "learning_rate": 8.992952014010509e-05, - "loss": 1.2428, - "num_input_tokens_seen": 23170688, - "step": 1439 - }, - { - "epoch": 0.10086947385011427, - "grad_norm": 3.8991434574127197, - "learning_rate": 8.992252189141856e-05, - "loss": 0.9519, - "num_input_tokens_seen": 23186432, - "step": 1440 - }, - { - "epoch": 0.10093952209584352, - "grad_norm": 3.6479310989379883, - "learning_rate": 8.991552364273206e-05, - "loss": 0.9656, - "num_input_tokens_seen": 23202816, - "step": 1441 - }, - { - "epoch": 0.10100957034157276, - "grad_norm": 4.637960910797119, - "learning_rate": 8.990852539404554e-05, - "loss": 1.2853, - "num_input_tokens_seen": 23218304, - "step": 1442 - }, - { - "epoch": 0.10107961858730201, - "grad_norm": 4.000091552734375, - "learning_rate": 8.990152714535903e-05, - "loss": 1.0421, - "num_input_tokens_seen": 23234688, - "step": 1443 - }, - { - "epoch": 0.10114966683303125, - "grad_norm": 4.959738731384277, - "learning_rate": 8.989452889667251e-05, - "loss": 1.0904, - "num_input_tokens_seen": 23250656, - "step": 1444 - }, - { - "epoch": 0.1012197150787605, - "grad_norm": 3.9251675605773926, - "learning_rate": 8.988753064798599e-05, - "loss": 0.9219, - "num_input_tokens_seen": 23266984, - "step": 1445 - }, - { - "epoch": 0.10128976332448975, - "grad_norm": 4.28665828704834, - "learning_rate": 8.988053239929948e-05, - "loss": 1.1465, - "num_input_tokens_seen": 23283368, - "step": 1446 - }, - { - "epoch": 0.10135981157021899, - "grad_norm": 4.421731472015381, - "learning_rate": 8.987353415061297e-05, - "loss": 1.1098, - "num_input_tokens_seen": 23298728, - "step": 1447 - }, - { - "epoch": 0.10142985981594824, - "grad_norm": 5.080065727233887, - "learning_rate": 8.986653590192646e-05, - "loss": 1.1172, - "num_input_tokens_seen": 23315112, - "step": 1448 - }, - { - "epoch": 0.10149990806167748, - "grad_norm": 5.618803977966309, - "learning_rate": 8.985953765323993e-05, - "loss": 0.9551, - "num_input_tokens_seen": 23329864, - "step": 1449 - }, - { - "epoch": 0.10156995630740673, - "grad_norm": 3.756836414337158, - "learning_rate": 8.985253940455342e-05, - "loss": 1.0981, - "num_input_tokens_seen": 23345672, - "step": 1450 - }, - { - "epoch": 0.10164000455313597, - "grad_norm": 4.461424827575684, - "learning_rate": 8.984554115586691e-05, - "loss": 1.1914, - "num_input_tokens_seen": 23362056, - "step": 1451 - }, - { - "epoch": 0.10171005279886522, - "grad_norm": 5.267919063568115, - "learning_rate": 8.983854290718038e-05, - "loss": 1.1928, - "num_input_tokens_seen": 23378440, - "step": 1452 - }, - { - "epoch": 0.10178010104459446, - "grad_norm": 5.513551235198975, - "learning_rate": 8.983154465849387e-05, - "loss": 1.2405, - "num_input_tokens_seen": 23394824, - "step": 1453 - }, - { - "epoch": 0.10185014929032371, - "grad_norm": 4.46366548538208, - "learning_rate": 8.982454640980736e-05, - "loss": 1.1436, - "num_input_tokens_seen": 23410568, - "step": 1454 - }, - { - "epoch": 0.10192019753605296, - "grad_norm": 5.066692352294922, - "learning_rate": 8.981754816112085e-05, - "loss": 1.1389, - "num_input_tokens_seen": 23426952, - "step": 1455 - }, - { - "epoch": 0.1019902457817822, - "grad_norm": 3.980743169784546, - "learning_rate": 8.981054991243434e-05, - "loss": 1.0623, - "num_input_tokens_seen": 23443256, - "step": 1456 - }, - { - "epoch": 0.10206029402751145, - "grad_norm": 4.088611125946045, - "learning_rate": 8.980355166374781e-05, - "loss": 1.0388, - "num_input_tokens_seen": 23459640, - "step": 1457 - }, - { - "epoch": 0.10213034227324069, - "grad_norm": 3.9585626125335693, - "learning_rate": 8.97965534150613e-05, - "loss": 1.2051, - "num_input_tokens_seen": 23475176, - "step": 1458 - }, - { - "epoch": 0.10220039051896994, - "grad_norm": 3.7923290729522705, - "learning_rate": 8.978955516637478e-05, - "loss": 1.0001, - "num_input_tokens_seen": 23490704, - "step": 1459 - }, - { - "epoch": 0.10227043876469918, - "grad_norm": 3.9089629650115967, - "learning_rate": 8.978255691768826e-05, - "loss": 0.9786, - "num_input_tokens_seen": 23506168, - "step": 1460 - }, - { - "epoch": 0.10234048701042843, - "grad_norm": 6.2259039878845215, - "learning_rate": 8.977555866900175e-05, - "loss": 1.2854, - "num_input_tokens_seen": 23522552, - "step": 1461 - }, - { - "epoch": 0.10241053525615768, - "grad_norm": 4.071867942810059, - "learning_rate": 8.976856042031524e-05, - "loss": 1.0724, - "num_input_tokens_seen": 23538936, - "step": 1462 - }, - { - "epoch": 0.10248058350188692, - "grad_norm": 4.587897777557373, - "learning_rate": 8.976156217162873e-05, - "loss": 1.1307, - "num_input_tokens_seen": 23554536, - "step": 1463 - }, - { - "epoch": 0.10255063174761617, - "grad_norm": 3.944937229156494, - "learning_rate": 8.97545639229422e-05, - "loss": 1.1503, - "num_input_tokens_seen": 23570888, - "step": 1464 - }, - { - "epoch": 0.10262067999334541, - "grad_norm": 3.7418766021728516, - "learning_rate": 8.97475656742557e-05, - "loss": 1.0414, - "num_input_tokens_seen": 23587272, - "step": 1465 - }, - { - "epoch": 0.10269072823907466, - "grad_norm": 3.9055676460266113, - "learning_rate": 8.974056742556918e-05, - "loss": 1.2284, - "num_input_tokens_seen": 23603640, - "step": 1466 - }, - { - "epoch": 0.1027607764848039, - "grad_norm": 3.9338066577911377, - "learning_rate": 8.973356917688267e-05, - "loss": 1.2389, - "num_input_tokens_seen": 23620024, - "step": 1467 - }, - { - "epoch": 0.10283082473053315, - "grad_norm": 4.024602890014648, - "learning_rate": 8.972657092819616e-05, - "loss": 1.1358, - "num_input_tokens_seen": 23636408, - "step": 1468 - }, - { - "epoch": 0.1029008729762624, - "grad_norm": 4.09812068939209, - "learning_rate": 8.971957267950963e-05, - "loss": 1.0734, - "num_input_tokens_seen": 23652480, - "step": 1469 - }, - { - "epoch": 0.10297092122199164, - "grad_norm": 4.7382025718688965, - "learning_rate": 8.971257443082312e-05, - "loss": 1.4112, - "num_input_tokens_seen": 23668424, - "step": 1470 - }, - { - "epoch": 0.10304096946772089, - "grad_norm": 4.518669605255127, - "learning_rate": 8.970557618213661e-05, - "loss": 1.3466, - "num_input_tokens_seen": 23684808, - "step": 1471 - }, - { - "epoch": 0.10311101771345015, - "grad_norm": 4.023036003112793, - "learning_rate": 8.969857793345009e-05, - "loss": 1.0246, - "num_input_tokens_seen": 23701192, - "step": 1472 - }, - { - "epoch": 0.10318106595917939, - "grad_norm": 4.6244215965271, - "learning_rate": 8.969157968476358e-05, - "loss": 1.2391, - "num_input_tokens_seen": 23717576, - "step": 1473 - }, - { - "epoch": 0.10325111420490864, - "grad_norm": 4.517683506011963, - "learning_rate": 8.968458143607706e-05, - "loss": 1.3872, - "num_input_tokens_seen": 23733960, - "step": 1474 - }, - { - "epoch": 0.10332116245063788, - "grad_norm": 4.048764705657959, - "learning_rate": 8.967758318739055e-05, - "loss": 1.0453, - "num_input_tokens_seen": 23750344, - "step": 1475 - }, - { - "epoch": 0.10339121069636713, - "grad_norm": 4.248376369476318, - "learning_rate": 8.967058493870403e-05, - "loss": 1.176, - "num_input_tokens_seen": 23766160, - "step": 1476 - }, - { - "epoch": 0.10346125894209637, - "grad_norm": 3.780548095703125, - "learning_rate": 8.966358669001752e-05, - "loss": 0.9048, - "num_input_tokens_seen": 23782544, - "step": 1477 - }, - { - "epoch": 0.10353130718782562, - "grad_norm": 4.26375675201416, - "learning_rate": 8.9656588441331e-05, - "loss": 0.8651, - "num_input_tokens_seen": 23798928, - "step": 1478 - }, - { - "epoch": 0.10360135543355486, - "grad_norm": 3.9202687740325928, - "learning_rate": 8.964959019264448e-05, - "loss": 1.1058, - "num_input_tokens_seen": 23815312, - "step": 1479 - }, - { - "epoch": 0.10367140367928411, - "grad_norm": 3.983797788619995, - "learning_rate": 8.964259194395797e-05, - "loss": 1.0778, - "num_input_tokens_seen": 23831696, - "step": 1480 - }, - { - "epoch": 0.10374145192501336, - "grad_norm": 4.471195220947266, - "learning_rate": 8.963559369527146e-05, - "loss": 1.1858, - "num_input_tokens_seen": 23847768, - "step": 1481 - }, - { - "epoch": 0.1038115001707426, - "grad_norm": 3.560317039489746, - "learning_rate": 8.962859544658495e-05, - "loss": 1.0205, - "num_input_tokens_seen": 23864152, - "step": 1482 - }, - { - "epoch": 0.10388154841647185, - "grad_norm": 3.8699846267700195, - "learning_rate": 8.962159719789843e-05, - "loss": 1.1438, - "num_input_tokens_seen": 23880536, - "step": 1483 - }, - { - "epoch": 0.10395159666220109, - "grad_norm": 4.547862529754639, - "learning_rate": 8.961459894921191e-05, - "loss": 1.0303, - "num_input_tokens_seen": 23896704, - "step": 1484 - }, - { - "epoch": 0.10402164490793034, - "grad_norm": 4.669456481933594, - "learning_rate": 8.96076007005254e-05, - "loss": 1.1994, - "num_input_tokens_seen": 23913088, - "step": 1485 - }, - { - "epoch": 0.10409169315365958, - "grad_norm": 4.346285343170166, - "learning_rate": 8.960060245183887e-05, - "loss": 1.2677, - "num_input_tokens_seen": 23929472, - "step": 1486 - }, - { - "epoch": 0.10416174139938883, - "grad_norm": 6.5028581619262695, - "learning_rate": 8.959360420315236e-05, - "loss": 0.989, - "num_input_tokens_seen": 23945216, - "step": 1487 - }, - { - "epoch": 0.10423178964511808, - "grad_norm": 3.935488224029541, - "learning_rate": 8.958660595446586e-05, - "loss": 1.2657, - "num_input_tokens_seen": 23961600, - "step": 1488 - }, - { - "epoch": 0.10430183789084732, - "grad_norm": 3.772397518157959, - "learning_rate": 8.957960770577934e-05, - "loss": 1.1038, - "num_input_tokens_seen": 23977984, - "step": 1489 - }, - { - "epoch": 0.10437188613657657, - "grad_norm": 4.508286476135254, - "learning_rate": 8.957260945709283e-05, - "loss": 1.2694, - "num_input_tokens_seen": 23993752, - "step": 1490 - }, - { - "epoch": 0.10444193438230581, - "grad_norm": 4.667380332946777, - "learning_rate": 8.95656112084063e-05, - "loss": 1.2837, - "num_input_tokens_seen": 24009832, - "step": 1491 - }, - { - "epoch": 0.10451198262803506, - "grad_norm": 7.675503730773926, - "learning_rate": 8.955861295971979e-05, - "loss": 1.121, - "num_input_tokens_seen": 24025784, - "step": 1492 - }, - { - "epoch": 0.1045820308737643, - "grad_norm": 4.427548408508301, - "learning_rate": 8.955161471103328e-05, - "loss": 0.835, - "num_input_tokens_seen": 24041568, - "step": 1493 - }, - { - "epoch": 0.10465207911949355, - "grad_norm": 3.9065396785736084, - "learning_rate": 8.954461646234677e-05, - "loss": 1.1322, - "num_input_tokens_seen": 24057952, - "step": 1494 - }, - { - "epoch": 0.1047221273652228, - "grad_norm": 4.052605628967285, - "learning_rate": 8.953761821366026e-05, - "loss": 1.1133, - "num_input_tokens_seen": 24074336, - "step": 1495 - }, - { - "epoch": 0.10479217561095204, - "grad_norm": 3.758476734161377, - "learning_rate": 8.953061996497373e-05, - "loss": 1.1302, - "num_input_tokens_seen": 24090720, - "step": 1496 - }, - { - "epoch": 0.10486222385668129, - "grad_norm": 4.4470014572143555, - "learning_rate": 8.952362171628722e-05, - "loss": 1.0969, - "num_input_tokens_seen": 24107024, - "step": 1497 - }, - { - "epoch": 0.10493227210241053, - "grad_norm": 4.222001075744629, - "learning_rate": 8.951662346760071e-05, - "loss": 1.147, - "num_input_tokens_seen": 24123408, - "step": 1498 - }, - { - "epoch": 0.10500232034813978, - "grad_norm": 4.72997522354126, - "learning_rate": 8.950962521891418e-05, - "loss": 1.1086, - "num_input_tokens_seen": 24137672, - "step": 1499 - }, - { - "epoch": 0.10507236859386902, - "grad_norm": 4.342312812805176, - "learning_rate": 8.950262697022767e-05, - "loss": 1.2044, - "num_input_tokens_seen": 24153248, - "step": 1500 - }, - { - "epoch": 0.10514241683959827, - "grad_norm": 4.723706245422363, - "learning_rate": 8.949562872154116e-05, - "loss": 1.1075, - "num_input_tokens_seen": 24169240, - "step": 1501 - }, - { - "epoch": 0.10521246508532751, - "grad_norm": 4.244345188140869, - "learning_rate": 8.948863047285465e-05, - "loss": 1.1839, - "num_input_tokens_seen": 24184608, - "step": 1502 - }, - { - "epoch": 0.10528251333105676, - "grad_norm": 3.6271615028381348, - "learning_rate": 8.948163222416812e-05, - "loss": 1.0755, - "num_input_tokens_seen": 24200992, - "step": 1503 - }, - { - "epoch": 0.105352561576786, - "grad_norm": 3.858696937561035, - "learning_rate": 8.947463397548161e-05, - "loss": 1.0598, - "num_input_tokens_seen": 24217376, - "step": 1504 - }, - { - "epoch": 0.10542260982251525, - "grad_norm": 7.14077091217041, - "learning_rate": 8.94676357267951e-05, - "loss": 1.0362, - "num_input_tokens_seen": 24232368, - "step": 1505 - }, - { - "epoch": 0.10549265806824451, - "grad_norm": 4.203495979309082, - "learning_rate": 8.946063747810858e-05, - "loss": 1.2491, - "num_input_tokens_seen": 24248520, - "step": 1506 - }, - { - "epoch": 0.10556270631397376, - "grad_norm": 4.344188213348389, - "learning_rate": 8.945363922942207e-05, - "loss": 0.905, - "num_input_tokens_seen": 24264824, - "step": 1507 - }, - { - "epoch": 0.105632754559703, - "grad_norm": 6.156280517578125, - "learning_rate": 8.944664098073557e-05, - "loss": 1.3046, - "num_input_tokens_seen": 24281208, - "step": 1508 - }, - { - "epoch": 0.10570280280543225, - "grad_norm": 4.687212944030762, - "learning_rate": 8.943964273204904e-05, - "loss": 1.1898, - "num_input_tokens_seen": 24297592, - "step": 1509 - }, - { - "epoch": 0.10577285105116149, - "grad_norm": 3.9128546714782715, - "learning_rate": 8.943264448336253e-05, - "loss": 1.0506, - "num_input_tokens_seen": 24313976, - "step": 1510 - }, - { - "epoch": 0.10584289929689074, - "grad_norm": 5.766979694366455, - "learning_rate": 8.9425646234676e-05, - "loss": 1.119, - "num_input_tokens_seen": 24330296, - "step": 1511 - }, - { - "epoch": 0.10591294754261998, - "grad_norm": 3.9610238075256348, - "learning_rate": 8.94186479859895e-05, - "loss": 1.279, - "num_input_tokens_seen": 24346680, - "step": 1512 - }, - { - "epoch": 0.10598299578834923, - "grad_norm": 4.262688636779785, - "learning_rate": 8.941164973730297e-05, - "loss": 1.3217, - "num_input_tokens_seen": 24362408, - "step": 1513 - }, - { - "epoch": 0.10605304403407848, - "grad_norm": 5.190121173858643, - "learning_rate": 8.940465148861647e-05, - "loss": 1.0615, - "num_input_tokens_seen": 24378248, - "step": 1514 - }, - { - "epoch": 0.10612309227980772, - "grad_norm": 4.5859479904174805, - "learning_rate": 8.939765323992996e-05, - "loss": 1.1377, - "num_input_tokens_seen": 24394632, - "step": 1515 - }, - { - "epoch": 0.10619314052553697, - "grad_norm": 4.021294593811035, - "learning_rate": 8.939065499124344e-05, - "loss": 0.9913, - "num_input_tokens_seen": 24411016, - "step": 1516 - }, - { - "epoch": 0.10626318877126621, - "grad_norm": 4.296265602111816, - "learning_rate": 8.938365674255692e-05, - "loss": 1.1753, - "num_input_tokens_seen": 24426792, - "step": 1517 - }, - { - "epoch": 0.10633323701699546, - "grad_norm": 3.4397289752960205, - "learning_rate": 8.93766584938704e-05, - "loss": 0.8159, - "num_input_tokens_seen": 24443176, - "step": 1518 - }, - { - "epoch": 0.1064032852627247, - "grad_norm": 4.009952545166016, - "learning_rate": 8.936966024518389e-05, - "loss": 1.0728, - "num_input_tokens_seen": 24459384, - "step": 1519 - }, - { - "epoch": 0.10647333350845395, - "grad_norm": 4.786280632019043, - "learning_rate": 8.936266199649738e-05, - "loss": 1.1303, - "num_input_tokens_seen": 24474904, - "step": 1520 - }, - { - "epoch": 0.1065433817541832, - "grad_norm": 3.869297981262207, - "learning_rate": 8.935566374781087e-05, - "loss": 1.0829, - "num_input_tokens_seen": 24490456, - "step": 1521 - }, - { - "epoch": 0.10661342999991244, - "grad_norm": 3.995553731918335, - "learning_rate": 8.934866549912435e-05, - "loss": 1.0813, - "num_input_tokens_seen": 24506840, - "step": 1522 - }, - { - "epoch": 0.10668347824564169, - "grad_norm": 4.195018291473389, - "learning_rate": 8.934166725043783e-05, - "loss": 1.0585, - "num_input_tokens_seen": 24522440, - "step": 1523 - }, - { - "epoch": 0.10675352649137093, - "grad_norm": 4.0432515144348145, - "learning_rate": 8.933466900175132e-05, - "loss": 1.0757, - "num_input_tokens_seen": 24538824, - "step": 1524 - }, - { - "epoch": 0.10682357473710018, - "grad_norm": 5.120638847351074, - "learning_rate": 8.93276707530648e-05, - "loss": 1.1328, - "num_input_tokens_seen": 24555208, - "step": 1525 - }, - { - "epoch": 0.10689362298282942, - "grad_norm": 3.925096035003662, - "learning_rate": 8.932067250437828e-05, - "loss": 1.1569, - "num_input_tokens_seen": 24571544, - "step": 1526 - }, - { - "epoch": 0.10696367122855867, - "grad_norm": 3.930328130722046, - "learning_rate": 8.931367425569177e-05, - "loss": 0.9385, - "num_input_tokens_seen": 24587736, - "step": 1527 - }, - { - "epoch": 0.10703371947428791, - "grad_norm": 3.7056055068969727, - "learning_rate": 8.930667600700526e-05, - "loss": 0.8675, - "num_input_tokens_seen": 24604120, - "step": 1528 - }, - { - "epoch": 0.10710376772001716, - "grad_norm": 5.945568561553955, - "learning_rate": 8.929967775831875e-05, - "loss": 1.0395, - "num_input_tokens_seen": 24620504, - "step": 1529 - }, - { - "epoch": 0.1071738159657464, - "grad_norm": 3.7765939235687256, - "learning_rate": 8.929267950963222e-05, - "loss": 0.8796, - "num_input_tokens_seen": 24635440, - "step": 1530 - }, - { - "epoch": 0.10724386421147565, - "grad_norm": 4.229284286499023, - "learning_rate": 8.928568126094571e-05, - "loss": 1.0941, - "num_input_tokens_seen": 24651824, - "step": 1531 - }, - { - "epoch": 0.1073139124572049, - "grad_norm": 4.198834419250488, - "learning_rate": 8.92786830122592e-05, - "loss": 1.118, - "num_input_tokens_seen": 24668208, - "step": 1532 - }, - { - "epoch": 0.10738396070293414, - "grad_norm": 8.091620445251465, - "learning_rate": 8.927168476357267e-05, - "loss": 1.1515, - "num_input_tokens_seen": 24684248, - "step": 1533 - }, - { - "epoch": 0.10745400894866339, - "grad_norm": 4.091879844665527, - "learning_rate": 8.926468651488618e-05, - "loss": 1.1283, - "num_input_tokens_seen": 24700632, - "step": 1534 - }, - { - "epoch": 0.10752405719439263, - "grad_norm": 3.90326189994812, - "learning_rate": 8.925768826619966e-05, - "loss": 1.047, - "num_input_tokens_seen": 24717016, - "step": 1535 - }, - { - "epoch": 0.10759410544012188, - "grad_norm": 4.097111225128174, - "learning_rate": 8.925069001751314e-05, - "loss": 1.1623, - "num_input_tokens_seen": 24732776, - "step": 1536 - }, - { - "epoch": 0.10766415368585112, - "grad_norm": 3.5537095069885254, - "learning_rate": 8.924369176882663e-05, - "loss": 0.989, - "num_input_tokens_seen": 24749064, - "step": 1537 - }, - { - "epoch": 0.10773420193158037, - "grad_norm": 4.3086256980896, - "learning_rate": 8.92366935201401e-05, - "loss": 1.0864, - "num_input_tokens_seen": 24765448, - "step": 1538 - }, - { - "epoch": 0.10780425017730962, - "grad_norm": 4.177425861358643, - "learning_rate": 8.922969527145359e-05, - "loss": 1.0652, - "num_input_tokens_seen": 24780816, - "step": 1539 - }, - { - "epoch": 0.10787429842303886, - "grad_norm": 3.6013338565826416, - "learning_rate": 8.922269702276708e-05, - "loss": 1.1045, - "num_input_tokens_seen": 24796600, - "step": 1540 - }, - { - "epoch": 0.10794434666876812, - "grad_norm": 4.05686092376709, - "learning_rate": 8.921569877408057e-05, - "loss": 1.1408, - "num_input_tokens_seen": 24812984, - "step": 1541 - }, - { - "epoch": 0.10801439491449737, - "grad_norm": 4.245424747467041, - "learning_rate": 8.920870052539406e-05, - "loss": 1.2634, - "num_input_tokens_seen": 24829368, - "step": 1542 - }, - { - "epoch": 0.10808444316022661, - "grad_norm": 3.9563350677490234, - "learning_rate": 8.920170227670753e-05, - "loss": 1.1015, - "num_input_tokens_seen": 24845752, - "step": 1543 - }, - { - "epoch": 0.10815449140595586, - "grad_norm": 4.209373474121094, - "learning_rate": 8.919470402802102e-05, - "loss": 1.2394, - "num_input_tokens_seen": 24862136, - "step": 1544 - }, - { - "epoch": 0.1082245396516851, - "grad_norm": 3.6590163707733154, - "learning_rate": 8.91877057793345e-05, - "loss": 1.0168, - "num_input_tokens_seen": 24878520, - "step": 1545 - }, - { - "epoch": 0.10829458789741435, - "grad_norm": 3.937568187713623, - "learning_rate": 8.918070753064799e-05, - "loss": 1.0999, - "num_input_tokens_seen": 24894696, - "step": 1546 - }, - { - "epoch": 0.1083646361431436, - "grad_norm": 3.948453426361084, - "learning_rate": 8.917370928196147e-05, - "loss": 1.0565, - "num_input_tokens_seen": 24910208, - "step": 1547 - }, - { - "epoch": 0.10843468438887284, - "grad_norm": 3.61549711227417, - "learning_rate": 8.916671103327496e-05, - "loss": 1.0294, - "num_input_tokens_seen": 24926592, - "step": 1548 - }, - { - "epoch": 0.10850473263460209, - "grad_norm": 4.091664791107178, - "learning_rate": 8.915971278458845e-05, - "loss": 1.0596, - "num_input_tokens_seen": 24942976, - "step": 1549 - }, - { - "epoch": 0.10857478088033133, - "grad_norm": 5.494830131530762, - "learning_rate": 8.915271453590193e-05, - "loss": 1.1564, - "num_input_tokens_seen": 24957984, - "step": 1550 - }, - { - "epoch": 0.10864482912606058, - "grad_norm": 4.546476364135742, - "learning_rate": 8.914571628721541e-05, - "loss": 1.0753, - "num_input_tokens_seen": 24974368, - "step": 1551 - }, - { - "epoch": 0.10871487737178982, - "grad_norm": 3.775996446609497, - "learning_rate": 8.91387180385289e-05, - "loss": 1.11, - "num_input_tokens_seen": 24990200, - "step": 1552 - }, - { - "epoch": 0.10878492561751907, - "grad_norm": 3.9989728927612305, - "learning_rate": 8.913171978984238e-05, - "loss": 1.0121, - "num_input_tokens_seen": 25006584, - "step": 1553 - }, - { - "epoch": 0.10885497386324831, - "grad_norm": 4.417224884033203, - "learning_rate": 8.912472154115588e-05, - "loss": 1.1891, - "num_input_tokens_seen": 25022464, - "step": 1554 - }, - { - "epoch": 0.10892502210897756, - "grad_norm": 4.604903697967529, - "learning_rate": 8.911772329246936e-05, - "loss": 0.9414, - "num_input_tokens_seen": 25038848, - "step": 1555 - }, - { - "epoch": 0.1089950703547068, - "grad_norm": 4.823176860809326, - "learning_rate": 8.911072504378284e-05, - "loss": 1.1259, - "num_input_tokens_seen": 25053776, - "step": 1556 - }, - { - "epoch": 0.10906511860043605, - "grad_norm": 3.6778531074523926, - "learning_rate": 8.910372679509632e-05, - "loss": 0.9995, - "num_input_tokens_seen": 25069872, - "step": 1557 - }, - { - "epoch": 0.1091351668461653, - "grad_norm": 4.344213485717773, - "learning_rate": 8.909672854640981e-05, - "loss": 1.1984, - "num_input_tokens_seen": 25086256, - "step": 1558 - }, - { - "epoch": 0.10920521509189454, - "grad_norm": 4.592464923858643, - "learning_rate": 8.90897302977233e-05, - "loss": 1.502, - "num_input_tokens_seen": 25102640, - "step": 1559 - }, - { - "epoch": 0.10927526333762379, - "grad_norm": 4.103248119354248, - "learning_rate": 8.908273204903678e-05, - "loss": 0.9454, - "num_input_tokens_seen": 25118328, - "step": 1560 - }, - { - "epoch": 0.10934531158335303, - "grad_norm": 4.637456893920898, - "learning_rate": 8.907573380035027e-05, - "loss": 1.3611, - "num_input_tokens_seen": 25134712, - "step": 1561 - }, - { - "epoch": 0.10941535982908228, - "grad_norm": 4.4709930419921875, - "learning_rate": 8.906873555166376e-05, - "loss": 1.1147, - "num_input_tokens_seen": 25149304, - "step": 1562 - }, - { - "epoch": 0.10948540807481152, - "grad_norm": 4.154660701751709, - "learning_rate": 8.906173730297724e-05, - "loss": 1.2855, - "num_input_tokens_seen": 25165360, - "step": 1563 - }, - { - "epoch": 0.10955545632054077, - "grad_norm": 4.1212334632873535, - "learning_rate": 8.905473905429073e-05, - "loss": 1.2015, - "num_input_tokens_seen": 25181744, - "step": 1564 - }, - { - "epoch": 0.10962550456627002, - "grad_norm": 3.8060882091522217, - "learning_rate": 8.90477408056042e-05, - "loss": 1.0333, - "num_input_tokens_seen": 25197800, - "step": 1565 - }, - { - "epoch": 0.10969555281199926, - "grad_norm": 3.4948956966400146, - "learning_rate": 8.904074255691769e-05, - "loss": 0.941, - "num_input_tokens_seen": 25214008, - "step": 1566 - }, - { - "epoch": 0.1097656010577285, - "grad_norm": 4.181606292724609, - "learning_rate": 8.903374430823118e-05, - "loss": 1.1185, - "num_input_tokens_seen": 25229496, - "step": 1567 - }, - { - "epoch": 0.10983564930345775, - "grad_norm": 4.206098556518555, - "learning_rate": 8.902674605954467e-05, - "loss": 1.0363, - "num_input_tokens_seen": 25244864, - "step": 1568 - }, - { - "epoch": 0.109905697549187, - "grad_norm": 3.797475576400757, - "learning_rate": 8.901974781085815e-05, - "loss": 1.0443, - "num_input_tokens_seen": 25261248, - "step": 1569 - }, - { - "epoch": 0.10997574579491624, - "grad_norm": 4.131814479827881, - "learning_rate": 8.901274956217163e-05, - "loss": 0.9977, - "num_input_tokens_seen": 25277632, - "step": 1570 - }, - { - "epoch": 0.11004579404064549, - "grad_norm": 3.9447309970855713, - "learning_rate": 8.900575131348512e-05, - "loss": 1.0839, - "num_input_tokens_seen": 25294016, - "step": 1571 - }, - { - "epoch": 0.11011584228637473, - "grad_norm": 3.916949510574341, - "learning_rate": 8.89987530647986e-05, - "loss": 1.1793, - "num_input_tokens_seen": 25309912, - "step": 1572 - }, - { - "epoch": 0.11018589053210398, - "grad_norm": 3.7132885456085205, - "learning_rate": 8.899175481611208e-05, - "loss": 1.081, - "num_input_tokens_seen": 25326296, - "step": 1573 - }, - { - "epoch": 0.11025593877783323, - "grad_norm": 4.5842390060424805, - "learning_rate": 8.898475656742558e-05, - "loss": 0.926, - "num_input_tokens_seen": 25342328, - "step": 1574 - }, - { - "epoch": 0.11032598702356247, - "grad_norm": 3.578962802886963, - "learning_rate": 8.897775831873906e-05, - "loss": 1.0599, - "num_input_tokens_seen": 25357640, - "step": 1575 - }, - { - "epoch": 0.11039603526929173, - "grad_norm": 3.5823471546173096, - "learning_rate": 8.897076007005255e-05, - "loss": 0.9519, - "num_input_tokens_seen": 25373424, - "step": 1576 - }, - { - "epoch": 0.11046608351502098, - "grad_norm": 3.721482515335083, - "learning_rate": 8.896376182136602e-05, - "loss": 0.976, - "num_input_tokens_seen": 25389808, - "step": 1577 - }, - { - "epoch": 0.11053613176075022, - "grad_norm": 4.874295711517334, - "learning_rate": 8.895676357267951e-05, - "loss": 1.3507, - "num_input_tokens_seen": 25406192, - "step": 1578 - }, - { - "epoch": 0.11060618000647947, - "grad_norm": 3.8547258377075195, - "learning_rate": 8.8949765323993e-05, - "loss": 0.9444, - "num_input_tokens_seen": 25421632, - "step": 1579 - }, - { - "epoch": 0.11067622825220871, - "grad_norm": 4.847586631774902, - "learning_rate": 8.894276707530649e-05, - "loss": 1.0526, - "num_input_tokens_seen": 25438016, - "step": 1580 - }, - { - "epoch": 0.11074627649793796, - "grad_norm": 3.950594425201416, - "learning_rate": 8.893576882661998e-05, - "loss": 1.0688, - "num_input_tokens_seen": 25454400, - "step": 1581 - }, - { - "epoch": 0.1108163247436672, - "grad_norm": 3.7372758388519287, - "learning_rate": 8.892877057793345e-05, - "loss": 1.2211, - "num_input_tokens_seen": 25470304, - "step": 1582 - }, - { - "epoch": 0.11088637298939645, - "grad_norm": 3.8695788383483887, - "learning_rate": 8.892177232924694e-05, - "loss": 1.1006, - "num_input_tokens_seen": 25486688, - "step": 1583 - }, - { - "epoch": 0.1109564212351257, - "grad_norm": 4.623810768127441, - "learning_rate": 8.891477408056042e-05, - "loss": 1.034, - "num_input_tokens_seen": 25503072, - "step": 1584 - }, - { - "epoch": 0.11102646948085494, - "grad_norm": 4.03538179397583, - "learning_rate": 8.89077758318739e-05, - "loss": 1.0915, - "num_input_tokens_seen": 25519008, - "step": 1585 - }, - { - "epoch": 0.11109651772658419, - "grad_norm": 7.486603736877441, - "learning_rate": 8.890077758318739e-05, - "loss": 1.0137, - "num_input_tokens_seen": 25533808, - "step": 1586 - }, - { - "epoch": 0.11116656597231343, - "grad_norm": 4.660414218902588, - "learning_rate": 8.889377933450088e-05, - "loss": 1.0172, - "num_input_tokens_seen": 25549784, - "step": 1587 - }, - { - "epoch": 0.11123661421804268, - "grad_norm": 3.9375548362731934, - "learning_rate": 8.888678108581437e-05, - "loss": 0.9843, - "num_input_tokens_seen": 25566168, - "step": 1588 - }, - { - "epoch": 0.11130666246377192, - "grad_norm": 4.275035858154297, - "learning_rate": 8.887978283712786e-05, - "loss": 1.1802, - "num_input_tokens_seen": 25582552, - "step": 1589 - }, - { - "epoch": 0.11137671070950117, - "grad_norm": 4.799124240875244, - "learning_rate": 8.887278458844133e-05, - "loss": 1.2702, - "num_input_tokens_seen": 25598936, - "step": 1590 - }, - { - "epoch": 0.11144675895523042, - "grad_norm": 4.143614768981934, - "learning_rate": 8.886578633975482e-05, - "loss": 1.1797, - "num_input_tokens_seen": 25615320, - "step": 1591 - }, - { - "epoch": 0.11151680720095966, - "grad_norm": 4.490556716918945, - "learning_rate": 8.88587880910683e-05, - "loss": 1.1351, - "num_input_tokens_seen": 25630624, - "step": 1592 - }, - { - "epoch": 0.1115868554466889, - "grad_norm": 6.010688781738281, - "learning_rate": 8.885178984238179e-05, - "loss": 1.059, - "num_input_tokens_seen": 25646048, - "step": 1593 - }, - { - "epoch": 0.11165690369241815, - "grad_norm": 3.7447726726531982, - "learning_rate": 8.884479159369527e-05, - "loss": 0.9902, - "num_input_tokens_seen": 25661528, - "step": 1594 - }, - { - "epoch": 0.1117269519381474, - "grad_norm": 4.77920389175415, - "learning_rate": 8.883779334500876e-05, - "loss": 1.1158, - "num_input_tokens_seen": 25677912, - "step": 1595 - }, - { - "epoch": 0.11179700018387664, - "grad_norm": 3.9812231063842773, - "learning_rate": 8.883079509632225e-05, - "loss": 1.096, - "num_input_tokens_seen": 25694296, - "step": 1596 - }, - { - "epoch": 0.11186704842960589, - "grad_norm": 3.7404634952545166, - "learning_rate": 8.882379684763573e-05, - "loss": 0.9965, - "num_input_tokens_seen": 25710448, - "step": 1597 - }, - { - "epoch": 0.11193709667533513, - "grad_norm": 4.466211318969727, - "learning_rate": 8.881679859894922e-05, - "loss": 1.1495, - "num_input_tokens_seen": 25726624, - "step": 1598 - }, - { - "epoch": 0.11200714492106438, - "grad_norm": 3.6850225925445557, - "learning_rate": 8.880980035026269e-05, - "loss": 0.9685, - "num_input_tokens_seen": 25742456, - "step": 1599 - }, - { - "epoch": 0.11207719316679363, - "grad_norm": 4.128363609313965, - "learning_rate": 8.880280210157619e-05, - "loss": 1.1052, - "num_input_tokens_seen": 25758840, - "step": 1600 - }, - { - "epoch": 0.11207719316679363, - "eval_loss": 1.1512293815612793, - "eval_runtime": 0.1988, - "eval_samples_per_second": 5.031, - "eval_steps_per_second": 5.031, - "num_input_tokens_seen": 25758840, - "step": 1600 - }, - { - "epoch": 0.11214724141252287, - "grad_norm": 4.852661609649658, - "learning_rate": 8.879580385288968e-05, - "loss": 1.0778, - "num_input_tokens_seen": 25774312, - "step": 1601 - }, - { - "epoch": 0.11221728965825212, - "grad_norm": 4.501857280731201, - "learning_rate": 8.878880560420316e-05, - "loss": 1.302, - "num_input_tokens_seen": 25790696, - "step": 1602 - }, - { - "epoch": 0.11228733790398136, - "grad_norm": 4.142490863800049, - "learning_rate": 8.878180735551665e-05, - "loss": 1.0375, - "num_input_tokens_seen": 25807080, - "step": 1603 - }, - { - "epoch": 0.11235738614971061, - "grad_norm": 3.606905698776245, - "learning_rate": 8.877480910683012e-05, - "loss": 0.9254, - "num_input_tokens_seen": 25822552, - "step": 1604 - }, - { - "epoch": 0.11242743439543985, - "grad_norm": 3.837010145187378, - "learning_rate": 8.876781085814361e-05, - "loss": 1.1756, - "num_input_tokens_seen": 25838088, - "step": 1605 - }, - { - "epoch": 0.1124974826411691, - "grad_norm": 3.9082963466644287, - "learning_rate": 8.87608126094571e-05, - "loss": 1.0201, - "num_input_tokens_seen": 25854240, - "step": 1606 - }, - { - "epoch": 0.11256753088689835, - "grad_norm": 4.062923908233643, - "learning_rate": 8.875381436077059e-05, - "loss": 1.1034, - "num_input_tokens_seen": 25870624, - "step": 1607 - }, - { - "epoch": 0.11263757913262759, - "grad_norm": 4.331594944000244, - "learning_rate": 8.874681611208407e-05, - "loss": 1.2043, - "num_input_tokens_seen": 25886656, - "step": 1608 - }, - { - "epoch": 0.11270762737835684, - "grad_norm": 3.77466082572937, - "learning_rate": 8.873981786339755e-05, - "loss": 0.936, - "num_input_tokens_seen": 25902704, - "step": 1609 - }, - { - "epoch": 0.1127776756240861, - "grad_norm": 3.3747365474700928, - "learning_rate": 8.873281961471104e-05, - "loss": 0.9071, - "num_input_tokens_seen": 25919088, - "step": 1610 - }, - { - "epoch": 0.11284772386981534, - "grad_norm": 5.377493381500244, - "learning_rate": 8.872582136602451e-05, - "loss": 0.9246, - "num_input_tokens_seen": 25935472, - "step": 1611 - }, - { - "epoch": 0.11291777211554459, - "grad_norm": 5.506969451904297, - "learning_rate": 8.8718823117338e-05, - "loss": 0.9211, - "num_input_tokens_seen": 25951664, - "step": 1612 - }, - { - "epoch": 0.11298782036127383, - "grad_norm": 4.874104976654053, - "learning_rate": 8.871182486865149e-05, - "loss": 1.1654, - "num_input_tokens_seen": 25968048, - "step": 1613 - }, - { - "epoch": 0.11305786860700308, - "grad_norm": 4.666824817657471, - "learning_rate": 8.870482661996498e-05, - "loss": 1.2155, - "num_input_tokens_seen": 25983784, - "step": 1614 - }, - { - "epoch": 0.11312791685273232, - "grad_norm": 3.949862241744995, - "learning_rate": 8.869782837127847e-05, - "loss": 1.1243, - "num_input_tokens_seen": 26000168, - "step": 1615 - }, - { - "epoch": 0.11319796509846157, - "grad_norm": 3.866542339324951, - "learning_rate": 8.869083012259196e-05, - "loss": 1.1302, - "num_input_tokens_seen": 26015456, - "step": 1616 - }, - { - "epoch": 0.11326801334419082, - "grad_norm": 3.8679909706115723, - "learning_rate": 8.868383187390543e-05, - "loss": 1.0886, - "num_input_tokens_seen": 26031224, - "step": 1617 - }, - { - "epoch": 0.11333806158992006, - "grad_norm": 4.7508087158203125, - "learning_rate": 8.867683362521892e-05, - "loss": 1.2837, - "num_input_tokens_seen": 26046952, - "step": 1618 - }, - { - "epoch": 0.1134081098356493, - "grad_norm": 3.878549337387085, - "learning_rate": 8.86698353765324e-05, - "loss": 0.99, - "num_input_tokens_seen": 26063280, - "step": 1619 - }, - { - "epoch": 0.11347815808137855, - "grad_norm": 3.8016276359558105, - "learning_rate": 8.86628371278459e-05, - "loss": 1.1682, - "num_input_tokens_seen": 26079616, - "step": 1620 - }, - { - "epoch": 0.1135482063271078, - "grad_norm": 4.040102481842041, - "learning_rate": 8.865583887915937e-05, - "loss": 1.1008, - "num_input_tokens_seen": 26095232, - "step": 1621 - }, - { - "epoch": 0.11361825457283704, - "grad_norm": 3.932529926300049, - "learning_rate": 8.864884063047286e-05, - "loss": 1.1663, - "num_input_tokens_seen": 26111616, - "step": 1622 - }, - { - "epoch": 0.11368830281856629, - "grad_norm": 4.568112373352051, - "learning_rate": 8.864184238178635e-05, - "loss": 1.1932, - "num_input_tokens_seen": 26128000, - "step": 1623 - }, - { - "epoch": 0.11375835106429553, - "grad_norm": 4.23036527633667, - "learning_rate": 8.863484413309982e-05, - "loss": 1.0223, - "num_input_tokens_seen": 26144384, - "step": 1624 - }, - { - "epoch": 0.11382839931002478, - "grad_norm": 4.209012031555176, - "learning_rate": 8.862784588441331e-05, - "loss": 1.0992, - "num_input_tokens_seen": 26160768, - "step": 1625 - }, - { - "epoch": 0.11389844755575403, - "grad_norm": 3.865983724594116, - "learning_rate": 8.86208476357268e-05, - "loss": 1.1213, - "num_input_tokens_seen": 26177152, - "step": 1626 - }, - { - "epoch": 0.11396849580148327, - "grad_norm": 3.781083822250366, - "learning_rate": 8.861384938704029e-05, - "loss": 1.0132, - "num_input_tokens_seen": 26193536, - "step": 1627 - }, - { - "epoch": 0.11403854404721252, - "grad_norm": 4.330471038818359, - "learning_rate": 8.860685113835378e-05, - "loss": 0.9749, - "num_input_tokens_seen": 26208976, - "step": 1628 - }, - { - "epoch": 0.11410859229294176, - "grad_norm": 4.772238254547119, - "learning_rate": 8.859985288966725e-05, - "loss": 1.2796, - "num_input_tokens_seen": 26225360, - "step": 1629 - }, - { - "epoch": 0.11417864053867101, - "grad_norm": 4.0468668937683105, - "learning_rate": 8.859285464098074e-05, - "loss": 1.0056, - "num_input_tokens_seen": 26241744, - "step": 1630 - }, - { - "epoch": 0.11424868878440025, - "grad_norm": 3.9648735523223877, - "learning_rate": 8.858585639229422e-05, - "loss": 1.2185, - "num_input_tokens_seen": 26258128, - "step": 1631 - }, - { - "epoch": 0.1143187370301295, - "grad_norm": 4.7014079093933105, - "learning_rate": 8.85788581436077e-05, - "loss": 1.1795, - "num_input_tokens_seen": 26274512, - "step": 1632 - }, - { - "epoch": 0.11438878527585875, - "grad_norm": 4.6375627517700195, - "learning_rate": 8.85718598949212e-05, - "loss": 1.0074, - "num_input_tokens_seen": 26290008, - "step": 1633 - }, - { - "epoch": 0.11445883352158799, - "grad_norm": 4.427719593048096, - "learning_rate": 8.856486164623468e-05, - "loss": 1.2769, - "num_input_tokens_seen": 26305512, - "step": 1634 - }, - { - "epoch": 0.11452888176731724, - "grad_norm": 6.001821994781494, - "learning_rate": 8.855786339754817e-05, - "loss": 1.0606, - "num_input_tokens_seen": 26319504, - "step": 1635 - }, - { - "epoch": 0.11459893001304648, - "grad_norm": 3.970672369003296, - "learning_rate": 8.855086514886165e-05, - "loss": 1.1944, - "num_input_tokens_seen": 26335888, - "step": 1636 - }, - { - "epoch": 0.11466897825877573, - "grad_norm": 3.924450635910034, - "learning_rate": 8.854386690017514e-05, - "loss": 0.9607, - "num_input_tokens_seen": 26351536, - "step": 1637 - }, - { - "epoch": 0.11473902650450497, - "grad_norm": 4.400977611541748, - "learning_rate": 8.853686865148861e-05, - "loss": 1.0641, - "num_input_tokens_seen": 26367808, - "step": 1638 - }, - { - "epoch": 0.11480907475023422, - "grad_norm": 3.9734365940093994, - "learning_rate": 8.85298704028021e-05, - "loss": 1.2258, - "num_input_tokens_seen": 26383864, - "step": 1639 - }, - { - "epoch": 0.11487912299596346, - "grad_norm": 3.792949914932251, - "learning_rate": 8.85228721541156e-05, - "loss": 1.0401, - "num_input_tokens_seen": 26400248, - "step": 1640 - }, - { - "epoch": 0.11494917124169271, - "grad_norm": 5.14591121673584, - "learning_rate": 8.851587390542908e-05, - "loss": 1.0484, - "num_input_tokens_seen": 26416056, - "step": 1641 - }, - { - "epoch": 0.11501921948742196, - "grad_norm": 5.0158162117004395, - "learning_rate": 8.850887565674256e-05, - "loss": 1.2823, - "num_input_tokens_seen": 26431400, - "step": 1642 - }, - { - "epoch": 0.1150892677331512, - "grad_norm": 4.459201812744141, - "learning_rate": 8.850187740805605e-05, - "loss": 1.2371, - "num_input_tokens_seen": 26446920, - "step": 1643 - }, - { - "epoch": 0.11515931597888045, - "grad_norm": 3.717949867248535, - "learning_rate": 8.849487915936953e-05, - "loss": 1.1299, - "num_input_tokens_seen": 26463304, - "step": 1644 - }, - { - "epoch": 0.1152293642246097, - "grad_norm": 3.7555253505706787, - "learning_rate": 8.848788091068302e-05, - "loss": 1.0835, - "num_input_tokens_seen": 26479296, - "step": 1645 - }, - { - "epoch": 0.11529941247033895, - "grad_norm": 4.3726325035095215, - "learning_rate": 8.84808826619965e-05, - "loss": 0.9606, - "num_input_tokens_seen": 26495024, - "step": 1646 - }, - { - "epoch": 0.1153694607160682, - "grad_norm": 3.728700876235962, - "learning_rate": 8.847388441331e-05, - "loss": 1.0486, - "num_input_tokens_seen": 26511408, - "step": 1647 - }, - { - "epoch": 0.11543950896179744, - "grad_norm": 4.276855945587158, - "learning_rate": 8.846688616462347e-05, - "loss": 0.9869, - "num_input_tokens_seen": 26527688, - "step": 1648 - }, - { - "epoch": 0.11550955720752669, - "grad_norm": 5.386009693145752, - "learning_rate": 8.845988791593696e-05, - "loss": 1.0021, - "num_input_tokens_seen": 26544072, - "step": 1649 - }, - { - "epoch": 0.11557960545325593, - "grad_norm": 4.978610992431641, - "learning_rate": 8.845288966725045e-05, - "loss": 1.2531, - "num_input_tokens_seen": 26560456, - "step": 1650 - }, - { - "epoch": 0.11564965369898518, - "grad_norm": 5.325594425201416, - "learning_rate": 8.844589141856392e-05, - "loss": 0.9983, - "num_input_tokens_seen": 26576840, - "step": 1651 - }, - { - "epoch": 0.11571970194471443, - "grad_norm": 4.359868049621582, - "learning_rate": 8.843889316987741e-05, - "loss": 0.9652, - "num_input_tokens_seen": 26593224, - "step": 1652 - }, - { - "epoch": 0.11578975019044367, - "grad_norm": 7.921500205993652, - "learning_rate": 8.84318949211909e-05, - "loss": 1.0767, - "num_input_tokens_seen": 26607352, - "step": 1653 - }, - { - "epoch": 0.11585979843617292, - "grad_norm": 3.51788330078125, - "learning_rate": 8.842489667250439e-05, - "loss": 1.0677, - "num_input_tokens_seen": 26623696, - "step": 1654 - }, - { - "epoch": 0.11592984668190216, - "grad_norm": 4.120747089385986, - "learning_rate": 8.841789842381788e-05, - "loss": 1.2139, - "num_input_tokens_seen": 26639832, - "step": 1655 - }, - { - "epoch": 0.11599989492763141, - "grad_norm": 4.077361106872559, - "learning_rate": 8.841090017513135e-05, - "loss": 1.0639, - "num_input_tokens_seen": 26655432, - "step": 1656 - }, - { - "epoch": 0.11606994317336065, - "grad_norm": 3.9629955291748047, - "learning_rate": 8.840390192644484e-05, - "loss": 1.0846, - "num_input_tokens_seen": 26671816, - "step": 1657 - }, - { - "epoch": 0.1161399914190899, - "grad_norm": 3.933544635772705, - "learning_rate": 8.839690367775831e-05, - "loss": 1.1543, - "num_input_tokens_seen": 26688096, - "step": 1658 - }, - { - "epoch": 0.11621003966481915, - "grad_norm": 4.702983379364014, - "learning_rate": 8.83899054290718e-05, - "loss": 1.0699, - "num_input_tokens_seen": 26704480, - "step": 1659 - }, - { - "epoch": 0.11628008791054839, - "grad_norm": 4.536739826202393, - "learning_rate": 8.83829071803853e-05, - "loss": 1.149, - "num_input_tokens_seen": 26720864, - "step": 1660 - }, - { - "epoch": 0.11635013615627764, - "grad_norm": 4.419711589813232, - "learning_rate": 8.837590893169878e-05, - "loss": 1.1994, - "num_input_tokens_seen": 26737248, - "step": 1661 - }, - { - "epoch": 0.11642018440200688, - "grad_norm": 4.106175899505615, - "learning_rate": 8.836891068301227e-05, - "loss": 1.0682, - "num_input_tokens_seen": 26753632, - "step": 1662 - }, - { - "epoch": 0.11649023264773613, - "grad_norm": 3.469658374786377, - "learning_rate": 8.836191243432574e-05, - "loss": 1.0356, - "num_input_tokens_seen": 26769944, - "step": 1663 - }, - { - "epoch": 0.11656028089346537, - "grad_norm": 7.273227691650391, - "learning_rate": 8.835491418563923e-05, - "loss": 1.1699, - "num_input_tokens_seen": 26784520, - "step": 1664 - }, - { - "epoch": 0.11663032913919462, - "grad_norm": 3.611165761947632, - "learning_rate": 8.834791593695271e-05, - "loss": 0.8595, - "num_input_tokens_seen": 26800360, - "step": 1665 - }, - { - "epoch": 0.11670037738492386, - "grad_norm": 4.405304908752441, - "learning_rate": 8.834091768826621e-05, - "loss": 1.2055, - "num_input_tokens_seen": 26816744, - "step": 1666 - }, - { - "epoch": 0.11677042563065311, - "grad_norm": 3.897247791290283, - "learning_rate": 8.83339194395797e-05, - "loss": 0.9599, - "num_input_tokens_seen": 26832520, - "step": 1667 - }, - { - "epoch": 0.11684047387638236, - "grad_norm": 3.898019313812256, - "learning_rate": 8.832692119089317e-05, - "loss": 1.0838, - "num_input_tokens_seen": 26848080, - "step": 1668 - }, - { - "epoch": 0.1169105221221116, - "grad_norm": 4.6351542472839355, - "learning_rate": 8.831992294220666e-05, - "loss": 1.2776, - "num_input_tokens_seen": 26864464, - "step": 1669 - }, - { - "epoch": 0.11698057036784085, - "grad_norm": 4.020237922668457, - "learning_rate": 8.831292469352015e-05, - "loss": 0.9955, - "num_input_tokens_seen": 26880848, - "step": 1670 - }, - { - "epoch": 0.11705061861357009, - "grad_norm": 5.813192367553711, - "learning_rate": 8.830592644483363e-05, - "loss": 1.2867, - "num_input_tokens_seen": 26897232, - "step": 1671 - }, - { - "epoch": 0.11712066685929934, - "grad_norm": 4.058423042297363, - "learning_rate": 8.829892819614711e-05, - "loss": 1.0697, - "num_input_tokens_seen": 26912872, - "step": 1672 - }, - { - "epoch": 0.11719071510502858, - "grad_norm": 4.76987361907959, - "learning_rate": 8.82919299474606e-05, - "loss": 0.9226, - "num_input_tokens_seen": 26929256, - "step": 1673 - }, - { - "epoch": 0.11726076335075783, - "grad_norm": 3.8400967121124268, - "learning_rate": 8.828493169877409e-05, - "loss": 1.0089, - "num_input_tokens_seen": 26945624, - "step": 1674 - }, - { - "epoch": 0.11733081159648708, - "grad_norm": 4.49709415435791, - "learning_rate": 8.827793345008757e-05, - "loss": 1.0898, - "num_input_tokens_seen": 26961464, - "step": 1675 - }, - { - "epoch": 0.11740085984221632, - "grad_norm": 4.143093109130859, - "learning_rate": 8.827093520140105e-05, - "loss": 1.0493, - "num_input_tokens_seen": 26976720, - "step": 1676 - }, - { - "epoch": 0.11747090808794557, - "grad_norm": 4.138030529022217, - "learning_rate": 8.826393695271454e-05, - "loss": 1.1555, - "num_input_tokens_seen": 26993056, - "step": 1677 - }, - { - "epoch": 0.11754095633367481, - "grad_norm": 3.8191847801208496, - "learning_rate": 8.825693870402802e-05, - "loss": 1.0993, - "num_input_tokens_seen": 27009440, - "step": 1678 - }, - { - "epoch": 0.11761100457940406, - "grad_norm": 3.8392176628112793, - "learning_rate": 8.824994045534151e-05, - "loss": 1.1067, - "num_input_tokens_seen": 27024880, - "step": 1679 - }, - { - "epoch": 0.11768105282513332, - "grad_norm": 4.468568801879883, - "learning_rate": 8.8242942206655e-05, - "loss": 1.1424, - "num_input_tokens_seen": 27040672, - "step": 1680 - }, - { - "epoch": 0.11775110107086256, - "grad_norm": 3.6515510082244873, - "learning_rate": 8.823594395796848e-05, - "loss": 1.0659, - "num_input_tokens_seen": 27057056, - "step": 1681 - }, - { - "epoch": 0.11782114931659181, - "grad_norm": 4.479739189147949, - "learning_rate": 8.822894570928197e-05, - "loss": 1.0399, - "num_input_tokens_seen": 27073440, - "step": 1682 - }, - { - "epoch": 0.11789119756232105, - "grad_norm": 3.762479782104492, - "learning_rate": 8.822194746059545e-05, - "loss": 1.1041, - "num_input_tokens_seen": 27089824, - "step": 1683 - }, - { - "epoch": 0.1179612458080503, - "grad_norm": 4.694389343261719, - "learning_rate": 8.821494921190894e-05, - "loss": 1.2785, - "num_input_tokens_seen": 27106208, - "step": 1684 - }, - { - "epoch": 0.11803129405377955, - "grad_norm": 3.738931179046631, - "learning_rate": 8.820795096322241e-05, - "loss": 0.9039, - "num_input_tokens_seen": 27122352, - "step": 1685 - }, - { - "epoch": 0.11810134229950879, - "grad_norm": 4.065624237060547, - "learning_rate": 8.820095271453591e-05, - "loss": 1.0048, - "num_input_tokens_seen": 27138160, - "step": 1686 - }, - { - "epoch": 0.11817139054523804, - "grad_norm": 3.5373826026916504, - "learning_rate": 8.81939544658494e-05, - "loss": 0.8786, - "num_input_tokens_seen": 27154544, - "step": 1687 - }, - { - "epoch": 0.11824143879096728, - "grad_norm": 3.773066282272339, - "learning_rate": 8.818695621716288e-05, - "loss": 1.0043, - "num_input_tokens_seen": 27170928, - "step": 1688 - }, - { - "epoch": 0.11831148703669653, - "grad_norm": 3.3876242637634277, - "learning_rate": 8.817995796847637e-05, - "loss": 0.9909, - "num_input_tokens_seen": 27187312, - "step": 1689 - }, - { - "epoch": 0.11838153528242577, - "grad_norm": 4.526343822479248, - "learning_rate": 8.817295971978984e-05, - "loss": 1.0899, - "num_input_tokens_seen": 27202208, - "step": 1690 - }, - { - "epoch": 0.11845158352815502, - "grad_norm": 4.691114902496338, - "learning_rate": 8.816596147110333e-05, - "loss": 1.0823, - "num_input_tokens_seen": 27218592, - "step": 1691 - }, - { - "epoch": 0.11852163177388426, - "grad_norm": 3.90531849861145, - "learning_rate": 8.815896322241682e-05, - "loss": 1.1438, - "num_input_tokens_seen": 27234976, - "step": 1692 - }, - { - "epoch": 0.11859168001961351, - "grad_norm": 3.5546317100524902, - "learning_rate": 8.81519649737303e-05, - "loss": 1.0326, - "num_input_tokens_seen": 27251360, - "step": 1693 - }, - { - "epoch": 0.11866172826534276, - "grad_norm": 5.117360591888428, - "learning_rate": 8.81449667250438e-05, - "loss": 1.1921, - "num_input_tokens_seen": 27267744, - "step": 1694 - }, - { - "epoch": 0.118731776511072, - "grad_norm": 4.055267810821533, - "learning_rate": 8.813796847635727e-05, - "loss": 1.0607, - "num_input_tokens_seen": 27283688, - "step": 1695 - }, - { - "epoch": 0.11880182475680125, - "grad_norm": 4.04268741607666, - "learning_rate": 8.813097022767076e-05, - "loss": 1.1862, - "num_input_tokens_seen": 27300072, - "step": 1696 - }, - { - "epoch": 0.11887187300253049, - "grad_norm": 4.048800945281982, - "learning_rate": 8.812397197898425e-05, - "loss": 0.9231, - "num_input_tokens_seen": 27316456, - "step": 1697 - }, - { - "epoch": 0.11894192124825974, - "grad_norm": 4.445494174957275, - "learning_rate": 8.811697373029772e-05, - "loss": 1.241, - "num_input_tokens_seen": 27332464, - "step": 1698 - }, - { - "epoch": 0.11901196949398898, - "grad_norm": 4.522054672241211, - "learning_rate": 8.810997548161121e-05, - "loss": 1.3945, - "num_input_tokens_seen": 27348848, - "step": 1699 - }, - { - "epoch": 0.11908201773971823, - "grad_norm": 4.106349468231201, - "learning_rate": 8.81029772329247e-05, - "loss": 1.1457, - "num_input_tokens_seen": 27365232, - "step": 1700 - }, - { - "epoch": 0.11915206598544748, - "grad_norm": 6.059356689453125, - "learning_rate": 8.809597898423819e-05, - "loss": 1.3381, - "num_input_tokens_seen": 27380448, - "step": 1701 - }, - { - "epoch": 0.11922211423117672, - "grad_norm": 3.8089959621429443, - "learning_rate": 8.808898073555166e-05, - "loss": 1.0699, - "num_input_tokens_seen": 27396832, - "step": 1702 - }, - { - "epoch": 0.11929216247690597, - "grad_norm": 4.21024227142334, - "learning_rate": 8.808198248686515e-05, - "loss": 1.306, - "num_input_tokens_seen": 27413096, - "step": 1703 - }, - { - "epoch": 0.11936221072263521, - "grad_norm": 4.286004066467285, - "learning_rate": 8.807498423817864e-05, - "loss": 1.2325, - "num_input_tokens_seen": 27429480, - "step": 1704 - }, - { - "epoch": 0.11943225896836446, - "grad_norm": 3.512561559677124, - "learning_rate": 8.806798598949212e-05, - "loss": 0.8804, - "num_input_tokens_seen": 27445864, - "step": 1705 - }, - { - "epoch": 0.1195023072140937, - "grad_norm": 4.096526145935059, - "learning_rate": 8.806098774080562e-05, - "loss": 1.0591, - "num_input_tokens_seen": 27462248, - "step": 1706 - }, - { - "epoch": 0.11957235545982295, - "grad_norm": 5.032350063323975, - "learning_rate": 8.805398949211909e-05, - "loss": 0.8948, - "num_input_tokens_seen": 27478312, - "step": 1707 - }, - { - "epoch": 0.1196424037055522, - "grad_norm": 4.756420612335205, - "learning_rate": 8.804699124343258e-05, - "loss": 1.0584, - "num_input_tokens_seen": 27494696, - "step": 1708 - }, - { - "epoch": 0.11971245195128144, - "grad_norm": 4.869518756866455, - "learning_rate": 8.803999299474607e-05, - "loss": 0.9394, - "num_input_tokens_seen": 27511080, - "step": 1709 - }, - { - "epoch": 0.11978250019701069, - "grad_norm": 3.451759099960327, - "learning_rate": 8.803299474605954e-05, - "loss": 0.9171, - "num_input_tokens_seen": 27527328, - "step": 1710 - }, - { - "epoch": 0.11985254844273993, - "grad_norm": 4.247021675109863, - "learning_rate": 8.802599649737303e-05, - "loss": 1.1204, - "num_input_tokens_seen": 27543712, - "step": 1711 - }, - { - "epoch": 0.11992259668846918, - "grad_norm": 4.597024917602539, - "learning_rate": 8.801899824868652e-05, - "loss": 1.196, - "num_input_tokens_seen": 27560096, - "step": 1712 - }, - { - "epoch": 0.11999264493419842, - "grad_norm": 4.242952823638916, - "learning_rate": 8.801200000000001e-05, - "loss": 1.1747, - "num_input_tokens_seen": 27576320, - "step": 1713 - }, - { - "epoch": 0.12006269317992768, - "grad_norm": 5.1166486740112305, - "learning_rate": 8.80050017513135e-05, - "loss": 1.4222, - "num_input_tokens_seen": 27591024, - "step": 1714 - }, - { - "epoch": 0.12013274142565693, - "grad_norm": 4.6713714599609375, - "learning_rate": 8.799800350262697e-05, - "loss": 1.1869, - "num_input_tokens_seen": 27606352, - "step": 1715 - }, - { - "epoch": 0.12020278967138617, - "grad_norm": 4.62678861618042, - "learning_rate": 8.799100525394046e-05, - "loss": 1.1524, - "num_input_tokens_seen": 27622736, - "step": 1716 - }, - { - "epoch": 0.12027283791711542, - "grad_norm": 3.611985206604004, - "learning_rate": 8.798400700525394e-05, - "loss": 1.1179, - "num_input_tokens_seen": 27639120, - "step": 1717 - }, - { - "epoch": 0.12034288616284466, - "grad_norm": 4.165099143981934, - "learning_rate": 8.797700875656743e-05, - "loss": 1.0104, - "num_input_tokens_seen": 27654024, - "step": 1718 - }, - { - "epoch": 0.12041293440857391, - "grad_norm": 4.532061576843262, - "learning_rate": 8.797001050788091e-05, - "loss": 1.05, - "num_input_tokens_seen": 27670408, - "step": 1719 - }, - { - "epoch": 0.12048298265430316, - "grad_norm": 4.880197048187256, - "learning_rate": 8.79630122591944e-05, - "loss": 1.0321, - "num_input_tokens_seen": 27686792, - "step": 1720 - }, - { - "epoch": 0.1205530309000324, - "grad_norm": 3.521052360534668, - "learning_rate": 8.795601401050789e-05, - "loss": 0.9048, - "num_input_tokens_seen": 27703176, - "step": 1721 - }, - { - "epoch": 0.12062307914576165, - "grad_norm": 3.965725898742676, - "learning_rate": 8.794901576182137e-05, - "loss": 1.1348, - "num_input_tokens_seen": 27719024, - "step": 1722 - }, - { - "epoch": 0.12069312739149089, - "grad_norm": 3.936962842941284, - "learning_rate": 8.794201751313486e-05, - "loss": 1.1531, - "num_input_tokens_seen": 27734736, - "step": 1723 - }, - { - "epoch": 0.12076317563722014, - "grad_norm": 5.225526332855225, - "learning_rate": 8.793501926444834e-05, - "loss": 1.2784, - "num_input_tokens_seen": 27751120, - "step": 1724 - }, - { - "epoch": 0.12083322388294938, - "grad_norm": 4.125289440155029, - "learning_rate": 8.792802101576182e-05, - "loss": 1.1893, - "num_input_tokens_seen": 27767288, - "step": 1725 - }, - { - "epoch": 0.12090327212867863, - "grad_norm": 3.9352405071258545, - "learning_rate": 8.792102276707532e-05, - "loss": 1.1867, - "num_input_tokens_seen": 27783672, - "step": 1726 - }, - { - "epoch": 0.12097332037440787, - "grad_norm": 3.908578634262085, - "learning_rate": 8.79140245183888e-05, - "loss": 1.0024, - "num_input_tokens_seen": 27799640, - "step": 1727 - }, - { - "epoch": 0.12104336862013712, - "grad_norm": 3.694387435913086, - "learning_rate": 8.790702626970229e-05, - "loss": 1.0652, - "num_input_tokens_seen": 27816024, - "step": 1728 - }, - { - "epoch": 0.12111341686586637, - "grad_norm": 4.0100016593933105, - "learning_rate": 8.790002802101576e-05, - "loss": 1.0511, - "num_input_tokens_seen": 27832408, - "step": 1729 - }, - { - "epoch": 0.12118346511159561, - "grad_norm": 5.454882621765137, - "learning_rate": 8.789302977232925e-05, - "loss": 1.1096, - "num_input_tokens_seen": 27848792, - "step": 1730 - }, - { - "epoch": 0.12125351335732486, - "grad_norm": 5.065526485443115, - "learning_rate": 8.788603152364274e-05, - "loss": 1.0354, - "num_input_tokens_seen": 27864688, - "step": 1731 - }, - { - "epoch": 0.1213235616030541, - "grad_norm": 3.73103666305542, - "learning_rate": 8.787903327495623e-05, - "loss": 1.0328, - "num_input_tokens_seen": 27881072, - "step": 1732 - }, - { - "epoch": 0.12139360984878335, - "grad_norm": 3.971198081970215, - "learning_rate": 8.787203502626971e-05, - "loss": 1.1908, - "num_input_tokens_seen": 27896912, - "step": 1733 - }, - { - "epoch": 0.1214636580945126, - "grad_norm": 3.933809518814087, - "learning_rate": 8.786503677758319e-05, - "loss": 1.1125, - "num_input_tokens_seen": 27913104, - "step": 1734 - }, - { - "epoch": 0.12153370634024184, - "grad_norm": 3.92167329788208, - "learning_rate": 8.785803852889668e-05, - "loss": 1.0007, - "num_input_tokens_seen": 27929488, - "step": 1735 - }, - { - "epoch": 0.12160375458597109, - "grad_norm": 4.441089630126953, - "learning_rate": 8.785104028021017e-05, - "loss": 0.9748, - "num_input_tokens_seen": 27945504, - "step": 1736 - }, - { - "epoch": 0.12167380283170033, - "grad_norm": 4.023623466491699, - "learning_rate": 8.784404203152364e-05, - "loss": 0.8826, - "num_input_tokens_seen": 27961888, - "step": 1737 - }, - { - "epoch": 0.12174385107742958, - "grad_norm": 4.0328826904296875, - "learning_rate": 8.783704378283713e-05, - "loss": 1.2769, - "num_input_tokens_seen": 27978024, - "step": 1738 - }, - { - "epoch": 0.12181389932315882, - "grad_norm": 4.5445733070373535, - "learning_rate": 8.783004553415062e-05, - "loss": 1.3745, - "num_input_tokens_seen": 27993840, - "step": 1739 - }, - { - "epoch": 0.12188394756888807, - "grad_norm": 3.609834671020508, - "learning_rate": 8.782304728546411e-05, - "loss": 0.916, - "num_input_tokens_seen": 28010224, - "step": 1740 - }, - { - "epoch": 0.12195399581461731, - "grad_norm": 3.849306344985962, - "learning_rate": 8.78160490367776e-05, - "loss": 1.1135, - "num_input_tokens_seen": 28026232, - "step": 1741 - }, - { - "epoch": 0.12202404406034656, - "grad_norm": 4.11102294921875, - "learning_rate": 8.780905078809107e-05, - "loss": 1.2269, - "num_input_tokens_seen": 28041880, - "step": 1742 - }, - { - "epoch": 0.1220940923060758, - "grad_norm": 4.156986713409424, - "learning_rate": 8.780205253940456e-05, - "loss": 1.0321, - "num_input_tokens_seen": 28058264, - "step": 1743 - }, - { - "epoch": 0.12216414055180505, - "grad_norm": 3.9670159816741943, - "learning_rate": 8.779505429071803e-05, - "loss": 0.9752, - "num_input_tokens_seen": 28073168, - "step": 1744 - }, - { - "epoch": 0.1222341887975343, - "grad_norm": 5.342650890350342, - "learning_rate": 8.778805604203152e-05, - "loss": 1.1416, - "num_input_tokens_seen": 28089552, - "step": 1745 - }, - { - "epoch": 0.12230423704326354, - "grad_norm": 4.031285285949707, - "learning_rate": 8.778105779334501e-05, - "loss": 1.1134, - "num_input_tokens_seen": 28105264, - "step": 1746 - }, - { - "epoch": 0.12237428528899279, - "grad_norm": 3.5976450443267822, - "learning_rate": 8.77740595446585e-05, - "loss": 1.0342, - "num_input_tokens_seen": 28121648, - "step": 1747 - }, - { - "epoch": 0.12244433353472203, - "grad_norm": 4.947859764099121, - "learning_rate": 8.776706129597199e-05, - "loss": 1.0809, - "num_input_tokens_seen": 28137640, - "step": 1748 - }, - { - "epoch": 0.12251438178045129, - "grad_norm": 4.004949569702148, - "learning_rate": 8.776006304728546e-05, - "loss": 1.0921, - "num_input_tokens_seen": 28154024, - "step": 1749 - }, - { - "epoch": 0.12258443002618054, - "grad_norm": 3.9022445678710938, - "learning_rate": 8.775306479859895e-05, - "loss": 1.0844, - "num_input_tokens_seen": 28170408, - "step": 1750 - }, - { - "epoch": 0.12265447827190978, - "grad_norm": 4.171925067901611, - "learning_rate": 8.774606654991244e-05, - "loss": 1.1894, - "num_input_tokens_seen": 28186792, - "step": 1751 - }, - { - "epoch": 0.12272452651763903, - "grad_norm": 3.9387433528900146, - "learning_rate": 8.773906830122592e-05, - "loss": 1.0303, - "num_input_tokens_seen": 28203176, - "step": 1752 - }, - { - "epoch": 0.12279457476336827, - "grad_norm": 5.067278861999512, - "learning_rate": 8.773207005253942e-05, - "loss": 1.1924, - "num_input_tokens_seen": 28219192, - "step": 1753 - }, - { - "epoch": 0.12286462300909752, - "grad_norm": 3.673807144165039, - "learning_rate": 8.77250718038529e-05, - "loss": 1.0438, - "num_input_tokens_seen": 28235576, - "step": 1754 - }, - { - "epoch": 0.12293467125482677, - "grad_norm": 5.303588390350342, - "learning_rate": 8.771807355516638e-05, - "loss": 1.2601, - "num_input_tokens_seen": 28251960, - "step": 1755 - }, - { - "epoch": 0.12300471950055601, - "grad_norm": 5.343825340270996, - "learning_rate": 8.771107530647986e-05, - "loss": 1.1126, - "num_input_tokens_seen": 28268344, - "step": 1756 - }, - { - "epoch": 0.12307476774628526, - "grad_norm": 4.125874996185303, - "learning_rate": 8.770407705779335e-05, - "loss": 1.1497, - "num_input_tokens_seen": 28284144, - "step": 1757 - }, - { - "epoch": 0.1231448159920145, - "grad_norm": 4.628546714782715, - "learning_rate": 8.769707880910683e-05, - "loss": 1.1757, - "num_input_tokens_seen": 28299896, - "step": 1758 - }, - { - "epoch": 0.12321486423774375, - "grad_norm": 3.946603775024414, - "learning_rate": 8.769008056042032e-05, - "loss": 1.2739, - "num_input_tokens_seen": 28316280, - "step": 1759 - }, - { - "epoch": 0.123284912483473, - "grad_norm": 3.4837770462036133, - "learning_rate": 8.768308231173381e-05, - "loss": 0.9682, - "num_input_tokens_seen": 28332128, - "step": 1760 - }, - { - "epoch": 0.12335496072920224, - "grad_norm": 3.9601573944091797, - "learning_rate": 8.767608406304729e-05, - "loss": 1.2647, - "num_input_tokens_seen": 28347488, - "step": 1761 - }, - { - "epoch": 0.12342500897493149, - "grad_norm": 4.178001403808594, - "learning_rate": 8.766908581436078e-05, - "loss": 1.0055, - "num_input_tokens_seen": 28363872, - "step": 1762 - }, - { - "epoch": 0.12349505722066073, - "grad_norm": 3.9182498455047607, - "learning_rate": 8.766208756567426e-05, - "loss": 1.1407, - "num_input_tokens_seen": 28380208, - "step": 1763 - }, - { - "epoch": 0.12356510546638998, - "grad_norm": 4.071939468383789, - "learning_rate": 8.765508931698774e-05, - "loss": 1.3196, - "num_input_tokens_seen": 28396592, - "step": 1764 - }, - { - "epoch": 0.12363515371211922, - "grad_norm": 4.657908916473389, - "learning_rate": 8.764809106830123e-05, - "loss": 1.0739, - "num_input_tokens_seen": 28412976, - "step": 1765 - }, - { - "epoch": 0.12370520195784847, - "grad_norm": 3.9706201553344727, - "learning_rate": 8.764109281961472e-05, - "loss": 1.0904, - "num_input_tokens_seen": 28429088, - "step": 1766 - }, - { - "epoch": 0.12377525020357771, - "grad_norm": 4.571341514587402, - "learning_rate": 8.76340945709282e-05, - "loss": 1.1314, - "num_input_tokens_seen": 28445472, - "step": 1767 - }, - { - "epoch": 0.12384529844930696, - "grad_norm": 4.197002410888672, - "learning_rate": 8.762709632224169e-05, - "loss": 0.8251, - "num_input_tokens_seen": 28461656, - "step": 1768 - }, - { - "epoch": 0.1239153466950362, - "grad_norm": 5.376040935516357, - "learning_rate": 8.762009807355517e-05, - "loss": 1.1626, - "num_input_tokens_seen": 28477088, - "step": 1769 - }, - { - "epoch": 0.12398539494076545, - "grad_norm": 3.987495183944702, - "learning_rate": 8.761309982486866e-05, - "loss": 1.2449, - "num_input_tokens_seen": 28493472, - "step": 1770 - }, - { - "epoch": 0.1240554431864947, - "grad_norm": 4.379208564758301, - "learning_rate": 8.760610157618213e-05, - "loss": 1.2834, - "num_input_tokens_seen": 28509856, - "step": 1771 - }, - { - "epoch": 0.12412549143222394, - "grad_norm": 3.7258729934692383, - "learning_rate": 8.759910332749562e-05, - "loss": 1.1115, - "num_input_tokens_seen": 28525664, - "step": 1772 - }, - { - "epoch": 0.12419553967795319, - "grad_norm": 4.0574774742126465, - "learning_rate": 8.759210507880911e-05, - "loss": 1.1005, - "num_input_tokens_seen": 28541920, - "step": 1773 - }, - { - "epoch": 0.12426558792368243, - "grad_norm": 3.8423895835876465, - "learning_rate": 8.75851068301226e-05, - "loss": 1.1067, - "num_input_tokens_seen": 28558216, - "step": 1774 - }, - { - "epoch": 0.12433563616941168, - "grad_norm": 3.8898398876190186, - "learning_rate": 8.757810858143609e-05, - "loss": 1.1963, - "num_input_tokens_seen": 28574536, - "step": 1775 - }, - { - "epoch": 0.12440568441514092, - "grad_norm": 3.286412000656128, - "learning_rate": 8.757111033274956e-05, - "loss": 0.9159, - "num_input_tokens_seen": 28590920, - "step": 1776 - }, - { - "epoch": 0.12447573266087017, - "grad_norm": 3.7219464778900146, - "learning_rate": 8.756411208406305e-05, - "loss": 1.0883, - "num_input_tokens_seen": 28607192, - "step": 1777 - }, - { - "epoch": 0.12454578090659942, - "grad_norm": 3.8907012939453125, - "learning_rate": 8.755711383537654e-05, - "loss": 1.0226, - "num_input_tokens_seen": 28623176, - "step": 1778 - }, - { - "epoch": 0.12461582915232866, - "grad_norm": 3.8087925910949707, - "learning_rate": 8.755011558669003e-05, - "loss": 1.0115, - "num_input_tokens_seen": 28639528, - "step": 1779 - }, - { - "epoch": 0.1246858773980579, - "grad_norm": 4.8956217765808105, - "learning_rate": 8.754311733800352e-05, - "loss": 1.0108, - "num_input_tokens_seen": 28654976, - "step": 1780 - }, - { - "epoch": 0.12475592564378715, - "grad_norm": 3.7400572299957275, - "learning_rate": 8.753611908931699e-05, - "loss": 0.8787, - "num_input_tokens_seen": 28671064, - "step": 1781 - }, - { - "epoch": 0.1248259738895164, - "grad_norm": 4.689199924468994, - "learning_rate": 8.752912084063048e-05, - "loss": 1.2326, - "num_input_tokens_seen": 28686664, - "step": 1782 - }, - { - "epoch": 0.12489602213524566, - "grad_norm": 3.6594929695129395, - "learning_rate": 8.752212259194395e-05, - "loss": 1.1626, - "num_input_tokens_seen": 28703048, - "step": 1783 - }, - { - "epoch": 0.1249660703809749, - "grad_norm": 4.6070356369018555, - "learning_rate": 8.751512434325744e-05, - "loss": 1.358, - "num_input_tokens_seen": 28719000, - "step": 1784 - }, - { - "epoch": 0.12503611862670413, - "grad_norm": 4.658362865447998, - "learning_rate": 8.750812609457093e-05, - "loss": 1.2852, - "num_input_tokens_seen": 28735384, - "step": 1785 - }, - { - "epoch": 0.1251061668724334, - "grad_norm": 3.6963465213775635, - "learning_rate": 8.750112784588442e-05, - "loss": 1.1068, - "num_input_tokens_seen": 28750856, - "step": 1786 - }, - { - "epoch": 0.12517621511816263, - "grad_norm": 4.419562816619873, - "learning_rate": 8.749412959719791e-05, - "loss": 1.1559, - "num_input_tokens_seen": 28766824, - "step": 1787 - }, - { - "epoch": 0.12524626336389189, - "grad_norm": 4.601676940917969, - "learning_rate": 8.74871313485114e-05, - "loss": 1.0642, - "num_input_tokens_seen": 28783208, - "step": 1788 - }, - { - "epoch": 0.12531631160962112, - "grad_norm": 3.8597445487976074, - "learning_rate": 8.748013309982487e-05, - "loss": 1.1149, - "num_input_tokens_seen": 28799160, - "step": 1789 - }, - { - "epoch": 0.12538635985535038, - "grad_norm": 3.654649257659912, - "learning_rate": 8.747313485113835e-05, - "loss": 1.3127, - "num_input_tokens_seen": 28815440, - "step": 1790 - }, - { - "epoch": 0.1254564081010796, - "grad_norm": 4.043321132659912, - "learning_rate": 8.746613660245184e-05, - "loss": 1.0844, - "num_input_tokens_seen": 28831824, - "step": 1791 - }, - { - "epoch": 0.12552645634680887, - "grad_norm": 4.5223894119262695, - "learning_rate": 8.745913835376532e-05, - "loss": 1.0627, - "num_input_tokens_seen": 28846984, - "step": 1792 - }, - { - "epoch": 0.1255965045925381, - "grad_norm": 4.074361801147461, - "learning_rate": 8.745214010507881e-05, - "loss": 0.9772, - "num_input_tokens_seen": 28863368, - "step": 1793 - }, - { - "epoch": 0.12566655283826736, - "grad_norm": 4.661183834075928, - "learning_rate": 8.74451418563923e-05, - "loss": 1.152, - "num_input_tokens_seen": 28879752, - "step": 1794 - }, - { - "epoch": 0.1257366010839966, - "grad_norm": 3.95831561088562, - "learning_rate": 8.743814360770579e-05, - "loss": 1.117, - "num_input_tokens_seen": 28895728, - "step": 1795 - }, - { - "epoch": 0.12580664932972585, - "grad_norm": 4.271726131439209, - "learning_rate": 8.743114535901927e-05, - "loss": 1.0935, - "num_input_tokens_seen": 28912112, - "step": 1796 - }, - { - "epoch": 0.12587669757545508, - "grad_norm": 4.079075336456299, - "learning_rate": 8.742414711033275e-05, - "loss": 1.1397, - "num_input_tokens_seen": 28928496, - "step": 1797 - }, - { - "epoch": 0.12594674582118434, - "grad_norm": 4.030980587005615, - "learning_rate": 8.741714886164623e-05, - "loss": 0.9405, - "num_input_tokens_seen": 28943968, - "step": 1798 - }, - { - "epoch": 0.12601679406691357, - "grad_norm": 3.7285454273223877, - "learning_rate": 8.741015061295973e-05, - "loss": 0.8448, - "num_input_tokens_seen": 28959800, - "step": 1799 - }, - { - "epoch": 0.12608684231264283, - "grad_norm": 3.964663028717041, - "learning_rate": 8.74031523642732e-05, - "loss": 1.1614, - "num_input_tokens_seen": 28976184, - "step": 1800 - }, - { - "epoch": 0.12608684231264283, - "eval_loss": 1.1493111848831177, - "eval_runtime": 0.196, - "eval_samples_per_second": 5.102, - "eval_steps_per_second": 5.102, - "num_input_tokens_seen": 28976184, - "step": 1800 - } - ], - "logging_steps": 1, - "max_steps": 14275, - "num_input_tokens_seen": 28976184, - "num_train_epochs": 1, - "save_steps": 200, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 6.222312927938765e+16, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}