{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.35481963335304556, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.4166666666666664e-05, "loss": 2.8488, "step": 1 }, { "epoch": 0.0, "learning_rate": 0.00010833333333333333, "loss": 2.7815, "step": 2 }, { "epoch": 0.01, "learning_rate": 0.0001625, "loss": 2.8002, "step": 3 }, { "epoch": 0.01, "learning_rate": 0.00021666666666666666, "loss": 2.7473, "step": 4 }, { "epoch": 0.01, "learning_rate": 0.0002708333333333333, "loss": 2.4233, "step": 5 }, { "epoch": 0.01, "learning_rate": 0.000325, "loss": 1.9676, "step": 6 }, { "epoch": 0.01, "learning_rate": 0.00037916666666666665, "loss": 1.7562, "step": 7 }, { "epoch": 0.01, "learning_rate": 0.0004333333333333333, "loss": 1.3949, "step": 8 }, { "epoch": 0.02, "learning_rate": 0.0004875, "loss": 1.2908, "step": 9 }, { "epoch": 0.02, "learning_rate": 0.0005416666666666666, "loss": 1.2542, "step": 10 }, { "epoch": 0.02, "learning_rate": 0.0005958333333333333, "loss": 1.2959, "step": 11 }, { "epoch": 0.02, "learning_rate": 0.00065, "loss": 1.1706, "step": 12 }, { "epoch": 0.02, "learning_rate": 0.0006499947173877214, "loss": 1.0829, "step": 13 }, { "epoch": 0.02, "learning_rate": 0.0006499788697226147, "loss": 1.128, "step": 14 }, { "epoch": 0.03, "learning_rate": 0.0006499524575198621, "loss": 1.0847, "step": 15 }, { "epoch": 0.03, "learning_rate": 0.0006499154816380815, "loss": 1.1143, "step": 16 }, { "epoch": 0.03, "learning_rate": 0.0006498679432792988, "loss": 1.0751, "step": 17 }, { "epoch": 0.03, "learning_rate": 0.0006498098439889095, "loss": 1.179, "step": 18 }, { "epoch": 0.03, "learning_rate": 0.0006497411856556275, "loss": 1.0327, "step": 19 }, { "epoch": 0.04, "learning_rate": 0.0006496619705114241, "loss": 1.0672, "step": 20 }, { "epoch": 0.04, "learning_rate": 0.0006495722011314557, "loss": 1.1625, "step": 21 }, { "epoch": 0.04, "learning_rate": 0.0006494718804339797, "loss": 1.0751, "step": 22 }, { "epoch": 0.04, "learning_rate": 0.0006493610116802598, "loss": 0.996, "step": 23 }, { "epoch": 0.04, "learning_rate": 0.0006492395984744599, "loss": 1.0478, "step": 24 }, { "epoch": 0.04, "learning_rate": 0.0006491076447635269, "loss": 1.064, "step": 25 }, { "epoch": 0.05, "learning_rate": 0.0006489651548370628, "loss": 0.9393, "step": 26 }, { "epoch": 0.05, "learning_rate": 0.0006488121333271846, "loss": 0.9282, "step": 27 }, { "epoch": 0.05, "learning_rate": 0.0006486485852083744, "loss": 1.0558, "step": 28 }, { "epoch": 0.05, "learning_rate": 0.0006484745157973169, "loss": 1.0015, "step": 29 }, { "epoch": 0.05, "learning_rate": 0.0006482899307527272, "loss": 1.0261, "step": 30 }, { "epoch": 0.05, "learning_rate": 0.0006480948360751669, "loss": 1.0507, "step": 31 }, { "epoch": 0.06, "learning_rate": 0.0006478892381068483, "loss": 1.0225, "step": 32 }, { "epoch": 0.06, "learning_rate": 0.0006476731435314292, "loss": 0.9411, "step": 33 }, { "epoch": 0.06, "learning_rate": 0.0006474465593737948, "loss": 0.9884, "step": 34 }, { "epoch": 0.06, "learning_rate": 0.0006472094929998295, "loss": 0.9892, "step": 35 }, { "epoch": 0.06, "learning_rate": 0.0006469619521161782, "loss": 1.0527, "step": 36 }, { "epoch": 0.07, "learning_rate": 0.0006467039447699945, "loss": 0.969, "step": 37 }, { "epoch": 0.07, "learning_rate": 0.0006464354793486803, "loss": 1.0009, "step": 38 }, { "epoch": 0.07, "learning_rate": 0.0006461565645796124, "loss": 1.0068, "step": 39 }, { "epoch": 0.07, "learning_rate": 0.0006458672095298589, "loss": 0.9626, "step": 40 }, { "epoch": 0.07, "learning_rate": 0.0006455674236058847, "loss": 0.934, "step": 41 }, { "epoch": 0.07, "learning_rate": 0.0006452572165532456, "loss": 1.0217, "step": 42 }, { "epoch": 0.08, "learning_rate": 0.0006449365984562712, "loss": 1.0036, "step": 43 }, { "epoch": 0.08, "learning_rate": 0.0006446055797377376, "loss": 0.9234, "step": 44 }, { "epoch": 0.08, "learning_rate": 0.000644264171158528, "loss": 0.9771, "step": 45 }, { "epoch": 0.08, "learning_rate": 0.0006439123838172836, "loss": 1.013, "step": 46 }, { "epoch": 0.08, "learning_rate": 0.0006435502291500418, "loss": 0.9154, "step": 47 }, { "epoch": 0.09, "learning_rate": 0.0006431777189298656, "loss": 0.9098, "step": 48 }, { "epoch": 0.09, "learning_rate": 0.0006427948652664599, "loss": 0.9243, "step": 49 }, { "epoch": 0.09, "learning_rate": 0.0006424016806057781, "loss": 0.9162, "step": 50 }, { "epoch": 0.09, "learning_rate": 0.0006419981777296182, "loss": 0.9538, "step": 51 }, { "epoch": 0.09, "learning_rate": 0.0006415843697552062, "loss": 0.9454, "step": 52 }, { "epoch": 0.09, "learning_rate": 0.0006411602701347703, "loss": 0.9296, "step": 53 }, { "epoch": 0.1, "learning_rate": 0.0006407258926551036, "loss": 0.929, "step": 54 }, { "epoch": 0.1, "learning_rate": 0.0006402812514371154, "loss": 0.9172, "step": 55 }, { "epoch": 0.1, "learning_rate": 0.0006398263609353731, "loss": 0.9871, "step": 56 }, { "epoch": 0.1, "learning_rate": 0.0006393612359376315, "loss": 0.9279, "step": 57 }, { "epoch": 0.1, "learning_rate": 0.0006388858915643519, "loss": 0.9191, "step": 58 }, { "epoch": 0.1, "learning_rate": 0.0006384003432682119, "loss": 0.9828, "step": 59 }, { "epoch": 0.11, "learning_rate": 0.0006379046068336013, "loss": 0.8912, "step": 60 }, { "epoch": 0.11, "learning_rate": 0.00063739869837611, "loss": 0.9023, "step": 61 }, { "epoch": 0.11, "learning_rate": 0.0006368826343420043, "loss": 0.9978, "step": 62 }, { "epoch": 0.11, "learning_rate": 0.0006363564315076915, "loss": 0.9097, "step": 63 }, { "epoch": 0.11, "learning_rate": 0.0006358201069791749, "loss": 0.8475, "step": 64 }, { "epoch": 0.12, "learning_rate": 0.000635273678191498, "loss": 0.9763, "step": 65 }, { "epoch": 0.12, "learning_rate": 0.000634717162908177, "loss": 0.8673, "step": 66 }, { "epoch": 0.12, "learning_rate": 0.0006341505792206243, "loss": 0.9188, "step": 67 }, { "epoch": 0.12, "learning_rate": 0.0006335739455475594, "loss": 0.865, "step": 68 }, { "epoch": 0.12, "learning_rate": 0.0006329872806344108, "loss": 0.9187, "step": 69 }, { "epoch": 0.12, "learning_rate": 0.0006323906035527062, "loss": 0.887, "step": 70 }, { "epoch": 0.13, "learning_rate": 0.0006317839336994531, "loss": 0.908, "step": 71 }, { "epoch": 0.13, "learning_rate": 0.0006311672907965074, "loss": 0.918, "step": 72 }, { "epoch": 0.13, "learning_rate": 0.0006305406948899329, "loss": 0.9399, "step": 73 }, { "epoch": 0.13, "learning_rate": 0.0006299041663493497, "loss": 0.9741, "step": 74 }, { "epoch": 0.13, "learning_rate": 0.0006292577258672713, "loss": 0.8738, "step": 75 }, { "epoch": 0.13, "learning_rate": 0.0006286013944584328, "loss": 0.9192, "step": 76 }, { "epoch": 0.14, "learning_rate": 0.0006279351934591071, "loss": 0.8589, "step": 77 }, { "epoch": 0.14, "learning_rate": 0.0006272591445264116, "loss": 0.955, "step": 78 }, { "epoch": 0.14, "learning_rate": 0.0006265732696376042, "loss": 0.928, "step": 79 }, { "epoch": 0.14, "learning_rate": 0.0006258775910893685, "loss": 0.8454, "step": 80 }, { "epoch": 0.14, "learning_rate": 0.0006251721314970894, "loss": 0.8709, "step": 81 }, { "epoch": 0.15, "learning_rate": 0.0006244569137941179, "loss": 0.8732, "step": 82 }, { "epoch": 0.15, "learning_rate": 0.0006237319612310249, "loss": 0.9345, "step": 83 }, { "epoch": 0.15, "learning_rate": 0.0006229972973748463, "loss": 0.9342, "step": 84 }, { "epoch": 0.15, "learning_rate": 0.0006222529461083165, "loss": 0.8803, "step": 85 }, { "epoch": 0.15, "learning_rate": 0.0006214989316290914, "loss": 0.8676, "step": 86 }, { "epoch": 0.15, "learning_rate": 0.0006207352784489629, "loss": 0.9195, "step": 87 }, { "epoch": 0.16, "learning_rate": 0.000619962011393061, "loss": 0.9505, "step": 88 }, { "epoch": 0.16, "learning_rate": 0.0006191791555990477, "loss": 0.8778, "step": 89 }, { "epoch": 0.16, "learning_rate": 0.0006183867365162994, "loss": 0.9663, "step": 90 }, { "epoch": 0.16, "learning_rate": 0.0006175847799050789, "loss": 0.9304, "step": 91 }, { "epoch": 0.16, "learning_rate": 0.0006167733118356993, "loss": 0.9233, "step": 92 }, { "epoch": 0.16, "learning_rate": 0.0006159523586876756, "loss": 0.9167, "step": 93 }, { "epoch": 0.17, "learning_rate": 0.0006151219471488673, "loss": 0.882, "step": 94 }, { "epoch": 0.17, "learning_rate": 0.0006142821042146112, "loss": 0.8295, "step": 95 }, { "epoch": 0.17, "learning_rate": 0.0006134328571868428, "loss": 0.7799, "step": 96 }, { "epoch": 0.17, "learning_rate": 0.0006125742336732103, "loss": 0.9368, "step": 97 }, { "epoch": 0.17, "learning_rate": 0.000611706261586176, "loss": 0.8542, "step": 98 }, { "epoch": 0.18, "learning_rate": 0.0006108289691421089, "loss": 0.9263, "step": 99 }, { "epoch": 0.18, "learning_rate": 0.0006099423848603682, "loss": 0.8572, "step": 100 }, { "epoch": 0.18, "learning_rate": 0.0006090465375623755, "loss": 0.905, "step": 101 }, { "epoch": 0.18, "learning_rate": 0.0006081414563706781, "loss": 0.8621, "step": 102 }, { "epoch": 0.18, "learning_rate": 0.0006072271707080021, "loss": 0.8745, "step": 103 }, { "epoch": 0.18, "learning_rate": 0.0006063037102962963, "loss": 0.928, "step": 104 }, { "epoch": 0.19, "learning_rate": 0.0006053711051557658, "loss": 0.908, "step": 105 }, { "epoch": 0.19, "learning_rate": 0.0006044293856038958, "loss": 0.8919, "step": 106 }, { "epoch": 0.19, "learning_rate": 0.0006034785822544665, "loss": 0.8665, "step": 107 }, { "epoch": 0.19, "learning_rate": 0.0006025187260165575, "loss": 0.8645, "step": 108 }, { "epoch": 0.19, "learning_rate": 0.0006015498480935434, "loss": 0.895, "step": 109 }, { "epoch": 0.2, "learning_rate": 0.0006005719799820788, "loss": 0.892, "step": 110 }, { "epoch": 0.2, "learning_rate": 0.0005995851534710752, "loss": 0.8843, "step": 111 }, { "epoch": 0.2, "learning_rate": 0.0005985894006406671, "loss": 0.8114, "step": 112 }, { "epoch": 0.2, "learning_rate": 0.0005975847538611689, "loss": 0.9086, "step": 113 }, { "epoch": 0.2, "learning_rate": 0.0005965712457920233, "loss": 0.8644, "step": 114 }, { "epoch": 0.2, "learning_rate": 0.000595548909380739, "loss": 0.8638, "step": 115 }, { "epoch": 0.21, "learning_rate": 0.00059451777786182, "loss": 0.8856, "step": 116 }, { "epoch": 0.21, "learning_rate": 0.0005934778847556848, "loss": 0.8749, "step": 117 }, { "epoch": 0.21, "learning_rate": 0.0005924292638675769, "loss": 0.8864, "step": 118 }, { "epoch": 0.21, "learning_rate": 0.0005913719492864662, "loss": 0.8317, "step": 119 }, { "epoch": 0.21, "learning_rate": 0.0005903059753839402, "loss": 0.8356, "step": 120 }, { "epoch": 0.21, "learning_rate": 0.0005892313768130872, "loss": 0.784, "step": 121 }, { "epoch": 0.22, "learning_rate": 0.0005881481885073694, "loss": 0.8377, "step": 122 }, { "epoch": 0.22, "learning_rate": 0.0005870564456794872, "loss": 0.7854, "step": 123 }, { "epoch": 0.22, "learning_rate": 0.0005859561838202349, "loss": 0.9538, "step": 124 }, { "epoch": 0.22, "learning_rate": 0.0005848474386973468, "loss": 0.8268, "step": 125 }, { "epoch": 0.22, "learning_rate": 0.0005837302463543341, "loss": 0.9009, "step": 126 }, { "epoch": 0.23, "learning_rate": 0.000582604643109314, "loss": 0.8684, "step": 127 }, { "epoch": 0.23, "learning_rate": 0.0005814706655538279, "loss": 0.7749, "step": 128 }, { "epoch": 0.23, "learning_rate": 0.0005803283505516529, "loss": 0.8931, "step": 129 }, { "epoch": 0.23, "learning_rate": 0.0005791777352376026, "loss": 0.8246, "step": 130 }, { "epoch": 0.23, "learning_rate": 0.0005780188570163211, "loss": 0.7862, "step": 131 }, { "epoch": 0.23, "learning_rate": 0.0005768517535610654, "loss": 0.9168, "step": 132 }, { "epoch": 0.24, "learning_rate": 0.0005756764628124819, "loss": 0.8706, "step": 133 }, { "epoch": 0.24, "learning_rate": 0.000574493022977373, "loss": 0.7976, "step": 134 }, { "epoch": 0.24, "learning_rate": 0.000573301472527454, "loss": 0.814, "step": 135 }, { "epoch": 0.24, "learning_rate": 0.000572101850198104, "loss": 0.8991, "step": 136 }, { "epoch": 0.24, "learning_rate": 0.0005708941949871053, "loss": 0.8539, "step": 137 }, { "epoch": 0.24, "learning_rate": 0.0005696785461533761, "loss": 0.9107, "step": 138 }, { "epoch": 0.25, "learning_rate": 0.0005684549432156948, "loss": 0.9165, "step": 139 }, { "epoch": 0.25, "learning_rate": 0.0005672234259514147, "loss": 0.843, "step": 140 }, { "epoch": 0.25, "learning_rate": 0.000565984034395171, "loss": 0.8328, "step": 141 }, { "epoch": 0.25, "learning_rate": 0.0005647368088375792, "loss": 0.884, "step": 142 }, { "epoch": 0.25, "learning_rate": 0.000563481789823926, "loss": 0.9101, "step": 143 }, { "epoch": 0.26, "learning_rate": 0.0005622190181528502, "loss": 0.8508, "step": 144 }, { "epoch": 0.26, "learning_rate": 0.0005609485348750175, "loss": 0.8575, "step": 145 }, { "epoch": 0.26, "learning_rate": 0.0005596703812917851, "loss": 0.8861, "step": 146 }, { "epoch": 0.26, "learning_rate": 0.0005583845989538596, "loss": 0.8163, "step": 147 }, { "epoch": 0.26, "learning_rate": 0.0005570912296599459, "loss": 0.8583, "step": 148 }, { "epoch": 0.26, "learning_rate": 0.0005557903154553888, "loss": 0.8635, "step": 149 }, { "epoch": 0.27, "learning_rate": 0.000554481898630806, "loss": 0.811, "step": 150 }, { "epoch": 0.27, "learning_rate": 0.0005531660217207126, "loss": 0.9116, "step": 151 }, { "epoch": 0.27, "learning_rate": 0.0005518427275021399, "loss": 0.868, "step": 152 }, { "epoch": 0.27, "learning_rate": 0.0005505120589932435, "loss": 0.8868, "step": 153 }, { "epoch": 0.27, "learning_rate": 0.0005491740594519051, "loss": 0.8816, "step": 154 }, { "epoch": 0.27, "learning_rate": 0.0005478287723743267, "loss": 0.8499, "step": 155 }, { "epoch": 0.28, "learning_rate": 0.0005464762414936163, "loss": 0.8502, "step": 156 }, { "epoch": 0.28, "learning_rate": 0.0005451165107783659, "loss": 0.86, "step": 157 }, { "epoch": 0.28, "learning_rate": 0.0005437496244312228, "loss": 0.8669, "step": 158 }, { "epoch": 0.28, "learning_rate": 0.0005423756268874522, "loss": 0.869, "step": 159 }, { "epoch": 0.28, "learning_rate": 0.000540994562813493, "loss": 0.8476, "step": 160 }, { "epoch": 0.29, "learning_rate": 0.0005396064771055053, "loss": 0.7992, "step": 161 }, { "epoch": 0.29, "learning_rate": 0.0005382114148879113, "loss": 0.8569, "step": 162 }, { "epoch": 0.29, "learning_rate": 0.0005368094215119282, "loss": 0.8367, "step": 163 }, { "epoch": 0.29, "learning_rate": 0.000535400542554094, "loss": 0.8455, "step": 164 }, { "epoch": 0.29, "learning_rate": 0.0005339848238147857, "loss": 0.9209, "step": 165 }, { "epoch": 0.29, "learning_rate": 0.0005325623113167311, "loss": 0.8577, "step": 166 }, { "epoch": 0.3, "learning_rate": 0.0005311330513035111, "loss": 0.8331, "step": 167 }, { "epoch": 0.3, "learning_rate": 0.0005296970902380583, "loss": 0.7925, "step": 168 }, { "epoch": 0.3, "learning_rate": 0.0005282544748011454, "loss": 0.8223, "step": 169 }, { "epoch": 0.3, "learning_rate": 0.0005268052518898676, "loss": 0.8555, "step": 170 }, { "epoch": 0.3, "learning_rate": 0.0005253494686161189, "loss": 0.9448, "step": 171 }, { "epoch": 0.31, "learning_rate": 0.00052388717230506, "loss": 0.8364, "step": 172 }, { "epoch": 0.31, "learning_rate": 0.0005224184104935797, "loss": 0.866, "step": 173 }, { "epoch": 0.31, "learning_rate": 0.0005209432309287499, "loss": 0.8312, "step": 174 }, { "epoch": 0.31, "learning_rate": 0.0005194616815662733, "loss": 0.8725, "step": 175 }, { "epoch": 0.31, "learning_rate": 0.0005179738105689243, "loss": 0.8199, "step": 176 }, { "epoch": 0.31, "learning_rate": 0.0005164796663049834, "loss": 0.8068, "step": 177 }, { "epoch": 0.32, "learning_rate": 0.0005149792973466653, "loss": 0.8533, "step": 178 }, { "epoch": 0.32, "learning_rate": 0.0005134727524685388, "loss": 0.9067, "step": 179 }, { "epoch": 0.32, "learning_rate": 0.0005119600806459426, "loss": 0.8105, "step": 180 }, { "epoch": 0.32, "learning_rate": 0.0005104413310533914, "loss": 0.8426, "step": 181 }, { "epoch": 0.32, "learning_rate": 0.0005089165530629796, "loss": 0.8854, "step": 182 }, { "epoch": 0.32, "learning_rate": 0.0005073857962427743, "loss": 0.8151, "step": 183 }, { "epoch": 0.33, "learning_rate": 0.0005058491103552046, "loss": 0.8467, "step": 184 }, { "epoch": 0.33, "learning_rate": 0.0005043065453554449, "loss": 0.8343, "step": 185 }, { "epoch": 0.33, "learning_rate": 0.0005027581513897888, "loss": 0.8139, "step": 186 }, { "epoch": 0.33, "learning_rate": 0.0005012039787940209, "loss": 0.8632, "step": 187 }, { "epoch": 0.33, "learning_rate": 0.0004996440780917798, "loss": 0.8525, "step": 188 }, { "epoch": 0.34, "learning_rate": 0.0004980784999929151, "loss": 0.9075, "step": 189 }, { "epoch": 0.34, "learning_rate": 0.00049650729539184, "loss": 0.7663, "step": 190 }, { "epoch": 0.34, "learning_rate": 0.0004949305153658755, "loss": 0.82, "step": 191 }, { "epoch": 0.34, "learning_rate": 0.0004933482111735912, "loss": 0.9614, "step": 192 }, { "epoch": 0.34, "learning_rate": 0.0004917604342531381, "loss": 0.8063, "step": 193 }, { "epoch": 0.34, "learning_rate": 0.0004901672362205767, "loss": 0.8729, "step": 194 }, { "epoch": 0.35, "learning_rate": 0.0004885686688681996, "loss": 0.8819, "step": 195 }, { "epoch": 0.35, "learning_rate": 0.0004869647841628468, "loss": 0.7797, "step": 196 }, { "epoch": 0.35, "learning_rate": 0.00048535563424421686, "loss": 0.7435, "step": 197 }, { "epoch": 0.35, "learning_rate": 0.0004837412714231722, "loss": 0.7985, "step": 198 }, { "epoch": 0.35, "learning_rate": 0.00048212174818003796, "loss": 0.8532, "step": 199 }, { "epoch": 0.35, "learning_rate": 0.00048049711716289666, "loss": 0.8559, "step": 200 } ], "logging_steps": 1, "max_steps": 563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 3.032690979844915e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }