{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26611472501478417, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.4166666666666664e-05, "loss": 2.8488, "step": 1 }, { "epoch": 0.0, "learning_rate": 0.00010833333333333333, "loss": 2.7815, "step": 2 }, { "epoch": 0.01, "learning_rate": 0.0001625, "loss": 2.8002, "step": 3 }, { "epoch": 0.01, "learning_rate": 0.00021666666666666666, "loss": 2.7473, "step": 4 }, { "epoch": 0.01, "learning_rate": 0.0002708333333333333, "loss": 2.4233, "step": 5 }, { "epoch": 0.01, "learning_rate": 0.000325, "loss": 1.9676, "step": 6 }, { "epoch": 0.01, "learning_rate": 0.00037916666666666665, "loss": 1.7562, "step": 7 }, { "epoch": 0.01, "learning_rate": 0.0004333333333333333, "loss": 1.3949, "step": 8 }, { "epoch": 0.02, "learning_rate": 0.0004875, "loss": 1.2908, "step": 9 }, { "epoch": 0.02, "learning_rate": 0.0005416666666666666, "loss": 1.2542, "step": 10 }, { "epoch": 0.02, "learning_rate": 0.0005958333333333333, "loss": 1.2959, "step": 11 }, { "epoch": 0.02, "learning_rate": 0.00065, "loss": 1.1706, "step": 12 }, { "epoch": 0.02, "learning_rate": 0.0006499947173877214, "loss": 1.0829, "step": 13 }, { "epoch": 0.02, "learning_rate": 0.0006499788697226147, "loss": 1.128, "step": 14 }, { "epoch": 0.03, "learning_rate": 0.0006499524575198621, "loss": 1.0847, "step": 15 }, { "epoch": 0.03, "learning_rate": 0.0006499154816380815, "loss": 1.1143, "step": 16 }, { "epoch": 0.03, "learning_rate": 0.0006498679432792988, "loss": 1.0751, "step": 17 }, { "epoch": 0.03, "learning_rate": 0.0006498098439889095, "loss": 1.179, "step": 18 }, { "epoch": 0.03, "learning_rate": 0.0006497411856556275, "loss": 1.0327, "step": 19 }, { "epoch": 0.04, "learning_rate": 0.0006496619705114241, "loss": 1.0672, "step": 20 }, { "epoch": 0.04, "learning_rate": 0.0006495722011314557, "loss": 1.1625, "step": 21 }, { "epoch": 0.04, "learning_rate": 0.0006494718804339797, "loss": 1.0751, "step": 22 }, { "epoch": 0.04, "learning_rate": 0.0006493610116802598, "loss": 0.996, "step": 23 }, { "epoch": 0.04, "learning_rate": 0.0006492395984744599, "loss": 1.0478, "step": 24 }, { "epoch": 0.04, "learning_rate": 0.0006491076447635269, "loss": 1.064, "step": 25 }, { "epoch": 0.05, "learning_rate": 0.0006489651548370628, "loss": 0.9393, "step": 26 }, { "epoch": 0.05, "learning_rate": 0.0006488121333271846, "loss": 0.9282, "step": 27 }, { "epoch": 0.05, "learning_rate": 0.0006486485852083744, "loss": 1.0558, "step": 28 }, { "epoch": 0.05, "learning_rate": 0.0006484745157973169, "loss": 1.0015, "step": 29 }, { "epoch": 0.05, "learning_rate": 0.0006482899307527272, "loss": 1.0261, "step": 30 }, { "epoch": 0.05, "learning_rate": 0.0006480948360751669, "loss": 1.0507, "step": 31 }, { "epoch": 0.06, "learning_rate": 0.0006478892381068483, "loss": 1.0225, "step": 32 }, { "epoch": 0.06, "learning_rate": 0.0006476731435314292, "loss": 0.9411, "step": 33 }, { "epoch": 0.06, "learning_rate": 0.0006474465593737948, "loss": 0.9884, "step": 34 }, { "epoch": 0.06, "learning_rate": 0.0006472094929998295, "loss": 0.9892, "step": 35 }, { "epoch": 0.06, "learning_rate": 0.0006469619521161782, "loss": 1.0527, "step": 36 }, { "epoch": 0.07, "learning_rate": 0.0006467039447699945, "loss": 0.969, "step": 37 }, { "epoch": 0.07, "learning_rate": 0.0006464354793486803, "loss": 1.0009, "step": 38 }, { "epoch": 0.07, "learning_rate": 0.0006461565645796124, "loss": 1.0068, "step": 39 }, { "epoch": 0.07, "learning_rate": 0.0006458672095298589, "loss": 0.9626, "step": 40 }, { "epoch": 0.07, "learning_rate": 0.0006455674236058847, "loss": 0.934, "step": 41 }, { "epoch": 0.07, "learning_rate": 0.0006452572165532456, "loss": 1.0217, "step": 42 }, { "epoch": 0.08, "learning_rate": 0.0006449365984562712, "loss": 1.0036, "step": 43 }, { "epoch": 0.08, "learning_rate": 0.0006446055797377376, "loss": 0.9234, "step": 44 }, { "epoch": 0.08, "learning_rate": 0.000644264171158528, "loss": 0.9771, "step": 45 }, { "epoch": 0.08, "learning_rate": 0.0006439123838172836, "loss": 1.013, "step": 46 }, { "epoch": 0.08, "learning_rate": 0.0006435502291500418, "loss": 0.9154, "step": 47 }, { "epoch": 0.09, "learning_rate": 0.0006431777189298656, "loss": 0.9098, "step": 48 }, { "epoch": 0.09, "learning_rate": 0.0006427948652664599, "loss": 0.9243, "step": 49 }, { "epoch": 0.09, "learning_rate": 0.0006424016806057781, "loss": 0.9162, "step": 50 }, { "epoch": 0.09, "learning_rate": 0.0006419981777296182, "loss": 0.9538, "step": 51 }, { "epoch": 0.09, "learning_rate": 0.0006415843697552062, "loss": 0.9454, "step": 52 }, { "epoch": 0.09, "learning_rate": 0.0006411602701347703, "loss": 0.9296, "step": 53 }, { "epoch": 0.1, "learning_rate": 0.0006407258926551036, "loss": 0.929, "step": 54 }, { "epoch": 0.1, "learning_rate": 0.0006402812514371154, "loss": 0.9172, "step": 55 }, { "epoch": 0.1, "learning_rate": 0.0006398263609353731, "loss": 0.9871, "step": 56 }, { "epoch": 0.1, "learning_rate": 0.0006393612359376315, "loss": 0.9279, "step": 57 }, { "epoch": 0.1, "learning_rate": 0.0006388858915643519, "loss": 0.9191, "step": 58 }, { "epoch": 0.1, "learning_rate": 0.0006384003432682119, "loss": 0.9828, "step": 59 }, { "epoch": 0.11, "learning_rate": 0.0006379046068336013, "loss": 0.8912, "step": 60 }, { "epoch": 0.11, "learning_rate": 0.00063739869837611, "loss": 0.9023, "step": 61 }, { "epoch": 0.11, "learning_rate": 0.0006368826343420043, "loss": 0.9978, "step": 62 }, { "epoch": 0.11, "learning_rate": 0.0006363564315076915, "loss": 0.9097, "step": 63 }, { "epoch": 0.11, "learning_rate": 0.0006358201069791749, "loss": 0.8475, "step": 64 }, { "epoch": 0.12, "learning_rate": 0.000635273678191498, "loss": 0.9763, "step": 65 }, { "epoch": 0.12, "learning_rate": 0.000634717162908177, "loss": 0.8673, "step": 66 }, { "epoch": 0.12, "learning_rate": 0.0006341505792206243, "loss": 0.9188, "step": 67 }, { "epoch": 0.12, "learning_rate": 0.0006335739455475594, "loss": 0.865, "step": 68 }, { "epoch": 0.12, "learning_rate": 0.0006329872806344108, "loss": 0.9187, "step": 69 }, { "epoch": 0.12, "learning_rate": 0.0006323906035527062, "loss": 0.887, "step": 70 }, { "epoch": 0.13, "learning_rate": 0.0006317839336994531, "loss": 0.908, "step": 71 }, { "epoch": 0.13, "learning_rate": 0.0006311672907965074, "loss": 0.918, "step": 72 }, { "epoch": 0.13, "learning_rate": 0.0006305406948899329, "loss": 0.9399, "step": 73 }, { "epoch": 0.13, "learning_rate": 0.0006299041663493497, "loss": 0.9741, "step": 74 }, { "epoch": 0.13, "learning_rate": 0.0006292577258672713, "loss": 0.8738, "step": 75 }, { "epoch": 0.13, "learning_rate": 0.0006286013944584328, "loss": 0.9192, "step": 76 }, { "epoch": 0.14, "learning_rate": 0.0006279351934591071, "loss": 0.8589, "step": 77 }, { "epoch": 0.14, "learning_rate": 0.0006272591445264116, "loss": 0.955, "step": 78 }, { "epoch": 0.14, "learning_rate": 0.0006265732696376042, "loss": 0.928, "step": 79 }, { "epoch": 0.14, "learning_rate": 0.0006258775910893685, "loss": 0.8454, "step": 80 }, { "epoch": 0.14, "learning_rate": 0.0006251721314970894, "loss": 0.8709, "step": 81 }, { "epoch": 0.15, "learning_rate": 0.0006244569137941179, "loss": 0.8732, "step": 82 }, { "epoch": 0.15, "learning_rate": 0.0006237319612310249, "loss": 0.9345, "step": 83 }, { "epoch": 0.15, "learning_rate": 0.0006229972973748463, "loss": 0.9342, "step": 84 }, { "epoch": 0.15, "learning_rate": 0.0006222529461083165, "loss": 0.8803, "step": 85 }, { "epoch": 0.15, "learning_rate": 0.0006214989316290914, "loss": 0.8676, "step": 86 }, { "epoch": 0.15, "learning_rate": 0.0006207352784489629, "loss": 0.9195, "step": 87 }, { "epoch": 0.16, "learning_rate": 0.000619962011393061, "loss": 0.9505, "step": 88 }, { "epoch": 0.16, "learning_rate": 0.0006191791555990477, "loss": 0.8778, "step": 89 }, { "epoch": 0.16, "learning_rate": 0.0006183867365162994, "loss": 0.9663, "step": 90 }, { "epoch": 0.16, "learning_rate": 0.0006175847799050789, "loss": 0.9304, "step": 91 }, { "epoch": 0.16, "learning_rate": 0.0006167733118356993, "loss": 0.9233, "step": 92 }, { "epoch": 0.16, "learning_rate": 0.0006159523586876756, "loss": 0.9167, "step": 93 }, { "epoch": 0.17, "learning_rate": 0.0006151219471488673, "loss": 0.882, "step": 94 }, { "epoch": 0.17, "learning_rate": 0.0006142821042146112, "loss": 0.8295, "step": 95 }, { "epoch": 0.17, "learning_rate": 0.0006134328571868428, "loss": 0.7799, "step": 96 }, { "epoch": 0.17, "learning_rate": 0.0006125742336732103, "loss": 0.9368, "step": 97 }, { "epoch": 0.17, "learning_rate": 0.000611706261586176, "loss": 0.8542, "step": 98 }, { "epoch": 0.18, "learning_rate": 0.0006108289691421089, "loss": 0.9263, "step": 99 }, { "epoch": 0.18, "learning_rate": 0.0006099423848603682, "loss": 0.8572, "step": 100 }, { "epoch": 0.18, "learning_rate": 0.0006090465375623755, "loss": 0.905, "step": 101 }, { "epoch": 0.18, "learning_rate": 0.0006081414563706781, "loss": 0.8621, "step": 102 }, { "epoch": 0.18, "learning_rate": 0.0006072271707080021, "loss": 0.8745, "step": 103 }, { "epoch": 0.18, "learning_rate": 0.0006063037102962963, "loss": 0.928, "step": 104 }, { "epoch": 0.19, "learning_rate": 0.0006053711051557658, "loss": 0.908, "step": 105 }, { "epoch": 0.19, "learning_rate": 0.0006044293856038958, "loss": 0.8919, "step": 106 }, { "epoch": 0.19, "learning_rate": 0.0006034785822544665, "loss": 0.8665, "step": 107 }, { "epoch": 0.19, "learning_rate": 0.0006025187260165575, "loss": 0.8645, "step": 108 }, { "epoch": 0.19, "learning_rate": 0.0006015498480935434, "loss": 0.895, "step": 109 }, { "epoch": 0.2, "learning_rate": 0.0006005719799820788, "loss": 0.892, "step": 110 }, { "epoch": 0.2, "learning_rate": 0.0005995851534710752, "loss": 0.8843, "step": 111 }, { "epoch": 0.2, "learning_rate": 0.0005985894006406671, "loss": 0.8114, "step": 112 }, { "epoch": 0.2, "learning_rate": 0.0005975847538611689, "loss": 0.9086, "step": 113 }, { "epoch": 0.2, "learning_rate": 0.0005965712457920233, "loss": 0.8644, "step": 114 }, { "epoch": 0.2, "learning_rate": 0.000595548909380739, "loss": 0.8638, "step": 115 }, { "epoch": 0.21, "learning_rate": 0.00059451777786182, "loss": 0.8856, "step": 116 }, { "epoch": 0.21, "learning_rate": 0.0005934778847556848, "loss": 0.8749, "step": 117 }, { "epoch": 0.21, "learning_rate": 0.0005924292638675769, "loss": 0.8864, "step": 118 }, { "epoch": 0.21, "learning_rate": 0.0005913719492864662, "loss": 0.8317, "step": 119 }, { "epoch": 0.21, "learning_rate": 0.0005903059753839402, "loss": 0.8356, "step": 120 }, { "epoch": 0.21, "learning_rate": 0.0005892313768130872, "loss": 0.784, "step": 121 }, { "epoch": 0.22, "learning_rate": 0.0005881481885073694, "loss": 0.8377, "step": 122 }, { "epoch": 0.22, "learning_rate": 0.0005870564456794872, "loss": 0.7854, "step": 123 }, { "epoch": 0.22, "learning_rate": 0.0005859561838202349, "loss": 0.9538, "step": 124 }, { "epoch": 0.22, "learning_rate": 0.0005848474386973468, "loss": 0.8268, "step": 125 }, { "epoch": 0.22, "learning_rate": 0.0005837302463543341, "loss": 0.9009, "step": 126 }, { "epoch": 0.23, "learning_rate": 0.000582604643109314, "loss": 0.8684, "step": 127 }, { "epoch": 0.23, "learning_rate": 0.0005814706655538279, "loss": 0.7749, "step": 128 }, { "epoch": 0.23, "learning_rate": 0.0005803283505516529, "loss": 0.8931, "step": 129 }, { "epoch": 0.23, "learning_rate": 0.0005791777352376026, "loss": 0.8246, "step": 130 }, { "epoch": 0.23, "learning_rate": 0.0005780188570163211, "loss": 0.7862, "step": 131 }, { "epoch": 0.23, "learning_rate": 0.0005768517535610654, "loss": 0.9168, "step": 132 }, { "epoch": 0.24, "learning_rate": 0.0005756764628124819, "loss": 0.8706, "step": 133 }, { "epoch": 0.24, "learning_rate": 0.000574493022977373, "loss": 0.7976, "step": 134 }, { "epoch": 0.24, "learning_rate": 0.000573301472527454, "loss": 0.814, "step": 135 }, { "epoch": 0.24, "learning_rate": 0.000572101850198104, "loss": 0.8991, "step": 136 }, { "epoch": 0.24, "learning_rate": 0.0005708941949871053, "loss": 0.8539, "step": 137 }, { "epoch": 0.24, "learning_rate": 0.0005696785461533761, "loss": 0.9107, "step": 138 }, { "epoch": 0.25, "learning_rate": 0.0005684549432156948, "loss": 0.9165, "step": 139 }, { "epoch": 0.25, "learning_rate": 0.0005672234259514147, "loss": 0.843, "step": 140 }, { "epoch": 0.25, "learning_rate": 0.000565984034395171, "loss": 0.8328, "step": 141 }, { "epoch": 0.25, "learning_rate": 0.0005647368088375792, "loss": 0.884, "step": 142 }, { "epoch": 0.25, "learning_rate": 0.000563481789823926, "loss": 0.9101, "step": 143 }, { "epoch": 0.26, "learning_rate": 0.0005622190181528502, "loss": 0.8508, "step": 144 }, { "epoch": 0.26, "learning_rate": 0.0005609485348750175, "loss": 0.8575, "step": 145 }, { "epoch": 0.26, "learning_rate": 0.0005596703812917851, "loss": 0.8861, "step": 146 }, { "epoch": 0.26, "learning_rate": 0.0005583845989538596, "loss": 0.8163, "step": 147 }, { "epoch": 0.26, "learning_rate": 0.0005570912296599459, "loss": 0.8583, "step": 148 }, { "epoch": 0.26, "learning_rate": 0.0005557903154553888, "loss": 0.8635, "step": 149 }, { "epoch": 0.27, "learning_rate": 0.000554481898630806, "loss": 0.811, "step": 150 } ], "logging_steps": 1, "max_steps": 563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "total_flos": 2.2743182526234624e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }