{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.47961630695443647, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.00013, "loss": 1.1241, "step": 1 }, { "epoch": 0.0, "learning_rate": 0.00026, "loss": 1.0107, "step": 2 }, { "epoch": 0.01, "learning_rate": 0.00039, "loss": 1.1086, "step": 3 }, { "epoch": 0.01, "learning_rate": 0.00052, "loss": 1.0044, "step": 4 }, { "epoch": 0.01, "learning_rate": 0.00065, "loss": 1.0496, "step": 5 }, { "epoch": 0.01, "learning_rate": 0.0005933661039639299, "loss": 1.0199, "step": 6 }, { "epoch": 0.02, "learning_rate": 0.0005493502655735357, "loss": 1.0198, "step": 7 }, { "epoch": 0.02, "learning_rate": 0.0005138701197773616, "loss": 0.969, "step": 8 }, { "epoch": 0.02, "learning_rate": 0.0004844813951249544, "loss": 0.9383, "step": 9 }, { "epoch": 0.02, "learning_rate": 0.0004596194077712558, "loss": 0.8776, "step": 10 }, { "epoch": 0.03, "learning_rate": 0.0004382299106011073, "loss": 1.0173, "step": 11 }, { "epoch": 0.03, "learning_rate": 0.0004195731958391368, "loss": 1.1173, "step": 12 }, { "epoch": 0.03, "learning_rate": 0.0004031128874149274, "loss": 1.0876, "step": 13 }, { "epoch": 0.03, "learning_rate": 0.0003884492980336779, "loss": 1.0524, "step": 14 }, { "epoch": 0.04, "learning_rate": 0.0003752776749732568, "loss": 0.8953, "step": 15 }, { "epoch": 0.04, "learning_rate": 0.00036336104634371584, "loss": 1.1335, "step": 16 }, { "epoch": 0.04, "learning_rate": 0.00035251199395531623, "loss": 0.9837, "step": 17 }, { "epoch": 0.04, "learning_rate": 0.00034258007985157445, "loss": 0.9707, "step": 18 }, { "epoch": 0.05, "learning_rate": 0.0003334429644276751, "loss": 0.9149, "step": 19 }, { "epoch": 0.05, "learning_rate": 0.000325, "loss": 1.0043, "step": 20 }, { "epoch": 0.05, "learning_rate": 0.00031716752370827323, "loss": 1.001, "step": 21 }, { "epoch": 0.05, "learning_rate": 0.00030987534150481746, "loss": 1.0395, "step": 22 }, { "epoch": 0.06, "learning_rate": 0.000303064062678102, "loss": 0.8718, "step": 23 }, { "epoch": 0.06, "learning_rate": 0.00029668305198196496, "loss": 1.1114, "step": 24 }, { "epoch": 0.06, "learning_rate": 0.00029068883707497264, "loss": 0.7765, "step": 25 }, { "epoch": 0.06, "learning_rate": 0.0002850438562747845, "loss": 0.9522, "step": 26 }, { "epoch": 0.06, "learning_rate": 0.00027971546389275785, "loss": 0.9588, "step": 27 }, { "epoch": 0.07, "learning_rate": 0.00027467513278676785, "loss": 1.0313, "step": 28 }, { "epoch": 0.07, "learning_rate": 0.0002698978095246549, "loss": 0.9338, "step": 29 }, { "epoch": 0.07, "learning_rate": 0.000265361388801511, "loss": 0.892, "step": 30 }, { "epoch": 0.07, "learning_rate": 0.00026104628189331215, "loss": 0.893, "step": 31 }, { "epoch": 0.08, "learning_rate": 0.0002569350598886808, "loss": 0.8983, "step": 32 }, { "epoch": 0.08, "learning_rate": 0.00025301215685249496, "loss": 0.9277, "step": 33 }, { "epoch": 0.08, "learning_rate": 0.00024926362137539537, "loss": 0.8962, "step": 34 }, { "epoch": 0.08, "learning_rate": 0.00024567690745599767, "loss": 0.9124, "step": 35 }, { "epoch": 0.09, "learning_rate": 0.0002422406975624772, "loss": 0.9535, "step": 36 }, { "epoch": 0.09, "learning_rate": 0.00023894475218048754, "loss": 0.9019, "step": 37 }, { "epoch": 0.09, "learning_rate": 0.0002357797812857538, "loss": 1.024, "step": 38 }, { "epoch": 0.09, "learning_rate": 0.00023273733406281566, "loss": 0.8549, "step": 39 }, { "epoch": 0.1, "learning_rate": 0.0002298097038856279, "loss": 1.0489, "step": 40 }, { "epoch": 0.1, "learning_rate": 0.00022698984612511293, "loss": 0.751, "step": 41 }, { "epoch": 0.1, "learning_rate": 0.00022427130678626507, "loss": 0.834, "step": 42 }, { "epoch": 0.1, "learning_rate": 0.00022164816032790388, "loss": 0.889, "step": 43 }, { "epoch": 0.11, "learning_rate": 0.00021911495530055366, "loss": 1.0103, "step": 44 }, { "epoch": 0.11, "learning_rate": 0.00021666666666666666, "loss": 0.8766, "step": 45 }, { "epoch": 0.11, "learning_rate": 0.0002142986538536308, "loss": 0.8181, "step": 46 }, { "epoch": 0.11, "learning_rate": 0.0002120066237423687, "loss": 0.8754, "step": 47 }, { "epoch": 0.12, "learning_rate": 0.0002097865979195684, "loss": 0.9038, "step": 48 }, { "epoch": 0.12, "learning_rate": 0.00020763488362498048, "loss": 0.8646, "step": 49 }, { "epoch": 0.12, "learning_rate": 0.00020554804791094464, "loss": 0.8836, "step": 50 }, { "epoch": 0.12, "learning_rate": 0.0002035228946026736, "loss": 0.9962, "step": 51 }, { "epoch": 0.12, "learning_rate": 0.0002015564437074637, "loss": 0.8835, "step": 52 }, { "epoch": 0.13, "learning_rate": 0.00019964591297103414, "loss": 0.9196, "step": 53 }, { "epoch": 0.13, "learning_rate": 0.00019778870132130996, "loss": 0.8995, "step": 54 }, { "epoch": 0.13, "learning_rate": 0.00019598237397554634, "loss": 1.0178, "step": 55 }, { "epoch": 0.13, "learning_rate": 0.00019422464901683895, "loss": 0.9395, "step": 56 }, { "epoch": 0.14, "learning_rate": 0.00019251338527170498, "loss": 0.9882, "step": 57 }, { "epoch": 0.14, "learning_rate": 0.00019084657134227863, "loss": 0.9274, "step": 58 }, { "epoch": 0.14, "learning_rate": 0.00018922231566536414, "loss": 0.9517, "step": 59 }, { "epoch": 0.14, "learning_rate": 0.0001876388374866284, "loss": 0.865, "step": 60 }, { "epoch": 0.15, "learning_rate": 0.00018609445865200715, "loss": 0.9314, "step": 61 }, { "epoch": 0.15, "learning_rate": 0.00018458759613029606, "loss": 0.9224, "step": 62 }, { "epoch": 0.15, "learning_rate": 0.00018311675519117857, "loss": 0.788, "step": 63 }, { "epoch": 0.15, "learning_rate": 0.00018168052317185792, "loss": 0.9739, "step": 64 }, { "epoch": 0.16, "learning_rate": 0.00018027756377319947, "loss": 0.9419, "step": 65 }, { "epoch": 0.16, "learning_rate": 0.0001789066118330336, "loss": 0.8772, "step": 66 }, { "epoch": 0.16, "learning_rate": 0.00017756646853014972, "loss": 0.8707, "step": 67 }, { "epoch": 0.16, "learning_rate": 0.00017625599697765812, "loss": 0.8089, "step": 68 }, { "epoch": 0.17, "learning_rate": 0.00017497411816890378, "loss": 0.9303, "step": 69 }, { "epoch": 0.17, "learning_rate": 0.00017371980724307585, "loss": 0.9161, "step": 70 }, { "epoch": 0.17, "learning_rate": 0.00017249209004113945, "loss": 0.9064, "step": 71 }, { "epoch": 0.17, "learning_rate": 0.00017129003992578723, "loss": 1.0988, "step": 72 }, { "epoch": 0.18, "learning_rate": 0.00017011277484181944, "loss": 0.9804, "step": 73 }, { "epoch": 0.18, "learning_rate": 0.0001689594545957618, "loss": 0.8382, "step": 74 }, { "epoch": 0.18, "learning_rate": 0.00016782927833565472, "loss": 0.9632, "step": 75 }, { "epoch": 0.18, "learning_rate": 0.00016672148221383754, "loss": 0.9494, "step": 76 }, { "epoch": 0.18, "learning_rate": 0.00016563533721722828, "loss": 0.9253, "step": 77 }, { "epoch": 0.19, "learning_rate": 0.0001645701471510958, "loss": 0.9143, "step": 78 }, { "epoch": 0.19, "learning_rate": 0.00016352524676365398, "loss": 0.8907, "step": 79 }, { "epoch": 0.19, "learning_rate": 0.0001625, "loss": 0.9748, "step": 80 }, { "epoch": 0.19, "learning_rate": 0.00016149379837498482, "loss": 0.893, "step": 81 }, { "epoch": 0.2, "learning_rate": 0.00016050605945555833, "loss": 0.839, "step": 82 }, { "epoch": 0.2, "learning_rate": 0.0001595362254439902, "loss": 0.9276, "step": 83 }, { "epoch": 0.2, "learning_rate": 0.00015858376185413662, "loss": 0.8758, "step": 84 }, { "epoch": 0.2, "learning_rate": 0.00015764815627361642, "loss": 0.9125, "step": 85 }, { "epoch": 0.21, "learning_rate": 0.00015672891720538393, "loss": 0.955, "step": 86 }, { "epoch": 0.21, "learning_rate": 0.00015582557298274985, "loss": 0.9104, "step": 87 }, { "epoch": 0.21, "learning_rate": 0.00015493767075240873, "loss": 0.8861, "step": 88 }, { "epoch": 0.21, "learning_rate": 0.0001540647755204926, "loss": 0.9693, "step": 89 }, { "epoch": 0.22, "learning_rate": 0.0001532064692570853, "loss": 0.7245, "step": 90 }, { "epoch": 0.22, "learning_rate": 0.000152362350055011, "loss": 0.7523, "step": 91 }, { "epoch": 0.22, "learning_rate": 0.000151532031339051, "loss": 0.8522, "step": 92 }, { "epoch": 0.22, "learning_rate": 0.00015071514112205468, "loss": 0.9273, "step": 93 }, { "epoch": 0.23, "learning_rate": 0.0001499113213046938, "loss": 1.0303, "step": 94 }, { "epoch": 0.23, "learning_rate": 0.00014912022701586513, "loss": 0.9273, "step": 95 }, { "epoch": 0.23, "learning_rate": 0.00014834152599098248, "loss": 0.9071, "step": 96 }, { "epoch": 0.23, "learning_rate": 0.00014757489798561242, "loss": 0.954, "step": 97 }, { "epoch": 0.24, "learning_rate": 0.00014682003422210332, "loss": 0.7897, "step": 98 }, { "epoch": 0.24, "learning_rate": 0.00014607663686703578, "loss": 0.9045, "step": 99 }, { "epoch": 0.24, "learning_rate": 0.00014534441853748632, "loss": 0.7919, "step": 100 }, { "epoch": 0.24, "learning_rate": 0.00014462310183424506, "loss": 0.7449, "step": 101 }, { "epoch": 0.24, "learning_rate": 0.0001439124189002655, "loss": 0.8953, "step": 102 }, { "epoch": 0.25, "learning_rate": 0.0001432121110027503, "loss": 0.974, "step": 103 }, { "epoch": 0.25, "learning_rate": 0.00014252192813739225, "loss": 0.959, "step": 104 }, { "epoch": 0.25, "learning_rate": 0.00014184162865339505, "loss": 0.8767, "step": 105 }, { "epoch": 0.25, "learning_rate": 0.00014117097889799755, "loss": 0.9206, "step": 106 }, { "epoch": 0.26, "learning_rate": 0.000140509752879313, "loss": 0.8096, "step": 107 }, { "epoch": 0.26, "learning_rate": 0.00013985773194637893, "loss": 0.9726, "step": 108 }, { "epoch": 0.26, "learning_rate": 0.00013921470448538878, "loss": 0.7764, "step": 109 }, { "epoch": 0.26, "learning_rate": 0.00013858046563114675, "loss": 0.8414, "step": 110 }, { "epoch": 0.27, "learning_rate": 0.0001379548169928529, "loss": 0.9365, "step": 111 }, { "epoch": 0.27, "learning_rate": 0.00013733756639338393, "loss": 0.9857, "step": 112 }, { "epoch": 0.27, "learning_rate": 0.00013672852762129314, "loss": 0.8209, "step": 113 }, { "epoch": 0.27, "learning_rate": 0.00013612752019480102, "loss": 0.9954, "step": 114 }, { "epoch": 0.28, "learning_rate": 0.0001355343691370986, "loss": 0.9425, "step": 115 }, { "epoch": 0.28, "learning_rate": 0.00013494890476232745, "loss": 0.9199, "step": 116 }, { "epoch": 0.28, "learning_rate": 0.0001343709624716425, "loss": 1.0011, "step": 117 }, { "epoch": 0.28, "learning_rate": 0.00013380038255880045, "loss": 0.9335, "step": 118 }, { "epoch": 0.29, "learning_rate": 0.000133237010024753, "loss": 1.0612, "step": 119 }, { "epoch": 0.29, "learning_rate": 0.0001326806944007555, "loss": 0.8794, "step": 120 }, { "epoch": 0.29, "learning_rate": 0.00013213128957953303, "loss": 0.8557, "step": 121 }, { "epoch": 0.29, "learning_rate": 0.00013158865365407385, "loss": 0.931, "step": 122 }, { "epoch": 0.29, "learning_rate": 0.00013105264876364566, "loss": 0.9648, "step": 123 }, { "epoch": 0.3, "learning_rate": 0.00013052314094665608, "loss": 0.8448, "step": 124 }, { "epoch": 0.3, "learning_rate": 0.00013, "loss": 0.9247, "step": 125 }, { "epoch": 0.3, "learning_rate": 0.0001294830993445593, "loss": 0.9537, "step": 126 }, { "epoch": 0.3, "learning_rate": 0.00012897231589653857, "loss": 0.8049, "step": 127 }, { "epoch": 0.31, "learning_rate": 0.0001284675299443404, "loss": 0.8177, "step": 128 }, { "epoch": 0.31, "learning_rate": 0.00012796862503070062, "loss": 0.9717, "step": 129 }, { "epoch": 0.31, "learning_rate": 0.00012747548783981962, "loss": 0.8813, "step": 130 }, { "epoch": 0.31, "learning_rate": 0.00012698800808924157, "loss": 0.9708, "step": 131 }, { "epoch": 0.32, "learning_rate": 0.00012650607842624748, "loss": 0.8776, "step": 132 }, { "epoch": 0.32, "learning_rate": 0.0001260295943285407, "loss": 0.8564, "step": 133 }, { "epoch": 0.32, "learning_rate": 0.00012555845400901656, "loss": 0.8793, "step": 134 }, { "epoch": 0.32, "learning_rate": 0.0001250925583244189, "loss": 0.9288, "step": 135 }, { "epoch": 0.33, "learning_rate": 0.00012463181068769768, "loss": 0.9407, "step": 136 }, { "epoch": 0.33, "learning_rate": 0.0001241761169838914, "loss": 0.9746, "step": 137 }, { "epoch": 0.33, "learning_rate": 0.00012372538548936814, "loss": 1.0109, "step": 138 }, { "epoch": 0.33, "learning_rate": 0.00012327952679426827, "loss": 1.0695, "step": 139 }, { "epoch": 0.34, "learning_rate": 0.00012283845372799884, "loss": 0.9092, "step": 140 }, { "epoch": 0.34, "learning_rate": 0.00012240208128764027, "loss": 0.7535, "step": 141 }, { "epoch": 0.34, "learning_rate": 0.00012197032656913024, "loss": 0.7952, "step": 142 }, { "epoch": 0.34, "learning_rate": 0.00012154310870109942, "loss": 0.8747, "step": 143 }, { "epoch": 0.35, "learning_rate": 0.0001211203487812386, "loss": 0.823, "step": 144 }, { "epoch": 0.35, "learning_rate": 0.0001207019698150837, "loss": 0.838, "step": 145 }, { "epoch": 0.35, "learning_rate": 0.00012028789665711085, "loss": 0.8352, "step": 146 }, { "epoch": 0.35, "learning_rate": 0.00011987805595403907, "loss": 0.9483, "step": 147 }, { "epoch": 0.35, "learning_rate": 0.00011947237609024377, "loss": 0.8841, "step": 148 }, { "epoch": 0.36, "learning_rate": 0.00011907078713518815, "loss": 1.0176, "step": 149 }, { "epoch": 0.36, "learning_rate": 0.00011867322079278597, "loss": 0.9113, "step": 150 }, { "epoch": 0.36, "learning_rate": 0.00011827961035261132, "loss": 0.8683, "step": 151 }, { "epoch": 0.36, "learning_rate": 0.0001178898906428769, "loss": 0.8158, "step": 152 }, { "epoch": 0.37, "learning_rate": 0.0001175039979851054, "loss": 0.8146, "step": 153 }, { "epoch": 0.37, "learning_rate": 0.00011712187015042266, "loss": 0.8513, "step": 154 }, { "epoch": 0.37, "learning_rate": 0.00011674344631740369, "loss": 0.8071, "step": 155 }, { "epoch": 0.37, "learning_rate": 0.00011636866703140783, "loss": 0.8923, "step": 156 }, { "epoch": 0.38, "learning_rate": 0.00011599747416534057, "loss": 0.9082, "step": 157 }, { "epoch": 0.38, "learning_rate": 0.00011562981088178324, "loss": 0.8323, "step": 158 }, { "epoch": 0.38, "learning_rate": 0.00011526562159643515, "loss": 0.8079, "step": 159 }, { "epoch": 0.38, "learning_rate": 0.00011490485194281395, "loss": 0.8623, "step": 160 }, { "epoch": 0.39, "learning_rate": 0.00011454744873816422, "loss": 0.8465, "step": 161 }, { "epoch": 0.39, "learning_rate": 0.0001141933599505248, "loss": 0.9027, "step": 162 }, { "epoch": 0.39, "learning_rate": 0.00011384253466690954, "loss": 0.907, "step": 163 }, { "epoch": 0.39, "learning_rate": 0.00011349492306255647, "loss": 0.9152, "step": 164 }, { "epoch": 0.4, "learning_rate": 0.0001131504763712036, "loss": 0.7418, "step": 165 }, { "epoch": 0.4, "learning_rate": 0.00011280914685635128, "loss": 0.8328, "step": 166 }, { "epoch": 0.4, "learning_rate": 0.0001124708877834722, "loss": 0.9287, "step": 167 }, { "epoch": 0.4, "learning_rate": 0.00011213565339313254, "loss": 0.7967, "step": 168 }, { "epoch": 0.41, "learning_rate": 0.00011180339887498949, "loss": 0.84, "step": 169 }, { "epoch": 0.41, "learning_rate": 0.00011147408034263073, "loss": 0.8149, "step": 170 }, { "epoch": 0.41, "learning_rate": 0.00011114765480922503, "loss": 0.8555, "step": 171 }, { "epoch": 0.41, "learning_rate": 0.00011082408016395194, "loss": 0.814, "step": 172 }, { "epoch": 0.41, "learning_rate": 0.00011050331514918246, "loss": 0.8139, "step": 173 }, { "epoch": 0.42, "learning_rate": 0.0001101853193383817, "loss": 0.885, "step": 174 }, { "epoch": 0.42, "learning_rate": 0.00010987005311470715, "loss": 0.7682, "step": 175 }, { "epoch": 0.42, "learning_rate": 0.00010955747765027683, "loss": 0.8266, "step": 176 }, { "epoch": 0.42, "learning_rate": 0.00010924755488608232, "loss": 0.8699, "step": 177 }, { "epoch": 0.43, "learning_rate": 0.00010894024751252352, "loss": 0.957, "step": 178 }, { "epoch": 0.43, "learning_rate": 0.00010863551895054227, "loss": 0.854, "step": 179 }, { "epoch": 0.43, "learning_rate": 0.00010833333333333333, "loss": 0.7239, "step": 180 }, { "epoch": 0.43, "learning_rate": 0.00010803365548861171, "loss": 0.7825, "step": 181 }, { "epoch": 0.44, "learning_rate": 0.00010773645092141682, "loss": 0.8531, "step": 182 }, { "epoch": 0.44, "learning_rate": 0.00010744168579743401, "loss": 0.7602, "step": 183 }, { "epoch": 0.44, "learning_rate": 0.0001071493269268154, "loss": 0.8768, "step": 184 }, { "epoch": 0.44, "learning_rate": 0.00010685934174848223, "loss": 0.8294, "step": 185 }, { "epoch": 0.45, "learning_rate": 0.00010657169831489234, "loss": 0.8872, "step": 186 }, { "epoch": 0.45, "learning_rate": 0.0001062863652772559, "loss": 0.7016, "step": 187 }, { "epoch": 0.45, "learning_rate": 0.00010600331187118435, "loss": 0.8942, "step": 188 }, { "epoch": 0.45, "learning_rate": 0.00010572250790275775, "loss": 0.7416, "step": 189 }, { "epoch": 0.46, "learning_rate": 0.00010544392373499565, "loss": 0.8104, "step": 190 }, { "epoch": 0.46, "learning_rate": 0.0001051675302747182, "loss": 0.8349, "step": 191 }, { "epoch": 0.46, "learning_rate": 0.0001048932989597842, "loss": 0.9013, "step": 192 }, { "epoch": 0.46, "learning_rate": 0.00010462120174669319, "loss": 0.7646, "step": 193 }, { "epoch": 0.47, "learning_rate": 0.00010435121109853953, "loss": 0.8087, "step": 194 }, { "epoch": 0.47, "learning_rate": 0.00010408329997330662, "loss": 0.9798, "step": 195 }, { "epoch": 0.47, "learning_rate": 0.00010381744181249024, "loss": 0.7266, "step": 196 }, { "epoch": 0.47, "learning_rate": 0.0001035536105300395, "loss": 0.8502, "step": 197 }, { "epoch": 0.47, "learning_rate": 0.00010329178050160582, "loss": 0.7797, "step": 198 }, { "epoch": 0.48, "learning_rate": 0.00010303192655408924, "loss": 0.7328, "step": 199 }, { "epoch": 0.48, "learning_rate": 0.00010277402395547232, "loss": 0.7916, "step": 200 } ], "logging_steps": 1, "max_steps": 417, "num_train_epochs": 1, "save_steps": 10, "total_flos": 6.487869967879373e+16, "trial_name": null, "trial_params": null }