{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 840, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.0, "loss": 5.3457, "step": 4 }, { "epoch": 0.01, "learning_rate": 4e-08, "loss": 5.2606, "step": 8 }, { "epoch": 0.01, "learning_rate": 7e-08, "loss": 5.0776, "step": 12 }, { "epoch": 0.02, "learning_rate": 1e-07, "loss": 5.149, "step": 16 }, { "epoch": 0.02, "learning_rate": 1.4e-07, "loss": 5.3724, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.7000000000000001e-07, "loss": 5.2246, "step": 24 }, { "epoch": 0.03, "learning_rate": 2.0999999999999997e-07, "loss": 5.032, "step": 28 }, { "epoch": 0.04, "learning_rate": 2.5e-07, "loss": 5.0639, "step": 32 }, { "epoch": 0.04, "learning_rate": 2.9e-07, "loss": 4.5604, "step": 36 }, { "epoch": 0.05, "learning_rate": 3.3e-07, "loss": 4.1731, "step": 40 }, { "epoch": 0.05, "learning_rate": 3.7e-07, "loss": 4.1152, "step": 44 }, { "epoch": 0.06, "learning_rate": 4.0999999999999994e-07, "loss": 4.101, "step": 48 }, { "epoch": 0.06, "learning_rate": 4.5e-07, "loss": 3.8602, "step": 52 }, { "epoch": 0.07, "learning_rate": 4.9e-07, "loss": 3.7321, "step": 56 }, { "epoch": 0.07, "learning_rate": 5.3e-07, "loss": 3.6759, "step": 60 }, { "epoch": 0.08, "learning_rate": 5.699999999999999e-07, "loss": 3.6372, "step": 64 }, { "epoch": 0.08, "learning_rate": 6.1e-07, "loss": 3.5755, "step": 68 }, { "epoch": 0.09, "learning_rate": 6.5e-07, "loss": 3.415, "step": 72 }, { "epoch": 0.09, "learning_rate": 6.9e-07, "loss": 3.215, "step": 76 }, { "epoch": 0.1, "learning_rate": 7.3e-07, "loss": 2.8689, "step": 80 }, { "epoch": 0.1, "learning_rate": 7.699999999999999e-07, "loss": 2.843, "step": 84 }, { "epoch": 0.1, "learning_rate": 8.1e-07, "loss": 2.9887, "step": 88 }, { "epoch": 0.11, "learning_rate": 8.499999999999999e-07, "loss": 2.6316, "step": 92 }, { "epoch": 0.11, "learning_rate": 8.9e-07, "loss": 2.4875, "step": 96 }, { "epoch": 0.12, "learning_rate": 9.3e-07, "loss": 2.4555, "step": 100 }, { "epoch": 0.12, "learning_rate": 9.7e-07, "loss": 2.3976, "step": 104 }, { "epoch": 0.13, "learning_rate": 9.99367088607595e-07, "loss": 2.2534, "step": 108 }, { "epoch": 0.13, "learning_rate": 9.968354430379747e-07, "loss": 2.1527, "step": 112 }, { "epoch": 0.14, "learning_rate": 9.943037974683544e-07, "loss": 2.0296, "step": 116 }, { "epoch": 0.14, "learning_rate": 9.91772151898734e-07, "loss": 2.0501, "step": 120 }, { "epoch": 0.15, "learning_rate": 9.89240506329114e-07, "loss": 2.0385, "step": 124 }, { "epoch": 0.15, "learning_rate": 9.867088607594938e-07, "loss": 1.9757, "step": 128 }, { "epoch": 0.16, "learning_rate": 9.841772151898733e-07, "loss": 1.7414, "step": 132 }, { "epoch": 0.16, "learning_rate": 9.81645569620253e-07, "loss": 1.7658, "step": 136 }, { "epoch": 0.17, "learning_rate": 9.791139240506329e-07, "loss": 1.6414, "step": 140 }, { "epoch": 0.17, "learning_rate": 9.765822784810126e-07, "loss": 1.7797, "step": 144 }, { "epoch": 0.18, "learning_rate": 9.740506329113924e-07, "loss": 1.5513, "step": 148 }, { "epoch": 0.18, "learning_rate": 9.715189873417722e-07, "loss": 1.5515, "step": 152 }, { "epoch": 0.19, "learning_rate": 9.689873417721517e-07, "loss": 1.471, "step": 156 }, { "epoch": 0.19, "learning_rate": 9.664556962025317e-07, "loss": 1.3805, "step": 160 }, { "epoch": 0.2, "learning_rate": 9.639240506329115e-07, "loss": 1.3442, "step": 164 }, { "epoch": 0.2, "learning_rate": 9.61392405063291e-07, "loss": 1.2072, "step": 168 }, { "epoch": 0.2, "learning_rate": 9.588607594936708e-07, "loss": 1.2704, "step": 172 }, { "epoch": 0.21, "learning_rate": 9.563291139240506e-07, "loss": 1.3914, "step": 176 }, { "epoch": 0.21, "learning_rate": 9.537974683544304e-07, "loss": 1.2708, "step": 180 }, { "epoch": 0.22, "learning_rate": 9.5126582278481e-07, "loss": 1.2163, "step": 184 }, { "epoch": 0.22, "learning_rate": 9.487341772151898e-07, "loss": 1.2604, "step": 188 }, { "epoch": 0.23, "learning_rate": 9.462025316455697e-07, "loss": 1.0958, "step": 192 }, { "epoch": 0.23, "learning_rate": 9.436708860759493e-07, "loss": 1.1216, "step": 196 }, { "epoch": 0.24, "learning_rate": 9.411392405063291e-07, "loss": 1.1185, "step": 200 }, { "epoch": 0.24, "learning_rate": 9.386075949367089e-07, "loss": 1.0496, "step": 204 }, { "epoch": 0.25, "learning_rate": 9.360759493670885e-07, "loss": 0.9438, "step": 208 }, { "epoch": 0.25, "learning_rate": 9.335443037974683e-07, "loss": 0.8912, "step": 212 }, { "epoch": 0.26, "learning_rate": 9.31012658227848e-07, "loss": 0.8424, "step": 216 }, { "epoch": 0.26, "learning_rate": 9.284810126582277e-07, "loss": 0.9812, "step": 220 }, { "epoch": 0.27, "learning_rate": 9.259493670886076e-07, "loss": 0.8999, "step": 224 }, { "epoch": 0.27, "learning_rate": 9.234177215189873e-07, "loss": 0.7623, "step": 228 }, { "epoch": 0.28, "learning_rate": 9.208860759493671e-07, "loss": 0.6592, "step": 232 }, { "epoch": 0.28, "learning_rate": 9.183544303797468e-07, "loss": 0.7013, "step": 236 }, { "epoch": 0.29, "learning_rate": 9.158227848101265e-07, "loss": 0.5614, "step": 240 }, { "epoch": 0.29, "learning_rate": 9.132911392405063e-07, "loss": 0.5246, "step": 244 }, { "epoch": 0.3, "learning_rate": 9.10759493670886e-07, "loss": 0.3732, "step": 248 }, { "epoch": 0.3, "learning_rate": 9.082278481012657e-07, "loss": 0.3963, "step": 252 }, { "epoch": 0.3, "learning_rate": 9.056962025316456e-07, "loss": 0.3281, "step": 256 }, { "epoch": 0.31, "learning_rate": 9.031645569620254e-07, "loss": 0.3707, "step": 260 }, { "epoch": 0.31, "learning_rate": 9.00632911392405e-07, "loss": 0.3442, "step": 264 }, { "epoch": 0.32, "learning_rate": 8.981012658227848e-07, "loss": 0.286, "step": 268 }, { "epoch": 0.32, "learning_rate": 8.955696202531646e-07, "loss": 0.2778, "step": 272 }, { "epoch": 0.33, "learning_rate": 8.930379746835442e-07, "loss": 0.3224, "step": 276 }, { "epoch": 0.33, "learning_rate": 8.90506329113924e-07, "loss": 0.2294, "step": 280 }, { "epoch": 0.34, "learning_rate": 8.879746835443038e-07, "loss": 0.3471, "step": 284 }, { "epoch": 0.34, "learning_rate": 8.854430379746834e-07, "loss": 0.2537, "step": 288 }, { "epoch": 0.35, "learning_rate": 8.829113924050633e-07, "loss": 0.3189, "step": 292 }, { "epoch": 0.35, "learning_rate": 8.803797468354431e-07, "loss": 0.2704, "step": 296 }, { "epoch": 0.36, "learning_rate": 8.778481012658227e-07, "loss": 0.2685, "step": 300 }, { "epoch": 0.36, "learning_rate": 8.753164556962025e-07, "loss": 0.4016, "step": 304 }, { "epoch": 0.37, "learning_rate": 8.727848101265822e-07, "loss": 0.2614, "step": 308 }, { "epoch": 0.37, "learning_rate": 8.70253164556962e-07, "loss": 0.2208, "step": 312 }, { "epoch": 0.38, "learning_rate": 8.677215189873417e-07, "loss": 0.3452, "step": 316 }, { "epoch": 0.38, "learning_rate": 8.651898734177214e-07, "loss": 0.2481, "step": 320 }, { "epoch": 0.39, "learning_rate": 8.626582278481013e-07, "loss": 0.2698, "step": 324 }, { "epoch": 0.39, "learning_rate": 8.60126582278481e-07, "loss": 0.2386, "step": 328 }, { "epoch": 0.4, "learning_rate": 8.575949367088607e-07, "loss": 0.27, "step": 332 }, { "epoch": 0.4, "learning_rate": 8.550632911392405e-07, "loss": 0.4073, "step": 336 }, { "epoch": 0.4, "learning_rate": 8.525316455696202e-07, "loss": 0.263, "step": 340 }, { "epoch": 0.41, "learning_rate": 8.499999999999999e-07, "loss": 0.2443, "step": 344 }, { "epoch": 0.41, "learning_rate": 8.474683544303797e-07, "loss": 0.1876, "step": 348 }, { "epoch": 0.42, "learning_rate": 8.449367088607595e-07, "loss": 0.1893, "step": 352 }, { "epoch": 0.42, "learning_rate": 8.424050632911392e-07, "loss": 0.356, "step": 356 }, { "epoch": 0.43, "learning_rate": 8.39873417721519e-07, "loss": 0.3026, "step": 360 }, { "epoch": 0.43, "learning_rate": 8.373417721518988e-07, "loss": 0.2265, "step": 364 }, { "epoch": 0.44, "learning_rate": 8.348101265822784e-07, "loss": 0.3389, "step": 368 }, { "epoch": 0.44, "learning_rate": 8.322784810126582e-07, "loss": 0.274, "step": 372 }, { "epoch": 0.45, "learning_rate": 8.297468354430379e-07, "loss": 0.3028, "step": 376 }, { "epoch": 0.45, "learning_rate": 8.272151898734176e-07, "loss": 0.2582, "step": 380 }, { "epoch": 0.46, "learning_rate": 8.246835443037974e-07, "loss": 0.253, "step": 384 }, { "epoch": 0.46, "learning_rate": 8.221518987341772e-07, "loss": 0.2579, "step": 388 }, { "epoch": 0.47, "learning_rate": 8.19620253164557e-07, "loss": 0.3189, "step": 392 }, { "epoch": 0.47, "learning_rate": 8.170886075949367e-07, "loss": 0.2599, "step": 396 }, { "epoch": 0.48, "learning_rate": 8.145569620253164e-07, "loss": 0.2791, "step": 400 }, { "epoch": 0.48, "learning_rate": 8.120253164556962e-07, "loss": 0.3284, "step": 404 }, { "epoch": 0.49, "learning_rate": 8.094936708860759e-07, "loss": 0.3026, "step": 408 }, { "epoch": 0.49, "learning_rate": 8.069620253164556e-07, "loss": 0.3195, "step": 412 }, { "epoch": 0.5, "learning_rate": 8.044303797468354e-07, "loss": 0.2776, "step": 416 }, { "epoch": 0.5, "learning_rate": 8.018987341772152e-07, "loss": 0.3193, "step": 420 }, { "epoch": 0.5, "learning_rate": 7.993670886075949e-07, "loss": 0.2963, "step": 424 }, { "epoch": 0.51, "learning_rate": 7.968354430379747e-07, "loss": 0.2865, "step": 428 }, { "epoch": 0.51, "learning_rate": 7.943037974683544e-07, "loss": 0.3051, "step": 432 }, { "epoch": 0.52, "learning_rate": 7.917721518987341e-07, "loss": 0.2691, "step": 436 }, { "epoch": 0.52, "learning_rate": 7.892405063291139e-07, "loss": 0.2542, "step": 440 }, { "epoch": 0.53, "learning_rate": 7.867088607594937e-07, "loss": 0.3001, "step": 444 }, { "epoch": 0.53, "learning_rate": 7.841772151898733e-07, "loss": 0.2793, "step": 448 }, { "epoch": 0.54, "learning_rate": 7.816455696202531e-07, "loss": 0.2546, "step": 452 }, { "epoch": 0.54, "learning_rate": 7.79113924050633e-07, "loss": 0.2801, "step": 456 }, { "epoch": 0.55, "learning_rate": 7.765822784810126e-07, "loss": 0.2635, "step": 460 }, { "epoch": 0.55, "learning_rate": 7.740506329113924e-07, "loss": 0.2678, "step": 464 }, { "epoch": 0.56, "learning_rate": 7.715189873417721e-07, "loss": 0.379, "step": 468 }, { "epoch": 0.56, "learning_rate": 7.689873417721518e-07, "loss": 0.3077, "step": 472 }, { "epoch": 0.57, "learning_rate": 7.664556962025316e-07, "loss": 0.1788, "step": 476 }, { "epoch": 0.57, "learning_rate": 7.639240506329113e-07, "loss": 0.3147, "step": 480 }, { "epoch": 0.58, "learning_rate": 7.61392405063291e-07, "loss": 0.3064, "step": 484 }, { "epoch": 0.58, "learning_rate": 7.588607594936709e-07, "loss": 0.2644, "step": 488 }, { "epoch": 0.59, "learning_rate": 7.563291139240506e-07, "loss": 0.1713, "step": 492 }, { "epoch": 0.59, "learning_rate": 7.537974683544304e-07, "loss": 0.2332, "step": 496 }, { "epoch": 0.6, "learning_rate": 7.512658227848101e-07, "loss": 0.3326, "step": 500 }, { "epoch": 0.6, "learning_rate": 7.487341772151898e-07, "loss": 0.3293, "step": 504 }, { "epoch": 0.6, "learning_rate": 7.462025316455696e-07, "loss": 0.1996, "step": 508 }, { "epoch": 0.61, "learning_rate": 7.436708860759493e-07, "loss": 0.2282, "step": 512 }, { "epoch": 0.61, "learning_rate": 7.41139240506329e-07, "loss": 0.2591, "step": 516 }, { "epoch": 0.62, "learning_rate": 7.386075949367089e-07, "loss": 0.2376, "step": 520 }, { "epoch": 0.62, "learning_rate": 7.360759493670887e-07, "loss": 0.2115, "step": 524 }, { "epoch": 0.63, "learning_rate": 7.335443037974683e-07, "loss": 0.2515, "step": 528 }, { "epoch": 0.63, "learning_rate": 7.310126582278481e-07, "loss": 0.3904, "step": 532 }, { "epoch": 0.64, "learning_rate": 7.284810126582279e-07, "loss": 0.2693, "step": 536 }, { "epoch": 0.64, "learning_rate": 7.259493670886075e-07, "loss": 0.2653, "step": 540 }, { "epoch": 0.65, "learning_rate": 7.234177215189873e-07, "loss": 0.2901, "step": 544 }, { "epoch": 0.65, "learning_rate": 7.20886075949367e-07, "loss": 0.3418, "step": 548 }, { "epoch": 0.66, "learning_rate": 7.183544303797468e-07, "loss": 0.2681, "step": 552 }, { "epoch": 0.66, "learning_rate": 7.158227848101266e-07, "loss": 0.2793, "step": 556 }, { "epoch": 0.67, "learning_rate": 7.132911392405063e-07, "loss": 0.2777, "step": 560 }, { "epoch": 0.67, "learning_rate": 7.10759493670886e-07, "loss": 0.1978, "step": 564 }, { "epoch": 0.68, "learning_rate": 7.082278481012658e-07, "loss": 0.2681, "step": 568 }, { "epoch": 0.68, "learning_rate": 7.056962025316455e-07, "loss": 0.2808, "step": 572 }, { "epoch": 0.69, "learning_rate": 7.031645569620253e-07, "loss": 0.2718, "step": 576 }, { "epoch": 0.69, "learning_rate": 7.00632911392405e-07, "loss": 0.2197, "step": 580 }, { "epoch": 0.7, "learning_rate": 6.981012658227847e-07, "loss": 0.2063, "step": 584 }, { "epoch": 0.7, "learning_rate": 6.955696202531646e-07, "loss": 0.2994, "step": 588 }, { "epoch": 0.7, "learning_rate": 6.930379746835443e-07, "loss": 0.3452, "step": 592 }, { "epoch": 0.71, "learning_rate": 6.90506329113924e-07, "loss": 0.2579, "step": 596 }, { "epoch": 0.71, "learning_rate": 6.879746835443038e-07, "loss": 0.2208, "step": 600 }, { "epoch": 0.72, "learning_rate": 6.854430379746835e-07, "loss": 0.2463, "step": 604 }, { "epoch": 0.72, "learning_rate": 6.829113924050632e-07, "loss": 0.2835, "step": 608 }, { "epoch": 0.73, "learning_rate": 6.80379746835443e-07, "loss": 0.2034, "step": 612 }, { "epoch": 0.73, "learning_rate": 6.778481012658228e-07, "loss": 0.2464, "step": 616 }, { "epoch": 0.74, "learning_rate": 6.753164556962025e-07, "loss": 0.2682, "step": 620 }, { "epoch": 0.74, "learning_rate": 6.727848101265823e-07, "loss": 0.285, "step": 624 }, { "epoch": 0.75, "learning_rate": 6.702531645569621e-07, "loss": 0.2276, "step": 628 }, { "epoch": 0.75, "learning_rate": 6.677215189873417e-07, "loss": 0.2754, "step": 632 }, { "epoch": 0.76, "learning_rate": 6.651898734177215e-07, "loss": 0.2306, "step": 636 }, { "epoch": 0.76, "learning_rate": 6.626582278481012e-07, "loss": 0.2431, "step": 640 }, { "epoch": 0.77, "learning_rate": 6.601265822784809e-07, "loss": 0.2079, "step": 644 }, { "epoch": 0.77, "learning_rate": 6.575949367088607e-07, "loss": 0.2426, "step": 648 }, { "epoch": 0.78, "learning_rate": 6.550632911392405e-07, "loss": 0.2854, "step": 652 }, { "epoch": 0.78, "learning_rate": 6.525316455696203e-07, "loss": 0.3083, "step": 656 }, { "epoch": 0.79, "learning_rate": 6.5e-07, "loss": 0.2305, "step": 660 }, { "epoch": 0.79, "learning_rate": 6.474683544303797e-07, "loss": 0.2648, "step": 664 }, { "epoch": 0.8, "learning_rate": 6.449367088607595e-07, "loss": 0.2992, "step": 668 }, { "epoch": 0.8, "learning_rate": 6.424050632911392e-07, "loss": 0.3059, "step": 672 }, { "epoch": 0.8, "learning_rate": 6.398734177215189e-07, "loss": 0.2373, "step": 676 }, { "epoch": 0.81, "learning_rate": 6.373417721518987e-07, "loss": 0.2808, "step": 680 }, { "epoch": 0.81, "learning_rate": 6.348101265822785e-07, "loss": 0.3125, "step": 684 }, { "epoch": 0.82, "learning_rate": 6.322784810126582e-07, "loss": 0.1994, "step": 688 }, { "epoch": 0.82, "learning_rate": 6.29746835443038e-07, "loss": 0.1612, "step": 692 }, { "epoch": 0.83, "learning_rate": 6.272151898734178e-07, "loss": 0.2562, "step": 696 }, { "epoch": 0.83, "learning_rate": 6.246835443037974e-07, "loss": 0.3455, "step": 700 }, { "epoch": 0.84, "learning_rate": 6.221518987341772e-07, "loss": 0.2547, "step": 704 }, { "epoch": 0.84, "learning_rate": 6.196202531645569e-07, "loss": 0.2416, "step": 708 }, { "epoch": 0.85, "learning_rate": 6.170886075949366e-07, "loss": 0.2431, "step": 712 }, { "epoch": 0.85, "learning_rate": 6.145569620253165e-07, "loss": 0.3004, "step": 716 }, { "epoch": 0.86, "learning_rate": 6.120253164556962e-07, "loss": 0.378, "step": 720 }, { "epoch": 0.86, "learning_rate": 6.094936708860759e-07, "loss": 0.2185, "step": 724 }, { "epoch": 0.87, "learning_rate": 6.069620253164557e-07, "loss": 0.2064, "step": 728 }, { "epoch": 0.87, "learning_rate": 6.044303797468354e-07, "loss": 0.2867, "step": 732 }, { "epoch": 0.88, "learning_rate": 6.018987341772151e-07, "loss": 0.178, "step": 736 }, { "epoch": 0.88, "learning_rate": 5.993670886075949e-07, "loss": 0.2377, "step": 740 }, { "epoch": 0.89, "learning_rate": 5.968354430379746e-07, "loss": 0.2475, "step": 744 }, { "epoch": 0.89, "learning_rate": 5.943037974683544e-07, "loss": 0.2546, "step": 748 }, { "epoch": 0.9, "learning_rate": 5.917721518987342e-07, "loss": 0.246, "step": 752 }, { "epoch": 0.9, "learning_rate": 5.892405063291139e-07, "loss": 0.1832, "step": 756 }, { "epoch": 0.9, "learning_rate": 5.867088607594937e-07, "loss": 0.2818, "step": 760 }, { "epoch": 0.91, "learning_rate": 5.841772151898734e-07, "loss": 0.3186, "step": 764 }, { "epoch": 0.91, "learning_rate": 5.816455696202531e-07, "loss": 0.1874, "step": 768 }, { "epoch": 0.92, "learning_rate": 5.791139240506329e-07, "loss": 0.2877, "step": 772 }, { "epoch": 0.92, "learning_rate": 5.765822784810126e-07, "loss": 0.2143, "step": 776 }, { "epoch": 0.93, "learning_rate": 5.740506329113923e-07, "loss": 0.1791, "step": 780 }, { "epoch": 0.93, "learning_rate": 5.715189873417722e-07, "loss": 0.2619, "step": 784 }, { "epoch": 0.94, "learning_rate": 5.68987341772152e-07, "loss": 0.2721, "step": 788 }, { "epoch": 0.94, "learning_rate": 5.664556962025316e-07, "loss": 0.2182, "step": 792 }, { "epoch": 0.95, "learning_rate": 5.639240506329114e-07, "loss": 0.3081, "step": 796 }, { "epoch": 0.95, "learning_rate": 5.613924050632911e-07, "loss": 0.1964, "step": 800 }, { "epoch": 0.96, "learning_rate": 5.588607594936708e-07, "loss": 0.221, "step": 804 }, { "epoch": 0.96, "learning_rate": 5.563291139240506e-07, "loss": 0.2069, "step": 808 }, { "epoch": 0.97, "learning_rate": 5.537974683544303e-07, "loss": 0.294, "step": 812 }, { "epoch": 0.97, "learning_rate": 5.512658227848101e-07, "loss": 0.1952, "step": 816 }, { "epoch": 0.98, "learning_rate": 5.487341772151899e-07, "loss": 0.2268, "step": 820 }, { "epoch": 0.98, "learning_rate": 5.462025316455696e-07, "loss": 0.232, "step": 824 }, { "epoch": 0.99, "learning_rate": 5.436708860759493e-07, "loss": 0.2639, "step": 828 }, { "epoch": 0.99, "learning_rate": 5.411392405063291e-07, "loss": 0.1895, "step": 832 }, { "epoch": 1.0, "learning_rate": 5.386075949367088e-07, "loss": 0.2694, "step": 836 }, { "epoch": 1.0, "learning_rate": 5.360759493670886e-07, "loss": 0.2513, "step": 840 } ], "logging_steps": 4, "max_steps": 1680, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "total_flos": 1.371080632762368e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }