{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.246376811594203, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07246376811594203, "grad_norm": 1.0494204759597778, "learning_rate": 0.00019800000000000002, "loss": 4.2586, "step": 10 }, { "epoch": 0.14492753623188406, "grad_norm": 2.3070778846740723, "learning_rate": 0.000196, "loss": 3.4035, "step": 20 }, { "epoch": 0.21739130434782608, "grad_norm": 3.2612266540527344, "learning_rate": 0.000194, "loss": 2.6736, "step": 30 }, { "epoch": 0.2898550724637681, "grad_norm": 2.4876601696014404, "learning_rate": 0.0001922, "loss": 2.1857, "step": 40 }, { "epoch": 0.36231884057971014, "grad_norm": 2.367360830307007, "learning_rate": 0.0001902, "loss": 2.0017, "step": 50 }, { "epoch": 0.43478260869565216, "grad_norm": 1.9053794145584106, "learning_rate": 0.0001882, "loss": 1.7045, "step": 60 }, { "epoch": 0.5072463768115942, "grad_norm": 2.813183069229126, "learning_rate": 0.00018620000000000003, "loss": 1.777, "step": 70 }, { "epoch": 0.5797101449275363, "grad_norm": 1.703532099723816, "learning_rate": 0.0001842, "loss": 1.8335, "step": 80 }, { "epoch": 0.6521739130434783, "grad_norm": 2.5876986980438232, "learning_rate": 0.0001822, "loss": 1.932, "step": 90 }, { "epoch": 0.7246376811594203, "grad_norm": 2.3653035163879395, "learning_rate": 0.00018020000000000002, "loss": 1.5939, "step": 100 }, { "epoch": 0.7971014492753623, "grad_norm": 1.8726710081100464, "learning_rate": 0.00017820000000000002, "loss": 1.816, "step": 110 }, { "epoch": 0.8695652173913043, "grad_norm": 1.8513643741607666, "learning_rate": 0.0001762, "loss": 1.4941, "step": 120 }, { "epoch": 0.9420289855072463, "grad_norm": 2.2692129611968994, "learning_rate": 0.0001742, "loss": 1.5971, "step": 130 }, { "epoch": 1.0144927536231885, "grad_norm": 2.6734588146209717, "learning_rate": 0.0001722, "loss": 1.6418, "step": 140 }, { "epoch": 1.0869565217391304, "grad_norm": 3.8182976245880127, "learning_rate": 0.00017020000000000002, "loss": 1.4724, "step": 150 }, { "epoch": 1.1594202898550725, "grad_norm": 2.3226006031036377, "learning_rate": 0.0001682, "loss": 1.3599, "step": 160 }, { "epoch": 1.2318840579710144, "grad_norm": 6.185395240783691, "learning_rate": 0.0001662, "loss": 1.3629, "step": 170 }, { "epoch": 1.3043478260869565, "grad_norm": 2.1148927211761475, "learning_rate": 0.0001642, "loss": 1.3997, "step": 180 }, { "epoch": 1.3768115942028984, "grad_norm": 3.705061197280884, "learning_rate": 0.0001622, "loss": 1.3026, "step": 190 }, { "epoch": 1.4492753623188406, "grad_norm": 3.832019090652466, "learning_rate": 0.00016020000000000002, "loss": 1.5701, "step": 200 }, { "epoch": 1.5217391304347827, "grad_norm": 2.6830215454101562, "learning_rate": 0.00015820000000000002, "loss": 1.6283, "step": 210 }, { "epoch": 1.5942028985507246, "grad_norm": 4.285362243652344, "learning_rate": 0.0001562, "loss": 1.3493, "step": 220 }, { "epoch": 1.6666666666666665, "grad_norm": 3.6948392391204834, "learning_rate": 0.0001542, "loss": 1.3661, "step": 230 }, { "epoch": 1.7391304347826086, "grad_norm": 3.770359992980957, "learning_rate": 0.0001522, "loss": 1.2724, "step": 240 }, { "epoch": 1.8115942028985508, "grad_norm": 4.342422008514404, "learning_rate": 0.00015020000000000002, "loss": 1.2295, "step": 250 }, { "epoch": 1.8840579710144927, "grad_norm": 4.003652572631836, "learning_rate": 0.0001482, "loss": 1.3862, "step": 260 }, { "epoch": 1.9565217391304348, "grad_norm": 3.8167941570281982, "learning_rate": 0.0001462, "loss": 1.3623, "step": 270 }, { "epoch": 2.028985507246377, "grad_norm": 4.051051139831543, "learning_rate": 0.0001442, "loss": 0.9666, "step": 280 }, { "epoch": 2.101449275362319, "grad_norm": 2.2572319507598877, "learning_rate": 0.0001422, "loss": 0.9071, "step": 290 }, { "epoch": 2.1739130434782608, "grad_norm": 5.0580878257751465, "learning_rate": 0.0001402, "loss": 0.9925, "step": 300 }, { "epoch": 2.246376811594203, "grad_norm": 4.157490253448486, "learning_rate": 0.0001382, "loss": 1.2288, "step": 310 }, { "epoch": 2.318840579710145, "grad_norm": 4.6029510498046875, "learning_rate": 0.0001362, "loss": 0.8646, "step": 320 }, { "epoch": 2.391304347826087, "grad_norm": 6.775791645050049, "learning_rate": 0.0001342, "loss": 1.0872, "step": 330 }, { "epoch": 2.463768115942029, "grad_norm": 4.0972113609313965, "learning_rate": 0.00013220000000000001, "loss": 0.9001, "step": 340 }, { "epoch": 2.536231884057971, "grad_norm": 8.093110084533691, "learning_rate": 0.00013020000000000002, "loss": 0.9329, "step": 350 }, { "epoch": 2.608695652173913, "grad_norm": 5.541107177734375, "learning_rate": 0.0001282, "loss": 1.1102, "step": 360 }, { "epoch": 2.681159420289855, "grad_norm": 4.768208980560303, "learning_rate": 0.0001262, "loss": 0.9413, "step": 370 }, { "epoch": 2.753623188405797, "grad_norm": 6.943519115447998, "learning_rate": 0.0001242, "loss": 1.1666, "step": 380 }, { "epoch": 2.8260869565217392, "grad_norm": 4.54674768447876, "learning_rate": 0.00012220000000000002, "loss": 0.9934, "step": 390 }, { "epoch": 2.898550724637681, "grad_norm": 4.663645267486572, "learning_rate": 0.00012020000000000001, "loss": 1.2474, "step": 400 }, { "epoch": 2.971014492753623, "grad_norm": 4.170300483703613, "learning_rate": 0.0001182, "loss": 1.1413, "step": 410 }, { "epoch": 3.0434782608695654, "grad_norm": 3.5200247764587402, "learning_rate": 0.00011619999999999999, "loss": 1.1336, "step": 420 }, { "epoch": 3.1159420289855073, "grad_norm": 6.17999792098999, "learning_rate": 0.0001142, "loss": 0.8005, "step": 430 }, { "epoch": 3.1884057971014492, "grad_norm": 6.850672245025635, "learning_rate": 0.00011220000000000002, "loss": 0.8476, "step": 440 }, { "epoch": 3.260869565217391, "grad_norm": 5.512606620788574, "learning_rate": 0.00011020000000000001, "loss": 0.9174, "step": 450 }, { "epoch": 3.3333333333333335, "grad_norm": 5.102043628692627, "learning_rate": 0.00010820000000000001, "loss": 0.7113, "step": 460 }, { "epoch": 3.4057971014492754, "grad_norm": 3.110646963119507, "learning_rate": 0.0001062, "loss": 0.752, "step": 470 }, { "epoch": 3.4782608695652173, "grad_norm": 4.310419082641602, "learning_rate": 0.00010420000000000001, "loss": 0.8261, "step": 480 }, { "epoch": 3.550724637681159, "grad_norm": 6.366318225860596, "learning_rate": 0.0001022, "loss": 0.793, "step": 490 }, { "epoch": 3.6231884057971016, "grad_norm": 7.170370578765869, "learning_rate": 0.00010020000000000001, "loss": 1.0607, "step": 500 }, { "epoch": 3.6956521739130435, "grad_norm": 5.464928150177002, "learning_rate": 9.82e-05, "loss": 0.5825, "step": 510 }, { "epoch": 3.7681159420289854, "grad_norm": 5.7281951904296875, "learning_rate": 9.620000000000001e-05, "loss": 0.786, "step": 520 }, { "epoch": 3.8405797101449277, "grad_norm": 5.510980129241943, "learning_rate": 9.42e-05, "loss": 0.6948, "step": 530 }, { "epoch": 3.9130434782608696, "grad_norm": 1.936035394668579, "learning_rate": 9.22e-05, "loss": 0.6707, "step": 540 }, { "epoch": 3.9855072463768115, "grad_norm": 3.7161924839019775, "learning_rate": 9.020000000000001e-05, "loss": 0.8582, "step": 550 }, { "epoch": 4.057971014492754, "grad_norm": 4.582805156707764, "learning_rate": 8.82e-05, "loss": 0.5581, "step": 560 }, { "epoch": 4.130434782608695, "grad_norm": 6.6975250244140625, "learning_rate": 8.620000000000001e-05, "loss": 0.6891, "step": 570 }, { "epoch": 4.202898550724638, "grad_norm": 4.396116256713867, "learning_rate": 8.42e-05, "loss": 0.6131, "step": 580 }, { "epoch": 4.27536231884058, "grad_norm": 8.45380687713623, "learning_rate": 8.22e-05, "loss": 0.49, "step": 590 }, { "epoch": 4.3478260869565215, "grad_norm": 5.600996017456055, "learning_rate": 8.020000000000001e-05, "loss": 0.6265, "step": 600 }, { "epoch": 4.420289855072464, "grad_norm": 9.154874801635742, "learning_rate": 7.82e-05, "loss": 0.6905, "step": 610 }, { "epoch": 4.492753623188406, "grad_norm": 6.6350202560424805, "learning_rate": 7.620000000000001e-05, "loss": 0.5512, "step": 620 }, { "epoch": 4.565217391304348, "grad_norm": 5.929750442504883, "learning_rate": 7.42e-05, "loss": 0.6337, "step": 630 }, { "epoch": 4.63768115942029, "grad_norm": 4.856590270996094, "learning_rate": 7.22e-05, "loss": 0.5576, "step": 640 }, { "epoch": 4.710144927536232, "grad_norm": 6.00139856338501, "learning_rate": 7.02e-05, "loss": 0.839, "step": 650 }, { "epoch": 4.782608695652174, "grad_norm": 7.68943452835083, "learning_rate": 6.82e-05, "loss": 0.6007, "step": 660 }, { "epoch": 4.855072463768116, "grad_norm": 3.7272567749023438, "learning_rate": 6.620000000000001e-05, "loss": 0.6239, "step": 670 }, { "epoch": 4.927536231884058, "grad_norm": 4.914477825164795, "learning_rate": 6.42e-05, "loss": 0.6188, "step": 680 }, { "epoch": 5.0, "grad_norm": 9.873302459716797, "learning_rate": 6.220000000000001e-05, "loss": 0.5975, "step": 690 }, { "epoch": 5.072463768115942, "grad_norm": 8.113228797912598, "learning_rate": 6.02e-05, "loss": 0.4262, "step": 700 }, { "epoch": 5.144927536231884, "grad_norm": 2.4135184288024902, "learning_rate": 5.82e-05, "loss": 0.3966, "step": 710 }, { "epoch": 5.217391304347826, "grad_norm": 3.7978782653808594, "learning_rate": 5.620000000000001e-05, "loss": 0.5201, "step": 720 }, { "epoch": 5.2898550724637685, "grad_norm": 4.620602607727051, "learning_rate": 5.420000000000001e-05, "loss": 0.5306, "step": 730 }, { "epoch": 5.36231884057971, "grad_norm": 7.97003173828125, "learning_rate": 5.22e-05, "loss": 0.4757, "step": 740 }, { "epoch": 5.434782608695652, "grad_norm": 3.4005777835845947, "learning_rate": 5.02e-05, "loss": 0.3991, "step": 750 }, { "epoch": 5.507246376811594, "grad_norm": 9.03802490234375, "learning_rate": 4.82e-05, "loss": 0.4278, "step": 760 }, { "epoch": 5.579710144927536, "grad_norm": 4.8757123947143555, "learning_rate": 4.6200000000000005e-05, "loss": 0.4691, "step": 770 }, { "epoch": 5.6521739130434785, "grad_norm": 7.352402210235596, "learning_rate": 4.4200000000000004e-05, "loss": 0.4877, "step": 780 }, { "epoch": 5.72463768115942, "grad_norm": 4.516758918762207, "learning_rate": 4.22e-05, "loss": 0.506, "step": 790 }, { "epoch": 5.797101449275362, "grad_norm": 6.949781894683838, "learning_rate": 4.02e-05, "loss": 0.688, "step": 800 }, { "epoch": 5.869565217391305, "grad_norm": 8.907429695129395, "learning_rate": 3.82e-05, "loss": 0.5848, "step": 810 }, { "epoch": 5.942028985507246, "grad_norm": 8.472686767578125, "learning_rate": 3.62e-05, "loss": 0.4619, "step": 820 }, { "epoch": 6.0144927536231885, "grad_norm": 3.424809217453003, "learning_rate": 3.4200000000000005e-05, "loss": 0.4204, "step": 830 }, { "epoch": 6.086956521739131, "grad_norm": 6.555367946624756, "learning_rate": 3.2200000000000003e-05, "loss": 0.3869, "step": 840 }, { "epoch": 6.159420289855072, "grad_norm": 7.202473163604736, "learning_rate": 3.02e-05, "loss": 0.4058, "step": 850 }, { "epoch": 6.231884057971015, "grad_norm": 9.05301570892334, "learning_rate": 2.8199999999999998e-05, "loss": 0.3875, "step": 860 }, { "epoch": 6.304347826086957, "grad_norm": 8.509578704833984, "learning_rate": 2.6200000000000003e-05, "loss": 0.4036, "step": 870 }, { "epoch": 6.3768115942028984, "grad_norm": 4.049665451049805, "learning_rate": 2.4200000000000002e-05, "loss": 0.4272, "step": 880 }, { "epoch": 6.449275362318841, "grad_norm": 3.557060718536377, "learning_rate": 2.22e-05, "loss": 0.3636, "step": 890 }, { "epoch": 6.521739130434782, "grad_norm": 8.958136558532715, "learning_rate": 2.0200000000000003e-05, "loss": 0.4378, "step": 900 }, { "epoch": 6.594202898550725, "grad_norm": 2.7690534591674805, "learning_rate": 1.8200000000000002e-05, "loss": 0.3435, "step": 910 }, { "epoch": 6.666666666666667, "grad_norm": 5.819123268127441, "learning_rate": 1.62e-05, "loss": 0.4098, "step": 920 }, { "epoch": 6.739130434782608, "grad_norm": 6.891845226287842, "learning_rate": 1.42e-05, "loss": 0.3363, "step": 930 }, { "epoch": 6.811594202898551, "grad_norm": 7.646413326263428, "learning_rate": 1.22e-05, "loss": 0.4361, "step": 940 }, { "epoch": 6.884057971014493, "grad_norm": 7.139030933380127, "learning_rate": 1.02e-05, "loss": 0.3213, "step": 950 }, { "epoch": 6.956521739130435, "grad_norm": 8.147725105285645, "learning_rate": 8.200000000000001e-06, "loss": 0.4433, "step": 960 }, { "epoch": 7.028985507246377, "grad_norm": 9.252585411071777, "learning_rate": 6.2e-06, "loss": 0.4423, "step": 970 }, { "epoch": 7.101449275362318, "grad_norm": 3.595215320587158, "learning_rate": 4.2000000000000004e-06, "loss": 0.389, "step": 980 }, { "epoch": 7.173913043478261, "grad_norm": 2.2256908416748047, "learning_rate": 2.2e-06, "loss": 0.3653, "step": 990 }, { "epoch": 7.246376811594203, "grad_norm": 7.12455415725708, "learning_rate": 2.0000000000000002e-07, "loss": 0.3505, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 787356038406144.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }