{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9942446043165467, "eval_steps": 500, "global_step": 1388, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.999994877043978e-05, "loss": 1.5749, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.997951166621575e-05, "loss": 1.2447, "step": 20 }, { "epoch": 0.12, "learning_rate": 3.9918088642045126e-05, "loss": 1.1636, "step": 40 }, { "epoch": 0.17, "learning_rate": 3.981585677303025e-05, "loss": 1.1379, "step": 60 }, { "epoch": 0.23, "learning_rate": 3.967302551523671e-05, "loss": 1.1114, "step": 80 }, { "epoch": 0.29, "learning_rate": 3.948988750611294e-05, "loss": 1.1086, "step": 100 }, { "epoch": 0.35, "learning_rate": 3.9266817964924905e-05, "loss": 1.0915, "step": 120 }, { "epoch": 0.4, "learning_rate": 3.900427392399429e-05, "loss": 1.075, "step": 140 }, { "epoch": 0.46, "learning_rate": 3.870279329231546e-05, "loss": 1.0875, "step": 160 }, { "epoch": 0.52, "learning_rate": 3.836299375346956e-05, "loss": 1.0696, "step": 180 }, { "epoch": 0.58, "learning_rate": 3.798557150009373e-05, "loss": 1.0614, "step": 200 }, { "epoch": 0.63, "learning_rate": 3.757129980749847e-05, "loss": 1.0638, "step": 220 }, { "epoch": 0.69, "learning_rate": 3.712102744935529e-05, "loss": 1.0545, "step": 240 }, { "epoch": 0.75, "learning_rate": 3.6635676958700946e-05, "loss": 1.0508, "step": 260 }, { "epoch": 0.81, "learning_rate": 3.611624273782092e-05, "loss": 1.0566, "step": 280 }, { "epoch": 0.86, "learning_rate": 3.556378902088484e-05, "loss": 1.0577, "step": 300 }, { "epoch": 0.92, "learning_rate": 3.4979447693508e-05, "loss": 1.0428, "step": 320 }, { "epoch": 0.98, "learning_rate": 3.436441597370635e-05, "loss": 1.0456, "step": 340 }, { "epoch": 1.0, "eval_loss": 1.035969614982605, "eval_runtime": 20.059, "eval_samples_per_second": 13.909, "eval_steps_per_second": 3.49, "step": 347 }, { "epoch": 1.04, "learning_rate": 3.371995395899618e-05, "loss": 1.0082, "step": 360 }, { "epoch": 1.09, "learning_rate": 3.304738204466437e-05, "loss": 0.9889, "step": 380 }, { "epoch": 1.15, "learning_rate": 3.234807821849838e-05, "loss": 0.9786, "step": 400 }, { "epoch": 1.21, "learning_rate": 3.162347523751894e-05, "loss": 0.9881, "step": 420 }, { "epoch": 1.27, "learning_rate": 3.0875057692499566e-05, "loss": 0.9747, "step": 440 }, { "epoch": 1.32, "learning_rate": 3.0104358966287503e-05, "loss": 0.9842, "step": 460 }, { "epoch": 1.38, "learning_rate": 2.9312958092157724e-05, "loss": 0.9846, "step": 480 }, { "epoch": 1.44, "learning_rate": 2.850247651863686e-05, "loss": 0.9801, "step": 500 }, { "epoch": 1.5, "learning_rate": 2.767457478742533e-05, "loss": 0.9834, "step": 520 }, { "epoch": 1.55, "learning_rate": 2.6830949131224118e-05, "loss": 0.9831, "step": 540 }, { "epoch": 1.61, "learning_rate": 2.5973327998436527e-05, "loss": 0.9787, "step": 560 }, { "epoch": 1.67, "learning_rate": 2.5103468511865456e-05, "loss": 0.981, "step": 580 }, { "epoch": 1.73, "learning_rate": 2.4223152868661535e-05, "loss": 0.9845, "step": 600 }, { "epoch": 1.78, "learning_rate": 2.3334184688898107e-05, "loss": 0.9754, "step": 620 }, { "epoch": 1.84, "learning_rate": 2.2438385320254234e-05, "loss": 0.9779, "step": 640 }, { "epoch": 1.9, "learning_rate": 2.1537590106376758e-05, "loss": 0.9737, "step": 660 }, { "epoch": 1.96, "learning_rate": 2.0633644626567007e-05, "loss": 0.9714, "step": 680 }, { "epoch": 2.0, "eval_loss": 1.0180176496505737, "eval_runtime": 20.0699, "eval_samples_per_second": 13.901, "eval_steps_per_second": 3.488, "step": 695 }, { "epoch": 2.01, "learning_rate": 1.9728400914496288e-05, "loss": 0.9669, "step": 700 }, { "epoch": 2.07, "learning_rate": 1.882371366369749e-05, "loss": 0.9478, "step": 720 }, { "epoch": 2.13, "learning_rate": 1.79214364276071e-05, "loss": 0.9458, "step": 740 }, { "epoch": 2.19, "learning_rate": 1.702341782194301e-05, "loss": 0.9307, "step": 760 }, { "epoch": 2.24, "learning_rate": 1.6131497737198942e-05, "loss": 0.9435, "step": 780 }, { "epoch": 2.3, "learning_rate": 1.5247503569015413e-05, "loss": 0.945, "step": 800 }, { "epoch": 2.36, "learning_rate": 1.437324647415053e-05, "loss": 0.9416, "step": 820 }, { "epoch": 2.42, "learning_rate": 1.3510517659721583e-05, "loss": 0.9476, "step": 840 }, { "epoch": 2.47, "learning_rate": 1.2661084713320093e-05, "loss": 0.946, "step": 860 }, { "epoch": 2.53, "learning_rate": 1.182668798151939e-05, "loss": 0.9414, "step": 880 }, { "epoch": 2.59, "learning_rate": 1.1009037004194424e-05, "loss": 0.9439, "step": 900 }, { "epoch": 2.65, "learning_rate": 1.020980701195946e-05, "loss": 0.9486, "step": 920 }, { "epoch": 2.71, "learning_rate": 9.430635493899609e-06, "loss": 0.949, "step": 940 }, { "epoch": 2.76, "learning_rate": 8.673118842628595e-06, "loss": 0.9376, "step": 960 }, { "epoch": 2.82, "learning_rate": 7.938809083546264e-06, "loss": 0.9432, "step": 980 }, { "epoch": 2.88, "learning_rate": 7.229210694997113e-06, "loss": 0.9457, "step": 1000 }, { "epoch": 2.94, "learning_rate": 6.545777525844883e-06, "loss": 0.9357, "step": 1020 }, { "epoch": 2.99, "learning_rate": 5.889909816778458e-06, "loss": 0.9335, "step": 1040 }, { "epoch": 3.0, "eval_loss": 1.0176299810409546, "eval_runtime": 20.0069, "eval_samples_per_second": 13.945, "eval_steps_per_second": 3.499, "step": 1042 }, { "epoch": 3.05, "learning_rate": 5.262951331452011e-06, "loss": 0.937, "step": 1060 }, { "epoch": 3.11, "learning_rate": 4.6661866033371506e-06, "loss": 0.9351, "step": 1080 }, { "epoch": 3.17, "learning_rate": 4.100838303927914e-06, "loss": 0.9415, "step": 1100 }, { "epoch": 3.22, "learning_rate": 3.5680647376905666e-06, "loss": 0.9293, "step": 1120 }, { "epoch": 3.28, "learning_rate": 3.0689574688907607e-06, "loss": 0.9304, "step": 1140 }, { "epoch": 3.34, "learning_rate": 2.604539085160218e-06, "loss": 0.9254, "step": 1160 }, { "epoch": 3.4, "learning_rate": 2.1757611023850876e-06, "loss": 0.9293, "step": 1180 }, { "epoch": 3.45, "learning_rate": 1.7835020152084116e-06, "loss": 0.9391, "step": 1200 }, { "epoch": 3.51, "learning_rate": 1.4285654971409902e-06, "loss": 0.9363, "step": 1220 }, { "epoch": 3.57, "learning_rate": 1.1116787539682571e-06, "loss": 0.9506, "step": 1240 }, { "epoch": 3.63, "learning_rate": 8.334910338268054e-07, "loss": 0.9226, "step": 1260 }, { "epoch": 3.68, "learning_rate": 5.945722970031332e-07, "loss": 0.9305, "step": 1280 }, { "epoch": 3.74, "learning_rate": 3.9541204817997283e-07, "loss": 0.9306, "step": 1300 }, { "epoch": 3.8, "learning_rate": 2.3641833352276768e-07, "loss": 0.9344, "step": 1320 }, { "epoch": 3.86, "learning_rate": 1.1791690466107286e-07, "loss": 0.93, "step": 1340 }, { "epoch": 3.91, "learning_rate": 4.0150551277724494e-08, "loss": 0.9344, "step": 1360 }, { "epoch": 3.97, "learning_rate": 3.2786036732557203e-09, "loss": 0.9348, "step": 1380 }, { "epoch": 3.99, "eval_loss": 1.0186352729797363, "eval_runtime": 19.987, "eval_samples_per_second": 13.959, "eval_steps_per_second": 3.502, "step": 1388 }, { "epoch": 3.99, "step": 1388, "total_flos": 1.7642090681398723e+18, "train_loss": 0.9866050820185747, "train_runtime": 37340.4665, "train_samples_per_second": 3.722, "train_steps_per_second": 0.037 } ], "logging_steps": 20, "max_steps": 1388, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 80, "total_flos": 1.7642090681398723e+18, "train_batch_size": 10, "trial_name": null, "trial_params": null }