{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8896447467876039, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "learning_rate": 0.000975, "loss": 0.9774, "step": 500 }, { "epoch": 0.06, "eval_loss": 0.8144938945770264, "eval_runtime": 14.6573, "eval_samples_per_second": 68.225, "eval_steps_per_second": 34.113, "step": 500 }, { "epoch": 0.13, "learning_rate": 0.00095, "loss": 0.9617, "step": 1000 }, { "epoch": 0.13, "eval_loss": 0.8077166080474854, "eval_runtime": 15.1765, "eval_samples_per_second": 65.891, "eval_steps_per_second": 32.946, "step": 1000 }, { "epoch": 0.19, "learning_rate": 0.000925, "loss": 0.911, "step": 1500 }, { "epoch": 0.19, "eval_loss": 0.8064053058624268, "eval_runtime": 15.3268, "eval_samples_per_second": 65.245, "eval_steps_per_second": 32.623, "step": 1500 }, { "epoch": 0.25, "learning_rate": 0.0009000000000000001, "loss": 0.8954, "step": 2000 }, { "epoch": 0.25, "eval_loss": 0.8163686990737915, "eval_runtime": 14.5944, "eval_samples_per_second": 68.519, "eval_steps_per_second": 34.26, "step": 2000 }, { "epoch": 0.31, "learning_rate": 0.000875, "loss": 0.883, "step": 2500 }, { "epoch": 0.31, "eval_loss": 0.8068735003471375, "eval_runtime": 14.8614, "eval_samples_per_second": 67.288, "eval_steps_per_second": 33.644, "step": 2500 }, { "epoch": 0.38, "learning_rate": 0.00085, "loss": 0.8867, "step": 3000 }, { "epoch": 0.38, "eval_loss": 0.7819482088088989, "eval_runtime": 14.6896, "eval_samples_per_second": 68.076, "eval_steps_per_second": 34.038, "step": 3000 }, { "epoch": 0.44, "learning_rate": 0.000825, "loss": 0.8688, "step": 3500 }, { "epoch": 0.44, "eval_loss": 0.8062307238578796, "eval_runtime": 14.6856, "eval_samples_per_second": 68.094, "eval_steps_per_second": 34.047, "step": 3500 }, { "epoch": 0.5, "learning_rate": 0.0008, "loss": 0.8446, "step": 4000 }, { "epoch": 0.5, "eval_loss": 0.7707250714302063, "eval_runtime": 15.0517, "eval_samples_per_second": 66.438, "eval_steps_per_second": 33.219, "step": 4000 }, { "epoch": 0.57, "learning_rate": 0.0007750000000000001, "loss": 0.8617, "step": 4500 }, { "epoch": 0.57, "eval_loss": 0.7528353333473206, "eval_runtime": 14.7368, "eval_samples_per_second": 67.857, "eval_steps_per_second": 33.929, "step": 4500 }, { "epoch": 0.63, "learning_rate": 0.00075, "loss": 0.8158, "step": 5000 }, { "epoch": 0.63, "eval_loss": 0.7551385760307312, "eval_runtime": 15.3736, "eval_samples_per_second": 65.047, "eval_steps_per_second": 32.523, "step": 5000 }, { "epoch": 0.69, "learning_rate": 0.000725, "loss": 0.7889, "step": 5500 }, { "epoch": 0.69, "eval_loss": 0.7405046820640564, "eval_runtime": 15.4488, "eval_samples_per_second": 64.73, "eval_steps_per_second": 32.365, "step": 5500 }, { "epoch": 0.76, "learning_rate": 0.0007, "loss": 0.7992, "step": 6000 }, { "epoch": 0.76, "eval_loss": 0.7292428016662598, "eval_runtime": 15.892, "eval_samples_per_second": 62.925, "eval_steps_per_second": 31.462, "step": 6000 }, { "epoch": 0.82, "learning_rate": 0.000675, "loss": 0.8051, "step": 6500 }, { "epoch": 0.82, "eval_loss": 0.7345249056816101, "eval_runtime": 14.8049, "eval_samples_per_second": 67.545, "eval_steps_per_second": 33.773, "step": 6500 }, { "epoch": 0.88, "learning_rate": 0.0006500000000000001, "loss": 0.7684, "step": 7000 }, { "epoch": 0.88, "eval_loss": 0.7357723712921143, "eval_runtime": 14.7316, "eval_samples_per_second": 67.881, "eval_steps_per_second": 33.941, "step": 7000 }, { "epoch": 0.94, "learning_rate": 0.000625, "loss": 0.753, "step": 7500 }, { "epoch": 0.94, "eval_loss": 0.7323009371757507, "eval_runtime": 14.6239, "eval_samples_per_second": 68.381, "eval_steps_per_second": 34.191, "step": 7500 }, { "epoch": 1.01, "learning_rate": 0.0006, "loss": 0.7464, "step": 8000 }, { "epoch": 1.01, "eval_loss": 0.7275989651679993, "eval_runtime": 15.2815, "eval_samples_per_second": 65.439, "eval_steps_per_second": 32.719, "step": 8000 }, { "epoch": 1.07, "learning_rate": 0.000575, "loss": 0.5429, "step": 8500 }, { "epoch": 1.07, "eval_loss": 0.7231326103210449, "eval_runtime": 15.5099, "eval_samples_per_second": 64.475, "eval_steps_per_second": 32.238, "step": 8500 }, { "epoch": 1.13, "learning_rate": 0.00055, "loss": 0.5704, "step": 9000 }, { "epoch": 1.13, "eval_loss": 0.717272162437439, "eval_runtime": 14.9897, "eval_samples_per_second": 66.712, "eval_steps_per_second": 33.356, "step": 9000 }, { "epoch": 1.2, "learning_rate": 0.0005250000000000001, "loss": 0.5459, "step": 9500 }, { "epoch": 1.2, "eval_loss": 0.7188604474067688, "eval_runtime": 14.7366, "eval_samples_per_second": 67.858, "eval_steps_per_second": 33.929, "step": 9500 }, { "epoch": 1.26, "learning_rate": 0.0005, "loss": 0.5435, "step": 10000 }, { "epoch": 1.26, "eval_loss": 0.7037996053695679, "eval_runtime": 14.5588, "eval_samples_per_second": 68.687, "eval_steps_per_second": 34.343, "step": 10000 }, { "epoch": 1.32, "learning_rate": 0.000475, "loss": 0.5429, "step": 10500 }, { "epoch": 1.32, "eval_loss": 0.7000067234039307, "eval_runtime": 14.629, "eval_samples_per_second": 68.357, "eval_steps_per_second": 34.179, "step": 10500 }, { "epoch": 1.39, "learning_rate": 0.00045000000000000004, "loss": 0.5363, "step": 11000 }, { "epoch": 1.39, "eval_loss": 0.7090610861778259, "eval_runtime": 15.5146, "eval_samples_per_second": 64.455, "eval_steps_per_second": 32.228, "step": 11000 }, { "epoch": 1.45, "learning_rate": 0.000425, "loss": 0.551, "step": 11500 }, { "epoch": 1.45, "eval_loss": 0.6937999129295349, "eval_runtime": 15.2752, "eval_samples_per_second": 65.466, "eval_steps_per_second": 32.733, "step": 11500 }, { "epoch": 1.51, "learning_rate": 0.0004, "loss": 0.5345, "step": 12000 }, { "epoch": 1.51, "eval_loss": 0.6926913261413574, "eval_runtime": 14.5585, "eval_samples_per_second": 68.688, "eval_steps_per_second": 34.344, "step": 12000 }, { "epoch": 1.57, "learning_rate": 0.000375, "loss": 0.5519, "step": 12500 }, { "epoch": 1.57, "eval_loss": 0.6763409972190857, "eval_runtime": 14.6685, "eval_samples_per_second": 68.173, "eval_steps_per_second": 34.087, "step": 12500 }, { "epoch": 1.64, "learning_rate": 0.00035, "loss": 0.5324, "step": 13000 }, { "epoch": 1.64, "eval_loss": 0.6778369545936584, "eval_runtime": 17.3898, "eval_samples_per_second": 57.505, "eval_steps_per_second": 28.753, "step": 13000 }, { "epoch": 1.7, "learning_rate": 0.00032500000000000004, "loss": 0.5272, "step": 13500 }, { "epoch": 1.7, "eval_loss": 0.6725330948829651, "eval_runtime": 14.6869, "eval_samples_per_second": 68.088, "eval_steps_per_second": 34.044, "step": 13500 }, { "epoch": 1.76, "learning_rate": 0.0003, "loss": 0.5258, "step": 14000 }, { "epoch": 1.76, "eval_loss": 0.6668800115585327, "eval_runtime": 15.0231, "eval_samples_per_second": 66.564, "eval_steps_per_second": 33.282, "step": 14000 }, { "epoch": 1.83, "learning_rate": 0.000275, "loss": 0.5229, "step": 14500 }, { "epoch": 1.83, "eval_loss": 0.6599903106689453, "eval_runtime": 14.6661, "eval_samples_per_second": 68.184, "eval_steps_per_second": 34.092, "step": 14500 }, { "epoch": 1.89, "learning_rate": 0.00025, "loss": 0.5386, "step": 15000 }, { "epoch": 1.89, "eval_loss": 0.659939706325531, "eval_runtime": 14.6708, "eval_samples_per_second": 68.163, "eval_steps_per_second": 34.081, "step": 15000 } ], "logging_steps": 500, "max_steps": 20000, "num_train_epochs": 3, "save_steps": 5000, "total_flos": 9.418761150170112e+16, "trial_name": null, "trial_params": null }