{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1720183486238532, "eval_steps": 25, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 2.4123539232053424e-05, "loss": 2.9404, "step": 25 }, { "epoch": 0.01, "eval_loss": 1.9332810640335083, "eval_runtime": 1268.6205, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 25 }, { "epoch": 0.01, "learning_rate": 2.3080133555926544e-05, "loss": 1.7498, "step": 50 }, { "epoch": 0.01, "eval_loss": 1.5833734273910522, "eval_runtime": 1268.0389, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 50 }, { "epoch": 0.02, "learning_rate": 2.2036727879799667e-05, "loss": 1.6641, "step": 75 }, { "epoch": 0.02, "eval_loss": 1.4574012756347656, "eval_runtime": 1270.5133, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 75 }, { "epoch": 0.03, "learning_rate": 2.0993322203672787e-05, "loss": 1.3824, "step": 100 }, { "epoch": 0.03, "eval_loss": 1.3970969915390015, "eval_runtime": 1269.8041, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 100 }, { "epoch": 0.04, "learning_rate": 1.9991652754590986e-05, "loss": 1.3602, "step": 125 }, { "epoch": 0.04, "eval_loss": 1.3633196353912354, "eval_runtime": 1268.49, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 125 }, { "epoch": 0.04, "learning_rate": 1.898998330550918e-05, "loss": 1.2812, "step": 150 }, { "epoch": 0.04, "eval_loss": 1.3380756378173828, "eval_runtime": 1270.2471, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 150 }, { "epoch": 0.05, "learning_rate": 1.7946577629382304e-05, "loss": 1.2471, "step": 175 }, { "epoch": 0.05, "eval_loss": 1.3300601243972778, "eval_runtime": 1270.2602, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 175 }, { "epoch": 0.06, "learning_rate": 1.69449081803005e-05, "loss": 1.2866, "step": 200 }, { "epoch": 0.06, "eval_loss": 1.3172357082366943, "eval_runtime": 1270.2386, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 200 }, { "epoch": 0.06, "learning_rate": 1.5901502504173623e-05, "loss": 1.2441, "step": 225 }, { "epoch": 0.06, "eval_loss": 1.2967969179153442, "eval_runtime": 1270.2167, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 225 }, { "epoch": 0.07, "learning_rate": 1.4858096828046744e-05, "loss": 1.3435, "step": 250 }, { "epoch": 0.07, "eval_loss": 1.2923862934112549, "eval_runtime": 1270.453, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 250 }, { "epoch": 0.08, "learning_rate": 1.3814691151919867e-05, "loss": 1.2896, "step": 275 }, { "epoch": 0.08, "eval_loss": 1.2753491401672363, "eval_runtime": 1269.9475, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 275 }, { "epoch": 0.09, "learning_rate": 1.2771285475792987e-05, "loss": 1.2362, "step": 300 }, { "epoch": 0.09, "eval_loss": 1.2694206237792969, "eval_runtime": 1270.1159, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 300 }, { "epoch": 0.09, "learning_rate": 1.1727879799666112e-05, "loss": 1.2737, "step": 325 }, { "epoch": 0.09, "eval_loss": 1.2611935138702393, "eval_runtime": 1268.4077, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 325 }, { "epoch": 0.1, "learning_rate": 1.0684474123539233e-05, "loss": 1.2854, "step": 350 }, { "epoch": 0.1, "eval_loss": 1.2529343366622925, "eval_runtime": 1268.716, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 350 }, { "epoch": 0.11, "learning_rate": 9.641068447412355e-06, "loss": 1.3302, "step": 375 }, { "epoch": 0.11, "eval_loss": 1.2476325035095215, "eval_runtime": 1267.034, "eval_samples_per_second": 1.376, "eval_steps_per_second": 0.172, "step": 375 }, { "epoch": 0.11, "learning_rate": 8.597662771285476e-06, "loss": 1.1982, "step": 400 }, { "epoch": 0.11, "eval_loss": 1.2402310371398926, "eval_runtime": 1268.1363, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 400 }, { "epoch": 0.12, "learning_rate": 7.554257095158598e-06, "loss": 1.1491, "step": 425 }, { "epoch": 0.12, "eval_loss": 1.2358269691467285, "eval_runtime": 1269.4424, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.172, "step": 425 }, { "epoch": 0.13, "learning_rate": 6.51085141903172e-06, "loss": 1.0247, "step": 450 }, { "epoch": 0.13, "eval_loss": 1.2344069480895996, "eval_runtime": 1269.7392, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.172, "step": 450 }, { "epoch": 0.14, "learning_rate": 5.467445742904841e-06, "loss": 1.2411, "step": 475 }, { "epoch": 0.14, "eval_loss": 1.2264015674591064, "eval_runtime": 1268.3253, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 475 }, { "epoch": 0.14, "learning_rate": 4.424040066777963e-06, "loss": 1.3197, "step": 500 }, { "epoch": 0.14, "eval_loss": 1.222678542137146, "eval_runtime": 1269.3766, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.172, "step": 500 }, { "epoch": 0.15, "learning_rate": 3.380634390651085e-06, "loss": 1.2782, "step": 525 }, { "epoch": 0.15, "eval_loss": 1.2200953960418701, "eval_runtime": 1268.3856, "eval_samples_per_second": 1.375, "eval_steps_per_second": 0.172, "step": 525 }, { "epoch": 0.16, "learning_rate": 2.337228714524207e-06, "loss": 1.2863, "step": 550 }, { "epoch": 0.16, "eval_loss": 1.2170121669769287, "eval_runtime": 1269.5926, "eval_samples_per_second": 1.374, "eval_steps_per_second": 0.172, "step": 550 }, { "epoch": 0.16, "learning_rate": 1.293823038397329e-06, "loss": 1.2774, "step": 575 }, { "epoch": 0.16, "eval_loss": 1.2154204845428467, "eval_runtime": 1270.383, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 575 }, { "epoch": 0.17, "learning_rate": 2.5041736227045074e-07, "loss": 1.2384, "step": 600 }, { "epoch": 0.17, "eval_loss": 1.214400291442871, "eval_runtime": 1270.0094, "eval_samples_per_second": 1.373, "eval_steps_per_second": 0.172, "step": 600 } ], "logging_steps": 25, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "total_flos": 6.775649796096e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }