{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2208, "eval_steps": 500, "global_step": 34500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 1.7230606079101562, "learning_rate": 4.99208e-05, "loss": 1.2281, "step": 500 }, { "epoch": 0.0064, "grad_norm": 3.655383348464966, "learning_rate": 4.9840800000000006e-05, "loss": 0.7566, "step": 1000 }, { "epoch": 0.0096, "grad_norm": 1.2925927639007568, "learning_rate": 4.97608e-05, "loss": 0.6764, "step": 1500 }, { "epoch": 0.0128, "grad_norm": 1.286004900932312, "learning_rate": 4.968080000000001e-05, "loss": 0.6304, "step": 2000 }, { "epoch": 0.016, "grad_norm": 1.2140214443206787, "learning_rate": 4.96008e-05, "loss": 0.5981, "step": 2500 }, { "epoch": 0.0192, "grad_norm": 1.2525482177734375, "learning_rate": 4.95208e-05, "loss": 0.5767, "step": 3000 }, { "epoch": 0.0224, "grad_norm": 1.2310410737991333, "learning_rate": 4.94408e-05, "loss": 0.5597, "step": 3500 }, { "epoch": 0.0256, "grad_norm": 1.1735206842422485, "learning_rate": 4.9360800000000004e-05, "loss": 0.5418, "step": 4000 }, { "epoch": 0.0288, "grad_norm": 1.114688754081726, "learning_rate": 4.9280800000000004e-05, "loss": 0.5335, "step": 4500 }, { "epoch": 0.032, "grad_norm": 0.8874593377113342, "learning_rate": 4.9200800000000005e-05, "loss": 0.5237, "step": 5000 }, { "epoch": 0.0352, "grad_norm": 1.1261299848556519, "learning_rate": 4.91208e-05, "loss": 0.5135, "step": 5500 }, { "epoch": 0.0384, "grad_norm": 0.9994556307792664, "learning_rate": 4.9040800000000007e-05, "loss": 0.5059, "step": 6000 }, { "epoch": 0.0416, "grad_norm": 1.2349673509597778, "learning_rate": 4.89608e-05, "loss": 0.4939, "step": 6500 }, { "epoch": 0.0448, "grad_norm": 0.9770995378494263, "learning_rate": 4.88808e-05, "loss": 0.4824, "step": 7000 }, { "epoch": 0.048, "grad_norm": 0.981966495513916, "learning_rate": 4.88008e-05, "loss": 0.4875, "step": 7500 }, { "epoch": 0.0512, "grad_norm": 1.0177415609359741, "learning_rate": 4.87208e-05, "loss": 0.4785, "step": 8000 }, { "epoch": 0.0544, "grad_norm": 1.0521667003631592, "learning_rate": 4.8640800000000004e-05, "loss": 0.4731, "step": 8500 }, { "epoch": 0.0576, "grad_norm": 0.8560615181922913, "learning_rate": 4.85608e-05, "loss": 0.4633, "step": 9000 }, { "epoch": 0.0608, "grad_norm": 1.0170217752456665, "learning_rate": 4.8480800000000005e-05, "loss": 0.4576, "step": 9500 }, { "epoch": 0.064, "grad_norm": 0.9891325831413269, "learning_rate": 4.84008e-05, "loss": 0.4556, "step": 10000 }, { "epoch": 0.0672, "grad_norm": 1.0609711408615112, "learning_rate": 4.832080000000001e-05, "loss": 0.4493, "step": 10500 }, { "epoch": 0.0704, "grad_norm": 0.8623799681663513, "learning_rate": 4.82408e-05, "loss": 0.4459, "step": 11000 }, { "epoch": 0.0736, "grad_norm": 0.9587870240211487, "learning_rate": 4.81608e-05, "loss": 0.4418, "step": 11500 }, { "epoch": 0.0768, "grad_norm": 0.8939447999000549, "learning_rate": 4.80808e-05, "loss": 0.4327, "step": 12000 }, { "epoch": 0.08, "grad_norm": 0.9886033535003662, "learning_rate": 4.80008e-05, "loss": 0.438, "step": 12500 }, { "epoch": 0.0832, "grad_norm": 0.9157513976097107, "learning_rate": 4.7920800000000004e-05, "loss": 0.4323, "step": 13000 }, { "epoch": 0.0864, "grad_norm": 0.9085854887962341, "learning_rate": 4.7840800000000005e-05, "loss": 0.4303, "step": 13500 }, { "epoch": 0.0896, "grad_norm": 0.9123984575271606, "learning_rate": 4.77608e-05, "loss": 0.4247, "step": 14000 }, { "epoch": 0.0928, "grad_norm": 0.839026689529419, "learning_rate": 4.7680960000000004e-05, "loss": 0.4233, "step": 14500 }, { "epoch": 0.096, "grad_norm": 0.8110847473144531, "learning_rate": 4.760096e-05, "loss": 0.4207, "step": 15000 }, { "epoch": 0.0992, "grad_norm": 0.8462579250335693, "learning_rate": 4.7520960000000005e-05, "loss": 0.421, "step": 15500 }, { "epoch": 0.1024, "grad_norm": 0.8980106711387634, "learning_rate": 4.744096e-05, "loss": 0.417, "step": 16000 }, { "epoch": 0.1056, "grad_norm": 0.8297702074050903, "learning_rate": 4.736096000000001e-05, "loss": 0.4139, "step": 16500 }, { "epoch": 0.1088, "grad_norm": 0.9856173992156982, "learning_rate": 4.728096e-05, "loss": 0.419, "step": 17000 }, { "epoch": 0.112, "grad_norm": 0.934256911277771, "learning_rate": 4.720096e-05, "loss": 0.4098, "step": 17500 }, { "epoch": 0.1152, "grad_norm": 0.9190649390220642, "learning_rate": 4.712096e-05, "loss": 0.412, "step": 18000 }, { "epoch": 0.1184, "grad_norm": 0.9078772664070129, "learning_rate": 4.704096e-05, "loss": 0.4043, "step": 18500 }, { "epoch": 0.1216, "grad_norm": 1.082939624786377, "learning_rate": 4.696112e-05, "loss": 0.4045, "step": 19000 }, { "epoch": 0.1248, "grad_norm": 0.9159390926361084, "learning_rate": 4.688112e-05, "loss": 0.4098, "step": 19500 }, { "epoch": 0.128, "grad_norm": 0.8420547842979431, "learning_rate": 4.680128e-05, "loss": 0.4033, "step": 20000 }, { "epoch": 0.1312, "grad_norm": 0.7658286094665527, "learning_rate": 4.672128e-05, "loss": 0.4002, "step": 20500 }, { "epoch": 0.1344, "grad_norm": 0.9074057340621948, "learning_rate": 4.664128e-05, "loss": 0.3964, "step": 21000 }, { "epoch": 0.1376, "grad_norm": 0.6065025329589844, "learning_rate": 4.656128e-05, "loss": 0.3984, "step": 21500 }, { "epoch": 0.1408, "grad_norm": 0.7523757219314575, "learning_rate": 4.6481280000000004e-05, "loss": 0.3959, "step": 22000 }, { "epoch": 0.144, "grad_norm": 0.807826042175293, "learning_rate": 4.6401280000000004e-05, "loss": 0.3921, "step": 22500 }, { "epoch": 0.1472, "grad_norm": 0.8530682325363159, "learning_rate": 4.632128e-05, "loss": 0.4002, "step": 23000 }, { "epoch": 0.1504, "grad_norm": 0.8661518692970276, "learning_rate": 4.6241280000000006e-05, "loss": 0.3856, "step": 23500 }, { "epoch": 0.1536, "grad_norm": 0.7473235130310059, "learning_rate": 4.616144e-05, "loss": 0.3854, "step": 24000 }, { "epoch": 0.1568, "grad_norm": 0.7954819202423096, "learning_rate": 4.6081440000000005e-05, "loss": 0.3871, "step": 24500 }, { "epoch": 0.16, "grad_norm": 0.8758727312088013, "learning_rate": 4.600144e-05, "loss": 0.3842, "step": 25000 }, { "epoch": 0.1632, "grad_norm": 0.8430293798446655, "learning_rate": 4.592144000000001e-05, "loss": 0.3886, "step": 25500 }, { "epoch": 0.1664, "grad_norm": 0.6557173728942871, "learning_rate": 4.584144e-05, "loss": 0.3854, "step": 26000 }, { "epoch": 0.1696, "grad_norm": 0.7791888117790222, "learning_rate": 4.576144e-05, "loss": 0.3796, "step": 26500 }, { "epoch": 0.1728, "grad_norm": 0.736084520816803, "learning_rate": 4.56816e-05, "loss": 0.3806, "step": 27000 }, { "epoch": 0.176, "grad_norm": 0.7714269161224365, "learning_rate": 4.56016e-05, "loss": 0.3781, "step": 27500 }, { "epoch": 0.1792, "grad_norm": 0.766144335269928, "learning_rate": 4.552176e-05, "loss": 0.3766, "step": 28000 }, { "epoch": 0.1824, "grad_norm": 0.7035301923751831, "learning_rate": 4.544176e-05, "loss": 0.3737, "step": 28500 }, { "epoch": 0.1856, "grad_norm": 0.7573793530464172, "learning_rate": 4.536176e-05, "loss": 0.3753, "step": 29000 }, { "epoch": 0.1888, "grad_norm": 0.8799508213996887, "learning_rate": 4.528176e-05, "loss": 0.373, "step": 29500 }, { "epoch": 0.192, "grad_norm": 0.8543264269828796, "learning_rate": 4.520176e-05, "loss": 0.3735, "step": 30000 }, { "epoch": 0.1952, "grad_norm": 0.6768947243690491, "learning_rate": 4.512176e-05, "loss": 0.3697, "step": 30500 }, { "epoch": 0.1984, "grad_norm": 0.8239702582359314, "learning_rate": 4.504176e-05, "loss": 0.3675, "step": 31000 }, { "epoch": 0.2016, "grad_norm": 0.8310449123382568, "learning_rate": 4.4961760000000004e-05, "loss": 0.3695, "step": 31500 }, { "epoch": 0.2048, "grad_norm": 0.8459475040435791, "learning_rate": 4.488176e-05, "loss": 0.3694, "step": 32000 }, { "epoch": 0.208, "grad_norm": 0.7346063852310181, "learning_rate": 4.4801760000000006e-05, "loss": 0.3646, "step": 32500 }, { "epoch": 0.2112, "grad_norm": 0.6958354115486145, "learning_rate": 4.472176e-05, "loss": 0.3704, "step": 33000 }, { "epoch": 0.2144, "grad_norm": 0.8244686722755432, "learning_rate": 4.464176000000001e-05, "loss": 0.3647, "step": 33500 }, { "epoch": 0.2176, "grad_norm": 0.7559502124786377, "learning_rate": 4.456192e-05, "loss": 0.3665, "step": 34000 }, { "epoch": 0.2208, "grad_norm": 0.9046504497528076, "learning_rate": 4.4481920000000007e-05, "loss": 0.3637, "step": 34500 } ], "logging_steps": 500, "max_steps": 312500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6807237779456e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }