{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9957805907172996, "eval_steps": 500, "global_step": 118, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008438818565400843, "grad_norm": 1.0222958326339722, "learning_rate": 2.4999999999999998e-05, "loss": 2.86, "step": 1 }, { "epoch": 0.04219409282700422, "grad_norm": 1.6936215162277222, "learning_rate": 0.000125, "loss": 2.8152, "step": 5 }, { "epoch": 0.08438818565400844, "grad_norm": 2.3276407718658447, "learning_rate": 0.00025, "loss": 2.5324, "step": 10 }, { "epoch": 0.12658227848101267, "grad_norm": 6.024274826049805, "learning_rate": 0.0002994074763922825, "loss": 1.692, "step": 15 }, { "epoch": 0.16877637130801687, "grad_norm": 0.993145763874054, "learning_rate": 0.0002958034371120616, "loss": 1.4859, "step": 20 }, { "epoch": 0.2109704641350211, "grad_norm": 0.7716729640960693, "learning_rate": 0.000289003441114775, "loss": 1.326, "step": 25 }, { "epoch": 0.25316455696202533, "grad_norm": 0.5372955203056335, "learning_rate": 0.0002791565417651033, "loss": 1.2597, "step": 30 }, { "epoch": 0.29535864978902954, "grad_norm": 0.5266535878181458, "learning_rate": 0.00026647857940770634, "loss": 1.1995, "step": 35 }, { "epoch": 0.33755274261603374, "grad_norm": 0.3248014748096466, "learning_rate": 0.0002512474502277316, "loss": 1.1785, "step": 40 }, { "epoch": 0.379746835443038, "grad_norm": 0.22964197397232056, "learning_rate": 0.00023379701487054785, "loss": 1.1528, "step": 45 }, { "epoch": 0.4219409282700422, "grad_norm": 0.2224261313676834, "learning_rate": 0.00021450978034147806, "loss": 1.1343, "step": 50 }, { "epoch": 0.4641350210970464, "grad_norm": 0.22749578952789307, "learning_rate": 0.00019380851559554636, "loss": 1.1165, "step": 55 }, { "epoch": 0.5063291139240507, "grad_norm": 0.2278250902891159, "learning_rate": 0.00017214698460037218, "loss": 1.1106, "step": 60 }, { "epoch": 0.5485232067510548, "grad_norm": 0.22940242290496826, "learning_rate": 0.00015, "loss": 1.0946, "step": 65 }, { "epoch": 0.5907172995780591, "grad_norm": 0.20619849860668182, "learning_rate": 0.00012785301539962782, "loss": 1.0952, "step": 70 }, { "epoch": 0.6329113924050633, "grad_norm": 0.19738492369651794, "learning_rate": 0.00010619148440445364, "loss": 1.0891, "step": 75 }, { "epoch": 0.6751054852320675, "grad_norm": 0.2077159434556961, "learning_rate": 8.549021965852197e-05, "loss": 1.0828, "step": 80 }, { "epoch": 0.7172995780590717, "grad_norm": 0.1939452439546585, "learning_rate": 6.620298512945214e-05, "loss": 1.0723, "step": 85 }, { "epoch": 0.759493670886076, "grad_norm": 0.20105819404125214, "learning_rate": 4.8752549772268444e-05, "loss": 1.0617, "step": 90 }, { "epoch": 0.8016877637130801, "grad_norm": 0.2106432467699051, "learning_rate": 3.352142059229365e-05, "loss": 1.0647, "step": 95 }, { "epoch": 0.8438818565400844, "grad_norm": 0.19021108746528625, "learning_rate": 2.0843458234896666e-05, "loss": 1.0778, "step": 100 }, { "epoch": 0.8860759493670886, "grad_norm": 0.23987625539302826, "learning_rate": 1.0996558885224993e-05, "loss": 1.0761, "step": 105 }, { "epoch": 0.9282700421940928, "grad_norm": 0.1889180988073349, "learning_rate": 4.1965628879383875e-06, "loss": 1.0657, "step": 110 }, { "epoch": 0.9704641350210971, "grad_norm": 0.19509291648864746, "learning_rate": 5.925236077174655e-07, "loss": 1.0751, "step": 115 }, { "epoch": 0.9957805907172996, "eval_loss": 1.6322884559631348, "eval_runtime": 1.1519, "eval_samples_per_second": 1.736, "eval_steps_per_second": 0.868, "step": 118 }, { "epoch": 0.9957805907172996, "step": 118, "total_flos": 3.48383355932246e+17, "train_loss": 1.2928365105289523, "train_runtime": 1374.4457, "train_samples_per_second": 5.507, "train_steps_per_second": 0.086 } ], "logging_steps": 5, "max_steps": 118, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.48383355932246e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }