{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.988864142538976, "eval_steps": 20, "global_step": 280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17817371937639198, "grad_norm": 24.032651901245117, "learning_rate": 6.818181818181818e-06, "loss": 2.7471, "step": 10 }, { "epoch": 0.35634743875278396, "grad_norm": 21.925918579101562, "learning_rate": 1.4393939393939396e-05, "loss": 2.3011, "step": 20 }, { "epoch": 0.35634743875278396, "eval_loss": 1.9752988815307617, "eval_runtime": 36.3926, "eval_samples_per_second": 1.374, "eval_steps_per_second": 1.374, "step": 20 }, { "epoch": 0.534521158129176, "grad_norm": 8.960527420043945, "learning_rate": 2.1969696969696972e-05, "loss": 1.8473, "step": 30 }, { "epoch": 0.7126948775055679, "grad_norm": 7.120757102966309, "learning_rate": 2.954545454545455e-05, "loss": 1.7509, "step": 40 }, { "epoch": 0.7126948775055679, "eval_loss": 1.660855770111084, "eval_runtime": 36.3311, "eval_samples_per_second": 1.376, "eval_steps_per_second": 1.376, "step": 40 }, { "epoch": 0.89086859688196, "grad_norm": 6.989817142486572, "learning_rate": 3.712121212121212e-05, "loss": 1.6258, "step": 50 }, { "epoch": 1.069042316258352, "grad_norm": 6.359054088592529, "learning_rate": 4.46969696969697e-05, "loss": 1.6081, "step": 60 }, { "epoch": 1.069042316258352, "eval_loss": 1.5969637632369995, "eval_runtime": 36.2375, "eval_samples_per_second": 1.38, "eval_steps_per_second": 1.38, "step": 60 }, { "epoch": 1.247216035634744, "grad_norm": 7.143074035644531, "learning_rate": 5.2272727272727274e-05, "loss": 1.5792, "step": 70 }, { "epoch": 1.4253897550111359, "grad_norm": 7.04843807220459, "learning_rate": 5.9848484848484854e-05, "loss": 1.5251, "step": 80 }, { "epoch": 1.4253897550111359, "eval_loss": 1.5653889179229736, "eval_runtime": 36.2373, "eval_samples_per_second": 1.38, "eval_steps_per_second": 1.38, "step": 80 }, { "epoch": 1.6035634743875278, "grad_norm": 7.0300822257995605, "learning_rate": 6.742424242424242e-05, "loss": 1.4923, "step": 90 }, { "epoch": 1.7817371937639197, "grad_norm": 6.851181507110596, "learning_rate": 7.500000000000001e-05, "loss": 1.5468, "step": 100 }, { "epoch": 1.7817371937639197, "eval_loss": 1.554957389831543, "eval_runtime": 36.1708, "eval_samples_per_second": 1.382, "eval_steps_per_second": 1.382, "step": 100 }, { "epoch": 1.9599109131403119, "grad_norm": 7.405056476593018, "learning_rate": 8.257575757575758e-05, "loss": 1.4879, "step": 110 }, { "epoch": 2.138084632516704, "grad_norm": 7.301249027252197, "learning_rate": 9.015151515151515e-05, "loss": 1.3656, "step": 120 }, { "epoch": 2.138084632516704, "eval_loss": 1.5598678588867188, "eval_runtime": 36.2422, "eval_samples_per_second": 1.38, "eval_steps_per_second": 1.38, "step": 120 }, { "epoch": 2.316258351893096, "grad_norm": 7.064472198486328, "learning_rate": 9.772727272727274e-05, "loss": 1.3165, "step": 130 }, { "epoch": 2.494432071269488, "grad_norm": 6.906543731689453, "learning_rate": 9.99914337538133e-05, "loss": 1.2849, "step": 140 }, { "epoch": 2.494432071269488, "eval_loss": 1.57136869430542, "eval_runtime": 36.2404, "eval_samples_per_second": 1.38, "eval_steps_per_second": 1.38, "step": 140 }, { "epoch": 2.6726057906458798, "grad_norm": 6.771876335144043, "learning_rate": 9.994948369553145e-05, "loss": 1.3273, "step": 150 }, { "epoch": 2.8507795100222717, "grad_norm": 6.521801471710205, "learning_rate": 9.987260573051269e-05, "loss": 1.3514, "step": 160 }, { "epoch": 2.8507795100222717, "eval_loss": 1.5559300184249878, "eval_runtime": 36.2463, "eval_samples_per_second": 1.379, "eval_steps_per_second": 1.379, "step": 160 }, { "epoch": 3.0289532293986636, "grad_norm": 6.638123512268066, "learning_rate": 9.976085361679797e-05, "loss": 1.2518, "step": 170 }, { "epoch": 3.2071269487750556, "grad_norm": 7.935174465179443, "learning_rate": 9.961430549868666e-05, "loss": 1.0345, "step": 180 }, { "epoch": 3.2071269487750556, "eval_loss": 1.6498233079910278, "eval_runtime": 36.2857, "eval_samples_per_second": 1.378, "eval_steps_per_second": 1.378, "step": 180 }, { "epoch": 3.3853006681514475, "grad_norm": 8.291388511657715, "learning_rate": 9.943306385209289e-05, "loss": 1.0613, "step": 190 }, { "epoch": 3.5634743875278394, "grad_norm": 7.552478313446045, "learning_rate": 9.921725541288778e-05, "loss": 1.0083, "step": 200 }, { "epoch": 3.5634743875278394, "eval_loss": 1.6424150466918945, "eval_runtime": 36.2961, "eval_samples_per_second": 1.378, "eval_steps_per_second": 1.378, "step": 200 }, { "epoch": 3.7416481069042318, "grad_norm": 7.386646747589111, "learning_rate": 9.899359743693714e-05, "loss": 1.0191, "step": 210 }, { "epoch": 3.9198218262806237, "grad_norm": 7.476137161254883, "learning_rate": 9.871254773396837e-05, "loss": 1.1108, "step": 220 }, { "epoch": 3.9198218262806237, "eval_loss": 1.6132583618164062, "eval_runtime": 36.1952, "eval_samples_per_second": 1.381, "eval_steps_per_second": 1.381, "step": 220 }, { "epoch": 4.097995545657016, "grad_norm": 7.721164703369141, "learning_rate": 9.839743506981782e-05, "loss": 0.9813, "step": 230 }, { "epoch": 4.276169265033408, "grad_norm": 7.329690456390381, "learning_rate": 9.804847979162286e-05, "loss": 0.7993, "step": 240 }, { "epoch": 4.276169265033408, "eval_loss": 1.7469983100891113, "eval_runtime": 36.2855, "eval_samples_per_second": 1.378, "eval_steps_per_second": 1.378, "step": 240 }, { "epoch": 4.4543429844097995, "grad_norm": 9.068168640136719, "learning_rate": 9.766592591146352e-05, "loss": 0.7501, "step": 250 }, { "epoch": 4.632516703786192, "grad_norm": 8.153438568115234, "learning_rate": 9.725004093573342e-05, "loss": 0.8531, "step": 260 }, { "epoch": 4.632516703786192, "eval_loss": 1.7561492919921875, "eval_runtime": 36.2071, "eval_samples_per_second": 1.381, "eval_steps_per_second": 1.381, "step": 260 }, { "epoch": 4.810690423162583, "grad_norm": 8.576103210449219, "learning_rate": 9.680111567808213e-05, "loss": 0.8121, "step": 270 }, { "epoch": 4.988864142538976, "grad_norm": 8.27182674407959, "learning_rate": 9.631946405605932e-05, "loss": 0.7991, "step": 280 }, { "epoch": 4.988864142538976, "eval_loss": 1.7701832056045532, "eval_runtime": 36.2934, "eval_samples_per_second": 1.378, "eval_steps_per_second": 1.378, "step": 280 } ], "logging_steps": 10, "max_steps": 1320, "num_input_tokens_seen": 0, "num_train_epochs": 24, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1823497400039040.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }