{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9912023460410557, "eval_steps": 10, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05865102639296188, "eval_loss": 0.38869303464889526, "eval_runtime": 33.6966, "eval_samples_per_second": 22.465, "eval_steps_per_second": 5.639, "step": 10 }, { "epoch": 0.11730205278592376, "grad_norm": 1.1477874212327048, "learning_rate": 3.92156862745098e-06, "loss": 0.4096, "step": 20 }, { "epoch": 0.11730205278592376, "eval_loss": 0.33711880445480347, "eval_runtime": 32.8785, "eval_samples_per_second": 23.024, "eval_steps_per_second": 5.779, "step": 20 }, { "epoch": 0.17595307917888564, "eval_loss": 0.2933129668235779, "eval_runtime": 32.8713, "eval_samples_per_second": 23.029, "eval_steps_per_second": 5.78, "step": 30 }, { "epoch": 0.23460410557184752, "grad_norm": 1.121589060878037, "learning_rate": 7.84313725490196e-06, "loss": 0.3048, "step": 40 }, { "epoch": 0.23460410557184752, "eval_loss": 0.25960347056388855, "eval_runtime": 32.9112, "eval_samples_per_second": 23.001, "eval_steps_per_second": 5.773, "step": 40 }, { "epoch": 0.2932551319648094, "eval_loss": 0.24025067687034607, "eval_runtime": 32.9388, "eval_samples_per_second": 22.982, "eval_steps_per_second": 5.768, "step": 50 }, { "epoch": 0.3519061583577713, "grad_norm": 1.0677324544458529, "learning_rate": 9.990516643685222e-06, "loss": 0.2471, "step": 60 }, { "epoch": 0.3519061583577713, "eval_loss": 0.228533536195755, "eval_runtime": 32.9199, "eval_samples_per_second": 22.995, "eval_steps_per_second": 5.772, "step": 60 }, { "epoch": 0.41055718475073316, "eval_loss": 0.21884050965309143, "eval_runtime": 32.9261, "eval_samples_per_second": 22.991, "eval_steps_per_second": 5.77, "step": 70 }, { "epoch": 0.46920821114369504, "grad_norm": 0.8221117546113363, "learning_rate": 9.901828808578846e-06, "loss": 0.2281, "step": 80 }, { "epoch": 0.46920821114369504, "eval_loss": 0.21087835729122162, "eval_runtime": 32.9118, "eval_samples_per_second": 23.001, "eval_steps_per_second": 5.773, "step": 80 }, { "epoch": 0.5278592375366569, "eval_loss": 0.2049574851989746, "eval_runtime": 33.0853, "eval_samples_per_second": 22.88, "eval_steps_per_second": 5.743, "step": 90 }, { "epoch": 0.5865102639296188, "grad_norm": 0.7863445026416022, "learning_rate": 9.721431493385322e-06, "loss": 0.2073, "step": 100 }, { "epoch": 0.5865102639296188, "eval_loss": 0.20104646682739258, "eval_runtime": 33.0368, "eval_samples_per_second": 22.914, "eval_steps_per_second": 5.751, "step": 100 }, { "epoch": 0.6451612903225806, "eval_loss": 0.19683966040611267, "eval_runtime": 32.895, "eval_samples_per_second": 23.013, "eval_steps_per_second": 5.776, "step": 110 }, { "epoch": 0.7038123167155426, "grad_norm": 0.7460954708103601, "learning_rate": 9.452699794345583e-06, "loss": 0.1911, "step": 120 }, { "epoch": 0.7038123167155426, "eval_loss": 0.1945473700761795, "eval_runtime": 32.8944, "eval_samples_per_second": 23.013, "eval_steps_per_second": 5.776, "step": 120 }, { "epoch": 0.7624633431085044, "eval_loss": 0.1912163645029068, "eval_runtime": 32.9495, "eval_samples_per_second": 22.975, "eval_steps_per_second": 5.766, "step": 130 }, { "epoch": 0.8211143695014663, "grad_norm": 0.7504426793167469, "learning_rate": 9.100661476680379e-06, "loss": 0.1876, "step": 140 }, { "epoch": 0.8211143695014663, "eval_loss": 0.18776217103004456, "eval_runtime": 32.9203, "eval_samples_per_second": 22.995, "eval_steps_per_second": 5.772, "step": 140 }, { "epoch": 0.8797653958944281, "eval_loss": 0.18520714342594147, "eval_runtime": 32.8973, "eval_samples_per_second": 23.011, "eval_steps_per_second": 5.776, "step": 150 }, { "epoch": 0.9384164222873901, "grad_norm": 0.7748459339656144, "learning_rate": 8.671902908935942e-06, "loss": 0.1887, "step": 160 }, { "epoch": 0.9384164222873901, "eval_loss": 0.18254177272319794, "eval_runtime": 32.8779, "eval_samples_per_second": 23.025, "eval_steps_per_second": 5.779, "step": 160 }, { "epoch": 0.9970674486803519, "eval_loss": 0.18051140010356903, "eval_runtime": 32.9923, "eval_samples_per_second": 22.945, "eval_steps_per_second": 5.759, "step": 170 }, { "epoch": 1.0527859237536656, "grad_norm": 0.9218927864042982, "learning_rate": 8.174445837049614e-06, "loss": 0.1553, "step": 180 }, { "epoch": 1.0527859237536656, "eval_loss": 0.18640676140785217, "eval_runtime": 32.9183, "eval_samples_per_second": 22.996, "eval_steps_per_second": 5.772, "step": 180 }, { "epoch": 1.1114369501466275, "eval_loss": 0.18077336251735687, "eval_runtime": 32.8676, "eval_samples_per_second": 23.032, "eval_steps_per_second": 5.781, "step": 190 }, { "epoch": 1.1700879765395895, "grad_norm": 0.9109052804191333, "learning_rate": 7.617597303598754e-06, "loss": 0.1332, "step": 200 }, { "epoch": 1.1700879765395895, "eval_loss": 0.1823471635580063, "eval_runtime": 32.9625, "eval_samples_per_second": 22.965, "eval_steps_per_second": 5.764, "step": 200 }, { "epoch": 1.2287390029325513, "eval_loss": 0.17943565547466278, "eval_runtime": 32.9683, "eval_samples_per_second": 22.961, "eval_steps_per_second": 5.763, "step": 210 }, { "epoch": 1.2873900293255132, "grad_norm": 0.8287150136550924, "learning_rate": 7.011775520129363e-06, "loss": 0.1349, "step": 220 }, { "epoch": 1.2873900293255132, "eval_loss": 0.17819999158382416, "eval_runtime": 32.9652, "eval_samples_per_second": 22.964, "eval_steps_per_second": 5.764, "step": 220 }, { "epoch": 1.3460410557184752, "eval_loss": 0.1771620512008667, "eval_runtime": 32.8691, "eval_samples_per_second": 23.031, "eval_steps_per_second": 5.781, "step": 230 }, { "epoch": 1.404692082111437, "grad_norm": 0.7173372889400553, "learning_rate": 6.368314950360416e-06, "loss": 0.1333, "step": 240 }, { "epoch": 1.404692082111437, "eval_loss": 0.17452633380889893, "eval_runtime": 32.8914, "eval_samples_per_second": 23.015, "eval_steps_per_second": 5.777, "step": 240 }, { "epoch": 1.4633431085043989, "eval_loss": 0.17504557967185974, "eval_runtime": 32.8301, "eval_samples_per_second": 23.058, "eval_steps_per_second": 5.787, "step": 250 }, { "epoch": 1.5219941348973607, "grad_norm": 0.874094982171954, "learning_rate": 5.699254251008524e-06, "loss": 0.1338, "step": 260 }, { "epoch": 1.5219941348973607, "eval_loss": 0.17338429391384125, "eval_runtime": 32.8983, "eval_samples_per_second": 23.01, "eval_steps_per_second": 5.775, "step": 260 }, { "epoch": 1.5806451612903225, "eval_loss": 0.171478271484375, "eval_runtime": 32.8244, "eval_samples_per_second": 23.062, "eval_steps_per_second": 5.788, "step": 270 }, { "epoch": 1.6392961876832843, "grad_norm": 0.7979095239427625, "learning_rate": 5.017111037698477e-06, "loss": 0.1267, "step": 280 }, { "epoch": 1.6392961876832843, "eval_loss": 0.17220577597618103, "eval_runtime": 32.7725, "eval_samples_per_second": 23.099, "eval_steps_per_second": 5.798, "step": 280 }, { "epoch": 1.6979472140762464, "eval_loss": 0.16855686902999878, "eval_runtime": 32.8059, "eval_samples_per_second": 23.075, "eval_steps_per_second": 5.792, "step": 290 }, { "epoch": 1.7565982404692082, "grad_norm": 0.705233520985163, "learning_rate": 4.334647689917734e-06, "loss": 0.1317, "step": 300 }, { "epoch": 1.7565982404692082, "eval_loss": 0.1680660992860794, "eval_runtime": 32.9544, "eval_samples_per_second": 22.971, "eval_steps_per_second": 5.766, "step": 300 }, { "epoch": 1.8152492668621703, "eval_loss": 0.16704507172107697, "eval_runtime": 32.8838, "eval_samples_per_second": 23.02, "eval_steps_per_second": 5.778, "step": 310 }, { "epoch": 1.873900293255132, "grad_norm": 0.7535701160217406, "learning_rate": 3.6646325766256423e-06, "loss": 0.1251, "step": 320 }, { "epoch": 1.873900293255132, "eval_loss": 0.16622412204742432, "eval_runtime": 32.8893, "eval_samples_per_second": 23.017, "eval_steps_per_second": 5.777, "step": 320 }, { "epoch": 1.932551319648094, "eval_loss": 0.16477644443511963, "eval_runtime": 33.006, "eval_samples_per_second": 22.935, "eval_steps_per_second": 5.757, "step": 330 }, { "epoch": 1.9912023460410557, "grad_norm": 0.7373647805377234, "learning_rate": 3.019601169804216e-06, "loss": 0.1186, "step": 340 }, { "epoch": 1.9912023460410557, "eval_loss": 0.1639869511127472, "eval_runtime": 32.9932, "eval_samples_per_second": 22.944, "eval_steps_per_second": 5.759, "step": 340 } ], "logging_steps": 20, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 31189214167040.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }