{ "best_metric": null, "best_model_checkpoint": null, "epoch": 26.666666666666668, "eval_steps": 500, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5925925925925926, "grad_norm": 1.0835238695144653, "learning_rate": 0.00019555555555555556, "loss": 9.4668, "step": 2 }, { "epoch": 1.1851851851851851, "grad_norm": 1.3387597799301147, "learning_rate": 0.00019111111111111114, "loss": 9.0964, "step": 4 }, { "epoch": 1.7777777777777777, "grad_norm": 2.4566526412963867, "learning_rate": 0.0001866666666666667, "loss": 8.5058, "step": 6 }, { "epoch": 2.3703703703703702, "grad_norm": 1.337947964668274, "learning_rate": 0.00018222222222222224, "loss": 7.8295, "step": 8 }, { "epoch": 2.962962962962963, "grad_norm": 0.9213329553604126, "learning_rate": 0.00017777777777777779, "loss": 7.55, "step": 10 }, { "epoch": 3.5555555555555554, "grad_norm": 0.8294873237609863, "learning_rate": 0.00017333333333333334, "loss": 7.4103, "step": 12 }, { "epoch": 4.148148148148148, "grad_norm": 1.0435646772384644, "learning_rate": 0.00016888888888888889, "loss": 7.3669, "step": 14 }, { "epoch": 4.7407407407407405, "grad_norm": 2.152470111846924, "learning_rate": 0.00016444444444444444, "loss": 7.3731, "step": 16 }, { "epoch": 5.333333333333333, "grad_norm": 0.6850627064704895, "learning_rate": 0.00016, "loss": 7.3274, "step": 18 }, { "epoch": 5.925925925925926, "grad_norm": 0.9927621483802795, "learning_rate": 0.00015555555555555556, "loss": 7.2687, "step": 20 }, { "epoch": 6.518518518518518, "grad_norm": 0.6255151629447937, "learning_rate": 0.0001511111111111111, "loss": 7.2457, "step": 22 }, { "epoch": 7.111111111111111, "grad_norm": 0.4171934425830841, "learning_rate": 0.00014666666666666666, "loss": 7.265, "step": 24 }, { "epoch": 7.703703703703704, "grad_norm": 0.7594227194786072, "learning_rate": 0.00014222222222222224, "loss": 7.2059, "step": 26 }, { "epoch": 8.296296296296296, "grad_norm": 0.6241652965545654, "learning_rate": 0.0001377777777777778, "loss": 7.2223, "step": 28 }, { "epoch": 8.88888888888889, "grad_norm": 0.6361931562423706, "learning_rate": 0.00013333333333333334, "loss": 7.1813, "step": 30 }, { "epoch": 9.481481481481481, "grad_norm": 0.40144020318984985, "learning_rate": 0.00012888888888888892, "loss": 7.1944, "step": 32 }, { "epoch": 10.074074074074074, "grad_norm": 0.8601672053337097, "learning_rate": 0.00012444444444444444, "loss": 7.191, "step": 34 }, { "epoch": 10.666666666666666, "grad_norm": 0.47764790058135986, "learning_rate": 0.00012, "loss": 7.1806, "step": 36 }, { "epoch": 11.25925925925926, "grad_norm": 0.6741082072257996, "learning_rate": 0.00011555555555555555, "loss": 7.1588, "step": 38 }, { "epoch": 11.851851851851851, "grad_norm": 0.320622056722641, "learning_rate": 0.00011111111111111112, "loss": 7.1678, "step": 40 }, { "epoch": 12.444444444444445, "grad_norm": 0.41680437326431274, "learning_rate": 0.00010666666666666667, "loss": 7.1937, "step": 42 }, { "epoch": 13.037037037037036, "grad_norm": 0.5265238285064697, "learning_rate": 0.00010222222222222222, "loss": 7.1729, "step": 44 }, { "epoch": 13.62962962962963, "grad_norm": 0.2438785880804062, "learning_rate": 9.777777777777778e-05, "loss": 7.1447, "step": 46 }, { "epoch": 14.222222222222221, "grad_norm": 0.2717892527580261, "learning_rate": 9.333333333333334e-05, "loss": 7.1401, "step": 48 }, { "epoch": 14.814814814814815, "grad_norm": 0.27455106377601624, "learning_rate": 8.888888888888889e-05, "loss": 7.1263, "step": 50 }, { "epoch": 15.407407407407407, "grad_norm": 0.32036423683166504, "learning_rate": 8.444444444444444e-05, "loss": 7.1606, "step": 52 }, { "epoch": 16.0, "grad_norm": 0.3542112708091736, "learning_rate": 8e-05, "loss": 7.1612, "step": 54 }, { "epoch": 16.59259259259259, "grad_norm": 0.2720729112625122, "learning_rate": 7.555555555555556e-05, "loss": 7.157, "step": 56 }, { "epoch": 17.185185185185187, "grad_norm": 0.4563415050506592, "learning_rate": 7.111111111111112e-05, "loss": 7.1165, "step": 58 }, { "epoch": 17.77777777777778, "grad_norm": 0.3706892430782318, "learning_rate": 6.666666666666667e-05, "loss": 7.1504, "step": 60 }, { "epoch": 18.37037037037037, "grad_norm": 0.4367642402648926, "learning_rate": 6.222222222222222e-05, "loss": 7.1585, "step": 62 }, { "epoch": 18.962962962962962, "grad_norm": 0.35599270462989807, "learning_rate": 5.7777777777777776e-05, "loss": 7.121, "step": 64 }, { "epoch": 19.555555555555557, "grad_norm": 0.28254929184913635, "learning_rate": 5.333333333333333e-05, "loss": 7.1481, "step": 66 }, { "epoch": 20.14814814814815, "grad_norm": 0.3943895399570465, "learning_rate": 4.888888888888889e-05, "loss": 7.1082, "step": 68 }, { "epoch": 20.74074074074074, "grad_norm": 0.2905910313129425, "learning_rate": 4.4444444444444447e-05, "loss": 7.1596, "step": 70 }, { "epoch": 21.333333333333332, "grad_norm": 0.33000853657722473, "learning_rate": 4e-05, "loss": 7.1279, "step": 72 }, { "epoch": 21.925925925925927, "grad_norm": 0.30440381169319153, "learning_rate": 3.555555555555556e-05, "loss": 7.1007, "step": 74 }, { "epoch": 22.51851851851852, "grad_norm": 0.31049448251724243, "learning_rate": 3.111111111111111e-05, "loss": 7.1304, "step": 76 }, { "epoch": 23.11111111111111, "grad_norm": 0.33641088008880615, "learning_rate": 2.6666666666666667e-05, "loss": 7.1468, "step": 78 }, { "epoch": 23.703703703703702, "grad_norm": 0.5838092565536499, "learning_rate": 2.2222222222222223e-05, "loss": 7.1533, "step": 80 }, { "epoch": 24.296296296296298, "grad_norm": 0.2617644965648651, "learning_rate": 1.777777777777778e-05, "loss": 7.1076, "step": 82 }, { "epoch": 24.88888888888889, "grad_norm": 0.23922277987003326, "learning_rate": 1.3333333333333333e-05, "loss": 7.1408, "step": 84 }, { "epoch": 25.48148148148148, "grad_norm": 0.28710824251174927, "learning_rate": 8.88888888888889e-06, "loss": 7.0894, "step": 86 }, { "epoch": 26.074074074074073, "grad_norm": 0.5698680877685547, "learning_rate": 4.444444444444445e-06, "loss": 7.1309, "step": 88 }, { "epoch": 26.666666666666668, "grad_norm": 0.3135036528110504, "learning_rate": 0.0, "loss": 7.1452, "step": 90 }, { "epoch": 26.666666666666668, "step": 90, "total_flos": 410861219852448.0, "train_loss": 7.326659817165798, "train_runtime": 2404.3353, "train_samples_per_second": 0.674, "train_steps_per_second": 0.037 } ], "logging_steps": 2, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 410861219852448.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }