{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9966329966329966, "eval_steps": 100, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026936026936026935, "grad_norm": 67.7872314453125, "learning_rate": 1.9976687691905394e-05, "loss": 6.0971, "num_input_tokens_seen": 82560, "step": 5 }, { "epoch": 0.05387205387205387, "grad_norm": 1003.2200317382812, "learning_rate": 1.988216769442353e-05, "loss": 5.282, "num_input_tokens_seen": 165120, "step": 10 }, { "epoch": 0.08080808080808081, "grad_norm": 47.967403411865234, "learning_rate": 1.9715670893979416e-05, "loss": 4.301, "num_input_tokens_seen": 247680, "step": 15 }, { "epoch": 0.10774410774410774, "grad_norm": 66.77752685546875, "learning_rate": 1.947840997060136e-05, "loss": 2.5698, "num_input_tokens_seen": 330240, "step": 20 }, { "epoch": 0.13468013468013468, "grad_norm": 21.431270599365234, "learning_rate": 1.917211301505453e-05, "loss": 1.0565, "num_input_tokens_seen": 412800, "step": 25 }, { "epoch": 0.16161616161616163, "grad_norm": 5.1596455574035645, "learning_rate": 1.879901094228584e-05, "loss": 0.2428, "num_input_tokens_seen": 495360, "step": 30 }, { "epoch": 0.18855218855218855, "grad_norm": 2.640035390853882, "learning_rate": 1.836182124254711e-05, "loss": 0.2839, "num_input_tokens_seen": 577920, "step": 35 }, { "epoch": 0.21548821548821548, "grad_norm": 2.514108896255493, "learning_rate": 1.7863728188545326e-05, "loss": 0.0107, "num_input_tokens_seen": 660480, "step": 40 }, { "epoch": 0.24242424242424243, "grad_norm": 0.09124995023012161, "learning_rate": 1.730835964278124e-05, "loss": 0.0023, "num_input_tokens_seen": 743040, "step": 45 }, { "epoch": 0.26936026936026936, "grad_norm": 11.549006462097168, "learning_rate": 1.6699760634000166e-05, "loss": 0.2308, "num_input_tokens_seen": 825600, "step": 50 }, { "epoch": 0.2962962962962963, "grad_norm": 1.6471961736679077, "learning_rate": 1.6042363895210948e-05, "loss": 0.0011, "num_input_tokens_seen": 908160, "step": 55 }, { "epoch": 0.32323232323232326, "grad_norm": 0.02977609820663929, "learning_rate": 1.5340957577859605e-05, "loss": 0.0005, "num_input_tokens_seen": 990720, "step": 60 }, { "epoch": 0.3501683501683502, "grad_norm": 0.17247651517391205, "learning_rate": 1.4600650377311523e-05, "loss": 0.0012, "num_input_tokens_seen": 1073280, "step": 65 }, { "epoch": 0.3771043771043771, "grad_norm": 71.56356811523438, "learning_rate": 1.3826834323650899e-05, "loss": 0.6015, "num_input_tokens_seen": 1155840, "step": 70 }, { "epoch": 0.40404040404040403, "grad_norm": 0.016915204003453255, "learning_rate": 1.302514550881076e-05, "loss": 0.0003, "num_input_tokens_seen": 1238400, "step": 75 }, { "epoch": 0.43097643097643096, "grad_norm": 0.008289608173072338, "learning_rate": 1.2201423036077657e-05, "loss": 0.0002, "num_input_tokens_seen": 1320960, "step": 80 }, { "epoch": 0.45791245791245794, "grad_norm": 0.01277772057801485, "learning_rate": 1.1361666490962468e-05, "loss": 0.0025, "num_input_tokens_seen": 1403520, "step": 85 }, { "epoch": 0.48484848484848486, "grad_norm": 0.006596778519451618, "learning_rate": 1.0511992243198335e-05, "loss": 0.0003, "num_input_tokens_seen": 1486080, "step": 90 }, { "epoch": 0.5117845117845118, "grad_norm": 0.020030811429023743, "learning_rate": 9.658588898140322e-06, "loss": 0.046, "num_input_tokens_seen": 1568640, "step": 95 }, { "epoch": 0.5387205387205387, "grad_norm": 0.08077715337276459, "learning_rate": 8.807672222036692e-06, "loss": 0.0014, "num_input_tokens_seen": 1651200, "step": 100 }, { "epoch": 0.5387205387205387, "eval_loss": 0.00024870518245734274, "eval_runtime": 7.5641, "eval_samples_per_second": 3.966, "eval_steps_per_second": 1.983, "num_input_tokens_seen": 1651200, "step": 100 }, { "epoch": 0.5656565656565656, "grad_norm": 0.011016723699867725, "learning_rate": 7.965439869473664e-06, "loss": 0.0007, "num_input_tokens_seen": 1733760, "step": 105 }, { "epoch": 0.5925925925925926, "grad_norm": 0.23388978838920593, "learning_rate": 7.1380262427365885e-06, "loss": 0.0005, "num_input_tokens_seen": 1816320, "step": 110 }, { "epoch": 0.6195286195286195, "grad_norm": 0.07084622979164124, "learning_rate": 6.331457811869437e-06, "loss": 0.0002, "num_input_tokens_seen": 1898880, "step": 115 }, { "epoch": 0.6464646464646465, "grad_norm": 0.035638194531202316, "learning_rate": 5.55160922085916e-06, "loss": 0.0002, "num_input_tokens_seen": 1981440, "step": 120 }, { "epoch": 0.6734006734006734, "grad_norm": 0.008004716597497463, "learning_rate": 4.804160499645667e-06, "loss": 0.0001, "num_input_tokens_seen": 2064000, "step": 125 }, { "epoch": 0.7003367003367004, "grad_norm": 0.008612624369561672, "learning_rate": 4.094555693603891e-06, "loss": 0.0013, "num_input_tokens_seen": 2146560, "step": 130 }, { "epoch": 0.7272727272727273, "grad_norm": 0.03172842785716057, "learning_rate": 3.4279632118202744e-06, "loss": 0.0001, "num_input_tokens_seen": 2229120, "step": 135 }, { "epoch": 0.7542087542087542, "grad_norm": 0.004970299545675516, "learning_rate": 2.809238182967092e-06, "loss": 0.0002, "num_input_tokens_seen": 2311680, "step": 140 }, { "epoch": 0.7811447811447811, "grad_norm": 0.1790950447320938, "learning_rate": 2.2428870929558012e-06, "loss": 0.0002, "num_input_tokens_seen": 2394240, "step": 145 }, { "epoch": 0.8080808080808081, "grad_norm": 40.12051773071289, "learning_rate": 1.7330349619311415e-06, "loss": 0.0002, "num_input_tokens_seen": 2476800, "step": 150 }, { "epoch": 0.835016835016835, "grad_norm": 0.004456690046936274, "learning_rate": 1.2833952996724864e-06, "loss": 0.0743, "num_input_tokens_seen": 2559360, "step": 155 }, { "epoch": 0.8619528619528619, "grad_norm": 0.046330228447914124, "learning_rate": 8.972430582323788e-07, "loss": 0.0001, "num_input_tokens_seen": 2641920, "step": 160 }, { "epoch": 0.8888888888888888, "grad_norm": 0.015978099778294563, "learning_rate": 5.77390778811796e-07, "loss": 0.0001, "num_input_tokens_seen": 2724480, "step": 165 }, { "epoch": 0.9158249158249159, "grad_norm": 0.014428800903260708, "learning_rate": 3.261681066064859e-07, "loss": 0.0002, "num_input_tokens_seen": 2807040, "step": 170 }, { "epoch": 0.9427609427609428, "grad_norm": 0.03189552202820778, "learning_rate": 1.4540482282803136e-07, "loss": 0.0005, "num_input_tokens_seen": 2889600, "step": 175 }, { "epoch": 0.9696969696969697, "grad_norm": 0.008983040228486061, "learning_rate": 3.641751748600042e-08, "loss": 0.0001, "num_input_tokens_seen": 2972160, "step": 180 }, { "epoch": 0.9966329966329966, "grad_norm": 0.006297091022133827, "learning_rate": 0.0, "loss": 0.2164, "num_input_tokens_seen": 3054720, "step": 185 }, { "epoch": 0.9966329966329966, "num_input_tokens_seen": 3054720, "step": 185, "total_flos": 4.410158569555968e+16, "train_loss": 0.5683108514618456, "train_runtime": 2363.4699, "train_samples_per_second": 1.257, "train_steps_per_second": 0.078 } ], "logging_steps": 5, "max_steps": 185, "num_input_tokens_seen": 3054720, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.410158569555968e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }