{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.602890571231933, "eval_steps": 500, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1720578114246387, "grad_norm": 2.9087114334106445, "learning_rate": 2.867630190410645e-06, "loss": 8.3862, "step": 500 }, { "epoch": 0.3441156228492774, "grad_norm": 2.4263219833374023, "learning_rate": 5.73526038082129e-06, "loss": 6.9689, "step": 1000 }, { "epoch": 0.516173434273916, "grad_norm": 4.725559234619141, "learning_rate": 8.602890571231935e-06, "loss": 5.7891, "step": 1500 }, { "epoch": 0.6882312456985548, "grad_norm": 3.8100438117980957, "learning_rate": 1.147052076164258e-05, "loss": 4.8611, "step": 2000 }, { "epoch": 0.8602890571231934, "grad_norm": 2.8300278186798096, "learning_rate": 1.4338150952053223e-05, "loss": 4.2645, "step": 2500 }, { "epoch": 1.032346868547832, "grad_norm": 3.759568929672241, "learning_rate": 1.720578114246387e-05, "loss": 3.8533, "step": 3000 }, { "epoch": 1.2044046799724708, "grad_norm": 3.1716251373291016, "learning_rate": 2.0073411332874516e-05, "loss": 3.5417, "step": 3500 }, { "epoch": 1.3764624913971093, "grad_norm": 3.578864812850952, "learning_rate": 2.294104152328516e-05, "loss": 3.2905, "step": 4000 }, { "epoch": 1.548520302821748, "grad_norm": 2.953669309616089, "learning_rate": 2.5808671713695802e-05, "loss": 3.0965, "step": 4500 }, { "epoch": 1.7205781142463867, "grad_norm": 3.457939863204956, "learning_rate": 2.8676301904106445e-05, "loss": 2.9353, "step": 5000 }, { "epoch": 1.8926359256710255, "grad_norm": 2.893206834793091, "learning_rate": 3.1543932094517095e-05, "loss": 2.8001, "step": 5500 }, { "epoch": 2.064693737095664, "grad_norm": 2.9178900718688965, "learning_rate": 3.441156228492774e-05, "loss": 2.6832, "step": 6000 }, { "epoch": 2.236751548520303, "grad_norm": 2.7625951766967773, "learning_rate": 3.7273457214957565e-05, "loss": 2.5764, "step": 6500 }, { "epoch": 2.4088093599449416, "grad_norm": 2.936638116836548, "learning_rate": 4.01410874053682e-05, "loss": 2.4878, "step": 7000 }, { "epoch": 2.5808671713695803, "grad_norm": 2.8190789222717285, "learning_rate": 4.300871759577885e-05, "loss": 2.4027, "step": 7500 }, { "epoch": 2.7529249827942186, "grad_norm": 2.407775402069092, "learning_rate": 4.5876347786189495e-05, "loss": 2.3358, "step": 8000 }, { "epoch": 2.9249827942188578, "grad_norm": 2.336435317993164, "learning_rate": 4.8738242716219315e-05, "loss": 2.2753, "step": 8500 }, { "epoch": 3.097040605643496, "grad_norm": 2.211622953414917, "learning_rate": 5.160587290662996e-05, "loss": 2.2115, "step": 9000 }, { "epoch": 3.2690984170681348, "grad_norm": 2.3094568252563477, "learning_rate": 5.4473503097040615e-05, "loss": 2.1567, "step": 9500 }, { "epoch": 3.4411562284927735, "grad_norm": 2.2486841678619385, "learning_rate": 5.734113328745125e-05, "loss": 2.1201, "step": 10000 }, { "epoch": 3.613214039917412, "grad_norm": 2.023249864578247, "learning_rate": 6.020302821748107e-05, "loss": 2.0788, "step": 10500 }, { "epoch": 3.785271851342051, "grad_norm": 2.1235194206237793, "learning_rate": 6.307065840789173e-05, "loss": 2.0441, "step": 11000 }, { "epoch": 3.9573296627666896, "grad_norm": 2.0521788597106934, "learning_rate": 6.593828859830237e-05, "loss": 2.0071, "step": 11500 }, { "epoch": 4.129387474191328, "grad_norm": 2.0152077674865723, "learning_rate": 6.880591878871302e-05, "loss": 1.9692, "step": 12000 }, { "epoch": 4.301445285615967, "grad_norm": 1.9169940948486328, 
"learning_rate": 7.167354897912365e-05, "loss": 1.9382, "step": 12500 }, { "epoch": 4.473503097040606, "grad_norm": 1.7628319263458252, "learning_rate": 7.45411791695343e-05, "loss": 1.9156, "step": 13000 }, { "epoch": 4.645560908465244, "grad_norm": 1.7819803953170776, "learning_rate": 7.740880935994495e-05, "loss": 1.8907, "step": 13500 }, { "epoch": 4.817618719889883, "grad_norm": 1.6823229789733887, "learning_rate": 8.027643955035559e-05, "loss": 1.8663, "step": 14000 }, { "epoch": 4.9896765313145215, "grad_norm": 1.6162245273590088, "learning_rate": 8.313833448038542e-05, "loss": 1.8432, "step": 14500 }, { "epoch": 5.161734342739161, "grad_norm": 1.6801457405090332, "learning_rate": 8.600596467079606e-05, "loss": 1.8093, "step": 15000 }, { "epoch": 5.333792154163799, "grad_norm": 1.5677675008773804, "learning_rate": 8.88735948612067e-05, "loss": 1.7915, "step": 15500 }, { "epoch": 5.505849965588438, "grad_norm": 1.5037736892700195, "learning_rate": 9.174122505161736e-05, "loss": 1.7751, "step": 16000 }, { "epoch": 5.677907777013076, "grad_norm": 1.515109658241272, "learning_rate": 9.460311998164716e-05, "loss": 1.7607, "step": 16500 }, { "epoch": 5.8499655884377155, "grad_norm": 1.44197678565979, "learning_rate": 9.747075017205781e-05, "loss": 1.7413, "step": 17000 }, { "epoch": 6.022023399862354, "grad_norm": 1.3949217796325684, "learning_rate": 9.99994810855854e-05, "loss": 1.7247, "step": 17500 }, { "epoch": 6.194081211286992, "grad_norm": 1.3660074472427368, "learning_rate": 9.995342545257314e-05, "loss": 1.6924, "step": 18000 }, { "epoch": 6.366139022711631, "grad_norm": 1.3406497240066528, "learning_rate": 9.983322835071715e-05, "loss": 1.6832, "step": 18500 }, { "epoch": 6.5381968341362695, "grad_norm": 1.3978294134140015, "learning_rate": 9.963858724454791e-05, "loss": 1.667, "step": 19000 }, { "epoch": 6.710254645560909, "grad_norm": 1.2904192209243774, "learning_rate": 9.936995871142264e-05, "loss": 1.6525, "step": 19500 }, { "epoch": 6.882312456985547, "grad_norm": 1.2701600790023804, "learning_rate": 9.90277431481914e-05, "loss": 1.6389, "step": 20000 }, { "epoch": 7.054370268410186, "grad_norm": 1.305159568786621, "learning_rate": 9.861245063482206e-05, "loss": 1.6201, "step": 20500 }, { "epoch": 7.226428079834824, "grad_norm": 1.2807930707931519, "learning_rate": 9.812574752273107e-05, "loss": 1.5973, "step": 21000 }, { "epoch": 7.3984858912594635, "grad_norm": 1.1819531917572021, "learning_rate": 9.756640878275985e-05, "loss": 1.589, "step": 21500 }, { "epoch": 7.570543702684102, "grad_norm": 1.2513145208358765, "learning_rate": 9.693617124427856e-05, "loss": 1.574, "step": 22000 }, { "epoch": 7.74260151410874, "grad_norm": 1.2013038396835327, "learning_rate": 9.623597429046535e-05, "loss": 1.5713, "step": 22500 }, { "epoch": 7.914659325533379, "grad_norm": 1.1914407014846802, "learning_rate": 9.546846783889485e-05, "loss": 1.5583, "step": 23000 }, { "epoch": 8.086717136958018, "grad_norm": 1.1989445686340332, "learning_rate": 9.463172007988832e-05, "loss": 1.5376, "step": 23500 }, { "epoch": 8.258774948382657, "grad_norm": 1.1866331100463867, "learning_rate": 9.372844774166556e-05, "loss": 1.5211, "step": 24000 }, { "epoch": 8.430832759807295, "grad_norm": 1.1914212703704834, "learning_rate": 9.275999717189387e-05, "loss": 1.5163, "step": 24500 }, { "epoch": 8.602890571231933, "grad_norm": 1.153332233428955, "learning_rate": 9.172993883873278e-05, "loss": 1.5072, "step": 25000 } ], "logging_steps": 500, "max_steps": 58120, "num_input_tokens_seen": 0, 
"num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.046659236626432e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }