{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99775617053104, "eval_steps": 100, "global_step": 501, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05983545250560957, "grad_norm": 7.40893212326487, "learning_rate": 1.96078431372549e-06, "loss": 0.6407, "step": 10 }, { "epoch": 0.11967090501121914, "grad_norm": 2.443879636951024, "learning_rate": 3.92156862745098e-06, "loss": 0.3887, "step": 20 }, { "epoch": 0.17950635751682872, "grad_norm": 2.182405695242156, "learning_rate": 5.882352941176471e-06, "loss": 0.2616, "step": 30 }, { "epoch": 0.2393418100224383, "grad_norm": 2.01489573168189, "learning_rate": 7.84313725490196e-06, "loss": 0.2571, "step": 40 }, { "epoch": 0.2991772625280479, "grad_norm": 1.979013183070111, "learning_rate": 9.803921568627451e-06, "loss": 0.2477, "step": 50 }, { "epoch": 0.35901271503365745, "grad_norm": 1.549355728093093, "learning_rate": 9.990133642141359e-06, "loss": 0.2176, "step": 60 }, { "epoch": 0.418848167539267, "grad_norm": 1.4426481204872057, "learning_rate": 9.95607770125771e-06, "loss": 0.2273, "step": 70 }, { "epoch": 0.4786836200448766, "grad_norm": 1.7789832633152836, "learning_rate": 9.89787624799672e-06, "loss": 0.2125, "step": 80 }, { "epoch": 0.5385190725504861, "grad_norm": 1.529318217095282, "learning_rate": 9.815812833988292e-06, "loss": 0.2229, "step": 90 }, { "epoch": 0.5983545250560958, "grad_norm": 1.6259265112226373, "learning_rate": 9.710287263936485e-06, "loss": 0.2062, "step": 100 }, { "epoch": 0.5983545250560958, "eval_loss": 0.2123425155878067, "eval_runtime": 33.4729, "eval_samples_per_second": 17.776, "eval_steps_per_second": 8.903, "step": 100 }, { "epoch": 0.6581899775617053, "grad_norm": 1.6245243576341002, "learning_rate": 9.581813647811199e-06, "loss": 0.2105, "step": 110 }, { "epoch": 0.7180254300673149, "grad_norm": 1.731561075586601, "learning_rate": 9.431017896156074e-06, "loss": 0.2048, "step": 120 }, { "epoch": 0.7778608825729244, "grad_norm": 1.7874480467541498, "learning_rate": 9.25863467071524e-06, "loss": 0.2113, "step": 130 }, { "epoch": 0.837696335078534, "grad_norm": 1.3708463663991368, "learning_rate": 9.065503805235139e-06, "loss": 0.1988, "step": 140 }, { "epoch": 0.8975317875841436, "grad_norm": 1.3567660521800535, "learning_rate": 8.852566213878947e-06, "loss": 0.2038, "step": 150 }, { "epoch": 0.9573672400897532, "grad_norm": 1.8281708498422444, "learning_rate": 8.620859307187339e-06, "loss": 0.2196, "step": 160 }, { "epoch": 1.0172026925953628, "grad_norm": 1.2318054900550177, "learning_rate": 8.371511937918616e-06, "loss": 0.1762, "step": 170 }, { "epoch": 1.0770381451009723, "grad_norm": 1.568321912319435, "learning_rate": 8.105738901391553e-06, "loss": 0.1288, "step": 180 }, { "epoch": 1.136873597606582, "grad_norm": 1.3819346363939895, "learning_rate": 7.82483501712469e-06, "loss": 0.1214, "step": 190 }, { "epoch": 1.1967090501121915, "grad_norm": 1.2680685647450163, "learning_rate": 7.530168820605819e-06, "loss": 0.1256, "step": 200 }, { "epoch": 1.1967090501121915, "eval_loss": 0.20358169078826904, "eval_runtime": 32.7594, "eval_samples_per_second": 18.163, "eval_steps_per_second": 9.097, "step": 200 }, { "epoch": 1.256544502617801, "grad_norm": 1.2942802177914767, "learning_rate": 7.223175895924638e-06, "loss": 0.1241, "step": 210 }, { "epoch": 1.3163799551234106, "grad_norm": 1.4364370392498633, "learning_rate": 6.905351881751372e-06, "loss": 0.1254, "step": 220 }, { "epoch": 1.37621540762902, "grad_norm": 1.330811194933078, "learning_rate": 6.578245184735513e-06, "loss": 0.1229, "step": 230 }, { "epoch": 1.4360508601346298, "grad_norm": 1.304831888309303, "learning_rate": 6.243449435824276e-06, "loss": 0.1147, "step": 240 }, { "epoch": 1.4958863126402393, "grad_norm": 1.2398683599838292, "learning_rate": 5.902595726252801e-06, "loss": 0.1345, "step": 250 }, { "epoch": 1.555721765145849, "grad_norm": 1.3240317320353998, "learning_rate": 5.557344661031628e-06, "loss": 0.1236, "step": 260 }, { "epoch": 1.6155572176514585, "grad_norm": 1.518581095835922, "learning_rate": 5.209378268645998e-06, "loss": 0.1218, "step": 270 }, { "epoch": 1.675392670157068, "grad_norm": 1.5653129689570715, "learning_rate": 4.860391806382157e-06, "loss": 0.1246, "step": 280 }, { "epoch": 1.7352281226626776, "grad_norm": 1.4836280079781416, "learning_rate": 4.512085501204254e-06, "loss": 0.1156, "step": 290 }, { "epoch": 1.795063575168287, "grad_norm": 1.4998045733125407, "learning_rate": 4.166156266419489e-06, "loss": 0.1296, "step": 300 }, { "epoch": 1.795063575168287, "eval_loss": 0.19370371103286743, "eval_runtime": 33.117, "eval_samples_per_second": 17.967, "eval_steps_per_second": 8.998, "step": 300 }, { "epoch": 1.8548990276738968, "grad_norm": 1.426847361331521, "learning_rate": 3.82428943448705e-06, "loss": 0.1294, "step": 310 }, { "epoch": 1.9147344801795063, "grad_norm": 1.1812939999353123, "learning_rate": 3.488150546247778e-06, "loss": 0.1219, "step": 320 }, { "epoch": 1.974569932685116, "grad_norm": 1.071812010046448, "learning_rate": 3.1593772365766107e-06, "loss": 0.1106, "step": 330 }, { "epoch": 2.0344053851907256, "grad_norm": 0.9913151474800547, "learning_rate": 2.839571255990088e-06, "loss": 0.0851, "step": 340 }, { "epoch": 2.094240837696335, "grad_norm": 1.0937548000001698, "learning_rate": 2.5302906670788463e-06, "loss": 0.0621, "step": 350 }, { "epoch": 2.1540762902019446, "grad_norm": 1.1416547973943143, "learning_rate": 2.23304225378328e-06, "loss": 0.0662, "step": 360 }, { "epoch": 2.213911742707554, "grad_norm": 1.2971227147360092, "learning_rate": 1.9492741804936623e-06, "loss": 0.0623, "step": 370 }, { "epoch": 2.273747195213164, "grad_norm": 1.0599796926376819, "learning_rate": 1.680368936738792e-06, "loss": 0.0604, "step": 380 }, { "epoch": 2.3335826477187736, "grad_norm": 1.0516418140346255, "learning_rate": 1.4276366018359845e-06, "loss": 0.0605, "step": 390 }, { "epoch": 2.393418100224383, "grad_norm": 1.1322674065288456, "learning_rate": 1.1923084623163172e-06, "loss": 0.0592, "step": 400 }, { "epoch": 2.393418100224383, "eval_loss": 0.21845205128192902, "eval_runtime": 33.1015, "eval_samples_per_second": 17.975, "eval_steps_per_second": 9.003, "step": 400 }, { "epoch": 2.4532535527299926, "grad_norm": 1.070168334944103, "learning_rate": 9.7553101322043e-07, "loss": 0.0595, "step": 410 }, { "epoch": 2.513089005235602, "grad_norm": 1.1779521251420957, "learning_rate": 7.783603724899258e-07, "loss": 0.0593, "step": 420 }, { "epoch": 2.5729244577412116, "grad_norm": 1.0392248842745917, "learning_rate": 6.017571356669183e-07, "loss": 0.0588, "step": 430 }, { "epoch": 2.632759910246821, "grad_norm": 1.1454410326378197, "learning_rate": 4.4658169596911493e-07, "loss": 0.0599, "step": 440 }, { "epoch": 2.6925953627524306, "grad_norm": 1.2414977662809759, "learning_rate": 3.135900525405428e-07, "loss": 0.0596, "step": 450 }, { "epoch": 2.75243081525804, "grad_norm": 0.9458175216720401, "learning_rate": 2.0343012729971244e-07, "loss": 0.0561, "step": 460 }, { "epoch": 2.81226626776365, "grad_norm": 1.2677156882489358, "learning_rate": 1.166386083291604e-07, "loss": 0.0566, "step": 470 }, { "epoch": 2.8721017202692596, "grad_norm": 1.0894838908576727, "learning_rate": 5.363833518505834e-08, "loss": 0.0608, "step": 480 }, { "epoch": 2.931937172774869, "grad_norm": 0.8300865767642356, "learning_rate": 1.4736238865398766e-08, "loss": 0.0548, "step": 490 }, { "epoch": 2.9917726252804786, "grad_norm": 1.1224586519889344, "learning_rate": 1.2184647302626585e-10, "loss": 0.0646, "step": 500 }, { "epoch": 2.9917726252804786, "eval_loss": 0.21988336741924286, "eval_runtime": 33.0412, "eval_samples_per_second": 18.008, "eval_steps_per_second": 9.019, "step": 500 }, { "epoch": 2.99775617053104, "step": 501, "total_flos": 9861900926976.0, "train_loss": 0.14637824013353345, "train_runtime": 2877.8871, "train_samples_per_second": 5.575, "train_steps_per_second": 0.174 } ], "logging_steps": 10, "max_steps": 501, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9861900926976.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }