|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99775617053104, |
|
"eval_steps": 100, |
|
"global_step": 501, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05983545250560957, |
|
"grad_norm": 7.40893212326487, |
|
"learning_rate": 1.96078431372549e-06, |
|
"loss": 0.6407, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11967090501121914, |
|
"grad_norm": 2.443879636951024, |
|
"learning_rate": 3.92156862745098e-06, |
|
"loss": 0.3887, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17950635751682872, |
|
"grad_norm": 2.182405695242156, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.2616, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2393418100224383, |
|
"grad_norm": 2.01489573168189, |
|
"learning_rate": 7.84313725490196e-06, |
|
"loss": 0.2571, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2991772625280479, |
|
"grad_norm": 1.979013183070111, |
|
"learning_rate": 9.803921568627451e-06, |
|
"loss": 0.2477, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35901271503365745, |
|
"grad_norm": 1.549355728093093, |
|
"learning_rate": 9.990133642141359e-06, |
|
"loss": 0.2176, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 1.4426481204872057, |
|
"learning_rate": 9.95607770125771e-06, |
|
"loss": 0.2273, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4786836200448766, |
|
"grad_norm": 1.7789832633152836, |
|
"learning_rate": 9.89787624799672e-06, |
|
"loss": 0.2125, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5385190725504861, |
|
"grad_norm": 1.529318217095282, |
|
"learning_rate": 9.815812833988292e-06, |
|
"loss": 0.2229, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5983545250560958, |
|
"grad_norm": 1.6259265112226373, |
|
"learning_rate": 9.710287263936485e-06, |
|
"loss": 0.2062, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5983545250560958, |
|
"eval_loss": 0.2123425155878067, |
|
"eval_runtime": 33.4729, |
|
"eval_samples_per_second": 17.776, |
|
"eval_steps_per_second": 8.903, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6581899775617053, |
|
"grad_norm": 1.6245243576341002, |
|
"learning_rate": 9.581813647811199e-06, |
|
"loss": 0.2105, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7180254300673149, |
|
"grad_norm": 1.731561075586601, |
|
"learning_rate": 9.431017896156074e-06, |
|
"loss": 0.2048, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7778608825729244, |
|
"grad_norm": 1.7874480467541498, |
|
"learning_rate": 9.25863467071524e-06, |
|
"loss": 0.2113, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 1.3708463663991368, |
|
"learning_rate": 9.065503805235139e-06, |
|
"loss": 0.1988, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8975317875841436, |
|
"grad_norm": 1.3567660521800535, |
|
"learning_rate": 8.852566213878947e-06, |
|
"loss": 0.2038, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9573672400897532, |
|
"grad_norm": 1.8281708498422444, |
|
"learning_rate": 8.620859307187339e-06, |
|
"loss": 0.2196, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0172026925953628, |
|
"grad_norm": 1.2318054900550177, |
|
"learning_rate": 8.371511937918616e-06, |
|
"loss": 0.1762, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0770381451009723, |
|
"grad_norm": 1.568321912319435, |
|
"learning_rate": 8.105738901391553e-06, |
|
"loss": 0.1288, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.136873597606582, |
|
"grad_norm": 1.3819346363939895, |
|
"learning_rate": 7.82483501712469e-06, |
|
"loss": 0.1214, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1967090501121915, |
|
"grad_norm": 1.2680685647450163, |
|
"learning_rate": 7.530168820605819e-06, |
|
"loss": 0.1256, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.1967090501121915, |
|
"eval_loss": 0.20358169078826904, |
|
"eval_runtime": 32.7594, |
|
"eval_samples_per_second": 18.163, |
|
"eval_steps_per_second": 9.097, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.256544502617801, |
|
"grad_norm": 1.2942802177914767, |
|
"learning_rate": 7.223175895924638e-06, |
|
"loss": 0.1241, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3163799551234106, |
|
"grad_norm": 1.4364370392498633, |
|
"learning_rate": 6.905351881751372e-06, |
|
"loss": 0.1254, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.37621540762902, |
|
"grad_norm": 1.330811194933078, |
|
"learning_rate": 6.578245184735513e-06, |
|
"loss": 0.1229, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4360508601346298, |
|
"grad_norm": 1.304831888309303, |
|
"learning_rate": 6.243449435824276e-06, |
|
"loss": 0.1147, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4958863126402393, |
|
"grad_norm": 1.2398683599838292, |
|
"learning_rate": 5.902595726252801e-06, |
|
"loss": 0.1345, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.555721765145849, |
|
"grad_norm": 1.3240317320353998, |
|
"learning_rate": 5.557344661031628e-06, |
|
"loss": 0.1236, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6155572176514585, |
|
"grad_norm": 1.518581095835922, |
|
"learning_rate": 5.209378268645998e-06, |
|
"loss": 0.1218, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"grad_norm": 1.5653129689570715, |
|
"learning_rate": 4.860391806382157e-06, |
|
"loss": 0.1246, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7352281226626776, |
|
"grad_norm": 1.4836280079781416, |
|
"learning_rate": 4.512085501204254e-06, |
|
"loss": 0.1156, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.795063575168287, |
|
"grad_norm": 1.4998045733125407, |
|
"learning_rate": 4.166156266419489e-06, |
|
"loss": 0.1296, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.795063575168287, |
|
"eval_loss": 0.19370371103286743, |
|
"eval_runtime": 33.117, |
|
"eval_samples_per_second": 17.967, |
|
"eval_steps_per_second": 8.998, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8548990276738968, |
|
"grad_norm": 1.426847361331521, |
|
"learning_rate": 3.82428943448705e-06, |
|
"loss": 0.1294, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9147344801795063, |
|
"grad_norm": 1.1812939999353123, |
|
"learning_rate": 3.488150546247778e-06, |
|
"loss": 0.1219, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.974569932685116, |
|
"grad_norm": 1.071812010046448, |
|
"learning_rate": 3.1593772365766107e-06, |
|
"loss": 0.1106, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0344053851907256, |
|
"grad_norm": 0.9913151474800547, |
|
"learning_rate": 2.839571255990088e-06, |
|
"loss": 0.0851, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.094240837696335, |
|
"grad_norm": 1.0937548000001698, |
|
"learning_rate": 2.5302906670788463e-06, |
|
"loss": 0.0621, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.1540762902019446, |
|
"grad_norm": 1.1416547973943143, |
|
"learning_rate": 2.23304225378328e-06, |
|
"loss": 0.0662, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.213911742707554, |
|
"grad_norm": 1.2971227147360092, |
|
"learning_rate": 1.9492741804936623e-06, |
|
"loss": 0.0623, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.273747195213164, |
|
"grad_norm": 1.0599796926376819, |
|
"learning_rate": 1.680368936738792e-06, |
|
"loss": 0.0604, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.3335826477187736, |
|
"grad_norm": 1.0516418140346255, |
|
"learning_rate": 1.4276366018359845e-06, |
|
"loss": 0.0605, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.393418100224383, |
|
"grad_norm": 1.1322674065288456, |
|
"learning_rate": 1.1923084623163172e-06, |
|
"loss": 0.0592, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.393418100224383, |
|
"eval_loss": 0.21845205128192902, |
|
"eval_runtime": 33.1015, |
|
"eval_samples_per_second": 17.975, |
|
"eval_steps_per_second": 9.003, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4532535527299926, |
|
"grad_norm": 1.070168334944103, |
|
"learning_rate": 9.7553101322043e-07, |
|
"loss": 0.0595, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.513089005235602, |
|
"grad_norm": 1.1779521251420957, |
|
"learning_rate": 7.783603724899258e-07, |
|
"loss": 0.0593, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.5729244577412116, |
|
"grad_norm": 1.0392248842745917, |
|
"learning_rate": 6.017571356669183e-07, |
|
"loss": 0.0588, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.632759910246821, |
|
"grad_norm": 1.1454410326378197, |
|
"learning_rate": 4.4658169596911493e-07, |
|
"loss": 0.0599, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.6925953627524306, |
|
"grad_norm": 1.2414977662809759, |
|
"learning_rate": 3.135900525405428e-07, |
|
"loss": 0.0596, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.75243081525804, |
|
"grad_norm": 0.9458175216720401, |
|
"learning_rate": 2.0343012729971244e-07, |
|
"loss": 0.0561, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.81226626776365, |
|
"grad_norm": 1.2677156882489358, |
|
"learning_rate": 1.166386083291604e-07, |
|
"loss": 0.0566, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.8721017202692596, |
|
"grad_norm": 1.0894838908576727, |
|
"learning_rate": 5.363833518505834e-08, |
|
"loss": 0.0608, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.931937172774869, |
|
"grad_norm": 0.8300865767642356, |
|
"learning_rate": 1.4736238865398766e-08, |
|
"loss": 0.0548, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.9917726252804786, |
|
"grad_norm": 1.1224586519889344, |
|
"learning_rate": 1.2184647302626585e-10, |
|
"loss": 0.0646, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.9917726252804786, |
|
"eval_loss": 0.21988336741924286, |
|
"eval_runtime": 33.0412, |
|
"eval_samples_per_second": 18.008, |
|
"eval_steps_per_second": 9.019, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.99775617053104, |
|
"step": 501, |
|
"total_flos": 9861900926976.0, |
|
"train_loss": 0.14637824013353345, |
|
"train_runtime": 2877.8871, |
|
"train_samples_per_second": 5.575, |
|
"train_steps_per_second": 0.174 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 501, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9861900926976.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|