{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 2230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004484304932735426,
"grad_norm": 4.696539476451585,
"learning_rate": 1.3452914798206278e-08,
"loss": 0.9912,
"step": 1
},
{
"epoch": 0.004484304932735426,
"grad_norm": 5.089904667658368,
"learning_rate": 1.345291479820628e-07,
"loss": 1.0341,
"step": 10
},
{
"epoch": 0.008968609865470852,
"grad_norm": 5.546828630388097,
"learning_rate": 2.690582959641256e-07,
"loss": 1.0502,
"step": 20
},
{
"epoch": 0.013452914798206279,
"grad_norm": 4.113849381101499,
"learning_rate": 4.0358744394618834e-07,
"loss": 1.0386,
"step": 30
},
{
"epoch": 0.017937219730941704,
"grad_norm": 3.6548963622814887,
"learning_rate": 5.381165919282512e-07,
"loss": 1.0282,
"step": 40
},
{
"epoch": 0.02242152466367713,
"grad_norm": 2.157564670206396,
"learning_rate": 6.72645739910314e-07,
"loss": 0.9574,
"step": 50
},
{
"epoch": 0.026905829596412557,
"grad_norm": 2.0184475272019555,
"learning_rate": 8.071748878923767e-07,
"loss": 0.9263,
"step": 60
},
{
"epoch": 0.03139013452914798,
"grad_norm": 1.7894937443172652,
"learning_rate": 9.417040358744395e-07,
"loss": 0.9253,
"step": 70
},
{
"epoch": 0.03587443946188341,
"grad_norm": 1.6533764414432808,
"learning_rate": 1.0762331838565023e-06,
"loss": 0.9106,
"step": 80
},
{
"epoch": 0.04035874439461883,
"grad_norm": 1.9561381307359194,
"learning_rate": 1.2107623318385651e-06,
"loss": 0.8713,
"step": 90
},
{
"epoch": 0.04484304932735426,
"grad_norm": 1.5478472557018526,
"learning_rate": 1.345291479820628e-06,
"loss": 0.8741,
"step": 100
},
{
"epoch": 0.04484304932735426,
"eval_loss": 0.8599640727043152,
"eval_runtime": 430.7233,
"eval_samples_per_second": 116.263,
"eval_steps_per_second": 1.818,
"step": 100
},
{
"epoch": 0.04932735426008968,
"grad_norm": 1.5759592930264636,
"learning_rate": 1.4798206278026905e-06,
"loss": 0.8381,
"step": 110
},
{
"epoch": 0.053811659192825115,
"grad_norm": 1.5446577353242628,
"learning_rate": 1.6143497757847533e-06,
"loss": 0.8151,
"step": 120
},
{
"epoch": 0.05829596412556054,
"grad_norm": 1.6899841974229757,
"learning_rate": 1.7488789237668162e-06,
"loss": 0.8309,
"step": 130
},
{
"epoch": 0.06278026905829596,
"grad_norm": 1.6274283098945213,
"learning_rate": 1.883408071748879e-06,
"loss": 0.8509,
"step": 140
},
{
"epoch": 0.06726457399103139,
"grad_norm": 1.7690619100525546,
"learning_rate": 2.0179372197309418e-06,
"loss": 0.8057,
"step": 150
},
{
"epoch": 0.07174887892376682,
"grad_norm": 1.866473004768342,
"learning_rate": 2.1524663677130046e-06,
"loss": 0.8236,
"step": 160
},
{
"epoch": 0.07623318385650224,
"grad_norm": 1.5528009019380091,
"learning_rate": 2.2869955156950674e-06,
"loss": 0.7936,
"step": 170
},
{
"epoch": 0.08071748878923767,
"grad_norm": 1.8924349879943885,
"learning_rate": 2.4215246636771302e-06,
"loss": 0.8054,
"step": 180
},
{
"epoch": 0.08520179372197309,
"grad_norm": 1.5998254884542162,
"learning_rate": 2.556053811659193e-06,
"loss": 0.7971,
"step": 190
},
{
"epoch": 0.08968609865470852,
"grad_norm": 1.553085624058612,
"learning_rate": 2.690582959641256e-06,
"loss": 0.8038,
"step": 200
},
{
"epoch": 0.08968609865470852,
"eval_loss": 0.8094644546508789,
"eval_runtime": 412.1717,
"eval_samples_per_second": 121.495,
"eval_steps_per_second": 1.9,
"step": 200
},
{
"epoch": 0.09417040358744394,
"grad_norm": 1.6621378080442881,
"learning_rate": 2.8251121076233187e-06,
"loss": 0.7815,
"step": 210
},
{
"epoch": 0.09865470852017937,
"grad_norm": 1.5875832641605891,
"learning_rate": 2.959641255605381e-06,
"loss": 0.8088,
"step": 220
},
{
"epoch": 0.1031390134529148,
"grad_norm": 1.6006597094640902,
"learning_rate": 2.99990995533251e-06,
"loss": 0.8141,
"step": 230
},
{
"epoch": 0.10762331838565023,
"grad_norm": 1.7932554350094232,
"learning_rate": 2.9994689462512194e-06,
"loss": 0.7834,
"step": 240
},
{
"epoch": 0.11210762331838565,
"grad_norm": 1.6444723214299724,
"learning_rate": 2.998660541859271e-06,
"loss": 0.7797,
"step": 250
},
{
"epoch": 0.11659192825112108,
"grad_norm": 1.790145213655978,
"learning_rate": 2.9974849402294452e-06,
"loss": 0.8046,
"step": 260
},
{
"epoch": 0.1210762331838565,
"grad_norm": 1.8694283184605,
"learning_rate": 2.9959424294040703e-06,
"loss": 0.7802,
"step": 270
},
{
"epoch": 0.12556053811659193,
"grad_norm": 1.6030839509233756,
"learning_rate": 2.9940333873244464e-06,
"loss": 0.8032,
"step": 280
},
{
"epoch": 0.13004484304932734,
"grad_norm": 1.664910362160235,
"learning_rate": 2.991758281738245e-06,
"loss": 0.7802,
"step": 290
},
{
"epoch": 0.13452914798206278,
"grad_norm": 1.6726792291262853,
"learning_rate": 2.989117670084902e-06,
"loss": 0.7937,
"step": 300
},
{
"epoch": 0.13452914798206278,
"eval_loss": 0.7789004445075989,
"eval_runtime": 410.6605,
"eval_samples_per_second": 121.943,
"eval_steps_per_second": 1.907,
"step": 300
},
{
"epoch": 0.13901345291479822,
"grad_norm": 1.4685211047526556,
"learning_rate": 2.986112199359036e-06,
"loss": 0.7486,
"step": 310
},
{
"epoch": 0.14349775784753363,
"grad_norm": 2.0076694355781575,
"learning_rate": 2.9827426059519237e-06,
"loss": 0.808,
"step": 320
},
{
"epoch": 0.14798206278026907,
"grad_norm": 1.557780179088859,
"learning_rate": 2.9790097154710697e-06,
"loss": 0.7849,
"step": 330
},
{
"epoch": 0.15246636771300448,
"grad_norm": 1.3610248283116362,
"learning_rate": 2.9749144425379216e-06,
"loss": 0.7696,
"step": 340
},
{
"epoch": 0.15695067264573992,
"grad_norm": 1.5050628258310632,
"learning_rate": 2.9704577905637718e-06,
"loss": 0.7497,
"step": 350
},
{
"epoch": 0.16143497757847533,
"grad_norm": 1.4313536098763806,
"learning_rate": 2.9656408515039017e-06,
"loss": 0.7544,
"step": 360
},
{
"epoch": 0.16591928251121077,
"grad_norm": 1.6003065628553548,
"learning_rate": 2.9604648055900368e-06,
"loss": 0.7648,
"step": 370
},
{
"epoch": 0.17040358744394618,
"grad_norm": 1.633334409956319,
"learning_rate": 2.9549309210411697e-06,
"loss": 0.7471,
"step": 380
},
{
"epoch": 0.17488789237668162,
"grad_norm": 1.5700271693529286,
"learning_rate": 2.949040553752826e-06,
"loss": 0.8009,
"step": 390
},
{
"epoch": 0.17937219730941703,
"grad_norm": 1.4854276734758955,
"learning_rate": 2.9427951469648425e-06,
"loss": 0.7712,
"step": 400
},
{
"epoch": 0.17937219730941703,
"eval_loss": 0.7643527388572693,
"eval_runtime": 413.4678,
"eval_samples_per_second": 121.115,
"eval_steps_per_second": 1.894,
"step": 400
},
{
"epoch": 0.18385650224215247,
"grad_norm": 1.4160940764229815,
"learning_rate": 2.936196230907755e-06,
"loss": 0.7532,
"step": 410
},
{
"epoch": 0.18834080717488788,
"grad_norm": 1.4265290618310995,
"learning_rate": 2.929245422427861e-06,
"loss": 0.7703,
"step": 420
},
{
"epoch": 0.19282511210762332,
"grad_norm": 1.6899882763333507,
"learning_rate": 2.9219444245910674e-06,
"loss": 0.7919,
"step": 430
},
{
"epoch": 0.19730941704035873,
"grad_norm": 1.4186337044303068,
"learning_rate": 2.9142950262656098e-06,
"loss": 0.7477,
"step": 440
},
{
"epoch": 0.20179372197309417,
"grad_norm": 1.4178331376670448,
"learning_rate": 2.9062991016837496e-06,
"loss": 0.7734,
"step": 450
},
{
"epoch": 0.2062780269058296,
"grad_norm": 1.4503162574851487,
"learning_rate": 2.897958609982556e-06,
"loss": 0.7447,
"step": 460
},
{
"epoch": 0.21076233183856502,
"grad_norm": 1.558520612711291,
"learning_rate": 2.8892755947238818e-06,
"loss": 0.741,
"step": 470
},
{
"epoch": 0.21524663677130046,
"grad_norm": 1.4382572158325275,
"learning_rate": 2.8802521833936595e-06,
"loss": 0.7563,
"step": 480
},
{
"epoch": 0.21973094170403587,
"grad_norm": 1.5964216489171685,
"learning_rate": 2.870890586880629e-06,
"loss": 0.7554,
"step": 490
},
{
"epoch": 0.2242152466367713,
"grad_norm": 1.496069010720812,
"learning_rate": 2.8611930989346322e-06,
"loss": 0.7393,
"step": 500
},
{
"epoch": 0.2242152466367713,
"eval_loss": 0.7564548254013062,
"eval_runtime": 408.8965,
"eval_samples_per_second": 122.469,
"eval_steps_per_second": 1.915,
"step": 500
},
{
"epoch": 0.22869955156950672,
"grad_norm": 1.4866290735466012,
"learning_rate": 2.851162095604607e-06,
"loss": 0.7499,
"step": 510
},
{
"epoch": 0.23318385650224216,
"grad_norm": 1.3341919240907245,
"learning_rate": 2.8408000346564136e-06,
"loss": 0.7524,
"step": 520
},
{
"epoch": 0.23766816143497757,
"grad_norm": 1.6374942242171213,
"learning_rate": 2.8301094549706405e-06,
"loss": 0.7386,
"step": 530
},
{
"epoch": 0.242152466367713,
"grad_norm": 1.6225803035616944,
"learning_rate": 2.8190929759205366e-06,
"loss": 0.7616,
"step": 540
},
{
"epoch": 0.24663677130044842,
"grad_norm": 1.4683777464043755,
"learning_rate": 2.807753296730219e-06,
"loss": 0.7564,
"step": 550
},
{
"epoch": 0.25112107623318386,
"grad_norm": 1.350460716883926,
"learning_rate": 2.7960931958133183e-06,
"loss": 0.7424,
"step": 560
},
{
"epoch": 0.2556053811659193,
"grad_norm": 1.522474854464212,
"learning_rate": 2.7841155300922202e-06,
"loss": 0.7331,
"step": 570
},
{
"epoch": 0.2600896860986547,
"grad_norm": 1.448720887976205,
"learning_rate": 2.7718232342980693e-06,
"loss": 0.7657,
"step": 580
},
{
"epoch": 0.2645739910313901,
"grad_norm": 1.6744619426337854,
"learning_rate": 2.759219320251714e-06,
"loss": 0.7363,
"step": 590
},
{
"epoch": 0.26905829596412556,
"grad_norm": 1.3585539591402243,
"learning_rate": 2.7463068761257554e-06,
"loss": 0.7458,
"step": 600
},
{
"epoch": 0.26905829596412556,
"eval_loss": 0.7505608797073364,
"eval_runtime": 408.9234,
"eval_samples_per_second": 122.461,
"eval_steps_per_second": 1.915,
"step": 600
},
{
"epoch": 0.273542600896861,
"grad_norm": 1.580932873164111,
"learning_rate": 2.7330890656878943e-06,
"loss": 0.7565,
"step": 610
},
{
"epoch": 0.27802690582959644,
"grad_norm": 1.5329888412189265,
"learning_rate": 2.7195691275257547e-06,
"loss": 0.7457,
"step": 620
},
{
"epoch": 0.2825112107623318,
"grad_norm": 1.6754413400622026,
"learning_rate": 2.7057503742533753e-06,
"loss": 0.7392,
"step": 630
},
{
"epoch": 0.28699551569506726,
"grad_norm": 1.6247897070260917,
"learning_rate": 2.691636191699562e-06,
"loss": 0.758,
"step": 640
},
{
"epoch": 0.2914798206278027,
"grad_norm": 1.42356323236888,
"learning_rate": 2.6772300380783013e-06,
"loss": 0.7626,
"step": 650
},
{
"epoch": 0.29596412556053814,
"grad_norm": 1.4955853270730488,
"learning_rate": 2.662535443141443e-06,
"loss": 0.7355,
"step": 660
},
{
"epoch": 0.3004484304932735,
"grad_norm": 1.4879073313151545,
"learning_rate": 2.647556007313847e-06,
"loss": 0.7545,
"step": 670
},
{
"epoch": 0.30493273542600896,
"grad_norm": 1.4153755477305148,
"learning_rate": 2.6322954008112213e-06,
"loss": 0.7378,
"step": 680
},
{
"epoch": 0.3094170403587444,
"grad_norm": 1.4019993036978922,
"learning_rate": 2.616757362740855e-06,
"loss": 0.7387,
"step": 690
},
{
"epoch": 0.31390134529147984,
"grad_norm": 1.5335241758091316,
"learning_rate": 2.600945700185474e-06,
"loss": 0.7694,
"step": 700
},
{
"epoch": 0.31390134529147984,
"eval_loss": 0.7457958459854126,
"eval_runtime": 408.7761,
"eval_samples_per_second": 122.505,
"eval_steps_per_second": 1.915,
"step": 700
},
{
"epoch": 0.3183856502242152,
"grad_norm": 1.47263429505246,
"learning_rate": 2.5848642872704417e-06,
"loss": 0.7246,
"step": 710
},
{
"epoch": 0.32286995515695066,
"grad_norm": 1.5062835613914285,
"learning_rate": 2.5685170642145337e-06,
"loss": 0.7338,
"step": 720
},
{
"epoch": 0.3273542600896861,
"grad_norm": 1.6182138547104117,
"learning_rate": 2.5519080363645134e-06,
"loss": 0.73,
"step": 730
},
{
"epoch": 0.33183856502242154,
"grad_norm": 1.3515300425343295,
"learning_rate": 2.53504127321376e-06,
"loss": 0.7299,
"step": 740
},
{
"epoch": 0.336322869955157,
"grad_norm": 1.5798782493243635,
"learning_rate": 2.517920907405168e-06,
"loss": 0.7293,
"step": 750
},
{
"epoch": 0.34080717488789236,
"grad_norm": 1.4549259580353344,
"learning_rate": 2.5005511337185824e-06,
"loss": 0.7621,
"step": 760
},
{
"epoch": 0.3452914798206278,
"grad_norm": 1.456599605633329,
"learning_rate": 2.4829362080430077e-06,
"loss": 0.7438,
"step": 770
},
{
"epoch": 0.34977578475336324,
"grad_norm": 1.4128813340833153,
"learning_rate": 2.4650804463338406e-06,
"loss": 0.7413,
"step": 780
},
{
"epoch": 0.3542600896860987,
"grad_norm": 1.5613737124434628,
"learning_rate": 2.4469882235553887e-06,
"loss": 0.7477,
"step": 790
},
{
"epoch": 0.35874439461883406,
"grad_norm": 1.6383373422678345,
"learning_rate": 2.4286639726089293e-06,
"loss": 0.713,
"step": 800
},
{
"epoch": 0.35874439461883406,
"eval_loss": 0.7421520352363586,
"eval_runtime": 408.0589,
"eval_samples_per_second": 122.72,
"eval_steps_per_second": 1.919,
"step": 800
},
{
"epoch": 0.3632286995515695,
"grad_norm": 1.3492102003393152,
"learning_rate": 2.4101121832465754e-06,
"loss": 0.7185,
"step": 810
},
{
"epoch": 0.36771300448430494,
"grad_norm": 1.4117655797526263,
"learning_rate": 2.3913374009712084e-06,
"loss": 0.7379,
"step": 820
},
{
"epoch": 0.3721973094170404,
"grad_norm": 1.5281693242796246,
"learning_rate": 2.3723442259227547e-06,
"loss": 0.7406,
"step": 830
},
{
"epoch": 0.37668161434977576,
"grad_norm": 1.6990323130848894,
"learning_rate": 2.3531373117510695e-06,
"loss": 0.7388,
"step": 840
},
{
"epoch": 0.3811659192825112,
"grad_norm": 1.476162200960684,
"learning_rate": 2.33372136447572e-06,
"loss": 0.7434,
"step": 850
},
{
"epoch": 0.38565022421524664,
"grad_norm": 1.3930484173784414,
"learning_rate": 2.3141011413329244e-06,
"loss": 0.7372,
"step": 860
},
{
"epoch": 0.3901345291479821,
"grad_norm": 1.4071716332679987,
"learning_rate": 2.2942814496099532e-06,
"loss": 0.7531,
"step": 870
},
{
"epoch": 0.39461883408071746,
"grad_norm": 1.5479232446038012,
"learning_rate": 2.274267145467259e-06,
"loss": 0.7216,
"step": 880
},
{
"epoch": 0.3991031390134529,
"grad_norm": 1.4255077423798548,
"learning_rate": 2.254063132748637e-06,
"loss": 0.7343,
"step": 890
},
{
"epoch": 0.40358744394618834,
"grad_norm": 1.57276996130409,
"learning_rate": 2.2336743617797006e-06,
"loss": 0.7347,
"step": 900
},
{
"epoch": 0.40358744394618834,
"eval_loss": 0.7386789321899414,
"eval_runtime": 408.1839,
"eval_samples_per_second": 122.682,
"eval_steps_per_second": 1.918,
"step": 900
},
{
"epoch": 0.4080717488789238,
"grad_norm": 1.4568107529063017,
"learning_rate": 2.213105828154964e-06,
"loss": 0.7266,
"step": 910
},
{
"epoch": 0.4125560538116592,
"grad_norm": 1.374198091231606,
"learning_rate": 2.192362571513841e-06,
"loss": 0.7465,
"step": 920
},
{
"epoch": 0.4170403587443946,
"grad_norm": 1.3925457206301284,
"learning_rate": 2.171449674305846e-06,
"loss": 0.7427,
"step": 930
},
{
"epoch": 0.42152466367713004,
"grad_norm": 1.4443502855856463,
"learning_rate": 2.1503722605453083e-06,
"loss": 0.7428,
"step": 940
},
{
"epoch": 0.4260089686098655,
"grad_norm": 1.5268146365443709,
"learning_rate": 2.1291354945559004e-06,
"loss": 0.7163,
"step": 950
},
{
"epoch": 0.4304932735426009,
"grad_norm": 1.5000325455240473,
"learning_rate": 2.1077445797052945e-06,
"loss": 0.7472,
"step": 960
},
{
"epoch": 0.4349775784753363,
"grad_norm": 1.4869091852092478,
"learning_rate": 2.086204757130243e-06,
"loss": 0.7427,
"step": 970
},
{
"epoch": 0.43946188340807174,
"grad_norm": 1.4430282256544564,
"learning_rate": 2.0645213044524194e-06,
"loss": 0.7174,
"step": 980
},
{
"epoch": 0.4439461883408072,
"grad_norm": 1.4822025498870304,
"learning_rate": 2.0426995344853043e-06,
"loss": 0.7538,
"step": 990
},
{
"epoch": 0.4484304932735426,
"grad_norm": 1.5186234240452396,
"learning_rate": 2.0207447939324598e-06,
"loss": 0.7243,
"step": 1000
},
{
"epoch": 0.4484304932735426,
"eval_loss": 0.7356163859367371,
"eval_runtime": 407.0139,
"eval_samples_per_second": 123.035,
"eval_steps_per_second": 1.924,
"step": 1000
},
{
"epoch": 0.452914798206278,
"grad_norm": 1.5742685454152958,
"learning_rate": 1.998662462077496e-06,
"loss": 0.7475,
"step": 1010
},
{
"epoch": 0.45739910313901344,
"grad_norm": 1.3834168469611057,
"learning_rate": 1.976457949466054e-06,
"loss": 0.7568,
"step": 1020
},
{
"epoch": 0.4618834080717489,
"grad_norm": 1.4947961999330186,
"learning_rate": 1.954136696580132e-06,
"loss": 0.7464,
"step": 1030
},
{
"epoch": 0.4663677130044843,
"grad_norm": 1.4284253764088304,
"learning_rate": 1.9317041725050747e-06,
"loss": 0.7456,
"step": 1040
},
{
"epoch": 0.47085201793721976,
"grad_norm": 1.4247354157320633,
"learning_rate": 1.909165873589554e-06,
"loss": 0.7008,
"step": 1050
},
{
"epoch": 0.47533632286995514,
"grad_norm": 1.4525308368306575,
"learning_rate": 1.886527322098871e-06,
"loss": 0.7121,
"step": 1060
},
{
"epoch": 0.4798206278026906,
"grad_norm": 1.43738036112722,
"learning_rate": 1.8637940648619065e-06,
"loss": 0.7308,
"step": 1070
},
{
"epoch": 0.484304932735426,
"grad_norm": 1.402086349899742,
"learning_rate": 1.8409716719120561e-06,
"loss": 0.7164,
"step": 1080
},
{
"epoch": 0.48878923766816146,
"grad_norm": 1.5227358428935063,
"learning_rate": 1.8180657351224739e-06,
"loss": 0.732,
"step": 1090
},
{
"epoch": 0.49327354260089684,
"grad_norm": 1.5813743714389112,
"learning_rate": 1.7950818668359733e-06,
"loss": 0.7161,
"step": 1100
},
{
"epoch": 0.49327354260089684,
"eval_loss": 0.7330535054206848,
"eval_runtime": 408.4081,
"eval_samples_per_second": 122.615,
"eval_steps_per_second": 1.917,
"step": 1100
},
{
"epoch": 0.4977578475336323,
"grad_norm": 1.4881819590713468,
"learning_rate": 1.772025698489903e-06,
"loss": 0.7144,
"step": 1110
},
{
"epoch": 0.5022421524663677,
"grad_norm": 1.4750319990458514,
"learning_rate": 1.7489028792363549e-06,
"loss": 0.7365,
"step": 1120
},
{
"epoch": 0.5067264573991032,
"grad_norm": 1.4443590686278198,
"learning_rate": 1.7257190745580209e-06,
"loss": 0.7487,
"step": 1130
},
{
"epoch": 0.5112107623318386,
"grad_norm": 1.4695293763109774,
"learning_rate": 1.7024799648800555e-06,
"loss": 0.7233,
"step": 1140
},
{
"epoch": 0.515695067264574,
"grad_norm": 1.4328944860273993,
"learning_rate": 1.679191244178278e-06,
"loss": 0.7322,
"step": 1150
},
{
"epoch": 0.5201793721973094,
"grad_norm": 1.4157130638413895,
"learning_rate": 1.6558586185840473e-06,
"loss": 0.728,
"step": 1160
},
{
"epoch": 0.5246636771300448,
"grad_norm": 1.4117533616122613,
"learning_rate": 1.6324878049861656e-06,
"loss": 0.7331,
"step": 1170
},
{
"epoch": 0.5291479820627802,
"grad_norm": 1.4255877674393056,
"learning_rate": 1.609084529630145e-06,
"loss": 0.7491,
"step": 1180
},
{
"epoch": 0.5336322869955157,
"grad_norm": 1.4486300200418207,
"learning_rate": 1.5856545267151759e-06,
"loss": 0.7261,
"step": 1190
},
{
"epoch": 0.5381165919282511,
"grad_norm": 1.4628618883782867,
"learning_rate": 1.5622035369891561e-06,
"loss": 0.7247,
"step": 1200
},
{
"epoch": 0.5381165919282511,
"eval_loss": 0.7308038473129272,
"eval_runtime": 406.6873,
"eval_samples_per_second": 123.134,
"eval_steps_per_second": 1.925,
"step": 1200
},
{
"epoch": 0.5426008968609866,
"grad_norm": 1.4112256357672157,
"learning_rate": 1.5387373063421062e-06,
"loss": 0.7307,
"step": 1210
},
{
"epoch": 0.547085201793722,
"grad_norm": 1.3994109954542429,
"learning_rate": 1.515261584398333e-06,
"loss": 0.7062,
"step": 1220
},
{
"epoch": 0.5515695067264574,
"grad_norm": 1.5279436893984248,
"learning_rate": 1.491782123107669e-06,
"loss": 0.7314,
"step": 1230
},
{
"epoch": 0.5560538116591929,
"grad_norm": 1.4092281762272858,
"learning_rate": 1.4683046753361521e-06,
"loss": 0.7044,
"step": 1240
},
{
"epoch": 0.5605381165919282,
"grad_norm": 1.4363381867810665,
"learning_rate": 1.4448349934564736e-06,
"loss": 0.7287,
"step": 1250
},
{
"epoch": 0.5650224215246636,
"grad_norm": 1.4913351223697051,
"learning_rate": 1.421378827938549e-06,
"loss": 0.7254,
"step": 1260
},
{
"epoch": 0.5695067264573991,
"grad_norm": 1.5096384680619075,
"learning_rate": 1.3979419259405563e-06,
"loss": 0.7389,
"step": 1270
},
{
"epoch": 0.5739910313901345,
"grad_norm": 1.3495144573299676,
"learning_rate": 1.3745300299007856e-06,
"loss": 0.7247,
"step": 1280
},
{
"epoch": 0.57847533632287,
"grad_norm": 1.3641879848291365,
"learning_rate": 1.3511488761306412e-06,
"loss": 0.7312,
"step": 1290
},
{
"epoch": 0.5829596412556054,
"grad_norm": 1.3879105033157129,
"learning_rate": 1.3278041934091524e-06,
"loss": 0.7477,
"step": 1300
},
{
"epoch": 0.5829596412556054,
"eval_loss": 0.7287724018096924,
"eval_runtime": 406.882,
"eval_samples_per_second": 123.075,
"eval_steps_per_second": 1.924,
"step": 1300
},
{
"epoch": 0.5874439461883408,
"grad_norm": 1.3916697284582622,
"learning_rate": 1.3045017015793217e-06,
"loss": 0.7246,
"step": 1310
},
{
"epoch": 0.5919282511210763,
"grad_norm": 1.4328511876779917,
"learning_rate": 1.2812471101466687e-06,
"loss": 0.7303,
"step": 1320
},
{
"epoch": 0.5964125560538116,
"grad_norm": 1.4411092846252307,
"learning_rate": 1.2580461168803038e-06,
"loss": 0.7318,
"step": 1330
},
{
"epoch": 0.600896860986547,
"grad_norm": 1.4703965551927338,
"learning_rate": 1.2349044064168782e-06,
"loss": 0.7375,
"step": 1340
},
{
"epoch": 0.6053811659192825,
"grad_norm": 1.4319057117061509,
"learning_rate": 1.21182764886775e-06,
"loss": 0.7302,
"step": 1350
},
{
"epoch": 0.6098654708520179,
"grad_norm": 1.5017976848926429,
"learning_rate": 1.188821498429714e-06,
"loss": 0.7262,
"step": 1360
},
{
"epoch": 0.6143497757847534,
"grad_norm": 1.4553869576056546,
"learning_rate": 1.165891591999626e-06,
"loss": 0.7447,
"step": 1370
},
{
"epoch": 0.6188340807174888,
"grad_norm": 1.4128744043127173,
"learning_rate": 1.1430435477932646e-06,
"loss": 0.7423,
"step": 1380
},
{
"epoch": 0.6233183856502242,
"grad_norm": 1.3797159286061107,
"learning_rate": 1.1202829639687785e-06,
"loss": 0.744,
"step": 1390
},
{
"epoch": 0.6278026905829597,
"grad_norm": 1.487304571595245,
"learning_rate": 1.0976154172550408e-06,
"loss": 0.7429,
"step": 1400
},
{
"epoch": 0.6278026905829597,
"eval_loss": 0.7272571921348572,
"eval_runtime": 406.7541,
"eval_samples_per_second": 123.114,
"eval_steps_per_second": 1.925,
"step": 1400
},
{
"epoch": 0.6322869955156951,
"grad_norm": 1.544512062570189,
"learning_rate": 1.0750464615852523e-06,
"loss": 0.7251,
"step": 1410
},
{
"epoch": 0.6367713004484304,
"grad_norm": 1.422563130817404,
"learning_rate": 1.0525816267361398e-06,
"loss": 0.712,
"step": 1420
},
{
"epoch": 0.6412556053811659,
"grad_norm": 1.4937681764382644,
"learning_rate": 1.0302264169730613e-06,
"loss": 0.7203,
"step": 1430
},
{
"epoch": 0.6457399103139013,
"grad_norm": 1.50738757049434,
"learning_rate": 1.0079863097013722e-06,
"loss": 0.7121,
"step": 1440
},
{
"epoch": 0.6502242152466368,
"grad_norm": 1.286396172710849,
"learning_rate": 9.85866754124367e-07,
"loss": 0.7193,
"step": 1450
},
{
"epoch": 0.6547085201793722,
"grad_norm": 1.4997539342741677,
"learning_rate": 9.638731699081281e-07,
"loss": 0.7288,
"step": 1460
},
{
"epoch": 0.6591928251121076,
"grad_norm": 1.37434247409356,
"learning_rate": 9.42010945853623e-07,
"loss": 0.7597,
"step": 1470
},
{
"epoch": 0.6636771300448431,
"grad_norm": 1.3869436283100607,
"learning_rate": 9.202854385763502e-07,
"loss": 0.7184,
"step": 1480
},
{
"epoch": 0.6681614349775785,
"grad_norm": 1.3970067087387381,
"learning_rate": 8.987019711938812e-07,
"loss": 0.7326,
"step": 1490
},
{
"epoch": 0.672645739910314,
"grad_norm": 1.553183464191494,
"learning_rate": 8.772658320216047e-07,
"loss": 0.7317,
"step": 1500
},
{
"epoch": 0.672645739910314,
"eval_loss": 0.7256098389625549,
"eval_runtime": 406.6132,
"eval_samples_per_second": 123.156,
"eval_steps_per_second": 1.926,
"step": 1500
},
{
"epoch": 0.6771300448430493,
"grad_norm": 1.3357768297094936,
"learning_rate": 8.55982273277002e-07,
"loss": 0.7347,
"step": 1510
},
{
"epoch": 0.6816143497757847,
"grad_norm": 1.3249788097985131,
"learning_rate": 8.348565097927605e-07,
"loss": 0.7496,
"step": 1520
},
{
"epoch": 0.6860986547085202,
"grad_norm": 1.4578138220875878,
"learning_rate": 8.13893717739056e-07,
"loss": 0.7308,
"step": 1530
},
{
"epoch": 0.6905829596412556,
"grad_norm": 1.3268077719441809,
"learning_rate": 7.930990333553013e-07,
"loss": 0.7094,
"step": 1540
},
{
"epoch": 0.695067264573991,
"grad_norm": 1.47562182506043,
"learning_rate": 7.72477551691678e-07,
"loss": 0.697,
"step": 1550
},
{
"epoch": 0.6995515695067265,
"grad_norm": 1.4850843190566259,
"learning_rate": 7.520343253607677e-07,
"loss": 0.7301,
"step": 1560
},
{
"epoch": 0.7040358744394619,
"grad_norm": 1.5097763618083517,
"learning_rate": 7.317743632995731e-07,
"loss": 0.7217,
"step": 1570
},
{
"epoch": 0.7085201793721974,
"grad_norm": 1.3914348509226637,
"learning_rate": 7.117026295422425e-07,
"loss": 0.6957,
"step": 1580
},
{
"epoch": 0.7130044843049327,
"grad_norm": 1.5175208261545492,
"learning_rate": 6.918240420038007e-07,
"loss": 0.7317,
"step": 1590
},
{
"epoch": 0.7174887892376681,
"grad_norm": 1.4947559578839034,
"learning_rate": 6.721434712751745e-07,
"loss": 0.7226,
"step": 1600
},
{
"epoch": 0.7174887892376681,
"eval_loss": 0.7243176102638245,
"eval_runtime": 406.7899,
"eval_samples_per_second": 123.103,
"eval_steps_per_second": 1.925,
"step": 1600
},
{
"epoch": 0.7219730941704036,
"grad_norm": 1.5192098207309965,
"learning_rate": 6.526657394298154e-07,
"loss": 0.705,
"step": 1610
},
{
"epoch": 0.726457399103139,
"grad_norm": 1.3665027387136646,
"learning_rate": 6.333956188422088e-07,
"loss": 0.706,
"step": 1620
},
{
"epoch": 0.7309417040358744,
"grad_norm": 1.4974912840899435,
"learning_rate": 6.143378310185643e-07,
"loss": 0.6983,
"step": 1630
},
{
"epoch": 0.7354260089686099,
"grad_norm": 1.5477574584643699,
"learning_rate": 5.954970454399638e-07,
"loss": 0.7252,
"step": 1640
},
{
"epoch": 0.7399103139013453,
"grad_norm": 1.525090065151942,
"learning_rate": 5.768778784182616e-07,
"loss": 0.7087,
"step": 1650
},
{
"epoch": 0.7443946188340808,
"grad_norm": 1.4837554579437873,
"learning_rate": 5.584848919650069e-07,
"loss": 0.7075,
"step": 1660
},
{
"epoch": 0.7488789237668162,
"grad_norm": 1.3538329119260115,
"learning_rate": 5.403225926736772e-07,
"loss": 0.7057,
"step": 1670
},
{
"epoch": 0.7533632286995515,
"grad_norm": 1.359895087573495,
"learning_rate": 5.223954306154843e-07,
"loss": 0.7306,
"step": 1680
},
{
"epoch": 0.757847533632287,
"grad_norm": 1.4168148218595764,
"learning_rate": 5.047077982490311e-07,
"loss": 0.7424,
"step": 1690
},
{
"epoch": 0.7623318385650224,
"grad_norm": 1.4815842671642683,
"learning_rate": 4.872640293440861e-07,
"loss": 0.695,
"step": 1700
},
{
"epoch": 0.7623318385650224,
"eval_loss": 0.7233718633651733,
"eval_runtime": 406.8015,
"eval_samples_per_second": 123.099,
"eval_steps_per_second": 1.925,
"step": 1700
},
{
"epoch": 0.7668161434977578,
"grad_norm": 1.5501655544071418,
"learning_rate": 4.7006839791973673e-07,
"loss": 0.7327,
"step": 1710
},
{
"epoch": 0.7713004484304933,
"grad_norm": 1.3834984705411,
"learning_rate": 4.53125117197179e-07,
"loss": 0.7245,
"step": 1720
},
{
"epoch": 0.7757847533632287,
"grad_norm": 1.4041748328697374,
"learning_rate": 4.364383385674112e-07,
"loss": 0.7054,
"step": 1730
},
{
"epoch": 0.7802690582959642,
"grad_norm": 1.443104622604103,
"learning_rate": 4.2001215057407026e-07,
"loss": 0.7037,
"step": 1740
},
{
"epoch": 0.7847533632286996,
"grad_norm": 1.5632699202433824,
"learning_rate": 4.038505779116687e-07,
"loss": 0.705,
"step": 1750
},
{
"epoch": 0.7892376681614349,
"grad_norm": 1.349615732583278,
"learning_rate": 3.879575804394782e-07,
"loss": 0.7071,
"step": 1760
},
{
"epoch": 0.7937219730941704,
"grad_norm": 1.3657530768128234,
"learning_rate": 3.7233705221129646e-07,
"loss": 0.7273,
"step": 1770
},
{
"epoch": 0.7982062780269058,
"grad_norm": 1.5107387856649341,
"learning_rate": 3.569928205213354e-07,
"loss": 0.6975,
"step": 1780
},
{
"epoch": 0.8026905829596412,
"grad_norm": 1.4525568524987686,
"learning_rate": 3.419286449664741e-07,
"loss": 0.7095,
"step": 1790
},
{
"epoch": 0.8071748878923767,
"grad_norm": 1.4847854049722584,
"learning_rate": 3.2714821652508854e-07,
"loss": 0.7167,
"step": 1800
},
{
"epoch": 0.8071748878923767,
"eval_loss": 0.7225807309150696,
"eval_runtime": 406.5326,
"eval_samples_per_second": 123.181,
"eval_steps_per_second": 1.926,
"step": 1800
},
{
"epoch": 0.8116591928251121,
"grad_norm": 1.2447161837361285,
"learning_rate": 3.126551566527036e-07,
"loss": 0.7156,
"step": 1810
},
{
"epoch": 0.8161434977578476,
"grad_norm": 1.4139333132454484,
"learning_rate": 2.9845301639467284e-07,
"loss": 0.7537,
"step": 1820
},
{
"epoch": 0.820627802690583,
"grad_norm": 1.3663031642715642,
"learning_rate": 2.8454527551611205e-07,
"loss": 0.7238,
"step": 1830
},
{
"epoch": 0.8251121076233184,
"grad_norm": 1.389263976301968,
"learning_rate": 2.7093534164929904e-07,
"loss": 0.738,
"step": 1840
},
{
"epoch": 0.8295964125560538,
"grad_norm": 1.5068808968575202,
"learning_rate": 2.576265494587458e-07,
"loss": 0.7067,
"step": 1850
},
{
"epoch": 0.8340807174887892,
"grad_norm": 1.4226178531466935,
"learning_rate": 2.446221598241472e-07,
"loss": 0.7143,
"step": 1860
},
{
"epoch": 0.8385650224215246,
"grad_norm": 1.6881847148932905,
"learning_rate": 2.319253590414132e-07,
"loss": 0.7376,
"step": 1870
},
{
"epoch": 0.8430493273542601,
"grad_norm": 1.4353283330892004,
"learning_rate": 2.1953925804197056e-07,
"loss": 0.7095,
"step": 1880
},
{
"epoch": 0.8475336322869955,
"grad_norm": 1.4639605071750654,
"learning_rate": 2.0746689163053113e-07,
"loss": 0.7102,
"step": 1890
},
{
"epoch": 0.852017937219731,
"grad_norm": 1.458703799588621,
"learning_rate": 1.9571121774151545e-07,
"loss": 0.686,
"step": 1900
},
{
"epoch": 0.852017937219731,
"eval_loss": 0.7220604419708252,
"eval_runtime": 406.5609,
"eval_samples_per_second": 123.172,
"eval_steps_per_second": 1.926,
"step": 1900
},
{
"epoch": 0.8565022421524664,
"grad_norm": 1.470148783910905,
"learning_rate": 1.8427511671430757e-07,
"loss": 0.72,
"step": 1910
},
{
"epoch": 0.8609865470852018,
"grad_norm": 1.3891242748262451,
"learning_rate": 1.7316139058752194e-07,
"loss": 0.7318,
"step": 1920
},
{
"epoch": 0.8654708520179372,
"grad_norm": 1.2245069775705093,
"learning_rate": 1.6237276241245867e-07,
"loss": 0.7155,
"step": 1930
},
{
"epoch": 0.8699551569506726,
"grad_norm": 1.360510189488915,
"learning_rate": 1.519118755859084e-07,
"loss": 0.7255,
"step": 1940
},
{
"epoch": 0.874439461883408,
"grad_norm": 1.495119615923585,
"learning_rate": 1.4178129320247486e-07,
"loss": 0.7484,
"step": 1950
},
{
"epoch": 0.8789237668161435,
"grad_norm": 1.3674856635367474,
"learning_rate": 1.31983497426575e-07,
"loss": 0.7366,
"step": 1960
},
{
"epoch": 0.8834080717488789,
"grad_norm": 1.4494730150421093,
"learning_rate": 1.2252088888426431e-07,
"loss": 0.742,
"step": 1970
},
{
"epoch": 0.8878923766816144,
"grad_norm": 1.4368197978682802,
"learning_rate": 1.1339578607504536e-07,
"loss": 0.7269,
"step": 1980
},
{
"epoch": 0.8923766816143498,
"grad_norm": 1.4017197990051706,
"learning_rate": 1.0461042480379402e-07,
"loss": 0.7234,
"step": 1990
},
{
"epoch": 0.8968609865470852,
"grad_norm": 1.426560347266084,
"learning_rate": 9.616695763295007e-08,
"loss": 0.7214,
"step": 2000
},
{
"epoch": 0.8968609865470852,
"eval_loss": 0.721759557723999,
"eval_runtime": 406.5838,
"eval_samples_per_second": 123.165,
"eval_steps_per_second": 1.926,
"step": 2000
},
{
"epoch": 0.9013452914798207,
"grad_norm": 1.489947255967281,
"learning_rate": 8.806745335510297e-08,
"loss": 0.7341,
"step": 2010
},
{
"epoch": 0.905829596412556,
"grad_norm": 1.4312716003053576,
"learning_rate": 8.031389648610266e-08,
"loss": 0.7264,
"step": 2020
},
{
"epoch": 0.9103139013452914,
"grad_norm": 1.4764400641380824,
"learning_rate": 7.290818677881966e-08,
"loss": 0.7301,
"step": 2030
},
{
"epoch": 0.9147982062780269,
"grad_norm": 1.4381108917682341,
"learning_rate": 6.585213875767305e-08,
"loss": 0.6997,
"step": 2040
},
{
"epoch": 0.9192825112107623,
"grad_norm": 1.459723127188453,
"learning_rate": 5.914748127404102e-08,
"loss": 0.7168,
"step": 2050
},
{
"epoch": 0.9237668161434978,
"grad_norm": 1.5776619173541433,
"learning_rate": 5.2795857082663655e-08,
"loss": 0.72,
"step": 2060
},
{
"epoch": 0.9282511210762332,
"grad_norm": 1.438610611700907,
"learning_rate": 4.6798822439140185e-08,
"loss": 0.7035,
"step": 2070
},
{
"epoch": 0.9327354260089686,
"grad_norm": 1.4350411032390504,
"learning_rate": 4.115784671861916e-08,
"loss": 0.735,
"step": 2080
},
{
"epoch": 0.9372197309417041,
"grad_norm": 1.4822578142933729,
"learning_rate": 3.587431205577713e-08,
"loss": 0.7178,
"step": 2090
},
{
"epoch": 0.9417040358744395,
"grad_norm": 1.5001233187138816,
"learning_rate": 3.0949513006172325e-08,
"loss": 0.7358,
"step": 2100
},
{
"epoch": 0.9417040358744395,
"eval_loss": 0.7216091752052307,
"eval_runtime": 406.6258,
"eval_samples_per_second": 123.153,
"eval_steps_per_second": 1.926,
"step": 2100
},
{
"epoch": 0.9461883408071748,
"grad_norm": 1.4457564058059627,
"learning_rate": 2.6384656229056946e-08,
"loss": 0.7285,
"step": 2110
},
{
"epoch": 0.9506726457399103,
"grad_norm": 1.6789172768348999,
"learning_rate": 2.218086019172394e-08,
"loss": 0.7027,
"step": 2120
},
{
"epoch": 0.9551569506726457,
"grad_norm": 1.4039832008414181,
"learning_rate": 1.8339154895464894e-08,
"loss": 0.7285,
"step": 2130
},
{
"epoch": 0.9596412556053812,
"grad_norm": 1.7674026844330886,
"learning_rate": 1.4860481623201417e-08,
"loss": 0.713,
"step": 2140
},
{
"epoch": 0.9641255605381166,
"grad_norm": 1.531580121339593,
"learning_rate": 1.1745692708855282e-08,
"loss": 0.7328,
"step": 2150
},
{
"epoch": 0.968609865470852,
"grad_norm": 1.455884868550825,
"learning_rate": 8.99555132851232e-09,
"loss": 0.7196,
"step": 2160
},
{
"epoch": 0.9730941704035875,
"grad_norm": 1.3157536936429735,
"learning_rate": 6.610731313430318e-09,
"loss": 0.7277,
"step": 2170
},
{
"epoch": 0.9775784753363229,
"grad_norm": 1.5586404477319191,
"learning_rate": 4.5918169849406e-09,
"loss": 0.7265,
"step": 2180
},
{
"epoch": 0.9820627802690582,
"grad_norm": 1.3596393082767964,
"learning_rate": 2.939303011277872e-09,
"loss": 0.719,
"step": 2190
},
{
"epoch": 0.9865470852017937,
"grad_norm": 1.3866642718972106,
"learning_rate": 1.6535942863788456e-09,
"loss": 0.7259,
"step": 2200
},
{
"epoch": 0.9865470852017937,
"eval_loss": 0.7215752005577087,
"eval_runtime": 408.9437,
"eval_samples_per_second": 122.455,
"eval_steps_per_second": 1.915,
"step": 2200
},
{
"epoch": 0.9910313901345291,
"grad_norm": 1.6643780128489514,
"learning_rate": 7.350058306764273e-10,
"loss": 0.7044,
"step": 2210
},
{
"epoch": 0.9955156950672646,
"grad_norm": 1.428221428067804,
"learning_rate": 1.8376271391412624e-10,
"loss": 0.7109,
"step": 2220
},
{
"epoch": 1.0,
"grad_norm": 1.3882910125414851,
"learning_rate": 0.0,
"loss": 0.7123,
"step": 2230
},
{
"epoch": 1.0,
"step": 2230,
"total_flos": 250303561007104.0,
"train_loss": 0.7492096503219262,
"train_runtime": 18007.2993,
"train_samples_per_second": 15.851,
"train_steps_per_second": 0.124
}
],
"logging_steps": 10,
"max_steps": 2230,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 250303561007104.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}