|
{ |
|
"best_metric": 2.931408643722534, |
|
"best_model_checkpoint": "/content/drive/My Drive/Hugh Mann/Llama3.2-1B-SMS-All/checkpoint-1000", |
|
"epoch": 0.8733307403085699, |
|
"eval_steps": 50, |
|
"global_step": 1684, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005186049526772981, |
|
"grad_norm": 4.40848970413208, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 5.4447, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.010372099053545962, |
|
"grad_norm": 3.9115631580352783, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 5.2681, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.015558148580318941, |
|
"grad_norm": 4.100546836853027, |
|
"learning_rate": 1.5e-06, |
|
"loss": 5.3836, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.020744198107091924, |
|
"grad_norm": 3.005075693130493, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 5.2503, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.025930247633864905, |
|
"grad_norm": 4.4445929527282715, |
|
"learning_rate": 2.5e-06, |
|
"loss": 5.1054, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.025930247633864905, |
|
"eval_loss": 5.573875904083252, |
|
"eval_runtime": 86.7128, |
|
"eval_samples_per_second": 79.066, |
|
"eval_steps_per_second": 9.883, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.031116297160637883, |
|
"grad_norm": 4.169810771942139, |
|
"learning_rate": 3e-06, |
|
"loss": 5.1836, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.036302346687410864, |
|
"grad_norm": 3.448535442352295, |
|
"learning_rate": 3.5e-06, |
|
"loss": 5.206, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04148839621418385, |
|
"grad_norm": 4.423728942871094, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 5.1756, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.046674445740956826, |
|
"grad_norm": 2.7958264350891113, |
|
"learning_rate": 4.5e-06, |
|
"loss": 5.0787, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05186049526772981, |
|
"grad_norm": 3.927069902420044, |
|
"learning_rate": 5e-06, |
|
"loss": 5.2152, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05186049526772981, |
|
"eval_loss": 5.308503150939941, |
|
"eval_runtime": 86.6743, |
|
"eval_samples_per_second": 79.101, |
|
"eval_steps_per_second": 9.888, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05704654479450279, |
|
"grad_norm": 3.202113151550293, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 5.0395, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.062232594321275765, |
|
"grad_norm": 3.387860059738159, |
|
"learning_rate": 6e-06, |
|
"loss": 5.0002, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06741864384804874, |
|
"grad_norm": 3.1187663078308105, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 4.8854, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07260469337482173, |
|
"grad_norm": 2.3404035568237305, |
|
"learning_rate": 7e-06, |
|
"loss": 4.7552, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07779074290159471, |
|
"grad_norm": 1.9955620765686035, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 4.6971, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07779074290159471, |
|
"eval_loss": 4.777082443237305, |
|
"eval_runtime": 86.9581, |
|
"eval_samples_per_second": 78.843, |
|
"eval_steps_per_second": 9.855, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0829767924283677, |
|
"grad_norm": 2.9313905239105225, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 4.5428, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08816284195514067, |
|
"grad_norm": 2.6419179439544678, |
|
"learning_rate": 8.5e-06, |
|
"loss": 4.4158, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09334889148191365, |
|
"grad_norm": 3.8043394088745117, |
|
"learning_rate": 9e-06, |
|
"loss": 4.1911, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09853494100868664, |
|
"grad_norm": 4.278996467590332, |
|
"learning_rate": 9.5e-06, |
|
"loss": 4.1579, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.10372099053545962, |
|
"grad_norm": 5.0697526931762695, |
|
"learning_rate": 1e-05, |
|
"loss": 3.7557, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10372099053545962, |
|
"eval_loss": 3.8124358654022217, |
|
"eval_runtime": 86.9124, |
|
"eval_samples_per_second": 78.884, |
|
"eval_steps_per_second": 9.861, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10890704006223259, |
|
"grad_norm": 2.8836448192596436, |
|
"learning_rate": 9.942129629629629e-06, |
|
"loss": 3.628, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11409308958900558, |
|
"grad_norm": 2.7785139083862305, |
|
"learning_rate": 9.88425925925926e-06, |
|
"loss": 3.523, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.11927913911577856, |
|
"grad_norm": 2.3677213191986084, |
|
"learning_rate": 9.826388888888889e-06, |
|
"loss": 3.3988, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.12446518864255153, |
|
"grad_norm": 1.9909400939941406, |
|
"learning_rate": 9.768518518518519e-06, |
|
"loss": 3.1866, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.12965123816932453, |
|
"grad_norm": 2.019188404083252, |
|
"learning_rate": 9.710648148148149e-06, |
|
"loss": 3.119, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12965123816932453, |
|
"eval_loss": 3.190325975418091, |
|
"eval_runtime": 86.8985, |
|
"eval_samples_per_second": 78.897, |
|
"eval_steps_per_second": 9.862, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.13483728769609749, |
|
"grad_norm": 1.8608777523040771, |
|
"learning_rate": 9.652777777777779e-06, |
|
"loss": 3.0325, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.14002333722287047, |
|
"grad_norm": 1.8644330501556396, |
|
"learning_rate": 9.594907407407407e-06, |
|
"loss": 3.0687, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.14520938674964345, |
|
"grad_norm": 1.6900441646575928, |
|
"learning_rate": 9.537037037037037e-06, |
|
"loss": 2.96, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.15039543627641644, |
|
"grad_norm": 1.8331371545791626, |
|
"learning_rate": 9.479166666666667e-06, |
|
"loss": 2.8971, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.15558148580318942, |
|
"grad_norm": 1.2244369983673096, |
|
"learning_rate": 9.421296296296297e-06, |
|
"loss": 2.941, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15558148580318942, |
|
"eval_loss": 3.0352883338928223, |
|
"eval_runtime": 86.8833, |
|
"eval_samples_per_second": 78.91, |
|
"eval_steps_per_second": 9.864, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1607675353299624, |
|
"grad_norm": 1.599660873413086, |
|
"learning_rate": 9.363425925925927e-06, |
|
"loss": 2.9286, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1659535848567354, |
|
"grad_norm": 1.4191596508026123, |
|
"learning_rate": 9.305555555555557e-06, |
|
"loss": 3.0023, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.17113963438350835, |
|
"grad_norm": 1.3025909662246704, |
|
"learning_rate": 9.247685185185185e-06, |
|
"loss": 2.9386, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.17632568391028133, |
|
"grad_norm": 1.919588327407837, |
|
"learning_rate": 9.189814814814815e-06, |
|
"loss": 2.8861, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.18151173343705432, |
|
"grad_norm": 1.416272521018982, |
|
"learning_rate": 9.131944444444445e-06, |
|
"loss": 3.0905, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.18151173343705432, |
|
"eval_loss": 3.011669635772705, |
|
"eval_runtime": 86.7963, |
|
"eval_samples_per_second": 78.99, |
|
"eval_steps_per_second": 9.874, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1866977829638273, |
|
"grad_norm": 2.392760753631592, |
|
"learning_rate": 9.074074074074075e-06, |
|
"loss": 2.899, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1918838324906003, |
|
"grad_norm": 2.157073736190796, |
|
"learning_rate": 9.016203703703704e-06, |
|
"loss": 2.8805, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.19706988201737327, |
|
"grad_norm": 1.3468859195709229, |
|
"learning_rate": 8.958333333333334e-06, |
|
"loss": 2.9088, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.20225593154414626, |
|
"grad_norm": 1.31523597240448, |
|
"learning_rate": 8.900462962962964e-06, |
|
"loss": 2.9042, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.20744198107091924, |
|
"grad_norm": 1.5310451984405518, |
|
"learning_rate": 8.842592592592594e-06, |
|
"loss": 2.8714, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.20744198107091924, |
|
"eval_loss": 2.996051073074341, |
|
"eval_runtime": 86.8896, |
|
"eval_samples_per_second": 78.905, |
|
"eval_steps_per_second": 9.863, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2126280305976922, |
|
"grad_norm": 1.4996702671051025, |
|
"learning_rate": 8.784722222222224e-06, |
|
"loss": 2.8562, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.21781408012446518, |
|
"grad_norm": 1.5760114192962646, |
|
"learning_rate": 8.726851851851854e-06, |
|
"loss": 2.8765, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.22300012965123817, |
|
"grad_norm": 1.4508875608444214, |
|
"learning_rate": 8.668981481481482e-06, |
|
"loss": 3.0157, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.22818617917801115, |
|
"grad_norm": 1.8075891733169556, |
|
"learning_rate": 8.611111111111112e-06, |
|
"loss": 2.9959, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.23337222870478413, |
|
"grad_norm": 1.3157118558883667, |
|
"learning_rate": 8.553240740740742e-06, |
|
"loss": 2.8601, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.23337222870478413, |
|
"eval_loss": 2.985109806060791, |
|
"eval_runtime": 86.8685, |
|
"eval_samples_per_second": 78.924, |
|
"eval_steps_per_second": 9.865, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.23855827823155712, |
|
"grad_norm": 1.4455087184906006, |
|
"learning_rate": 8.495370370370372e-06, |
|
"loss": 2.938, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2437443277583301, |
|
"grad_norm": 1.647705078125, |
|
"learning_rate": 8.4375e-06, |
|
"loss": 2.9344, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.24893037728510306, |
|
"grad_norm": 1.3752045631408691, |
|
"learning_rate": 8.37962962962963e-06, |
|
"loss": 2.9085, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2541164268118761, |
|
"grad_norm": 1.541107416152954, |
|
"learning_rate": 8.32175925925926e-06, |
|
"loss": 2.8821, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.25930247633864906, |
|
"grad_norm": 1.626123070716858, |
|
"learning_rate": 8.263888888888888e-06, |
|
"loss": 2.8736, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.25930247633864906, |
|
"eval_loss": 2.976685047149658, |
|
"eval_runtime": 87.0158, |
|
"eval_samples_per_second": 78.79, |
|
"eval_steps_per_second": 9.849, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.26448852586542204, |
|
"grad_norm": 1.8355194330215454, |
|
"learning_rate": 8.20601851851852e-06, |
|
"loss": 2.9574, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.26967457539219497, |
|
"grad_norm": 1.9231911897659302, |
|
"learning_rate": 8.148148148148148e-06, |
|
"loss": 2.8982, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.27486062491896796, |
|
"grad_norm": 1.8790838718414307, |
|
"learning_rate": 8.090277777777778e-06, |
|
"loss": 2.9279, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.28004667444574094, |
|
"grad_norm": 1.870382308959961, |
|
"learning_rate": 8.032407407407408e-06, |
|
"loss": 2.868, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.2852327239725139, |
|
"grad_norm": 1.8041619062423706, |
|
"learning_rate": 7.974537037037038e-06, |
|
"loss": 2.9388, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2852327239725139, |
|
"eval_loss": 2.969029664993286, |
|
"eval_runtime": 87.0488, |
|
"eval_samples_per_second": 78.76, |
|
"eval_steps_per_second": 9.845, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.2904187734992869, |
|
"grad_norm": 1.3382441997528076, |
|
"learning_rate": 7.916666666666667e-06, |
|
"loss": 2.8644, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2956048230260599, |
|
"grad_norm": 1.5517808198928833, |
|
"learning_rate": 7.858796296296297e-06, |
|
"loss": 2.99, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3007908725528329, |
|
"grad_norm": 1.3038547039031982, |
|
"learning_rate": 7.800925925925926e-06, |
|
"loss": 2.8649, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.30597692207960586, |
|
"grad_norm": 1.515796184539795, |
|
"learning_rate": 7.743055555555556e-06, |
|
"loss": 2.8558, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.31116297160637885, |
|
"grad_norm": 1.7148423194885254, |
|
"learning_rate": 7.685185185185185e-06, |
|
"loss": 2.8933, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.31116297160637885, |
|
"eval_loss": 2.9627323150634766, |
|
"eval_runtime": 87.0142, |
|
"eval_samples_per_second": 78.792, |
|
"eval_steps_per_second": 9.849, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.31634902113315183, |
|
"grad_norm": 1.7385451793670654, |
|
"learning_rate": 7.627314814814816e-06, |
|
"loss": 2.9509, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3215350706599248, |
|
"grad_norm": 1.514612078666687, |
|
"learning_rate": 7.569444444444445e-06, |
|
"loss": 2.9305, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3267211201866978, |
|
"grad_norm": 1.6873114109039307, |
|
"learning_rate": 7.511574074074075e-06, |
|
"loss": 2.908, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.3319071697134708, |
|
"grad_norm": 1.757570505142212, |
|
"learning_rate": 7.453703703703704e-06, |
|
"loss": 2.9484, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.33709321924024377, |
|
"grad_norm": 1.4156616926193237, |
|
"learning_rate": 7.395833333333335e-06, |
|
"loss": 2.9533, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.33709321924024377, |
|
"eval_loss": 2.9573678970336914, |
|
"eval_runtime": 86.9385, |
|
"eval_samples_per_second": 78.86, |
|
"eval_steps_per_second": 9.858, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.3422792687670167, |
|
"grad_norm": 2.0717520713806152, |
|
"learning_rate": 7.337962962962964e-06, |
|
"loss": 2.8436, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3474653182937897, |
|
"grad_norm": 1.478298544883728, |
|
"learning_rate": 7.280092592592594e-06, |
|
"loss": 2.9204, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.35265136782056267, |
|
"grad_norm": 1.712254524230957, |
|
"learning_rate": 7.222222222222223e-06, |
|
"loss": 2.9538, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.35783741734733565, |
|
"grad_norm": 1.3590025901794434, |
|
"learning_rate": 7.164351851851853e-06, |
|
"loss": 2.9414, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.36302346687410864, |
|
"grad_norm": 2.1505372524261475, |
|
"learning_rate": 7.106481481481482e-06, |
|
"loss": 3.0889, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.36302346687410864, |
|
"eval_loss": 2.9526450634002686, |
|
"eval_runtime": 87.0066, |
|
"eval_samples_per_second": 78.799, |
|
"eval_steps_per_second": 9.85, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3682095164008816, |
|
"grad_norm": 1.6749532222747803, |
|
"learning_rate": 7.048611111111112e-06, |
|
"loss": 2.9108, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.3733955659276546, |
|
"grad_norm": 1.8624922037124634, |
|
"learning_rate": 6.990740740740741e-06, |
|
"loss": 2.7685, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3785816154544276, |
|
"grad_norm": 1.6713345050811768, |
|
"learning_rate": 6.932870370370371e-06, |
|
"loss": 2.9072, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3837676649812006, |
|
"grad_norm": 1.852766513824463, |
|
"learning_rate": 6.875e-06, |
|
"loss": 2.8404, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.38895371450797356, |
|
"grad_norm": 1.5447180271148682, |
|
"learning_rate": 6.817129629629629e-06, |
|
"loss": 2.8887, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.38895371450797356, |
|
"eval_loss": 2.9479236602783203, |
|
"eval_runtime": 87.1235, |
|
"eval_samples_per_second": 78.693, |
|
"eval_steps_per_second": 9.837, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.39413976403474654, |
|
"grad_norm": 1.8909484148025513, |
|
"learning_rate": 6.75925925925926e-06, |
|
"loss": 2.9101, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3993258135615195, |
|
"grad_norm": 2.3639767169952393, |
|
"learning_rate": 6.701388888888889e-06, |
|
"loss": 2.6818, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4045118630882925, |
|
"grad_norm": 1.801645040512085, |
|
"learning_rate": 6.643518518518519e-06, |
|
"loss": 2.8709, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4096979126150655, |
|
"grad_norm": 1.6030060052871704, |
|
"learning_rate": 6.5856481481481484e-06, |
|
"loss": 2.9025, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4148839621418385, |
|
"grad_norm": 1.7171462774276733, |
|
"learning_rate": 6.5277777777777784e-06, |
|
"loss": 3.0349, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4148839621418385, |
|
"eval_loss": 2.943669557571411, |
|
"eval_runtime": 86.9632, |
|
"eval_samples_per_second": 78.838, |
|
"eval_steps_per_second": 9.855, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4200700116686114, |
|
"grad_norm": 1.5010789632797241, |
|
"learning_rate": 6.4699074074074076e-06, |
|
"loss": 2.8971, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.4252560611953844, |
|
"grad_norm": 1.8026429414749146, |
|
"learning_rate": 6.4120370370370375e-06, |
|
"loss": 2.9048, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.4304421107221574, |
|
"grad_norm": 2.0750973224639893, |
|
"learning_rate": 6.354166666666667e-06, |
|
"loss": 2.8126, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.43562816024893036, |
|
"grad_norm": 1.7901376485824585, |
|
"learning_rate": 6.296296296296297e-06, |
|
"loss": 2.8484, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.44081420977570335, |
|
"grad_norm": 1.805792212486267, |
|
"learning_rate": 6.238425925925926e-06, |
|
"loss": 2.8988, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.44081420977570335, |
|
"eval_loss": 2.9402518272399902, |
|
"eval_runtime": 87.0076, |
|
"eval_samples_per_second": 78.798, |
|
"eval_steps_per_second": 9.85, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.44600025930247633, |
|
"grad_norm": 1.8961949348449707, |
|
"learning_rate": 6.180555555555557e-06, |
|
"loss": 2.8334, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4511863088292493, |
|
"grad_norm": 1.4542121887207031, |
|
"learning_rate": 6.122685185185186e-06, |
|
"loss": 2.9224, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4563723583560223, |
|
"grad_norm": 1.6367915868759155, |
|
"learning_rate": 6.064814814814816e-06, |
|
"loss": 2.8376, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.4615584078827953, |
|
"grad_norm": 1.796552062034607, |
|
"learning_rate": 6.006944444444445e-06, |
|
"loss": 2.8408, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.46674445740956827, |
|
"grad_norm": 1.7545686960220337, |
|
"learning_rate": 5.949074074074075e-06, |
|
"loss": 3.0174, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.46674445740956827, |
|
"eval_loss": 2.9366812705993652, |
|
"eval_runtime": 87.091, |
|
"eval_samples_per_second": 78.722, |
|
"eval_steps_per_second": 9.84, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.47193050693634125, |
|
"grad_norm": 1.7723950147628784, |
|
"learning_rate": 5.891203703703704e-06, |
|
"loss": 2.8517, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.47711655646311424, |
|
"grad_norm": 1.7314085960388184, |
|
"learning_rate": 5.833333333333334e-06, |
|
"loss": 2.8079, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.4823026059898872, |
|
"grad_norm": 1.993122935295105, |
|
"learning_rate": 5.775462962962963e-06, |
|
"loss": 2.9262, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4874886555166602, |
|
"grad_norm": 2.3595573902130127, |
|
"learning_rate": 5.717592592592593e-06, |
|
"loss": 2.9056, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.4926747050434332, |
|
"grad_norm": 1.8839762210845947, |
|
"learning_rate": 5.659722222222222e-06, |
|
"loss": 2.8617, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4926747050434332, |
|
"eval_loss": 2.9337050914764404, |
|
"eval_runtime": 87.1298, |
|
"eval_samples_per_second": 78.687, |
|
"eval_steps_per_second": 9.836, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.4978607545702061, |
|
"grad_norm": 2.0222883224487305, |
|
"learning_rate": 5.601851851851853e-06, |
|
"loss": 2.869, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5030468040969791, |
|
"grad_norm": 1.7449781894683838, |
|
"learning_rate": 5.543981481481482e-06, |
|
"loss": 2.885, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5082328536237521, |
|
"grad_norm": 2.0181517601013184, |
|
"learning_rate": 5.486111111111112e-06, |
|
"loss": 2.8688, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5134189031505251, |
|
"grad_norm": 1.6998732089996338, |
|
"learning_rate": 5.428240740740741e-06, |
|
"loss": 2.917, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5186049526772981, |
|
"grad_norm": 1.8947477340698242, |
|
"learning_rate": 5.370370370370371e-06, |
|
"loss": 2.8555, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5186049526772981, |
|
"eval_loss": 2.931408643722534, |
|
"eval_runtime": 87.0111, |
|
"eval_samples_per_second": 78.795, |
|
"eval_steps_per_second": 9.849, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.523791002204071, |
|
"grad_norm": 2.1364452838897705, |
|
"learning_rate": 5.3125e-06, |
|
"loss": 2.7673, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.5289770517308441, |
|
"grad_norm": 2.374993085861206, |
|
"learning_rate": 5.2546296296296295e-06, |
|
"loss": 2.8901, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.534163101257617, |
|
"grad_norm": 2.291280746459961, |
|
"learning_rate": 5.1967592592592595e-06, |
|
"loss": 2.9203, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5393491507843899, |
|
"grad_norm": 2.6651227474212646, |
|
"learning_rate": 5.138888888888889e-06, |
|
"loss": 2.8047, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.544535200311163, |
|
"grad_norm": 1.8160004615783691, |
|
"learning_rate": 5.081018518518519e-06, |
|
"loss": 2.8169, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.544535200311163, |
|
"eval_loss": 2.9287192821502686, |
|
"eval_runtime": 87.097, |
|
"eval_samples_per_second": 78.717, |
|
"eval_steps_per_second": 9.84, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5497212498379359, |
|
"grad_norm": 1.9724560976028442, |
|
"learning_rate": 5.023148148148148e-06, |
|
"loss": 2.9291, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.554907299364709, |
|
"grad_norm": 2.5785677433013916, |
|
"learning_rate": 4.9652777777777786e-06, |
|
"loss": 2.8179, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.5600933488914819, |
|
"grad_norm": 1.6687511205673218, |
|
"learning_rate": 4.907407407407408e-06, |
|
"loss": 2.7731, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5652793984182549, |
|
"grad_norm": 2.003852605819702, |
|
"learning_rate": 4.849537037037038e-06, |
|
"loss": 2.8838, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.5704654479450278, |
|
"grad_norm": 2.078784227371216, |
|
"learning_rate": 4.791666666666668e-06, |
|
"loss": 2.9151, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5704654479450278, |
|
"eval_loss": 2.9263577461242676, |
|
"eval_runtime": 87.0972, |
|
"eval_samples_per_second": 78.717, |
|
"eval_steps_per_second": 9.84, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5756514974718009, |
|
"grad_norm": 1.9661970138549805, |
|
"learning_rate": 4.733796296296297e-06, |
|
"loss": 2.9136, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5808375469985738, |
|
"grad_norm": 2.3089287281036377, |
|
"learning_rate": 4.675925925925927e-06, |
|
"loss": 2.8187, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5860235965253469, |
|
"grad_norm": 1.668616771697998, |
|
"learning_rate": 4.618055555555556e-06, |
|
"loss": 2.777, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.5912096460521198, |
|
"grad_norm": 1.6333500146865845, |
|
"learning_rate": 4.560185185185186e-06, |
|
"loss": 2.8093, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.5963956955788928, |
|
"grad_norm": 1.801171064376831, |
|
"learning_rate": 4.502314814814815e-06, |
|
"loss": 2.774, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5963956955788928, |
|
"eval_loss": 2.92441725730896, |
|
"eval_runtime": 87.1718, |
|
"eval_samples_per_second": 78.649, |
|
"eval_steps_per_second": 9.831, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6015817451056658, |
|
"grad_norm": 1.9352556467056274, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 2.8634, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6067677946324388, |
|
"grad_norm": 2.076831817626953, |
|
"learning_rate": 4.386574074074074e-06, |
|
"loss": 2.8592, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6119538441592117, |
|
"grad_norm": 2.5992085933685303, |
|
"learning_rate": 4.328703703703704e-06, |
|
"loss": 2.7938, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6171398936859847, |
|
"grad_norm": 1.8197615146636963, |
|
"learning_rate": 4.270833333333333e-06, |
|
"loss": 2.8091, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6223259432127577, |
|
"grad_norm": 2.15179181098938, |
|
"learning_rate": 4.212962962962963e-06, |
|
"loss": 2.7899, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6223259432127577, |
|
"eval_loss": 2.922563314437866, |
|
"eval_runtime": 87.083, |
|
"eval_samples_per_second": 78.729, |
|
"eval_steps_per_second": 9.841, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6275119927395306, |
|
"grad_norm": 2.6506097316741943, |
|
"learning_rate": 4.155092592592593e-06, |
|
"loss": 2.8336, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6326980422663037, |
|
"grad_norm": 2.2627806663513184, |
|
"learning_rate": 4.097222222222222e-06, |
|
"loss": 2.8635, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.6378840917930766, |
|
"grad_norm": 2.34142804145813, |
|
"learning_rate": 4.039351851851852e-06, |
|
"loss": 2.9534, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6430701413198496, |
|
"grad_norm": 1.9826796054840088, |
|
"learning_rate": 3.9814814814814814e-06, |
|
"loss": 2.8219, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.6482561908466226, |
|
"grad_norm": 2.4062864780426025, |
|
"learning_rate": 3.9236111111111114e-06, |
|
"loss": 2.8348, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6482561908466226, |
|
"eval_loss": 2.921142339706421, |
|
"eval_runtime": 87.113, |
|
"eval_samples_per_second": 78.702, |
|
"eval_steps_per_second": 9.838, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6534422403733956, |
|
"grad_norm": 1.7716823816299438, |
|
"learning_rate": 3.865740740740741e-06, |
|
"loss": 2.8568, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.6586282899001685, |
|
"grad_norm": 2.330777406692505, |
|
"learning_rate": 3.8078703703703705e-06, |
|
"loss": 2.9541, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6638143394269416, |
|
"grad_norm": 1.8923736810684204, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 2.8021, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.6690003889537145, |
|
"grad_norm": 1.858708381652832, |
|
"learning_rate": 3.69212962962963e-06, |
|
"loss": 2.8218, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6741864384804875, |
|
"grad_norm": 1.7053884267807007, |
|
"learning_rate": 3.6342592592592596e-06, |
|
"loss": 2.7704, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6741864384804875, |
|
"eval_loss": 2.919332504272461, |
|
"eval_runtime": 87.1672, |
|
"eval_samples_per_second": 78.653, |
|
"eval_steps_per_second": 9.832, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6793724880072605, |
|
"grad_norm": 2.1638433933258057, |
|
"learning_rate": 3.576388888888889e-06, |
|
"loss": 2.8667, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.6845585375340334, |
|
"grad_norm": 1.951277256011963, |
|
"learning_rate": 3.5185185185185187e-06, |
|
"loss": 2.8129, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.6897445870608064, |
|
"grad_norm": 2.4789998531341553, |
|
"learning_rate": 3.4606481481481487e-06, |
|
"loss": 2.8462, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.6949306365875794, |
|
"grad_norm": 1.6878948211669922, |
|
"learning_rate": 3.4027777777777783e-06, |
|
"loss": 2.8111, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.7001166861143524, |
|
"grad_norm": 1.99266517162323, |
|
"learning_rate": 3.344907407407408e-06, |
|
"loss": 2.9152, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7001166861143524, |
|
"eval_loss": 2.917971134185791, |
|
"eval_runtime": 87.0384, |
|
"eval_samples_per_second": 78.77, |
|
"eval_steps_per_second": 9.846, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7053027356411253, |
|
"grad_norm": 1.6022475957870483, |
|
"learning_rate": 3.2870370370370374e-06, |
|
"loss": 2.8767, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7104887851678984, |
|
"grad_norm": 2.165276527404785, |
|
"learning_rate": 3.229166666666667e-06, |
|
"loss": 2.8321, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.7156748346946713, |
|
"grad_norm": 2.1206817626953125, |
|
"learning_rate": 3.171296296296297e-06, |
|
"loss": 2.8714, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7208608842214443, |
|
"grad_norm": 2.4431049823760986, |
|
"learning_rate": 3.1134259259259265e-06, |
|
"loss": 2.8628, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.7260469337482173, |
|
"grad_norm": 2.304499387741089, |
|
"learning_rate": 3.055555555555556e-06, |
|
"loss": 2.7895, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7260469337482173, |
|
"eval_loss": 2.917088270187378, |
|
"eval_runtime": 87.1021, |
|
"eval_samples_per_second": 78.712, |
|
"eval_steps_per_second": 9.839, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7312329832749903, |
|
"grad_norm": 1.3344799280166626, |
|
"learning_rate": 2.9976851851851856e-06, |
|
"loss": 2.9102, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.7364190328017632, |
|
"grad_norm": 1.719092607498169, |
|
"learning_rate": 2.9398148148148147e-06, |
|
"loss": 2.7918, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7416050823285363, |
|
"grad_norm": 1.8856807947158813, |
|
"learning_rate": 2.8819444444444443e-06, |
|
"loss": 2.9078, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.7467911318553092, |
|
"grad_norm": 2.222034215927124, |
|
"learning_rate": 2.8240740740740743e-06, |
|
"loss": 2.8047, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7519771813820822, |
|
"grad_norm": 2.0205276012420654, |
|
"learning_rate": 2.766203703703704e-06, |
|
"loss": 2.8561, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7519771813820822, |
|
"eval_loss": 2.9157726764678955, |
|
"eval_runtime": 87.0003, |
|
"eval_samples_per_second": 78.804, |
|
"eval_steps_per_second": 9.851, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.7571632309088552, |
|
"grad_norm": 2.2088751792907715, |
|
"learning_rate": 2.7083333333333334e-06, |
|
"loss": 2.8221, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7623492804356281, |
|
"grad_norm": 1.8010945320129395, |
|
"learning_rate": 2.650462962962963e-06, |
|
"loss": 2.9598, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7675353299624011, |
|
"grad_norm": 1.8427363634109497, |
|
"learning_rate": 2.5925925925925925e-06, |
|
"loss": 2.9063, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7727213794891741, |
|
"grad_norm": 2.112938404083252, |
|
"learning_rate": 2.5347222222222225e-06, |
|
"loss": 2.8391, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7779074290159471, |
|
"grad_norm": 1.8270915746688843, |
|
"learning_rate": 2.476851851851852e-06, |
|
"loss": 2.7957, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7779074290159471, |
|
"eval_loss": 2.9150469303131104, |
|
"eval_runtime": 87.2201, |
|
"eval_samples_per_second": 78.606, |
|
"eval_steps_per_second": 9.826, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.78309347854272, |
|
"grad_norm": 2.6664621829986572, |
|
"learning_rate": 2.4189814814814816e-06, |
|
"loss": 2.8278, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.7882795280694931, |
|
"grad_norm": 2.025564193725586, |
|
"learning_rate": 2.361111111111111e-06, |
|
"loss": 2.8109, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.793465577596266, |
|
"grad_norm": 1.752272605895996, |
|
"learning_rate": 2.3032407407407407e-06, |
|
"loss": 2.8855, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.798651627123039, |
|
"grad_norm": 1.9612879753112793, |
|
"learning_rate": 2.2453703703703707e-06, |
|
"loss": 2.8185, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.803837676649812, |
|
"grad_norm": 1.9773454666137695, |
|
"learning_rate": 2.1875000000000002e-06, |
|
"loss": 2.8071, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.803837676649812, |
|
"eval_loss": 2.9142231941223145, |
|
"eval_runtime": 87.0585, |
|
"eval_samples_per_second": 78.752, |
|
"eval_steps_per_second": 9.844, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.809023726176585, |
|
"grad_norm": 3.341409206390381, |
|
"learning_rate": 2.1296296296296298e-06, |
|
"loss": 2.8467, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.814209775703358, |
|
"grad_norm": 1.7923222780227661, |
|
"learning_rate": 2.0717592592592593e-06, |
|
"loss": 2.8776, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.819395825230131, |
|
"grad_norm": 1.8666836023330688, |
|
"learning_rate": 2.0138888888888893e-06, |
|
"loss": 2.8347, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.8245818747569039, |
|
"grad_norm": 2.2585649490356445, |
|
"learning_rate": 1.956018518518519e-06, |
|
"loss": 2.8068, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.829767924283677, |
|
"grad_norm": 1.8757206201553345, |
|
"learning_rate": 1.8981481481481484e-06, |
|
"loss": 2.8491, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.829767924283677, |
|
"eval_loss": 2.913360834121704, |
|
"eval_runtime": 87.2275, |
|
"eval_samples_per_second": 78.599, |
|
"eval_steps_per_second": 9.825, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8349539738104499, |
|
"grad_norm": 1.6995919942855835, |
|
"learning_rate": 1.840277777777778e-06, |
|
"loss": 2.8562, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.8401400233372228, |
|
"grad_norm": 1.780346393585205, |
|
"learning_rate": 1.7824074074074073e-06, |
|
"loss": 2.8461, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.8453260728639959, |
|
"grad_norm": 2.147747755050659, |
|
"learning_rate": 1.724537037037037e-06, |
|
"loss": 2.7899, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.8505121223907688, |
|
"grad_norm": 1.769389033317566, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 2.8092, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.8556981719175418, |
|
"grad_norm": 3.257025957107544, |
|
"learning_rate": 1.6087962962962964e-06, |
|
"loss": 2.8567, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8556981719175418, |
|
"eval_loss": 2.9123799800872803, |
|
"eval_runtime": 87.0603, |
|
"eval_samples_per_second": 78.75, |
|
"eval_steps_per_second": 9.844, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8608842214443148, |
|
"grad_norm": 1.7707329988479614, |
|
"learning_rate": 1.550925925925926e-06, |
|
"loss": 2.856, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.8660702709710878, |
|
"grad_norm": 1.8951716423034668, |
|
"learning_rate": 1.4930555555555555e-06, |
|
"loss": 2.8768, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8712563204978607, |
|
"grad_norm": 2.0087685585021973, |
|
"learning_rate": 1.4351851851851853e-06, |
|
"loss": 2.8264, |
|
"step": 1680 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1928, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6099718731923456e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|