{
  "best_metric": 0.44520583748817444,
  "best_model_checkpoint": "saves/Orca/lora/checkpoint-3000",
  "epoch": 0.9998413957176844,
  "eval_steps": 500,
  "global_step": 3152,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0031720856463124504, "grad_norm": 0.8383375406265259, "learning_rate": 5.000000000000001e-07, "loss": 2.0081, "step": 10 },
    { "epoch": 0.006344171292624901, "grad_norm": 0.9924185276031494, "learning_rate": 1.0000000000000002e-06, "loss": 2.0141, "step": 20 },
    { "epoch": 0.00951625693893735, "grad_norm": 1.7123312950134277, "learning_rate": 1.5e-06, "loss": 2.0134, "step": 30 },
    { "epoch": 0.012688342585249802, "grad_norm": 1.6002644300460815, "learning_rate": 2.0000000000000003e-06, "loss": 1.9886, "step": 40 },
    { "epoch": 0.01586042823156225, "grad_norm": 1.4284940958023071, "learning_rate": 2.5e-06, "loss": 1.9963, "step": 50 },
    { "epoch": 0.0190325138778747, "grad_norm": 1.9293030500411987, "learning_rate": 3e-06, "loss": 1.9858, "step": 60 },
    { "epoch": 0.022204599524187154, "grad_norm": 1.8281705379486084, "learning_rate": 3.5000000000000004e-06, "loss": 1.9608, "step": 70 },
    { "epoch": 0.025376685170499604, "grad_norm": 1.8404144048690796, "learning_rate": 4.000000000000001e-06, "loss": 1.9317, "step": 80 },
    { "epoch": 0.028548770816812053, "grad_norm": 1.8080644607543945, "learning_rate": 4.5e-06, "loss": 1.8928, "step": 90 },
    { "epoch": 0.0317208564631245, "grad_norm": 1.4789849519729614, "learning_rate": 5e-06, "loss": 1.8453, "step": 100 },
    { "epoch": 0.034892942109436956, "grad_norm": 2.3675060272216797, "learning_rate": 5.500000000000001e-06, "loss": 1.7683, "step": 110 },
    { "epoch": 0.0380650277557494, "grad_norm": 6.9262309074401855, "learning_rate": 6e-06, "loss": 1.6648, "step": 120 },
    { "epoch": 0.041237113402061855, "grad_norm": 1.5342968702316284, "learning_rate": 6.5000000000000004e-06, "loss": 1.5507, "step": 130 },
    { "epoch": 0.04440919904837431, "grad_norm": 1.7138350009918213, "learning_rate": 7.000000000000001e-06, "loss": 1.4388, "step": 140 },
    { "epoch": 0.047581284694686754, "grad_norm": 0.9868146181106567, "learning_rate": 7.5e-06, "loss": 1.3484, "step": 150 },
    { "epoch": 0.05075337034099921, "grad_norm": 0.649817705154419, "learning_rate": 8.000000000000001e-06, "loss": 1.2671, "step": 160 },
    { "epoch": 0.05392545598731166, "grad_norm": 0.5127522945404053, "learning_rate": 8.500000000000002e-06, "loss": 1.19, "step": 170 },
    { "epoch": 0.057097541633624106, "grad_norm": 0.43639031052589417, "learning_rate": 9e-06, "loss": 1.1479, "step": 180 },
    { "epoch": 0.06026962727993656, "grad_norm": 0.31427648663520813, "learning_rate": 9.5e-06, "loss": 1.0992, "step": 190 },
    { "epoch": 0.063441712926249, "grad_norm": 0.2897365689277649, "learning_rate": 1e-05, "loss": 1.0673, "step": 200 },
    { "epoch": 0.06661379857256146, "grad_norm": 0.43495845794677734, "learning_rate": 1.05e-05, "loss": 1.0399, "step": 210 },
    { "epoch": 0.06978588421887391, "grad_norm": 0.2536582946777344, "learning_rate": 1.1000000000000001e-05, "loss": 1.0043, "step": 220 },
    { "epoch": 0.07295796986518636, "grad_norm": 0.3480685353279114, "learning_rate": 1.1500000000000002e-05, "loss": 0.9747, "step": 230 },
    { "epoch": 0.0761300555114988, "grad_norm": 0.25048044323921204, "learning_rate": 1.2e-05, "loss": 0.9477, "step": 240 },
    { "epoch": 0.07930214115781126, "grad_norm": 0.31881338357925415, "learning_rate": 1.25e-05, "loss": 0.9099, "step": 250 },
    { "epoch": 0.08247422680412371, "grad_norm": 0.25372961163520813, "learning_rate": 1.3000000000000001e-05, "loss": 0.8661, "step": 260 },
    { "epoch": 0.08564631245043616, "grad_norm": 0.2892467975616455, "learning_rate": 1.3500000000000001e-05, "loss": 0.8306, "step": 270 },
    { "epoch": 0.08881839809674862, "grad_norm": 0.265289306640625, "learning_rate": 1.4000000000000001e-05, "loss": 0.79, "step": 280 },
    { "epoch": 0.09199048374306107, "grad_norm": 0.2384192794561386, "learning_rate": 1.45e-05, "loss": 0.7414, "step": 290 },
    { "epoch": 0.09516256938937351, "grad_norm": 0.2318211942911148, "learning_rate": 1.5e-05, "loss": 0.6994, "step": 300 },
    { "epoch": 0.09833465503568596, "grad_norm": 0.22584764659404755, "learning_rate": 1.55e-05, "loss": 0.6663, "step": 310 },
    { "epoch": 0.10150674068199841, "grad_norm": 0.2281956821680069, "learning_rate": 1.6000000000000003e-05, "loss": 0.6366, "step": 320 },
    { "epoch": 0.10467882632831087, "grad_norm": 0.23853954672813416, "learning_rate": 1.65e-05, "loss": 0.6105, "step": 330 },
    { "epoch": 0.10785091197462332, "grad_norm": 0.2249988317489624, "learning_rate": 1.7000000000000003e-05, "loss": 0.5923, "step": 340 },
    { "epoch": 0.11102299762093576, "grad_norm": 0.22325117886066437, "learning_rate": 1.75e-05, "loss": 0.5782, "step": 350 },
    { "epoch": 0.11419508326724821, "grad_norm": 0.26995840668678284, "learning_rate": 1.8e-05, "loss": 0.5588, "step": 360 },
    { "epoch": 0.11736716891356067, "grad_norm": 0.3128603994846344, "learning_rate": 1.85e-05, "loss": 0.5539, "step": 370 },
    { "epoch": 0.12053925455987312, "grad_norm": 0.2401484102010727, "learning_rate": 1.9e-05, "loss": 0.5463, "step": 380 },
    { "epoch": 0.12371134020618557, "grad_norm": 0.252087265253067, "learning_rate": 1.9500000000000003e-05, "loss": 0.5396, "step": 390 },
    { "epoch": 0.126883425852498, "grad_norm": 0.3551071286201477, "learning_rate": 2e-05, "loss": 0.5293, "step": 400 },
    { "epoch": 0.13005551149881048, "grad_norm": 0.2852734625339508, "learning_rate": 2.05e-05, "loss": 0.5327, "step": 410 },
    { "epoch": 0.13322759714512292, "grad_norm": 0.25599339604377747, "learning_rate": 2.1e-05, "loss": 0.5215, "step": 420 },
    { "epoch": 0.13639968279143536, "grad_norm": 0.2308124303817749, "learning_rate": 2.15e-05, "loss": 0.5179, "step": 430 },
    { "epoch": 0.13957176843774782, "grad_norm": 0.28417715430259705, "learning_rate": 2.2000000000000003e-05, "loss": 0.5122, "step": 440 },
    { "epoch": 0.14274385408406026, "grad_norm": 0.32094892859458923, "learning_rate": 2.25e-05, "loss": 0.5078, "step": 450 },
    { "epoch": 0.14591593973037273, "grad_norm": 0.28217512369155884, "learning_rate": 2.3000000000000003e-05, "loss": 0.5064, "step": 460 },
    { "epoch": 0.14908802537668517, "grad_norm": 0.3319534659385681, "learning_rate": 2.35e-05, "loss": 0.5056, "step": 470 },
    { "epoch": 0.1522601110229976, "grad_norm": 0.35069146752357483, "learning_rate": 2.4e-05, "loss": 0.5001, "step": 480 },
    { "epoch": 0.15543219666931007, "grad_norm": 0.2778172492980957, "learning_rate": 2.45e-05, "loss": 0.4994, "step": 490 },
    { "epoch": 0.1586042823156225, "grad_norm": 0.2950364053249359, "learning_rate": 2.5e-05, "loss": 0.5009, "step": 500 },
    { "epoch": 0.1586042823156225, "eval_loss": 0.49637770652770996, "eval_runtime": 23627.0877, "eval_samples_per_second": 0.044, "eval_steps_per_second": 0.022, "step": 500 },
    { "epoch": 0.16177636796193498, "grad_norm": 0.26569539308547974, "learning_rate": 2.5500000000000003e-05, "loss": 0.4978, "step": 510 },
    { "epoch": 0.16494845360824742, "grad_norm": 0.26948851346969604, "learning_rate": 2.6000000000000002e-05, "loss": 0.4926, "step": 520 },
    { "epoch": 0.1681205392545599, "grad_norm": 0.29083552956581116, "learning_rate": 2.6500000000000004e-05, "loss": 0.4893, "step": 530 },
    { "epoch": 0.17129262490087233, "grad_norm": 0.2939367890357971, "learning_rate": 2.7000000000000002e-05, "loss": 0.491, "step": 540 },
    { "epoch": 0.17446471054718476, "grad_norm": 0.28195658326148987, "learning_rate": 2.7500000000000004e-05, "loss": 0.4865, "step": 550 },
    { "epoch": 0.17763679619349723, "grad_norm": 0.2827642858028412, "learning_rate": 2.8000000000000003e-05, "loss": 0.4851, "step": 560 },
    { "epoch": 0.18080888183980967, "grad_norm": 0.2715272605419159, "learning_rate": 2.8499999999999998e-05, "loss": 0.4833, "step": 570 },
    { "epoch": 0.18398096748612214, "grad_norm": 0.30428314208984375, "learning_rate": 2.9e-05, "loss": 0.4834, "step": 580 },
    { "epoch": 0.18715305313243458, "grad_norm": 0.2857949435710907, "learning_rate": 2.95e-05, "loss": 0.4796, "step": 590 },
    { "epoch": 0.19032513877874702, "grad_norm": 0.302212655544281, "learning_rate": 3e-05, "loss": 0.4826, "step": 600 },
    { "epoch": 0.19349722442505948, "grad_norm": 0.34079357981681824, "learning_rate": 3.05e-05, "loss": 0.4826, "step": 610 },
    { "epoch": 0.19666931007137192, "grad_norm": 0.3239024877548218, "learning_rate": 3.1e-05, "loss": 0.4804, "step": 620 },
    { "epoch": 0.1998413957176844, "grad_norm": 0.36531686782836914, "learning_rate": 3.15e-05, "loss": 0.4782, "step": 630 },
    { "epoch": 0.20301348136399683, "grad_norm": 0.3582271337509155, "learning_rate": 3.2000000000000005e-05, "loss": 0.4784, "step": 640 },
    { "epoch": 0.20618556701030927, "grad_norm": 0.3250937759876251, "learning_rate": 3.2500000000000004e-05, "loss": 0.4716, "step": 650 },
    { "epoch": 0.20935765265662173, "grad_norm": 0.28494277596473694, "learning_rate": 3.3e-05, "loss": 0.4764, "step": 660 },
    { "epoch": 0.21252973830293417, "grad_norm": 0.3156846761703491, "learning_rate": 3.35e-05, "loss": 0.4742, "step": 670 },
    { "epoch": 0.21570182394924664, "grad_norm": 0.27053794264793396, "learning_rate": 3.4000000000000007e-05, "loss": 0.4765, "step": 680 },
    { "epoch": 0.21887390959555908, "grad_norm": 0.31487756967544556, "learning_rate": 3.45e-05, "loss": 0.4749, "step": 690 },
    { "epoch": 0.22204599524187152, "grad_norm": 0.2739400267601013, "learning_rate": 3.5e-05, "loss": 0.4693, "step": 700 },
    { "epoch": 0.22521808088818399, "grad_norm": 0.2924807369709015, "learning_rate": 3.55e-05, "loss": 0.4697, "step": 710 },
    { "epoch": 0.22839016653449642, "grad_norm": 0.27241086959838867, "learning_rate": 3.6e-05, "loss": 0.4711, "step": 720 },
    { "epoch": 0.2315622521808089, "grad_norm": 0.249298095703125, "learning_rate": 3.65e-05, "loss": 0.4748, "step": 730 },
    { "epoch": 0.23473433782712133, "grad_norm": 0.33761996030807495, "learning_rate": 3.7e-05, "loss": 0.4692, "step": 740 },
    { "epoch": 0.23790642347343377, "grad_norm": 0.3454744517803192, "learning_rate": 3.7500000000000003e-05, "loss": 0.4691, "step": 750 },
    { "epoch": 0.24107850911974624, "grad_norm": 0.3940749168395996, "learning_rate": 3.8e-05, "loss": 0.4694, "step": 760 },
    { "epoch": 0.24425059476605868, "grad_norm": 0.2833056151866913, "learning_rate": 3.85e-05, "loss": 0.4651, "step": 770 },
    { "epoch": 0.24742268041237114, "grad_norm": 0.3439841866493225, "learning_rate": 3.9000000000000006e-05, "loss": 0.4718, "step": 780 },
    { "epoch": 0.2505947660586836, "grad_norm": 0.28585824370384216, "learning_rate": 3.9500000000000005e-05, "loss": 0.4678, "step": 790 },
    { "epoch": 0.253766851704996, "grad_norm": 0.2761208415031433, "learning_rate": 4e-05, "loss": 0.4672, "step": 800 },
    { "epoch": 0.25693893735130846, "grad_norm": 0.287431538105011, "learning_rate": 4.05e-05, "loss": 0.4654, "step": 810 },
    { "epoch": 0.26011102299762096, "grad_norm": 0.2529178261756897, "learning_rate": 4.1e-05, "loss": 0.4658, "step": 820 },
    { "epoch": 0.2632831086439334, "grad_norm": 0.22234879434108734, "learning_rate": 4.15e-05, "loss": 0.467, "step": 830 },
    { "epoch": 0.26645519429024583, "grad_norm": 0.2847014367580414, "learning_rate": 4.2e-05, "loss": 0.4643, "step": 840 },
    { "epoch": 0.2696272799365583, "grad_norm": 0.27712419629096985, "learning_rate": 4.25e-05, "loss": 0.4604, "step": 850 },
    { "epoch": 0.2727993655828707, "grad_norm": 0.24886064231395721, "learning_rate": 4.3e-05, "loss": 0.4657, "step": 860 },
    { "epoch": 0.2759714512291832, "grad_norm": 0.3196752369403839, "learning_rate": 4.35e-05, "loss": 0.4652, "step": 870 },
    { "epoch": 0.27914353687549565, "grad_norm": 0.27527421712875366, "learning_rate": 4.4000000000000006e-05, "loss": 0.4634, "step": 880 },
    { "epoch": 0.2823156225218081, "grad_norm": 0.26819467544555664, "learning_rate": 4.4500000000000004e-05, "loss": 0.4633, "step": 890 },
    { "epoch": 0.2854877081681205, "grad_norm": 0.26196032762527466, "learning_rate": 4.5e-05, "loss": 0.4656, "step": 900 },
    { "epoch": 0.28865979381443296, "grad_norm": 0.23337453603744507, "learning_rate": 4.55e-05, "loss": 0.4622, "step": 910 },
    { "epoch": 0.29183187946074546, "grad_norm": 0.2958989143371582, "learning_rate": 4.600000000000001e-05, "loss": 0.4627, "step": 920 },
    { "epoch": 0.2950039651070579, "grad_norm": 0.23167885839939117, "learning_rate": 4.6500000000000005e-05, "loss": 0.4636, "step": 930 },
    { "epoch": 0.29817605075337034, "grad_norm": 0.2608003318309784, "learning_rate": 4.7e-05, "loss": 0.4628, "step": 940 },
    { "epoch": 0.3013481363996828, "grad_norm": 0.2806340456008911, "learning_rate": 4.75e-05, "loss": 0.4624, "step": 950 },
    { "epoch": 0.3045202220459952, "grad_norm": 0.2206449806690216, "learning_rate": 4.8e-05, "loss": 0.4594, "step": 960 },
    { "epoch": 0.3076923076923077, "grad_norm": 0.2883213460445404, "learning_rate": 4.85e-05, "loss": 0.4612, "step": 970 },
    { "epoch": 0.31086439333862015, "grad_norm": 0.23365403711795807, "learning_rate": 4.9e-05, "loss": 0.463, "step": 980 },
    { "epoch": 0.3140364789849326, "grad_norm": 0.28844788670539856, "learning_rate": 4.9500000000000004e-05, "loss": 0.4612, "step": 990 },
    { "epoch": 0.317208564631245, "grad_norm": 0.26974356174468994, "learning_rate": 5e-05, "loss": 0.4641, "step": 1000 },
    { "epoch": 0.317208564631245, "eval_loss": 0.45910927653312683, "eval_runtime": 419.1696, "eval_samples_per_second": 2.457, "eval_steps_per_second": 1.229, "step": 1000 },
    { "epoch": 0.3203806502775575, "grad_norm": 0.23146626353263855, "learning_rate": 4.9997336102492574e-05, "loss": 0.4583, "step": 1010 },
    { "epoch": 0.32355273592386996, "grad_norm": 0.24651704728603363, "learning_rate": 4.9989344977678285e-05, "loss": 0.461, "step": 1020 },
    { "epoch": 0.3267248215701824, "grad_norm": 0.2995850741863251, "learning_rate": 4.997602832856013e-05, "loss": 0.4583, "step": 1030 },
    { "epoch": 0.32989690721649484, "grad_norm": 0.2886098325252533, "learning_rate": 4.995738899307319e-05, "loss": 0.4565, "step": 1040 },
    { "epoch": 0.3330689928628073, "grad_norm": 0.2367076575756073, "learning_rate": 4.99334309434798e-05, "loss": 0.4598, "step": 1050 },
    { "epoch": 0.3362410785091198, "grad_norm": 0.29130271077156067, "learning_rate": 4.990415928552305e-05, "loss": 0.4564, "step": 1060 },
    { "epoch": 0.3394131641554322, "grad_norm": 0.23702391982078552, "learning_rate": 4.9869580257338685e-05, "loss": 0.4567, "step": 1070 },
    { "epoch": 0.34258524980174465, "grad_norm": 0.24812078475952148, "learning_rate": 4.9829701228125655e-05, "loss": 0.4587, "step": 1080 },
    { "epoch": 0.3457573354480571, "grad_norm": 0.2195880264043808, "learning_rate": 4.9784530696575684e-05, "loss": 0.4578, "step": 1090 },
    { "epoch": 0.34892942109436953, "grad_norm": 0.2531008720397949, "learning_rate": 4.973407828906208e-05, "loss": 0.4551, "step": 1100 },
    { "epoch": 0.352101506740682, "grad_norm": 0.2672252655029297, "learning_rate": 4.967835475758825e-05, "loss": 0.4596, "step": 1110 },
    { "epoch": 0.35527359238699446, "grad_norm": 0.2659941613674164, "learning_rate": 4.961737197749633e-05, "loss": 0.4536, "step": 1120 },
    { "epoch": 0.3584456780333069, "grad_norm": 0.2517182230949402, "learning_rate": 4.955114294493639e-05, "loss": 0.4579, "step": 1130 },
    { "epoch": 0.36161776367961934, "grad_norm": 0.2654310166835785, "learning_rate": 4.947968177409681e-05, "loss": 0.4548, "step": 1140 },
    { "epoch": 0.3647898493259318, "grad_norm": 0.36614441871643066, "learning_rate": 4.940300369419637e-05, "loss": 0.4575, "step": 1150 },
    { "epoch": 0.3679619349722443, "grad_norm": 0.2583513855934143, "learning_rate": 4.9321125046238756e-05, "loss": 0.4598, "step": 1160 },
    { "epoch": 0.3711340206185567, "grad_norm": 0.23981873691082, "learning_rate": 4.923406327953008e-05, "loss": 0.4546, "step": 1170 },
    { "epoch": 0.37430610626486915, "grad_norm": 0.23798957467079163, "learning_rate": 4.9141836947960165e-05, "loss": 0.458, "step": 1180 },
    { "epoch": 0.3774781919111816, "grad_norm": 0.2931083142757416, "learning_rate": 4.904446570604863e-05, "loss": 0.4543, "step": 1190 },
    { "epoch": 0.38065027755749403, "grad_norm": 0.27312615513801575, "learning_rate": 4.8941970304756144e-05, "loss": 0.4552, "step": 1200 },
    { "epoch": 0.3838223632038065, "grad_norm": 0.2659391164779663, "learning_rate": 4.883437258706224e-05, "loss": 0.4583, "step": 1210 },
    { "epoch": 0.38699444885011897, "grad_norm": 0.23010990023612976, "learning_rate": 4.8721695483310275e-05, "loss": 0.4564, "step": 1220 },
    { "epoch": 0.3901665344964314, "grad_norm": 0.2815942168235779, "learning_rate": 4.860396300632072e-05, "loss": 0.4548, "step": 1230 },
    { "epoch": 0.39333862014274384, "grad_norm": 0.36561158299446106, "learning_rate": 4.848120024627372e-05, "loss": 0.4528, "step": 1240 },
    { "epoch": 0.3965107057890563, "grad_norm": 0.25015613436698914, "learning_rate": 4.835343336536212e-05, "loss": 0.4567, "step": 1250 },
    { "epoch": 0.3996827914353688, "grad_norm": 0.2373395711183548, "learning_rate": 4.822068959221598e-05, "loss": 0.4544, "step": 1260 },
    { "epoch": 0.4028548770816812, "grad_norm": 0.26331648230552673, "learning_rate": 4.80829972160998e-05, "loss": 0.4532, "step": 1270 },
    { "epoch": 0.40602696272799366, "grad_norm": 0.2622736692428589, "learning_rate": 4.794038558088378e-05, "loss": 0.4535, "step": 1280 },
    { "epoch": 0.4091990483743061, "grad_norm": 0.1999824345111847, "learning_rate": 4.779288507879031e-05, "loss": 0.4568, "step": 1290 },
    { "epoch": 0.41237113402061853, "grad_norm": 0.24161754548549652, "learning_rate": 4.764052714391695e-05, "loss": 0.4534, "step": 1300 },
    { "epoch": 0.41554321966693103, "grad_norm": 0.2368980497121811, "learning_rate": 4.7483344245537545e-05, "loss": 0.4558, "step": 1310 },
    { "epoch": 0.41871530531324347, "grad_norm": 0.1948440819978714, "learning_rate": 4.7321369881182584e-05, "loss": 0.4564, "step": 1320 },
    { "epoch": 0.4218873909595559, "grad_norm": 0.2509821355342865, "learning_rate": 4.715463856950053e-05, "loss": 0.4558, "step": 1330 },
    { "epoch": 0.42505947660586835, "grad_norm": 0.23150520026683807, "learning_rate": 4.698318584290141e-05, "loss": 0.4538, "step": 1340 },
    { "epoch": 0.4282315622521808, "grad_norm": 0.22577635943889618, "learning_rate": 4.680704823998452e-05, "loss": 0.4571, "step": 1350 },
    { "epoch": 0.4314036478984933, "grad_norm": 0.22882512211799622, "learning_rate": 4.6626263297751546e-05, "loss": 0.453, "step": 1360 },
    { "epoch": 0.4345757335448057, "grad_norm": 0.1979285478591919, "learning_rate": 4.644086954360708e-05, "loss": 0.4542, "step": 1370 },
    { "epoch": 0.43774781919111816, "grad_norm": 0.2554062008857727, "learning_rate": 4.625090648714786e-05, "loss": 0.4561, "step": 1380 },
    { "epoch": 0.4409199048374306, "grad_norm": 0.2330465316772461, "learning_rate": 4.60564146117429e-05, "loss": 0.4525, "step": 1390 },
    { "epoch": 0.44409199048374304, "grad_norm": 0.22269243001937866, "learning_rate": 4.585743536590599e-05, "loss": 0.4544, "step": 1400 },
    { "epoch": 0.44726407613005553, "grad_norm": 0.2291986495256424, "learning_rate": 4.565401115446246e-05, "loss": 0.4548, "step": 1410 },
    { "epoch": 0.45043616177636797, "grad_norm": 0.21675272285938263, "learning_rate": 4.5446185329512314e-05, "loss": 0.4518, "step": 1420 },
    { "epoch": 0.4536082474226804, "grad_norm": 0.21207275986671448, "learning_rate": 4.5234002181191304e-05, "loss": 0.4511, "step": 1430 },
    { "epoch": 0.45678033306899285, "grad_norm": 0.22509776055812836, "learning_rate": 4.5017506928232247e-05, "loss": 0.4518, "step": 1440 },
    { "epoch": 0.4599524187153053, "grad_norm": 0.23429358005523682, "learning_rate": 4.4796745708328294e-05, "loss": 0.4556, "step": 1450 },
    { "epoch": 0.4631245043616178, "grad_norm": 0.18599826097488403, "learning_rate": 4.457176556830054e-05, "loss": 0.4538, "step": 1460 },
    { "epoch": 0.4662965900079302, "grad_norm": 0.2500051259994507, "learning_rate": 4.434261445407172e-05, "loss": 0.4538, "step": 1470 },
    { "epoch": 0.46946867565424266, "grad_norm": 0.213278666138649, "learning_rate": 4.410934120044838e-05, "loss": 0.4545, "step": 1480 },
    { "epoch": 0.4726407613005551, "grad_norm": 0.2551578879356384, "learning_rate": 4.387199552071366e-05, "loss": 0.4512, "step": 1490 },
    { "epoch": 0.47581284694686754, "grad_norm": 0.23356303572654724, "learning_rate": 4.3630627996032706e-05, "loss": 0.4514, "step": 1500 },
    { "epoch": 0.47581284694686754, "eval_loss": 0.4515945613384247, "eval_runtime": 293.5383, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.754, "step": 1500 },
    { "epoch": 0.47898493259318004, "grad_norm": 0.22167843580245972, "learning_rate": 4.3385290064673316e-05, "loss": 0.4529, "step": 1510 },
    { "epoch": 0.4821570182394925, "grad_norm": 0.2573811709880829, "learning_rate": 4.31360340110438e-05, "loss": 0.4517, "step": 1520 },
    { "epoch": 0.4853291038858049, "grad_norm": 0.20118848979473114, "learning_rate": 4.288291295455055e-05, "loss": 0.4538, "step": 1530 },
    { "epoch": 0.48850118953211735, "grad_norm": 0.24297507107257843, "learning_rate": 4.262598083827769e-05, "loss": 0.4519, "step": 1540 },
    { "epoch": 0.4916732751784298, "grad_norm": 0.18671482801437378, "learning_rate": 4.236529241749114e-05, "loss": 0.4515, "step": 1550 },
    { "epoch": 0.4948453608247423, "grad_norm": 0.20851927995681763, "learning_rate": 4.2100903247969644e-05, "loss": 0.4526, "step": 1560 },
    { "epoch": 0.4980174464710547, "grad_norm": 0.23694059252738953, "learning_rate": 4.1832869674165204e-05, "loss": 0.4519, "step": 1570 },
    { "epoch": 0.5011895321173672, "grad_norm": 0.3004760146141052, "learning_rate": 4.156124881719533e-05, "loss": 0.4493, "step": 1580 },
    { "epoch": 0.5043616177636796, "grad_norm": 0.20880846679210663, "learning_rate": 4.1286098562669925e-05, "loss": 0.4508, "step": 1590 },
    { "epoch": 0.507533703409992, "grad_norm": 0.2033112347126007, "learning_rate": 4.1007477548355185e-05, "loss": 0.4522, "step": 1600 },
    { "epoch": 0.5107057890563045, "grad_norm": 0.24385987222194672, "learning_rate": 4.072544515167714e-05, "loss": 0.45, "step": 1610 },
    { "epoch": 0.5138778747026169, "grad_norm": 0.19505774974822998, "learning_rate": 4.044006147706768e-05, "loss": 0.4528, "step": 1620 },
    { "epoch": 0.5170499603489295, "grad_norm": 0.20980720221996307, "learning_rate": 4.0151387343155545e-05, "loss": 0.4512, "step": 1630 },
    { "epoch": 0.5202220459952419, "grad_norm": 0.2582622766494751, "learning_rate": 3.985948426980521e-05, "loss": 0.452, "step": 1640 },
    { "epoch": 0.5233941316415543, "grad_norm": 0.18902088701725006, "learning_rate": 3.956441446500624e-05, "loss": 0.4498, "step": 1650 },
    { "epoch": 0.5265662172878668, "grad_norm": 0.21405866742134094, "learning_rate": 3.926624081161604e-05, "loss": 0.4462, "step": 1660 },
    { "epoch": 0.5297383029341792, "grad_norm": 0.21340113878250122, "learning_rate": 3.896502685395876e-05, "loss": 0.449, "step": 1670 },
    { "epoch": 0.5329103885804917, "grad_norm": 0.2296970933675766, "learning_rate": 3.8660836784283275e-05, "loss": 0.4513, "step": 1680 },
    { "epoch": 0.5360824742268041, "grad_norm": 0.2143164724111557, "learning_rate": 3.835373542908308e-05, "loss": 0.4488, "step": 1690 },
    { "epoch": 0.5392545598731165, "grad_norm": 0.1945250779390335, "learning_rate": 3.804378823528093e-05, "loss": 0.4529, "step": 1700 },
    { "epoch": 0.542426645519429, "grad_norm": 0.20191755890846252, "learning_rate": 3.7731061256281394e-05, "loss": 0.4507, "step": 1710 },
    { "epoch": 0.5455987311657414, "grad_norm": 0.21345514059066772, "learning_rate": 3.7415621137894056e-05, "loss": 0.4487, "step": 1720 },
    { "epoch": 0.548770816812054, "grad_norm": 0.2062884271144867, "learning_rate": 3.709753510413052e-05, "loss": 0.4504, "step": 1730 },
    { "epoch": 0.5519429024583664, "grad_norm": 0.20377105474472046, "learning_rate": 3.6776870942878196e-05, "loss": 0.4489, "step": 1740 },
    { "epoch": 0.5551149881046789, "grad_norm": 0.19985787570476532, "learning_rate": 3.645369699145387e-05, "loss": 0.4497, "step": 1750 },
    { "epoch": 0.5582870737509913, "grad_norm": 0.2308078110218048, "learning_rate": 3.6128082122040224e-05, "loss": 0.453, "step": 1760 },
    { "epoch": 0.5614591593973037, "grad_norm": 0.19868697226047516, "learning_rate": 3.5800095727008395e-05, "loss": 0.4512, "step": 1770 },
    { "epoch": 0.5646312450436162, "grad_norm": 0.20231026411056519, "learning_rate": 3.54698077041296e-05, "loss": 0.4482, "step": 1780 },
    { "epoch": 0.5678033306899286, "grad_norm": 0.21103453636169434, "learning_rate": 3.51372884416791e-05, "loss": 0.452, "step": 1790 },
    { "epoch": 0.570975416336241, "grad_norm": 0.21350406110286713, "learning_rate": 3.4802608803435646e-05, "loss": 0.4518, "step": 1800 },
    { "epoch": 0.5741475019825535, "grad_norm": 0.2247006744146347, "learning_rate": 3.446584011357957e-05, "loss": 0.451, "step": 1810 },
    { "epoch": 0.5773195876288659, "grad_norm": 0.20522333681583405, "learning_rate": 3.412705414149276e-05, "loss": 0.4491, "step": 1820 },
    { "epoch": 0.5804916732751785, "grad_norm": 0.20249220728874207, "learning_rate": 3.3786323086463736e-05, "loss": 0.4508, "step": 1830 },
    { "epoch": 0.5836637589214909, "grad_norm": 0.17756901681423187, "learning_rate": 3.3443719562301147e-05, "loss": 0.4493, "step": 1840 },
    { "epoch": 0.5868358445678034, "grad_norm": 0.19246827065944672, "learning_rate": 3.309931658185892e-05, "loss": 0.4501, "step": 1850 },
    { "epoch": 0.5900079302141158, "grad_norm": 0.20991584658622742, "learning_rate": 3.275318754147636e-05, "loss": 0.4478, "step": 1860 },
    { "epoch": 0.5931800158604282, "grad_norm": 0.21498160064220428, "learning_rate": 3.240540620533649e-05, "loss": 0.4454, "step": 1870 },
    { "epoch": 0.5963521015067407, "grad_norm": 0.199667289853096, "learning_rate": 3.205604668974607e-05, "loss": 0.4469, "step": 1880 },
    { "epoch": 0.5995241871530531, "grad_norm": 0.21041718125343323, "learning_rate": 3.170518344734051e-05, "loss": 0.4517, "step": 1890 },
    { "epoch": 0.6026962727993656, "grad_norm": 0.2041397988796234, "learning_rate": 3.135289125121718e-05, "loss": 0.4518, "step": 1900 },
    { "epoch": 0.605868358445678, "grad_norm": 0.18860045075416565, "learning_rate": 3.0999245179000325e-05, "loss": 0.4511, "step": 1910 },
    { "epoch": 0.6090404440919904, "grad_norm": 0.20879918336868286, "learning_rate": 3.064432059684117e-05, "loss": 0.4504, "step": 1920 },
    { "epoch": 0.612212529738303, "grad_norm": 0.18158119916915894, "learning_rate": 3.0288193143356484e-05, "loss": 0.4501, "step": 1930 },
    { "epoch": 0.6153846153846154, "grad_norm": 0.20405800640583038, "learning_rate": 2.9930938713509125e-05, "loss": 0.4478, "step": 1940 },
    { "epoch": 0.6185567010309279, "grad_norm": 0.19368775188922882, "learning_rate": 2.9572633442433917e-05, "loss": 0.449, "step": 1950 },
    { "epoch": 0.6217287866772403, "grad_norm": 0.19516830146312714, "learning_rate": 2.9213353689212337e-05, "loss": 0.4509, "step": 1960 },
    { "epoch": 0.6249008723235527, "grad_norm": 0.2207994908094406, "learning_rate": 2.8853176020599504e-05, "loss": 0.4524, "step": 1970 },
    { "epoch": 0.6280729579698652, "grad_norm": 0.20896515250205994, "learning_rate": 2.849217719470691e-05, "loss": 0.447, "step": 1980 },
    { "epoch": 0.6312450436161776, "grad_norm": 0.19302915036678314, "learning_rate": 2.8130434144644364e-05, "loss": 0.4462, "step": 1990 },
    { "epoch": 0.63441712926249, "grad_norm": 0.21044501662254333, "learning_rate": 2.776802396212461e-05, "loss": 0.4522, "step": 2000 },
    { "epoch": 0.63441712926249, "eval_loss": 0.44816499948501587, "eval_runtime": 293.5621, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.754, "step": 2000 },
    { "epoch": 0.6375892149088025, "grad_norm": 0.2075473517179489, "learning_rate": 2.7405023881034204e-05, "loss": 0.4487, "step": 2010 },
    { "epoch": 0.640761300555115, "grad_norm": 0.1936168372631073, "learning_rate": 2.7041511260974028e-05, "loss": 0.4485, "step": 2020 },
    { "epoch": 0.6439333862014275, "grad_norm": 0.2282039374113083, "learning_rate": 2.66775635707731e-05, "loss": 0.4478, "step": 2030 },
    { "epoch": 0.6471054718477399, "grad_norm": 0.18652349710464478, "learning_rate": 2.6313258371978994e-05, "loss": 0.449, "step": 2040 },
    { "epoch": 0.6502775574940524, "grad_norm": 0.20407763123512268, "learning_rate": 2.5948673302328587e-05, "loss": 0.4486, "step": 2050 },
    { "epoch": 0.6534496431403648, "grad_norm": 0.22227470576763153, "learning_rate": 2.558388605920255e-05, "loss": 0.4506, "step": 2060 },
    { "epoch": 0.6566217287866772, "grad_norm": 0.20888665318489075, "learning_rate": 2.5218974383067085e-05, "loss": 0.4487, "step": 2070 },
    { "epoch": 0.6597938144329897, "grad_norm": 0.19797295331954956, "learning_rate": 2.4854016040906572e-05, "loss": 0.4521, "step": 2080 },
    { "epoch": 0.6629659000793021, "grad_norm": 0.22293215990066528, "learning_rate": 2.4489088809650425e-05, "loss": 0.449, "step": 2090 },
    { "epoch": 0.6661379857256146, "grad_norm": 0.2310066670179367, "learning_rate": 2.4124270459598007e-05, "loss": 0.4479, "step": 2100 },
    { "epoch": 0.669310071371927, "grad_norm": 0.19995397329330444, "learning_rate": 2.375963873784478e-05, "loss": 0.4504, "step": 2110 },
    { "epoch": 0.6724821570182395, "grad_norm": 0.18815317749977112, "learning_rate": 2.3395271351713515e-05, "loss": 0.4501, "step": 2120 },
    { "epoch": 0.675654242664552, "grad_norm": 0.19861653447151184, "learning_rate": 2.303124595219395e-05, "loss": 0.4506, "step": 2130 },
    { "epoch": 0.6788263283108644, "grad_norm": 0.17454959452152252, "learning_rate": 2.2667640117394442e-05, "loss": 0.4453, "step": 2140 },
    { "epoch": 0.6819984139571769, "grad_norm": 0.19067569077014923, "learning_rate": 2.230453133600915e-05, "loss": 0.4466, "step": 2150 },
    { "epoch": 0.6851704996034893, "grad_norm": 0.1979096531867981, "learning_rate": 2.1941996990804288e-05, "loss": 0.4479, "step": 2160 },
    { "epoch": 0.6883425852498017, "grad_norm": 0.18306080996990204, "learning_rate": 2.1580114342126933e-05, "loss": 0.4482, "step": 2170 },
    { "epoch": 0.6915146708961142, "grad_norm": 0.18473172187805176, "learning_rate": 2.1218960511439952e-05, "loss": 0.4468, "step": 2180 },
    { "epoch": 0.6946867565424266, "grad_norm": 0.17530548572540283, "learning_rate": 2.0858612464886505e-05, "loss": 0.449, "step": 2190 },
    { "epoch": 0.6978588421887391, "grad_norm": 0.1818351149559021, "learning_rate": 2.0499146996887618e-05, "loss": 0.4458, "step": 2200 },
    { "epoch": 0.7010309278350515, "grad_norm": 0.19750525057315826, "learning_rate": 2.0140640713776438e-05, "loss": 0.4489, "step": 2210 },
    { "epoch": 0.704203013481364, "grad_norm": 0.20060710608959198, "learning_rate": 1.978317001747248e-05, "loss": 0.4482, "step": 2220 },
    { "epoch": 0.7073750991276765, "grad_norm": 0.19399641454219818, "learning_rate": 1.942681108919949e-05, "loss": 0.4494, "step": 2230 },
    { "epoch": 0.7105471847739889, "grad_norm": 0.2124367654323578, "learning_rate": 1.9071639873250334e-05, "loss": 0.4491, "step": 2240 },
    { "epoch": 0.7137192704203014, "grad_norm": 0.2064894288778305, "learning_rate": 1.871773206080236e-05, "loss": 0.4433, "step": 2250 },
    { "epoch": 0.7168913560666138, "grad_norm": 0.20361967384815216, "learning_rate": 1.836516307378671e-05, "loss": 0.4482, "step": 2260 },
    { "epoch": 0.7200634417129262, "grad_norm": 0.22753477096557617, "learning_rate": 1.801400804881507e-05, "loss": 0.4434, "step": 2270 },
    { "epoch": 0.7232355273592387, "grad_norm": 0.2242439240217209, "learning_rate": 1.766434182116708e-05, "loss": 0.4487, "step": 2280 },
    { "epoch": 0.7264076130055511, "grad_norm": 0.1731296181678772, "learning_rate": 1.7316238908842127e-05, "loss": 0.4456, "step": 2290 },
    { "epoch": 0.7295796986518636, "grad_norm": 0.19372820854187012, "learning_rate": 1.6969773496678647e-05, "loss": 0.446, "step": 2300 },
    { "epoch": 0.732751784298176, "grad_norm": 0.21722117066383362, "learning_rate": 1.662501942054447e-05, "loss": 0.4465, "step": 2310 },
    { "epoch": 0.7359238699444886, "grad_norm": 0.24181324243545532, "learning_rate": 1.6282050151601518e-05, "loss": 0.4496, "step": 2320 },
    { "epoch": 0.739095955590801, "grad_norm": 0.2091435343027115, "learning_rate": 1.594093878064825e-05, "loss": 0.4469, "step": 2330 },
    { "epoch": 0.7422680412371134, "grad_norm": 0.19474650919437408, "learning_rate": 1.5601758002543137e-05, "loss": 0.4465, "step": 2340 },
    { "epoch": 0.7454401268834259, "grad_norm": 0.21081089973449707, "learning_rate": 1.5264580100712507e-05, "loss": 0.4449, "step": 2350 },
    { "epoch": 0.7486122125297383, "grad_norm": 0.21220536530017853, "learning_rate": 1.4929476931746167e-05, "loss": 0.4461, "step": 2360 },
    { "epoch": 0.7517842981760507, "grad_norm": 0.19944216310977936, "learning_rate": 1.4596519910083825e-05, "loss": 0.4476, "step": 2370 },
    { "epoch": 0.7549563838223632, "grad_norm": 0.19958476722240448, "learning_rate": 1.4265779992795893e-05, "loss": 0.4467, "step": 2380 },
    { "epoch": 0.7581284694686756, "grad_norm": 0.22247722744941711, "learning_rate": 1.3937327664461672e-05, "loss": 0.4427, "step": 2390 },
    { "epoch": 0.7613005551149881, "grad_norm": 0.19884702563285828, "learning_rate": 1.361123292214826e-05, "loss": 0.4498, "step": 2400 },
    { "epoch": 0.7644726407613005, "grad_norm": 0.17867441475391388, "learning_rate": 1.3287565260493357e-05, "loss": 0.4484, "step": 2410 },
    { "epoch": 0.767644726407613, "grad_norm": 0.17917264997959137, "learning_rate": 1.2966393656895134e-05, "loss": 0.4439, "step": 2420 },
    { "epoch": 0.7708168120539255, "grad_norm": 0.24791833758354187, "learning_rate": 1.2647786556812332e-05, "loss": 0.4463, "step": 2430 },
    { "epoch": 0.7739888977002379, "grad_norm": 0.1971379816532135, "learning_rate": 1.2331811859177722e-05, "loss": 0.4475, "step": 2440 },
    { "epoch": 0.7771609833465504, "grad_norm": 0.19460198283195496, "learning_rate": 1.2018536901928079e-05, "loss": 0.4468, "step": 2450 },
    { "epoch": 0.7803330689928628, "grad_norm": 0.21018314361572266, "learning_rate": 1.1708028447653613e-05, "loss": 0.4489, "step": 2460 },
    { "epoch": 0.7835051546391752, "grad_norm": 0.18419425189495087, "learning_rate": 1.1400352669370115e-05, "loss": 0.4469, "step": 2470 },
    { "epoch": 0.7866772402854877, "grad_norm": 0.192842036485672, "learning_rate": 1.1095575136416695e-05, "loss": 0.4457, "step": 2480 },
    { "epoch": 0.7898493259318001, "grad_norm": 0.21617653965950012, "learning_rate": 1.0793760800482179e-05, "loss": 0.4511, "step": 2490 },
    { "epoch": 0.7930214115781126, "grad_norm": 0.1891343593597412, "learning_rate": 1.0494973981763146e-05, "loss": 0.4436, "step": 2500 },
    { "epoch": 0.7930214115781126, "eval_loss": 0.44588717818260193, "eval_runtime": 293.564, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.754, "step": 2500 },
    { "epoch": 0.796193497224425, "grad_norm": 0.193357452750206, "learning_rate": 1.0199278355256522e-05, "loss": 0.4436, "step": 2510 },
    { "epoch": 0.7993655828707376, "grad_norm": 0.19498059153556824, "learning_rate": 9.906736937189697e-06, "loss": 0.4457, "step": 2520 },
    { "epoch": 0.80253766851705, "grad_norm": 0.2086293250322342, "learning_rate": 9.61741207159104e-06, "loss": 0.4488, "step": 2530 },
    { "epoch": 0.8057097541633624, "grad_norm": 0.1872304081916809, "learning_rate": 9.331365417003601e-06, "loss": 0.4436, "step": 2540 },
    { "epoch": 0.8088818398096749, "grad_norm": 0.196652352809906, "learning_rate": 9.048657933345e-06, "loss": 0.4448, "step": 2550 },
    { "epoch": 0.8120539254559873, "grad_norm": 0.21112217009067535, "learning_rate": 8.769349868916118e-06, "loss": 0.4481, "step": 2560 },
    { "epoch": 0.8152260111022998, "grad_norm": 0.19768798351287842, "learning_rate": 8.493500747561488e-06, "loss": 0.4493, "step": 2570 },
    { "epoch": 0.8183980967486122, "grad_norm": 0.19622166454792023, "learning_rate": 8.221169355984052e-06, "loss": 0.4448, "step": 2580 },
    { "epoch": 0.8215701823949246, "grad_norm": 0.2044159471988678, "learning_rate": 7.952413731217025e-06, "loss": 0.4472, "step": 2590 },
    { "epoch": 0.8247422680412371, "grad_norm": 0.206452876329422, "learning_rate": 7.687291148255527e-06, "loss": 0.4452, "step": 2600 },
    { "epoch": 0.8279143536875495, "grad_norm": 0.20858535170555115, "learning_rate": 7.425858107850578e-06, "loss": 0.4419, "step": 2610 },
    { "epoch": 0.8310864393338621, "grad_norm": 0.18986332416534424, "learning_rate": 7.168170324468171e-06, "loss": 0.449, "step": 2620 },
    { "epoch": 0.8342585249801745, "grad_norm": 0.21051757037639618, "learning_rate": 6.9142827144158066e-06, "loss": 0.4468, "step": 2630 },
    { "epoch": 0.8374306106264869, "grad_norm": 0.20453451573848724, "learning_rate": 6.66424938413921e-06, "loss": 0.4471, "step": 2640 },
    { "epoch": 0.8406026962727994, "grad_norm": 0.18574683368206024, "learning_rate": 6.418123618691607e-06, "loss": 0.4473, "step": 2650 },
    { "epoch": 0.8437747819191118, "grad_norm": 0.2309403419494629, "learning_rate": 6.175957870378043e-06, "loss": 0.4477, "step": 2660 },
    { "epoch": 0.8469468675654243, "grad_norm": 0.19485293328762054, "learning_rate": 5.937803747577186e-06, "loss": 0.4443, "step": 2670 },
    { "epoch": 0.8501189532117367, "grad_norm": 0.17677082121372223, "learning_rate": 5.7037120037429645e-06, "loss": 0.4501, "step": 2680 },
    { "epoch": 0.8532910388580491, "grad_norm": 0.18835794925689697, "learning_rate": 5.473732526588407e-06, "loss": 0.4451, "step": 2690 },
    { "epoch": 0.8564631245043616, "grad_norm": 0.2117832899093628, "learning_rate": 5.2479143274539954e-06, "loss": 0.4464, "step": 2700 },
    { "epoch": 0.8596352101506741, "grad_norm": 0.18720006942749023, "learning_rate": 5.026305530862749e-06, "loss": 0.4484, "step": 2710 },
    { "epoch": 0.8628072957969866, "grad_norm": 0.17910288274288177, "learning_rate": 4.80895336426434e-06, "loss": 0.4457, "step": 2720 },
    { "epoch": 0.865979381443299, "grad_norm": 0.1965160220861435, "learning_rate": 4.595904147970356e-06, "loss": 0.4455, "step": 2730 },
    { "epoch": 0.8691514670896114, "grad_norm": 0.16718564927577972, "learning_rate": 4.3872032852828955e-06, "loss": 0.4455, "step": 2740 },
    { "epoch": 0.8723235527359239, "grad_norm": 0.19451048970222473, "learning_rate": 4.182895252818589e-06, "loss": 0.4454, "step": 2750 },
    { "epoch": 0.8754956383822363, "grad_norm": 0.2160135954618454, "learning_rate": 3.983023591030113e-06, "loss": 0.4462, "step": 2760 },
    { "epoch": 0.8786677240285488, "grad_norm": 0.1994054913520813, "learning_rate": 3.7876308949271995e-06, "loss": 0.4455, "step": 2770 },
    { "epoch": 0.8818398096748612, "grad_norm": 0.20801877975463867, "learning_rate": 3.5967588049991313e-06, "loss": 0.4465, "step": 2780 },
    { "epoch": 0.8850118953211736, "grad_norm": 0.20264749228954315, "learning_rate": 3.410447998340688e-06, "loss": 0.4451, "step": 2790 },
    { "epoch": 0.8881839809674861, "grad_norm": 0.173824280500412, "learning_rate": 3.2287381799833428e-06, "loss": 0.4453, "step": 2800 },
    { "epoch": 0.8913560666137986, "grad_norm": 0.18437042832374573, "learning_rate": 3.051668074433667e-06, "loss": 0.449, "step": 2810 },
    { "epoch": 0.8945281522601111, "grad_norm": 0.17420920729637146, "learning_rate": 2.8792754174206905e-06, "loss": 0.4453, "step": 2820 },
    { "epoch": 0.8977002379064235, "grad_norm": 0.18584947288036346, "learning_rate": 2.7115969478539562e-06, "loss": 0.4446, "step": 2830 },
    { "epoch": 0.9008723235527359, "grad_norm": 0.18006384372711182, "learning_rate": 2.5486683999940335e-06, "loss": 0.4449, "step": 2840 },
    { "epoch": 0.9040444091990484, "grad_norm": 0.19664862751960754, "learning_rate": 2.3905244958371263e-06, "loss": 0.4463, "step": 2850 },
    { "epoch": 0.9072164948453608, "grad_norm": 0.20159141719341278, "learning_rate": 2.2371989377154013e-06, "loss": 0.444, "step": 2860 },
    { "epoch": 0.9103885804916733, "grad_norm": 0.23178640007972717, "learning_rate": 2.088724401114625e-06, "loss": 0.4472, "step": 2870 },
    { "epoch": 0.9135606661379857, "grad_norm": 0.18047955632209778, "learning_rate": 1.9451325277106413e-06, "loss": 0.4489, "step": 2880 },
    { "epoch": 0.9167327517842981, "grad_norm": 0.18607738614082336, "learning_rate": 1.8064539186261387e-06, "loss": 0.4489, "step": 2890 },
    { "epoch": 0.9199048374306106, "grad_norm": 0.19285809993743896, "learning_rate": 1.6727181279092036e-06, "loss": 0.4448, "step": 2900 },
    { "epoch": 0.9230769230769231, "grad_norm": 0.1946035474538803, "learning_rate": 1.5439536562349976e-06, "loss": 0.4474, "step": 2910 },
    { "epoch": 0.9262490087232356, "grad_norm": 0.1947924941778183, "learning_rate": 1.4201879448319355e-06, "loss": 0.4465, "step": 2920 },
    { "epoch": 0.929421094369548, "grad_norm": 0.17740875482559204, "learning_rate": 1.301447369633621e-06, "loss": 0.4475, "step": 2930 },
    { "epoch": 0.9325931800158604, "grad_norm": 0.18701794743537903, "learning_rate": 1.187757235657841e-06, "loss": 0.4488, "step": 2940 },
    { "epoch": 0.9357652656621729, "grad_norm": 0.19531536102294922, "learning_rate": 1.0791417716137565e-06, "loss": 0.4458, "step": 2950 },
    { "epoch": 0.9389373513084853, "grad_norm": 0.17441792786121368, "learning_rate": 9.756241247384807e-07, "loss": 0.4426, "step": 2960 },
    { "epoch": 0.9421094369547978, "grad_norm": 0.20463122427463531, "learning_rate": 8.772263558641386e-07, "loss": 0.4485, "step": 2970 },
    { "epoch": 0.9452815226011102, "grad_norm": 0.18601520359516144, "learning_rate": 7.839694347164223e-07, "loss": 0.4421, "step": 2980 },
    { "epoch": 0.9484536082474226, "grad_norm": 0.18209311366081238, "learning_rate": 6.958732354457209e-07, "loss": 0.4499, "step": 2990 },
    { "epoch": 0.9516256938937351, "grad_norm": 0.1675465852022171, "learning_rate": 6.129565323916813e-07, "loss": 0.4463, "step": 3000 },
    { "epoch": 0.9516256938937351, "eval_loss": 0.44520583748817444, "eval_runtime": 293.5596, "eval_samples_per_second": 3.509, "eval_steps_per_second": 1.754, "step": 3000 },
    { "epoch": 0.9547977795400476, "grad_norm": 0.21267291903495789, "learning_rate": 5.352369960821946e-07, "loss": 0.4428, "step": 3010 },
    { "epoch": 0.9579698651863601, "grad_norm": 0.19138512015342712, "learning_rate": 4.627311894675856e-07, "loss": 0.446, "step": 3020 },
    { "epoch": 0.9611419508326725, "grad_norm": 0.19043755531311035, "learning_rate": 3.954545643908514e-07, "loss": 0.4446, "step": 3030 },
    { "epoch": 0.964314036478985, "grad_norm": 0.1850534826517105, "learning_rate": 3.3342145829469983e-07, "loss": 0.4435, "step": 3040 },
    { "epoch": 0.9674861221252974, "grad_norm": 0.21564887464046478, "learning_rate": 2.7664509116607506e-07, "loss": 0.4451, "step": 3050 },
    { "epoch": 0.9706582077716098, "grad_norm": 0.1970747411251068, "learning_rate": 2.251375627187996e-07, "loss": 0.4465, "step": 3060 },
    { "epoch": 0.9738302934179223, "grad_norm": 0.1968993842601776, "learning_rate": 1.789098498150066e-07, "loss": 0.4479, "step": 3070 },
    { "epoch": 0.9770023790642347, "grad_norm": 0.18383654952049255, "learning_rate": 1.3797180412583322e-07, "loss": 0.4472, "step": 3080 },
    { "epoch": 0.9801744647105471, "grad_norm": 0.18183210492134094, "learning_rate": 1.0233215003190577e-07, "loss": 0.4484, "step": 3090 },
    { "epoch": 0.9833465503568596, "grad_norm": 0.18428853154182434, "learning_rate": 7.199848276408238e-08, "loss": 0.4445, "step": 3100 },
    { "epoch": 0.9865186360031721, "grad_norm": 0.21674080193042755, "learning_rate": 4.6977266784811736e-08, "loss": 0.4465, "step": 3110 },
    { "epoch": 0.9896907216494846, "grad_norm": 0.2023499608039856, "learning_rate": 2.7273834410485034e-08, "loss": 0.4448, "step": 3120 },
    { "epoch": 0.992862807295797, "grad_norm": 0.19151511788368225, "learning_rate": 1.2892384675056158e-08, "loss": 0.4461, "step": 3130 },
    { "epoch": 0.9960348929421095, "grad_norm": 0.16607537865638733, "learning_rate": 3.8359824351685836e-09, "loss": 0.4453, "step": 3140 },
    { "epoch": 0.9992069785884219, "grad_norm": 0.21372340619564056, "learning_rate": 1.0655771701395534e-10, "loss": 0.448, "step": 3150 },
    { "epoch": 0.9998413957176844, "step": 3152, "total_flos": 2.0499162760961065e+18, "train_loss": 0.5496831483344742, "train_runtime": 71646.9274, "train_samples_per_second": 0.704, "train_steps_per_second": 0.044 }
  ],
  "logging_steps": 10,
  "max_steps": 3152,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 2.0499162760961065e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}