|
{ |
|
"best_metric": 0.1869634985923767, |
|
"best_model_checkpoint": "saves/CADICA_qwenvl_direction_then_DetectAndClassify_scale6/lora/sft/checkpoint-2350", |
|
"epoch": 1.0047267355982274, |
|
"eval_steps": 50, |
|
"global_step": 3401, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014771048744460858, |
|
"grad_norm": 10.597820832195618, |
|
"learning_rate": 1.4749262536873157e-06, |
|
"loss": 1.8844, |
|
"num_input_tokens_seen": 52288, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0029542097488921715, |
|
"grad_norm": 10.631406320216014, |
|
"learning_rate": 2.9498525073746313e-06, |
|
"loss": 1.9494, |
|
"num_input_tokens_seen": 103976, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004431314623338257, |
|
"grad_norm": 9.455384953384051, |
|
"learning_rate": 4.424778761061947e-06, |
|
"loss": 1.995, |
|
"num_input_tokens_seen": 155560, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.005908419497784343, |
|
"grad_norm": 21.163264642835454, |
|
"learning_rate": 5.899705014749263e-06, |
|
"loss": 2.0327, |
|
"num_input_tokens_seen": 206520, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.007385524372230428, |
|
"grad_norm": 10.023350257859445, |
|
"learning_rate": 7.374631268436579e-06, |
|
"loss": 1.9153, |
|
"num_input_tokens_seen": 258464, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.008862629246676515, |
|
"grad_norm": 10.512071286717141, |
|
"learning_rate": 8.849557522123894e-06, |
|
"loss": 1.9723, |
|
"num_input_tokens_seen": 309800, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0103397341211226, |
|
"grad_norm": 11.56445556664387, |
|
"learning_rate": 1.032448377581121e-05, |
|
"loss": 1.6646, |
|
"num_input_tokens_seen": 361216, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.011816838995568686, |
|
"grad_norm": 12.04717354860619, |
|
"learning_rate": 1.1799410029498525e-05, |
|
"loss": 1.7057, |
|
"num_input_tokens_seen": 412680, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.013293943870014771, |
|
"grad_norm": 4.748411551761613, |
|
"learning_rate": 1.3274336283185843e-05, |
|
"loss": 1.4552, |
|
"num_input_tokens_seen": 464640, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.014771048744460856, |
|
"grad_norm": 7.526687970196985, |
|
"learning_rate": 1.4749262536873157e-05, |
|
"loss": 1.3918, |
|
"num_input_tokens_seen": 516240, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.014771048744460856, |
|
"eval_loss": 1.042170763015747, |
|
"eval_runtime": 48.777, |
|
"eval_samples_per_second": 1.23, |
|
"eval_steps_per_second": 0.308, |
|
"num_input_tokens_seen": 516240, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.01624815361890694, |
|
"grad_norm": 5.812969222976318, |
|
"learning_rate": 1.6224188790560475e-05, |
|
"loss": 1.2308, |
|
"num_input_tokens_seen": 567536, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01772525849335303, |
|
"grad_norm": 2.497864489287105, |
|
"learning_rate": 1.7699115044247787e-05, |
|
"loss": 1.0922, |
|
"num_input_tokens_seen": 619392, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.019202363367799114, |
|
"grad_norm": 1.4292999616096396, |
|
"learning_rate": 1.9174041297935107e-05, |
|
"loss": 0.9517, |
|
"num_input_tokens_seen": 671168, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0206794682422452, |
|
"grad_norm": 1.5696921990486115, |
|
"learning_rate": 2.064896755162242e-05, |
|
"loss": 0.9277, |
|
"num_input_tokens_seen": 722464, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.022156573116691284, |
|
"grad_norm": 2.288073027990093, |
|
"learning_rate": 2.2123893805309738e-05, |
|
"loss": 0.8741, |
|
"num_input_tokens_seen": 774120, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.023633677991137372, |
|
"grad_norm": 2.658756604981594, |
|
"learning_rate": 2.359882005899705e-05, |
|
"loss": 0.8837, |
|
"num_input_tokens_seen": 825944, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.025110782865583457, |
|
"grad_norm": 2.142324288581783, |
|
"learning_rate": 2.5073746312684367e-05, |
|
"loss": 0.8658, |
|
"num_input_tokens_seen": 877632, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.026587887740029542, |
|
"grad_norm": 1.4822847258427907, |
|
"learning_rate": 2.6548672566371686e-05, |
|
"loss": 0.8626, |
|
"num_input_tokens_seen": 928664, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.028064992614475627, |
|
"grad_norm": 1.2798949826045687, |
|
"learning_rate": 2.8023598820059e-05, |
|
"loss": 0.828, |
|
"num_input_tokens_seen": 980120, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.029542097488921712, |
|
"grad_norm": 1.0072961699690943, |
|
"learning_rate": 2.9498525073746314e-05, |
|
"loss": 0.8208, |
|
"num_input_tokens_seen": 1030696, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.029542097488921712, |
|
"eval_loss": 0.8917127847671509, |
|
"eval_runtime": 19.2166, |
|
"eval_samples_per_second": 3.122, |
|
"eval_steps_per_second": 0.781, |
|
"num_input_tokens_seen": 1030696, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0310192023633678, |
|
"grad_norm": 1.8545776603803357, |
|
"learning_rate": 3.097345132743363e-05, |
|
"loss": 0.8858, |
|
"num_input_tokens_seen": 1083184, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.03249630723781388, |
|
"grad_norm": 1.5328827906971274, |
|
"learning_rate": 3.244837758112095e-05, |
|
"loss": 0.8395, |
|
"num_input_tokens_seen": 1135216, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.033973412112259974, |
|
"grad_norm": 1.2347057774721741, |
|
"learning_rate": 3.3923303834808265e-05, |
|
"loss": 0.8729, |
|
"num_input_tokens_seen": 1187592, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.03545051698670606, |
|
"grad_norm": 1.478939426983387, |
|
"learning_rate": 3.5398230088495574e-05, |
|
"loss": 0.8534, |
|
"num_input_tokens_seen": 1239544, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.03692762186115214, |
|
"grad_norm": 1.7015290012303235, |
|
"learning_rate": 3.687315634218289e-05, |
|
"loss": 0.8621, |
|
"num_input_tokens_seen": 1291496, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.03840472673559823, |
|
"grad_norm": 1.1767502699022077, |
|
"learning_rate": 3.834808259587021e-05, |
|
"loss": 0.8548, |
|
"num_input_tokens_seen": 1344704, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.03988183161004431, |
|
"grad_norm": 0.8775651985355217, |
|
"learning_rate": 3.982300884955752e-05, |
|
"loss": 0.8555, |
|
"num_input_tokens_seen": 1396432, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.0413589364844904, |
|
"grad_norm": 1.0196921729702504, |
|
"learning_rate": 4.129793510324484e-05, |
|
"loss": 0.8503, |
|
"num_input_tokens_seen": 1448304, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.04283604135893648, |
|
"grad_norm": 1.0316734242866998, |
|
"learning_rate": 4.2772861356932154e-05, |
|
"loss": 0.7974, |
|
"num_input_tokens_seen": 1500480, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.04431314623338257, |
|
"grad_norm": 1.3815092616140325, |
|
"learning_rate": 4.4247787610619477e-05, |
|
"loss": 0.8125, |
|
"num_input_tokens_seen": 1550792, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04431314623338257, |
|
"eval_loss": 0.9009397625923157, |
|
"eval_runtime": 19.1097, |
|
"eval_samples_per_second": 3.14, |
|
"eval_steps_per_second": 0.785, |
|
"num_input_tokens_seen": 1550792, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.04579025110782865, |
|
"grad_norm": 0.9311064643035706, |
|
"learning_rate": 4.5722713864306786e-05, |
|
"loss": 0.8444, |
|
"num_input_tokens_seen": 1602680, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.047267355982274745, |
|
"grad_norm": 0.8826375779686524, |
|
"learning_rate": 4.71976401179941e-05, |
|
"loss": 0.8832, |
|
"num_input_tokens_seen": 1655184, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.04874446085672083, |
|
"grad_norm": 1.7104989113767923, |
|
"learning_rate": 4.867256637168142e-05, |
|
"loss": 0.8428, |
|
"num_input_tokens_seen": 1707544, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.050221565731166914, |
|
"grad_norm": 1.0884565315781036, |
|
"learning_rate": 5.014749262536873e-05, |
|
"loss": 0.8235, |
|
"num_input_tokens_seen": 1759296, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.051698670605613, |
|
"grad_norm": 1.2821380368908926, |
|
"learning_rate": 5.162241887905604e-05, |
|
"loss": 0.8293, |
|
"num_input_tokens_seen": 1812488, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.053175775480059084, |
|
"grad_norm": 1.8069978017028316, |
|
"learning_rate": 5.309734513274337e-05, |
|
"loss": 0.8284, |
|
"num_input_tokens_seen": 1864408, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.05465288035450517, |
|
"grad_norm": 1.3165540148767247, |
|
"learning_rate": 5.457227138643069e-05, |
|
"loss": 0.8268, |
|
"num_input_tokens_seen": 1916744, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.056129985228951254, |
|
"grad_norm": 1.5174910847154595, |
|
"learning_rate": 5.6047197640118e-05, |
|
"loss": 0.8153, |
|
"num_input_tokens_seen": 1968128, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.05760709010339734, |
|
"grad_norm": 1.4410901045723357, |
|
"learning_rate": 5.752212389380531e-05, |
|
"loss": 0.8123, |
|
"num_input_tokens_seen": 2019312, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.059084194977843424, |
|
"grad_norm": 1.5417669370284124, |
|
"learning_rate": 5.899705014749263e-05, |
|
"loss": 0.7675, |
|
"num_input_tokens_seen": 2071176, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.059084194977843424, |
|
"eval_loss": 0.9007444977760315, |
|
"eval_runtime": 19.0725, |
|
"eval_samples_per_second": 3.146, |
|
"eval_steps_per_second": 0.786, |
|
"num_input_tokens_seen": 2071176, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.060561299852289516, |
|
"grad_norm": 3.485842845243708, |
|
"learning_rate": 6.0471976401179945e-05, |
|
"loss": 0.8075, |
|
"num_input_tokens_seen": 2122328, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.0620384047267356, |
|
"grad_norm": 1.9945831681716613, |
|
"learning_rate": 6.194690265486725e-05, |
|
"loss": 0.8207, |
|
"num_input_tokens_seen": 2174744, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.06351550960118169, |
|
"grad_norm": 2.5224102035468907, |
|
"learning_rate": 6.342182890855458e-05, |
|
"loss": 0.7867, |
|
"num_input_tokens_seen": 2227136, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.06499261447562776, |
|
"grad_norm": 2.7703394460607833, |
|
"learning_rate": 6.48967551622419e-05, |
|
"loss": 0.8256, |
|
"num_input_tokens_seen": 2278568, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.06646971935007386, |
|
"grad_norm": 4.676590157125056, |
|
"learning_rate": 6.637168141592921e-05, |
|
"loss": 0.7897, |
|
"num_input_tokens_seen": 2330224, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.06794682422451995, |
|
"grad_norm": 1.6101062596215647, |
|
"learning_rate": 6.784660766961653e-05, |
|
"loss": 0.792, |
|
"num_input_tokens_seen": 2381344, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.06942392909896603, |
|
"grad_norm": 3.202676293331468, |
|
"learning_rate": 6.932153392330384e-05, |
|
"loss": 0.8309, |
|
"num_input_tokens_seen": 2432136, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.07090103397341212, |
|
"grad_norm": 1.3573723032246008, |
|
"learning_rate": 7.079646017699115e-05, |
|
"loss": 0.7974, |
|
"num_input_tokens_seen": 2483568, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0723781388478582, |
|
"grad_norm": 1.317885929036595, |
|
"learning_rate": 7.227138643067847e-05, |
|
"loss": 0.7739, |
|
"num_input_tokens_seen": 2535040, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.07385524372230429, |
|
"grad_norm": 2.1810508869311067, |
|
"learning_rate": 7.374631268436578e-05, |
|
"loss": 0.7558, |
|
"num_input_tokens_seen": 2587272, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07385524372230429, |
|
"eval_loss": 0.810763955116272, |
|
"eval_runtime": 19.1642, |
|
"eval_samples_per_second": 3.131, |
|
"eval_steps_per_second": 0.783, |
|
"num_input_tokens_seen": 2587272, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.07533234859675036, |
|
"grad_norm": 2.353270620583961, |
|
"learning_rate": 7.522123893805309e-05, |
|
"loss": 0.7851, |
|
"num_input_tokens_seen": 2638632, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.07680945347119646, |
|
"grad_norm": 3.0664271713541873, |
|
"learning_rate": 7.669616519174043e-05, |
|
"loss": 0.7211, |
|
"num_input_tokens_seen": 2691016, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.07828655834564253, |
|
"grad_norm": 5.498034008223314, |
|
"learning_rate": 7.817109144542774e-05, |
|
"loss": 0.8082, |
|
"num_input_tokens_seen": 2742912, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.07976366322008863, |
|
"grad_norm": 14.573947499657416, |
|
"learning_rate": 7.964601769911504e-05, |
|
"loss": 0.7485, |
|
"num_input_tokens_seen": 2795264, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.08124076809453472, |
|
"grad_norm": 3.007807281619928, |
|
"learning_rate": 8.112094395280237e-05, |
|
"loss": 0.7454, |
|
"num_input_tokens_seen": 2846344, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.0827178729689808, |
|
"grad_norm": 6.015750773450144, |
|
"learning_rate": 8.259587020648968e-05, |
|
"loss": 0.7258, |
|
"num_input_tokens_seen": 2898304, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.08419497784342689, |
|
"grad_norm": 2.390238834834483, |
|
"learning_rate": 8.4070796460177e-05, |
|
"loss": 0.7863, |
|
"num_input_tokens_seen": 2951368, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.08567208271787297, |
|
"grad_norm": 3.0216023427899357, |
|
"learning_rate": 8.554572271386431e-05, |
|
"loss": 0.7105, |
|
"num_input_tokens_seen": 3003288, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.08714918759231906, |
|
"grad_norm": 3.255437171887138, |
|
"learning_rate": 8.702064896755162e-05, |
|
"loss": 0.6885, |
|
"num_input_tokens_seen": 3054808, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.08862629246676514, |
|
"grad_norm": 3.506440325367033, |
|
"learning_rate": 8.849557522123895e-05, |
|
"loss": 0.78, |
|
"num_input_tokens_seen": 3107200, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.08862629246676514, |
|
"eval_loss": 0.8194220662117004, |
|
"eval_runtime": 19.1748, |
|
"eval_samples_per_second": 3.129, |
|
"eval_steps_per_second": 0.782, |
|
"num_input_tokens_seen": 3107200, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09010339734121123, |
|
"grad_norm": 2.2491377996087385, |
|
"learning_rate": 8.997050147492626e-05, |
|
"loss": 0.7394, |
|
"num_input_tokens_seen": 3158648, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.0915805022156573, |
|
"grad_norm": 8.740989037358858, |
|
"learning_rate": 9.144542772861357e-05, |
|
"loss": 0.7371, |
|
"num_input_tokens_seen": 3210560, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.0930576070901034, |
|
"grad_norm": 4.552322042735297, |
|
"learning_rate": 9.29203539823009e-05, |
|
"loss": 0.7622, |
|
"num_input_tokens_seen": 3263664, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.09453471196454949, |
|
"grad_norm": 5.062297996808734, |
|
"learning_rate": 9.43952802359882e-05, |
|
"loss": 0.7214, |
|
"num_input_tokens_seen": 3315520, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.09601181683899557, |
|
"grad_norm": 4.295724990139621, |
|
"learning_rate": 9.587020648967551e-05, |
|
"loss": 0.7078, |
|
"num_input_tokens_seen": 3368088, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.09748892171344166, |
|
"grad_norm": 7.980776602247676, |
|
"learning_rate": 9.734513274336283e-05, |
|
"loss": 0.6852, |
|
"num_input_tokens_seen": 3420176, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.09896602658788774, |
|
"grad_norm": 33.51326353666061, |
|
"learning_rate": 9.882005899705014e-05, |
|
"loss": 0.7557, |
|
"num_input_tokens_seen": 3471184, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.10044313146233383, |
|
"grad_norm": 4.612370523858782, |
|
"learning_rate": 9.99999940340072e-05, |
|
"loss": 0.6709, |
|
"num_input_tokens_seen": 3523008, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1019202363367799, |
|
"grad_norm": 3.706129373980499, |
|
"learning_rate": 9.999978522440803e-05, |
|
"loss": 0.7252, |
|
"num_input_tokens_seen": 3573880, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.103397341211226, |
|
"grad_norm": 4.907030070826967, |
|
"learning_rate": 9.999927811659165e-05, |
|
"loss": 0.6602, |
|
"num_input_tokens_seen": 3625752, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.103397341211226, |
|
"eval_loss": 0.7663387656211853, |
|
"eval_runtime": 19.2114, |
|
"eval_samples_per_second": 3.123, |
|
"eval_steps_per_second": 0.781, |
|
"num_input_tokens_seen": 3625752, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.10487444608567208, |
|
"grad_norm": 21.440776172892136, |
|
"learning_rate": 9.999847271358347e-05, |
|
"loss": 0.7222, |
|
"num_input_tokens_seen": 3676984, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.10635155096011817, |
|
"grad_norm": 3.9046938348237252, |
|
"learning_rate": 9.99973690201885e-05, |
|
"loss": 0.6639, |
|
"num_input_tokens_seen": 3729168, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.10782865583456426, |
|
"grad_norm": 5.783088469074539, |
|
"learning_rate": 9.999596704299139e-05, |
|
"loss": 0.6501, |
|
"num_input_tokens_seen": 3780672, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.10930576070901034, |
|
"grad_norm": 4.421078404189889, |
|
"learning_rate": 9.999426679035628e-05, |
|
"loss": 0.6871, |
|
"num_input_tokens_seen": 3832328, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.11078286558345643, |
|
"grad_norm": 6.348668054760613, |
|
"learning_rate": 9.99922682724269e-05, |
|
"loss": 0.6621, |
|
"num_input_tokens_seen": 3883112, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.11225997045790251, |
|
"grad_norm": 3.7958066942788573, |
|
"learning_rate": 9.998997150112635e-05, |
|
"loss": 0.7156, |
|
"num_input_tokens_seen": 3934976, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1137370753323486, |
|
"grad_norm": 2.694693287446712, |
|
"learning_rate": 9.998737649015718e-05, |
|
"loss": 0.6662, |
|
"num_input_tokens_seen": 3986192, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.11521418020679468, |
|
"grad_norm": 6.428397441401454, |
|
"learning_rate": 9.998448325500118e-05, |
|
"loss": 0.682, |
|
"num_input_tokens_seen": 4037760, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.11669128508124077, |
|
"grad_norm": 6.165388347571309, |
|
"learning_rate": 9.998129181291936e-05, |
|
"loss": 0.6137, |
|
"num_input_tokens_seen": 4090872, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.11816838995568685, |
|
"grad_norm": 4.354814876263017, |
|
"learning_rate": 9.997780218295185e-05, |
|
"loss": 0.6739, |
|
"num_input_tokens_seen": 4142592, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11816838995568685, |
|
"eval_loss": 0.7038857936859131, |
|
"eval_runtime": 19.0624, |
|
"eval_samples_per_second": 3.148, |
|
"eval_steps_per_second": 0.787, |
|
"num_input_tokens_seen": 4142592, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11964549483013294, |
|
"grad_norm": 4.400494365327609, |
|
"learning_rate": 9.997401438591772e-05, |
|
"loss": 0.6209, |
|
"num_input_tokens_seen": 4194920, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.12112259970457903, |
|
"grad_norm": 4.518582133930376, |
|
"learning_rate": 9.996992844441495e-05, |
|
"loss": 0.6576, |
|
"num_input_tokens_seen": 4247048, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.12259970457902511, |
|
"grad_norm": 2.6773114089558043, |
|
"learning_rate": 9.996554438282022e-05, |
|
"loss": 0.6851, |
|
"num_input_tokens_seen": 4299728, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.1240768094534712, |
|
"grad_norm": 10.583849604294256, |
|
"learning_rate": 9.996086222728879e-05, |
|
"loss": 0.6288, |
|
"num_input_tokens_seen": 4351088, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1255539143279173, |
|
"grad_norm": 5.18430085456359, |
|
"learning_rate": 9.995588200575439e-05, |
|
"loss": 0.667, |
|
"num_input_tokens_seen": 4403016, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.12703101920236337, |
|
"grad_norm": 3.624202284960618, |
|
"learning_rate": 9.995060374792892e-05, |
|
"loss": 0.6747, |
|
"num_input_tokens_seen": 4453880, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.12850812407680945, |
|
"grad_norm": 8.193068077958594, |
|
"learning_rate": 9.994502748530244e-05, |
|
"loss": 0.6594, |
|
"num_input_tokens_seen": 4505616, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.12998522895125553, |
|
"grad_norm": 6.0681139183306145, |
|
"learning_rate": 9.993915325114288e-05, |
|
"loss": 0.6727, |
|
"num_input_tokens_seen": 4558384, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.13146233382570163, |
|
"grad_norm": 3.998790148445953, |
|
"learning_rate": 9.993298108049582e-05, |
|
"loss": 0.6526, |
|
"num_input_tokens_seen": 4611184, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.1329394387001477, |
|
"grad_norm": 3.1838689643423392, |
|
"learning_rate": 9.992651101018445e-05, |
|
"loss": 0.5661, |
|
"num_input_tokens_seen": 4663320, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1329394387001477, |
|
"eval_loss": 0.7132604718208313, |
|
"eval_runtime": 18.996, |
|
"eval_samples_per_second": 3.159, |
|
"eval_steps_per_second": 0.79, |
|
"num_input_tokens_seen": 4663320, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1344165435745938, |
|
"grad_norm": 8.239876852269616, |
|
"learning_rate": 9.991974307880907e-05, |
|
"loss": 0.5954, |
|
"num_input_tokens_seen": 4714448, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.1358936484490399, |
|
"grad_norm": 21.47914479659389, |
|
"learning_rate": 9.991267732674711e-05, |
|
"loss": 0.721, |
|
"num_input_tokens_seen": 4767136, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.13737075332348597, |
|
"grad_norm": 7.30460731660639, |
|
"learning_rate": 9.99053137961528e-05, |
|
"loss": 0.6578, |
|
"num_input_tokens_seen": 4819408, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.13884785819793205, |
|
"grad_norm": 8.944279395571234, |
|
"learning_rate": 9.989765253095686e-05, |
|
"loss": 0.6642, |
|
"num_input_tokens_seen": 4872120, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.14032496307237813, |
|
"grad_norm": 20.451510949854647, |
|
"learning_rate": 9.988969357686636e-05, |
|
"loss": 0.6462, |
|
"num_input_tokens_seen": 4924400, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.14180206794682423, |
|
"grad_norm": 6.498846626456819, |
|
"learning_rate": 9.988143698136429e-05, |
|
"loss": 0.6055, |
|
"num_input_tokens_seen": 4976504, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1432791728212703, |
|
"grad_norm": 9.137509561710141, |
|
"learning_rate": 9.987288279370945e-05, |
|
"loss": 0.5928, |
|
"num_input_tokens_seen": 5028648, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.1447562776957164, |
|
"grad_norm": 6.8502382033465885, |
|
"learning_rate": 9.986403106493604e-05, |
|
"loss": 0.5835, |
|
"num_input_tokens_seen": 5080488, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.14623338257016247, |
|
"grad_norm": 6.731902879463201, |
|
"learning_rate": 9.985488184785336e-05, |
|
"loss": 0.6641, |
|
"num_input_tokens_seen": 5131744, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.14771048744460857, |
|
"grad_norm": 5.718986997919185, |
|
"learning_rate": 9.984543519704557e-05, |
|
"loss": 0.6283, |
|
"num_input_tokens_seen": 5183664, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14771048744460857, |
|
"eval_loss": 0.6505001187324524, |
|
"eval_runtime": 18.9372, |
|
"eval_samples_per_second": 3.168, |
|
"eval_steps_per_second": 0.792, |
|
"num_input_tokens_seen": 5183664, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.14918759231905465, |
|
"grad_norm": 11.975231950682153, |
|
"learning_rate": 9.983569116887128e-05, |
|
"loss": 0.573, |
|
"num_input_tokens_seen": 5234920, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.15066469719350073, |
|
"grad_norm": 13.701425113518031, |
|
"learning_rate": 9.982564982146327e-05, |
|
"loss": 0.6261, |
|
"num_input_tokens_seen": 5287312, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.15214180206794684, |
|
"grad_norm": 6.956866723254775, |
|
"learning_rate": 9.981531121472811e-05, |
|
"loss": 0.6072, |
|
"num_input_tokens_seen": 5340240, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.1536189069423929, |
|
"grad_norm": 5.293847645949678, |
|
"learning_rate": 9.980467541034584e-05, |
|
"loss": 0.565, |
|
"num_input_tokens_seen": 5392600, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.155096011816839, |
|
"grad_norm": 3.5333148010719357, |
|
"learning_rate": 9.979374247176956e-05, |
|
"loss": 0.6188, |
|
"num_input_tokens_seen": 5445168, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.15657311669128507, |
|
"grad_norm": 2.715838950258193, |
|
"learning_rate": 9.978251246422505e-05, |
|
"loss": 0.6069, |
|
"num_input_tokens_seen": 5496384, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.15805022156573117, |
|
"grad_norm": 7.400638197441027, |
|
"learning_rate": 9.977098545471046e-05, |
|
"loss": 0.5805, |
|
"num_input_tokens_seen": 5548264, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.15952732644017725, |
|
"grad_norm": 8.936418653401088, |
|
"learning_rate": 9.975916151199579e-05, |
|
"loss": 0.6383, |
|
"num_input_tokens_seen": 5599216, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.16100443131462333, |
|
"grad_norm": 7.142901090509074, |
|
"learning_rate": 9.974704070662254e-05, |
|
"loss": 0.5845, |
|
"num_input_tokens_seen": 5650816, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.16248153618906944, |
|
"grad_norm": 18.523556086651276, |
|
"learning_rate": 9.973462311090336e-05, |
|
"loss": 0.5957, |
|
"num_input_tokens_seen": 5703016, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.16248153618906944, |
|
"eval_loss": 0.6883422136306763, |
|
"eval_runtime": 19.183, |
|
"eval_samples_per_second": 3.128, |
|
"eval_steps_per_second": 0.782, |
|
"num_input_tokens_seen": 5703016, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.16395864106351551, |
|
"grad_norm": 10.880382658420737, |
|
"learning_rate": 9.972190879892147e-05, |
|
"loss": 0.6076, |
|
"num_input_tokens_seen": 5754192, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.1654357459379616, |
|
"grad_norm": 5.9115707757479345, |
|
"learning_rate": 9.970889784653033e-05, |
|
"loss": 0.6136, |
|
"num_input_tokens_seen": 5806272, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.16691285081240767, |
|
"grad_norm": 8.300559629359741, |
|
"learning_rate": 9.969559033135318e-05, |
|
"loss": 0.5554, |
|
"num_input_tokens_seen": 5858632, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.16838995568685378, |
|
"grad_norm": 19.24269810236072, |
|
"learning_rate": 9.96819863327825e-05, |
|
"loss": 0.5847, |
|
"num_input_tokens_seen": 5909936, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.16986706056129985, |
|
"grad_norm": 2.997295434716295, |
|
"learning_rate": 9.966808593197959e-05, |
|
"loss": 0.6217, |
|
"num_input_tokens_seen": 5961464, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.17134416543574593, |
|
"grad_norm": 8.454212007467431, |
|
"learning_rate": 9.965388921187413e-05, |
|
"loss": 0.5569, |
|
"num_input_tokens_seen": 6013696, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.172821270310192, |
|
"grad_norm": 11.728020547911296, |
|
"learning_rate": 9.963939625716361e-05, |
|
"loss": 0.5894, |
|
"num_input_tokens_seen": 6065736, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.17429837518463812, |
|
"grad_norm": 20.470288976160585, |
|
"learning_rate": 9.962460715431284e-05, |
|
"loss": 0.5783, |
|
"num_input_tokens_seen": 6118400, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.1757754800590842, |
|
"grad_norm": 4.675971808784723, |
|
"learning_rate": 9.960952199155347e-05, |
|
"loss": 0.5657, |
|
"num_input_tokens_seen": 6171120, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.17725258493353027, |
|
"grad_norm": 9.775804001092958, |
|
"learning_rate": 9.959414085888342e-05, |
|
"loss": 0.6331, |
|
"num_input_tokens_seen": 6222736, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17725258493353027, |
|
"eval_loss": 0.5883122682571411, |
|
"eval_runtime": 19.002, |
|
"eval_samples_per_second": 3.158, |
|
"eval_steps_per_second": 0.789, |
|
"num_input_tokens_seen": 6222736, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.17872968980797638, |
|
"grad_norm": 8.081060384434974, |
|
"learning_rate": 9.957846384806636e-05, |
|
"loss": 0.5678, |
|
"num_input_tokens_seen": 6274328, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.18020679468242246, |
|
"grad_norm": 10.520198943062466, |
|
"learning_rate": 9.956249105263121e-05, |
|
"loss": 0.5609, |
|
"num_input_tokens_seen": 6327088, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.18168389955686853, |
|
"grad_norm": 5.336067400981417, |
|
"learning_rate": 9.95462225678715e-05, |
|
"loss": 0.5177, |
|
"num_input_tokens_seen": 6378824, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.1831610044313146, |
|
"grad_norm": 5.263245734989025, |
|
"learning_rate": 9.952965849084483e-05, |
|
"loss": 0.5839, |
|
"num_input_tokens_seen": 6431024, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.18463810930576072, |
|
"grad_norm": 5.175847441048381, |
|
"learning_rate": 9.951279892037233e-05, |
|
"loss": 0.5069, |
|
"num_input_tokens_seen": 6483072, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.1861152141802068, |
|
"grad_norm": 12.247546396996816, |
|
"learning_rate": 9.949564395703803e-05, |
|
"loss": 0.495, |
|
"num_input_tokens_seen": 6534768, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.18759231905465287, |
|
"grad_norm": 8.126956720775665, |
|
"learning_rate": 9.947819370318825e-05, |
|
"loss": 0.6435, |
|
"num_input_tokens_seen": 6586416, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.18906942392909898, |
|
"grad_norm": 9.112136009018696, |
|
"learning_rate": 9.946044826293106e-05, |
|
"loss": 0.5014, |
|
"num_input_tokens_seen": 6638592, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.19054652880354506, |
|
"grad_norm": 7.086235271485555, |
|
"learning_rate": 9.944240774213556e-05, |
|
"loss": 0.529, |
|
"num_input_tokens_seen": 6689920, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.19202363367799113, |
|
"grad_norm": 21.538813510868643, |
|
"learning_rate": 9.942407224843132e-05, |
|
"loss": 0.5483, |
|
"num_input_tokens_seen": 6743120, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.19202363367799113, |
|
"eval_loss": 0.6100574135780334, |
|
"eval_runtime": 18.9585, |
|
"eval_samples_per_second": 3.165, |
|
"eval_steps_per_second": 0.791, |
|
"num_input_tokens_seen": 6743120, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.1935007385524372, |
|
"grad_norm": 5.437189286202135, |
|
"learning_rate": 9.940544189120771e-05, |
|
"loss": 0.5499, |
|
"num_input_tokens_seen": 6794096, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.19497784342688332, |
|
"grad_norm": 11.687077385856876, |
|
"learning_rate": 9.938651678161326e-05, |
|
"loss": 0.5866, |
|
"num_input_tokens_seen": 6846200, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1964549483013294, |
|
"grad_norm": 7.149806146104705, |
|
"learning_rate": 9.936729703255498e-05, |
|
"loss": 0.4958, |
|
"num_input_tokens_seen": 6899280, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.19793205317577547, |
|
"grad_norm": 7.7389407646353225, |
|
"learning_rate": 9.93477827586977e-05, |
|
"loss": 0.4232, |
|
"num_input_tokens_seen": 6950608, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.19940915805022155, |
|
"grad_norm": 10.670856796845847, |
|
"learning_rate": 9.932797407646338e-05, |
|
"loss": 0.5407, |
|
"num_input_tokens_seen": 7002696, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.20088626292466766, |
|
"grad_norm": 4.979252179082294, |
|
"learning_rate": 9.93078711040304e-05, |
|
"loss": 0.4553, |
|
"num_input_tokens_seen": 7055160, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.20236336779911374, |
|
"grad_norm": 16.633134937375967, |
|
"learning_rate": 9.928747396133294e-05, |
|
"loss": 0.5565, |
|
"num_input_tokens_seen": 7107224, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.2038404726735598, |
|
"grad_norm": 11.730384893098227, |
|
"learning_rate": 9.926678277006011e-05, |
|
"loss": 0.5951, |
|
"num_input_tokens_seen": 7158376, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.20531757754800592, |
|
"grad_norm": 7.120874965147562, |
|
"learning_rate": 9.924579765365536e-05, |
|
"loss": 0.4764, |
|
"num_input_tokens_seen": 7210552, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.206794682422452, |
|
"grad_norm": 7.463812624673142, |
|
"learning_rate": 9.922451873731569e-05, |
|
"loss": 0.477, |
|
"num_input_tokens_seen": 7262832, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.206794682422452, |
|
"eval_loss": 0.5883837938308716, |
|
"eval_runtime": 19.0983, |
|
"eval_samples_per_second": 3.142, |
|
"eval_steps_per_second": 0.785, |
|
"num_input_tokens_seen": 7262832, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.20827178729689808, |
|
"grad_norm": 5.1484150923671, |
|
"learning_rate": 9.92029461479909e-05, |
|
"loss": 0.5151, |
|
"num_input_tokens_seen": 7314520, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.20974889217134415, |
|
"grad_norm": 10.902688361325707, |
|
"learning_rate": 9.918108001438283e-05, |
|
"loss": 0.6158, |
|
"num_input_tokens_seen": 7365368, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.21122599704579026, |
|
"grad_norm": 6.960249994011121, |
|
"learning_rate": 9.915892046694464e-05, |
|
"loss": 0.5164, |
|
"num_input_tokens_seen": 7417296, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.21270310192023634, |
|
"grad_norm": 6.138105593354917, |
|
"learning_rate": 9.913646763787992e-05, |
|
"loss": 0.5823, |
|
"num_input_tokens_seen": 7469640, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.21418020679468242, |
|
"grad_norm": 5.865897561310743, |
|
"learning_rate": 9.911372166114208e-05, |
|
"loss": 0.5145, |
|
"num_input_tokens_seen": 7521520, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.21565731166912852, |
|
"grad_norm": 14.330290158050401, |
|
"learning_rate": 9.909068267243336e-05, |
|
"loss": 0.571, |
|
"num_input_tokens_seen": 7573880, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2171344165435746, |
|
"grad_norm": 6.90669118248274, |
|
"learning_rate": 9.906735080920413e-05, |
|
"loss": 0.4638, |
|
"num_input_tokens_seen": 7625896, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.21861152141802068, |
|
"grad_norm": 6.310395883326308, |
|
"learning_rate": 9.904372621065206e-05, |
|
"loss": 0.5449, |
|
"num_input_tokens_seen": 7676528, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.22008862629246675, |
|
"grad_norm": 27.446565035007364, |
|
"learning_rate": 9.901980901772126e-05, |
|
"loss": 0.5505, |
|
"num_input_tokens_seen": 7728240, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.22156573116691286, |
|
"grad_norm": 5.6934624405279655, |
|
"learning_rate": 9.899559937310148e-05, |
|
"loss": 0.514, |
|
"num_input_tokens_seen": 7779872, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.22156573116691286, |
|
"eval_loss": 0.4665524661540985, |
|
"eval_runtime": 19.1629, |
|
"eval_samples_per_second": 3.131, |
|
"eval_steps_per_second": 0.783, |
|
"num_input_tokens_seen": 7779872, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.22304283604135894, |
|
"grad_norm": 7.392056712218606, |
|
"learning_rate": 9.897109742122721e-05, |
|
"loss": 0.5248, |
|
"num_input_tokens_seen": 7832168, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.22451994091580502, |
|
"grad_norm": 9.230824229530686, |
|
"learning_rate": 9.894630330827686e-05, |
|
"loss": 0.5017, |
|
"num_input_tokens_seen": 7884040, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.2259970457902511, |
|
"grad_norm": 11.203609848309013, |
|
"learning_rate": 9.892121718217182e-05, |
|
"loss": 0.4896, |
|
"num_input_tokens_seen": 7935528, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.2274741506646972, |
|
"grad_norm": 30.185572869944284, |
|
"learning_rate": 9.88958391925757e-05, |
|
"loss": 0.5125, |
|
"num_input_tokens_seen": 7987760, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.22895125553914328, |
|
"grad_norm": 18.649424971543322, |
|
"learning_rate": 9.887016949089333e-05, |
|
"loss": 0.5615, |
|
"num_input_tokens_seen": 8039400, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.23042836041358936, |
|
"grad_norm": 5.360845077873566, |
|
"learning_rate": 9.884420823026989e-05, |
|
"loss": 0.494, |
|
"num_input_tokens_seen": 8092440, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.23190546528803546, |
|
"grad_norm": 10.101391912363345, |
|
"learning_rate": 9.881795556558999e-05, |
|
"loss": 0.5122, |
|
"num_input_tokens_seen": 8145040, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.23338257016248154, |
|
"grad_norm": 5.90491429019666, |
|
"learning_rate": 9.879141165347678e-05, |
|
"loss": 0.4925, |
|
"num_input_tokens_seen": 8196904, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.23485967503692762, |
|
"grad_norm": 6.228283676778458, |
|
"learning_rate": 9.876457665229097e-05, |
|
"loss": 0.4752, |
|
"num_input_tokens_seen": 8249232, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.2363367799113737, |
|
"grad_norm": 8.496099871334396, |
|
"learning_rate": 9.87374507221299e-05, |
|
"loss": 0.4239, |
|
"num_input_tokens_seen": 8301976, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2363367799113737, |
|
"eval_loss": 0.48219749331474304, |
|
"eval_runtime": 19.0825, |
|
"eval_samples_per_second": 3.144, |
|
"eval_steps_per_second": 0.786, |
|
"num_input_tokens_seen": 8301976, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2378138847858198, |
|
"grad_norm": 5.415068909643764, |
|
"learning_rate": 9.87100340248266e-05, |
|
"loss": 0.4482, |
|
"num_input_tokens_seen": 8353736, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.23929098966026588, |
|
"grad_norm": 24.48801392473646, |
|
"learning_rate": 9.868232672394881e-05, |
|
"loss": 0.4764, |
|
"num_input_tokens_seen": 8406216, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.24076809453471196, |
|
"grad_norm": 12.500688224717539, |
|
"learning_rate": 9.8654328984798e-05, |
|
"loss": 0.4476, |
|
"num_input_tokens_seen": 8457752, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.24224519940915806, |
|
"grad_norm": 6.4171543173466405, |
|
"learning_rate": 9.862604097440844e-05, |
|
"loss": 0.4295, |
|
"num_input_tokens_seen": 8510440, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.24372230428360414, |
|
"grad_norm": 8.42476760299212, |
|
"learning_rate": 9.859746286154607e-05, |
|
"loss": 0.5384, |
|
"num_input_tokens_seen": 8562016, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.24519940915805022, |
|
"grad_norm": 5.79752775822047, |
|
"learning_rate": 9.856859481670764e-05, |
|
"loss": 0.5357, |
|
"num_input_tokens_seen": 8614184, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.2466765140324963, |
|
"grad_norm": 6.468218270458443, |
|
"learning_rate": 9.853943701211963e-05, |
|
"loss": 0.5309, |
|
"num_input_tokens_seen": 8666528, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.2481536189069424, |
|
"grad_norm": 6.446467495370782, |
|
"learning_rate": 9.850998962173719e-05, |
|
"loss": 0.4949, |
|
"num_input_tokens_seen": 8718048, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.24963072378138848, |
|
"grad_norm": 7.926476306574312, |
|
"learning_rate": 9.848025282124317e-05, |
|
"loss": 0.4681, |
|
"num_input_tokens_seen": 8769968, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.2511078286558346, |
|
"grad_norm": 19.45042923754815, |
|
"learning_rate": 9.845022678804701e-05, |
|
"loss": 0.4949, |
|
"num_input_tokens_seen": 8822832, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2511078286558346, |
|
"eval_loss": 0.6121839880943298, |
|
"eval_runtime": 19.1203, |
|
"eval_samples_per_second": 3.138, |
|
"eval_steps_per_second": 0.785, |
|
"num_input_tokens_seen": 8822832, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.25258493353028066, |
|
"grad_norm": 8.111385407981246, |
|
"learning_rate": 9.841991170128374e-05, |
|
"loss": 0.4636, |
|
"num_input_tokens_seen": 8875608, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.25406203840472674, |
|
"grad_norm": 4.789169716538139, |
|
"learning_rate": 9.838930774181285e-05, |
|
"loss": 0.4322, |
|
"num_input_tokens_seen": 8927600, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2555391432791728, |
|
"grad_norm": 3.6029916519925167, |
|
"learning_rate": 9.835841509221725e-05, |
|
"loss": 0.4302, |
|
"num_input_tokens_seen": 8980224, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.2570162481536189, |
|
"grad_norm": 17.138905616592684, |
|
"learning_rate": 9.83272339368022e-05, |
|
"loss": 0.5231, |
|
"num_input_tokens_seen": 9032112, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.258493353028065, |
|
"grad_norm": 6.810210745159563, |
|
"learning_rate": 9.829576446159416e-05, |
|
"loss": 0.4414, |
|
"num_input_tokens_seen": 9084480, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.25997045790251105, |
|
"grad_norm": 6.785950897404188, |
|
"learning_rate": 9.826400685433968e-05, |
|
"loss": 0.4469, |
|
"num_input_tokens_seen": 9136816, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2614475627769572, |
|
"grad_norm": 14.335926789263953, |
|
"learning_rate": 9.823196130450434e-05, |
|
"loss": 0.3859, |
|
"num_input_tokens_seen": 9189808, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.26292466765140327, |
|
"grad_norm": 24.791700587075013, |
|
"learning_rate": 9.819962800327156e-05, |
|
"loss": 0.4794, |
|
"num_input_tokens_seen": 9241712, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.26440177252584934, |
|
"grad_norm": 16.38282434047279, |
|
"learning_rate": 9.81670071435415e-05, |
|
"loss": 0.4476, |
|
"num_input_tokens_seen": 9293328, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.2658788774002954, |
|
"grad_norm": 3.8069696135300846, |
|
"learning_rate": 9.813409891992988e-05, |
|
"loss": 0.4852, |
|
"num_input_tokens_seen": 9345160, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2658788774002954, |
|
"eval_loss": 0.5605542063713074, |
|
"eval_runtime": 19.0274, |
|
"eval_samples_per_second": 3.153, |
|
"eval_steps_per_second": 0.788, |
|
"num_input_tokens_seen": 9345160, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2673559822747415, |
|
"grad_norm": 8.596452892791511, |
|
"learning_rate": 9.810090352876685e-05, |
|
"loss": 0.4973, |
|
"num_input_tokens_seen": 9396608, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.2688330871491876, |
|
"grad_norm": 22.570326690897016, |
|
"learning_rate": 9.806742116809575e-05, |
|
"loss": 0.4845, |
|
"num_input_tokens_seen": 9448264, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.27031019202363366, |
|
"grad_norm": 10.987740344554055, |
|
"learning_rate": 9.803365203767201e-05, |
|
"loss": 0.4405, |
|
"num_input_tokens_seen": 9501288, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.2717872968980798, |
|
"grad_norm": 5.629472815681504, |
|
"learning_rate": 9.799959633896194e-05, |
|
"loss": 0.5228, |
|
"num_input_tokens_seen": 9552680, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.27326440177252587, |
|
"grad_norm": 4.8888946075363355, |
|
"learning_rate": 9.79652542751415e-05, |
|
"loss": 0.4189, |
|
"num_input_tokens_seen": 9604432, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.27474150664697194, |
|
"grad_norm": 11.753985857725072, |
|
"learning_rate": 9.793062605109509e-05, |
|
"loss": 0.4449, |
|
"num_input_tokens_seen": 9656992, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.276218611521418, |
|
"grad_norm": 10.956276728284152, |
|
"learning_rate": 9.789571187341433e-05, |
|
"loss": 0.4678, |
|
"num_input_tokens_seen": 9709016, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.2776957163958641, |
|
"grad_norm": 5.760995229664893, |
|
"learning_rate": 9.786051195039689e-05, |
|
"loss": 0.5359, |
|
"num_input_tokens_seen": 9759936, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.2791728212703102, |
|
"grad_norm": 5.4002668741362365, |
|
"learning_rate": 9.782502649204512e-05, |
|
"loss": 0.5507, |
|
"num_input_tokens_seen": 9811880, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.28064992614475626, |
|
"grad_norm": 3.1909077693586876, |
|
"learning_rate": 9.778925571006495e-05, |
|
"loss": 0.4737, |
|
"num_input_tokens_seen": 9863168, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.28064992614475626, |
|
"eval_loss": 0.479105681180954, |
|
"eval_runtime": 19.2085, |
|
"eval_samples_per_second": 3.124, |
|
"eval_steps_per_second": 0.781, |
|
"num_input_tokens_seen": 9863168, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.2821270310192024, |
|
"grad_norm": 3.2511615728403744, |
|
"learning_rate": 9.775319981786445e-05, |
|
"loss": 0.4393, |
|
"num_input_tokens_seen": 9914672, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.28360413589364847, |
|
"grad_norm": 16.435101279621147, |
|
"learning_rate": 9.771685903055277e-05, |
|
"loss": 0.4355, |
|
"num_input_tokens_seen": 9966736, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.28508124076809455, |
|
"grad_norm": 15.842537939054491, |
|
"learning_rate": 9.768023356493864e-05, |
|
"loss": 0.4459, |
|
"num_input_tokens_seen": 10017984, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.2865583456425406, |
|
"grad_norm": 4.234230919149069, |
|
"learning_rate": 9.764332363952927e-05, |
|
"loss": 0.4774, |
|
"num_input_tokens_seen": 10069520, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.2880354505169867, |
|
"grad_norm": 4.408868276054397, |
|
"learning_rate": 9.760612947452884e-05, |
|
"loss": 0.413, |
|
"num_input_tokens_seen": 10122208, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.2895125553914328, |
|
"grad_norm": 18.46536438022927, |
|
"learning_rate": 9.756865129183741e-05, |
|
"loss": 0.5433, |
|
"num_input_tokens_seen": 10173760, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.29098966026587886, |
|
"grad_norm": 10.416515634178488, |
|
"learning_rate": 9.753088931504944e-05, |
|
"loss": 0.4096, |
|
"num_input_tokens_seen": 10224976, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.29246676514032494, |
|
"grad_norm": 8.959580527519506, |
|
"learning_rate": 9.749284376945248e-05, |
|
"loss": 0.3916, |
|
"num_input_tokens_seen": 10276928, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.29394387001477107, |
|
"grad_norm": 4.106784187834887, |
|
"learning_rate": 9.74545148820259e-05, |
|
"loss": 0.3899, |
|
"num_input_tokens_seen": 10328048, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.29542097488921715, |
|
"grad_norm": 7.661197997005464, |
|
"learning_rate": 9.741590288143944e-05, |
|
"loss": 0.4005, |
|
"num_input_tokens_seen": 10379136, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.29542097488921715, |
|
"eval_loss": 0.5501028299331665, |
|
"eval_runtime": 19.0051, |
|
"eval_samples_per_second": 3.157, |
|
"eval_steps_per_second": 0.789, |
|
"num_input_tokens_seen": 10379136, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2968980797636632, |
|
"grad_norm": 28.402265641893973, |
|
"learning_rate": 9.737700799805191e-05, |
|
"loss": 0.4585, |
|
"num_input_tokens_seen": 10430680, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.2983751846381093, |
|
"grad_norm": 7.624783658458961, |
|
"learning_rate": 9.73378304639098e-05, |
|
"loss": 0.4257, |
|
"num_input_tokens_seen": 10482472, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.2998522895125554, |
|
"grad_norm": 9.68942996120796, |
|
"learning_rate": 9.729837051274591e-05, |
|
"loss": 0.4359, |
|
"num_input_tokens_seen": 10534392, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.30132939438700146, |
|
"grad_norm": 4.997152707865521, |
|
"learning_rate": 9.725862837997786e-05, |
|
"loss": 0.4158, |
|
"num_input_tokens_seen": 10586104, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.30280649926144754, |
|
"grad_norm": 12.772476641379384, |
|
"learning_rate": 9.721860430270685e-05, |
|
"loss": 0.4067, |
|
"num_input_tokens_seen": 10637560, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.30428360413589367, |
|
"grad_norm": 11.194625798156807, |
|
"learning_rate": 9.717829851971612e-05, |
|
"loss": 0.4811, |
|
"num_input_tokens_seen": 10689552, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.30576070901033975, |
|
"grad_norm": 8.371187346484113, |
|
"learning_rate": 9.713771127146955e-05, |
|
"loss": 0.4732, |
|
"num_input_tokens_seen": 10742208, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.3072378138847858, |
|
"grad_norm": 20.552637977751065, |
|
"learning_rate": 9.70968428001103e-05, |
|
"loss": 0.4735, |
|
"num_input_tokens_seen": 10794008, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.3087149187592319, |
|
"grad_norm": 6.868858377235537, |
|
"learning_rate": 9.705569334945921e-05, |
|
"loss": 0.4381, |
|
"num_input_tokens_seen": 10845736, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.310192023633678, |
|
"grad_norm": 12.499280962869927, |
|
"learning_rate": 9.701426316501352e-05, |
|
"loss": 0.3991, |
|
"num_input_tokens_seen": 10897528, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.310192023633678, |
|
"eval_loss": 0.4378110468387604, |
|
"eval_runtime": 47.2127, |
|
"eval_samples_per_second": 1.271, |
|
"eval_steps_per_second": 0.318, |
|
"num_input_tokens_seen": 10897528, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.31166912850812406, |
|
"grad_norm": 12.581621849544964, |
|
"learning_rate": 9.697255249394527e-05, |
|
"loss": 0.3724, |
|
"num_input_tokens_seen": 10949888, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.31314623338257014, |
|
"grad_norm": 6.318957148146118, |
|
"learning_rate": 9.693056158509992e-05, |
|
"loss": 0.4483, |
|
"num_input_tokens_seen": 11001208, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.31462333825701627, |
|
"grad_norm": 5.822614134671903, |
|
"learning_rate": 9.688829068899483e-05, |
|
"loss": 0.4133, |
|
"num_input_tokens_seen": 11052368, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.31610044313146235, |
|
"grad_norm": 6.115498616882066, |
|
"learning_rate": 9.684574005781772e-05, |
|
"loss": 0.5406, |
|
"num_input_tokens_seen": 11104008, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.3175775480059084, |
|
"grad_norm": 12.381439695843321, |
|
"learning_rate": 9.680290994542523e-05, |
|
"loss": 0.4148, |
|
"num_input_tokens_seen": 11155888, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.3190546528803545, |
|
"grad_norm": 5.292808434292701, |
|
"learning_rate": 9.675980060734138e-05, |
|
"loss": 0.4169, |
|
"num_input_tokens_seen": 11207352, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3205317577548006, |
|
"grad_norm": 5.275144555938926, |
|
"learning_rate": 9.671641230075604e-05, |
|
"loss": 0.4706, |
|
"num_input_tokens_seen": 11257672, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.32200886262924666, |
|
"grad_norm": 7.458715041450571, |
|
"learning_rate": 9.667274528452344e-05, |
|
"loss": 0.3736, |
|
"num_input_tokens_seen": 11309944, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.32348596750369274, |
|
"grad_norm": 8.390618541362887, |
|
"learning_rate": 9.662879981916054e-05, |
|
"loss": 0.4413, |
|
"num_input_tokens_seen": 11361032, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.3249630723781389, |
|
"grad_norm": 14.65510134410483, |
|
"learning_rate": 9.658457616684555e-05, |
|
"loss": 0.4624, |
|
"num_input_tokens_seen": 11413120, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3249630723781389, |
|
"eval_loss": 0.5300672650337219, |
|
"eval_runtime": 19.0076, |
|
"eval_samples_per_second": 3.157, |
|
"eval_steps_per_second": 0.789, |
|
"num_input_tokens_seen": 11413120, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.32644017725258495, |
|
"grad_norm": 5.808097944570942, |
|
"learning_rate": 9.654007459141634e-05, |
|
"loss": 0.4121, |
|
"num_input_tokens_seen": 11465064, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.32791728212703103, |
|
"grad_norm": 4.015479597894709, |
|
"learning_rate": 9.649529535836887e-05, |
|
"loss": 0.4569, |
|
"num_input_tokens_seen": 11516304, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3293943870014771, |
|
"grad_norm": 8.768813687811088, |
|
"learning_rate": 9.645023873485557e-05, |
|
"loss": 0.4121, |
|
"num_input_tokens_seen": 11568568, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.3308714918759232, |
|
"grad_norm": 10.930663245586304, |
|
"learning_rate": 9.640490498968383e-05, |
|
"loss": 0.4112, |
|
"num_input_tokens_seen": 11620672, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.33234859675036926, |
|
"grad_norm": 6.691827883878219, |
|
"learning_rate": 9.63592943933143e-05, |
|
"loss": 0.3564, |
|
"num_input_tokens_seen": 11672864, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.33382570162481534, |
|
"grad_norm": 8.33100451031768, |
|
"learning_rate": 9.631340721785934e-05, |
|
"loss": 0.3909, |
|
"num_input_tokens_seen": 11724128, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3353028064992615, |
|
"grad_norm": 7.005971082198048, |
|
"learning_rate": 9.62672437370814e-05, |
|
"loss": 0.4636, |
|
"num_input_tokens_seen": 11776416, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.33677991137370755, |
|
"grad_norm": 14.154463913713748, |
|
"learning_rate": 9.622080422639133e-05, |
|
"loss": 0.4617, |
|
"num_input_tokens_seen": 11828256, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.33825701624815363, |
|
"grad_norm": 14.751813027169304, |
|
"learning_rate": 9.617408896284678e-05, |
|
"loss": 0.3443, |
|
"num_input_tokens_seen": 11882048, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.3397341211225997, |
|
"grad_norm": 3.2576085972339706, |
|
"learning_rate": 9.612709822515054e-05, |
|
"loss": 0.4432, |
|
"num_input_tokens_seen": 11933632, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3397341211225997, |
|
"eval_loss": 0.42494550347328186, |
|
"eval_runtime": 19.6038, |
|
"eval_samples_per_second": 3.061, |
|
"eval_steps_per_second": 0.765, |
|
"num_input_tokens_seen": 11933632, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3412112259970458, |
|
"grad_norm": 4.778973114307738, |
|
"learning_rate": 9.60798322936489e-05, |
|
"loss": 0.3716, |
|
"num_input_tokens_seen": 11986496, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.34268833087149186, |
|
"grad_norm": 8.20796735033587, |
|
"learning_rate": 9.603229145032993e-05, |
|
"loss": 0.4234, |
|
"num_input_tokens_seen": 12039112, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.34416543574593794, |
|
"grad_norm": 7.158508103350641, |
|
"learning_rate": 9.598447597882181e-05, |
|
"loss": 0.3973, |
|
"num_input_tokens_seen": 12091728, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.345642540620384, |
|
"grad_norm": 9.320131732384727, |
|
"learning_rate": 9.593638616439118e-05, |
|
"loss": 0.3494, |
|
"num_input_tokens_seen": 12143896, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.34711964549483015, |
|
"grad_norm": 10.150141046652656, |
|
"learning_rate": 9.588802229394137e-05, |
|
"loss": 0.4182, |
|
"num_input_tokens_seen": 12195336, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.34859675036927623, |
|
"grad_norm": 9.270011962927722, |
|
"learning_rate": 9.583938465601075e-05, |
|
"loss": 0.462, |
|
"num_input_tokens_seen": 12247696, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3500738552437223, |
|
"grad_norm": 8.96068778293971, |
|
"learning_rate": 9.5790473540771e-05, |
|
"loss": 0.4451, |
|
"num_input_tokens_seen": 12300040, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.3515509601181684, |
|
"grad_norm": 24.761476817148992, |
|
"learning_rate": 9.574128924002533e-05, |
|
"loss": 0.4789, |
|
"num_input_tokens_seen": 12351904, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.35302806499261447, |
|
"grad_norm": 1.8519516556186366, |
|
"learning_rate": 9.569183204720677e-05, |
|
"loss": 0.3898, |
|
"num_input_tokens_seen": 12403280, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.35450516986706054, |
|
"grad_norm": 5.005586803143539, |
|
"learning_rate": 9.564210225737647e-05, |
|
"loss": 0.3296, |
|
"num_input_tokens_seen": 12456040, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.35450516986706054, |
|
"eval_loss": 0.2966395914554596, |
|
"eval_runtime": 19.5244, |
|
"eval_samples_per_second": 3.073, |
|
"eval_steps_per_second": 0.768, |
|
"num_input_tokens_seen": 12456040, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3559822747415066, |
|
"grad_norm": 5.5028656713393245, |
|
"learning_rate": 9.559210016722184e-05, |
|
"loss": 0.3717, |
|
"num_input_tokens_seen": 12507640, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.35745937961595275, |
|
"grad_norm": 13.214008089689216, |
|
"learning_rate": 9.554182607505484e-05, |
|
"loss": 0.541, |
|
"num_input_tokens_seen": 12559400, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.35893648449039883, |
|
"grad_norm": 6.269664608708862, |
|
"learning_rate": 9.54912802808102e-05, |
|
"loss": 0.3965, |
|
"num_input_tokens_seen": 12610992, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.3604135893648449, |
|
"grad_norm": 21.247512275128738, |
|
"learning_rate": 9.544046308604364e-05, |
|
"loss": 0.4834, |
|
"num_input_tokens_seen": 12662688, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.361890694239291, |
|
"grad_norm": 4.14272563629135, |
|
"learning_rate": 9.538937479393001e-05, |
|
"loss": 0.4538, |
|
"num_input_tokens_seen": 12713600, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.36336779911373707, |
|
"grad_norm": 12.86150407455535, |
|
"learning_rate": 9.533801570926157e-05, |
|
"loss": 0.4226, |
|
"num_input_tokens_seen": 12766360, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.36484490398818314, |
|
"grad_norm": 4.36264988758363, |
|
"learning_rate": 9.52863861384461e-05, |
|
"loss": 0.4315, |
|
"num_input_tokens_seen": 12817248, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.3663220088626292, |
|
"grad_norm": 1.6622545996067835, |
|
"learning_rate": 9.523448638950508e-05, |
|
"loss": 0.3567, |
|
"num_input_tokens_seen": 12868496, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.36779911373707536, |
|
"grad_norm": 5.5241376090939065, |
|
"learning_rate": 9.518231677207192e-05, |
|
"loss": 0.3431, |
|
"num_input_tokens_seen": 12920168, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.36927621861152143, |
|
"grad_norm": 5.717434283790562, |
|
"learning_rate": 9.512987759739003e-05, |
|
"loss": 0.335, |
|
"num_input_tokens_seen": 12972696, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.36927621861152143, |
|
"eval_loss": 0.31846168637275696, |
|
"eval_runtime": 18.9319, |
|
"eval_samples_per_second": 3.169, |
|
"eval_steps_per_second": 0.792, |
|
"num_input_tokens_seen": 12972696, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.3707533234859675, |
|
"grad_norm": 2.1850732370217045, |
|
"learning_rate": 9.507716917831099e-05, |
|
"loss": 0.3242, |
|
"num_input_tokens_seen": 13025280, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.3722304283604136, |
|
"grad_norm": 7.470465465497159, |
|
"learning_rate": 9.50241918292927e-05, |
|
"loss": 0.4083, |
|
"num_input_tokens_seen": 13075992, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.37370753323485967, |
|
"grad_norm": 10.134768151698713, |
|
"learning_rate": 9.49709458663975e-05, |
|
"loss": 0.4043, |
|
"num_input_tokens_seen": 13128592, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.37518463810930575, |
|
"grad_norm": 7.635543650225297, |
|
"learning_rate": 9.491743160729026e-05, |
|
"loss": 0.3481, |
|
"num_input_tokens_seen": 13181824, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.3766617429837518, |
|
"grad_norm": 17.089924601510244, |
|
"learning_rate": 9.486364937123651e-05, |
|
"loss": 0.4121, |
|
"num_input_tokens_seen": 13233624, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.37813884785819796, |
|
"grad_norm": 8.52905916993994, |
|
"learning_rate": 9.480959947910055e-05, |
|
"loss": 0.487, |
|
"num_input_tokens_seen": 13285808, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.37961595273264404, |
|
"grad_norm": 11.841989523288227, |
|
"learning_rate": 9.47552822533435e-05, |
|
"loss": 0.3798, |
|
"num_input_tokens_seen": 13337864, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.3810930576070901, |
|
"grad_norm": 2.1853711175575734, |
|
"learning_rate": 9.470069801802135e-05, |
|
"loss": 0.348, |
|
"num_input_tokens_seen": 13390544, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.3825701624815362, |
|
"grad_norm": 2.9516647949035826, |
|
"learning_rate": 9.464584709878313e-05, |
|
"loss": 0.41, |
|
"num_input_tokens_seen": 13441664, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.38404726735598227, |
|
"grad_norm": 3.7764410954952514, |
|
"learning_rate": 9.459072982286886e-05, |
|
"loss": 0.3594, |
|
"num_input_tokens_seen": 13493264, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.38404726735598227, |
|
"eval_loss": 0.4715976417064667, |
|
"eval_runtime": 19.0919, |
|
"eval_samples_per_second": 3.143, |
|
"eval_steps_per_second": 0.786, |
|
"num_input_tokens_seen": 13493264, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.38552437223042835, |
|
"grad_norm": 15.50071615139337, |
|
"learning_rate": 9.453534651910765e-05, |
|
"loss": 0.402, |
|
"num_input_tokens_seen": 13545256, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.3870014771048744, |
|
"grad_norm": 23.183495844663526, |
|
"learning_rate": 9.447969751791577e-05, |
|
"loss": 0.3075, |
|
"num_input_tokens_seen": 13597792, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.38847858197932056, |
|
"grad_norm": 9.67544956653079, |
|
"learning_rate": 9.442378315129455e-05, |
|
"loss": 0.3702, |
|
"num_input_tokens_seen": 13649848, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.38995568685376664, |
|
"grad_norm": 2.9059361985914416, |
|
"learning_rate": 9.436760375282859e-05, |
|
"loss": 0.3603, |
|
"num_input_tokens_seen": 13701592, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.3914327917282127, |
|
"grad_norm": 10.431238621222658, |
|
"learning_rate": 9.431115965768358e-05, |
|
"loss": 0.4072, |
|
"num_input_tokens_seen": 13753064, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.3929098966026588, |
|
"grad_norm": 11.216612661805582, |
|
"learning_rate": 9.425445120260445e-05, |
|
"loss": 0.3279, |
|
"num_input_tokens_seen": 13805528, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.39438700147710487, |
|
"grad_norm": 32.22838128750362, |
|
"learning_rate": 9.419747872591325e-05, |
|
"loss": 0.3754, |
|
"num_input_tokens_seen": 13858192, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.39586410635155095, |
|
"grad_norm": 1.8703742105152936, |
|
"learning_rate": 9.414024256750723e-05, |
|
"loss": 0.3754, |
|
"num_input_tokens_seen": 13910128, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.397341211225997, |
|
"grad_norm": 5.011302513950015, |
|
"learning_rate": 9.408274306885674e-05, |
|
"loss": 0.3235, |
|
"num_input_tokens_seen": 13962536, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3988183161004431, |
|
"grad_norm": 15.197987760428996, |
|
"learning_rate": 9.402498057300317e-05, |
|
"loss": 0.3731, |
|
"num_input_tokens_seen": 14014736, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3988183161004431, |
|
"eval_loss": 0.5565826892852783, |
|
"eval_runtime": 19.3029, |
|
"eval_samples_per_second": 3.108, |
|
"eval_steps_per_second": 0.777, |
|
"num_input_tokens_seen": 14014736, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.40029542097488924, |
|
"grad_norm": 2.292705535408954, |
|
"learning_rate": 9.396695542455704e-05, |
|
"loss": 0.4115, |
|
"num_input_tokens_seen": 14066880, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.4017725258493353, |
|
"grad_norm": 10.68072230240614, |
|
"learning_rate": 9.390866796969577e-05, |
|
"loss": 0.365, |
|
"num_input_tokens_seen": 14118320, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4032496307237814, |
|
"grad_norm": 28.45565288311722, |
|
"learning_rate": 9.385011855616177e-05, |
|
"loss": 0.3904, |
|
"num_input_tokens_seen": 14169208, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.40472673559822747, |
|
"grad_norm": 9.32794663574214, |
|
"learning_rate": 9.379130753326021e-05, |
|
"loss": 0.5425, |
|
"num_input_tokens_seen": 14220632, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.40620384047267355, |
|
"grad_norm": 4.737143544435888, |
|
"learning_rate": 9.373223525185709e-05, |
|
"loss": 0.3985, |
|
"num_input_tokens_seen": 14272640, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.4076809453471196, |
|
"grad_norm": 17.480173613134482, |
|
"learning_rate": 9.367290206437702e-05, |
|
"loss": 0.3528, |
|
"num_input_tokens_seen": 14324960, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.4091580502215657, |
|
"grad_norm": 4.40598964753602, |
|
"learning_rate": 9.361330832480124e-05, |
|
"loss": 0.3687, |
|
"num_input_tokens_seen": 14376792, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.41063515509601184, |
|
"grad_norm": 15.961709998187562, |
|
"learning_rate": 9.355345438866538e-05, |
|
"loss": 0.3552, |
|
"num_input_tokens_seen": 14428192, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.4121122599704579, |
|
"grad_norm": 4.033485652398453, |
|
"learning_rate": 9.349334061305743e-05, |
|
"loss": 0.3194, |
|
"num_input_tokens_seen": 14480568, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.413589364844904, |
|
"grad_norm": 9.187315388235644, |
|
"learning_rate": 9.343296735661557e-05, |
|
"loss": 0.388, |
|
"num_input_tokens_seen": 14532288, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.413589364844904, |
|
"eval_loss": 0.38656601309776306, |
|
"eval_runtime": 19.1495, |
|
"eval_samples_per_second": 3.133, |
|
"eval_steps_per_second": 0.783, |
|
"num_input_tokens_seen": 14532288, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4150664697193501, |
|
"grad_norm": 2.8279779742550626, |
|
"learning_rate": 9.337233497952604e-05, |
|
"loss": 0.3751, |
|
"num_input_tokens_seen": 14583680, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.41654357459379615, |
|
"grad_norm": 12.899164703482258, |
|
"learning_rate": 9.331144384352099e-05, |
|
"loss": 0.3431, |
|
"num_input_tokens_seen": 14635712, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.41802067946824223, |
|
"grad_norm": 10.924332413291099, |
|
"learning_rate": 9.325029431187635e-05, |
|
"loss": 0.3786, |
|
"num_input_tokens_seen": 14687048, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.4194977843426883, |
|
"grad_norm": 13.950543503215677, |
|
"learning_rate": 9.318888674940958e-05, |
|
"loss": 0.3427, |
|
"num_input_tokens_seen": 14739336, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.42097488921713444, |
|
"grad_norm": 11.615845590184648, |
|
"learning_rate": 9.31272215224776e-05, |
|
"loss": 0.3307, |
|
"num_input_tokens_seen": 14791656, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.4224519940915805, |
|
"grad_norm": 1.4760161640643292, |
|
"learning_rate": 9.306529899897451e-05, |
|
"loss": 0.3509, |
|
"num_input_tokens_seen": 14843288, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.4239290989660266, |
|
"grad_norm": 5.925998864255826, |
|
"learning_rate": 9.300311954832952e-05, |
|
"loss": 0.4168, |
|
"num_input_tokens_seen": 14895040, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.4254062038404727, |
|
"grad_norm": 5.735184902025097, |
|
"learning_rate": 9.294068354150455e-05, |
|
"loss": 0.3203, |
|
"num_input_tokens_seen": 14947448, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.42688330871491875, |
|
"grad_norm": 19.73355339631789, |
|
"learning_rate": 9.287799135099225e-05, |
|
"loss": 0.3217, |
|
"num_input_tokens_seen": 14999480, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.42836041358936483, |
|
"grad_norm": 11.876408386949045, |
|
"learning_rate": 9.281504335081354e-05, |
|
"loss": 0.3131, |
|
"num_input_tokens_seen": 15050992, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.42836041358936483, |
|
"eval_loss": 0.4740215837955475, |
|
"eval_runtime": 19.3584, |
|
"eval_samples_per_second": 3.099, |
|
"eval_steps_per_second": 0.775, |
|
"num_input_tokens_seen": 15050992, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4298375184638109, |
|
"grad_norm": 5.756692171427544, |
|
"learning_rate": 9.275183991651558e-05, |
|
"loss": 0.3253, |
|
"num_input_tokens_seen": 15103328, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.43131462333825704, |
|
"grad_norm": 48.99329970671894, |
|
"learning_rate": 9.268838142516943e-05, |
|
"loss": 0.3999, |
|
"num_input_tokens_seen": 15154640, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.4327917282127031, |
|
"grad_norm": 4.8123322290098764, |
|
"learning_rate": 9.262466825536782e-05, |
|
"loss": 0.3529, |
|
"num_input_tokens_seen": 15206264, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.4342688330871492, |
|
"grad_norm": 8.626202826826654, |
|
"learning_rate": 9.256070078722287e-05, |
|
"loss": 0.3363, |
|
"num_input_tokens_seen": 15258160, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.4357459379615953, |
|
"grad_norm": 24.191365994287917, |
|
"learning_rate": 9.249647940236385e-05, |
|
"loss": 0.4133, |
|
"num_input_tokens_seen": 15309224, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.43722304283604135, |
|
"grad_norm": 15.447487032462668, |
|
"learning_rate": 9.243200448393492e-05, |
|
"loss": 0.3306, |
|
"num_input_tokens_seen": 15361480, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.43870014771048743, |
|
"grad_norm": 2.714376790067382, |
|
"learning_rate": 9.236727641659277e-05, |
|
"loss": 0.2993, |
|
"num_input_tokens_seen": 15414680, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.4401772525849335, |
|
"grad_norm": 11.85875780519428, |
|
"learning_rate": 9.230229558650442e-05, |
|
"loss": 0.3324, |
|
"num_input_tokens_seen": 15466552, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.44165435745937964, |
|
"grad_norm": 7.338287245679936, |
|
"learning_rate": 9.223706238134485e-05, |
|
"loss": 0.2615, |
|
"num_input_tokens_seen": 15519472, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.4431314623338257, |
|
"grad_norm": 11.54360838796349, |
|
"learning_rate": 9.217157719029469e-05, |
|
"loss": 0.2928, |
|
"num_input_tokens_seen": 15572048, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4431314623338257, |
|
"eval_loss": 0.40494996309280396, |
|
"eval_runtime": 19.2568, |
|
"eval_samples_per_second": 3.116, |
|
"eval_steps_per_second": 0.779, |
|
"num_input_tokens_seen": 15572048, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4446085672082718, |
|
"grad_norm": 1.4684583536766802, |
|
"learning_rate": 9.210584040403793e-05, |
|
"loss": 0.3622, |
|
"num_input_tokens_seen": 15623400, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.4460856720827179, |
|
"grad_norm": 14.96137801867225, |
|
"learning_rate": 9.20398524147596e-05, |
|
"loss": 0.3192, |
|
"num_input_tokens_seen": 15676712, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.44756277695716395, |
|
"grad_norm": 6.400159478959921, |
|
"learning_rate": 9.197361361614339e-05, |
|
"loss": 0.3463, |
|
"num_input_tokens_seen": 15729304, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.44903988183161003, |
|
"grad_norm": 4.316336693336091, |
|
"learning_rate": 9.190712440336928e-05, |
|
"loss": 0.3675, |
|
"num_input_tokens_seen": 15780144, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4505169867060561, |
|
"grad_norm": 3.3824956383811586, |
|
"learning_rate": 9.184038517311126e-05, |
|
"loss": 0.3305, |
|
"num_input_tokens_seen": 15832032, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.4519940915805022, |
|
"grad_norm": 8.862997235708779, |
|
"learning_rate": 9.177339632353492e-05, |
|
"loss": 0.3817, |
|
"num_input_tokens_seen": 15884152, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4534711964549483, |
|
"grad_norm": 8.942655459138996, |
|
"learning_rate": 9.170615825429502e-05, |
|
"loss": 0.3742, |
|
"num_input_tokens_seen": 15935688, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.4549483013293944, |
|
"grad_norm": 29.75871004929942, |
|
"learning_rate": 9.163867136653327e-05, |
|
"loss": 0.34, |
|
"num_input_tokens_seen": 15987800, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.4564254062038405, |
|
"grad_norm": 2.7907856781115816, |
|
"learning_rate": 9.157093606287572e-05, |
|
"loss": 0.3389, |
|
"num_input_tokens_seen": 16039472, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.45790251107828656, |
|
"grad_norm": 17.658376604188195, |
|
"learning_rate": 9.150295274743053e-05, |
|
"loss": 0.3588, |
|
"num_input_tokens_seen": 16091960, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.45790251107828656, |
|
"eval_loss": 0.28714123368263245, |
|
"eval_runtime": 19.0565, |
|
"eval_samples_per_second": 3.149, |
|
"eval_steps_per_second": 0.787, |
|
"num_input_tokens_seen": 16091960, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.45937961595273263, |
|
"grad_norm": 2.40581004555412, |
|
"learning_rate": 9.143472182578547e-05, |
|
"loss": 0.3501, |
|
"num_input_tokens_seen": 16143672, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.4608567208271787, |
|
"grad_norm": 4.013146156526438, |
|
"learning_rate": 9.136624370500554e-05, |
|
"loss": 0.2684, |
|
"num_input_tokens_seen": 16195776, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.4623338257016248, |
|
"grad_norm": 11.075749009255755, |
|
"learning_rate": 9.129751879363052e-05, |
|
"loss": 0.3294, |
|
"num_input_tokens_seen": 16247752, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.4638109305760709, |
|
"grad_norm": 1.5049575714077101, |
|
"learning_rate": 9.122854750167254e-05, |
|
"loss": 0.2906, |
|
"num_input_tokens_seen": 16300680, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.465288035450517, |
|
"grad_norm": 4.448041589727679, |
|
"learning_rate": 9.115933024061365e-05, |
|
"loss": 0.3498, |
|
"num_input_tokens_seen": 16352000, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4667651403249631, |
|
"grad_norm": 9.574179858800763, |
|
"learning_rate": 9.108986742340331e-05, |
|
"loss": 0.3262, |
|
"num_input_tokens_seen": 16403784, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.46824224519940916, |
|
"grad_norm": 5.326539404357799, |
|
"learning_rate": 9.102015946445601e-05, |
|
"loss": 0.318, |
|
"num_input_tokens_seen": 16455080, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.46971935007385524, |
|
"grad_norm": 17.72769299345548, |
|
"learning_rate": 9.095020677964874e-05, |
|
"loss": 0.3257, |
|
"num_input_tokens_seen": 16507712, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.4711964549483013, |
|
"grad_norm": 6.912966772855359, |
|
"learning_rate": 9.08800097863185e-05, |
|
"loss": 0.3253, |
|
"num_input_tokens_seen": 16559392, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.4726735598227474, |
|
"grad_norm": 3.3138006639125344, |
|
"learning_rate": 9.080956890325985e-05, |
|
"loss": 0.3879, |
|
"num_input_tokens_seen": 16609960, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4726735598227474, |
|
"eval_loss": 0.3135533034801483, |
|
"eval_runtime": 19.3013, |
|
"eval_samples_per_second": 3.109, |
|
"eval_steps_per_second": 0.777, |
|
"num_input_tokens_seen": 16609960, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4741506646971935, |
|
"grad_norm": 11.101854016846067, |
|
"learning_rate": 9.07388845507224e-05, |
|
"loss": 0.3644, |
|
"num_input_tokens_seen": 16661440, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.4756277695716396, |
|
"grad_norm": 12.638527847469565, |
|
"learning_rate": 9.066795715040825e-05, |
|
"loss": 0.2733, |
|
"num_input_tokens_seen": 16714200, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.4771048744460857, |
|
"grad_norm": 2.7966971298188694, |
|
"learning_rate": 9.059678712546963e-05, |
|
"loss": 0.3063, |
|
"num_input_tokens_seen": 16766904, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.47858197932053176, |
|
"grad_norm": 8.210735089414575, |
|
"learning_rate": 9.052537490050614e-05, |
|
"loss": 0.3769, |
|
"num_input_tokens_seen": 16818168, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.48005908419497784, |
|
"grad_norm": 8.942406690882898, |
|
"learning_rate": 9.045372090156243e-05, |
|
"loss": 0.3089, |
|
"num_input_tokens_seen": 16869952, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.4815361890694239, |
|
"grad_norm": 11.68569569670719, |
|
"learning_rate": 9.038182555612551e-05, |
|
"loss": 0.2953, |
|
"num_input_tokens_seen": 16922608, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.48301329394387, |
|
"grad_norm": 17.89504289247946, |
|
"learning_rate": 9.030968929312231e-05, |
|
"loss": 0.3286, |
|
"num_input_tokens_seen": 16974824, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.4844903988183161, |
|
"grad_norm": 1.1612341646192372, |
|
"learning_rate": 9.023731254291705e-05, |
|
"loss": 0.3552, |
|
"num_input_tokens_seen": 17026088, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.4859675036927622, |
|
"grad_norm": 4.624302477495485, |
|
"learning_rate": 9.016469573730869e-05, |
|
"loss": 0.326, |
|
"num_input_tokens_seen": 17077904, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.4874446085672083, |
|
"grad_norm": 1.126074328380494, |
|
"learning_rate": 9.009183930952836e-05, |
|
"loss": 0.2698, |
|
"num_input_tokens_seen": 17130896, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.4874446085672083, |
|
"eval_loss": 0.40201568603515625, |
|
"eval_runtime": 19.4814, |
|
"eval_samples_per_second": 3.08, |
|
"eval_steps_per_second": 0.77, |
|
"num_input_tokens_seen": 17130896, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.48892171344165436, |
|
"grad_norm": 0.9908097045731693, |
|
"learning_rate": 9.00187436942368e-05, |
|
"loss": 0.2956, |
|
"num_input_tokens_seen": 17182896, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.49039881831610044, |
|
"grad_norm": 2.7463420963023415, |
|
"learning_rate": 8.994540932752167e-05, |
|
"loss": 0.3027, |
|
"num_input_tokens_seen": 17235552, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.4918759231905465, |
|
"grad_norm": 12.1024464355288, |
|
"learning_rate": 8.987183664689511e-05, |
|
"loss": 0.3295, |
|
"num_input_tokens_seen": 17286816, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.4933530280649926, |
|
"grad_norm": 1.7833060218575738, |
|
"learning_rate": 8.9798026091291e-05, |
|
"loss": 0.3201, |
|
"num_input_tokens_seen": 17339072, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.4948301329394387, |
|
"grad_norm": 1.7534258589471414, |
|
"learning_rate": 8.972397810106235e-05, |
|
"loss": 0.3044, |
|
"num_input_tokens_seen": 17391288, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.4963072378138848, |
|
"grad_norm": 3.562194425832391, |
|
"learning_rate": 8.964969311797871e-05, |
|
"loss": 0.2781, |
|
"num_input_tokens_seen": 17443456, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.4977843426883309, |
|
"grad_norm": 3.1653143619735484, |
|
"learning_rate": 8.957517158522359e-05, |
|
"loss": 0.423, |
|
"num_input_tokens_seen": 17494832, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.49926144756277696, |
|
"grad_norm": 9.755061601968617, |
|
"learning_rate": 8.950041394739168e-05, |
|
"loss": 0.2747, |
|
"num_input_tokens_seen": 17547384, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.5007385524372231, |
|
"grad_norm": 6.30268931800531, |
|
"learning_rate": 8.942542065048632e-05, |
|
"loss": 0.3162, |
|
"num_input_tokens_seen": 17599120, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.5022156573116692, |
|
"grad_norm": 16.220873737158193, |
|
"learning_rate": 8.935019214191672e-05, |
|
"loss": 0.3904, |
|
"num_input_tokens_seen": 17650984, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5022156573116692, |
|
"eval_loss": 0.3297054171562195, |
|
"eval_runtime": 47.9781, |
|
"eval_samples_per_second": 1.251, |
|
"eval_steps_per_second": 0.313, |
|
"num_input_tokens_seen": 17650984, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5036927621861153, |
|
"grad_norm": 12.859842090444923, |
|
"learning_rate": 8.927472887049545e-05, |
|
"loss": 0.3484, |
|
"num_input_tokens_seen": 17702864, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.5051698670605613, |
|
"grad_norm": 1.974808718225815, |
|
"learning_rate": 8.919903128643563e-05, |
|
"loss": 0.3214, |
|
"num_input_tokens_seen": 17754792, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5066469719350074, |
|
"grad_norm": 2.157888572043865, |
|
"learning_rate": 8.912309984134825e-05, |
|
"loss": 0.3428, |
|
"num_input_tokens_seen": 17805608, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.5081240768094535, |
|
"grad_norm": 1.7029183906768783, |
|
"learning_rate": 8.90469349882396e-05, |
|
"loss": 0.3186, |
|
"num_input_tokens_seen": 17857304, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.5096011816838996, |
|
"grad_norm": 1.6160566937167244, |
|
"learning_rate": 8.897053718150838e-05, |
|
"loss": 0.2549, |
|
"num_input_tokens_seen": 17909528, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.5110782865583456, |
|
"grad_norm": 1.3191512067150797, |
|
"learning_rate": 8.889390687694317e-05, |
|
"loss": 0.3248, |
|
"num_input_tokens_seen": 17960976, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5125553914327917, |
|
"grad_norm": 4.7911741928611065, |
|
"learning_rate": 8.88170445317196e-05, |
|
"loss": 0.3234, |
|
"num_input_tokens_seen": 18013008, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.5140324963072378, |
|
"grad_norm": 2.1869693250421514, |
|
"learning_rate": 8.873995060439764e-05, |
|
"loss": 0.2883, |
|
"num_input_tokens_seen": 18065200, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.5155096011816839, |
|
"grad_norm": 2.878830043797237, |
|
"learning_rate": 8.86626255549189e-05, |
|
"loss": 0.2691, |
|
"num_input_tokens_seen": 18116880, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.51698670605613, |
|
"grad_norm": 13.298568339033883, |
|
"learning_rate": 8.858506984460383e-05, |
|
"loss": 0.3173, |
|
"num_input_tokens_seen": 18169344, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.51698670605613, |
|
"eval_loss": 0.44908422231674194, |
|
"eval_runtime": 19.4293, |
|
"eval_samples_per_second": 3.088, |
|
"eval_steps_per_second": 0.772, |
|
"num_input_tokens_seen": 18169344, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.518463810930576, |
|
"grad_norm": 9.533140259257635, |
|
"learning_rate": 8.850728393614902e-05, |
|
"loss": 0.2966, |
|
"num_input_tokens_seen": 18221144, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.5199409158050221, |
|
"grad_norm": 1.1902233180287396, |
|
"learning_rate": 8.842926829362446e-05, |
|
"loss": 0.3101, |
|
"num_input_tokens_seen": 18272752, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5214180206794683, |
|
"grad_norm": 5.4464127921059635, |
|
"learning_rate": 8.835102338247064e-05, |
|
"loss": 0.2545, |
|
"num_input_tokens_seen": 18325888, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.5228951255539144, |
|
"grad_norm": 4.908248580861331, |
|
"learning_rate": 8.827254966949593e-05, |
|
"loss": 0.3223, |
|
"num_input_tokens_seen": 18378016, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5243722304283605, |
|
"grad_norm": 8.878082941549529, |
|
"learning_rate": 8.819384762287373e-05, |
|
"loss": 0.2714, |
|
"num_input_tokens_seen": 18431240, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.5258493353028065, |
|
"grad_norm": 1.3922104834090385, |
|
"learning_rate": 8.811491771213964e-05, |
|
"loss": 0.3438, |
|
"num_input_tokens_seen": 18482832, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5273264401772526, |
|
"grad_norm": 6.15513486850916, |
|
"learning_rate": 8.803576040818873e-05, |
|
"loss": 0.2324, |
|
"num_input_tokens_seen": 18534992, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.5288035450516987, |
|
"grad_norm": 5.349889126448278, |
|
"learning_rate": 8.795637618327269e-05, |
|
"loss": 0.2259, |
|
"num_input_tokens_seen": 18587752, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.5302806499261448, |
|
"grad_norm": 11.727285406275136, |
|
"learning_rate": 8.7876765510997e-05, |
|
"loss": 0.2468, |
|
"num_input_tokens_seen": 18640440, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.5317577548005908, |
|
"grad_norm": 2.562809564221474, |
|
"learning_rate": 8.779692886631812e-05, |
|
"loss": 0.3127, |
|
"num_input_tokens_seen": 18691928, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5317577548005908, |
|
"eval_loss": 0.3499237596988678, |
|
"eval_runtime": 19.3376, |
|
"eval_samples_per_second": 3.103, |
|
"eval_steps_per_second": 0.776, |
|
"num_input_tokens_seen": 18691928, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5332348596750369, |
|
"grad_norm": 2.1114949572134596, |
|
"learning_rate": 8.771686672554067e-05, |
|
"loss": 0.3145, |
|
"num_input_tokens_seen": 18743600, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.534711964549483, |
|
"grad_norm": 2.661731170516522, |
|
"learning_rate": 8.763657956631462e-05, |
|
"loss": 0.3207, |
|
"num_input_tokens_seen": 18794920, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.5361890694239291, |
|
"grad_norm": 10.01115206187178, |
|
"learning_rate": 8.75560678676323e-05, |
|
"loss": 0.2463, |
|
"num_input_tokens_seen": 18847600, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.5376661742983752, |
|
"grad_norm": 2.5996909028595963, |
|
"learning_rate": 8.747533210982575e-05, |
|
"loss": 0.3199, |
|
"num_input_tokens_seen": 18899936, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5391432791728212, |
|
"grad_norm": 1.5344678453995588, |
|
"learning_rate": 8.739437277456366e-05, |
|
"loss": 0.282, |
|
"num_input_tokens_seen": 18952336, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.5406203840472673, |
|
"grad_norm": 1.0236614886630173, |
|
"learning_rate": 8.731319034484862e-05, |
|
"loss": 0.2846, |
|
"num_input_tokens_seen": 19004528, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5420974889217134, |
|
"grad_norm": 2.459552261348287, |
|
"learning_rate": 8.723178530501417e-05, |
|
"loss": 0.3537, |
|
"num_input_tokens_seen": 19056296, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.5435745937961596, |
|
"grad_norm": 6.813862731810714, |
|
"learning_rate": 8.7150158140722e-05, |
|
"loss": 0.3289, |
|
"num_input_tokens_seen": 19108832, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.5450516986706057, |
|
"grad_norm": 9.828132924879462, |
|
"learning_rate": 8.706830933895894e-05, |
|
"loss": 0.3094, |
|
"num_input_tokens_seen": 19160960, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.5465288035450517, |
|
"grad_norm": 1.3290993068913195, |
|
"learning_rate": 8.698623938803409e-05, |
|
"loss": 0.2828, |
|
"num_input_tokens_seen": 19212992, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5465288035450517, |
|
"eval_loss": 0.3780718147754669, |
|
"eval_runtime": 19.3274, |
|
"eval_samples_per_second": 3.104, |
|
"eval_steps_per_second": 0.776, |
|
"num_input_tokens_seen": 19212992, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5480059084194978, |
|
"grad_norm": 2.1847334280581436, |
|
"learning_rate": 8.6903948777576e-05, |
|
"loss": 0.2084, |
|
"num_input_tokens_seen": 19266312, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.5494830132939439, |
|
"grad_norm": 16.345570907477917, |
|
"learning_rate": 8.68214379985296e-05, |
|
"loss": 0.3262, |
|
"num_input_tokens_seen": 19318888, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.55096011816839, |
|
"grad_norm": 5.031283231514028, |
|
"learning_rate": 8.673870754315336e-05, |
|
"loss": 0.3074, |
|
"num_input_tokens_seen": 19371352, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.552437223042836, |
|
"grad_norm": 7.773374329248698, |
|
"learning_rate": 8.665575790501639e-05, |
|
"loss": 0.2795, |
|
"num_input_tokens_seen": 19424160, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.5539143279172821, |
|
"grad_norm": 19.56194874042025, |
|
"learning_rate": 8.657258957899535e-05, |
|
"loss": 0.2396, |
|
"num_input_tokens_seen": 19477720, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.5553914327917282, |
|
"grad_norm": 1.2813519038705325, |
|
"learning_rate": 8.648920306127169e-05, |
|
"loss": 0.302, |
|
"num_input_tokens_seen": 19528512, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.5568685376661743, |
|
"grad_norm": 19.876066610019436, |
|
"learning_rate": 8.640559884932848e-05, |
|
"loss": 0.3457, |
|
"num_input_tokens_seen": 19579624, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.5583456425406204, |
|
"grad_norm": 2.133717936227167, |
|
"learning_rate": 8.632177744194765e-05, |
|
"loss": 0.3194, |
|
"num_input_tokens_seen": 19631432, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.5598227474150664, |
|
"grad_norm": 27.424806916466594, |
|
"learning_rate": 8.623773933920688e-05, |
|
"loss": 0.3135, |
|
"num_input_tokens_seen": 19682792, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.5612998522895125, |
|
"grad_norm": 12.744054056352415, |
|
"learning_rate": 8.615348504247663e-05, |
|
"loss": 0.306, |
|
"num_input_tokens_seen": 19735976, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5612998522895125, |
|
"eval_loss": 0.37662214040756226, |
|
"eval_runtime": 19.3962, |
|
"eval_samples_per_second": 3.093, |
|
"eval_steps_per_second": 0.773, |
|
"num_input_tokens_seen": 19735976, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5627769571639586, |
|
"grad_norm": 7.406464018339625, |
|
"learning_rate": 8.606901505441718e-05, |
|
"loss": 0.3128, |
|
"num_input_tokens_seen": 19787504, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.5642540620384048, |
|
"grad_norm": 2.9136695732910467, |
|
"learning_rate": 8.598432987897565e-05, |
|
"loss": 0.2711, |
|
"num_input_tokens_seen": 19839104, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.5657311669128509, |
|
"grad_norm": 16.37365104368587, |
|
"learning_rate": 8.589943002138295e-05, |
|
"loss": 0.3735, |
|
"num_input_tokens_seen": 19891064, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.5672082717872969, |
|
"grad_norm": 12.090734435687315, |
|
"learning_rate": 8.581431598815077e-05, |
|
"loss": 0.29, |
|
"num_input_tokens_seen": 19942368, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.568685376661743, |
|
"grad_norm": 2.098717687702263, |
|
"learning_rate": 8.572898828706857e-05, |
|
"loss": 0.2977, |
|
"num_input_tokens_seen": 19994816, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.5701624815361891, |
|
"grad_norm": 8.108786754862253, |
|
"learning_rate": 8.564344742720059e-05, |
|
"loss": 0.2483, |
|
"num_input_tokens_seen": 20046192, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.5716395864106352, |
|
"grad_norm": 32.34891985769585, |
|
"learning_rate": 8.55576939188827e-05, |
|
"loss": 0.3126, |
|
"num_input_tokens_seen": 20097328, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.5731166912850812, |
|
"grad_norm": 10.903589696110263, |
|
"learning_rate": 8.54717282737195e-05, |
|
"loss": 0.2688, |
|
"num_input_tokens_seen": 20149392, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.5745937961595273, |
|
"grad_norm": 1.6351631003571967, |
|
"learning_rate": 8.538555100458114e-05, |
|
"loss": 0.2491, |
|
"num_input_tokens_seen": 20201392, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.5760709010339734, |
|
"grad_norm": 13.847094678250938, |
|
"learning_rate": 8.529916262560038e-05, |
|
"loss": 0.2992, |
|
"num_input_tokens_seen": 20253288, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5760709010339734, |
|
"eval_loss": 0.3468088209629059, |
|
"eval_runtime": 19.3874, |
|
"eval_samples_per_second": 3.095, |
|
"eval_steps_per_second": 0.774, |
|
"num_input_tokens_seen": 20253288, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.5775480059084195, |
|
"grad_norm": 1.1014534289470133, |
|
"learning_rate": 8.521256365216941e-05, |
|
"loss": 0.2505, |
|
"num_input_tokens_seen": 20305536, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.5790251107828656, |
|
"grad_norm": 2.0823493015928736, |
|
"learning_rate": 8.512575460093683e-05, |
|
"loss": 0.2487, |
|
"num_input_tokens_seen": 20357912, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.5805022156573116, |
|
"grad_norm": 27.36036946104022, |
|
"learning_rate": 8.503873598980456e-05, |
|
"loss": 0.3441, |
|
"num_input_tokens_seen": 20409624, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.5819793205317577, |
|
"grad_norm": 2.332187139346983, |
|
"learning_rate": 8.495150833792478e-05, |
|
"loss": 0.2973, |
|
"num_input_tokens_seen": 20461080, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.5834564254062038, |
|
"grad_norm": 6.745585839667694, |
|
"learning_rate": 8.486407216569678e-05, |
|
"loss": 0.316, |
|
"num_input_tokens_seen": 20512000, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.5849335302806499, |
|
"grad_norm": 1.85548504039152, |
|
"learning_rate": 8.477642799476387e-05, |
|
"loss": 0.338, |
|
"num_input_tokens_seen": 20563824, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.5864106351550961, |
|
"grad_norm": 1.7632477883915527, |
|
"learning_rate": 8.468857634801033e-05, |
|
"loss": 0.2472, |
|
"num_input_tokens_seen": 20615944, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.5878877400295421, |
|
"grad_norm": 1.4299099445196912, |
|
"learning_rate": 8.460051774955818e-05, |
|
"loss": 0.3045, |
|
"num_input_tokens_seen": 20666720, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.5893648449039882, |
|
"grad_norm": 1.6846270262315524, |
|
"learning_rate": 8.451225272476412e-05, |
|
"loss": 0.2606, |
|
"num_input_tokens_seen": 20718504, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.5908419497784343, |
|
"grad_norm": 3.1983448844727196, |
|
"learning_rate": 8.442378180021644e-05, |
|
"loss": 0.2341, |
|
"num_input_tokens_seen": 20770728, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5908419497784343, |
|
"eval_loss": 0.3366144299507141, |
|
"eval_runtime": 19.2317, |
|
"eval_samples_per_second": 3.12, |
|
"eval_steps_per_second": 0.78, |
|
"num_input_tokens_seen": 20770728, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5923190546528804, |
|
"grad_norm": 2.085951928012366, |
|
"learning_rate": 8.433510550373175e-05, |
|
"loss": 0.2844, |
|
"num_input_tokens_seen": 20823136, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.5937961595273265, |
|
"grad_norm": 1.3563763759741745, |
|
"learning_rate": 8.424622436435199e-05, |
|
"loss": 0.2786, |
|
"num_input_tokens_seen": 20875080, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.5952732644017725, |
|
"grad_norm": 1.9044297891885509, |
|
"learning_rate": 8.41571389123411e-05, |
|
"loss": 0.2647, |
|
"num_input_tokens_seen": 20927584, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.5967503692762186, |
|
"grad_norm": 16.58759330144968, |
|
"learning_rate": 8.406784967918203e-05, |
|
"loss": 0.2673, |
|
"num_input_tokens_seen": 20980640, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.5982274741506647, |
|
"grad_norm": 3.4967962357260673, |
|
"learning_rate": 8.397835719757343e-05, |
|
"loss": 0.2973, |
|
"num_input_tokens_seen": 21033272, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.5997045790251108, |
|
"grad_norm": 15.74237531026009, |
|
"learning_rate": 8.388866200142656e-05, |
|
"loss": 0.302, |
|
"num_input_tokens_seen": 21086032, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.6011816838995568, |
|
"grad_norm": 1.5694876926785786, |
|
"learning_rate": 8.379876462586203e-05, |
|
"loss": 0.2758, |
|
"num_input_tokens_seen": 21137800, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.6026587887740029, |
|
"grad_norm": 2.5665191340339018, |
|
"learning_rate": 8.370866560720671e-05, |
|
"loss": 0.2687, |
|
"num_input_tokens_seen": 21189264, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.604135893648449, |
|
"grad_norm": 6.275127746938625, |
|
"learning_rate": 8.361836548299045e-05, |
|
"loss": 0.295, |
|
"num_input_tokens_seen": 21240184, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.6056129985228951, |
|
"grad_norm": 17.85896449030129, |
|
"learning_rate": 8.352786479194288e-05, |
|
"loss": 0.2931, |
|
"num_input_tokens_seen": 21291664, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.6056129985228951, |
|
"eval_loss": 0.33863261342048645, |
|
"eval_runtime": 19.3289, |
|
"eval_samples_per_second": 3.104, |
|
"eval_steps_per_second": 0.776, |
|
"num_input_tokens_seen": 21291664, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.6070901033973413, |
|
"grad_norm": 1.4664266327354816, |
|
"learning_rate": 8.343716407399019e-05, |
|
"loss": 0.2338, |
|
"num_input_tokens_seen": 21344232, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.6085672082717873, |
|
"grad_norm": 1.440796404258189, |
|
"learning_rate": 8.334626387025197e-05, |
|
"loss": 0.3026, |
|
"num_input_tokens_seen": 21396160, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.6100443131462334, |
|
"grad_norm": 3.112422612662143, |
|
"learning_rate": 8.325516472303792e-05, |
|
"loss": 0.2898, |
|
"num_input_tokens_seen": 21448032, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.6115214180206795, |
|
"grad_norm": 7.791629500233522, |
|
"learning_rate": 8.316386717584463e-05, |
|
"loss": 0.3265, |
|
"num_input_tokens_seen": 21499144, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.6129985228951256, |
|
"grad_norm": 7.664415048315268, |
|
"learning_rate": 8.307237177335239e-05, |
|
"loss": 0.2513, |
|
"num_input_tokens_seen": 21551328, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.6144756277695717, |
|
"grad_norm": 7.353575088308837, |
|
"learning_rate": 8.298067906142182e-05, |
|
"loss": 0.2864, |
|
"num_input_tokens_seen": 21603800, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.6159527326440177, |
|
"grad_norm": 6.877908422150131, |
|
"learning_rate": 8.288878958709072e-05, |
|
"loss": 0.243, |
|
"num_input_tokens_seen": 21656480, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.6174298375184638, |
|
"grad_norm": 7.551272364447902, |
|
"learning_rate": 8.279670389857079e-05, |
|
"loss": 0.2711, |
|
"num_input_tokens_seen": 21708824, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.6189069423929099, |
|
"grad_norm": 1.5109531594598573, |
|
"learning_rate": 8.27044225452443e-05, |
|
"loss": 0.2475, |
|
"num_input_tokens_seen": 21760744, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.620384047267356, |
|
"grad_norm": 10.98724867961312, |
|
"learning_rate": 8.26119460776609e-05, |
|
"loss": 0.1826, |
|
"num_input_tokens_seen": 21813984, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.620384047267356, |
|
"eval_loss": 0.5386325716972351, |
|
"eval_runtime": 19.2817, |
|
"eval_samples_per_second": 3.112, |
|
"eval_steps_per_second": 0.778, |
|
"num_input_tokens_seen": 21813984, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.621861152141802, |
|
"grad_norm": 11.844781107327274, |
|
"learning_rate": 8.251927504753426e-05, |
|
"loss": 0.3018, |
|
"num_input_tokens_seen": 21865304, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.6233382570162481, |
|
"grad_norm": 1.4332555229426183, |
|
"learning_rate": 8.24264100077388e-05, |
|
"loss": 0.2725, |
|
"num_input_tokens_seen": 21918568, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.6248153618906942, |
|
"grad_norm": 16.587502798944953, |
|
"learning_rate": 8.233335151230646e-05, |
|
"loss": 0.3103, |
|
"num_input_tokens_seen": 21970352, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.6262924667651403, |
|
"grad_norm": 2.1746923737074333, |
|
"learning_rate": 8.224010011642326e-05, |
|
"loss": 0.2868, |
|
"num_input_tokens_seen": 22021312, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.6277695716395865, |
|
"grad_norm": 20.536802693642485, |
|
"learning_rate": 8.21466563764261e-05, |
|
"loss": 0.335, |
|
"num_input_tokens_seen": 22073496, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.6292466765140325, |
|
"grad_norm": 2.23347350807466, |
|
"learning_rate": 8.205302084979937e-05, |
|
"loss": 0.3764, |
|
"num_input_tokens_seen": 22124088, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6307237813884786, |
|
"grad_norm": 1.4129621191965986, |
|
"learning_rate": 8.19591940951717e-05, |
|
"loss": 0.2681, |
|
"num_input_tokens_seen": 22175824, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.6322008862629247, |
|
"grad_norm": 1.1767190413631534, |
|
"learning_rate": 8.186517667231259e-05, |
|
"loss": 0.2619, |
|
"num_input_tokens_seen": 22227376, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.6336779911373708, |
|
"grad_norm": 1.5721543022398727, |
|
"learning_rate": 8.1770969142129e-05, |
|
"loss": 0.2516, |
|
"num_input_tokens_seen": 22279928, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.6351550960118169, |
|
"grad_norm": 1.1838050704992382, |
|
"learning_rate": 8.167657206666217e-05, |
|
"loss": 0.2387, |
|
"num_input_tokens_seen": 22332144, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6351550960118169, |
|
"eval_loss": 0.25809118151664734, |
|
"eval_runtime": 19.3071, |
|
"eval_samples_per_second": 3.108, |
|
"eval_steps_per_second": 0.777, |
|
"num_input_tokens_seen": 22332144, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6366322008862629, |
|
"grad_norm": 48.075734297605706, |
|
"learning_rate": 8.158198600908405e-05, |
|
"loss": 0.2606, |
|
"num_input_tokens_seen": 22383912, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.638109305760709, |
|
"grad_norm": 12.349268472522956, |
|
"learning_rate": 8.148721153369411e-05, |
|
"loss": 0.2672, |
|
"num_input_tokens_seen": 22435504, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6395864106351551, |
|
"grad_norm": 17.499211985691577, |
|
"learning_rate": 8.139224920591598e-05, |
|
"loss": 0.2771, |
|
"num_input_tokens_seen": 22487696, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.6410635155096012, |
|
"grad_norm": 7.176310226710563, |
|
"learning_rate": 8.129709959229388e-05, |
|
"loss": 0.3018, |
|
"num_input_tokens_seen": 22539664, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.6425406203840472, |
|
"grad_norm": 22.989487671947256, |
|
"learning_rate": 8.120176326048949e-05, |
|
"loss": 0.312, |
|
"num_input_tokens_seen": 22592240, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.6440177252584933, |
|
"grad_norm": 2.0595093158376825, |
|
"learning_rate": 8.110624077927842e-05, |
|
"loss": 0.2413, |
|
"num_input_tokens_seen": 22643648, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.6454948301329394, |
|
"grad_norm": 1.0826681349485223, |
|
"learning_rate": 8.101053271854682e-05, |
|
"loss": 0.2585, |
|
"num_input_tokens_seen": 22695208, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.6469719350073855, |
|
"grad_norm": 1.351248688875387, |
|
"learning_rate": 8.091463964928801e-05, |
|
"loss": 0.2621, |
|
"num_input_tokens_seen": 22746896, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.6484490398818316, |
|
"grad_norm": 7.190623982268162, |
|
"learning_rate": 8.081856214359908e-05, |
|
"loss": 0.324, |
|
"num_input_tokens_seen": 22797936, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.6499261447562777, |
|
"grad_norm": 0.9587990520200799, |
|
"learning_rate": 8.072230077467748e-05, |
|
"loss": 0.2662, |
|
"num_input_tokens_seen": 22849552, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6499261447562777, |
|
"eval_loss": 0.48401138186454773, |
|
"eval_runtime": 19.182, |
|
"eval_samples_per_second": 3.128, |
|
"eval_steps_per_second": 0.782, |
|
"num_input_tokens_seen": 22849552, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6514032496307238, |
|
"grad_norm": 1.9654706291736397, |
|
"learning_rate": 8.062585611681758e-05, |
|
"loss": 0.3016, |
|
"num_input_tokens_seen": 22900184, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.6528803545051699, |
|
"grad_norm": 4.7148538513910285, |
|
"learning_rate": 8.052922874540722e-05, |
|
"loss": 0.2661, |
|
"num_input_tokens_seen": 22951816, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.654357459379616, |
|
"grad_norm": 15.154253210273424, |
|
"learning_rate": 8.043241923692436e-05, |
|
"loss": 0.2698, |
|
"num_input_tokens_seen": 23003952, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.6558345642540621, |
|
"grad_norm": 1.544288424619005, |
|
"learning_rate": 8.03354281689335e-05, |
|
"loss": 0.2683, |
|
"num_input_tokens_seen": 23056272, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.6573116691285081, |
|
"grad_norm": 1.7746096025671347, |
|
"learning_rate": 8.023825612008242e-05, |
|
"loss": 0.3237, |
|
"num_input_tokens_seen": 23106472, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.6587887740029542, |
|
"grad_norm": 8.974887056755184, |
|
"learning_rate": 8.014090367009859e-05, |
|
"loss": 0.2906, |
|
"num_input_tokens_seen": 23158064, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.6602658788774003, |
|
"grad_norm": 0.9042498724932153, |
|
"learning_rate": 8.004337139978574e-05, |
|
"loss": 0.2067, |
|
"num_input_tokens_seen": 23210584, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.6617429837518464, |
|
"grad_norm": 1.7463381173392456, |
|
"learning_rate": 7.994565989102042e-05, |
|
"loss": 0.2648, |
|
"num_input_tokens_seen": 23262864, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.6632200886262924, |
|
"grad_norm": 2.846956174007256, |
|
"learning_rate": 7.98477697267485e-05, |
|
"loss": 0.2496, |
|
"num_input_tokens_seen": 23314568, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.6646971935007385, |
|
"grad_norm": 4.665885044883532, |
|
"learning_rate": 7.974970149098174e-05, |
|
"loss": 0.2332, |
|
"num_input_tokens_seen": 23366784, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6646971935007385, |
|
"eval_loss": 0.4966147541999817, |
|
"eval_runtime": 19.1792, |
|
"eval_samples_per_second": 3.128, |
|
"eval_steps_per_second": 0.782, |
|
"num_input_tokens_seen": 23366784, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.6661742983751846, |
|
"grad_norm": 1.646844160605743, |
|
"learning_rate": 7.965145576879423e-05, |
|
"loss": 0.3135, |
|
"num_input_tokens_seen": 23418504, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.6676514032496307, |
|
"grad_norm": 4.433343767927195, |
|
"learning_rate": 7.955303314631898e-05, |
|
"loss": 0.3268, |
|
"num_input_tokens_seen": 23469840, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.6691285081240768, |
|
"grad_norm": 2.741861673744247, |
|
"learning_rate": 7.945443421074436e-05, |
|
"loss": 0.2706, |
|
"num_input_tokens_seen": 23521416, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.670605612998523, |
|
"grad_norm": 13.390920032881771, |
|
"learning_rate": 7.935565955031064e-05, |
|
"loss": 0.2044, |
|
"num_input_tokens_seen": 23573176, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.672082717872969, |
|
"grad_norm": 4.093699227168353, |
|
"learning_rate": 7.925670975430644e-05, |
|
"loss": 0.2724, |
|
"num_input_tokens_seen": 23625080, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.6735598227474151, |
|
"grad_norm": 1.060930943475367, |
|
"learning_rate": 7.915758541306523e-05, |
|
"loss": 0.2543, |
|
"num_input_tokens_seen": 23677096, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.6750369276218612, |
|
"grad_norm": 2.1755423246760515, |
|
"learning_rate": 7.90582871179619e-05, |
|
"loss": 0.2372, |
|
"num_input_tokens_seen": 23729168, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.6765140324963073, |
|
"grad_norm": 8.00546036386453, |
|
"learning_rate": 7.895881546140902e-05, |
|
"loss": 0.2695, |
|
"num_input_tokens_seen": 23780568, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.6779911373707533, |
|
"grad_norm": 7.074026968521074, |
|
"learning_rate": 7.885917103685353e-05, |
|
"loss": 0.3282, |
|
"num_input_tokens_seen": 23831360, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.6794682422451994, |
|
"grad_norm": 1.221854745565416, |
|
"learning_rate": 7.875935443877305e-05, |
|
"loss": 0.2481, |
|
"num_input_tokens_seen": 23883032, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6794682422451994, |
|
"eval_loss": 0.24180778861045837, |
|
"eval_runtime": 18.9575, |
|
"eval_samples_per_second": 3.165, |
|
"eval_steps_per_second": 0.791, |
|
"num_input_tokens_seen": 23883032, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6809453471196455, |
|
"grad_norm": 15.468575933907964, |
|
"learning_rate": 7.865936626267243e-05, |
|
"loss": 0.2474, |
|
"num_input_tokens_seen": 23934880, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.6824224519940916, |
|
"grad_norm": 3.9259269918244932, |
|
"learning_rate": 7.855920710508009e-05, |
|
"loss": 0.2613, |
|
"num_input_tokens_seen": 23986160, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.6838995568685377, |
|
"grad_norm": 1.443222642921112, |
|
"learning_rate": 7.845887756354458e-05, |
|
"loss": 0.1957, |
|
"num_input_tokens_seen": 24038984, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.6853766617429837, |
|
"grad_norm": 2.418773014069429, |
|
"learning_rate": 7.835837823663092e-05, |
|
"loss": 0.2709, |
|
"num_input_tokens_seen": 24090648, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.6868537666174298, |
|
"grad_norm": 1.2678547091245644, |
|
"learning_rate": 7.825770972391712e-05, |
|
"loss": 0.2873, |
|
"num_input_tokens_seen": 24142200, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.6883308714918759, |
|
"grad_norm": 1.5899740505737827, |
|
"learning_rate": 7.81568726259905e-05, |
|
"loss": 0.2378, |
|
"num_input_tokens_seen": 24194400, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.689807976366322, |
|
"grad_norm": 1.6209911075093861, |
|
"learning_rate": 7.805586754444416e-05, |
|
"loss": 0.2615, |
|
"num_input_tokens_seen": 24245328, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.691285081240768, |
|
"grad_norm": 16.06254938606901, |
|
"learning_rate": 7.795469508187343e-05, |
|
"loss": 0.2234, |
|
"num_input_tokens_seen": 24297400, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.6927621861152142, |
|
"grad_norm": 2.2353075576651085, |
|
"learning_rate": 7.785335584187219e-05, |
|
"loss": 0.2833, |
|
"num_input_tokens_seen": 24348536, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.6942392909896603, |
|
"grad_norm": 11.321777956495533, |
|
"learning_rate": 7.775185042902933e-05, |
|
"loss": 0.2313, |
|
"num_input_tokens_seen": 24401256, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6942392909896603, |
|
"eval_loss": 0.1869634985923767, |
|
"eval_runtime": 19.499, |
|
"eval_samples_per_second": 3.077, |
|
"eval_steps_per_second": 0.769, |
|
"num_input_tokens_seen": 24401256, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.6957163958641064, |
|
"grad_norm": 1.2627287884293426, |
|
"learning_rate": 7.765017944892514e-05, |
|
"loss": 0.2499, |
|
"num_input_tokens_seen": 24453384, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.6971935007385525, |
|
"grad_norm": 1.476393664852338, |
|
"learning_rate": 7.754834350812765e-05, |
|
"loss": 0.2132, |
|
"num_input_tokens_seen": 24505960, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.6986706056129985, |
|
"grad_norm": 3.9395522365245226, |
|
"learning_rate": 7.744634321418906e-05, |
|
"loss": 0.2049, |
|
"num_input_tokens_seen": 24559008, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.7001477104874446, |
|
"grad_norm": 2.1895254551218115, |
|
"learning_rate": 7.734417917564211e-05, |
|
"loss": 0.2222, |
|
"num_input_tokens_seen": 24611128, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.7016248153618907, |
|
"grad_norm": 2.3733905521598353, |
|
"learning_rate": 7.724185200199643e-05, |
|
"loss": 0.2678, |
|
"num_input_tokens_seen": 24662336, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.7031019202363368, |
|
"grad_norm": 6.930201726148201, |
|
"learning_rate": 7.713936230373491e-05, |
|
"loss": 0.2888, |
|
"num_input_tokens_seen": 24714032, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.7045790251107829, |
|
"grad_norm": 1.2305357870388025, |
|
"learning_rate": 7.703671069231007e-05, |
|
"loss": 0.2609, |
|
"num_input_tokens_seen": 24765296, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.7060561299852289, |
|
"grad_norm": 1.3830979392967684, |
|
"learning_rate": 7.693389778014037e-05, |
|
"loss": 0.2654, |
|
"num_input_tokens_seen": 24816744, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.707533234859675, |
|
"grad_norm": 1.641476112098168, |
|
"learning_rate": 7.683092418060664e-05, |
|
"loss": 0.2231, |
|
"num_input_tokens_seen": 24869320, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.7090103397341211, |
|
"grad_norm": 9.651892844655563, |
|
"learning_rate": 7.672779050804834e-05, |
|
"loss": 0.262, |
|
"num_input_tokens_seen": 24921872, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7090103397341211, |
|
"eval_loss": 0.34713664650917053, |
|
"eval_runtime": 18.9635, |
|
"eval_samples_per_second": 3.164, |
|
"eval_steps_per_second": 0.791, |
|
"num_input_tokens_seen": 24921872, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7104874446085672, |
|
"grad_norm": 1.761600275948819, |
|
"learning_rate": 7.662449737775991e-05, |
|
"loss": 0.2704, |
|
"num_input_tokens_seen": 24973200, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.7119645494830132, |
|
"grad_norm": 1.9815653528675592, |
|
"learning_rate": 7.652104540598712e-05, |
|
"loss": 0.2792, |
|
"num_input_tokens_seen": 25024168, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.7134416543574594, |
|
"grad_norm": 8.315884146956419, |
|
"learning_rate": 7.641743520992343e-05, |
|
"loss": 0.2798, |
|
"num_input_tokens_seen": 25075704, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.7149187592319055, |
|
"grad_norm": 0.7729904952291985, |
|
"learning_rate": 7.631366740770622e-05, |
|
"loss": 0.2411, |
|
"num_input_tokens_seen": 25128264, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.7163958641063516, |
|
"grad_norm": 2.9686769656164547, |
|
"learning_rate": 7.620974261841314e-05, |
|
"loss": 0.2204, |
|
"num_input_tokens_seen": 25180080, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.7178729689807977, |
|
"grad_norm": 1.4885611311756322, |
|
"learning_rate": 7.610566146205846e-05, |
|
"loss": 0.2295, |
|
"num_input_tokens_seen": 25232312, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7193500738552437, |
|
"grad_norm": 17.482084035542726, |
|
"learning_rate": 7.60014245595893e-05, |
|
"loss": 0.3177, |
|
"num_input_tokens_seen": 25283688, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.7208271787296898, |
|
"grad_norm": 2.0884881532897253, |
|
"learning_rate": 7.589703253288196e-05, |
|
"loss": 0.2606, |
|
"num_input_tokens_seen": 25335656, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.7223042836041359, |
|
"grad_norm": 2.1877676236194934, |
|
"learning_rate": 7.579248600473827e-05, |
|
"loss": 0.2406, |
|
"num_input_tokens_seen": 25387752, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.723781388478582, |
|
"grad_norm": 1.04372356426487, |
|
"learning_rate": 7.568778559888173e-05, |
|
"loss": 0.2412, |
|
"num_input_tokens_seen": 25439896, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.723781388478582, |
|
"eval_loss": 0.34561124444007874, |
|
"eval_runtime": 19.0832, |
|
"eval_samples_per_second": 3.144, |
|
"eval_steps_per_second": 0.786, |
|
"num_input_tokens_seen": 25439896, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.725258493353028, |
|
"grad_norm": 15.826053352114771, |
|
"learning_rate": 7.558293193995394e-05, |
|
"loss": 0.2752, |
|
"num_input_tokens_seen": 25491160, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.7267355982274741, |
|
"grad_norm": 1.401909903248283, |
|
"learning_rate": 7.547792565351075e-05, |
|
"loss": 0.2399, |
|
"num_input_tokens_seen": 25543152, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7282127031019202, |
|
"grad_norm": 0.8017657229162621, |
|
"learning_rate": 7.537276736601864e-05, |
|
"loss": 0.2351, |
|
"num_input_tokens_seen": 25595312, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.7296898079763663, |
|
"grad_norm": 1.4744204191665158, |
|
"learning_rate": 7.526745770485088e-05, |
|
"loss": 0.1837, |
|
"num_input_tokens_seen": 25648680, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.7311669128508124, |
|
"grad_norm": 1.2465650859091382, |
|
"learning_rate": 7.516199729828385e-05, |
|
"loss": 0.3093, |
|
"num_input_tokens_seen": 25701464, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.7326440177252584, |
|
"grad_norm": 1.880567862109699, |
|
"learning_rate": 7.505638677549327e-05, |
|
"loss": 0.223, |
|
"num_input_tokens_seen": 25753528, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.7341211225997046, |
|
"grad_norm": 0.6866693023387563, |
|
"learning_rate": 7.495062676655049e-05, |
|
"loss": 0.2128, |
|
"num_input_tokens_seen": 25805768, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.7355982274741507, |
|
"grad_norm": 1.1016249736691914, |
|
"learning_rate": 7.484471790241865e-05, |
|
"loss": 0.2703, |
|
"num_input_tokens_seen": 25856672, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.7370753323485968, |
|
"grad_norm": 11.244379366241214, |
|
"learning_rate": 7.473866081494896e-05, |
|
"loss": 0.2456, |
|
"num_input_tokens_seen": 25908544, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.7385524372230429, |
|
"grad_norm": 1.4228671338565775, |
|
"learning_rate": 7.463245613687695e-05, |
|
"loss": 0.2382, |
|
"num_input_tokens_seen": 25961056, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.7385524372230429, |
|
"eval_loss": 0.2542795240879059, |
|
"eval_runtime": 19.1344, |
|
"eval_samples_per_second": 3.136, |
|
"eval_steps_per_second": 0.784, |
|
"num_input_tokens_seen": 25961056, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.740029542097489, |
|
"grad_norm": 3.4127573468108934, |
|
"learning_rate": 7.452610450181865e-05, |
|
"loss": 0.2843, |
|
"num_input_tokens_seen": 26012232, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.741506646971935, |
|
"grad_norm": 1.8065383217954118, |
|
"learning_rate": 7.441960654426687e-05, |
|
"loss": 0.2376, |
|
"num_input_tokens_seen": 26064432, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.7429837518463811, |
|
"grad_norm": 1.038286812818806, |
|
"learning_rate": 7.431296289958735e-05, |
|
"loss": 0.2464, |
|
"num_input_tokens_seen": 26115856, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.7444608567208272, |
|
"grad_norm": 1.3908978510260759, |
|
"learning_rate": 7.4206174204015e-05, |
|
"loss": 0.2793, |
|
"num_input_tokens_seen": 26167176, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.7459379615952733, |
|
"grad_norm": 1.785011959334391, |
|
"learning_rate": 7.409924109465011e-05, |
|
"loss": 0.2141, |
|
"num_input_tokens_seen": 26219144, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.7474150664697193, |
|
"grad_norm": 1.0393762543520273, |
|
"learning_rate": 7.399216420945453e-05, |
|
"loss": 0.2137, |
|
"num_input_tokens_seen": 26271712, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.7488921713441654, |
|
"grad_norm": 5.12895726872253, |
|
"learning_rate": 7.388494418724789e-05, |
|
"loss": 0.2177, |
|
"num_input_tokens_seen": 26323656, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.7503692762186115, |
|
"grad_norm": 9.06455667620907, |
|
"learning_rate": 7.377758166770377e-05, |
|
"loss": 0.2762, |
|
"num_input_tokens_seen": 26375392, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.7518463810930576, |
|
"grad_norm": 1.4100384205361678, |
|
"learning_rate": 7.367007729134588e-05, |
|
"loss": 0.2794, |
|
"num_input_tokens_seen": 26426080, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.7533234859675036, |
|
"grad_norm": 0.9925649690798728, |
|
"learning_rate": 7.356243169954426e-05, |
|
"loss": 0.2364, |
|
"num_input_tokens_seen": 26477208, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7533234859675036, |
|
"eval_loss": 0.38712552189826965, |
|
"eval_runtime": 19.0707, |
|
"eval_samples_per_second": 3.146, |
|
"eval_steps_per_second": 0.787, |
|
"num_input_tokens_seen": 26477208, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.7548005908419497, |
|
"grad_norm": 9.286838829443976, |
|
"learning_rate": 7.34546455345114e-05, |
|
"loss": 0.2649, |
|
"num_input_tokens_seen": 26528824, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.7562776957163959, |
|
"grad_norm": 0.847615423645702, |
|
"learning_rate": 7.334671943929853e-05, |
|
"loss": 0.1834, |
|
"num_input_tokens_seen": 26581512, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.757754800590842, |
|
"grad_norm": 11.003109854541512, |
|
"learning_rate": 7.323865405779162e-05, |
|
"loss": 0.2539, |
|
"num_input_tokens_seen": 26633144, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.7592319054652881, |
|
"grad_norm": 1.2876198439398636, |
|
"learning_rate": 7.313045003470766e-05, |
|
"loss": 0.2592, |
|
"num_input_tokens_seen": 26684024, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.7607090103397341, |
|
"grad_norm": 1.1302004834491934, |
|
"learning_rate": 7.302210801559075e-05, |
|
"loss": 0.228, |
|
"num_input_tokens_seen": 26736512, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.7621861152141802, |
|
"grad_norm": 11.72346624813136, |
|
"learning_rate": 7.291362864680831e-05, |
|
"loss": 0.232, |
|
"num_input_tokens_seen": 26788656, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.7636632200886263, |
|
"grad_norm": 1.5381808841738496, |
|
"learning_rate": 7.280501257554716e-05, |
|
"loss": 0.2411, |
|
"num_input_tokens_seen": 26840856, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.7651403249630724, |
|
"grad_norm": 0.8088247020348738, |
|
"learning_rate": 7.269626044980968e-05, |
|
"loss": 0.2214, |
|
"num_input_tokens_seen": 26892840, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.7666174298375185, |
|
"grad_norm": 19.483946991905242, |
|
"learning_rate": 7.258737291841e-05, |
|
"loss": 0.236, |
|
"num_input_tokens_seen": 26945200, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.7680945347119645, |
|
"grad_norm": 1.070187466620262, |
|
"learning_rate": 7.247835063097e-05, |
|
"loss": 0.2082, |
|
"num_input_tokens_seen": 26997904, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7680945347119645, |
|
"eval_loss": 0.3406156003475189, |
|
"eval_runtime": 19.2307, |
|
"eval_samples_per_second": 3.12, |
|
"eval_steps_per_second": 0.78, |
|
"num_input_tokens_seen": 26997904, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7695716395864106, |
|
"grad_norm": 8.368958056782812, |
|
"learning_rate": 7.236919423791556e-05, |
|
"loss": 0.1983, |
|
"num_input_tokens_seen": 27050064, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.7710487444608567, |
|
"grad_norm": 45.77692642603359, |
|
"learning_rate": 7.225990439047264e-05, |
|
"loss": 0.2442, |
|
"num_input_tokens_seen": 27102096, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.7725258493353028, |
|
"grad_norm": 0.9661489018651128, |
|
"learning_rate": 7.215048174066337e-05, |
|
"loss": 0.2095, |
|
"num_input_tokens_seen": 27155064, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.7740029542097489, |
|
"grad_norm": 1.7684270980979584, |
|
"learning_rate": 7.204092694130218e-05, |
|
"loss": 0.2332, |
|
"num_input_tokens_seen": 27206472, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.7754800590841949, |
|
"grad_norm": 1.480868253020904, |
|
"learning_rate": 7.193124064599188e-05, |
|
"loss": 0.2078, |
|
"num_input_tokens_seen": 27258792, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.7769571639586411, |
|
"grad_norm": 1.586650982980591, |
|
"learning_rate": 7.182142350911985e-05, |
|
"loss": 0.2311, |
|
"num_input_tokens_seen": 27310840, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.7784342688330872, |
|
"grad_norm": 1.4722666794337897, |
|
"learning_rate": 7.1711476185854e-05, |
|
"loss": 0.2639, |
|
"num_input_tokens_seen": 27362496, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.7799113737075333, |
|
"grad_norm": 1.8648085527091647, |
|
"learning_rate": 7.160139933213898e-05, |
|
"loss": 0.2475, |
|
"num_input_tokens_seen": 27414544, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.7813884785819794, |
|
"grad_norm": 1.1194589162002906, |
|
"learning_rate": 7.149119360469217e-05, |
|
"loss": 0.1917, |
|
"num_input_tokens_seen": 27467408, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.7828655834564254, |
|
"grad_norm": 1.1768147878553037, |
|
"learning_rate": 7.138085966099985e-05, |
|
"loss": 0.1736, |
|
"num_input_tokens_seen": 27521088, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7828655834564254, |
|
"eval_loss": 0.269732803106308, |
|
"eval_runtime": 18.9263, |
|
"eval_samples_per_second": 3.17, |
|
"eval_steps_per_second": 0.793, |
|
"num_input_tokens_seen": 27521088, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.7843426883308715, |
|
"grad_norm": 21.321277048653876, |
|
"learning_rate": 7.127039815931322e-05, |
|
"loss": 0.2047, |
|
"num_input_tokens_seen": 27573512, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.7858197932053176, |
|
"grad_norm": 17.20183228158753, |
|
"learning_rate": 7.11598097586445e-05, |
|
"loss": 0.2309, |
|
"num_input_tokens_seen": 27625488, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.7872968980797637, |
|
"grad_norm": 1.7315172011761586, |
|
"learning_rate": 7.104909511876293e-05, |
|
"loss": 0.2188, |
|
"num_input_tokens_seen": 27677824, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.7887740029542097, |
|
"grad_norm": 1.1907445578210916, |
|
"learning_rate": 7.0938254900191e-05, |
|
"loss": 0.2127, |
|
"num_input_tokens_seen": 27730048, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.7902511078286558, |
|
"grad_norm": 1.81826608908883, |
|
"learning_rate": 7.082728976420032e-05, |
|
"loss": 0.2534, |
|
"num_input_tokens_seen": 27781512, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.7917282127031019, |
|
"grad_norm": 1.2558672573153766, |
|
"learning_rate": 7.071620037280779e-05, |
|
"loss": 0.204, |
|
"num_input_tokens_seen": 27833808, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.793205317577548, |
|
"grad_norm": 16.755851819106958, |
|
"learning_rate": 7.060498738877159e-05, |
|
"loss": 0.2218, |
|
"num_input_tokens_seen": 27886232, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.794682422451994, |
|
"grad_norm": 1.6442173318750486, |
|
"learning_rate": 7.049365147558727e-05, |
|
"loss": 0.2157, |
|
"num_input_tokens_seen": 27938696, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.7961595273264401, |
|
"grad_norm": 20.30442824896465, |
|
"learning_rate": 7.038219329748376e-05, |
|
"loss": 0.2401, |
|
"num_input_tokens_seen": 27990816, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.7976366322008862, |
|
"grad_norm": 0.9565866694207389, |
|
"learning_rate": 7.027061351941948e-05, |
|
"loss": 0.2225, |
|
"num_input_tokens_seen": 28042992, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7976366322008862, |
|
"eval_loss": 0.41549214720726013, |
|
"eval_runtime": 18.9524, |
|
"eval_samples_per_second": 3.166, |
|
"eval_steps_per_second": 0.791, |
|
"num_input_tokens_seen": 28042992, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7991137370753324, |
|
"grad_norm": 1.2540711450587154, |
|
"learning_rate": 7.01589128070782e-05, |
|
"loss": 0.2279, |
|
"num_input_tokens_seen": 28094200, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.8005908419497785, |
|
"grad_norm": 1.1647829910589267, |
|
"learning_rate": 7.004709182686531e-05, |
|
"loss": 0.2307, |
|
"num_input_tokens_seen": 28146144, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.8020679468242246, |
|
"grad_norm": 1.911821987729577, |
|
"learning_rate": 6.993515124590362e-05, |
|
"loss": 0.2025, |
|
"num_input_tokens_seen": 28198600, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.8035450516986706, |
|
"grad_norm": 3.9104392867751003, |
|
"learning_rate": 6.982309173202951e-05, |
|
"loss": 0.2318, |
|
"num_input_tokens_seen": 28249928, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.8050221565731167, |
|
"grad_norm": 3.023688281243673, |
|
"learning_rate": 6.971091395378895e-05, |
|
"loss": 0.2074, |
|
"num_input_tokens_seen": 28301928, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.8064992614475628, |
|
"grad_norm": 1.4517739851550877, |
|
"learning_rate": 6.95986185804334e-05, |
|
"loss": 0.1935, |
|
"num_input_tokens_seen": 28354256, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.8079763663220089, |
|
"grad_norm": 1.273143787849072, |
|
"learning_rate": 6.948620628191595e-05, |
|
"loss": 0.2457, |
|
"num_input_tokens_seen": 28405800, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.8094534711964549, |
|
"grad_norm": 7.171460586418023, |
|
"learning_rate": 6.937367772888725e-05, |
|
"loss": 0.2021, |
|
"num_input_tokens_seen": 28457664, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.810930576070901, |
|
"grad_norm": 4.016097948044553, |
|
"learning_rate": 6.926103359269152e-05, |
|
"loss": 0.2323, |
|
"num_input_tokens_seen": 28509944, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.8124076809453471, |
|
"grad_norm": 1.7260282738026802, |
|
"learning_rate": 6.914827454536254e-05, |
|
"loss": 0.2501, |
|
"num_input_tokens_seen": 28561248, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.8124076809453471, |
|
"eval_loss": 0.41148969531059265, |
|
"eval_runtime": 19.291, |
|
"eval_samples_per_second": 3.11, |
|
"eval_steps_per_second": 0.778, |
|
"num_input_tokens_seen": 28561248, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.8138847858197932, |
|
"grad_norm": 1.1919186743284782, |
|
"learning_rate": 6.903540125961965e-05, |
|
"loss": 0.225, |
|
"num_input_tokens_seen": 28613120, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.8153618906942393, |
|
"grad_norm": 1.4599715179768002, |
|
"learning_rate": 6.892241440886377e-05, |
|
"loss": 0.2365, |
|
"num_input_tokens_seen": 28664864, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.8168389955686853, |
|
"grad_norm": 1.2279132312954155, |
|
"learning_rate": 6.880931466717327e-05, |
|
"loss": 0.2386, |
|
"num_input_tokens_seen": 28716896, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.8183161004431314, |
|
"grad_norm": 22.206631253466607, |
|
"learning_rate": 6.86961027093001e-05, |
|
"loss": 0.2358, |
|
"num_input_tokens_seen": 28769528, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.8197932053175776, |
|
"grad_norm": 6.630701043823761, |
|
"learning_rate": 6.858277921066568e-05, |
|
"loss": 0.2844, |
|
"num_input_tokens_seen": 28821304, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.8212703101920237, |
|
"grad_norm": 0.7473512414072709, |
|
"learning_rate": 6.846934484735686e-05, |
|
"loss": 0.1867, |
|
"num_input_tokens_seen": 28872712, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.8227474150664698, |
|
"grad_norm": 1.1277045436573916, |
|
"learning_rate": 6.83558002961219e-05, |
|
"loss": 0.2184, |
|
"num_input_tokens_seen": 28924272, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.8242245199409158, |
|
"grad_norm": 14.609958116422174, |
|
"learning_rate": 6.824214623436644e-05, |
|
"loss": 0.1938, |
|
"num_input_tokens_seen": 28976352, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.8257016248153619, |
|
"grad_norm": 1.2950871631178849, |
|
"learning_rate": 6.812838334014951e-05, |
|
"loss": 0.2046, |
|
"num_input_tokens_seen": 29028344, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.827178729689808, |
|
"grad_norm": 1.6899801995875487, |
|
"learning_rate": 6.801451229217938e-05, |
|
"loss": 0.2507, |
|
"num_input_tokens_seen": 29079576, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.827178729689808, |
|
"eval_loss": 0.32233569025993347, |
|
"eval_runtime": 19.2788, |
|
"eval_samples_per_second": 3.112, |
|
"eval_steps_per_second": 0.778, |
|
"num_input_tokens_seen": 29079576, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8286558345642541, |
|
"grad_norm": 1.7749470743647537, |
|
"learning_rate": 6.790053376980959e-05, |
|
"loss": 0.1752, |
|
"num_input_tokens_seen": 29131768, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.8301329394387001, |
|
"grad_norm": 1.3850536432850993, |
|
"learning_rate": 6.778644845303483e-05, |
|
"loss": 0.2502, |
|
"num_input_tokens_seen": 29183952, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.8316100443131462, |
|
"grad_norm": 1.0103366085788665, |
|
"learning_rate": 6.767225702248698e-05, |
|
"loss": 0.2092, |
|
"num_input_tokens_seen": 29236232, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.8330871491875923, |
|
"grad_norm": 13.569757844040682, |
|
"learning_rate": 6.755796015943097e-05, |
|
"loss": 0.2492, |
|
"num_input_tokens_seen": 29287672, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.8345642540620384, |
|
"grad_norm": 1.23710967928213, |
|
"learning_rate": 6.744355854576075e-05, |
|
"loss": 0.2377, |
|
"num_input_tokens_seen": 29339952, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.8360413589364845, |
|
"grad_norm": 1.3865093164785327, |
|
"learning_rate": 6.732905286399516e-05, |
|
"loss": 0.168, |
|
"num_input_tokens_seen": 29392128, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.8375184638109305, |
|
"grad_norm": 1.585429725562894, |
|
"learning_rate": 6.721444379727398e-05, |
|
"loss": 0.1919, |
|
"num_input_tokens_seen": 29444168, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.8389955686853766, |
|
"grad_norm": 9.069319490379504, |
|
"learning_rate": 6.709973202935374e-05, |
|
"loss": 0.2225, |
|
"num_input_tokens_seen": 29495592, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.8404726735598228, |
|
"grad_norm": 0.9134374448072771, |
|
"learning_rate": 6.698491824460371e-05, |
|
"loss": 0.1715, |
|
"num_input_tokens_seen": 29548008, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.8419497784342689, |
|
"grad_norm": 1.4124132378382863, |
|
"learning_rate": 6.687000312800178e-05, |
|
"loss": 0.1928, |
|
"num_input_tokens_seen": 29600536, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.8419497784342689, |
|
"eval_loss": 0.28275948762893677, |
|
"eval_runtime": 19.4837, |
|
"eval_samples_per_second": 3.08, |
|
"eval_steps_per_second": 0.77, |
|
"num_input_tokens_seen": 29600536, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.843426883308715, |
|
"grad_norm": 1.4547945162774674, |
|
"learning_rate": 6.675498736513036e-05, |
|
"loss": 0.2163, |
|
"num_input_tokens_seen": 29652440, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.844903988183161, |
|
"grad_norm": 1.3348915008279034, |
|
"learning_rate": 6.663987164217236e-05, |
|
"loss": 0.2589, |
|
"num_input_tokens_seen": 29704376, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.8463810930576071, |
|
"grad_norm": 2.2889038673998603, |
|
"learning_rate": 6.652465664590703e-05, |
|
"loss": 0.2325, |
|
"num_input_tokens_seen": 29756504, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.8478581979320532, |
|
"grad_norm": 1.3144459851000174, |
|
"learning_rate": 6.640934306370586e-05, |
|
"loss": 0.242, |
|
"num_input_tokens_seen": 29807328, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.8493353028064993, |
|
"grad_norm": 1.2238000417554058, |
|
"learning_rate": 6.629393158352854e-05, |
|
"loss": 0.2169, |
|
"num_input_tokens_seen": 29859208, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.8508124076809453, |
|
"grad_norm": 1.5853683051276755, |
|
"learning_rate": 6.61784228939188e-05, |
|
"loss": 0.2335, |
|
"num_input_tokens_seen": 29911128, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.8522895125553914, |
|
"grad_norm": 0.8463095817606877, |
|
"learning_rate": 6.606281768400032e-05, |
|
"loss": 0.1913, |
|
"num_input_tokens_seen": 29962384, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.8537666174298375, |
|
"grad_norm": 1.6841064365294203, |
|
"learning_rate": 6.594711664347264e-05, |
|
"loss": 0.2425, |
|
"num_input_tokens_seen": 30013664, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.8552437223042836, |
|
"grad_norm": 1.1559578384960632, |
|
"learning_rate": 6.5831320462607e-05, |
|
"loss": 0.2312, |
|
"num_input_tokens_seen": 30066016, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.8567208271787297, |
|
"grad_norm": 0.9327537302615286, |
|
"learning_rate": 6.571542983224223e-05, |
|
"loss": 0.2029, |
|
"num_input_tokens_seen": 30118072, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8567208271787297, |
|
"eval_loss": 0.39434579014778137, |
|
"eval_runtime": 19.1253, |
|
"eval_samples_per_second": 3.137, |
|
"eval_steps_per_second": 0.784, |
|
"num_input_tokens_seen": 30118072, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.8581979320531757, |
|
"grad_norm": 1.2779653791980783, |
|
"learning_rate": 6.559944544378072e-05, |
|
"loss": 0.2241, |
|
"num_input_tokens_seen": 30170248, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.8596750369276218, |
|
"grad_norm": 1.6407050078852088, |
|
"learning_rate": 6.548336798918411e-05, |
|
"loss": 0.2298, |
|
"num_input_tokens_seen": 30222016, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.8611521418020679, |
|
"grad_norm": 1.5078723595666699, |
|
"learning_rate": 6.536719816096935e-05, |
|
"loss": 0.2396, |
|
"num_input_tokens_seen": 30273312, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.8626292466765141, |
|
"grad_norm": 1.852309531147588, |
|
"learning_rate": 6.52509366522045e-05, |
|
"loss": 0.2324, |
|
"num_input_tokens_seen": 30324328, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.8641063515509602, |
|
"grad_norm": 1.3789560520965807, |
|
"learning_rate": 6.513458415650452e-05, |
|
"loss": 0.2263, |
|
"num_input_tokens_seen": 30376488, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.8655834564254062, |
|
"grad_norm": 1.904793400524472, |
|
"learning_rate": 6.501814136802725e-05, |
|
"loss": 0.1734, |
|
"num_input_tokens_seen": 30429504, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.8670605612998523, |
|
"grad_norm": 1.15273615308065, |
|
"learning_rate": 6.490160898146918e-05, |
|
"loss": 0.2235, |
|
"num_input_tokens_seen": 30480400, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.8685376661742984, |
|
"grad_norm": 1.4070108528869274, |
|
"learning_rate": 6.47849876920614e-05, |
|
"loss": 0.2297, |
|
"num_input_tokens_seen": 30531912, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.8700147710487445, |
|
"grad_norm": 1.182373794273276, |
|
"learning_rate": 6.46682781955653e-05, |
|
"loss": 0.1764, |
|
"num_input_tokens_seen": 30584688, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.8714918759231906, |
|
"grad_norm": 17.302996120392294, |
|
"learning_rate": 6.455148118826859e-05, |
|
"loss": 0.1692, |
|
"num_input_tokens_seen": 30637448, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.8714918759231906, |
|
"eval_loss": 0.20344533026218414, |
|
"eval_runtime": 18.8463, |
|
"eval_samples_per_second": 3.184, |
|
"eval_steps_per_second": 0.796, |
|
"num_input_tokens_seen": 30637448, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.8729689807976366, |
|
"grad_norm": 1.8856590605160186, |
|
"learning_rate": 6.443459736698105e-05, |
|
"loss": 0.152, |
|
"num_input_tokens_seen": 30690624, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.8744460856720827, |
|
"grad_norm": 1.6528236741968945, |
|
"learning_rate": 6.431762742903038e-05, |
|
"loss": 0.1945, |
|
"num_input_tokens_seen": 30742992, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.8759231905465288, |
|
"grad_norm": 8.103341069666943, |
|
"learning_rate": 6.420057207225807e-05, |
|
"loss": 0.2177, |
|
"num_input_tokens_seen": 30795256, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.8774002954209749, |
|
"grad_norm": 1.1109514825198903, |
|
"learning_rate": 6.408343199501519e-05, |
|
"loss": 0.1579, |
|
"num_input_tokens_seen": 30847696, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.8788774002954209, |
|
"grad_norm": 1.2328126120792269, |
|
"learning_rate": 6.396620789615825e-05, |
|
"loss": 0.1943, |
|
"num_input_tokens_seen": 30899904, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.880354505169867, |
|
"grad_norm": 1.3844672046567905, |
|
"learning_rate": 6.384890047504508e-05, |
|
"loss": 0.1749, |
|
"num_input_tokens_seen": 30952168, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.8818316100443131, |
|
"grad_norm": 1.3306213585837874, |
|
"learning_rate": 6.373151043153056e-05, |
|
"loss": 0.2156, |
|
"num_input_tokens_seen": 31004192, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.8833087149187593, |
|
"grad_norm": 1.3085143966848831, |
|
"learning_rate": 6.361403846596252e-05, |
|
"loss": 0.2083, |
|
"num_input_tokens_seen": 31056712, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.8847858197932054, |
|
"grad_norm": 0.8113418641584603, |
|
"learning_rate": 6.349648527917752e-05, |
|
"loss": 0.1721, |
|
"num_input_tokens_seen": 31108944, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.8862629246676514, |
|
"grad_norm": 1.4549871288078948, |
|
"learning_rate": 6.33788515724967e-05, |
|
"loss": 0.234, |
|
"num_input_tokens_seen": 31159736, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8862629246676514, |
|
"eval_loss": 0.25555509328842163, |
|
"eval_runtime": 19.3746, |
|
"eval_samples_per_second": 3.097, |
|
"eval_steps_per_second": 0.774, |
|
"num_input_tokens_seen": 31159736, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.8877400295420975, |
|
"grad_norm": 1.2324370262682327, |
|
"learning_rate": 6.326113804772157e-05, |
|
"loss": 0.2011, |
|
"num_input_tokens_seen": 31211632, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.8892171344165436, |
|
"grad_norm": 1.7142403977850047, |
|
"learning_rate": 6.314334540712983e-05, |
|
"loss": 0.2197, |
|
"num_input_tokens_seen": 31264376, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.8906942392909897, |
|
"grad_norm": 1.50601762442169, |
|
"learning_rate": 6.302547435347122e-05, |
|
"loss": 0.1853, |
|
"num_input_tokens_seen": 31316584, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.8921713441654358, |
|
"grad_norm": 1.2451326697290692, |
|
"learning_rate": 6.290752558996325e-05, |
|
"loss": 0.2312, |
|
"num_input_tokens_seen": 31367768, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.8936484490398818, |
|
"grad_norm": 1.3816673419860452, |
|
"learning_rate": 6.278949982028704e-05, |
|
"loss": 0.2608, |
|
"num_input_tokens_seen": 31419664, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.8951255539143279, |
|
"grad_norm": 1.430852186731335, |
|
"learning_rate": 6.267139774858318e-05, |
|
"loss": 0.225, |
|
"num_input_tokens_seen": 31471672, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.896602658788774, |
|
"grad_norm": 22.943001206859204, |
|
"learning_rate": 6.255322007944743e-05, |
|
"loss": 0.225, |
|
"num_input_tokens_seen": 31523888, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.8980797636632201, |
|
"grad_norm": 0.8776969753848171, |
|
"learning_rate": 6.243496751792658e-05, |
|
"loss": 0.2291, |
|
"num_input_tokens_seen": 31574992, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.8995568685376661, |
|
"grad_norm": 0.8040234911638864, |
|
"learning_rate": 6.231664076951421e-05, |
|
"loss": 0.1926, |
|
"num_input_tokens_seen": 31627608, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.9010339734121122, |
|
"grad_norm": 1.336878225583922, |
|
"learning_rate": 6.219824054014656e-05, |
|
"loss": 0.2303, |
|
"num_input_tokens_seen": 31679080, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.9010339734121122, |
|
"eval_loss": 0.22528553009033203, |
|
"eval_runtime": 19.2502, |
|
"eval_samples_per_second": 3.117, |
|
"eval_steps_per_second": 0.779, |
|
"num_input_tokens_seen": 31679080, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.9025110782865583, |
|
"grad_norm": 1.1996422945391199, |
|
"learning_rate": 6.207976753619816e-05, |
|
"loss": 0.1854, |
|
"num_input_tokens_seen": 31731232, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.9039881831610044, |
|
"grad_norm": 1.142249274568305, |
|
"learning_rate": 6.196122246447779e-05, |
|
"loss": 0.1813, |
|
"num_input_tokens_seen": 31783480, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.9054652880354506, |
|
"grad_norm": 1.8891844898246446, |
|
"learning_rate": 6.184260603222416e-05, |
|
"loss": 0.2099, |
|
"num_input_tokens_seen": 31835864, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.9069423929098966, |
|
"grad_norm": 1.1771283819772904, |
|
"learning_rate": 6.17239189471017e-05, |
|
"loss": 0.2158, |
|
"num_input_tokens_seen": 31887120, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.9084194977843427, |
|
"grad_norm": 1.000017408106927, |
|
"learning_rate": 6.160516191719638e-05, |
|
"loss": 0.1947, |
|
"num_input_tokens_seen": 31938768, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.9098966026587888, |
|
"grad_norm": 10.76780458746382, |
|
"learning_rate": 6.148633565101145e-05, |
|
"loss": 0.2058, |
|
"num_input_tokens_seen": 31990800, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.9113737075332349, |
|
"grad_norm": 1.5246138398849078, |
|
"learning_rate": 6.136744085746322e-05, |
|
"loss": 0.2366, |
|
"num_input_tokens_seen": 32042096, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.912850812407681, |
|
"grad_norm": 1.3169793573688027, |
|
"learning_rate": 6.124847824587684e-05, |
|
"loss": 0.2749, |
|
"num_input_tokens_seen": 32092864, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.914327917282127, |
|
"grad_norm": 1.0350686863717118, |
|
"learning_rate": 6.112944852598205e-05, |
|
"loss": 0.2242, |
|
"num_input_tokens_seen": 32144288, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.9158050221565731, |
|
"grad_norm": 0.8553950294390508, |
|
"learning_rate": 6.1010352407908966e-05, |
|
"loss": 0.1999, |
|
"num_input_tokens_seen": 32196176, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9158050221565731, |
|
"eval_loss": 0.2710443437099457, |
|
"eval_runtime": 19.0309, |
|
"eval_samples_per_second": 3.153, |
|
"eval_steps_per_second": 0.788, |
|
"num_input_tokens_seen": 32196176, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.9172821270310192, |
|
"grad_norm": 0.9733516483269293, |
|
"learning_rate": 6.089119060218385e-05, |
|
"loss": 0.2205, |
|
"num_input_tokens_seen": 32247416, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.9187592319054653, |
|
"grad_norm": 1.393227303687879, |
|
"learning_rate": 6.077196381972482e-05, |
|
"loss": 0.2161, |
|
"num_input_tokens_seen": 32298088, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.9202363367799113, |
|
"grad_norm": 1.0898822094479876, |
|
"learning_rate": 6.065267277183767e-05, |
|
"loss": 0.1846, |
|
"num_input_tokens_seen": 32349768, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.9217134416543574, |
|
"grad_norm": 1.613649982790536, |
|
"learning_rate": 6.0533318170211584e-05, |
|
"loss": 0.2246, |
|
"num_input_tokens_seen": 32401136, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.9231905465288035, |
|
"grad_norm": 1.3536108105978324, |
|
"learning_rate": 6.041390072691495e-05, |
|
"loss": 0.1909, |
|
"num_input_tokens_seen": 32453424, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.9246676514032496, |
|
"grad_norm": 4.434791836263956, |
|
"learning_rate": 6.0294421154391013e-05, |
|
"loss": 0.1651, |
|
"num_input_tokens_seen": 32506104, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.9261447562776958, |
|
"grad_norm": 1.518977363790944, |
|
"learning_rate": 6.0174880165453714e-05, |
|
"loss": 0.2352, |
|
"num_input_tokens_seen": 32557496, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.9276218611521418, |
|
"grad_norm": 22.226709781269058, |
|
"learning_rate": 6.005527847328338e-05, |
|
"loss": 0.1875, |
|
"num_input_tokens_seen": 32609696, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.9290989660265879, |
|
"grad_norm": 1.4067637855508932, |
|
"learning_rate": 5.993561679142253e-05, |
|
"loss": 0.1889, |
|
"num_input_tokens_seen": 32661992, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.930576070901034, |
|
"grad_norm": 1.2192858374169209, |
|
"learning_rate": 5.981589583377154e-05, |
|
"loss": 0.2069, |
|
"num_input_tokens_seen": 32713824, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.930576070901034, |
|
"eval_loss": 0.20293839275836945, |
|
"eval_runtime": 19.3509, |
|
"eval_samples_per_second": 3.101, |
|
"eval_steps_per_second": 0.775, |
|
"num_input_tokens_seen": 32713824, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.9320531757754801, |
|
"grad_norm": 1.7783073907151754, |
|
"learning_rate": 5.969611631458444e-05, |
|
"loss": 0.2035, |
|
"num_input_tokens_seen": 32765648, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.9335302806499262, |
|
"grad_norm": 2.7800459727804068, |
|
"learning_rate": 5.957627894846465e-05, |
|
"loss": 0.1655, |
|
"num_input_tokens_seen": 32818192, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.9350073855243722, |
|
"grad_norm": 0.9191763474328299, |
|
"learning_rate": 5.9456384450360694e-05, |
|
"loss": 0.196, |
|
"num_input_tokens_seen": 32870520, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.9364844903988183, |
|
"grad_norm": 4.38220903682666, |
|
"learning_rate": 5.933643353556195e-05, |
|
"loss": 0.1774, |
|
"num_input_tokens_seen": 32922712, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.9379615952732644, |
|
"grad_norm": 6.889422152522515, |
|
"learning_rate": 5.9216426919694356e-05, |
|
"loss": 0.1731, |
|
"num_input_tokens_seen": 32975768, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.9394387001477105, |
|
"grad_norm": 38.21208106263, |
|
"learning_rate": 5.9096365318716194e-05, |
|
"loss": 0.2306, |
|
"num_input_tokens_seen": 33027264, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.9409158050221565, |
|
"grad_norm": 3.8776611584865295, |
|
"learning_rate": 5.897624944891378e-05, |
|
"loss": 0.1972, |
|
"num_input_tokens_seen": 33079712, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.9423929098966026, |
|
"grad_norm": 1.1372460555071302, |
|
"learning_rate": 5.8856080026897144e-05, |
|
"loss": 0.1612, |
|
"num_input_tokens_seen": 33132048, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.9438700147710487, |
|
"grad_norm": 1.366569822997117, |
|
"learning_rate": 5.8735857769595905e-05, |
|
"loss": 0.2148, |
|
"num_input_tokens_seen": 33184488, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.9453471196454948, |
|
"grad_norm": 1.2035899680395865, |
|
"learning_rate": 5.8615583394254814e-05, |
|
"loss": 0.2135, |
|
"num_input_tokens_seen": 33235872, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.9453471196454948, |
|
"eval_loss": 0.3564297556877136, |
|
"eval_runtime": 19.2969, |
|
"eval_samples_per_second": 3.109, |
|
"eval_steps_per_second": 0.777, |
|
"num_input_tokens_seen": 33235872, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.946824224519941, |
|
"grad_norm": 1.3270189069274072, |
|
"learning_rate": 5.849525761842961e-05, |
|
"loss": 0.2059, |
|
"num_input_tokens_seen": 33287792, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.948301329394387, |
|
"grad_norm": 1.5700498259079116, |
|
"learning_rate": 5.837488115998264e-05, |
|
"loss": 0.1482, |
|
"num_input_tokens_seen": 33340168, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.9497784342688331, |
|
"grad_norm": 1.3963642572652624, |
|
"learning_rate": 5.825445473707867e-05, |
|
"loss": 0.2017, |
|
"num_input_tokens_seen": 33391904, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.9512555391432792, |
|
"grad_norm": 1.2363860850370896, |
|
"learning_rate": 5.813397906818051e-05, |
|
"loss": 0.206, |
|
"num_input_tokens_seen": 33443080, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.9527326440177253, |
|
"grad_norm": 1.4364140192060484, |
|
"learning_rate": 5.801345487204482e-05, |
|
"loss": 0.1868, |
|
"num_input_tokens_seen": 33495320, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.9542097488921714, |
|
"grad_norm": 1.2413385170484263, |
|
"learning_rate": 5.78928828677177e-05, |
|
"loss": 0.2033, |
|
"num_input_tokens_seen": 33546688, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.9556868537666174, |
|
"grad_norm": 1.2267190013200722, |
|
"learning_rate": 5.777226377453057e-05, |
|
"loss": 0.2008, |
|
"num_input_tokens_seen": 33597928, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.9571639586410635, |
|
"grad_norm": 1.2199044517627364, |
|
"learning_rate": 5.76515983120957e-05, |
|
"loss": 0.2121, |
|
"num_input_tokens_seen": 33649752, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.9586410635155096, |
|
"grad_norm": 1.134606964963097, |
|
"learning_rate": 5.7530887200302055e-05, |
|
"loss": 0.2204, |
|
"num_input_tokens_seen": 33700792, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.9601181683899557, |
|
"grad_norm": 0.8601990098934698, |
|
"learning_rate": 5.741013115931088e-05, |
|
"loss": 0.1964, |
|
"num_input_tokens_seen": 33752488, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.9601181683899557, |
|
"eval_loss": 0.308076411485672, |
|
"eval_runtime": 19.4071, |
|
"eval_samples_per_second": 3.092, |
|
"eval_steps_per_second": 0.773, |
|
"num_input_tokens_seen": 33752488, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.9615952732644018, |
|
"grad_norm": 1.0061109126775716, |
|
"learning_rate": 5.728933090955151e-05, |
|
"loss": 0.227, |
|
"num_input_tokens_seen": 33803968, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.9630723781388478, |
|
"grad_norm": 7.1108905990118645, |
|
"learning_rate": 5.7168487171717056e-05, |
|
"loss": 0.2117, |
|
"num_input_tokens_seen": 33856104, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.9645494830132939, |
|
"grad_norm": 11.468727046658728, |
|
"learning_rate": 5.704760066676003e-05, |
|
"loss": 0.196, |
|
"num_input_tokens_seen": 33907912, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.96602658788774, |
|
"grad_norm": 1.7059623021479096, |
|
"learning_rate": 5.69266721158881e-05, |
|
"loss": 0.1984, |
|
"num_input_tokens_seen": 33960104, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.9675036927621861, |
|
"grad_norm": 1.099758304724886, |
|
"learning_rate": 5.6805702240559786e-05, |
|
"loss": 0.182, |
|
"num_input_tokens_seen": 34012120, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.9689807976366323, |
|
"grad_norm": 1.1569544282160544, |
|
"learning_rate": 5.668469176248017e-05, |
|
"loss": 0.2219, |
|
"num_input_tokens_seen": 34063520, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.9704579025110783, |
|
"grad_norm": 5.82612333029224, |
|
"learning_rate": 5.6563641403596536e-05, |
|
"loss": 0.2421, |
|
"num_input_tokens_seen": 34115800, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.9719350073855244, |
|
"grad_norm": 1.3328615698520316, |
|
"learning_rate": 5.644255188609411e-05, |
|
"loss": 0.1888, |
|
"num_input_tokens_seen": 34168208, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.9734121122599705, |
|
"grad_norm": 1.458848145104471, |
|
"learning_rate": 5.632142393239174e-05, |
|
"loss": 0.2366, |
|
"num_input_tokens_seen": 34218456, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.9748892171344166, |
|
"grad_norm": 1.6219152592756263, |
|
"learning_rate": 5.6200258265137585e-05, |
|
"loss": 0.2131, |
|
"num_input_tokens_seen": 34269496, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9748892171344166, |
|
"eval_loss": 0.3541204631328583, |
|
"eval_runtime": 19.2633, |
|
"eval_samples_per_second": 3.115, |
|
"eval_steps_per_second": 0.779, |
|
"num_input_tokens_seen": 34269496, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.9763663220088626, |
|
"grad_norm": 0.7198899395850713, |
|
"learning_rate": 5.607905560720481e-05, |
|
"loss": 0.1993, |
|
"num_input_tokens_seen": 34321480, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.9778434268833087, |
|
"grad_norm": 1.1646925735801659, |
|
"learning_rate": 5.595781668168725e-05, |
|
"loss": 0.1965, |
|
"num_input_tokens_seen": 34372752, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.9793205317577548, |
|
"grad_norm": 1.5473226512651947, |
|
"learning_rate": 5.5836542211895105e-05, |
|
"loss": 0.2429, |
|
"num_input_tokens_seen": 34424768, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.9807976366322009, |
|
"grad_norm": 1.6745702087485874, |
|
"learning_rate": 5.571523292135067e-05, |
|
"loss": 0.2214, |
|
"num_input_tokens_seen": 34475248, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.982274741506647, |
|
"grad_norm": 1.7074791320051903, |
|
"learning_rate": 5.559388953378393e-05, |
|
"loss": 0.1624, |
|
"num_input_tokens_seen": 34528200, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.983751846381093, |
|
"grad_norm": 1.3494994119412924, |
|
"learning_rate": 5.547251277312833e-05, |
|
"loss": 0.2222, |
|
"num_input_tokens_seen": 34579080, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.9852289512555391, |
|
"grad_norm": 1.399722512313767, |
|
"learning_rate": 5.535110336351642e-05, |
|
"loss": 0.1895, |
|
"num_input_tokens_seen": 34630680, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.9867060561299852, |
|
"grad_norm": 1.6811876091944713, |
|
"learning_rate": 5.5229662029275505e-05, |
|
"loss": 0.2164, |
|
"num_input_tokens_seen": 34682208, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.9881831610044313, |
|
"grad_norm": 1.3810047796468525, |
|
"learning_rate": 5.510818949492337e-05, |
|
"loss": 0.223, |
|
"num_input_tokens_seen": 34732696, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.9896602658788775, |
|
"grad_norm": 1.6151906286753892, |
|
"learning_rate": 5.498668648516394e-05, |
|
"loss": 0.1779, |
|
"num_input_tokens_seen": 34784784, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9896602658788775, |
|
"eval_loss": 0.22550027072429657, |
|
"eval_runtime": 19.403, |
|
"eval_samples_per_second": 3.092, |
|
"eval_steps_per_second": 0.773, |
|
"num_input_tokens_seen": 34784784, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.9911373707533235, |
|
"grad_norm": 1.6778824792393197, |
|
"learning_rate": 5.4865153724882945e-05, |
|
"loss": 0.2145, |
|
"num_input_tokens_seen": 34836528, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.9926144756277696, |
|
"grad_norm": 1.177435164237277, |
|
"learning_rate": 5.4743591939143624e-05, |
|
"loss": 0.1839, |
|
"num_input_tokens_seen": 34888888, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.9940915805022157, |
|
"grad_norm": 0.9272308554713589, |
|
"learning_rate": 5.462200185318236e-05, |
|
"loss": 0.1665, |
|
"num_input_tokens_seen": 34941112, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.9955686853766618, |
|
"grad_norm": 1.3886029969999028, |
|
"learning_rate": 5.4500384192404395e-05, |
|
"loss": 0.1908, |
|
"num_input_tokens_seen": 34992976, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.9970457902511078, |
|
"grad_norm": 0.7182575806960926, |
|
"learning_rate": 5.4378739682379475e-05, |
|
"loss": 0.1577, |
|
"num_input_tokens_seen": 35045680, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.9985228951255539, |
|
"grad_norm": 1.3292843771600575, |
|
"learning_rate": 5.425706904883753e-05, |
|
"loss": 0.2097, |
|
"num_input_tokens_seen": 35097928, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.7761867380933323, |
|
"learning_rate": 5.4135373017664326e-05, |
|
"loss": 0.1933, |
|
"num_input_tokens_seen": 35150544, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 1.0014771048744462, |
|
"grad_norm": 1.2787364659066394, |
|
"learning_rate": 5.401365231489718e-05, |
|
"loss": 0.1708, |
|
"num_input_tokens_seen": 35203152, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.0029542097488922, |
|
"grad_norm": 1.1654168781378111, |
|
"learning_rate": 5.389190766672056e-05, |
|
"loss": 0.1766, |
|
"num_input_tokens_seen": 35254992, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 1.0044313146233383, |
|
"grad_norm": 1.3171125278735354, |
|
"learning_rate": 5.3770139799461824e-05, |
|
"loss": 0.2173, |
|
"num_input_tokens_seen": 35305984, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.0044313146233383, |
|
"eval_loss": 0.40779221057891846, |
|
"eval_runtime": 19.1316, |
|
"eval_samples_per_second": 3.136, |
|
"eval_steps_per_second": 0.784, |
|
"num_input_tokens_seen": 35305984, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.0047267355982274, |
|
"num_input_tokens_seen": 35316128, |
|
"step": 3401, |
|
"total_flos": 2329910849044480.0, |
|
"train_loss": 0.0003882907692455818, |
|
"train_runtime": 103.5396, |
|
"train_samples_per_second": 788.104, |
|
"train_steps_per_second": 32.838 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3400, |
|
"num_input_tokens_seen": 35316128, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2329910849044480.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|