|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.175, |
|
"eval_steps": 500, |
|
"global_step": 87000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 2.8962271213531494, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1292, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 6.777284145355225, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.1979, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.0732028484344482, |
|
"learning_rate": 6e-06, |
|
"loss": 1.1233, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 2.748718023300171, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.1694, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.2017223834991455, |
|
"learning_rate": 1e-05, |
|
"loss": 1.1499, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 1.1749963760375977, |
|
"eval_runtime": 106.2784, |
|
"eval_samples_per_second": 9.409, |
|
"eval_steps_per_second": 2.352, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.5688204765319824, |
|
"learning_rate": 9.98994974874372e-06, |
|
"loss": 1.1033, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.247222900390625, |
|
"learning_rate": 9.979899497487437e-06, |
|
"loss": 1.1353, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 5.563870906829834, |
|
"learning_rate": 9.969849246231156e-06, |
|
"loss": 1.1575, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.093879461288452, |
|
"learning_rate": 9.959798994974875e-06, |
|
"loss": 1.186, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.5246388912200928, |
|
"learning_rate": 9.949748743718594e-06, |
|
"loss": 1.123, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 1.2011113166809082, |
|
"eval_runtime": 106.2391, |
|
"eval_samples_per_second": 9.413, |
|
"eval_steps_per_second": 2.353, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 8.584878921508789, |
|
"learning_rate": 9.939698492462311e-06, |
|
"loss": 1.1876, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.8535866737365723, |
|
"learning_rate": 9.929648241206032e-06, |
|
"loss": 1.147, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.041557788848877, |
|
"learning_rate": 9.91959798994975e-06, |
|
"loss": 1.1527, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.288966655731201, |
|
"learning_rate": 9.909547738693468e-06, |
|
"loss": 1.1, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.992133378982544, |
|
"learning_rate": 9.899497487437186e-06, |
|
"loss": 1.1702, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 1.194442629814148, |
|
"eval_runtime": 106.2775, |
|
"eval_samples_per_second": 9.409, |
|
"eval_steps_per_second": 2.352, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 10.900914192199707, |
|
"learning_rate": 9.889447236180906e-06, |
|
"loss": 1.1407, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.308995485305786, |
|
"learning_rate": 9.879396984924624e-06, |
|
"loss": 1.1732, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.821071147918701, |
|
"learning_rate": 9.869346733668343e-06, |
|
"loss": 1.178, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.4869799613952637, |
|
"learning_rate": 9.859296482412062e-06, |
|
"loss": 1.1236, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 3.6243529319763184, |
|
"learning_rate": 9.84924623115578e-06, |
|
"loss": 1.1383, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 1.197302222251892, |
|
"eval_runtime": 106.476, |
|
"eval_samples_per_second": 9.392, |
|
"eval_steps_per_second": 2.348, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 4.772403717041016, |
|
"learning_rate": 9.839195979899498e-06, |
|
"loss": 1.1792, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.6403985023498535, |
|
"learning_rate": 9.829145728643217e-06, |
|
"loss": 1.2086, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 6.52122163772583, |
|
"learning_rate": 9.819095477386936e-06, |
|
"loss": 1.1417, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 5.543770790100098, |
|
"learning_rate": 9.809045226130655e-06, |
|
"loss": 1.18, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 7.871328353881836, |
|
"learning_rate": 9.798994974874372e-06, |
|
"loss": 1.1623, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 1.2371257543563843, |
|
"eval_runtime": 106.486, |
|
"eval_samples_per_second": 9.391, |
|
"eval_steps_per_second": 2.348, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 9.064556121826172, |
|
"learning_rate": 9.788944723618091e-06, |
|
"loss": 1.2078, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.955998420715332, |
|
"learning_rate": 9.77889447236181e-06, |
|
"loss": 1.105, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 2.0823745727539062, |
|
"learning_rate": 9.768844221105529e-06, |
|
"loss": 1.093, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.0110859870910645, |
|
"learning_rate": 9.758793969849248e-06, |
|
"loss": 1.1217, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 6.344435691833496, |
|
"learning_rate": 9.748743718592965e-06, |
|
"loss": 1.0912, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 1.1914273500442505, |
|
"eval_runtime": 106.2585, |
|
"eval_samples_per_second": 9.411, |
|
"eval_steps_per_second": 2.353, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.916799783706665, |
|
"learning_rate": 9.738693467336684e-06, |
|
"loss": 1.1098, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.60597038269043, |
|
"learning_rate": 9.728643216080402e-06, |
|
"loss": 1.1522, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6.223481178283691, |
|
"learning_rate": 9.718592964824122e-06, |
|
"loss": 1.1633, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 3.995976686477661, |
|
"learning_rate": 9.70854271356784e-06, |
|
"loss": 1.168, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.338571071624756, |
|
"learning_rate": 9.698492462311559e-06, |
|
"loss": 1.1514, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_loss": 1.2231833934783936, |
|
"eval_runtime": 106.0603, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.8867563009262085, |
|
"learning_rate": 9.688442211055276e-06, |
|
"loss": 1.1255, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 4.41115140914917, |
|
"learning_rate": 9.678391959798997e-06, |
|
"loss": 1.1184, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.065484046936035, |
|
"learning_rate": 9.668341708542714e-06, |
|
"loss": 1.1587, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 4.625395774841309, |
|
"learning_rate": 9.658291457286433e-06, |
|
"loss": 1.1494, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.19793963432312, |
|
"learning_rate": 9.648241206030152e-06, |
|
"loss": 1.1196, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 1.1993799209594727, |
|
"eval_runtime": 106.0818, |
|
"eval_samples_per_second": 9.427, |
|
"eval_steps_per_second": 2.357, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 8.89126205444336, |
|
"learning_rate": 9.63819095477387e-06, |
|
"loss": 1.1485, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 2.234679698944092, |
|
"learning_rate": 9.628140703517588e-06, |
|
"loss": 1.1758, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 8.758123397827148, |
|
"learning_rate": 9.618090452261307e-06, |
|
"loss": 1.177, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.4299983978271484, |
|
"learning_rate": 9.608040201005026e-06, |
|
"loss": 1.1694, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 11.623360633850098, |
|
"learning_rate": 9.597989949748745e-06, |
|
"loss": 1.1724, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_loss": 1.22361421585083, |
|
"eval_runtime": 106.1398, |
|
"eval_samples_per_second": 9.422, |
|
"eval_steps_per_second": 2.355, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.621526002883911, |
|
"learning_rate": 9.587939698492464e-06, |
|
"loss": 1.176, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 6.011305809020996, |
|
"learning_rate": 9.577889447236181e-06, |
|
"loss": 1.132, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.702287673950195, |
|
"learning_rate": 9.5678391959799e-06, |
|
"loss": 1.2036, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 7.796123027801514, |
|
"learning_rate": 9.55778894472362e-06, |
|
"loss": 1.1573, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 4.329326152801514, |
|
"learning_rate": 9.547738693467338e-06, |
|
"loss": 1.1477, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 1.1805349588394165, |
|
"eval_runtime": 106.2287, |
|
"eval_samples_per_second": 9.414, |
|
"eval_steps_per_second": 2.353, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 3.5872912406921387, |
|
"learning_rate": 9.537688442211056e-06, |
|
"loss": 1.1774, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.94892692565918, |
|
"learning_rate": 9.527638190954775e-06, |
|
"loss": 1.1639, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.417180061340332, |
|
"learning_rate": 9.517587939698492e-06, |
|
"loss": 1.1626, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.434834003448486, |
|
"learning_rate": 9.507537688442213e-06, |
|
"loss": 1.1846, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 6.536422252655029, |
|
"learning_rate": 9.49748743718593e-06, |
|
"loss": 1.1359, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_loss": 1.220381498336792, |
|
"eval_runtime": 106.2553, |
|
"eval_samples_per_second": 9.411, |
|
"eval_steps_per_second": 2.353, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 7.294464111328125, |
|
"learning_rate": 9.487437185929649e-06, |
|
"loss": 1.1081, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.32218074798584, |
|
"learning_rate": 9.477386934673368e-06, |
|
"loss": 1.1621, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 3.692546844482422, |
|
"learning_rate": 9.467336683417087e-06, |
|
"loss": 1.1703, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.460719585418701, |
|
"learning_rate": 9.457286432160804e-06, |
|
"loss": 1.2071, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 5.178548336029053, |
|
"learning_rate": 9.447236180904523e-06, |
|
"loss": 1.1124, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 1.1760114431381226, |
|
"eval_runtime": 106.2308, |
|
"eval_samples_per_second": 9.413, |
|
"eval_steps_per_second": 2.353, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 7.670156478881836, |
|
"learning_rate": 9.437185929648242e-06, |
|
"loss": 1.1385, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 4.211444854736328, |
|
"learning_rate": 9.427135678391961e-06, |
|
"loss": 1.1298, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 3.375978469848633, |
|
"learning_rate": 9.41708542713568e-06, |
|
"loss": 1.1745, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 7.415770053863525, |
|
"learning_rate": 9.407035175879397e-06, |
|
"loss": 1.1609, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.558354377746582, |
|
"learning_rate": 9.396984924623116e-06, |
|
"loss": 1.1433, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 1.1909652948379517, |
|
"eval_runtime": 106.1934, |
|
"eval_samples_per_second": 9.417, |
|
"eval_steps_per_second": 2.354, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.368566989898682, |
|
"learning_rate": 9.386934673366835e-06, |
|
"loss": 1.2008, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.2384209632873535, |
|
"learning_rate": 9.376884422110554e-06, |
|
"loss": 1.2142, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 4.848013877868652, |
|
"learning_rate": 9.366834170854272e-06, |
|
"loss": 1.1843, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 6.339895725250244, |
|
"learning_rate": 9.35678391959799e-06, |
|
"loss": 1.1703, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 7.7875518798828125, |
|
"learning_rate": 9.34673366834171e-06, |
|
"loss": 1.1185, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_loss": 1.2201147079467773, |
|
"eval_runtime": 106.1893, |
|
"eval_samples_per_second": 9.417, |
|
"eval_steps_per_second": 2.354, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.590594291687012, |
|
"learning_rate": 9.336683417085429e-06, |
|
"loss": 1.1346, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.217388153076172, |
|
"learning_rate": 9.326633165829146e-06, |
|
"loss": 1.118, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.3999927043914795, |
|
"learning_rate": 9.316582914572865e-06, |
|
"loss": 1.1394, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.4164233207702637, |
|
"learning_rate": 9.306532663316584e-06, |
|
"loss": 1.1311, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.3500103950500488, |
|
"learning_rate": 9.296482412060303e-06, |
|
"loss": 1.1907, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_loss": 1.2538405656814575, |
|
"eval_runtime": 106.2387, |
|
"eval_samples_per_second": 9.413, |
|
"eval_steps_per_second": 2.353, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.407262325286865, |
|
"learning_rate": 9.28643216080402e-06, |
|
"loss": 1.1847, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.9364423751831055, |
|
"learning_rate": 9.276381909547739e-06, |
|
"loss": 1.1634, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.788465976715088, |
|
"learning_rate": 9.266331658291458e-06, |
|
"loss": 1.1667, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.906449317932129, |
|
"learning_rate": 9.256281407035177e-06, |
|
"loss": 1.1199, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 4.966096878051758, |
|
"learning_rate": 9.246231155778896e-06, |
|
"loss": 1.1859, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 1.1558078527450562, |
|
"eval_runtime": 106.2052, |
|
"eval_samples_per_second": 9.416, |
|
"eval_steps_per_second": 2.354, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.45963191986084, |
|
"learning_rate": 9.236180904522613e-06, |
|
"loss": 1.1246, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.322927951812744, |
|
"learning_rate": 9.226130653266332e-06, |
|
"loss": 1.1553, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 5.216075420379639, |
|
"learning_rate": 9.216080402010051e-06, |
|
"loss": 1.1181, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 4.784651279449463, |
|
"learning_rate": 9.20603015075377e-06, |
|
"loss": 1.0775, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 6.537864685058594, |
|
"learning_rate": 9.195979899497488e-06, |
|
"loss": 1.1112, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_loss": 1.1949126720428467, |
|
"eval_runtime": 106.2149, |
|
"eval_samples_per_second": 9.415, |
|
"eval_steps_per_second": 2.354, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.4382109642028809, |
|
"learning_rate": 9.185929648241207e-06, |
|
"loss": 1.1235, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 7.871183395385742, |
|
"learning_rate": 9.175879396984926e-06, |
|
"loss": 1.2089, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.897693395614624, |
|
"learning_rate": 9.165829145728645e-06, |
|
"loss": 1.1792, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.8129703998565674, |
|
"learning_rate": 9.155778894472362e-06, |
|
"loss": 1.1458, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.998715877532959, |
|
"learning_rate": 9.14572864321608e-06, |
|
"loss": 1.1321, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 1.165256381034851, |
|
"eval_runtime": 106.0709, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.936214923858643, |
|
"learning_rate": 9.1356783919598e-06, |
|
"loss": 1.1338, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.714348793029785, |
|
"learning_rate": 9.125628140703519e-06, |
|
"loss": 1.1526, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 6.532776832580566, |
|
"learning_rate": 9.115577889447236e-06, |
|
"loss": 1.1279, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 4.0869245529174805, |
|
"learning_rate": 9.105527638190955e-06, |
|
"loss": 1.152, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.13925313949585, |
|
"learning_rate": 9.095477386934674e-06, |
|
"loss": 1.1462, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 1.2190707921981812, |
|
"eval_runtime": 106.0445, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.3696067333221436, |
|
"learning_rate": 9.085427135678393e-06, |
|
"loss": 1.1338, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 4.178169250488281, |
|
"learning_rate": 9.075376884422112e-06, |
|
"loss": 1.1047, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 8.618546485900879, |
|
"learning_rate": 9.06532663316583e-06, |
|
"loss": 1.2217, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 7.047238349914551, |
|
"learning_rate": 9.055276381909548e-06, |
|
"loss": 1.1277, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.2608633041381836, |
|
"learning_rate": 9.045226130653267e-06, |
|
"loss": 1.0982, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 1.2301849126815796, |
|
"eval_runtime": 106.0445, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 7.365833759307861, |
|
"learning_rate": 9.035175879396986e-06, |
|
"loss": 1.1367, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 5.321079254150391, |
|
"learning_rate": 9.025125628140704e-06, |
|
"loss": 1.1288, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 6.2387518882751465, |
|
"learning_rate": 9.015075376884423e-06, |
|
"loss": 1.1415, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.032764911651611, |
|
"learning_rate": 9.005025125628142e-06, |
|
"loss": 1.1471, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 7.161890506744385, |
|
"learning_rate": 8.99497487437186e-06, |
|
"loss": 1.1121, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 1.1580675840377808, |
|
"eval_runtime": 106.0405, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 8.492444038391113, |
|
"learning_rate": 8.984924623115578e-06, |
|
"loss": 1.2182, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 3.7185165882110596, |
|
"learning_rate": 8.974874371859297e-06, |
|
"loss": 1.1466, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 5.3694963455200195, |
|
"learning_rate": 8.964824120603016e-06, |
|
"loss": 1.1329, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 2.5741922855377197, |
|
"learning_rate": 8.954773869346735e-06, |
|
"loss": 1.1466, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.072163105010986, |
|
"learning_rate": 8.944723618090452e-06, |
|
"loss": 1.127, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 1.2112760543823242, |
|
"eval_runtime": 106.0385, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 6.539740085601807, |
|
"learning_rate": 8.934673366834171e-06, |
|
"loss": 1.2167, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 3.4889700412750244, |
|
"learning_rate": 8.92462311557789e-06, |
|
"loss": 1.1264, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.519000053405762, |
|
"learning_rate": 8.914572864321609e-06, |
|
"loss": 1.1552, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 4.674288272857666, |
|
"learning_rate": 8.904522613065328e-06, |
|
"loss": 1.1393, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 7.456840991973877, |
|
"learning_rate": 8.894472361809045e-06, |
|
"loss": 1.1376, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"eval_loss": 1.2394318580627441, |
|
"eval_runtime": 106.0218, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 4.70370626449585, |
|
"learning_rate": 8.884422110552764e-06, |
|
"loss": 1.139, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 7.3822922706604, |
|
"learning_rate": 8.874371859296483e-06, |
|
"loss": 1.1231, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 8.374277114868164, |
|
"learning_rate": 8.864321608040202e-06, |
|
"loss": 1.1627, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 5.440807342529297, |
|
"learning_rate": 8.85427135678392e-06, |
|
"loss": 1.1215, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 8.692358016967773, |
|
"learning_rate": 8.84422110552764e-06, |
|
"loss": 1.163, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_loss": 1.1826081275939941, |
|
"eval_runtime": 106.0303, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.820358991622925, |
|
"learning_rate": 8.834170854271358e-06, |
|
"loss": 1.103, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 7.24362850189209, |
|
"learning_rate": 8.824120603015077e-06, |
|
"loss": 1.0791, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 3.441662311553955, |
|
"learning_rate": 8.814070351758794e-06, |
|
"loss": 1.1786, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 2.2167410850524902, |
|
"learning_rate": 8.804020100502513e-06, |
|
"loss": 1.1805, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.339104652404785, |
|
"learning_rate": 8.793969849246232e-06, |
|
"loss": 1.0919, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 1.1974951028823853, |
|
"eval_runtime": 106.035, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5.801215171813965, |
|
"learning_rate": 8.78391959798995e-06, |
|
"loss": 1.0955, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 10.506399154663086, |
|
"learning_rate": 8.773869346733668e-06, |
|
"loss": 1.2134, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 4.50074577331543, |
|
"learning_rate": 8.763819095477387e-06, |
|
"loss": 1.1828, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.6549763679504395, |
|
"learning_rate": 8.753768844221106e-06, |
|
"loss": 1.1031, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.622566223144531, |
|
"learning_rate": 8.743718592964825e-06, |
|
"loss": 1.108, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 1.206725835800171, |
|
"eval_runtime": 106.0479, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 5.7196221351623535, |
|
"learning_rate": 8.733668341708544e-06, |
|
"loss": 1.1414, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.0874502658843994, |
|
"learning_rate": 8.723618090452261e-06, |
|
"loss": 1.1779, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 3.2144765853881836, |
|
"learning_rate": 8.71356783919598e-06, |
|
"loss": 1.1189, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 3.149327039718628, |
|
"learning_rate": 8.7035175879397e-06, |
|
"loss": 1.2085, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 2.726438522338867, |
|
"learning_rate": 8.693467336683418e-06, |
|
"loss": 1.0836, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"eval_loss": 1.1935946941375732, |
|
"eval_runtime": 106.028, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.5727105140686035, |
|
"learning_rate": 8.683417085427136e-06, |
|
"loss": 1.0865, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.883179187774658, |
|
"learning_rate": 8.673366834170856e-06, |
|
"loss": 1.1212, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 4.398787498474121, |
|
"learning_rate": 8.663316582914574e-06, |
|
"loss": 1.1267, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 3.785742998123169, |
|
"learning_rate": 8.653266331658293e-06, |
|
"loss": 1.135, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.241240978240967, |
|
"learning_rate": 8.64321608040201e-06, |
|
"loss": 1.1242, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_loss": 1.1758685111999512, |
|
"eval_runtime": 106.0542, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 4.4116621017456055, |
|
"learning_rate": 8.63316582914573e-06, |
|
"loss": 1.0878, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 6.320282459259033, |
|
"learning_rate": 8.623115577889448e-06, |
|
"loss": 1.1523, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8.830348014831543, |
|
"learning_rate": 8.613065326633167e-06, |
|
"loss": 1.0909, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.5644683837890625, |
|
"learning_rate": 8.603015075376884e-06, |
|
"loss": 1.196, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 6.190166473388672, |
|
"learning_rate": 8.592964824120603e-06, |
|
"loss": 1.1618, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 1.1882929801940918, |
|
"eval_runtime": 106.0551, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 17.052867889404297, |
|
"learning_rate": 8.582914572864322e-06, |
|
"loss": 1.1788, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 7.739620685577393, |
|
"learning_rate": 8.572864321608041e-06, |
|
"loss": 1.1384, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 9.18273639678955, |
|
"learning_rate": 8.56281407035176e-06, |
|
"loss": 1.1222, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.9803543090820312, |
|
"learning_rate": 8.552763819095477e-06, |
|
"loss": 1.1078, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 3.8314757347106934, |
|
"learning_rate": 8.542713567839196e-06, |
|
"loss": 1.1153, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 1.1791757345199585, |
|
"eval_runtime": 106.2981, |
|
"eval_samples_per_second": 9.408, |
|
"eval_steps_per_second": 2.352, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 7.389952659606934, |
|
"learning_rate": 8.532663316582915e-06, |
|
"loss": 1.0997, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 4.7773051261901855, |
|
"learning_rate": 8.522613065326634e-06, |
|
"loss": 1.1291, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 5.93678092956543, |
|
"learning_rate": 8.512562814070352e-06, |
|
"loss": 1.1771, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 5.244768142700195, |
|
"learning_rate": 8.50251256281407e-06, |
|
"loss": 1.1455, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.881159543991089, |
|
"learning_rate": 8.49246231155779e-06, |
|
"loss": 1.1593, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"eval_loss": 1.174180030822754, |
|
"eval_runtime": 106.0212, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.6950880289077759, |
|
"learning_rate": 8.482412060301509e-06, |
|
"loss": 1.1111, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.720574855804443, |
|
"learning_rate": 8.472361809045226e-06, |
|
"loss": 1.1277, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.957318902015686, |
|
"learning_rate": 8.462311557788947e-06, |
|
"loss": 1.1213, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.463331937789917, |
|
"learning_rate": 8.452261306532664e-06, |
|
"loss": 1.1348, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 4.911657810211182, |
|
"learning_rate": 8.442211055276383e-06, |
|
"loss": 1.183, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 1.1880648136138916, |
|
"eval_runtime": 106.0399, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 3.685622215270996, |
|
"learning_rate": 8.4321608040201e-06, |
|
"loss": 1.0885, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 6.106893539428711, |
|
"learning_rate": 8.42211055276382e-06, |
|
"loss": 1.1155, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.5625643730163574, |
|
"learning_rate": 8.412060301507538e-06, |
|
"loss": 1.1509, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 2.4601857662200928, |
|
"learning_rate": 8.402010050251257e-06, |
|
"loss": 1.1641, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 3.221052408218384, |
|
"learning_rate": 8.391959798994976e-06, |
|
"loss": 1.1614, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 1.2177305221557617, |
|
"eval_runtime": 106.0442, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 7.0826416015625, |
|
"learning_rate": 8.381909547738695e-06, |
|
"loss": 1.2206, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 8.027673721313477, |
|
"learning_rate": 8.371859296482412e-06, |
|
"loss": 1.1221, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.402815103530884, |
|
"learning_rate": 8.361809045226131e-06, |
|
"loss": 1.1168, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 3.39699125289917, |
|
"learning_rate": 8.35175879396985e-06, |
|
"loss": 1.2223, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 7.322509288787842, |
|
"learning_rate": 8.341708542713568e-06, |
|
"loss": 1.13, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_loss": 1.20486319065094, |
|
"eval_runtime": 106.0286, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 8.754172325134277, |
|
"learning_rate": 8.331658291457287e-06, |
|
"loss": 1.0929, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 5.245049476623535, |
|
"learning_rate": 8.321608040201006e-06, |
|
"loss": 1.0899, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.368318557739258, |
|
"learning_rate": 8.311557788944725e-06, |
|
"loss": 1.1458, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.112353563308716, |
|
"learning_rate": 8.301507537688442e-06, |
|
"loss": 1.1361, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 5.4094953536987305, |
|
"learning_rate": 8.291457286432163e-06, |
|
"loss": 1.1214, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 1.249082088470459, |
|
"eval_runtime": 106.0393, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 6.122429370880127, |
|
"learning_rate": 8.28140703517588e-06, |
|
"loss": 1.1201, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 8.071257591247559, |
|
"learning_rate": 8.271356783919599e-06, |
|
"loss": 1.0958, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 4.817784309387207, |
|
"learning_rate": 8.261306532663316e-06, |
|
"loss": 1.1187, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.626766324043274, |
|
"learning_rate": 8.251256281407037e-06, |
|
"loss": 1.1443, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 3.9109134674072266, |
|
"learning_rate": 8.241206030150754e-06, |
|
"loss": 1.1612, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_loss": 1.168979287147522, |
|
"eval_runtime": 106.0316, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 5.2552337646484375, |
|
"learning_rate": 8.231155778894473e-06, |
|
"loss": 1.1636, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.994833946228027, |
|
"learning_rate": 8.221105527638192e-06, |
|
"loss": 1.1199, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 6.6655449867248535, |
|
"learning_rate": 8.211055276381911e-06, |
|
"loss": 1.1118, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 7.384323596954346, |
|
"learning_rate": 8.201005025125628e-06, |
|
"loss": 1.1713, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 4.516322612762451, |
|
"learning_rate": 8.190954773869347e-06, |
|
"loss": 1.2118, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 1.1855034828186035, |
|
"eval_runtime": 106.0276, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.098135471343994, |
|
"learning_rate": 8.180904522613066e-06, |
|
"loss": 1.1457, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 6.398161888122559, |
|
"learning_rate": 8.170854271356785e-06, |
|
"loss": 1.1211, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 10.372193336486816, |
|
"learning_rate": 8.160804020100503e-06, |
|
"loss": 1.1691, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 8.23112678527832, |
|
"learning_rate": 8.150753768844222e-06, |
|
"loss": 1.1251, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 4.921475887298584, |
|
"learning_rate": 8.14070351758794e-06, |
|
"loss": 1.092, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_loss": 1.20026433467865, |
|
"eval_runtime": 106.017, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 5.03338623046875, |
|
"learning_rate": 8.130653266331658e-06, |
|
"loss": 1.1199, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 10.756241798400879, |
|
"learning_rate": 8.120603015075379e-06, |
|
"loss": 1.121, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 5.687259674072266, |
|
"learning_rate": 8.110552763819096e-06, |
|
"loss": 1.1623, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.741285562515259, |
|
"learning_rate": 8.100502512562815e-06, |
|
"loss": 1.089, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.4017832279205322, |
|
"learning_rate": 8.090452261306532e-06, |
|
"loss": 1.1537, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"eval_loss": 1.210019588470459, |
|
"eval_runtime": 106.0274, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 27.84754753112793, |
|
"learning_rate": 8.080402010050253e-06, |
|
"loss": 1.1101, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 15.681364059448242, |
|
"learning_rate": 8.07035175879397e-06, |
|
"loss": 1.1033, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 4.995880603790283, |
|
"learning_rate": 8.060301507537689e-06, |
|
"loss": 1.1781, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 11.795672416687012, |
|
"learning_rate": 8.050251256281408e-06, |
|
"loss": 1.0692, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 7.25331449508667, |
|
"learning_rate": 8.040201005025127e-06, |
|
"loss": 1.1582, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 1.1762893199920654, |
|
"eval_runtime": 106.0116, |
|
"eval_samples_per_second": 9.433, |
|
"eval_steps_per_second": 2.358, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 6.3939690589904785, |
|
"learning_rate": 8.030150753768844e-06, |
|
"loss": 1.1748, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.918806552886963, |
|
"learning_rate": 8.020100502512563e-06, |
|
"loss": 1.1929, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 9.905332565307617, |
|
"learning_rate": 8.010050251256282e-06, |
|
"loss": 1.1741, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 2.9581973552703857, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.1134, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 3.484316349029541, |
|
"learning_rate": 7.989949748743719e-06, |
|
"loss": 1.1542, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 1.2277439832687378, |
|
"eval_runtime": 106.0241, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.826357841491699, |
|
"learning_rate": 7.979899497487438e-06, |
|
"loss": 1.1211, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 4.551624774932861, |
|
"learning_rate": 7.969849246231157e-06, |
|
"loss": 1.1811, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.7033650875091553, |
|
"learning_rate": 7.959798994974876e-06, |
|
"loss": 1.1529, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.31227445602417, |
|
"learning_rate": 7.949748743718595e-06, |
|
"loss": 1.1559, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 8.757655143737793, |
|
"learning_rate": 7.939698492462312e-06, |
|
"loss": 1.166, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 1.2115482091903687, |
|
"eval_runtime": 106.0189, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 4.111684322357178, |
|
"learning_rate": 7.929648241206031e-06, |
|
"loss": 1.1683, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.45263409614563, |
|
"learning_rate": 7.91959798994975e-06, |
|
"loss": 1.1146, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.8724279403686523, |
|
"learning_rate": 7.909547738693469e-06, |
|
"loss": 1.1648, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 4.8003387451171875, |
|
"learning_rate": 7.899497487437186e-06, |
|
"loss": 1.1802, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 8.398045539855957, |
|
"learning_rate": 7.889447236180905e-06, |
|
"loss": 1.1438, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 1.1905258893966675, |
|
"eval_runtime": 106.016, |
|
"eval_samples_per_second": 9.433, |
|
"eval_steps_per_second": 2.358, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 5.731268882751465, |
|
"learning_rate": 7.879396984924622e-06, |
|
"loss": 1.0726, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 9.864482879638672, |
|
"learning_rate": 7.869346733668343e-06, |
|
"loss": 1.1604, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 3.624190092086792, |
|
"learning_rate": 7.85929648241206e-06, |
|
"loss": 1.1831, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 6.659609317779541, |
|
"learning_rate": 7.84924623115578e-06, |
|
"loss": 1.0539, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 6.585338592529297, |
|
"learning_rate": 7.839195979899498e-06, |
|
"loss": 1.2157, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_loss": 1.199148178100586, |
|
"eval_runtime": 106.0429, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 4.8435893058776855, |
|
"learning_rate": 7.829145728643217e-06, |
|
"loss": 1.07, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 5.895426273345947, |
|
"learning_rate": 7.819095477386935e-06, |
|
"loss": 1.115, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 7.716872692108154, |
|
"learning_rate": 7.809045226130654e-06, |
|
"loss": 1.1016, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 2.7104599475860596, |
|
"learning_rate": 7.798994974874373e-06, |
|
"loss": 1.0983, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 4.22263240814209, |
|
"learning_rate": 7.788944723618092e-06, |
|
"loss": 1.0981, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 1.2477768659591675, |
|
"eval_runtime": 106.0531, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 12.602644920349121, |
|
"learning_rate": 7.77889447236181e-06, |
|
"loss": 1.1739, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 6.040693283081055, |
|
"learning_rate": 7.768844221105528e-06, |
|
"loss": 1.1364, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 4.136214733123779, |
|
"learning_rate": 7.758793969849247e-06, |
|
"loss": 1.1374, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 6.945772647857666, |
|
"learning_rate": 7.748743718592966e-06, |
|
"loss": 1.1533, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 3.4912397861480713, |
|
"learning_rate": 7.738693467336685e-06, |
|
"loss": 1.0981, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_loss": 1.2318669557571411, |
|
"eval_runtime": 106.0223, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.470850944519043, |
|
"learning_rate": 7.728643216080402e-06, |
|
"loss": 1.1453, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 2.3996975421905518, |
|
"learning_rate": 7.718592964824121e-06, |
|
"loss": 1.1357, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 4.165435314178467, |
|
"learning_rate": 7.70854271356784e-06, |
|
"loss": 1.0431, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 3.5636632442474365, |
|
"learning_rate": 7.698492462311559e-06, |
|
"loss": 1.1245, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 5.408770561218262, |
|
"learning_rate": 7.688442211055276e-06, |
|
"loss": 1.131, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 1.1715679168701172, |
|
"eval_runtime": 106.0287, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 8.558609008789062, |
|
"learning_rate": 7.678391959798995e-06, |
|
"loss": 1.0817, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.554088115692139, |
|
"learning_rate": 7.668341708542714e-06, |
|
"loss": 1.1049, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 4.769896507263184, |
|
"learning_rate": 7.658291457286433e-06, |
|
"loss": 1.1414, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 6.120572566986084, |
|
"learning_rate": 7.64824120603015e-06, |
|
"loss": 1.1546, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 9.137899398803711, |
|
"learning_rate": 7.63819095477387e-06, |
|
"loss": 1.0968, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_loss": 1.1701098680496216, |
|
"eval_runtime": 106.0427, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 5.275972843170166, |
|
"learning_rate": 7.628140703517588e-06, |
|
"loss": 1.0813, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 4.437551975250244, |
|
"learning_rate": 7.618090452261308e-06, |
|
"loss": 1.1649, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 3.896280527114868, |
|
"learning_rate": 7.608040201005026e-06, |
|
"loss": 1.1536, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 6.638348579406738, |
|
"learning_rate": 7.597989949748744e-06, |
|
"loss": 1.1122, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 4.344515800476074, |
|
"learning_rate": 7.587939698492463e-06, |
|
"loss": 1.1265, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 1.1937814950942993, |
|
"eval_runtime": 106.0359, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 12.144390106201172, |
|
"learning_rate": 7.577889447236182e-06, |
|
"loss": 1.0717, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 5.6124162673950195, |
|
"learning_rate": 7.5678391959799e-06, |
|
"loss": 1.1462, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 8.556794166564941, |
|
"learning_rate": 7.557788944723619e-06, |
|
"loss": 1.1072, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 9.116814613342285, |
|
"learning_rate": 7.547738693467337e-06, |
|
"loss": 1.1318, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 9.45323371887207, |
|
"learning_rate": 7.537688442211056e-06, |
|
"loss": 1.1285, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_loss": 1.2085939645767212, |
|
"eval_runtime": 106.0572, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.843000411987305, |
|
"learning_rate": 7.527638190954774e-06, |
|
"loss": 1.1281, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 2.768526792526245, |
|
"learning_rate": 7.517587939698493e-06, |
|
"loss": 1.1034, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.8277149200439453, |
|
"learning_rate": 7.507537688442211e-06, |
|
"loss": 1.1364, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.662463665008545, |
|
"learning_rate": 7.49748743718593e-06, |
|
"loss": 1.0998, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.321447372436523, |
|
"learning_rate": 7.487437185929649e-06, |
|
"loss": 1.0505, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 1.1745121479034424, |
|
"eval_runtime": 106.0373, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.473324775695801, |
|
"learning_rate": 7.4773869346733675e-06, |
|
"loss": 1.1277, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 8.012463569641113, |
|
"learning_rate": 7.467336683417086e-06, |
|
"loss": 1.171, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.7822492122650146, |
|
"learning_rate": 7.4572864321608055e-06, |
|
"loss": 1.1144, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 6.848889350891113, |
|
"learning_rate": 7.447236180904524e-06, |
|
"loss": 1.1733, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 5.613380432128906, |
|
"learning_rate": 7.437185929648242e-06, |
|
"loss": 1.1166, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_loss": 1.1962485313415527, |
|
"eval_runtime": 106.0399, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 9.13502311706543, |
|
"learning_rate": 7.42713567839196e-06, |
|
"loss": 1.1202, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.9282798767089844, |
|
"learning_rate": 7.417085427135679e-06, |
|
"loss": 1.146, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.336259365081787, |
|
"learning_rate": 7.407035175879398e-06, |
|
"loss": 1.139, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 6.539892673492432, |
|
"learning_rate": 7.396984924623116e-06, |
|
"loss": 1.101, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.7083210945129395, |
|
"learning_rate": 7.386934673366835e-06, |
|
"loss": 1.2061, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 1.208566665649414, |
|
"eval_runtime": 105.9891, |
|
"eval_samples_per_second": 9.435, |
|
"eval_steps_per_second": 2.359, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 2.5254299640655518, |
|
"learning_rate": 7.376884422110553e-06, |
|
"loss": 1.1518, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.481236457824707, |
|
"learning_rate": 7.366834170854272e-06, |
|
"loss": 1.1229, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.710301399230957, |
|
"learning_rate": 7.35678391959799e-06, |
|
"loss": 1.1179, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 5.444150924682617, |
|
"learning_rate": 7.346733668341709e-06, |
|
"loss": 1.1705, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 5.206109046936035, |
|
"learning_rate": 7.336683417085427e-06, |
|
"loss": 1.0838, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_loss": 1.1918013095855713, |
|
"eval_runtime": 106.0214, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 10.437155723571777, |
|
"learning_rate": 7.326633165829146e-06, |
|
"loss": 1.1246, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 2.975705862045288, |
|
"learning_rate": 7.316582914572865e-06, |
|
"loss": 1.1462, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.456331729888916, |
|
"learning_rate": 7.3065326633165835e-06, |
|
"loss": 1.1436, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.7051470279693604, |
|
"learning_rate": 7.296482412060302e-06, |
|
"loss": 1.127, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 6.190571308135986, |
|
"learning_rate": 7.2864321608040215e-06, |
|
"loss": 1.1008, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 1.1216787099838257, |
|
"eval_runtime": 106.009, |
|
"eval_samples_per_second": 9.433, |
|
"eval_steps_per_second": 2.358, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.3411688804626465, |
|
"learning_rate": 7.27638190954774e-06, |
|
"loss": 1.1983, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 4.207643032073975, |
|
"learning_rate": 7.266331658291458e-06, |
|
"loss": 1.1391, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 12.340192794799805, |
|
"learning_rate": 7.256281407035176e-06, |
|
"loss": 1.1098, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 5.5857367515563965, |
|
"learning_rate": 7.246231155778896e-06, |
|
"loss": 1.1246, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.554783821105957, |
|
"learning_rate": 7.236180904522614e-06, |
|
"loss": 1.0962, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_loss": 1.1856356859207153, |
|
"eval_runtime": 106.0327, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.0475980043411255, |
|
"learning_rate": 7.226130653266332e-06, |
|
"loss": 1.1436, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 15.718121528625488, |
|
"learning_rate": 7.21608040201005e-06, |
|
"loss": 1.141, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 2.8623883724212646, |
|
"learning_rate": 7.206030150753769e-06, |
|
"loss": 1.1262, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.101434230804443, |
|
"learning_rate": 7.195979899497488e-06, |
|
"loss": 1.096, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 8.572381019592285, |
|
"learning_rate": 7.185929648241206e-06, |
|
"loss": 1.156, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 1.2090998888015747, |
|
"eval_runtime": 106.033, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 6.753989219665527, |
|
"learning_rate": 7.175879396984925e-06, |
|
"loss": 1.1645, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.745002269744873, |
|
"learning_rate": 7.165829145728643e-06, |
|
"loss": 1.1904, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.988409042358398, |
|
"learning_rate": 7.155778894472362e-06, |
|
"loss": 1.14, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 12.388657569885254, |
|
"learning_rate": 7.145728643216081e-06, |
|
"loss": 1.1043, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 7.307866096496582, |
|
"learning_rate": 7.1356783919597995e-06, |
|
"loss": 1.1395, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 1.1706377267837524, |
|
"eval_runtime": 106.0325, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.388065814971924, |
|
"learning_rate": 7.125628140703518e-06, |
|
"loss": 1.1336, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.8489246368408203, |
|
"learning_rate": 7.1155778894472375e-06, |
|
"loss": 1.098, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 9.67976188659668, |
|
"learning_rate": 7.105527638190956e-06, |
|
"loss": 1.0673, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 2.9133048057556152, |
|
"learning_rate": 7.095477386934674e-06, |
|
"loss": 1.0832, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 5.992185115814209, |
|
"learning_rate": 7.085427135678392e-06, |
|
"loss": 1.1449, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 1.1525543928146362, |
|
"eval_runtime": 106.0329, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.5347836017608643, |
|
"learning_rate": 7.075376884422112e-06, |
|
"loss": 1.1152, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 6.405416965484619, |
|
"learning_rate": 7.06532663316583e-06, |
|
"loss": 1.1239, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 10.704813957214355, |
|
"learning_rate": 7.055276381909548e-06, |
|
"loss": 1.1007, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 6.187832355499268, |
|
"learning_rate": 7.045226130653266e-06, |
|
"loss": 1.1903, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.4171247482299805, |
|
"learning_rate": 7.035175879396986e-06, |
|
"loss": 1.1101, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 1.1943550109863281, |
|
"eval_runtime": 106.0234, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 10.25555419921875, |
|
"learning_rate": 7.025125628140704e-06, |
|
"loss": 1.1009, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.6381895542144775, |
|
"learning_rate": 7.015075376884422e-06, |
|
"loss": 1.1136, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 10.197813034057617, |
|
"learning_rate": 7.005025125628141e-06, |
|
"loss": 1.1178, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 7.947491645812988, |
|
"learning_rate": 6.99497487437186e-06, |
|
"loss": 1.0774, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 5.598392009735107, |
|
"learning_rate": 6.984924623115578e-06, |
|
"loss": 1.155, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 1.1524125337600708, |
|
"eval_runtime": 106.0253, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.098531723022461, |
|
"learning_rate": 6.974874371859297e-06, |
|
"loss": 1.1064, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.466740131378174, |
|
"learning_rate": 6.9648241206030155e-06, |
|
"loss": 1.1111, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.58650541305542, |
|
"learning_rate": 6.954773869346734e-06, |
|
"loss": 1.129, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.4121707677841187, |
|
"learning_rate": 6.9447236180904535e-06, |
|
"loss": 1.138, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.7431671619415283, |
|
"learning_rate": 6.934673366834172e-06, |
|
"loss": 1.1675, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_loss": 1.1724088191986084, |
|
"eval_runtime": 106.0356, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.1336212158203125, |
|
"learning_rate": 6.92462311557789e-06, |
|
"loss": 1.1452, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.988180637359619, |
|
"learning_rate": 6.914572864321608e-06, |
|
"loss": 1.1288, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.099323749542236, |
|
"learning_rate": 6.904522613065328e-06, |
|
"loss": 1.1179, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.779965877532959, |
|
"learning_rate": 6.894472361809046e-06, |
|
"loss": 1.0647, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 8.382486343383789, |
|
"learning_rate": 6.884422110552764e-06, |
|
"loss": 1.0933, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 1.2039211988449097, |
|
"eval_runtime": 106.0501, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 5.752281665802002, |
|
"learning_rate": 6.874371859296482e-06, |
|
"loss": 1.0914, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 7.070209980010986, |
|
"learning_rate": 6.864321608040202e-06, |
|
"loss": 1.0789, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 10.135692596435547, |
|
"learning_rate": 6.85427135678392e-06, |
|
"loss": 1.1318, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 10.787989616394043, |
|
"learning_rate": 6.844221105527638e-06, |
|
"loss": 1.1505, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.4322993755340576, |
|
"learning_rate": 6.834170854271357e-06, |
|
"loss": 1.1121, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 1.2028348445892334, |
|
"eval_runtime": 106.0697, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 6.8712568283081055, |
|
"learning_rate": 6.824120603015076e-06, |
|
"loss": 1.0593, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 8.722804069519043, |
|
"learning_rate": 6.814070351758794e-06, |
|
"loss": 1.1287, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.802107334136963, |
|
"learning_rate": 6.804020100502513e-06, |
|
"loss": 1.1291, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 14.779809951782227, |
|
"learning_rate": 6.7939698492462315e-06, |
|
"loss": 1.124, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 12.029526710510254, |
|
"learning_rate": 6.7839195979899505e-06, |
|
"loss": 1.1146, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"eval_loss": 1.2012832164764404, |
|
"eval_runtime": 106.0411, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.80298376083374, |
|
"learning_rate": 6.7738693467336695e-06, |
|
"loss": 1.1362, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.3808014392852783, |
|
"learning_rate": 6.763819095477388e-06, |
|
"loss": 1.1042, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.032790184020996, |
|
"learning_rate": 6.753768844221106e-06, |
|
"loss": 1.183, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 21.967342376708984, |
|
"learning_rate": 6.743718592964824e-06, |
|
"loss": 1.1154, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.282263994216919, |
|
"learning_rate": 6.733668341708544e-06, |
|
"loss": 1.0703, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 1.1923493146896362, |
|
"eval_runtime": 106.0475, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 11.197218894958496, |
|
"learning_rate": 6.723618090452262e-06, |
|
"loss": 1.0949, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.288850784301758, |
|
"learning_rate": 6.71356783919598e-06, |
|
"loss": 1.1465, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 5.537724018096924, |
|
"learning_rate": 6.703517587939698e-06, |
|
"loss": 1.1354, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 10.037650108337402, |
|
"learning_rate": 6.693467336683418e-06, |
|
"loss": 1.0584, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 8.176264762878418, |
|
"learning_rate": 6.683417085427136e-06, |
|
"loss": 1.1543, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 1.1587566137313843, |
|
"eval_runtime": 106.0403, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 5.638991832733154, |
|
"learning_rate": 6.673366834170854e-06, |
|
"loss": 1.1664, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 6.490970134735107, |
|
"learning_rate": 6.663316582914573e-06, |
|
"loss": 1.1154, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 5.5730061531066895, |
|
"learning_rate": 6.653266331658292e-06, |
|
"loss": 1.1113, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.730175495147705, |
|
"learning_rate": 6.64321608040201e-06, |
|
"loss": 1.1427, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.3499104976654053, |
|
"learning_rate": 6.633165829145729e-06, |
|
"loss": 1.1063, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"eval_loss": 1.193935751914978, |
|
"eval_runtime": 106.0279, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 10.057549476623535, |
|
"learning_rate": 6.6231155778894475e-06, |
|
"loss": 1.1197, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 15.203615188598633, |
|
"learning_rate": 6.6130653266331665e-06, |
|
"loss": 1.1391, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 7.671562671661377, |
|
"learning_rate": 6.6030150753768855e-06, |
|
"loss": 1.0629, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.142216205596924, |
|
"learning_rate": 6.592964824120604e-06, |
|
"loss": 1.1066, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 7.480587005615234, |
|
"learning_rate": 6.582914572864322e-06, |
|
"loss": 1.1141, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 1.2126845121383667, |
|
"eval_runtime": 106.0466, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 5.307888984680176, |
|
"learning_rate": 6.572864321608042e-06, |
|
"loss": 1.1767, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 5.546361446380615, |
|
"learning_rate": 6.56281407035176e-06, |
|
"loss": 1.0825, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.469991445541382, |
|
"learning_rate": 6.552763819095478e-06, |
|
"loss": 1.09, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 9.490703582763672, |
|
"learning_rate": 6.542713567839196e-06, |
|
"loss": 1.1252, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.142402172088623, |
|
"learning_rate": 6.532663316582916e-06, |
|
"loss": 1.1048, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 1.1838258504867554, |
|
"eval_runtime": 106.0204, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.945019483566284, |
|
"learning_rate": 6.522613065326634e-06, |
|
"loss": 1.163, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 9.670687675476074, |
|
"learning_rate": 6.512562814070352e-06, |
|
"loss": 1.1424, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 7.109795093536377, |
|
"learning_rate": 6.50251256281407e-06, |
|
"loss": 1.0786, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 11.224685668945312, |
|
"learning_rate": 6.492462311557789e-06, |
|
"loss": 1.0738, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 5.65909481048584, |
|
"learning_rate": 6.482412060301508e-06, |
|
"loss": 1.1332, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 1.1802384853363037, |
|
"eval_runtime": 106.0242, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 5.420670032501221, |
|
"learning_rate": 6.472361809045226e-06, |
|
"loss": 1.0465, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 8.849187850952148, |
|
"learning_rate": 6.462311557788945e-06, |
|
"loss": 1.1075, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 5.64987325668335, |
|
"learning_rate": 6.4522613065326635e-06, |
|
"loss": 1.0641, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 5.252566337585449, |
|
"learning_rate": 6.4422110552763825e-06, |
|
"loss": 1.1164, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 8.550498008728027, |
|
"learning_rate": 6.4321608040201015e-06, |
|
"loss": 1.1399, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"eval_loss": 1.1574853658676147, |
|
"eval_runtime": 106.0269, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.474692344665527, |
|
"learning_rate": 6.42211055276382e-06, |
|
"loss": 1.0862, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.71559476852417, |
|
"learning_rate": 6.412060301507538e-06, |
|
"loss": 1.1035, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 5.566168785095215, |
|
"learning_rate": 6.402010050251258e-06, |
|
"loss": 1.114, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 6.755014419555664, |
|
"learning_rate": 6.391959798994976e-06, |
|
"loss": 1.1105, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 7.986637592315674, |
|
"learning_rate": 6.381909547738694e-06, |
|
"loss": 1.1106, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"eval_loss": 1.1570751667022705, |
|
"eval_runtime": 106.0272, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.5939812660217285, |
|
"learning_rate": 6.371859296482412e-06, |
|
"loss": 1.1091, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 6.492833137512207, |
|
"learning_rate": 6.361809045226132e-06, |
|
"loss": 1.1201, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.1696550846099854, |
|
"learning_rate": 6.35175879396985e-06, |
|
"loss": 1.1464, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.580470085144043, |
|
"learning_rate": 6.341708542713568e-06, |
|
"loss": 1.087, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 9.039238929748535, |
|
"learning_rate": 6.331658291457286e-06, |
|
"loss": 1.0744, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_loss": 1.172175407409668, |
|
"eval_runtime": 106.0477, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 6.3673601150512695, |
|
"learning_rate": 6.321608040201006e-06, |
|
"loss": 1.1054, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.981923818588257, |
|
"learning_rate": 6.311557788944724e-06, |
|
"loss": 1.1256, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.649021148681641, |
|
"learning_rate": 6.301507537688442e-06, |
|
"loss": 1.1943, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.58133602142334, |
|
"learning_rate": 6.291457286432161e-06, |
|
"loss": 1.0715, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.975027322769165, |
|
"learning_rate": 6.28140703517588e-06, |
|
"loss": 1.1326, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 1.1794663667678833, |
|
"eval_runtime": 106.2486, |
|
"eval_samples_per_second": 9.412, |
|
"eval_steps_per_second": 2.353, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.90846586227417, |
|
"learning_rate": 6.2713567839195985e-06, |
|
"loss": 1.1318, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 2.7073814868927, |
|
"learning_rate": 6.2613065326633175e-06, |
|
"loss": 1.1162, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.579253673553467, |
|
"learning_rate": 6.251256281407036e-06, |
|
"loss": 1.164, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 7.598400592803955, |
|
"learning_rate": 6.241206030150754e-06, |
|
"loss": 1.1825, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.532858848571777, |
|
"learning_rate": 6.231155778894474e-06, |
|
"loss": 1.1073, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"eval_loss": 1.1997580528259277, |
|
"eval_runtime": 106.0241, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.0677943229675293, |
|
"learning_rate": 6.221105527638192e-06, |
|
"loss": 1.1517, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 5.746025562286377, |
|
"learning_rate": 6.21105527638191e-06, |
|
"loss": 1.1354, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 11.519875526428223, |
|
"learning_rate": 6.201005025125628e-06, |
|
"loss": 1.1621, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 5.125645160675049, |
|
"learning_rate": 6.190954773869348e-06, |
|
"loss": 1.1307, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 8.785491943359375, |
|
"learning_rate": 6.180904522613066e-06, |
|
"loss": 1.1468, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 1.1801879405975342, |
|
"eval_runtime": 106.031, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.312779426574707, |
|
"learning_rate": 6.170854271356784e-06, |
|
"loss": 1.0977, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 10.210089683532715, |
|
"learning_rate": 6.160804020100502e-06, |
|
"loss": 1.1212, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 14.815506935119629, |
|
"learning_rate": 6.150753768844222e-06, |
|
"loss": 1.0945, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.335196495056152, |
|
"learning_rate": 6.14070351758794e-06, |
|
"loss": 1.0615, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 7.525231838226318, |
|
"learning_rate": 6.130653266331658e-06, |
|
"loss": 1.1338, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 1.1894632577896118, |
|
"eval_runtime": 106.0138, |
|
"eval_samples_per_second": 9.433, |
|
"eval_steps_per_second": 2.358, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.6122279167175293, |
|
"learning_rate": 6.120603015075377e-06, |
|
"loss": 1.1506, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 6.508030414581299, |
|
"learning_rate": 6.110552763819096e-06, |
|
"loss": 1.1407, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.020355701446533, |
|
"learning_rate": 6.1005025125628145e-06, |
|
"loss": 1.1289, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 8.432185173034668, |
|
"learning_rate": 6.0904522613065335e-06, |
|
"loss": 1.085, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 6.687023639678955, |
|
"learning_rate": 6.080402010050252e-06, |
|
"loss": 1.0099, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"eval_loss": 1.170699954032898, |
|
"eval_runtime": 106.0277, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.077007293701172, |
|
"learning_rate": 6.070351758793971e-06, |
|
"loss": 1.1303, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 7.5526251792907715, |
|
"learning_rate": 6.06030150753769e-06, |
|
"loss": 1.1067, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.360392093658447, |
|
"learning_rate": 6.050251256281408e-06, |
|
"loss": 1.0872, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 3.9685182571411133, |
|
"learning_rate": 6.040201005025126e-06, |
|
"loss": 1.0929, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 6.292672157287598, |
|
"learning_rate": 6.030150753768844e-06, |
|
"loss": 1.0774, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.157470464706421, |
|
"eval_runtime": 106.0466, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 9.378790855407715, |
|
"learning_rate": 6.020100502512564e-06, |
|
"loss": 1.0823, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.2826924324035645, |
|
"learning_rate": 6.010050251256282e-06, |
|
"loss": 1.153, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 7.502425193786621, |
|
"learning_rate": 6e-06, |
|
"loss": 1.0995, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 5.4071478843688965, |
|
"learning_rate": 5.989949748743718e-06, |
|
"loss": 1.0838, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 9.192337989807129, |
|
"learning_rate": 5.979899497487438e-06, |
|
"loss": 1.0842, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"eval_loss": 1.1935093402862549, |
|
"eval_runtime": 106.0184, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 5.097936630249023, |
|
"learning_rate": 5.969849246231156e-06, |
|
"loss": 1.0933, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 5.131293296813965, |
|
"learning_rate": 5.959798994974874e-06, |
|
"loss": 1.1113, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 3.551987886428833, |
|
"learning_rate": 5.949748743718593e-06, |
|
"loss": 1.1403, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 3.4648571014404297, |
|
"learning_rate": 5.939698492462312e-06, |
|
"loss": 1.0558, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 4.847639083862305, |
|
"learning_rate": 5.9296482412060305e-06, |
|
"loss": 1.1159, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_loss": 1.173964500427246, |
|
"eval_runtime": 106.0026, |
|
"eval_samples_per_second": 9.434, |
|
"eval_steps_per_second": 2.358, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 3.479746103286743, |
|
"learning_rate": 5.9195979899497495e-06, |
|
"loss": 1.0749, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 3.6594090461730957, |
|
"learning_rate": 5.909547738693468e-06, |
|
"loss": 1.1112, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.625340700149536, |
|
"learning_rate": 5.899497487437187e-06, |
|
"loss": 1.0512, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 2.8171932697296143, |
|
"learning_rate": 5.889447236180905e-06, |
|
"loss": 1.0005, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 8.219596862792969, |
|
"learning_rate": 5.879396984924624e-06, |
|
"loss": 1.0944, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 1.1878960132598877, |
|
"eval_runtime": 106.0059, |
|
"eval_samples_per_second": 9.433, |
|
"eval_steps_per_second": 2.358, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 6.593535423278809, |
|
"learning_rate": 5.869346733668342e-06, |
|
"loss": 1.1062, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 12.300980567932129, |
|
"learning_rate": 5.859296482412061e-06, |
|
"loss": 1.0978, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 5.79435396194458, |
|
"learning_rate": 5.84924623115578e-06, |
|
"loss": 1.1111, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 4.353159427642822, |
|
"learning_rate": 5.839195979899498e-06, |
|
"loss": 0.9926, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 4.977834224700928, |
|
"learning_rate": 5.829145728643216e-06, |
|
"loss": 1.1513, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"eval_loss": 1.163717269897461, |
|
"eval_runtime": 106.0412, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 11.844463348388672, |
|
"learning_rate": 5.819095477386936e-06, |
|
"loss": 1.1136, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 2.4753732681274414, |
|
"learning_rate": 5.809045226130654e-06, |
|
"loss": 1.0853, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 12.580413818359375, |
|
"learning_rate": 5.798994974874372e-06, |
|
"loss": 1.1077, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 5.60028600692749, |
|
"learning_rate": 5.78894472361809e-06, |
|
"loss": 1.1063, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.8767462968826294, |
|
"learning_rate": 5.778894472361809e-06, |
|
"loss": 1.0604, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 1.165592074394226, |
|
"eval_runtime": 106.03, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 10.33289623260498, |
|
"learning_rate": 5.768844221105528e-06, |
|
"loss": 1.0902, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 7.651864528656006, |
|
"learning_rate": 5.7587939698492465e-06, |
|
"loss": 1.104, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 8.54739761352539, |
|
"learning_rate": 5.7487437185929655e-06, |
|
"loss": 1.087, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 5.075497150421143, |
|
"learning_rate": 5.738693467336684e-06, |
|
"loss": 1.0538, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 9.329207420349121, |
|
"learning_rate": 5.728643216080403e-06, |
|
"loss": 1.1125, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_loss": 1.1709275245666504, |
|
"eval_runtime": 106.005, |
|
"eval_samples_per_second": 9.434, |
|
"eval_steps_per_second": 2.358, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 6.580108642578125, |
|
"learning_rate": 5.718592964824121e-06, |
|
"loss": 1.1526, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 3.0420515537261963, |
|
"learning_rate": 5.70854271356784e-06, |
|
"loss": 1.0622, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 5.738161563873291, |
|
"learning_rate": 5.698492462311558e-06, |
|
"loss": 1.1277, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 4.498201370239258, |
|
"learning_rate": 5.688442211055277e-06, |
|
"loss": 1.1508, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 9.206507682800293, |
|
"learning_rate": 5.678391959798996e-06, |
|
"loss": 1.15, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"eval_loss": 1.2386019229888916, |
|
"eval_runtime": 106.0267, |
|
"eval_samples_per_second": 9.432, |
|
"eval_steps_per_second": 2.358, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 4.026927471160889, |
|
"learning_rate": 5.668341708542714e-06, |
|
"loss": 1.1185, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 6.961542129516602, |
|
"learning_rate": 5.658291457286432e-06, |
|
"loss": 1.0814, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 9.742774963378906, |
|
"learning_rate": 5.648241206030152e-06, |
|
"loss": 1.1235, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 7.585122108459473, |
|
"learning_rate": 5.63819095477387e-06, |
|
"loss": 1.0564, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 9.282063484191895, |
|
"learning_rate": 5.628140703517588e-06, |
|
"loss": 1.1511, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 1.1688772439956665, |
|
"eval_runtime": 106.0117, |
|
"eval_samples_per_second": 9.433, |
|
"eval_steps_per_second": 2.358, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 7.329659938812256, |
|
"learning_rate": 5.618090452261306e-06, |
|
"loss": 1.101, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 12.55649185180664, |
|
"learning_rate": 5.608040201005026e-06, |
|
"loss": 1.043, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 2.476288318634033, |
|
"learning_rate": 5.597989949748744e-06, |
|
"loss": 1.097, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 4.84243631362915, |
|
"learning_rate": 5.5879396984924625e-06, |
|
"loss": 1.0998, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 5.389865398406982, |
|
"learning_rate": 5.577889447236181e-06, |
|
"loss": 1.128, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"eval_loss": 1.1911169290542603, |
|
"eval_runtime": 106.0388, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 3.9024312496185303, |
|
"learning_rate": 5.5678391959799e-06, |
|
"loss": 1.0754, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 9.319623947143555, |
|
"learning_rate": 5.557788944723619e-06, |
|
"loss": 1.1378, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 3.1492748260498047, |
|
"learning_rate": 5.547738693467337e-06, |
|
"loss": 1.1004, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 8.03979778289795, |
|
"learning_rate": 5.537688442211056e-06, |
|
"loss": 1.0497, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 7.752456188201904, |
|
"learning_rate": 5.527638190954774e-06, |
|
"loss": 1.0543, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 1.1790896654129028, |
|
"eval_runtime": 106.0302, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 2.237823009490967, |
|
"learning_rate": 5.517587939698493e-06, |
|
"loss": 1.1139, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 8.119503021240234, |
|
"learning_rate": 5.507537688442212e-06, |
|
"loss": 1.1112, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 12.357787132263184, |
|
"learning_rate": 5.49748743718593e-06, |
|
"loss": 1.0849, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 13.721491813659668, |
|
"learning_rate": 5.487437185929648e-06, |
|
"loss": 1.0981, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 10.318241119384766, |
|
"learning_rate": 5.477386934673368e-06, |
|
"loss": 1.1199, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"eval_loss": 1.179891586303711, |
|
"eval_runtime": 106.0371, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 8.675165176391602, |
|
"learning_rate": 5.467336683417086e-06, |
|
"loss": 1.0626, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 10.515371322631836, |
|
"learning_rate": 5.457286432160804e-06, |
|
"loss": 1.1091, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 11.576639175415039, |
|
"learning_rate": 5.447236180904522e-06, |
|
"loss": 1.0725, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 9.982847213745117, |
|
"learning_rate": 5.437185929648242e-06, |
|
"loss": 1.0723, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 10.960646629333496, |
|
"learning_rate": 5.42713567839196e-06, |
|
"loss": 1.1203, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_loss": 1.1439155340194702, |
|
"eval_runtime": 106.0632, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 3.7092068195343018, |
|
"learning_rate": 5.4170854271356785e-06, |
|
"loss": 1.049, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.488994836807251, |
|
"learning_rate": 5.407035175879397e-06, |
|
"loss": 1.0726, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.420220375061035, |
|
"learning_rate": 5.3969849246231165e-06, |
|
"loss": 1.104, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 3.8675038814544678, |
|
"learning_rate": 5.386934673366835e-06, |
|
"loss": 1.1328, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 14.8497314453125, |
|
"learning_rate": 5.376884422110553e-06, |
|
"loss": 1.0603, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"eval_loss": 1.1883912086486816, |
|
"eval_runtime": 106.0465, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 9.701748847961426, |
|
"learning_rate": 5.366834170854272e-06, |
|
"loss": 1.0552, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 4.574182510375977, |
|
"learning_rate": 5.356783919597991e-06, |
|
"loss": 1.0861, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 5.884522438049316, |
|
"learning_rate": 5.346733668341709e-06, |
|
"loss": 1.0978, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 4.888044357299805, |
|
"learning_rate": 5.336683417085428e-06, |
|
"loss": 1.1089, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 8.62630558013916, |
|
"learning_rate": 5.326633165829146e-06, |
|
"loss": 1.0617, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"eval_loss": 1.2093132734298706, |
|
"eval_runtime": 106.0518, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 4.283385276794434, |
|
"learning_rate": 5.316582914572864e-06, |
|
"loss": 1.0723, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 2.514089584350586, |
|
"learning_rate": 5.306532663316584e-06, |
|
"loss": 1.0563, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 4.744464874267578, |
|
"learning_rate": 5.296482412060302e-06, |
|
"loss": 1.0867, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 4.583905220031738, |
|
"learning_rate": 5.28643216080402e-06, |
|
"loss": 1.0383, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 4.338555812835693, |
|
"learning_rate": 5.2763819095477384e-06, |
|
"loss": 1.0783, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"eval_loss": 1.1971614360809326, |
|
"eval_runtime": 106.0626, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 4.034645080566406, |
|
"learning_rate": 5.266331658291458e-06, |
|
"loss": 1.1005, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 18.2078857421875, |
|
"learning_rate": 5.256281407035176e-06, |
|
"loss": 1.0802, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 11.409656524658203, |
|
"learning_rate": 5.2462311557788945e-06, |
|
"loss": 1.0279, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 9.654501914978027, |
|
"learning_rate": 5.236180904522613e-06, |
|
"loss": 1.1281, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.338249444961548, |
|
"learning_rate": 5.2261306532663325e-06, |
|
"loss": 1.071, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 1.2035820484161377, |
|
"eval_runtime": 106.069, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 2.0878255367279053, |
|
"learning_rate": 5.216080402010051e-06, |
|
"loss": 1.0815, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 3.829693078994751, |
|
"learning_rate": 5.206030150753769e-06, |
|
"loss": 1.0727, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 8.797452926635742, |
|
"learning_rate": 5.195979899497488e-06, |
|
"loss": 1.111, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 7.87650728225708, |
|
"learning_rate": 5.185929648241207e-06, |
|
"loss": 1.1124, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 6.868736743927002, |
|
"learning_rate": 5.175879396984925e-06, |
|
"loss": 1.0428, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"eval_loss": 1.205499291419983, |
|
"eval_runtime": 106.0571, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 11.863227844238281, |
|
"learning_rate": 5.165829145728644e-06, |
|
"loss": 1.0223, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 4.662557125091553, |
|
"learning_rate": 5.155778894472362e-06, |
|
"loss": 1.0766, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 6.566014766693115, |
|
"learning_rate": 5.145728643216081e-06, |
|
"loss": 1.0794, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 4.543015003204346, |
|
"learning_rate": 5.1356783919598e-06, |
|
"loss": 1.089, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 11.50754451751709, |
|
"learning_rate": 5.125628140703518e-06, |
|
"loss": 1.1178, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"eval_loss": 1.1721173524856567, |
|
"eval_runtime": 106.5237, |
|
"eval_samples_per_second": 9.388, |
|
"eval_steps_per_second": 2.347, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 4.686221599578857, |
|
"learning_rate": 5.115577889447236e-06, |
|
"loss": 1.0752, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 6.304415702819824, |
|
"learning_rate": 5.1055276381909544e-06, |
|
"loss": 1.0687, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 14.939876556396484, |
|
"learning_rate": 5.095477386934674e-06, |
|
"loss": 1.0325, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 6.50522518157959, |
|
"learning_rate": 5.085427135678392e-06, |
|
"loss": 1.0694, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 6.129203796386719, |
|
"learning_rate": 5.0753768844221105e-06, |
|
"loss": 1.0754, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"eval_loss": 1.1991491317749023, |
|
"eval_runtime": 106.0551, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 6.325349807739258, |
|
"learning_rate": 5.065326633165829e-06, |
|
"loss": 1.1652, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 4.5157928466796875, |
|
"learning_rate": 5.0552763819095485e-06, |
|
"loss": 1.0868, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 8.48715877532959, |
|
"learning_rate": 5.045226130653267e-06, |
|
"loss": 1.1151, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 4.27447509765625, |
|
"learning_rate": 5.035175879396985e-06, |
|
"loss": 1.1102, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 7.0696258544921875, |
|
"learning_rate": 5.025125628140704e-06, |
|
"loss": 1.1229, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 1.19700026512146, |
|
"eval_runtime": 106.0396, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 4.845221519470215, |
|
"learning_rate": 5.015075376884423e-06, |
|
"loss": 1.0913, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.8308610916137695, |
|
"learning_rate": 5.005025125628141e-06, |
|
"loss": 1.074, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 11.193860054016113, |
|
"learning_rate": 4.99497487437186e-06, |
|
"loss": 1.0813, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 9.640278816223145, |
|
"learning_rate": 4.984924623115578e-06, |
|
"loss": 1.092, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 8.961405754089355, |
|
"learning_rate": 4.974874371859297e-06, |
|
"loss": 1.0906, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"eval_loss": 1.2116854190826416, |
|
"eval_runtime": 106.3232, |
|
"eval_samples_per_second": 9.405, |
|
"eval_steps_per_second": 2.351, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 3.288750171661377, |
|
"learning_rate": 4.964824120603016e-06, |
|
"loss": 1.0906, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 3.4457216262817383, |
|
"learning_rate": 4.954773869346734e-06, |
|
"loss": 1.0875, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 7.605111598968506, |
|
"learning_rate": 4.944723618090453e-06, |
|
"loss": 1.0967, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 9.561128616333008, |
|
"learning_rate": 4.934673366834171e-06, |
|
"loss": 1.0812, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 7.3962931632995605, |
|
"learning_rate": 4.92462311557789e-06, |
|
"loss": 1.0977, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_loss": 1.2100400924682617, |
|
"eval_runtime": 106.052, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 6.937880992889404, |
|
"learning_rate": 4.914572864321608e-06, |
|
"loss": 1.0975, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 5.143784999847412, |
|
"learning_rate": 4.904522613065327e-06, |
|
"loss": 1.087, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 3.9461379051208496, |
|
"learning_rate": 4.8944723618090455e-06, |
|
"loss": 1.1038, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 5.973447322845459, |
|
"learning_rate": 4.8844221105527645e-06, |
|
"loss": 1.074, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 6.638311386108398, |
|
"learning_rate": 4.874371859296483e-06, |
|
"loss": 1.0614, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"eval_loss": 1.185408115386963, |
|
"eval_runtime": 106.2987, |
|
"eval_samples_per_second": 9.407, |
|
"eval_steps_per_second": 2.352, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 6.998035907745361, |
|
"learning_rate": 4.864321608040201e-06, |
|
"loss": 1.0744, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 3.3675694465637207, |
|
"learning_rate": 4.85427135678392e-06, |
|
"loss": 1.096, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 8.822165489196777, |
|
"learning_rate": 4.844221105527638e-06, |
|
"loss": 1.0922, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 9.377400398254395, |
|
"learning_rate": 4.834170854271357e-06, |
|
"loss": 1.0192, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 5.365954399108887, |
|
"learning_rate": 4.824120603015076e-06, |
|
"loss": 1.0852, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"eval_loss": 1.1685329675674438, |
|
"eval_runtime": 106.0668, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 10.73094367980957, |
|
"learning_rate": 4.814070351758794e-06, |
|
"loss": 1.026, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 8.08397102355957, |
|
"learning_rate": 4.804020100502513e-06, |
|
"loss": 1.1124, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 5.968872547149658, |
|
"learning_rate": 4.793969849246232e-06, |
|
"loss": 1.0756, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 8.512557983398438, |
|
"learning_rate": 4.78391959798995e-06, |
|
"loss": 1.1132, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 15.78652286529541, |
|
"learning_rate": 4.773869346733669e-06, |
|
"loss": 1.0729, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"eval_loss": 1.1879240274429321, |
|
"eval_runtime": 106.0637, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 7.199432849884033, |
|
"learning_rate": 4.763819095477387e-06, |
|
"loss": 1.1142, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.3736164569854736, |
|
"learning_rate": 4.753768844221106e-06, |
|
"loss": 1.0713, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 6.64162540435791, |
|
"learning_rate": 4.743718592964824e-06, |
|
"loss": 1.0596, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 5.036973476409912, |
|
"learning_rate": 4.733668341708543e-06, |
|
"loss": 1.1353, |
|
"step": 52900 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 13.622482299804688, |
|
"learning_rate": 4.7236180904522615e-06, |
|
"loss": 1.0713, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 1.2138935327529907, |
|
"eval_runtime": 106.0336, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 10.74639892578125, |
|
"learning_rate": 4.7135678391959805e-06, |
|
"loss": 1.1092, |
|
"step": 53100 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 2.7003939151763916, |
|
"learning_rate": 4.703517587939699e-06, |
|
"loss": 1.0927, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 7.11260986328125, |
|
"learning_rate": 4.693467336683418e-06, |
|
"loss": 1.0905, |
|
"step": 53300 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 12.401396751403809, |
|
"learning_rate": 4.683417085427136e-06, |
|
"loss": 1.1015, |
|
"step": 53400 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 8.661816596984863, |
|
"learning_rate": 4.673366834170855e-06, |
|
"loss": 1.1312, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"eval_loss": 1.1807825565338135, |
|
"eval_runtime": 106.0587, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 5.789402008056641, |
|
"learning_rate": 4.663316582914573e-06, |
|
"loss": 1.0313, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 10.267823219299316, |
|
"learning_rate": 4.653266331658292e-06, |
|
"loss": 1.1201, |
|
"step": 53700 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 7.390511512756348, |
|
"learning_rate": 4.64321608040201e-06, |
|
"loss": 1.1064, |
|
"step": 53800 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 3.220641613006592, |
|
"learning_rate": 4.633165829145729e-06, |
|
"loss": 1.0898, |
|
"step": 53900 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 4.595098972320557, |
|
"learning_rate": 4.623115577889448e-06, |
|
"loss": 1.0807, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"eval_loss": 1.1710258722305298, |
|
"eval_runtime": 106.0574, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 14.225842475891113, |
|
"learning_rate": 4.613065326633166e-06, |
|
"loss": 1.1179, |
|
"step": 54100 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 2.6075730323791504, |
|
"learning_rate": 4.603015075376885e-06, |
|
"loss": 1.1075, |
|
"step": 54200 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 8.72014045715332, |
|
"learning_rate": 4.592964824120603e-06, |
|
"loss": 1.0653, |
|
"step": 54300 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 13.056175231933594, |
|
"learning_rate": 4.582914572864322e-06, |
|
"loss": 1.106, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 6.238776683807373, |
|
"learning_rate": 4.57286432160804e-06, |
|
"loss": 1.0797, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"eval_loss": 1.1717697381973267, |
|
"eval_runtime": 106.0446, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 4.283226490020752, |
|
"learning_rate": 4.562814070351759e-06, |
|
"loss": 1.0956, |
|
"step": 54600 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 4.9857282638549805, |
|
"learning_rate": 4.5527638190954775e-06, |
|
"loss": 1.1299, |
|
"step": 54700 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 2.51924204826355, |
|
"learning_rate": 4.5427135678391965e-06, |
|
"loss": 1.0938, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 4.603549003601074, |
|
"learning_rate": 4.532663316582915e-06, |
|
"loss": 1.1509, |
|
"step": 54900 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 11.00109577178955, |
|
"learning_rate": 4.522613065326634e-06, |
|
"loss": 1.0956, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"eval_loss": 1.1807342767715454, |
|
"eval_runtime": 106.0599, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 4.154958724975586, |
|
"learning_rate": 4.512562814070352e-06, |
|
"loss": 1.0964, |
|
"step": 55100 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 4.329712867736816, |
|
"learning_rate": 4.502512562814071e-06, |
|
"loss": 1.1373, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 5.342220783233643, |
|
"learning_rate": 4.492462311557789e-06, |
|
"loss": 1.0808, |
|
"step": 55300 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 5.705708980560303, |
|
"learning_rate": 4.482412060301508e-06, |
|
"loss": 1.062, |
|
"step": 55400 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 4.463514804840088, |
|
"learning_rate": 4.472361809045226e-06, |
|
"loss": 1.1105, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"eval_loss": 1.1636874675750732, |
|
"eval_runtime": 106.046, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 3.2776780128479004, |
|
"learning_rate": 4.462311557788945e-06, |
|
"loss": 1.1123, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 7.7830491065979, |
|
"learning_rate": 4.452261306532664e-06, |
|
"loss": 1.0417, |
|
"step": 55700 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 4.776956081390381, |
|
"learning_rate": 4.442211055276382e-06, |
|
"loss": 1.1081, |
|
"step": 55800 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 8.097271919250488, |
|
"learning_rate": 4.432160804020101e-06, |
|
"loss": 1.1005, |
|
"step": 55900 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 7.190639019012451, |
|
"learning_rate": 4.42211055276382e-06, |
|
"loss": 1.0712, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"eval_loss": 1.1957862377166748, |
|
"eval_runtime": 106.0581, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 12.115496635437012, |
|
"learning_rate": 4.412060301507538e-06, |
|
"loss": 1.0484, |
|
"step": 56100 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 3.0836589336395264, |
|
"learning_rate": 4.4020100502512564e-06, |
|
"loss": 1.0373, |
|
"step": 56200 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 5.282203197479248, |
|
"learning_rate": 4.391959798994975e-06, |
|
"loss": 1.1324, |
|
"step": 56300 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 15.216297149658203, |
|
"learning_rate": 4.3819095477386936e-06, |
|
"loss": 1.12, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 5.781431674957275, |
|
"learning_rate": 4.3718592964824125e-06, |
|
"loss": 1.0372, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"eval_loss": 1.1856083869934082, |
|
"eval_runtime": 106.0461, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 7.501257419586182, |
|
"learning_rate": 4.361809045226131e-06, |
|
"loss": 1.049, |
|
"step": 56600 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 4.0562238693237305, |
|
"learning_rate": 4.35175879396985e-06, |
|
"loss": 1.0758, |
|
"step": 56700 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 4.510584831237793, |
|
"learning_rate": 4.341708542713568e-06, |
|
"loss": 1.0304, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 11.4026517868042, |
|
"learning_rate": 4.331658291457287e-06, |
|
"loss": 1.0906, |
|
"step": 56900 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.8201565742492676, |
|
"learning_rate": 4.321608040201005e-06, |
|
"loss": 1.0806, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"eval_loss": 1.1789506673812866, |
|
"eval_runtime": 106.0454, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 5.044394493103027, |
|
"learning_rate": 4.311557788944724e-06, |
|
"loss": 1.0585, |
|
"step": 57100 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 6.520532131195068, |
|
"learning_rate": 4.301507537688442e-06, |
|
"loss": 1.0524, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 5.544332981109619, |
|
"learning_rate": 4.291457286432161e-06, |
|
"loss": 1.0957, |
|
"step": 57300 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 5.975767612457275, |
|
"learning_rate": 4.28140703517588e-06, |
|
"loss": 1.0923, |
|
"step": 57400 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 8.338631629943848, |
|
"learning_rate": 4.271356783919598e-06, |
|
"loss": 1.05, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 1.213557481765747, |
|
"eval_runtime": 106.0582, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 9.21115779876709, |
|
"learning_rate": 4.261306532663317e-06, |
|
"loss": 1.0633, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 4.789947032928467, |
|
"learning_rate": 4.251256281407035e-06, |
|
"loss": 1.06, |
|
"step": 57700 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.622896909713745, |
|
"learning_rate": 4.241206030150754e-06, |
|
"loss": 1.0639, |
|
"step": 57800 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 8.646242141723633, |
|
"learning_rate": 4.231155778894473e-06, |
|
"loss": 1.0433, |
|
"step": 57900 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 5.949797630310059, |
|
"learning_rate": 4.221105527638191e-06, |
|
"loss": 1.0838, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"eval_loss": 1.2052046060562134, |
|
"eval_runtime": 106.0422, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 3.80587100982666, |
|
"learning_rate": 4.21105527638191e-06, |
|
"loss": 1.0948, |
|
"step": 58100 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 11.919468879699707, |
|
"learning_rate": 4.2010050251256285e-06, |
|
"loss": 1.078, |
|
"step": 58200 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 5.601760387420654, |
|
"learning_rate": 4.1909547738693475e-06, |
|
"loss": 1.0883, |
|
"step": 58300 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 9.78034782409668, |
|
"learning_rate": 4.180904522613066e-06, |
|
"loss": 1.0545, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 10.80511474609375, |
|
"learning_rate": 4.170854271356784e-06, |
|
"loss": 1.1235, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"eval_loss": 1.2118693590164185, |
|
"eval_runtime": 106.3097, |
|
"eval_samples_per_second": 9.406, |
|
"eval_steps_per_second": 2.352, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 3.7069671154022217, |
|
"learning_rate": 4.160804020100503e-06, |
|
"loss": 1.1136, |
|
"step": 58600 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 15.624918937683105, |
|
"learning_rate": 4.150753768844221e-06, |
|
"loss": 1.0553, |
|
"step": 58700 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 6.897815227508545, |
|
"learning_rate": 4.14070351758794e-06, |
|
"loss": 1.0706, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 5.351986885070801, |
|
"learning_rate": 4.130653266331658e-06, |
|
"loss": 1.0226, |
|
"step": 58900 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 4.158387660980225, |
|
"learning_rate": 4.120603015075377e-06, |
|
"loss": 1.0865, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 1.16471529006958, |
|
"eval_runtime": 106.0354, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.942288398742676, |
|
"learning_rate": 4.110552763819096e-06, |
|
"loss": 1.098, |
|
"step": 59100 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 9.001307487487793, |
|
"learning_rate": 4.100502512562814e-06, |
|
"loss": 1.1024, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 14.217978477478027, |
|
"learning_rate": 4.090452261306533e-06, |
|
"loss": 1.0029, |
|
"step": 59300 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 12.35381031036377, |
|
"learning_rate": 4.080402010050251e-06, |
|
"loss": 1.0757, |
|
"step": 59400 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.351334571838379, |
|
"learning_rate": 4.07035175879397e-06, |
|
"loss": 1.055, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_loss": 1.1809368133544922, |
|
"eval_runtime": 106.0646, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 5.989234447479248, |
|
"learning_rate": 4.060301507537689e-06, |
|
"loss": 1.034, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 9.616496086120605, |
|
"learning_rate": 4.0502512562814074e-06, |
|
"loss": 1.0885, |
|
"step": 59700 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 7.053142547607422, |
|
"learning_rate": 4.040201005025126e-06, |
|
"loss": 1.0858, |
|
"step": 59800 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 12.209592819213867, |
|
"learning_rate": 4.0301507537688446e-06, |
|
"loss": 1.0442, |
|
"step": 59900 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.404716968536377, |
|
"learning_rate": 4.0201005025125635e-06, |
|
"loss": 1.0734, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 1.1983425617218018, |
|
"eval_runtime": 106.0576, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 5.645810604095459, |
|
"learning_rate": 4.010050251256282e-06, |
|
"loss": 1.0638, |
|
"step": 60100 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 4.7631940841674805, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.0657, |
|
"step": 60200 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.6057679653167725, |
|
"learning_rate": 3.989949748743719e-06, |
|
"loss": 1.0556, |
|
"step": 60300 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 9.091955184936523, |
|
"learning_rate": 3.979899497487438e-06, |
|
"loss": 1.1244, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 6.215171813964844, |
|
"learning_rate": 3.969849246231156e-06, |
|
"loss": 1.1504, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"eval_loss": 1.1710612773895264, |
|
"eval_runtime": 106.0549, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 7.309828758239746, |
|
"learning_rate": 3.959798994974875e-06, |
|
"loss": 1.0624, |
|
"step": 60600 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 5.1372880935668945, |
|
"learning_rate": 3.949748743718593e-06, |
|
"loss": 1.1102, |
|
"step": 60700 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.1944918632507324, |
|
"learning_rate": 3.939698492462311e-06, |
|
"loss": 1.0699, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 11.555052757263184, |
|
"learning_rate": 3.92964824120603e-06, |
|
"loss": 1.0467, |
|
"step": 60900 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 8.433820724487305, |
|
"learning_rate": 3.919597989949749e-06, |
|
"loss": 1.0355, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"eval_loss": 1.1772249937057495, |
|
"eval_runtime": 106.0454, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.291175365447998, |
|
"learning_rate": 3.909547738693467e-06, |
|
"loss": 1.0655, |
|
"step": 61100 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.7827377319335938, |
|
"learning_rate": 3.899497487437186e-06, |
|
"loss": 1.0947, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 12.351645469665527, |
|
"learning_rate": 3.889447236180905e-06, |
|
"loss": 1.0826, |
|
"step": 61300 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 10.359111785888672, |
|
"learning_rate": 3.8793969849246234e-06, |
|
"loss": 1.0938, |
|
"step": 61400 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 4.723474979400635, |
|
"learning_rate": 3.869346733668342e-06, |
|
"loss": 1.0753, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"eval_loss": 1.1773525476455688, |
|
"eval_runtime": 106.0341, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 7.097861289978027, |
|
"learning_rate": 3.8592964824120606e-06, |
|
"loss": 1.039, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.926146984100342, |
|
"learning_rate": 3.8492462311557795e-06, |
|
"loss": 1.1176, |
|
"step": 61700 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 9.322738647460938, |
|
"learning_rate": 3.839195979899498e-06, |
|
"loss": 1.0327, |
|
"step": 61800 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 13.57543659210205, |
|
"learning_rate": 3.829145728643217e-06, |
|
"loss": 1.1169, |
|
"step": 61900 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 4.7055535316467285, |
|
"learning_rate": 3.819095477386935e-06, |
|
"loss": 1.0436, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"eval_loss": 1.1872432231903076, |
|
"eval_runtime": 106.0793, |
|
"eval_samples_per_second": 9.427, |
|
"eval_steps_per_second": 2.357, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 3.1538665294647217, |
|
"learning_rate": 3.809045226130654e-06, |
|
"loss": 1.0537, |
|
"step": 62100 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 8.150029182434082, |
|
"learning_rate": 3.798994974874372e-06, |
|
"loss": 1.0598, |
|
"step": 62200 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 4.418848037719727, |
|
"learning_rate": 3.788944723618091e-06, |
|
"loss": 1.0996, |
|
"step": 62300 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 5.905966281890869, |
|
"learning_rate": 3.7788944723618095e-06, |
|
"loss": 1.0735, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 3.2698171138763428, |
|
"learning_rate": 3.768844221105528e-06, |
|
"loss": 1.0281, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"eval_loss": 1.2369818687438965, |
|
"eval_runtime": 106.0418, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 8.885529518127441, |
|
"learning_rate": 3.7587939698492466e-06, |
|
"loss": 1.056, |
|
"step": 62600 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 6.061652660369873, |
|
"learning_rate": 3.748743718592965e-06, |
|
"loss": 1.0571, |
|
"step": 62700 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 6.784704685211182, |
|
"learning_rate": 3.7386934673366837e-06, |
|
"loss": 1.0289, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 5.635735034942627, |
|
"learning_rate": 3.7286432160804027e-06, |
|
"loss": 1.0872, |
|
"step": 62900 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 7.111885070800781, |
|
"learning_rate": 3.718592964824121e-06, |
|
"loss": 1.0963, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"eval_loss": 1.1708903312683105, |
|
"eval_runtime": 106.032, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 3.6875112056732178, |
|
"learning_rate": 3.7085427135678394e-06, |
|
"loss": 1.0496, |
|
"step": 63100 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 6.22719669342041, |
|
"learning_rate": 3.698492462311558e-06, |
|
"loss": 1.0766, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 5.189857482910156, |
|
"learning_rate": 3.6884422110552766e-06, |
|
"loss": 1.0274, |
|
"step": 63300 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 5.517366409301758, |
|
"learning_rate": 3.678391959798995e-06, |
|
"loss": 1.0942, |
|
"step": 63400 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 9.920944213867188, |
|
"learning_rate": 3.6683417085427137e-06, |
|
"loss": 1.0694, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"eval_loss": 1.1612495183944702, |
|
"eval_runtime": 106.0488, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 30.120113372802734, |
|
"learning_rate": 3.6582914572864327e-06, |
|
"loss": 1.0943, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 6.0459771156311035, |
|
"learning_rate": 3.648241206030151e-06, |
|
"loss": 1.0807, |
|
"step": 63700 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 6.1749653816223145, |
|
"learning_rate": 3.63819095477387e-06, |
|
"loss": 1.1387, |
|
"step": 63800 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 4.901200771331787, |
|
"learning_rate": 3.628140703517588e-06, |
|
"loss": 1.0758, |
|
"step": 63900 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 12.518453598022461, |
|
"learning_rate": 3.618090452261307e-06, |
|
"loss": 1.0996, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 1.1155891418457031, |
|
"eval_runtime": 106.0569, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 5.017581462860107, |
|
"learning_rate": 3.608040201005025e-06, |
|
"loss": 1.0532, |
|
"step": 64100 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.2370281219482422, |
|
"learning_rate": 3.597989949748744e-06, |
|
"loss": 1.0878, |
|
"step": 64200 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 11.514025688171387, |
|
"learning_rate": 3.5879396984924626e-06, |
|
"loss": 1.0529, |
|
"step": 64300 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 5.596809387207031, |
|
"learning_rate": 3.577889447236181e-06, |
|
"loss": 1.038, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 5.313028335571289, |
|
"learning_rate": 3.5678391959798997e-06, |
|
"loss": 1.0532, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"eval_loss": 1.1719648838043213, |
|
"eval_runtime": 106.0478, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 10.414960861206055, |
|
"learning_rate": 3.5577889447236187e-06, |
|
"loss": 1.0048, |
|
"step": 64600 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 13.392841339111328, |
|
"learning_rate": 3.547738693467337e-06, |
|
"loss": 1.1014, |
|
"step": 64700 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 3.2335710525512695, |
|
"learning_rate": 3.537688442211056e-06, |
|
"loss": 1.1013, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 4.216681003570557, |
|
"learning_rate": 3.527638190954774e-06, |
|
"loss": 1.0363, |
|
"step": 64900 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 3.247030735015869, |
|
"learning_rate": 3.517587939698493e-06, |
|
"loss": 1.0747, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"eval_loss": 1.1740339994430542, |
|
"eval_runtime": 106.0636, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 10.313753128051758, |
|
"learning_rate": 3.507537688442211e-06, |
|
"loss": 1.0048, |
|
"step": 65100 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.487821102142334, |
|
"learning_rate": 3.49748743718593e-06, |
|
"loss": 1.0597, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 5.214561462402344, |
|
"learning_rate": 3.4874371859296487e-06, |
|
"loss": 1.0923, |
|
"step": 65300 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 7.987942218780518, |
|
"learning_rate": 3.477386934673367e-06, |
|
"loss": 1.0697, |
|
"step": 65400 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 21.811670303344727, |
|
"learning_rate": 3.467336683417086e-06, |
|
"loss": 1.0922, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"eval_loss": 1.1970088481903076, |
|
"eval_runtime": 106.0745, |
|
"eval_samples_per_second": 9.427, |
|
"eval_steps_per_second": 2.357, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.0042884349823, |
|
"learning_rate": 3.457286432160804e-06, |
|
"loss": 1.0802, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 7.716630935668945, |
|
"learning_rate": 3.447236180904523e-06, |
|
"loss": 1.0439, |
|
"step": 65700 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 10.984993934631348, |
|
"learning_rate": 3.437185929648241e-06, |
|
"loss": 1.0531, |
|
"step": 65800 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 7.731362819671631, |
|
"learning_rate": 3.42713567839196e-06, |
|
"loss": 1.0971, |
|
"step": 65900 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 7.619626522064209, |
|
"learning_rate": 3.4170854271356786e-06, |
|
"loss": 1.0754, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"eval_loss": 1.185581088066101, |
|
"eval_runtime": 106.0575, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 10.193381309509277, |
|
"learning_rate": 3.407035175879397e-06, |
|
"loss": 1.0544, |
|
"step": 66100 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.2454402446746826, |
|
"learning_rate": 3.3969849246231158e-06, |
|
"loss": 1.142, |
|
"step": 66200 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 9.75003433227539, |
|
"learning_rate": 3.3869346733668347e-06, |
|
"loss": 1.0923, |
|
"step": 66300 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 7.709816932678223, |
|
"learning_rate": 3.376884422110553e-06, |
|
"loss": 1.0747, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 6.4221696853637695, |
|
"learning_rate": 3.366834170854272e-06, |
|
"loss": 1.0434, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"eval_loss": 1.1786295175552368, |
|
"eval_runtime": 106.0416, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 7.951330661773682, |
|
"learning_rate": 3.35678391959799e-06, |
|
"loss": 1.0725, |
|
"step": 66600 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 6.741400718688965, |
|
"learning_rate": 3.346733668341709e-06, |
|
"loss": 1.0469, |
|
"step": 66700 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 14.190262794494629, |
|
"learning_rate": 3.336683417085427e-06, |
|
"loss": 1.0687, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 9.906428337097168, |
|
"learning_rate": 3.326633165829146e-06, |
|
"loss": 1.0962, |
|
"step": 66900 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 4.775923728942871, |
|
"learning_rate": 3.3165829145728647e-06, |
|
"loss": 1.0659, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_loss": 1.1667512655258179, |
|
"eval_runtime": 106.0757, |
|
"eval_samples_per_second": 9.427, |
|
"eval_steps_per_second": 2.357, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 10.69369888305664, |
|
"learning_rate": 3.3065326633165833e-06, |
|
"loss": 1.0384, |
|
"step": 67100 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 4.059796333312988, |
|
"learning_rate": 3.296482412060302e-06, |
|
"loss": 1.0502, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 3.5036587715148926, |
|
"learning_rate": 3.286432160804021e-06, |
|
"loss": 1.0406, |
|
"step": 67300 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 7.150821685791016, |
|
"learning_rate": 3.276381909547739e-06, |
|
"loss": 1.0384, |
|
"step": 67400 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 4.669068813323975, |
|
"learning_rate": 3.266331658291458e-06, |
|
"loss": 1.0755, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"eval_loss": 1.1525746583938599, |
|
"eval_runtime": 106.0609, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 9.362273216247559, |
|
"learning_rate": 3.256281407035176e-06, |
|
"loss": 1.0794, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 3.6637213230133057, |
|
"learning_rate": 3.2462311557788946e-06, |
|
"loss": 1.0654, |
|
"step": 67700 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 5.397108554840088, |
|
"learning_rate": 3.236180904522613e-06, |
|
"loss": 1.0832, |
|
"step": 67800 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.418199300765991, |
|
"learning_rate": 3.2261306532663318e-06, |
|
"loss": 1.0634, |
|
"step": 67900 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 9.751249313354492, |
|
"learning_rate": 3.2160804020100507e-06, |
|
"loss": 1.0813, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"eval_loss": 1.1616853475570679, |
|
"eval_runtime": 106.044, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 5.580630302429199, |
|
"learning_rate": 3.206030150753769e-06, |
|
"loss": 1.0981, |
|
"step": 68100 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 5.4495530128479, |
|
"learning_rate": 3.195979899497488e-06, |
|
"loss": 1.0737, |
|
"step": 68200 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 12.089224815368652, |
|
"learning_rate": 3.185929648241206e-06, |
|
"loss": 1.0539, |
|
"step": 68300 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 10.440498352050781, |
|
"learning_rate": 3.175879396984925e-06, |
|
"loss": 1.0803, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 10.229782104492188, |
|
"learning_rate": 3.165829145728643e-06, |
|
"loss": 1.0979, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"eval_loss": 1.159399151802063, |
|
"eval_runtime": 106.0674, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 3.6945924758911133, |
|
"learning_rate": 3.155778894472362e-06, |
|
"loss": 1.0627, |
|
"step": 68600 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 5.110106468200684, |
|
"learning_rate": 3.1457286432160807e-06, |
|
"loss": 1.0585, |
|
"step": 68700 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 7.140746593475342, |
|
"learning_rate": 3.1356783919597993e-06, |
|
"loss": 1.0597, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 16.83041000366211, |
|
"learning_rate": 3.125628140703518e-06, |
|
"loss": 0.9972, |
|
"step": 68900 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 5.084952354431152, |
|
"learning_rate": 3.115577889447237e-06, |
|
"loss": 0.9752, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"eval_loss": 1.144303798675537, |
|
"eval_runtime": 106.0475, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 9.237672805786133, |
|
"learning_rate": 3.105527638190955e-06, |
|
"loss": 1.0702, |
|
"step": 69100 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 10.925599098205566, |
|
"learning_rate": 3.095477386934674e-06, |
|
"loss": 1.0242, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 13.037814140319824, |
|
"learning_rate": 3.085427135678392e-06, |
|
"loss": 1.0663, |
|
"step": 69300 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 9.554988861083984, |
|
"learning_rate": 3.075376884422111e-06, |
|
"loss": 1.1384, |
|
"step": 69400 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 3.325300455093384, |
|
"learning_rate": 3.065326633165829e-06, |
|
"loss": 1.0099, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"eval_loss": 1.1666878461837769, |
|
"eval_runtime": 106.0586, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 5.020545959472656, |
|
"learning_rate": 3.055276381909548e-06, |
|
"loss": 1.0712, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 9.31608772277832, |
|
"learning_rate": 3.0452261306532668e-06, |
|
"loss": 1.0718, |
|
"step": 69700 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 4.576101779937744, |
|
"learning_rate": 3.0351758793969853e-06, |
|
"loss": 0.9786, |
|
"step": 69800 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 5.823631286621094, |
|
"learning_rate": 3.025125628140704e-06, |
|
"loss": 1.0892, |
|
"step": 69900 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 7.814849376678467, |
|
"learning_rate": 3.015075376884422e-06, |
|
"loss": 1.0544, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 1.1586934328079224, |
|
"eval_runtime": 106.0553, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 20.614145278930664, |
|
"learning_rate": 3.005025125628141e-06, |
|
"loss": 1.0971, |
|
"step": 70100 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 6.837667942047119, |
|
"learning_rate": 2.994974874371859e-06, |
|
"loss": 1.0629, |
|
"step": 70200 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 11.336121559143066, |
|
"learning_rate": 2.984924623115578e-06, |
|
"loss": 1.0624, |
|
"step": 70300 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 13.510031700134277, |
|
"learning_rate": 2.9748743718592967e-06, |
|
"loss": 1.1043, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 5.644843578338623, |
|
"learning_rate": 2.9648241206030153e-06, |
|
"loss": 1.0754, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"eval_loss": 1.1812278032302856, |
|
"eval_runtime": 106.029, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 6.993515968322754, |
|
"learning_rate": 2.954773869346734e-06, |
|
"loss": 1.0704, |
|
"step": 70600 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 9.949378967285156, |
|
"learning_rate": 2.9447236180904524e-06, |
|
"loss": 1.0647, |
|
"step": 70700 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 11.08406925201416, |
|
"learning_rate": 2.934673366834171e-06, |
|
"loss": 1.0978, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 9.99530029296875, |
|
"learning_rate": 2.92462311557789e-06, |
|
"loss": 1.0333, |
|
"step": 70900 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 12.340422630310059, |
|
"learning_rate": 2.914572864321608e-06, |
|
"loss": 1.0597, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"eval_loss": 1.1510303020477295, |
|
"eval_runtime": 106.3062, |
|
"eval_samples_per_second": 9.407, |
|
"eval_steps_per_second": 2.352, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 3.573578119277954, |
|
"learning_rate": 2.904522613065327e-06, |
|
"loss": 1.1165, |
|
"step": 71100 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 5.8838629722595215, |
|
"learning_rate": 2.894472361809045e-06, |
|
"loss": 1.0916, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 7.589781761169434, |
|
"learning_rate": 2.884422110552764e-06, |
|
"loss": 1.0535, |
|
"step": 71300 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 4.135651111602783, |
|
"learning_rate": 2.8743718592964828e-06, |
|
"loss": 1.0768, |
|
"step": 71400 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 8.806633949279785, |
|
"learning_rate": 2.8643216080402013e-06, |
|
"loss": 1.0945, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"eval_loss": 1.200325846672058, |
|
"eval_runtime": 106.0463, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 3.913125991821289, |
|
"learning_rate": 2.85427135678392e-06, |
|
"loss": 1.0631, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 5.196088790893555, |
|
"learning_rate": 2.8442211055276384e-06, |
|
"loss": 1.0687, |
|
"step": 71700 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 8.308272361755371, |
|
"learning_rate": 2.834170854271357e-06, |
|
"loss": 1.1081, |
|
"step": 71800 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 8.516919136047363, |
|
"learning_rate": 2.824120603015076e-06, |
|
"loss": 1.0712, |
|
"step": 71900 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 10.68675422668457, |
|
"learning_rate": 2.814070351758794e-06, |
|
"loss": 1.1194, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"eval_loss": 1.1641302108764648, |
|
"eval_runtime": 106.0419, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 9.415139198303223, |
|
"learning_rate": 2.804020100502513e-06, |
|
"loss": 1.0774, |
|
"step": 72100 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 3.7503983974456787, |
|
"learning_rate": 2.7939698492462313e-06, |
|
"loss": 1.1268, |
|
"step": 72200 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.8224315643310547, |
|
"learning_rate": 2.78391959798995e-06, |
|
"loss": 1.0452, |
|
"step": 72300 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 13.259425163269043, |
|
"learning_rate": 2.7738693467336684e-06, |
|
"loss": 1.0463, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 5.253272533416748, |
|
"learning_rate": 2.763819095477387e-06, |
|
"loss": 1.0794, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"eval_loss": 1.1821850538253784, |
|
"eval_runtime": 106.0523, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 14.562149047851562, |
|
"learning_rate": 2.753768844221106e-06, |
|
"loss": 1.1187, |
|
"step": 72600 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 13.231179237365723, |
|
"learning_rate": 2.743718592964824e-06, |
|
"loss": 1.0532, |
|
"step": 72700 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 5.84453010559082, |
|
"learning_rate": 2.733668341708543e-06, |
|
"loss": 1.066, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 8.251835823059082, |
|
"learning_rate": 2.723618090452261e-06, |
|
"loss": 1.0764, |
|
"step": 72900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 6.283970832824707, |
|
"learning_rate": 2.71356783919598e-06, |
|
"loss": 1.1205, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"eval_loss": 1.137833833694458, |
|
"eval_runtime": 106.0617, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 5.204836845397949, |
|
"learning_rate": 2.7035175879396983e-06, |
|
"loss": 1.0788, |
|
"step": 73100 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 12.395988464355469, |
|
"learning_rate": 2.6934673366834173e-06, |
|
"loss": 1.0342, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 7.316469669342041, |
|
"learning_rate": 2.683417085427136e-06, |
|
"loss": 1.1122, |
|
"step": 73300 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 5.458342552185059, |
|
"learning_rate": 2.6733668341708545e-06, |
|
"loss": 1.0277, |
|
"step": 73400 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 4.2949981689453125, |
|
"learning_rate": 2.663316582914573e-06, |
|
"loss": 1.0895, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"eval_loss": 1.1763978004455566, |
|
"eval_runtime": 106.047, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 3.5636932849884033, |
|
"learning_rate": 2.653266331658292e-06, |
|
"loss": 1.0559, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 10.85332202911377, |
|
"learning_rate": 2.64321608040201e-06, |
|
"loss": 1.0593, |
|
"step": 73700 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 7.5514750480651855, |
|
"learning_rate": 2.633165829145729e-06, |
|
"loss": 1.0819, |
|
"step": 73800 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 9.394440650939941, |
|
"learning_rate": 2.6231155778894473e-06, |
|
"loss": 1.0498, |
|
"step": 73900 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 9.17789363861084, |
|
"learning_rate": 2.6130653266331663e-06, |
|
"loss": 1.1306, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"eval_loss": 1.1681709289550781, |
|
"eval_runtime": 106.0499, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 18.37449073791504, |
|
"learning_rate": 2.6030150753768844e-06, |
|
"loss": 1.051, |
|
"step": 74100 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.33835768699646, |
|
"learning_rate": 2.5929648241206034e-06, |
|
"loss": 1.0613, |
|
"step": 74200 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 6.537227630615234, |
|
"learning_rate": 2.582914572864322e-06, |
|
"loss": 1.1065, |
|
"step": 74300 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 15.826942443847656, |
|
"learning_rate": 2.5728643216080405e-06, |
|
"loss": 1.0764, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 6.798605918884277, |
|
"learning_rate": 2.562814070351759e-06, |
|
"loss": 1.0653, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"eval_loss": 1.1853090524673462, |
|
"eval_runtime": 106.0658, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 5.614243984222412, |
|
"learning_rate": 2.5527638190954772e-06, |
|
"loss": 1.0322, |
|
"step": 74600 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 4.611671447753906, |
|
"learning_rate": 2.542713567839196e-06, |
|
"loss": 1.0867, |
|
"step": 74700 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 13.089502334594727, |
|
"learning_rate": 2.5326633165829143e-06, |
|
"loss": 1.0698, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 7.631048679351807, |
|
"learning_rate": 2.5226130653266333e-06, |
|
"loss": 1.0094, |
|
"step": 74900 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 8.738151550292969, |
|
"learning_rate": 2.512562814070352e-06, |
|
"loss": 1.0388, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"eval_loss": 1.1503615379333496, |
|
"eval_runtime": 106.071, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 3.997983932495117, |
|
"learning_rate": 2.5025125628140705e-06, |
|
"loss": 1.039, |
|
"step": 75100 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 3.2443947792053223, |
|
"learning_rate": 2.492462311557789e-06, |
|
"loss": 1.0644, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 3.138190507888794, |
|
"learning_rate": 2.482412060301508e-06, |
|
"loss": 1.0382, |
|
"step": 75300 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 8.46721363067627, |
|
"learning_rate": 2.4723618090452266e-06, |
|
"loss": 1.0846, |
|
"step": 75400 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 6.676329135894775, |
|
"learning_rate": 2.462311557788945e-06, |
|
"loss": 1.1071, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"eval_loss": 1.1840382814407349, |
|
"eval_runtime": 106.0594, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 4.778306484222412, |
|
"learning_rate": 2.4522613065326637e-06, |
|
"loss": 1.0251, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 10.008584022521973, |
|
"learning_rate": 2.4422110552763823e-06, |
|
"loss": 1.0547, |
|
"step": 75700 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 7.311899185180664, |
|
"learning_rate": 2.4321608040201004e-06, |
|
"loss": 1.04, |
|
"step": 75800 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 18.295940399169922, |
|
"learning_rate": 2.422110552763819e-06, |
|
"loss": 1.0708, |
|
"step": 75900 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 8.365277290344238, |
|
"learning_rate": 2.412060301507538e-06, |
|
"loss": 0.9736, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"eval_loss": 1.1644206047058105, |
|
"eval_runtime": 106.0488, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 7.482743263244629, |
|
"learning_rate": 2.4020100502512565e-06, |
|
"loss": 1.0588, |
|
"step": 76100 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 6.637324333190918, |
|
"learning_rate": 2.391959798994975e-06, |
|
"loss": 1.0029, |
|
"step": 76200 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 4.1734089851379395, |
|
"learning_rate": 2.3819095477386936e-06, |
|
"loss": 1.089, |
|
"step": 76300 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 16.437904357910156, |
|
"learning_rate": 2.371859296482412e-06, |
|
"loss": 1.0594, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 7.176886081695557, |
|
"learning_rate": 2.3618090452261308e-06, |
|
"loss": 1.0755, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"eval_loss": 1.1777397394180298, |
|
"eval_runtime": 106.0428, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 5.481789588928223, |
|
"learning_rate": 2.3517587939698493e-06, |
|
"loss": 1.0636, |
|
"step": 76600 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 8.8861083984375, |
|
"learning_rate": 2.341708542713568e-06, |
|
"loss": 1.0088, |
|
"step": 76700 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 9.045199394226074, |
|
"learning_rate": 2.3316582914572865e-06, |
|
"loss": 1.0385, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 8.271011352539062, |
|
"learning_rate": 2.321608040201005e-06, |
|
"loss": 1.0585, |
|
"step": 76900 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 5.93958854675293, |
|
"learning_rate": 2.311557788944724e-06, |
|
"loss": 1.0454, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"eval_loss": 1.1677011251449585, |
|
"eval_runtime": 106.0657, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 4.292805194854736, |
|
"learning_rate": 2.3015075376884426e-06, |
|
"loss": 1.1054, |
|
"step": 77100 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 6.812904357910156, |
|
"learning_rate": 2.291457286432161e-06, |
|
"loss": 1.104, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 5.912458896636963, |
|
"learning_rate": 2.2814070351758797e-06, |
|
"loss": 1.0717, |
|
"step": 77300 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 11.724600791931152, |
|
"learning_rate": 2.2713567839195983e-06, |
|
"loss": 1.0796, |
|
"step": 77400 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 8.604060173034668, |
|
"learning_rate": 2.261306532663317e-06, |
|
"loss": 1.0793, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"eval_loss": 1.1650307178497314, |
|
"eval_runtime": 106.053, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 6.0530476570129395, |
|
"learning_rate": 2.2512562814070354e-06, |
|
"loss": 1.0121, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 4.569116115570068, |
|
"learning_rate": 2.241206030150754e-06, |
|
"loss": 1.0333, |
|
"step": 77700 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 6.292867660522461, |
|
"learning_rate": 2.2311557788944725e-06, |
|
"loss": 1.0363, |
|
"step": 77800 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 5.912831783294678, |
|
"learning_rate": 2.221105527638191e-06, |
|
"loss": 1.0515, |
|
"step": 77900 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 7.606415271759033, |
|
"learning_rate": 2.21105527638191e-06, |
|
"loss": 1.0151, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_loss": 1.1836047172546387, |
|
"eval_runtime": 106.0598, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 4.176692008972168, |
|
"learning_rate": 2.2010050251256282e-06, |
|
"loss": 1.0719, |
|
"step": 78100 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 15.275976181030273, |
|
"learning_rate": 2.1909547738693468e-06, |
|
"loss": 1.0158, |
|
"step": 78200 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 5.714578151702881, |
|
"learning_rate": 2.1809045226130653e-06, |
|
"loss": 1.0678, |
|
"step": 78300 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.8382607698440552, |
|
"learning_rate": 2.170854271356784e-06, |
|
"loss": 1.045, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 7.254737377166748, |
|
"learning_rate": 2.1608040201005025e-06, |
|
"loss": 1.0161, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"eval_loss": 1.1580357551574707, |
|
"eval_runtime": 106.0396, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 8.868572235107422, |
|
"learning_rate": 2.150753768844221e-06, |
|
"loss": 1.0734, |
|
"step": 78600 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 10.592870712280273, |
|
"learning_rate": 2.14070351758794e-06, |
|
"loss": 1.0544, |
|
"step": 78700 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 8.468735694885254, |
|
"learning_rate": 2.1306532663316586e-06, |
|
"loss": 1.1248, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 5.241983890533447, |
|
"learning_rate": 2.120603015075377e-06, |
|
"loss": 1.0655, |
|
"step": 78900 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 4.542044162750244, |
|
"learning_rate": 2.1105527638190957e-06, |
|
"loss": 1.0258, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"eval_loss": 1.1469522714614868, |
|
"eval_runtime": 106.139, |
|
"eval_samples_per_second": 9.422, |
|
"eval_steps_per_second": 2.355, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 9.26364803314209, |
|
"learning_rate": 2.1005025125628143e-06, |
|
"loss": 1.0528, |
|
"step": 79100 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 4.515612602233887, |
|
"learning_rate": 2.090452261306533e-06, |
|
"loss": 1.0788, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 16.94777488708496, |
|
"learning_rate": 2.0804020100502514e-06, |
|
"loss": 1.0541, |
|
"step": 79300 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 3.827336549758911, |
|
"learning_rate": 2.07035175879397e-06, |
|
"loss": 1.0475, |
|
"step": 79400 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.7908217906951904, |
|
"learning_rate": 2.0603015075376885e-06, |
|
"loss": 1.1173, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"eval_loss": 1.2008851766586304, |
|
"eval_runtime": 106.0493, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 4.2991623878479, |
|
"learning_rate": 2.050251256281407e-06, |
|
"loss": 1.029, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 6.991485595703125, |
|
"learning_rate": 2.0402010050251257e-06, |
|
"loss": 1.0284, |
|
"step": 79700 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 12.238316535949707, |
|
"learning_rate": 2.0301507537688446e-06, |
|
"loss": 1.1032, |
|
"step": 79800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 4.9090447425842285, |
|
"learning_rate": 2.020100502512563e-06, |
|
"loss": 1.0962, |
|
"step": 79900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 3.7181665897369385, |
|
"learning_rate": 2.0100502512562818e-06, |
|
"loss": 0.9719, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.1909617185592651, |
|
"eval_runtime": 106.0461, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 10.567878723144531, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.0297, |
|
"step": 80100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 18.70653533935547, |
|
"learning_rate": 1.989949748743719e-06, |
|
"loss": 0.9519, |
|
"step": 80200 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 9.118350982666016, |
|
"learning_rate": 1.9798994974874375e-06, |
|
"loss": 1.0837, |
|
"step": 80300 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 7.65001916885376, |
|
"learning_rate": 1.9698492462311556e-06, |
|
"loss": 1.0141, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 10.12730598449707, |
|
"learning_rate": 1.9597989949748746e-06, |
|
"loss": 0.9636, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"eval_loss": 1.2351462841033936, |
|
"eval_runtime": 106.0686, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 4.90811824798584, |
|
"learning_rate": 1.949748743718593e-06, |
|
"loss": 1.0283, |
|
"step": 80600 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 13.02475357055664, |
|
"learning_rate": 1.9396984924623117e-06, |
|
"loss": 1.0575, |
|
"step": 80700 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 6.416511535644531, |
|
"learning_rate": 1.9296482412060303e-06, |
|
"loss": 1.0341, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 3.5513508319854736, |
|
"learning_rate": 1.919597989949749e-06, |
|
"loss": 0.9724, |
|
"step": 80900 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 9.833860397338867, |
|
"learning_rate": 1.9095477386934674e-06, |
|
"loss": 0.9616, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"eval_loss": 1.2099558115005493, |
|
"eval_runtime": 106.0421, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.1147024631500244, |
|
"learning_rate": 1.899497487437186e-06, |
|
"loss": 1.0576, |
|
"step": 81100 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 11.194029808044434, |
|
"learning_rate": 1.8894472361809047e-06, |
|
"loss": 1.02, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 8.503270149230957, |
|
"learning_rate": 1.8793969849246233e-06, |
|
"loss": 1.0283, |
|
"step": 81300 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 8.470173835754395, |
|
"learning_rate": 1.8693467336683419e-06, |
|
"loss": 1.124, |
|
"step": 81400 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 5.411770343780518, |
|
"learning_rate": 1.8592964824120604e-06, |
|
"loss": 1.0552, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"eval_loss": 1.1827166080474854, |
|
"eval_runtime": 106.0475, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 12.674576759338379, |
|
"learning_rate": 1.849246231155779e-06, |
|
"loss": 1.0509, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 11.850838661193848, |
|
"learning_rate": 1.8391959798994976e-06, |
|
"loss": 1.0259, |
|
"step": 81700 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 13.24087142944336, |
|
"learning_rate": 1.8291457286432163e-06, |
|
"loss": 0.9819, |
|
"step": 81800 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 4.519367694854736, |
|
"learning_rate": 1.819095477386935e-06, |
|
"loss": 0.9872, |
|
"step": 81900 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 9.235967636108398, |
|
"learning_rate": 1.8090452261306535e-06, |
|
"loss": 1.0907, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_loss": 1.2134792804718018, |
|
"eval_runtime": 106.0602, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 20.29911994934082, |
|
"learning_rate": 1.798994974874372e-06, |
|
"loss": 1.064, |
|
"step": 82100 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 14.744649887084961, |
|
"learning_rate": 1.7889447236180906e-06, |
|
"loss": 0.935, |
|
"step": 82200 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 13.829286575317383, |
|
"learning_rate": 1.7788944723618094e-06, |
|
"loss": 1.0424, |
|
"step": 82300 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 29.574548721313477, |
|
"learning_rate": 1.768844221105528e-06, |
|
"loss": 0.9826, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 24.981372833251953, |
|
"learning_rate": 1.7587939698492465e-06, |
|
"loss": 1.0717, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"eval_loss": 1.2202541828155518, |
|
"eval_runtime": 106.0305, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 4.073423862457275, |
|
"learning_rate": 1.748743718592965e-06, |
|
"loss": 1.0153, |
|
"step": 82600 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 4.531286716461182, |
|
"learning_rate": 1.7386934673366834e-06, |
|
"loss": 1.0284, |
|
"step": 82700 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 17.423730850219727, |
|
"learning_rate": 1.728643216080402e-06, |
|
"loss": 1.0107, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 11.03442096710205, |
|
"learning_rate": 1.7185929648241205e-06, |
|
"loss": 1.0691, |
|
"step": 82900 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 6.14145565032959, |
|
"learning_rate": 1.7085427135678393e-06, |
|
"loss": 1.008, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"eval_loss": 1.1811891794204712, |
|
"eval_runtime": 106.0519, |
|
"eval_samples_per_second": 9.429, |
|
"eval_steps_per_second": 2.357, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 7.994312286376953, |
|
"learning_rate": 1.6984924623115579e-06, |
|
"loss": 1.045, |
|
"step": 83100 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 9.712363243103027, |
|
"learning_rate": 1.6884422110552764e-06, |
|
"loss": 0.9937, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 6.869166851043701, |
|
"learning_rate": 1.678391959798995e-06, |
|
"loss": 1.0208, |
|
"step": 83300 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 6.159416675567627, |
|
"learning_rate": 1.6683417085427136e-06, |
|
"loss": 0.955, |
|
"step": 83400 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 11.284279823303223, |
|
"learning_rate": 1.6582914572864323e-06, |
|
"loss": 0.9479, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"eval_loss": 1.1792709827423096, |
|
"eval_runtime": 106.0661, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 4.538952350616455, |
|
"learning_rate": 1.648241206030151e-06, |
|
"loss": 0.9935, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 8.99368953704834, |
|
"learning_rate": 1.6381909547738695e-06, |
|
"loss": 1.0084, |
|
"step": 83700 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 6.514503002166748, |
|
"learning_rate": 1.628140703517588e-06, |
|
"loss": 0.9761, |
|
"step": 83800 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 11.572303771972656, |
|
"learning_rate": 1.6180904522613066e-06, |
|
"loss": 1.0081, |
|
"step": 83900 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 7.4358673095703125, |
|
"learning_rate": 1.6080402010050254e-06, |
|
"loss": 1.0011, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"eval_loss": 1.201202392578125, |
|
"eval_runtime": 106.042, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 6.303736209869385, |
|
"learning_rate": 1.597989949748744e-06, |
|
"loss": 1.0016, |
|
"step": 84100 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 16.139482498168945, |
|
"learning_rate": 1.5879396984924625e-06, |
|
"loss": 1.0374, |
|
"step": 84200 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 9.819048881530762, |
|
"learning_rate": 1.577889447236181e-06, |
|
"loss": 1.0411, |
|
"step": 84300 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 4.718905925750732, |
|
"learning_rate": 1.5678391959798996e-06, |
|
"loss": 1.0069, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 7.538113117218018, |
|
"learning_rate": 1.5577889447236184e-06, |
|
"loss": 0.9829, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"eval_loss": 1.1614010334014893, |
|
"eval_runtime": 106.0653, |
|
"eval_samples_per_second": 9.428, |
|
"eval_steps_per_second": 2.357, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 10.54318904876709, |
|
"learning_rate": 1.547738693467337e-06, |
|
"loss": 1.0193, |
|
"step": 84600 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 5.262185096740723, |
|
"learning_rate": 1.5376884422110555e-06, |
|
"loss": 0.9674, |
|
"step": 84700 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 3.759582281112671, |
|
"learning_rate": 1.527638190954774e-06, |
|
"loss": 0.9791, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 9.718276977539062, |
|
"learning_rate": 1.5175879396984927e-06, |
|
"loss": 1.0632, |
|
"step": 84900 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 3.3487863540649414, |
|
"learning_rate": 1.507537688442211e-06, |
|
"loss": 1.0127, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"eval_loss": 1.2236921787261963, |
|
"eval_runtime": 106.0375, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 11.309074401855469, |
|
"learning_rate": 1.4974874371859296e-06, |
|
"loss": 1.0514, |
|
"step": 85100 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 7.24993896484375, |
|
"learning_rate": 1.4874371859296483e-06, |
|
"loss": 1.0591, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 14.747005462646484, |
|
"learning_rate": 1.477386934673367e-06, |
|
"loss": 1.014, |
|
"step": 85300 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 5.064266204833984, |
|
"learning_rate": 1.4673366834170855e-06, |
|
"loss": 1.0337, |
|
"step": 85400 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 3.267292022705078, |
|
"learning_rate": 1.457286432160804e-06, |
|
"loss": 1.0187, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"eval_loss": 1.1988115310668945, |
|
"eval_runtime": 106.0389, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.358, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 7.9214863777160645, |
|
"learning_rate": 1.4472361809045226e-06, |
|
"loss": 1.0343, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 3.9712471961975098, |
|
"learning_rate": 1.4371859296482414e-06, |
|
"loss": 1.0237, |
|
"step": 85700 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 15.225872039794922, |
|
"learning_rate": 1.42713567839196e-06, |
|
"loss": 0.9932, |
|
"step": 85800 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 13.813911437988281, |
|
"learning_rate": 1.4170854271356785e-06, |
|
"loss": 0.9916, |
|
"step": 85900 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 9.51653003692627, |
|
"learning_rate": 1.407035175879397e-06, |
|
"loss": 0.9697, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"eval_loss": 1.1969990730285645, |
|
"eval_runtime": 106.0377, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.477295398712158, |
|
"learning_rate": 1.3969849246231156e-06, |
|
"loss": 1.0454, |
|
"step": 86100 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 20.349689483642578, |
|
"learning_rate": 1.3869346733668342e-06, |
|
"loss": 1.034, |
|
"step": 86200 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 13.855409622192383, |
|
"learning_rate": 1.376884422110553e-06, |
|
"loss": 1.0126, |
|
"step": 86300 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 6.404795169830322, |
|
"learning_rate": 1.3668341708542715e-06, |
|
"loss": 1.0157, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 24.46071434020996, |
|
"learning_rate": 1.35678391959799e-06, |
|
"loss": 0.9319, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"eval_loss": 1.2477246522903442, |
|
"eval_runtime": 106.0459, |
|
"eval_samples_per_second": 9.43, |
|
"eval_steps_per_second": 2.357, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 8.004197120666504, |
|
"learning_rate": 1.3467336683417087e-06, |
|
"loss": 0.9952, |
|
"step": 86600 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 14.353170394897461, |
|
"learning_rate": 1.3366834170854272e-06, |
|
"loss": 1.0468, |
|
"step": 86700 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 17.298402786254883, |
|
"learning_rate": 1.326633165829146e-06, |
|
"loss": 1.0275, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 12.817279815673828, |
|
"learning_rate": 1.3165829145728646e-06, |
|
"loss": 1.04, |
|
"step": 86900 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 7.053438186645508, |
|
"learning_rate": 1.3065326633165831e-06, |
|
"loss": 0.9107, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"eval_loss": 1.2399846315383911, |
|
"eval_runtime": 106.0347, |
|
"eval_samples_per_second": 9.431, |
|
"eval_steps_per_second": 2.358, |
|
"step": 87000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 1.400884961476608e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|