|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9994447529150472, |
|
"eval_steps": 100, |
|
"global_step": 2700, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0037016472330186935, |
|
"grad_norm": 4.926731287363718, |
|
"learning_rate": 1.8518518518518518e-07, |
|
"loss": 1.2779, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.007403294466037387, |
|
"grad_norm": 5.703433754206953, |
|
"learning_rate": 3.7037037037037036e-07, |
|
"loss": 1.2249, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01110494169905608, |
|
"grad_norm": 4.7147775174955235, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 1.264, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.014806588932074774, |
|
"grad_norm": 4.566443311796311, |
|
"learning_rate": 7.407407407407407e-07, |
|
"loss": 1.2761, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.018508236165093468, |
|
"grad_norm": 3.7939877101308217, |
|
"learning_rate": 9.259259259259259e-07, |
|
"loss": 1.2628, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02220988339811216, |
|
"grad_norm": 3.2608667369377544, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 1.2616, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.025911530631130855, |
|
"grad_norm": 2.693925193560105, |
|
"learning_rate": 1.2962962962962962e-06, |
|
"loss": 1.2135, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.029613177864149548, |
|
"grad_norm": 2.4934538306445972, |
|
"learning_rate": 1.4814814814814815e-06, |
|
"loss": 1.2283, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03331482509716824, |
|
"grad_norm": 2.3738488493804053, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 1.2032, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.037016472330186935, |
|
"grad_norm": 2.003106743138406, |
|
"learning_rate": 1.8518518518518519e-06, |
|
"loss": 1.1681, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.040718119563205625, |
|
"grad_norm": 1.9183775399790004, |
|
"learning_rate": 2.037037037037037e-06, |
|
"loss": 1.1386, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04441976679622432, |
|
"grad_norm": 2.0615982756628775, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 1.1354, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.04812141402924301, |
|
"grad_norm": 1.7647874716370962, |
|
"learning_rate": 2.4074074074074075e-06, |
|
"loss": 1.0794, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05182306126226171, |
|
"grad_norm": 1.6869461430504737, |
|
"learning_rate": 2.5925925925925925e-06, |
|
"loss": 1.1044, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0555247084952804, |
|
"grad_norm": 1.5634247693019654, |
|
"learning_rate": 2.7777777777777783e-06, |
|
"loss": 1.0827, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.059226355728299096, |
|
"grad_norm": 1.5883020831208092, |
|
"learning_rate": 2.962962962962963e-06, |
|
"loss": 1.0348, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06292800296131779, |
|
"grad_norm": 1.647549749764601, |
|
"learning_rate": 3.1481481481481483e-06, |
|
"loss": 1.046, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06662965019433648, |
|
"grad_norm": 1.5287756578720924, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.0328, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07033129742735518, |
|
"grad_norm": 1.6126962083401655, |
|
"learning_rate": 3.5185185185185187e-06, |
|
"loss": 1.0586, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.07403294466037387, |
|
"grad_norm": 1.6109547232696333, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 1.0536, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07403294466037387, |
|
"eval_loss": 1.058408260345459, |
|
"eval_runtime": 3.1704, |
|
"eval_samples_per_second": 40.373, |
|
"eval_steps_per_second": 10.093, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07773459189339256, |
|
"grad_norm": 1.5487151297755306, |
|
"learning_rate": 3.88888888888889e-06, |
|
"loss": 1.0466, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.08143623912641125, |
|
"grad_norm": 1.6206054084800117, |
|
"learning_rate": 4.074074074074074e-06, |
|
"loss": 1.0454, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08513788635942994, |
|
"grad_norm": 1.6388236132514973, |
|
"learning_rate": 4.2592592592592596e-06, |
|
"loss": 1.0093, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.08883953359244864, |
|
"grad_norm": 1.762544462684926, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 1.0323, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09254118082546733, |
|
"grad_norm": 1.5157216909181752, |
|
"learning_rate": 4.62962962962963e-06, |
|
"loss": 0.9805, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.09624282805848602, |
|
"grad_norm": 1.6126727497640228, |
|
"learning_rate": 4.814814814814815e-06, |
|
"loss": 1.0453, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09994447529150471, |
|
"grad_norm": 1.6056041711466675, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9948, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.10364612252452342, |
|
"grad_norm": 1.7214000983830573, |
|
"learning_rate": 5.185185185185185e-06, |
|
"loss": 1.0221, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.10734776975754211, |
|
"grad_norm": 1.790247254335279, |
|
"learning_rate": 5.370370370370371e-06, |
|
"loss": 1.0289, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.1110494169905608, |
|
"grad_norm": 1.9740969224229157, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.9731, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11475106422357949, |
|
"grad_norm": 1.7434094806437848, |
|
"learning_rate": 5.740740740740741e-06, |
|
"loss": 1.0133, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.11845271145659819, |
|
"grad_norm": 1.6569075088545475, |
|
"learning_rate": 5.925925925925926e-06, |
|
"loss": 0.9619, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.12215435868961688, |
|
"grad_norm": 1.5639961881245634, |
|
"learning_rate": 6.111111111111112e-06, |
|
"loss": 0.9748, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.12585600592263557, |
|
"grad_norm": 1.63729125924558, |
|
"learning_rate": 6.296296296296297e-06, |
|
"loss": 0.9719, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12955765315565426, |
|
"grad_norm": 1.6770847599823961, |
|
"learning_rate": 6.481481481481482e-06, |
|
"loss": 0.9678, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.13325930038867295, |
|
"grad_norm": 1.6036024099972268, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.9837, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.13696094762169164, |
|
"grad_norm": 1.7254292999029601, |
|
"learning_rate": 6.851851851851853e-06, |
|
"loss": 0.9544, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.14066259485471036, |
|
"grad_norm": 1.6227891647780306, |
|
"learning_rate": 7.0370370370370375e-06, |
|
"loss": 0.9817, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14436424208772905, |
|
"grad_norm": 1.6731930737572018, |
|
"learning_rate": 7.222222222222223e-06, |
|
"loss": 1.0018, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.14806588932074774, |
|
"grad_norm": 1.8246088602932442, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 0.9952, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14806588932074774, |
|
"eval_loss": 0.9997319579124451, |
|
"eval_runtime": 3.164, |
|
"eval_samples_per_second": 40.455, |
|
"eval_steps_per_second": 10.114, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15176753655376643, |
|
"grad_norm": 1.667016338654756, |
|
"learning_rate": 7.592592592592594e-06, |
|
"loss": 0.9611, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.15546918378678512, |
|
"grad_norm": 1.8324320776220002, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 0.9816, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1591708310198038, |
|
"grad_norm": 1.7407854561265863, |
|
"learning_rate": 7.962962962962963e-06, |
|
"loss": 0.948, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1628724782528225, |
|
"grad_norm": 1.6263652375943793, |
|
"learning_rate": 8.148148148148148e-06, |
|
"loss": 0.9725, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1665741254858412, |
|
"grad_norm": 1.604684517414929, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.9748, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.17027577271885988, |
|
"grad_norm": 1.6240132382335448, |
|
"learning_rate": 8.518518518518519e-06, |
|
"loss": 0.8894, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1739774199518786, |
|
"grad_norm": 1.946689743693115, |
|
"learning_rate": 8.703703703703705e-06, |
|
"loss": 0.9728, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.1776790671848973, |
|
"grad_norm": 1.5910894212139106, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.9586, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.18138071441791598, |
|
"grad_norm": 1.586920219216813, |
|
"learning_rate": 9.074074074074075e-06, |
|
"loss": 0.9915, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.18508236165093467, |
|
"grad_norm": 1.6573463301689406, |
|
"learning_rate": 9.25925925925926e-06, |
|
"loss": 0.9371, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18878400888395336, |
|
"grad_norm": 1.7929171263043344, |
|
"learning_rate": 9.444444444444445e-06, |
|
"loss": 0.9338, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.19248565611697205, |
|
"grad_norm": 1.7413680860932421, |
|
"learning_rate": 9.62962962962963e-06, |
|
"loss": 0.9798, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19618730334999074, |
|
"grad_norm": 1.55205024926949, |
|
"learning_rate": 9.814814814814815e-06, |
|
"loss": 0.9108, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.19988895058300943, |
|
"grad_norm": 1.5777416975125247, |
|
"learning_rate": 1e-05, |
|
"loss": 0.9632, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.20359059781602815, |
|
"grad_norm": 1.9182712726935345, |
|
"learning_rate": 9.999895536228031e-06, |
|
"loss": 0.9661, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.20729224504904684, |
|
"grad_norm": 1.5492637421745667, |
|
"learning_rate": 9.999582149277188e-06, |
|
"loss": 0.9658, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.21099389228206553, |
|
"grad_norm": 1.5573560992355333, |
|
"learning_rate": 9.999059852242508e-06, |
|
"loss": 0.9345, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.21469553951508422, |
|
"grad_norm": 1.5901765655741555, |
|
"learning_rate": 9.998328666948437e-06, |
|
"loss": 0.9638, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.2183971867481029, |
|
"grad_norm": 1.890818029743186, |
|
"learning_rate": 9.997388623947927e-06, |
|
"loss": 0.9614, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.2220988339811216, |
|
"grad_norm": 1.6577126409263625, |
|
"learning_rate": 9.996239762521152e-06, |
|
"loss": 0.9173, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2220988339811216, |
|
"eval_loss": 0.9752744436264038, |
|
"eval_runtime": 3.1471, |
|
"eval_samples_per_second": 40.673, |
|
"eval_steps_per_second": 10.168, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2258004812141403, |
|
"grad_norm": 1.57087297893409, |
|
"learning_rate": 9.994882130673869e-06, |
|
"loss": 0.9587, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.22950212844715898, |
|
"grad_norm": 1.6800412548036792, |
|
"learning_rate": 9.993315785135417e-06, |
|
"loss": 0.9865, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.23320377568017767, |
|
"grad_norm": 1.827251207479932, |
|
"learning_rate": 9.991540791356342e-06, |
|
"loss": 0.9791, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.23690542291319638, |
|
"grad_norm": 1.6957246714853988, |
|
"learning_rate": 9.989557223505661e-06, |
|
"loss": 0.9739, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.24060707014621507, |
|
"grad_norm": 1.6145201433896634, |
|
"learning_rate": 9.987365164467767e-06, |
|
"loss": 0.9403, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.24430871737923376, |
|
"grad_norm": 1.545468057839784, |
|
"learning_rate": 9.98496470583896e-06, |
|
"loss": 0.9484, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24801036461225245, |
|
"grad_norm": 1.69681941694725, |
|
"learning_rate": 9.98235594792363e-06, |
|
"loss": 0.9601, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.25171201184527114, |
|
"grad_norm": 1.620816439467806, |
|
"learning_rate": 9.979538999730047e-06, |
|
"loss": 0.9661, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25541365907828983, |
|
"grad_norm": 1.7174381580440687, |
|
"learning_rate": 9.976513978965829e-06, |
|
"loss": 0.9635, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.2591153063113085, |
|
"grad_norm": 1.526013521731973, |
|
"learning_rate": 9.973281012033009e-06, |
|
"loss": 0.9354, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2628169535443272, |
|
"grad_norm": 1.6267145858558363, |
|
"learning_rate": 9.96984023402275e-06, |
|
"loss": 0.9357, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.2665186007773459, |
|
"grad_norm": 1.648523250153613, |
|
"learning_rate": 9.966191788709716e-06, |
|
"loss": 0.9029, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2702202480103646, |
|
"grad_norm": 1.7121000617109097, |
|
"learning_rate": 9.962335828546049e-06, |
|
"loss": 0.9764, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2739218952433833, |
|
"grad_norm": 1.7087649168563341, |
|
"learning_rate": 9.958272514655006e-06, |
|
"loss": 0.9094, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.277623542476402, |
|
"grad_norm": 1.4952276511040479, |
|
"learning_rate": 9.954002016824226e-06, |
|
"loss": 0.8792, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2813251897094207, |
|
"grad_norm": 1.5879792170425184, |
|
"learning_rate": 9.949524513498636e-06, |
|
"loss": 0.9599, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.2850268369424394, |
|
"grad_norm": 1.6786310875493402, |
|
"learning_rate": 9.944840191772987e-06, |
|
"loss": 0.9066, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2887284841754581, |
|
"grad_norm": 1.6557005217237986, |
|
"learning_rate": 9.939949247384046e-06, |
|
"loss": 0.973, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2924301314084768, |
|
"grad_norm": 1.515758129017559, |
|
"learning_rate": 9.934851884702415e-06, |
|
"loss": 0.9345, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2961317786414955, |
|
"grad_norm": 1.5720836035645664, |
|
"learning_rate": 9.929548316723983e-06, |
|
"loss": 0.9483, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2961317786414955, |
|
"eval_loss": 0.9586808681488037, |
|
"eval_runtime": 3.1572, |
|
"eval_samples_per_second": 40.542, |
|
"eval_steps_per_second": 10.136, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29983342587451417, |
|
"grad_norm": 1.480615187339131, |
|
"learning_rate": 9.924038765061042e-06, |
|
"loss": 0.9001, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.30353507310753286, |
|
"grad_norm": 1.7177590937252007, |
|
"learning_rate": 9.918323459933006e-06, |
|
"loss": 0.9756, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.30723672034055155, |
|
"grad_norm": 1.6291299502048266, |
|
"learning_rate": 9.912402640156812e-06, |
|
"loss": 0.9001, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.31093836757357024, |
|
"grad_norm": 1.8103040276328037, |
|
"learning_rate": 9.906276553136924e-06, |
|
"loss": 0.9351, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.31464001480658893, |
|
"grad_norm": 1.6717258510975974, |
|
"learning_rate": 9.899945454855007e-06, |
|
"loss": 0.9195, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3183416620396076, |
|
"grad_norm": 1.610613790706616, |
|
"learning_rate": 9.893409609859221e-06, |
|
"loss": 0.9224, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3220433092726263, |
|
"grad_norm": 1.5467378469691506, |
|
"learning_rate": 9.886669291253178e-06, |
|
"loss": 0.8938, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.325744956505645, |
|
"grad_norm": 1.5998788496779754, |
|
"learning_rate": 9.879724780684518e-06, |
|
"loss": 0.9549, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.3294466037386637, |
|
"grad_norm": 1.7208144001800147, |
|
"learning_rate": 9.872576368333152e-06, |
|
"loss": 0.9178, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.3331482509716824, |
|
"grad_norm": 1.636440269203003, |
|
"learning_rate": 9.86522435289912e-06, |
|
"loss": 0.9606, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.33684989820470107, |
|
"grad_norm": 1.8573739879825546, |
|
"learning_rate": 9.857669041590135e-06, |
|
"loss": 0.94, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.34055154543771976, |
|
"grad_norm": 1.547923385218674, |
|
"learning_rate": 9.849910750108718e-06, |
|
"loss": 0.987, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.3442531926707385, |
|
"grad_norm": 1.6288545126915284, |
|
"learning_rate": 9.841949802639031e-06, |
|
"loss": 0.94, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3479548399037572, |
|
"grad_norm": 1.6176583181377966, |
|
"learning_rate": 9.833786531833311e-06, |
|
"loss": 0.9029, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3516564871367759, |
|
"grad_norm": 4.489908362910681, |
|
"learning_rate": 9.825421278797984e-06, |
|
"loss": 0.9196, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3553581343697946, |
|
"grad_norm": 1.547885442175379, |
|
"learning_rate": 9.816854393079402e-06, |
|
"loss": 0.9263, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.35905978160281327, |
|
"grad_norm": 1.697999310517363, |
|
"learning_rate": 9.808086232649246e-06, |
|
"loss": 0.8988, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.36276142883583196, |
|
"grad_norm": 1.6467738485343044, |
|
"learning_rate": 9.79911716388956e-06, |
|
"loss": 0.9276, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.36646307606885065, |
|
"grad_norm": 1.519755113687478, |
|
"learning_rate": 9.789947561577445e-06, |
|
"loss": 0.9056, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.37016472330186934, |
|
"grad_norm": 1.5694113805529823, |
|
"learning_rate": 9.7805778088694e-06, |
|
"loss": 0.9606, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.37016472330186934, |
|
"eval_loss": 0.9462636709213257, |
|
"eval_runtime": 3.1559, |
|
"eval_samples_per_second": 40.559, |
|
"eval_steps_per_second": 10.14, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.373866370534888, |
|
"grad_norm": 1.5662257522356287, |
|
"learning_rate": 9.771008297285307e-06, |
|
"loss": 0.8891, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3775680177679067, |
|
"grad_norm": 1.5722068922523773, |
|
"learning_rate": 9.761239426692077e-06, |
|
"loss": 0.9116, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3812696650009254, |
|
"grad_norm": 1.6326942959463115, |
|
"learning_rate": 9.75127160528694e-06, |
|
"loss": 0.879, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3849713122339441, |
|
"grad_norm": 1.6193404458045255, |
|
"learning_rate": 9.741105249580383e-06, |
|
"loss": 0.9046, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3886729594669628, |
|
"grad_norm": 1.7270836199994797, |
|
"learning_rate": 9.730740784378755e-06, |
|
"loss": 0.9031, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3923746066999815, |
|
"grad_norm": 1.5366432768741194, |
|
"learning_rate": 9.7201786427665e-06, |
|
"loss": 0.9372, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.39607625393300017, |
|
"grad_norm": 1.431134939008643, |
|
"learning_rate": 9.709419266088086e-06, |
|
"loss": 0.9198, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.39977790116601886, |
|
"grad_norm": 1.614478850042705, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.8721, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.40347954839903755, |
|
"grad_norm": 1.642303581400171, |
|
"learning_rate": 9.687310614099676e-06, |
|
"loss": 0.9332, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.4071811956320563, |
|
"grad_norm": 1.5593556355035971, |
|
"learning_rate": 9.67596226261095e-06, |
|
"loss": 0.9177, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.410882842865075, |
|
"grad_norm": 1.4787628943323314, |
|
"learning_rate": 9.664418523660004e-06, |
|
"loss": 0.8897, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.41458449009809367, |
|
"grad_norm": 1.5116552711547666, |
|
"learning_rate": 9.652679879607843e-06, |
|
"loss": 0.9102, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.41828613733111236, |
|
"grad_norm": 1.6954184174735383, |
|
"learning_rate": 9.640746820959684e-06, |
|
"loss": 0.9222, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.42198778456413105, |
|
"grad_norm": 1.5800022221709038, |
|
"learning_rate": 9.628619846344453e-06, |
|
"loss": 0.908, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.42568943179714974, |
|
"grad_norm": 1.742875014361042, |
|
"learning_rate": 9.616299462493952e-06, |
|
"loss": 0.9262, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.42939107903016843, |
|
"grad_norm": 1.6869682327085789, |
|
"learning_rate": 9.603786184221693e-06, |
|
"loss": 0.9656, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.4330927262631871, |
|
"grad_norm": 1.5358152756842165, |
|
"learning_rate": 9.591080534401371e-06, |
|
"loss": 0.9008, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.4367943734962058, |
|
"grad_norm": 1.472799741124217, |
|
"learning_rate": 9.578183043945031e-06, |
|
"loss": 0.8985, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4404960207292245, |
|
"grad_norm": 1.5285332686194193, |
|
"learning_rate": 9.565094251780872e-06, |
|
"loss": 0.9227, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.4441976679622432, |
|
"grad_norm": 1.7306276016731665, |
|
"learning_rate": 9.551814704830734e-06, |
|
"loss": 0.9228, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4441976679622432, |
|
"eval_loss": 0.9377050399780273, |
|
"eval_runtime": 3.1816, |
|
"eval_samples_per_second": 40.231, |
|
"eval_steps_per_second": 10.058, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4478993151952619, |
|
"grad_norm": 1.9364310212084868, |
|
"learning_rate": 9.538344957987245e-06, |
|
"loss": 0.8999, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.4516009624282806, |
|
"grad_norm": 1.6600681439267466, |
|
"learning_rate": 9.524685574090627e-06, |
|
"loss": 0.9273, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.45530260966129926, |
|
"grad_norm": 1.5405105840813555, |
|
"learning_rate": 9.51083712390519e-06, |
|
"loss": 0.8671, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.45900425689431795, |
|
"grad_norm": 1.5777002219413507, |
|
"learning_rate": 9.496800186095466e-06, |
|
"loss": 0.915, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.46270590412733664, |
|
"grad_norm": 1.5660428767748025, |
|
"learning_rate": 9.482575347202047e-06, |
|
"loss": 0.9034, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.46640755136035533, |
|
"grad_norm": 1.5848533104225215, |
|
"learning_rate": 9.468163201617063e-06, |
|
"loss": 0.9136, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.4701091985933741, |
|
"grad_norm": 1.6202334341144038, |
|
"learning_rate": 9.453564351559348e-06, |
|
"loss": 0.9044, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.47381084582639277, |
|
"grad_norm": 1.5181917545415298, |
|
"learning_rate": 9.438779407049282e-06, |
|
"loss": 0.886, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.47751249305941146, |
|
"grad_norm": 1.660812656260688, |
|
"learning_rate": 9.423808985883289e-06, |
|
"loss": 0.8949, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.48121414029243015, |
|
"grad_norm": 4.58789233207976, |
|
"learning_rate": 9.40865371360804e-06, |
|
"loss": 0.8983, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.48491578752544884, |
|
"grad_norm": 1.5232123704637264, |
|
"learning_rate": 9.393314223494297e-06, |
|
"loss": 0.92, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.48861743475846753, |
|
"grad_norm": 1.4724806815618674, |
|
"learning_rate": 9.377791156510456e-06, |
|
"loss": 0.8933, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4923190819914862, |
|
"grad_norm": 1.535367315925609, |
|
"learning_rate": 9.362085161295768e-06, |
|
"loss": 0.8957, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.4960207292245049, |
|
"grad_norm": 1.5839037752325844, |
|
"learning_rate": 9.346196894133239e-06, |
|
"loss": 0.9144, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.4997223764575236, |
|
"grad_norm": 1.4456461616907181, |
|
"learning_rate": 9.330127018922195e-06, |
|
"loss": 0.9266, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5034240236905423, |
|
"grad_norm": 1.5901589151556077, |
|
"learning_rate": 9.313876207150544e-06, |
|
"loss": 0.8872, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.507125670923561, |
|
"grad_norm": 1.5552074217295435, |
|
"learning_rate": 9.297445137866726e-06, |
|
"loss": 0.9333, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5108273181565797, |
|
"grad_norm": 1.7335515030772273, |
|
"learning_rate": 9.280834497651334e-06, |
|
"loss": 0.8647, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5145289653895984, |
|
"grad_norm": 1.6240276146792032, |
|
"learning_rate": 9.264044980588415e-06, |
|
"loss": 0.914, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.518230612622617, |
|
"grad_norm": 1.7161482197557083, |
|
"learning_rate": 9.247077288236488e-06, |
|
"loss": 0.8559, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.518230612622617, |
|
"eval_loss": 0.9287987947463989, |
|
"eval_runtime": 3.1642, |
|
"eval_samples_per_second": 40.453, |
|
"eval_steps_per_second": 10.113, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5219322598556357, |
|
"grad_norm": 1.656070239158929, |
|
"learning_rate": 9.229932129599206e-06, |
|
"loss": 0.8957, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5256339070886544, |
|
"grad_norm": 1.5896600717134812, |
|
"learning_rate": 9.212610221095748e-06, |
|
"loss": 0.9133, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5293355543216731, |
|
"grad_norm": 1.5927452123153347, |
|
"learning_rate": 9.195112286530874e-06, |
|
"loss": 0.9476, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.5330372015546918, |
|
"grad_norm": 1.4775780605104303, |
|
"learning_rate": 9.177439057064684e-06, |
|
"loss": 0.902, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5367388487877105, |
|
"grad_norm": 1.5954671281250503, |
|
"learning_rate": 9.159591271182058e-06, |
|
"loss": 0.9182, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5404404960207292, |
|
"grad_norm": 1.7420465439255852, |
|
"learning_rate": 9.141569674661816e-06, |
|
"loss": 0.9065, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5441421432537479, |
|
"grad_norm": 1.517043086331956, |
|
"learning_rate": 9.123375020545534e-06, |
|
"loss": 0.8997, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5478437904867666, |
|
"grad_norm": 1.5722793737373082, |
|
"learning_rate": 9.105008069106093e-06, |
|
"loss": 0.8992, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5515454377197853, |
|
"grad_norm": 1.964265094936069, |
|
"learning_rate": 9.086469587815904e-06, |
|
"loss": 0.924, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.555247084952804, |
|
"grad_norm": 1.6472805894336484, |
|
"learning_rate": 9.067760351314838e-06, |
|
"loss": 0.9196, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5589487321858226, |
|
"grad_norm": 1.5079555175389572, |
|
"learning_rate": 9.048881141377863e-06, |
|
"loss": 0.9169, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5626503794188414, |
|
"grad_norm": 1.5194748519769294, |
|
"learning_rate": 9.029832746882372e-06, |
|
"loss": 0.8988, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5663520266518601, |
|
"grad_norm": 1.6331068517006764, |
|
"learning_rate": 9.01061596377522e-06, |
|
"loss": 0.8944, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5700536738848788, |
|
"grad_norm": 1.7434321207013654, |
|
"learning_rate": 8.991231595039464e-06, |
|
"loss": 0.8409, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5737553211178975, |
|
"grad_norm": 1.6239861748571274, |
|
"learning_rate": 8.97168045066082e-06, |
|
"loss": 0.9224, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5774569683509162, |
|
"grad_norm": 1.5102303282386282, |
|
"learning_rate": 8.951963347593797e-06, |
|
"loss": 0.8907, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5811586155839349, |
|
"grad_norm": 1.4816965709083738, |
|
"learning_rate": 8.932081109727582e-06, |
|
"loss": 0.8866, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5848602628169536, |
|
"grad_norm": 1.6612729975466995, |
|
"learning_rate": 8.9120345678516e-06, |
|
"loss": 0.8744, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5885619100499723, |
|
"grad_norm": 1.5468054795591688, |
|
"learning_rate": 8.891824559620801e-06, |
|
"loss": 0.8734, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.592263557282991, |
|
"grad_norm": 1.67842205416201, |
|
"learning_rate": 8.871451929520662e-06, |
|
"loss": 0.9007, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.592263557282991, |
|
"eval_loss": 0.9225832223892212, |
|
"eval_runtime": 3.1485, |
|
"eval_samples_per_second": 40.654, |
|
"eval_steps_per_second": 10.163, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5959652045160097, |
|
"grad_norm": 1.6055978161311106, |
|
"learning_rate": 8.8509175288319e-06, |
|
"loss": 0.8652, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.5996668517490283, |
|
"grad_norm": 1.575542847028311, |
|
"learning_rate": 8.83022221559489e-06, |
|
"loss": 0.8765, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.603368498982047, |
|
"grad_norm": 1.5841116800285324, |
|
"learning_rate": 8.80936685457383e-06, |
|
"loss": 0.8887, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6070701462150657, |
|
"grad_norm": 1.6146240580940538, |
|
"learning_rate": 8.78835231722059e-06, |
|
"loss": 0.9255, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6107717934480844, |
|
"grad_norm": 1.592402592883371, |
|
"learning_rate": 8.767179481638303e-06, |
|
"loss": 0.9086, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6144734406811031, |
|
"grad_norm": 1.4757823575568227, |
|
"learning_rate": 8.74584923254468e-06, |
|
"loss": 0.8529, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6181750879141218, |
|
"grad_norm": 1.537925841696542, |
|
"learning_rate": 8.72436246123503e-06, |
|
"loss": 0.8998, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6218767351471405, |
|
"grad_norm": 1.5213356109194787, |
|
"learning_rate": 8.702720065545024e-06, |
|
"loss": 0.8979, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6255783823801592, |
|
"grad_norm": 1.6246757991237541, |
|
"learning_rate": 8.680922949813177e-06, |
|
"loss": 0.876, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.6292800296131779, |
|
"grad_norm": 1.6560473955623969, |
|
"learning_rate": 8.658972024843063e-06, |
|
"loss": 0.905, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6329816768461966, |
|
"grad_norm": 1.6330253392316993, |
|
"learning_rate": 8.636868207865244e-06, |
|
"loss": 0.9118, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.6366833240792152, |
|
"grad_norm": 1.578443920433453, |
|
"learning_rate": 8.614612422498965e-06, |
|
"loss": 0.8972, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.6403849713122339, |
|
"grad_norm": 1.562414773390906, |
|
"learning_rate": 8.592205598713539e-06, |
|
"loss": 0.9058, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.6440866185452526, |
|
"grad_norm": 1.4763369111308837, |
|
"learning_rate": 8.569648672789496e-06, |
|
"loss": 0.8998, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.6477882657782713, |
|
"grad_norm": 1.5267284020058436, |
|
"learning_rate": 8.546942587279465e-06, |
|
"loss": 0.8876, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.65148991301129, |
|
"grad_norm": 1.6015010973681025, |
|
"learning_rate": 8.524088290968781e-06, |
|
"loss": 0.9038, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6551915602443087, |
|
"grad_norm": 1.5212386116988514, |
|
"learning_rate": 8.501086738835843e-06, |
|
"loss": 0.8992, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6588932074773274, |
|
"grad_norm": 1.5521865085430773, |
|
"learning_rate": 8.477938892012209e-06, |
|
"loss": 0.9098, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6625948547103461, |
|
"grad_norm": 1.6352371095370688, |
|
"learning_rate": 8.45464571774244e-06, |
|
"loss": 0.8776, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6662965019433648, |
|
"grad_norm": 1.4335123933443348, |
|
"learning_rate": 8.43120818934367e-06, |
|
"loss": 0.8887, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6662965019433648, |
|
"eval_loss": 0.9168589115142822, |
|
"eval_runtime": 3.1708, |
|
"eval_samples_per_second": 40.368, |
|
"eval_steps_per_second": 10.092, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6699981491763835, |
|
"grad_norm": 1.752889118258146, |
|
"learning_rate": 8.407627286164948e-06, |
|
"loss": 0.937, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6736997964094021, |
|
"grad_norm": 1.61284581944927, |
|
"learning_rate": 8.38390399354631e-06, |
|
"loss": 0.8774, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6774014436424208, |
|
"grad_norm": 1.4371185005872025, |
|
"learning_rate": 8.360039302777614e-06, |
|
"loss": 0.8532, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6811030908754395, |
|
"grad_norm": 1.5270143988810398, |
|
"learning_rate": 8.336034211057098e-06, |
|
"loss": 0.893, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6848047381084582, |
|
"grad_norm": 1.4597070244698995, |
|
"learning_rate": 8.31188972144974e-06, |
|
"loss": 0.8633, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.688506385341477, |
|
"grad_norm": 1.479114786832264, |
|
"learning_rate": 8.28760684284532e-06, |
|
"loss": 0.8757, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6922080325744957, |
|
"grad_norm": 1.5128199418313293, |
|
"learning_rate": 8.263186589916273e-06, |
|
"loss": 0.8766, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.6959096798075144, |
|
"grad_norm": 1.6888938446230763, |
|
"learning_rate": 8.238629983075296e-06, |
|
"loss": 0.888, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6996113270405331, |
|
"grad_norm": 1.5613989791244507, |
|
"learning_rate": 8.213938048432697e-06, |
|
"loss": 0.9015, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.7033129742735518, |
|
"grad_norm": 1.4840207670355887, |
|
"learning_rate": 8.18911181775353e-06, |
|
"loss": 0.8807, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7070146215065705, |
|
"grad_norm": 1.4206191225329334, |
|
"learning_rate": 8.164152328414476e-06, |
|
"loss": 0.8879, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.7107162687395892, |
|
"grad_norm": 1.3773109393942617, |
|
"learning_rate": 8.139060623360494e-06, |
|
"loss": 0.8543, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.7144179159726078, |
|
"grad_norm": 1.561145137037389, |
|
"learning_rate": 8.113837751061246e-06, |
|
"loss": 0.8544, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.7181195632056265, |
|
"grad_norm": 1.530296331575427, |
|
"learning_rate": 8.088484765467286e-06, |
|
"loss": 0.8786, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7218212104386452, |
|
"grad_norm": 1.4095982178855946, |
|
"learning_rate": 8.063002725966014e-06, |
|
"loss": 0.8623, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7255228576716639, |
|
"grad_norm": 1.4533639199195938, |
|
"learning_rate": 8.037392697337418e-06, |
|
"loss": 0.9177, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7292245049046826, |
|
"grad_norm": 1.529631482196537, |
|
"learning_rate": 8.011655749709575e-06, |
|
"loss": 0.887, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.7329261521377013, |
|
"grad_norm": 1.6257644315310276, |
|
"learning_rate": 7.985792958513932e-06, |
|
"loss": 0.8741, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.73662779937072, |
|
"grad_norm": 1.4945083303719462, |
|
"learning_rate": 7.95980540444038e-06, |
|
"loss": 0.8929, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.7403294466037387, |
|
"grad_norm": 1.5558898574686058, |
|
"learning_rate": 7.93369417339209e-06, |
|
"loss": 0.8871, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7403294466037387, |
|
"eval_loss": 0.9122792482376099, |
|
"eval_runtime": 3.1919, |
|
"eval_samples_per_second": 40.102, |
|
"eval_steps_per_second": 10.025, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7440310938367574, |
|
"grad_norm": 1.415340145678768, |
|
"learning_rate": 7.907460356440133e-06, |
|
"loss": 0.9232, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.747732741069776, |
|
"grad_norm": 1.5415849153000531, |
|
"learning_rate": 7.881105049777902e-06, |
|
"loss": 0.824, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.7514343883027947, |
|
"grad_norm": 1.5774048558196452, |
|
"learning_rate": 7.854629354675292e-06, |
|
"loss": 0.8861, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.7551360355358134, |
|
"grad_norm": 1.5140443985048166, |
|
"learning_rate": 7.828034377432694e-06, |
|
"loss": 0.8633, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7588376827688321, |
|
"grad_norm": 1.6161320666951147, |
|
"learning_rate": 7.801321229334764e-06, |
|
"loss": 0.9256, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7625393300018508, |
|
"grad_norm": 1.5750819217859227, |
|
"learning_rate": 7.774491026603985e-06, |
|
"loss": 0.9095, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7662409772348695, |
|
"grad_norm": 1.4573163893911831, |
|
"learning_rate": 7.747544890354031e-06, |
|
"loss": 0.8641, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7699426244678882, |
|
"grad_norm": 1.5190407253784062, |
|
"learning_rate": 7.720483946542913e-06, |
|
"loss": 0.8661, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7736442717009069, |
|
"grad_norm": 1.4675439439774145, |
|
"learning_rate": 7.69330932592594e-06, |
|
"loss": 0.8539, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7773459189339256, |
|
"grad_norm": 1.5250204688526834, |
|
"learning_rate": 7.666022164008458e-06, |
|
"loss": 0.8724, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7810475661669443, |
|
"grad_norm": 1.5414560207792796, |
|
"learning_rate": 7.638623600998409e-06, |
|
"loss": 0.8763, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.784749213399963, |
|
"grad_norm": 1.4673209503143758, |
|
"learning_rate": 7.6111147817586925e-06, |
|
"loss": 0.9014, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7884508606329816, |
|
"grad_norm": 1.4868822477001453, |
|
"learning_rate": 7.5834968557593155e-06, |
|
"loss": 0.9257, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.7921525078660003, |
|
"grad_norm": 1.529122704331006, |
|
"learning_rate": 7.5557709770293664e-06, |
|
"loss": 0.8611, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.795854155099019, |
|
"grad_norm": 1.535202143374227, |
|
"learning_rate": 7.527938304108795e-06, |
|
"loss": 0.8845, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.7995558023320377, |
|
"grad_norm": 1.6751360090756962, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.8917, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.8032574495650564, |
|
"grad_norm": 1.590938657275628, |
|
"learning_rate": 7.471957232119235e-06, |
|
"loss": 0.8742, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.8069590967980751, |
|
"grad_norm": 1.607017035466218, |
|
"learning_rate": 7.443811172247822e-06, |
|
"loss": 0.8977, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.8106607440310938, |
|
"grad_norm": 1.4709104617373727, |
|
"learning_rate": 7.415562996483193e-06, |
|
"loss": 0.8657, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.8143623912641126, |
|
"grad_norm": 1.4181383032030903, |
|
"learning_rate": 7.387213885189746e-06, |
|
"loss": 0.8866, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8143623912641126, |
|
"eval_loss": 0.9071660041809082, |
|
"eval_runtime": 3.1786, |
|
"eval_samples_per_second": 40.27, |
|
"eval_steps_per_second": 10.067, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8180640384971313, |
|
"grad_norm": 1.537744834366893, |
|
"learning_rate": 7.358765022949519e-06, |
|
"loss": 0.8673, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.82176568573015, |
|
"grad_norm": 1.5681327287966587, |
|
"learning_rate": 7.330217598512696e-06, |
|
"loss": 0.8972, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.8254673329631687, |
|
"grad_norm": 1.5959283223351195, |
|
"learning_rate": 7.30157280474793e-06, |
|
"loss": 0.8265, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.8291689801961873, |
|
"grad_norm": 1.546887120710175, |
|
"learning_rate": 7.2728318385925035e-06, |
|
"loss": 0.8702, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.832870627429206, |
|
"grad_norm": 1.4838415359645296, |
|
"learning_rate": 7.243995901002312e-06, |
|
"loss": 0.8652, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.8365722746622247, |
|
"grad_norm": 1.4983295522338373, |
|
"learning_rate": 7.215066196901676e-06, |
|
"loss": 0.8923, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.8402739218952434, |
|
"grad_norm": 1.750569654616615, |
|
"learning_rate": 7.186043935133005e-06, |
|
"loss": 0.9152, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.8439755691282621, |
|
"grad_norm": 1.420557406559206, |
|
"learning_rate": 7.156930328406268e-06, |
|
"loss": 0.8832, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.8476772163612808, |
|
"grad_norm": 1.401717890883244, |
|
"learning_rate": 7.127726593248337e-06, |
|
"loss": 0.8731, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.8513788635942995, |
|
"grad_norm": 1.9562309904598933, |
|
"learning_rate": 7.098433949952146e-06, |
|
"loss": 0.8671, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8550805108273182, |
|
"grad_norm": 1.4253025853304513, |
|
"learning_rate": 7.069053622525697e-06, |
|
"loss": 0.8427, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.8587821580603369, |
|
"grad_norm": 1.421182142481623, |
|
"learning_rate": 7.039586838640918e-06, |
|
"loss": 0.8813, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.8624838052933556, |
|
"grad_norm": 1.5723535571085472, |
|
"learning_rate": 7.0100348295823706e-06, |
|
"loss": 0.9105, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.8661854525263742, |
|
"grad_norm": 1.4821847838003668, |
|
"learning_rate": 6.980398830195785e-06, |
|
"loss": 0.8597, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8698870997593929, |
|
"grad_norm": 1.5698283294062816, |
|
"learning_rate": 6.950680078836475e-06, |
|
"loss": 0.9062, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.8735887469924116, |
|
"grad_norm": 1.4752342138688566, |
|
"learning_rate": 6.920879817317588e-06, |
|
"loss": 0.9017, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.8772903942254303, |
|
"grad_norm": 1.5796871936550905, |
|
"learning_rate": 6.890999290858213e-06, |
|
"loss": 0.8906, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.880992041458449, |
|
"grad_norm": 1.4037532471618501, |
|
"learning_rate": 6.861039748031351e-06, |
|
"loss": 0.849, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8846936886914677, |
|
"grad_norm": 1.4505793557706694, |
|
"learning_rate": 6.8310024407117405e-06, |
|
"loss": 0.8767, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.8883953359244864, |
|
"grad_norm": 1.5143366114173251, |
|
"learning_rate": 6.800888624023552e-06, |
|
"loss": 0.9083, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8883953359244864, |
|
"eval_loss": 0.9034964442253113, |
|
"eval_runtime": 3.189, |
|
"eval_samples_per_second": 40.137, |
|
"eval_steps_per_second": 10.034, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8920969831575051, |
|
"grad_norm": 1.448067319770545, |
|
"learning_rate": 6.770699556287939e-06, |
|
"loss": 0.8832, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.8957986303905238, |
|
"grad_norm": 1.5762042517893504, |
|
"learning_rate": 6.740436498970453e-06, |
|
"loss": 0.8689, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.8995002776235425, |
|
"grad_norm": 1.4411368898695527, |
|
"learning_rate": 6.710100716628345e-06, |
|
"loss": 0.8727, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.9032019248565611, |
|
"grad_norm": 1.4456423978079818, |
|
"learning_rate": 6.679693476857712e-06, |
|
"loss": 0.8266, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.9069035720895798, |
|
"grad_norm": 1.4766682552792967, |
|
"learning_rate": 6.649216050240539e-06, |
|
"loss": 0.8937, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.9106052193225985, |
|
"grad_norm": 1.5277505231786122, |
|
"learning_rate": 6.618669710291607e-06, |
|
"loss": 0.9154, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.9143068665556172, |
|
"grad_norm": 1.535656971389958, |
|
"learning_rate": 6.588055733405266e-06, |
|
"loss": 0.8916, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.9180085137886359, |
|
"grad_norm": 1.6131549613924843, |
|
"learning_rate": 6.557375398802124e-06, |
|
"loss": 0.8888, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.9217101610216546, |
|
"grad_norm": 1.7447020971498644, |
|
"learning_rate": 6.526629988475567e-06, |
|
"loss": 0.8955, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.9254118082546733, |
|
"grad_norm": 1.4002568195144542, |
|
"learning_rate": 6.495820787138209e-06, |
|
"loss": 0.8788, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.929113455487692, |
|
"grad_norm": 1.4464217949020943, |
|
"learning_rate": 6.4649490821682035e-06, |
|
"loss": 0.8563, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.9328151027207107, |
|
"grad_norm": 1.510865625108143, |
|
"learning_rate": 6.434016163555452e-06, |
|
"loss": 0.8845, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9365167499537294, |
|
"grad_norm": 1.4802916501785524, |
|
"learning_rate": 6.403023323847695e-06, |
|
"loss": 0.8921, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.9402183971867482, |
|
"grad_norm": 1.5350448484112322, |
|
"learning_rate": 6.371971858096509e-06, |
|
"loss": 0.8799, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.9439200444197668, |
|
"grad_norm": 1.5420451598934732, |
|
"learning_rate": 6.340863063803187e-06, |
|
"loss": 0.8845, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.9476216916527855, |
|
"grad_norm": 1.4730879482600188, |
|
"learning_rate": 6.30969824086453e-06, |
|
"loss": 0.8644, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.9513233388858042, |
|
"grad_norm": 1.4626215723789604, |
|
"learning_rate": 6.278478691518519e-06, |
|
"loss": 0.8525, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.9550249861188229, |
|
"grad_norm": 1.4083292701913692, |
|
"learning_rate": 6.247205720289907e-06, |
|
"loss": 0.8569, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.9587266333518416, |
|
"grad_norm": 1.3969010629986647, |
|
"learning_rate": 6.215880633935709e-06, |
|
"loss": 0.8268, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.9624282805848603, |
|
"grad_norm": 1.5510643456651918, |
|
"learning_rate": 6.184504741390596e-06, |
|
"loss": 0.8929, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9624282805848603, |
|
"eval_loss": 0.8996505737304688, |
|
"eval_runtime": 3.171, |
|
"eval_samples_per_second": 40.366, |
|
"eval_steps_per_second": 10.091, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.966129927817879, |
|
"grad_norm": 1.4377992355687732, |
|
"learning_rate": 6.153079353712201e-06, |
|
"loss": 0.8874, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.9698315750508977, |
|
"grad_norm": 1.5752576896689705, |
|
"learning_rate": 6.121605784026339e-06, |
|
"loss": 0.861, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.9735332222839164, |
|
"grad_norm": 1.5215450507129673, |
|
"learning_rate": 6.09008534747213e-06, |
|
"loss": 0.8206, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.9772348695169351, |
|
"grad_norm": 1.480135671835485, |
|
"learning_rate": 6.058519361147055e-06, |
|
"loss": 0.8686, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9809365167499537, |
|
"grad_norm": 1.467611785721126, |
|
"learning_rate": 6.02690914405191e-06, |
|
"loss": 0.8809, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.9846381639829724, |
|
"grad_norm": 1.5241127683327305, |
|
"learning_rate": 5.995256017035703e-06, |
|
"loss": 0.8828, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.9883398112159911, |
|
"grad_norm": 1.5428867896748049, |
|
"learning_rate": 5.9635613027404495e-06, |
|
"loss": 0.9001, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.9920414584490098, |
|
"grad_norm": 1.6759585379267294, |
|
"learning_rate": 5.931826325545912e-06, |
|
"loss": 0.8816, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9957431056820285, |
|
"grad_norm": 1.5408480218485106, |
|
"learning_rate": 5.900052411514257e-06, |
|
"loss": 0.8392, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.9994447529150472, |
|
"grad_norm": 1.4822637999538424, |
|
"learning_rate": 5.8682408883346535e-06, |
|
"loss": 0.872, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.0037016472330187, |
|
"grad_norm": 1.4367182586517664, |
|
"learning_rate": 5.836393085267777e-06, |
|
"loss": 0.9729, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.0074032944660374, |
|
"grad_norm": 1.58438978334663, |
|
"learning_rate": 5.804510333090287e-06, |
|
"loss": 0.8305, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.011104941699056, |
|
"grad_norm": 1.4525674500090913, |
|
"learning_rate": 5.772593964039203e-06, |
|
"loss": 0.8301, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.0148065889320748, |
|
"grad_norm": 1.4659924290630681, |
|
"learning_rate": 5.740645311756246e-06, |
|
"loss": 0.8154, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.0185082361650935, |
|
"grad_norm": 1.5899206742842742, |
|
"learning_rate": 5.708665711232103e-06, |
|
"loss": 0.8435, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.0222098833981121, |
|
"grad_norm": 1.4813056264599607, |
|
"learning_rate": 5.6766564987506564e-06, |
|
"loss": 0.7967, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.0259115306311308, |
|
"grad_norm": 1.428226573411074, |
|
"learning_rate": 5.644619011833134e-06, |
|
"loss": 0.8261, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.0296131778641495, |
|
"grad_norm": 1.8176290601286624, |
|
"learning_rate": 5.612554589182228e-06, |
|
"loss": 0.8141, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.0333148250971682, |
|
"grad_norm": 1.3928968915172266, |
|
"learning_rate": 5.5804645706261515e-06, |
|
"loss": 0.8588, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.037016472330187, |
|
"grad_norm": 1.4377153346084877, |
|
"learning_rate": 5.548350297062659e-06, |
|
"loss": 0.8193, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.037016472330187, |
|
"eval_loss": 0.8980569839477539, |
|
"eval_runtime": 3.4399, |
|
"eval_samples_per_second": 37.211, |
|
"eval_steps_per_second": 9.303, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.0407181195632056, |
|
"grad_norm": 1.5558318929758723, |
|
"learning_rate": 5.516213110403009e-06, |
|
"loss": 0.8207, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.0444197667962243, |
|
"grad_norm": 1.6248066354185033, |
|
"learning_rate": 5.484054353515896e-06, |
|
"loss": 0.8091, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.048121414029243, |
|
"grad_norm": 1.4650042601795332, |
|
"learning_rate": 5.451875370171341e-06, |
|
"loss": 0.7989, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.0518230612622617, |
|
"grad_norm": 1.5036679219848448, |
|
"learning_rate": 5.419677504984534e-06, |
|
"loss": 0.8585, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.0555247084952804, |
|
"grad_norm": 1.5147392306430503, |
|
"learning_rate": 5.387462103359655e-06, |
|
"loss": 0.8627, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.059226355728299, |
|
"grad_norm": 1.5401558403047761, |
|
"learning_rate": 5.3552305114336515e-06, |
|
"loss": 0.8373, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.0629280029613177, |
|
"grad_norm": 1.4869284016759738, |
|
"learning_rate": 5.32298407601999e-06, |
|
"loss": 0.8241, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.0666296501943364, |
|
"grad_norm": 1.446867156290036, |
|
"learning_rate": 5.290724144552379e-06, |
|
"loss": 0.8138, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.070331297427355, |
|
"grad_norm": 1.494572614643795, |
|
"learning_rate": 5.258452065028473e-06, |
|
"loss": 0.8168, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.0740329446603738, |
|
"grad_norm": 1.7407671262405369, |
|
"learning_rate": 5.2261691859535325e-06, |
|
"loss": 0.8883, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.0777345918933925, |
|
"grad_norm": 1.39453020317504, |
|
"learning_rate": 5.193876856284085e-06, |
|
"loss": 0.8288, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.0814362391264112, |
|
"grad_norm": 1.4783541394133328, |
|
"learning_rate": 5.161576425371554e-06, |
|
"loss": 0.862, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.0851378863594299, |
|
"grad_norm": 1.4631729411603314, |
|
"learning_rate": 5.1292692429058824e-06, |
|
"loss": 0.8215, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.0888395335924486, |
|
"grad_norm": 1.5090258328958575, |
|
"learning_rate": 5.096956658859122e-06, |
|
"loss": 0.8118, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.0925411808254673, |
|
"grad_norm": 1.4210652826921883, |
|
"learning_rate": 5.064640023429042e-06, |
|
"loss": 0.8439, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.096242828058486, |
|
"grad_norm": 1.5070935483488574, |
|
"learning_rate": 5.032320686982697e-06, |
|
"loss": 0.8173, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.0999444752915046, |
|
"grad_norm": 1.5831686258610027, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8464, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.1036461225245233, |
|
"grad_norm": 1.5180520231676966, |
|
"learning_rate": 4.967679313017304e-06, |
|
"loss": 0.8313, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.107347769757542, |
|
"grad_norm": 1.442160751483896, |
|
"learning_rate": 4.9353599765709585e-06, |
|
"loss": 0.8085, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.1110494169905607, |
|
"grad_norm": 1.4611083796490798, |
|
"learning_rate": 4.903043341140879e-06, |
|
"loss": 0.8514, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.1110494169905607, |
|
"eval_loss": 0.8958368301391602, |
|
"eval_runtime": 3.1683, |
|
"eval_samples_per_second": 40.401, |
|
"eval_steps_per_second": 10.1, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.1147510642235794, |
|
"grad_norm": 1.5165952272515089, |
|
"learning_rate": 4.870730757094121e-06, |
|
"loss": 0.8305, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.118452711456598, |
|
"grad_norm": 1.434796405007299, |
|
"learning_rate": 4.838423574628447e-06, |
|
"loss": 0.8514, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.1221543586896168, |
|
"grad_norm": 1.3769176135928658, |
|
"learning_rate": 4.806123143715916e-06, |
|
"loss": 0.8371, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.1258560059226355, |
|
"grad_norm": 1.4703105610414784, |
|
"learning_rate": 4.773830814046469e-06, |
|
"loss": 0.7945, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.1295576531556542, |
|
"grad_norm": 1.4241930599180446, |
|
"learning_rate": 4.741547934971528e-06, |
|
"loss": 0.8545, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.1332593003886728, |
|
"grad_norm": 1.2652952479040054, |
|
"learning_rate": 4.7092758554476215e-06, |
|
"loss": 0.8469, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.1369609476216915, |
|
"grad_norm": 1.4473025053325514, |
|
"learning_rate": 4.677015923980012e-06, |
|
"loss": 0.8044, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.1406625948547104, |
|
"grad_norm": 1.4934731943517507, |
|
"learning_rate": 4.644769488566351e-06, |
|
"loss": 0.8497, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.1443642420877291, |
|
"grad_norm": 1.3897656173888082, |
|
"learning_rate": 4.6125378966403465e-06, |
|
"loss": 0.838, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.1480658893207478, |
|
"grad_norm": 1.459255607903865, |
|
"learning_rate": 4.580322495015466e-06, |
|
"loss": 0.8174, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.1517675365537665, |
|
"grad_norm": 1.467053008876593, |
|
"learning_rate": 4.548124629828661e-06, |
|
"loss": 0.8184, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.1554691837867852, |
|
"grad_norm": 1.509368529437763, |
|
"learning_rate": 4.515945646484105e-06, |
|
"loss": 0.8209, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.159170831019804, |
|
"grad_norm": 1.475070161503307, |
|
"learning_rate": 4.483786889596993e-06, |
|
"loss": 0.8548, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.1628724782528226, |
|
"grad_norm": 1.452787367094, |
|
"learning_rate": 4.451649702937343e-06, |
|
"loss": 0.8336, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.1665741254858413, |
|
"grad_norm": 1.4000650572451163, |
|
"learning_rate": 4.4195354293738484e-06, |
|
"loss": 0.8239, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.17027577271886, |
|
"grad_norm": 1.501056679186563, |
|
"learning_rate": 4.387445410817774e-06, |
|
"loss": 0.8464, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.1739774199518787, |
|
"grad_norm": 1.5620641061424327, |
|
"learning_rate": 4.355380988166867e-06, |
|
"loss": 0.8443, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.1776790671848973, |
|
"grad_norm": 1.4653706291550204, |
|
"learning_rate": 4.323343501249346e-06, |
|
"loss": 0.8413, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.181380714417916, |
|
"grad_norm": 1.4771199230042058, |
|
"learning_rate": 4.291334288767899e-06, |
|
"loss": 0.8127, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.1850823616509347, |
|
"grad_norm": 1.603019688371863, |
|
"learning_rate": 4.259354688243758e-06, |
|
"loss": 0.7876, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.1850823616509347, |
|
"eval_loss": 0.8933541774749756, |
|
"eval_runtime": 3.1438, |
|
"eval_samples_per_second": 40.715, |
|
"eval_steps_per_second": 10.179, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.1887840088839534, |
|
"grad_norm": 1.4189328299226074, |
|
"learning_rate": 4.227406035960798e-06, |
|
"loss": 0.84, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.192485656116972, |
|
"grad_norm": 1.4704536335382927, |
|
"learning_rate": 4.195489666909714e-06, |
|
"loss": 0.8064, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.1961873033499908, |
|
"grad_norm": 1.4501694554566051, |
|
"learning_rate": 4.163606914732224e-06, |
|
"loss": 0.8376, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.1998889505830095, |
|
"grad_norm": 1.4220571934272404, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 0.8222, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.2035905978160282, |
|
"grad_norm": 1.5336709710929675, |
|
"learning_rate": 4.099947588485744e-06, |
|
"loss": 0.8463, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.2072922450490469, |
|
"grad_norm": 1.4382561834939411, |
|
"learning_rate": 4.06817367445409e-06, |
|
"loss": 0.8204, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.2109938922820656, |
|
"grad_norm": 1.7195885049681503, |
|
"learning_rate": 4.036438697259551e-06, |
|
"loss": 0.7754, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.2146955395150842, |
|
"grad_norm": 1.4721892524673703, |
|
"learning_rate": 4.004743982964298e-06, |
|
"loss": 0.852, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.218397186748103, |
|
"grad_norm": 1.5021687057190425, |
|
"learning_rate": 3.9730908559480904e-06, |
|
"loss": 0.8496, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.2220988339811216, |
|
"grad_norm": 1.5795490521401583, |
|
"learning_rate": 3.941480638852948e-06, |
|
"loss": 0.8057, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.2258004812141403, |
|
"grad_norm": 1.4866592547517352, |
|
"learning_rate": 3.909914652527872e-06, |
|
"loss": 0.8323, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.229502128447159, |
|
"grad_norm": 1.3593767710719775, |
|
"learning_rate": 3.878394215973663e-06, |
|
"loss": 0.8282, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.2332037756801777, |
|
"grad_norm": 1.4070136140023897, |
|
"learning_rate": 3.8469206462878e-06, |
|
"loss": 0.8054, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.2369054229131964, |
|
"grad_norm": 1.5019295629206635, |
|
"learning_rate": 3.815495258609404e-06, |
|
"loss": 0.8397, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.240607070146215, |
|
"grad_norm": 1.4845646605655483, |
|
"learning_rate": 3.784119366064293e-06, |
|
"loss": 0.8524, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.2443087173792338, |
|
"grad_norm": 1.334749320097519, |
|
"learning_rate": 3.752794279710094e-06, |
|
"loss": 0.8007, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.2480103646122525, |
|
"grad_norm": 1.3677811469347512, |
|
"learning_rate": 3.721521308481483e-06, |
|
"loss": 0.8428, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.2517120118452711, |
|
"grad_norm": 1.362669707398652, |
|
"learning_rate": 3.690301759135471e-06, |
|
"loss": 0.7833, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.2554136590782898, |
|
"grad_norm": 1.6700680100767193, |
|
"learning_rate": 3.6591369361968127e-06, |
|
"loss": 0.8453, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.2591153063113085, |
|
"grad_norm": 1.4961112856255425, |
|
"learning_rate": 3.6280281419034934e-06, |
|
"loss": 0.841, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2591153063113085, |
|
"eval_loss": 0.8913548588752747, |
|
"eval_runtime": 3.1481, |
|
"eval_samples_per_second": 40.66, |
|
"eval_steps_per_second": 10.165, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.2628169535443272, |
|
"grad_norm": 1.4615195163796504, |
|
"learning_rate": 3.596976676152306e-06, |
|
"loss": 0.8104, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.266518600777346, |
|
"grad_norm": 1.442661859498498, |
|
"learning_rate": 3.5659838364445505e-06, |
|
"loss": 0.8075, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.2702202480103646, |
|
"grad_norm": 1.429321652511765, |
|
"learning_rate": 3.535050917831797e-06, |
|
"loss": 0.8263, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.2739218952433833, |
|
"grad_norm": 1.594083770625216, |
|
"learning_rate": 3.504179212861793e-06, |
|
"loss": 0.8266, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.277623542476402, |
|
"grad_norm": 1.3959844898279354, |
|
"learning_rate": 3.473370011524435e-06, |
|
"loss": 0.8276, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.2813251897094207, |
|
"grad_norm": 1.5711063878358447, |
|
"learning_rate": 3.442624601197877e-06, |
|
"loss": 0.7928, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.2850268369424394, |
|
"grad_norm": 1.481796761216049, |
|
"learning_rate": 3.4119442665947346e-06, |
|
"loss": 0.834, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.288728484175458, |
|
"grad_norm": 1.4347504907033723, |
|
"learning_rate": 3.3813302897083955e-06, |
|
"loss": 0.7957, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.2924301314084767, |
|
"grad_norm": 1.497068560059951, |
|
"learning_rate": 3.350783949759462e-06, |
|
"loss": 0.8639, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.2961317786414954, |
|
"grad_norm": 1.4369336661064722, |
|
"learning_rate": 3.3203065231422904e-06, |
|
"loss": 0.8477, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.2998334258745141, |
|
"grad_norm": 1.497313015187417, |
|
"learning_rate": 3.289899283371657e-06, |
|
"loss": 0.8068, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.3035350731075328, |
|
"grad_norm": 1.5190776440703562, |
|
"learning_rate": 3.259563501029548e-06, |
|
"loss": 0.8606, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.3072367203405515, |
|
"grad_norm": 1.4116722913128272, |
|
"learning_rate": 3.2293004437120622e-06, |
|
"loss": 0.8228, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.3109383675735702, |
|
"grad_norm": 1.5145929050223683, |
|
"learning_rate": 3.1991113759764493e-06, |
|
"loss": 0.8235, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.3146400148065889, |
|
"grad_norm": 1.5391195742763577, |
|
"learning_rate": 3.1689975592882603e-06, |
|
"loss": 0.8364, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.3183416620396076, |
|
"grad_norm": 1.380509271597796, |
|
"learning_rate": 3.1389602519686515e-06, |
|
"loss": 0.8348, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.3220433092726263, |
|
"grad_norm": 1.4156871353579172, |
|
"learning_rate": 3.1090007091417884e-06, |
|
"loss": 0.8093, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.325744956505645, |
|
"grad_norm": 1.5208231607755662, |
|
"learning_rate": 3.0791201826824117e-06, |
|
"loss": 0.8193, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.3294466037386636, |
|
"grad_norm": 1.4403291437738073, |
|
"learning_rate": 3.049319921163526e-06, |
|
"loss": 0.8787, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.3331482509716823, |
|
"grad_norm": 1.4835374292929009, |
|
"learning_rate": 3.019601169804216e-06, |
|
"loss": 0.8696, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.3331482509716823, |
|
"eval_loss": 0.888968825340271, |
|
"eval_runtime": 3.1554, |
|
"eval_samples_per_second": 40.565, |
|
"eval_steps_per_second": 10.141, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.336849898204701, |
|
"grad_norm": 1.4257381635041224, |
|
"learning_rate": 2.9899651704176324e-06, |
|
"loss": 0.7908, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.3405515454377197, |
|
"grad_norm": 1.5660125344151161, |
|
"learning_rate": 2.9604131613590825e-06, |
|
"loss": 0.8441, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.3442531926707386, |
|
"grad_norm": 1.4297048279895173, |
|
"learning_rate": 2.9309463774743047e-06, |
|
"loss": 0.8186, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.3479548399037573, |
|
"grad_norm": 1.4468200083505747, |
|
"learning_rate": 2.901566050047855e-06, |
|
"loss": 0.8211, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.351656487136776, |
|
"grad_norm": 1.4855511894939493, |
|
"learning_rate": 2.8722734067516637e-06, |
|
"loss": 0.8154, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.3553581343697947, |
|
"grad_norm": 1.442279738766708, |
|
"learning_rate": 2.843069671593734e-06, |
|
"loss": 0.8131, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.3590597816028134, |
|
"grad_norm": 1.4407714882856046, |
|
"learning_rate": 2.813956064866996e-06, |
|
"loss": 0.8368, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.362761428835832, |
|
"grad_norm": 1.638484157783778, |
|
"learning_rate": 2.784933803098326e-06, |
|
"loss": 0.8647, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.3664630760688508, |
|
"grad_norm": 1.4502544189697761, |
|
"learning_rate": 2.7560040989976894e-06, |
|
"loss": 0.8422, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.3701647233018694, |
|
"grad_norm": 1.5300853906976424, |
|
"learning_rate": 2.7271681614074973e-06, |
|
"loss": 0.8177, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.3738663705348881, |
|
"grad_norm": 1.6886425414974917, |
|
"learning_rate": 2.6984271952520723e-06, |
|
"loss": 0.7782, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.3775680177679068, |
|
"grad_norm": 1.4870235777550467, |
|
"learning_rate": 2.6697824014873076e-06, |
|
"loss": 0.7969, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.3812696650009255, |
|
"grad_norm": 1.45657377960479, |
|
"learning_rate": 2.641234977050484e-06, |
|
"loss": 0.834, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.3849713122339442, |
|
"grad_norm": 1.4440591034573387, |
|
"learning_rate": 2.6127861148102552e-06, |
|
"loss": 0.8334, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.388672959466963, |
|
"grad_norm": 1.4480240580594204, |
|
"learning_rate": 2.5844370035168077e-06, |
|
"loss": 0.8187, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.3923746066999816, |
|
"grad_norm": 1.400661573868388, |
|
"learning_rate": 2.5561888277521797e-06, |
|
"loss": 0.8186, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.3960762539330003, |
|
"grad_norm": 1.6068474311928562, |
|
"learning_rate": 2.528042767880766e-06, |
|
"loss": 0.8463, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.399777901166019, |
|
"grad_norm": 1.483569182939659, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.8329, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.4034795483990377, |
|
"grad_norm": 1.5617313472896055, |
|
"learning_rate": 2.4720616958912054e-06, |
|
"loss": 0.8029, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.4071811956320563, |
|
"grad_norm": 1.4937605105346914, |
|
"learning_rate": 2.4442290229706344e-06, |
|
"loss": 0.8355, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.4071811956320563, |
|
"eval_loss": 0.8875657320022583, |
|
"eval_runtime": 3.1599, |
|
"eval_samples_per_second": 40.508, |
|
"eval_steps_per_second": 10.127, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.410882842865075, |
|
"grad_norm": 1.4038567304499268, |
|
"learning_rate": 2.4165031442406857e-06, |
|
"loss": 0.7707, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.4145844900980937, |
|
"grad_norm": 1.4412299500545533, |
|
"learning_rate": 2.3888852182413087e-06, |
|
"loss": 0.7868, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.4182861373311124, |
|
"grad_norm": 1.3862062994975937, |
|
"learning_rate": 2.361376399001592e-06, |
|
"loss": 0.8159, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.421987784564131, |
|
"grad_norm": 1.38936848474066, |
|
"learning_rate": 2.333977835991545e-06, |
|
"loss": 0.8453, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.4256894317971498, |
|
"grad_norm": 1.5119133522075174, |
|
"learning_rate": 2.3066906740740626e-06, |
|
"loss": 0.7919, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.4293910790301685, |
|
"grad_norm": 1.5080292127927728, |
|
"learning_rate": 2.2795160534570866e-06, |
|
"loss": 0.8562, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.4330927262631872, |
|
"grad_norm": 1.5275419177684313, |
|
"learning_rate": 2.2524551096459703e-06, |
|
"loss": 0.8535, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.4367943734962059, |
|
"grad_norm": 1.449152440850533, |
|
"learning_rate": 2.2255089733960162e-06, |
|
"loss": 0.8297, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.4404960207292246, |
|
"grad_norm": 1.5212067706807353, |
|
"learning_rate": 2.1986787706652377e-06, |
|
"loss": 0.8216, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.4441976679622432, |
|
"grad_norm": 1.4160260862714236, |
|
"learning_rate": 2.171965622567308e-06, |
|
"loss": 0.8511, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.447899315195262, |
|
"grad_norm": 1.5218613959907945, |
|
"learning_rate": 2.1453706453247088e-06, |
|
"loss": 0.8938, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.4516009624282806, |
|
"grad_norm": 1.4599588773757752, |
|
"learning_rate": 2.1188949502220987e-06, |
|
"loss": 0.797, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.4553026096612993, |
|
"grad_norm": 1.6402881544632706, |
|
"learning_rate": 2.0925396435598665e-06, |
|
"loss": 0.8493, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.459004256894318, |
|
"grad_norm": 1.3930999875046068, |
|
"learning_rate": 2.066305826607911e-06, |
|
"loss": 0.8067, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.4627059041273367, |
|
"grad_norm": 1.354420609256178, |
|
"learning_rate": 2.0401945955596206e-06, |
|
"loss": 0.8008, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.4664075513603554, |
|
"grad_norm": 1.4848933707960712, |
|
"learning_rate": 2.0142070414860704e-06, |
|
"loss": 0.812, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.470109198593374, |
|
"grad_norm": 1.5352903986245707, |
|
"learning_rate": 1.9883442502904284e-06, |
|
"loss": 0.8251, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.4738108458263928, |
|
"grad_norm": 1.4357488632329796, |
|
"learning_rate": 1.962607302662582e-06, |
|
"loss": 0.826, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.4775124930594115, |
|
"grad_norm": 1.4401875170742706, |
|
"learning_rate": 1.936997274033986e-06, |
|
"loss": 0.8033, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.4812141402924301, |
|
"grad_norm": 1.5299603380399858, |
|
"learning_rate": 1.9115152345327154e-06, |
|
"loss": 0.8209, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4812141402924301, |
|
"eval_loss": 0.8854081630706787, |
|
"eval_runtime": 3.1595, |
|
"eval_samples_per_second": 40.512, |
|
"eval_steps_per_second": 10.128, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.4849157875254488, |
|
"grad_norm": 1.5023805812084643, |
|
"learning_rate": 1.8861622489387555e-06, |
|
"loss": 0.8326, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.4886174347584675, |
|
"grad_norm": 1.441795630960481, |
|
"learning_rate": 1.8609393766395083e-06, |
|
"loss": 0.8389, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.4923190819914862, |
|
"grad_norm": 1.3963484883921775, |
|
"learning_rate": 1.8358476715855262e-06, |
|
"loss": 0.8063, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.496020729224505, |
|
"grad_norm": 1.6126347139639874, |
|
"learning_rate": 1.8108881822464697e-06, |
|
"loss": 0.7948, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.4997223764575236, |
|
"grad_norm": 1.3882529255391067, |
|
"learning_rate": 1.7860619515673034e-06, |
|
"loss": 0.8086, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.5034240236905423, |
|
"grad_norm": 1.416003087661949, |
|
"learning_rate": 1.7613700169247055e-06, |
|
"loss": 0.8458, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.507125670923561, |
|
"grad_norm": 1.4664325854130467, |
|
"learning_rate": 1.7368134100837286e-06, |
|
"loss": 0.8181, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.5108273181565797, |
|
"grad_norm": 1.4688732720880384, |
|
"learning_rate": 1.7123931571546826e-06, |
|
"loss": 0.8268, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.5145289653895984, |
|
"grad_norm": 1.4698210356470707, |
|
"learning_rate": 1.6881102785502618e-06, |
|
"loss": 0.8184, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.518230612622617, |
|
"grad_norm": 1.467012716297586, |
|
"learning_rate": 1.6639657889429017e-06, |
|
"loss": 0.8205, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.5219322598556357, |
|
"grad_norm": 1.4172866012375327, |
|
"learning_rate": 1.639960697222388e-06, |
|
"loss": 0.8171, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.5256339070886544, |
|
"grad_norm": 1.37642182756019, |
|
"learning_rate": 1.6160960064536907e-06, |
|
"loss": 0.8331, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.5293355543216731, |
|
"grad_norm": 1.4823688200402971, |
|
"learning_rate": 1.5923727138350548e-06, |
|
"loss": 0.8486, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.5330372015546918, |
|
"grad_norm": 1.498548947198836, |
|
"learning_rate": 1.5687918106563326e-06, |
|
"loss": 0.8335, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.5367388487877105, |
|
"grad_norm": 1.3845268813387288, |
|
"learning_rate": 1.5453542822575624e-06, |
|
"loss": 0.8426, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.5404404960207292, |
|
"grad_norm": 1.328604024294611, |
|
"learning_rate": 1.52206110798779e-06, |
|
"loss": 0.8365, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.5441421432537479, |
|
"grad_norm": 1.4713655680242212, |
|
"learning_rate": 1.4989132611641576e-06, |
|
"loss": 0.8175, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.5478437904867666, |
|
"grad_norm": 1.3754201562960036, |
|
"learning_rate": 1.4759117090312197e-06, |
|
"loss": 0.8157, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.5515454377197853, |
|
"grad_norm": 1.4552161883723067, |
|
"learning_rate": 1.453057412720536e-06, |
|
"loss": 0.813, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.555247084952804, |
|
"grad_norm": 1.4877720520235689, |
|
"learning_rate": 1.4303513272105057e-06, |
|
"loss": 0.7939, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.555247084952804, |
|
"eval_loss": 0.8845329880714417, |
|
"eval_runtime": 3.1387, |
|
"eval_samples_per_second": 40.781, |
|
"eval_steps_per_second": 10.195, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.5589487321858226, |
|
"grad_norm": 1.4146968178268984, |
|
"learning_rate": 1.4077944012864636e-06, |
|
"loss": 0.793, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.5626503794188413, |
|
"grad_norm": 1.5418185804069942, |
|
"learning_rate": 1.3853875775010355e-06, |
|
"loss": 0.8544, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.56635202665186, |
|
"grad_norm": 1.4973010828815396, |
|
"learning_rate": 1.3631317921347564e-06, |
|
"loss": 0.867, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.5700536738848787, |
|
"grad_norm": 1.463812986781743, |
|
"learning_rate": 1.3410279751569399e-06, |
|
"loss": 0.8358, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.5737553211178974, |
|
"grad_norm": 1.3801862403048382, |
|
"learning_rate": 1.3190770501868243e-06, |
|
"loss": 0.8258, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.577456968350916, |
|
"grad_norm": 1.5444134418998612, |
|
"learning_rate": 1.297279934454978e-06, |
|
"loss": 0.8433, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.5811586155839348, |
|
"grad_norm": 1.3687130856493455, |
|
"learning_rate": 1.2756375387649717e-06, |
|
"loss": 0.852, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.5848602628169535, |
|
"grad_norm": 1.5302115751085112, |
|
"learning_rate": 1.25415076745532e-06, |
|
"loss": 0.8348, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.5885619100499722, |
|
"grad_norm": 1.464801965717105, |
|
"learning_rate": 1.2328205183616964e-06, |
|
"loss": 0.8086, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.5922635572829908, |
|
"grad_norm": 1.3048364093071438, |
|
"learning_rate": 1.2116476827794104e-06, |
|
"loss": 0.7801, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.5959652045160095, |
|
"grad_norm": 1.396464881466601, |
|
"learning_rate": 1.1906331454261704e-06, |
|
"loss": 0.8208, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.5996668517490282, |
|
"grad_norm": 1.4487066144219591, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 0.8456, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.603368498982047, |
|
"grad_norm": 1.3565371093143543, |
|
"learning_rate": 1.1490824711681026e-06, |
|
"loss": 0.8212, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.6070701462150656, |
|
"grad_norm": 1.4612417437272762, |
|
"learning_rate": 1.1285480704793378e-06, |
|
"loss": 0.8541, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.6107717934480843, |
|
"grad_norm": 1.3751693996660053, |
|
"learning_rate": 1.1081754403792e-06, |
|
"loss": 0.816, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.614473440681103, |
|
"grad_norm": 1.335890753357732, |
|
"learning_rate": 1.0879654321484012e-06, |
|
"loss": 0.8222, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.6181750879141217, |
|
"grad_norm": 1.3676584150075997, |
|
"learning_rate": 1.067918890272419e-06, |
|
"loss": 0.8173, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.6218767351471404, |
|
"grad_norm": 1.3485546542471953, |
|
"learning_rate": 1.0480366524062041e-06, |
|
"loss": 0.8132, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.625578382380159, |
|
"grad_norm": 1.3989040385949476, |
|
"learning_rate": 1.0283195493391823e-06, |
|
"loss": 0.8432, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.6292800296131777, |
|
"grad_norm": 1.4025870546853108, |
|
"learning_rate": 1.008768404960535e-06, |
|
"loss": 0.8261, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.6292800296131777, |
|
"eval_loss": 0.8836302757263184, |
|
"eval_runtime": 3.3833, |
|
"eval_samples_per_second": 37.833, |
|
"eval_steps_per_second": 9.458, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.6329816768461964, |
|
"grad_norm": 1.3226535031128235, |
|
"learning_rate": 9.893840362247809e-07, |
|
"loss": 0.8108, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.6366833240792151, |
|
"grad_norm": 1.3959909229691634, |
|
"learning_rate": 9.701672531176287e-07, |
|
"loss": 0.828, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.6403849713122338, |
|
"grad_norm": 1.37325195717116, |
|
"learning_rate": 9.511188586221376e-07, |
|
"loss": 0.8029, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.6440866185452525, |
|
"grad_norm": 1.4152320624730137, |
|
"learning_rate": 9.322396486851626e-07, |
|
"loss": 0.7911, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.6477882657782712, |
|
"grad_norm": 1.4521133158474797, |
|
"learning_rate": 9.135304121840976e-07, |
|
"loss": 0.8333, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.65148991301129, |
|
"grad_norm": 1.4411308803523077, |
|
"learning_rate": 8.949919308939081e-07, |
|
"loss": 0.8353, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.6551915602443086, |
|
"grad_norm": 1.3784078254251741, |
|
"learning_rate": 8.766249794544662e-07, |
|
"loss": 0.8309, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.6588932074773273, |
|
"grad_norm": 1.4766503381010505, |
|
"learning_rate": 8.584303253381848e-07, |
|
"loss": 0.8055, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.662594854710346, |
|
"grad_norm": 1.3921814729723736, |
|
"learning_rate": 8.404087288179425e-07, |
|
"loss": 0.8267, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.6662965019433646, |
|
"grad_norm": 1.3707302114470679, |
|
"learning_rate": 8.225609429353187e-07, |
|
"loss": 0.8199, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.6699981491763833, |
|
"grad_norm": 1.3927411764790272, |
|
"learning_rate": 8.048877134691269e-07, |
|
"loss": 0.8066, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.673699796409402, |
|
"grad_norm": 1.4414260582973282, |
|
"learning_rate": 7.873897789042523e-07, |
|
"loss": 0.8185, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.6774014436424207, |
|
"grad_norm": 1.5093969886036152, |
|
"learning_rate": 7.700678704007947e-07, |
|
"loss": 0.785, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.6811030908754394, |
|
"grad_norm": 1.501532314290818, |
|
"learning_rate": 7.529227117635135e-07, |
|
"loss": 0.8534, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.684804738108458, |
|
"grad_norm": 1.524396041296438, |
|
"learning_rate": 7.35955019411585e-07, |
|
"loss": 0.8538, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.688506385341477, |
|
"grad_norm": 1.3885129879362617, |
|
"learning_rate": 7.191655023486682e-07, |
|
"loss": 0.849, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.6922080325744957, |
|
"grad_norm": 1.3160225292105472, |
|
"learning_rate": 7.02554862133275e-07, |
|
"loss": 0.7977, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.6959096798075144, |
|
"grad_norm": 1.3942465230818648, |
|
"learning_rate": 6.86123792849458e-07, |
|
"loss": 0.81, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.699611327040533, |
|
"grad_norm": 1.3470217330535852, |
|
"learning_rate": 6.698729810778065e-07, |
|
"loss": 0.8524, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.7033129742735518, |
|
"grad_norm": 1.361657567844148, |
|
"learning_rate": 6.53803105866761e-07, |
|
"loss": 0.799, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.7033129742735518, |
|
"eval_loss": 0.8829113245010376, |
|
"eval_runtime": 3.1398, |
|
"eval_samples_per_second": 40.767, |
|
"eval_steps_per_second": 10.192, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.7070146215065705, |
|
"grad_norm": 1.3715303747885355, |
|
"learning_rate": 6.379148387042317e-07, |
|
"loss": 0.8144, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.7107162687395892, |
|
"grad_norm": 1.7715797855954751, |
|
"learning_rate": 6.222088434895462e-07, |
|
"loss": 0.7851, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.7144179159726078, |
|
"grad_norm": 1.521724124428476, |
|
"learning_rate": 6.066857765057055e-07, |
|
"loss": 0.7997, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.7181195632056265, |
|
"grad_norm": 1.348492376638381, |
|
"learning_rate": 5.9134628639196e-07, |
|
"loss": 0.8309, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.7218212104386452, |
|
"grad_norm": 1.347352454847091, |
|
"learning_rate": 5.76191014116711e-07, |
|
"loss": 0.8454, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.725522857671664, |
|
"grad_norm": 1.3893414914267381, |
|
"learning_rate": 5.612205929507209e-07, |
|
"loss": 0.8247, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.7292245049046826, |
|
"grad_norm": 1.4200173691770128, |
|
"learning_rate": 5.464356484406535e-07, |
|
"loss": 0.7975, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.7329261521377013, |
|
"grad_norm": 1.5791470882024337, |
|
"learning_rate": 5.318367983829393e-07, |
|
"loss": 0.8412, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.73662779937072, |
|
"grad_norm": 1.3494376829357295, |
|
"learning_rate": 5.174246527979532e-07, |
|
"loss": 0.8272, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.7403294466037387, |
|
"grad_norm": 1.4384763987684341, |
|
"learning_rate": 5.031998139045352e-07, |
|
"loss": 0.7994, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.7440310938367574, |
|
"grad_norm": 1.5445823627285267, |
|
"learning_rate": 4.891628760948114e-07, |
|
"loss": 0.8299, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.747732741069776, |
|
"grad_norm": 1.3753338907841655, |
|
"learning_rate": 4.753144259093734e-07, |
|
"loss": 0.8218, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.7514343883027947, |
|
"grad_norm": 1.404118293213505, |
|
"learning_rate": 4.6165504201275635e-07, |
|
"loss": 0.8347, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.7551360355358134, |
|
"grad_norm": 1.3108969691800596, |
|
"learning_rate": 4.481852951692672e-07, |
|
"loss": 0.7868, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.7588376827688321, |
|
"grad_norm": 1.4035628915119207, |
|
"learning_rate": 4.349057482191299e-07, |
|
"loss": 0.8085, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.7625393300018508, |
|
"grad_norm": 1.3772863237275759, |
|
"learning_rate": 4.2181695605497066e-07, |
|
"loss": 0.8328, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.7662409772348695, |
|
"grad_norm": 1.4237021945882211, |
|
"learning_rate": 4.089194655986306e-07, |
|
"loss": 0.771, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.7699426244678882, |
|
"grad_norm": 1.4129993844795357, |
|
"learning_rate": 3.9621381577830855e-07, |
|
"loss": 0.8269, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.7736442717009069, |
|
"grad_norm": 1.4812385056598143, |
|
"learning_rate": 3.837005375060482e-07, |
|
"loss": 0.8034, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.7773459189339256, |
|
"grad_norm": 1.3957235505948293, |
|
"learning_rate": 3.7138015365554834e-07, |
|
"loss": 0.835, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.7773459189339256, |
|
"eval_loss": 0.8825176954269409, |
|
"eval_runtime": 3.1663, |
|
"eval_samples_per_second": 40.425, |
|
"eval_steps_per_second": 10.106, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.7810475661669443, |
|
"grad_norm": 1.3595887596000176, |
|
"learning_rate": 3.592531790403159e-07, |
|
"loss": 0.8353, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.784749213399963, |
|
"grad_norm": 1.44472224358499, |
|
"learning_rate": 3.473201203921578e-07, |
|
"loss": 0.8405, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.7884508606329816, |
|
"grad_norm": 1.4673021994124296, |
|
"learning_rate": 3.355814763399973e-07, |
|
"loss": 0.8283, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.7921525078660003, |
|
"grad_norm": 1.3427896673881607, |
|
"learning_rate": 3.2403773738905185e-07, |
|
"loss": 0.817, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.795854155099019, |
|
"grad_norm": 1.3240197787291788, |
|
"learning_rate": 3.1268938590032495e-07, |
|
"loss": 0.8532, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.7995558023320377, |
|
"grad_norm": 1.38845574713167, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 0.8535, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.8032574495650564, |
|
"grad_norm": 1.3363830707149509, |
|
"learning_rate": 2.905807339119138e-07, |
|
"loss": 0.8351, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.806959096798075, |
|
"grad_norm": 1.3538146792103303, |
|
"learning_rate": 2.798213572335001e-07, |
|
"loss": 0.8164, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.8106607440310938, |
|
"grad_norm": 1.3417811155846222, |
|
"learning_rate": 2.6925921562124867e-07, |
|
"loss": 0.7985, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.8143623912641127, |
|
"grad_norm": 1.5439499436760904, |
|
"learning_rate": 2.5889475041961767e-07, |
|
"loss": 0.8514, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.8180640384971314, |
|
"grad_norm": 1.5313102293162648, |
|
"learning_rate": 2.487283947130609e-07, |
|
"loss": 0.7971, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 1.82176568573015, |
|
"grad_norm": 1.3449133576002705, |
|
"learning_rate": 2.3876057330792344e-07, |
|
"loss": 0.843, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.8254673329631688, |
|
"grad_norm": 1.3175933714693975, |
|
"learning_rate": 2.289917027146943e-07, |
|
"loss": 0.7552, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 1.8291689801961875, |
|
"grad_norm": 1.3182913196070813, |
|
"learning_rate": 2.1942219113060215e-07, |
|
"loss": 0.8457, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.8328706274292061, |
|
"grad_norm": 1.4098311917433477, |
|
"learning_rate": 2.1005243842255552e-07, |
|
"loss": 0.8181, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.8365722746622248, |
|
"grad_norm": 1.4157284927848202, |
|
"learning_rate": 2.0088283611044034e-07, |
|
"loss": 0.8178, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.8402739218952435, |
|
"grad_norm": 1.3768335774801836, |
|
"learning_rate": 1.919137673507543e-07, |
|
"loss": 0.8172, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 1.8439755691282622, |
|
"grad_norm": 1.3937310960674392, |
|
"learning_rate": 1.8314560692059836e-07, |
|
"loss": 0.7978, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.847677216361281, |
|
"grad_norm": 1.400084035871816, |
|
"learning_rate": 1.745787212020178e-07, |
|
"loss": 0.8199, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 1.8513788635942996, |
|
"grad_norm": 1.3810305734159727, |
|
"learning_rate": 1.6621346816668993e-07, |
|
"loss": 0.8084, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.8513788635942996, |
|
"eval_loss": 0.8822975754737854, |
|
"eval_runtime": 3.1487, |
|
"eval_samples_per_second": 40.652, |
|
"eval_steps_per_second": 10.163, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.8550805108273183, |
|
"grad_norm": 1.4618272988152217, |
|
"learning_rate": 1.5805019736097105e-07, |
|
"loss": 0.7749, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 1.858782158060337, |
|
"grad_norm": 1.3590727353732708, |
|
"learning_rate": 1.500892498912826e-07, |
|
"loss": 0.8444, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.8624838052933557, |
|
"grad_norm": 1.3186307231349548, |
|
"learning_rate": 1.4233095840986756e-07, |
|
"loss": 0.8406, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 1.8661854525263744, |
|
"grad_norm": 1.4373682696099375, |
|
"learning_rate": 1.3477564710088097e-07, |
|
"loss": 0.8209, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.869887099759393, |
|
"grad_norm": 1.2914165232423356, |
|
"learning_rate": 1.2742363166685035e-07, |
|
"loss": 0.792, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.8735887469924117, |
|
"grad_norm": 1.3471998409083057, |
|
"learning_rate": 1.2027521931548214e-07, |
|
"loss": 0.8124, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.8772903942254304, |
|
"grad_norm": 1.2986670359344874, |
|
"learning_rate": 1.1333070874682217e-07, |
|
"loss": 0.792, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 1.8809920414584491, |
|
"grad_norm": 1.3826040955045793, |
|
"learning_rate": 1.0659039014077943e-07, |
|
"loss": 0.8701, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.8846936886914678, |
|
"grad_norm": 1.3920437020559047, |
|
"learning_rate": 1.0005454514499413e-07, |
|
"loss": 0.8087, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 1.8883953359244865, |
|
"grad_norm": 1.4364521156535976, |
|
"learning_rate": 9.372344686307655e-08, |
|
"loss": 0.8298, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.8920969831575052, |
|
"grad_norm": 1.4438874847233494, |
|
"learning_rate": 8.759735984318896e-08, |
|
"loss": 0.8073, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 1.8957986303905239, |
|
"grad_norm": 1.3657337652949615, |
|
"learning_rate": 8.167654006699444e-08, |
|
"loss": 0.8157, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.8995002776235426, |
|
"grad_norm": 1.3090182622064548, |
|
"learning_rate": 7.59612349389599e-08, |
|
"loss": 0.8326, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 1.9032019248565613, |
|
"grad_norm": 1.4794729385434366, |
|
"learning_rate": 7.04516832760177e-08, |
|
"loss": 0.7837, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.90690357208958, |
|
"grad_norm": 1.4008235743572839, |
|
"learning_rate": 6.514811529758747e-08, |
|
"loss": 0.8019, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.9106052193225986, |
|
"grad_norm": 1.449723166913723, |
|
"learning_rate": 6.005075261595495e-08, |
|
"loss": 0.8055, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.9143068665556173, |
|
"grad_norm": 1.3450928085498033, |
|
"learning_rate": 5.515980822701439e-08, |
|
"loss": 0.8349, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 1.918008513788636, |
|
"grad_norm": 1.4416195462263817, |
|
"learning_rate": 5.047548650136513e-08, |
|
"loss": 0.7795, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.9217101610216547, |
|
"grad_norm": 1.5497642083681684, |
|
"learning_rate": 4.599798317577342e-08, |
|
"loss": 0.87, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 1.9254118082546734, |
|
"grad_norm": 1.2616319538457434, |
|
"learning_rate": 4.172748534499449e-08, |
|
"loss": 0.7883, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.9254118082546734, |
|
"eval_loss": 0.8820919990539551, |
|
"eval_runtime": 3.1563, |
|
"eval_samples_per_second": 40.553, |
|
"eval_steps_per_second": 10.138, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.929113455487692, |
|
"grad_norm": 1.6535833071966568, |
|
"learning_rate": 3.766417145395218e-08, |
|
"loss": 0.7991, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 1.9328151027207108, |
|
"grad_norm": 1.5619482603381143, |
|
"learning_rate": 3.3808211290284886e-08, |
|
"loss": 0.8263, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.9365167499537295, |
|
"grad_norm": 1.4670311662402673, |
|
"learning_rate": 3.015976597725068e-08, |
|
"loss": 0.8334, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 1.9402183971867482, |
|
"grad_norm": 1.4739887369993028, |
|
"learning_rate": 2.6718987966992683e-08, |
|
"loss": 0.7994, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.9439200444197668, |
|
"grad_norm": 1.4481950895466418, |
|
"learning_rate": 2.3486021034170857e-08, |
|
"loss": 0.843, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.9476216916527855, |
|
"grad_norm": 1.4687544734332336, |
|
"learning_rate": 2.0461000269953457e-08, |
|
"loss": 0.8637, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.9513233388858042, |
|
"grad_norm": 1.4517252835572454, |
|
"learning_rate": 1.7644052076371544e-08, |
|
"loss": 0.8339, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 1.955024986118823, |
|
"grad_norm": 1.263580454161453, |
|
"learning_rate": 1.5035294161039882e-08, |
|
"loss": 0.8399, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.9587266333518416, |
|
"grad_norm": 1.349067030187528, |
|
"learning_rate": 1.2634835532233658e-08, |
|
"loss": 0.8368, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 1.9624282805848603, |
|
"grad_norm": 1.366301568196159, |
|
"learning_rate": 1.044277649433989e-08, |
|
"loss": 0.8232, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.966129927817879, |
|
"grad_norm": 1.3964320267142443, |
|
"learning_rate": 8.459208643659122e-09, |
|
"loss": 0.7839, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 1.9698315750508977, |
|
"grad_norm": 1.3674713032214498, |
|
"learning_rate": 6.6842148645840374e-09, |
|
"loss": 0.8247, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.9735332222839164, |
|
"grad_norm": 1.3670892065984888, |
|
"learning_rate": 5.11786932613223e-09, |
|
"loss": 0.7976, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 1.977234869516935, |
|
"grad_norm": 1.3770585342143995, |
|
"learning_rate": 3.760237478849793e-09, |
|
"loss": 0.8151, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.9809365167499537, |
|
"grad_norm": 1.4348292274334755, |
|
"learning_rate": 2.611376052073511e-09, |
|
"loss": 0.8425, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 1.9846381639829724, |
|
"grad_norm": 1.390482722091134, |
|
"learning_rate": 1.6713330515627512e-09, |
|
"loss": 0.8398, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.9883398112159911, |
|
"grad_norm": 1.3732191133970555, |
|
"learning_rate": 9.401477574932927e-10, |
|
"loss": 0.8052, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 1.9920414584490098, |
|
"grad_norm": 1.4919098476479833, |
|
"learning_rate": 4.178507228136397e-10, |
|
"loss": 0.7967, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.9957431056820285, |
|
"grad_norm": 1.3717252115748237, |
|
"learning_rate": 1.0446377197104174e-10, |
|
"loss": 0.7917, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 1.9994447529150472, |
|
"grad_norm": 1.397259169666562, |
|
"learning_rate": 0.0, |
|
"loss": 0.8122, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.9994447529150472, |
|
"eval_loss": 0.8820245862007141, |
|
"eval_runtime": 3.1585, |
|
"eval_samples_per_second": 40.525, |
|
"eval_steps_per_second": 10.131, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.9994447529150472, |
|
"step": 2700, |
|
"total_flos": 75972590174208.0, |
|
"train_loss": 0.8770522540586966, |
|
"train_runtime": 4288.7111, |
|
"train_samples_per_second": 10.078, |
|
"train_steps_per_second": 0.63 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2700, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 75972590174208.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|