{
  "best_metric": 1.2184005975723267,
  "best_model_checkpoint": "./output/training_results/C021_random_sample_llama3-8b-base_pretrain_20240505_135320/checkpoint-800",
  "epoch": 4.0,
  "eval_steps": 200,
  "global_step": 1876,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0021321961620469083,
      "grad_norm": 0.0,
      "learning_rate": 0.0,
      "loss": 1.3827,
      "step": 1
    },
    {
      "epoch": 0.010660980810234541,
      "grad_norm": 2.0635243899751066,
      "learning_rate": 2.25e-06,
      "loss": 1.4616,
      "step": 5
    },
    {
      "epoch": 0.021321961620469083,
      "grad_norm": 1.903018233428319,
      "learning_rate": 6e-06,
      "loss": 1.2299,
      "step": 10
    },
    {
      "epoch": 0.031982942430703626,
      "grad_norm": 1.5467734081397355,
      "learning_rate": 9.75e-06,
      "loss": 1.2843,
      "step": 15
    },
    {
      "epoch": 0.042643923240938165,
      "grad_norm": 1.995653146329813,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 1.31,
      "step": 20
    },
    {
      "epoch": 0.053304904051172705,
      "grad_norm": 2.410826051077061,
      "learning_rate": 1.473632431655486e-05,
      "loss": 1.3681,
      "step": 25
    },
    {
      "epoch": 0.06396588486140725,
      "grad_norm": 1.765947093190964,
      "learning_rate": 1.4306244585426726e-05,
      "loss": 1.3228,
      "step": 30
    },
    {
      "epoch": 0.07462686567164178,
      "grad_norm": 1.8242122051647833,
      "learning_rate": 1.3887645147313336e-05,
      "loss": 1.2664,
      "step": 35
    },
    {
      "epoch": 0.08528784648187633,
      "grad_norm": 1.8945258342407882,
      "learning_rate": 1.3560845370175857e-05,
      "loss": 1.3073,
      "step": 40
    },
    {
      "epoch": 0.09594882729211088,
      "grad_norm": 1.4272941224837,
      "learning_rate": 1.3162217563557544e-05,
      "loss": 1.3019,
      "step": 45
    },
    {
      "epoch": 0.10660980810234541,
      "grad_norm": 1.4171437963555575,
      "learning_rate": 1.2774310614975657e-05,
      "loss": 1.3613,
      "step": 50
    },
    {
      "epoch": 0.11727078891257996,
      "grad_norm": 1.4495644289679146,
      "learning_rate": 1.2396864324754692e-05,
      "loss": 1.4084,
      "step": 55
    },
    {
      "epoch": 0.1279317697228145,
      "grad_norm": 1.34494402339979,
      "learning_rate": 1.2029624121909564e-05,
      "loss": 1.3002,
      "step": 60
    },
    {
      "epoch": 0.13859275053304904,
      "grad_norm": 1.438232163080863,
      "learning_rate": 1.1672340957315019e-05,
      "loss": 1.3643,
      "step": 65
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 1.5674663877824588,
      "learning_rate": 1.1324771198617955e-05,
      "loss": 1.2599,
      "step": 70
    },
    {
      "epoch": 0.15991471215351813,
      "grad_norm": 1.3266580498364144,
      "learning_rate": 1.0986676526868454e-05,
      "loss": 1.307,
      "step": 75
    },
    {
      "epoch": 0.17057569296375266,
      "grad_norm": 3.7513449392256173,
      "learning_rate": 1.0657823834846472e-05,
      "loss": 1.3701,
      "step": 80
    },
    {
      "epoch": 0.1812366737739872,
      "grad_norm": 5.184018665584405,
      "learning_rate": 1.0337985127060489e-05,
      "loss": 1.37,
      "step": 85
    },
    {
      "epoch": 0.19189765458422176,
      "grad_norm": 1.848934855985837,
      "learning_rate": 1.0026937421395617e-05,
      "loss": 1.2708,
      "step": 90
    },
    {
      "epoch": 0.2025586353944563,
      "grad_norm": 1.579847110813568,
      "learning_rate": 9.724462652387962e-06,
      "loss": 1.3402,
      "step": 95
    },
    {
      "epoch": 0.21321961620469082,
      "grad_norm": 1.5615282561022026,
      "learning_rate": 9.430347576103212e-06,
      "loss": 1.2961,
      "step": 100
    },
    {
      "epoch": 0.22388059701492538,
      "grad_norm": 1.3659017705844316,
      "learning_rate": 9.1443836765968e-06,
      "loss": 1.2203,
      "step": 105
    },
    {
      "epoch": 0.2345415778251599,
      "grad_norm": 1.4562735931359865,
      "learning_rate": 8.921344243503367e-06,
      "loss": 1.2945,
      "step": 110
    },
    {
      "epoch": 0.24520255863539445,
      "grad_norm": 1.2855121058923065,
      "learning_rate": 8.64954174238431e-06,
      "loss": 1.295,
      "step": 115
    },
    {
      "epoch": 0.255863539445629,
      "grad_norm": 1.3082074964800576,
      "learning_rate": 8.385330949249671e-06,
      "loss": 1.3053,
      "step": 120
    },
    {
      "epoch": 0.26652452025586354,
      "grad_norm": 1.2815031342227978,
      "learning_rate": 8.128520488668967e-06,
      "loss": 1.2994,
      "step": 125
    },
    {
      "epoch": 0.2771855010660981,
      "grad_norm": 1.3107524150250855,
      "learning_rate": 7.878923285543024e-06,
      "loss": 1.3281,
      "step": 130
    },
    {
      "epoch": 0.2878464818763326,
      "grad_norm": 1.1907535460514518,
      "learning_rate": 7.636356480312689e-06,
      "loss": 1.2579,
      "step": 135
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 1.2284678609494983,
      "learning_rate": 7.400641345604814e-06,
      "loss": 1.1634,
      "step": 140
    },
    {
      "epoch": 0.3091684434968017,
      "grad_norm": 1.3826281905120297,
      "learning_rate": 7.171603204294806e-06,
      "loss": 1.3058,
      "step": 145
    },
    {
      "epoch": 0.31982942430703626,
      "grad_norm": 2.431901009888026,
      "learning_rate": 6.949071348965877e-06,
      "loss": 1.3096,
      "step": 150
    },
    {
      "epoch": 0.3304904051172708,
      "grad_norm": 1.4504346901041576,
      "learning_rate": 6.732878962744999e-06,
      "loss": 1.2943,
      "step": 155
    },
    {
      "epoch": 0.3411513859275053,
      "grad_norm": 1.2723296431138542,
      "learning_rate": 6.5228630414958555e-06,
      "loss": 1.351,
      "step": 160
    },
    {
      "epoch": 0.35181236673773986,
      "grad_norm": 1.2139070950942783,
      "learning_rate": 6.318864317349401e-06,
      "loss": 1.2937,
      "step": 165
    },
    {
      "epoch": 0.3624733475479744,
      "grad_norm": 5.348465582649999,
      "learning_rate": 6.120727183552839e-06,
      "loss": 1.2726,
      "step": 170
    },
    {
      "epoch": 0.373134328358209,
      "grad_norm": 1.6504356499880541,
      "learning_rate": 5.928299620617992e-06,
      "loss": 1.3039,
      "step": 175
    },
    {
      "epoch": 0.3837953091684435,
      "grad_norm": 1.1981923340348997,
      "learning_rate": 5.7414331237502024e-06,
      "loss": 1.2027,
      "step": 180
    },
    {
      "epoch": 0.39445628997867804,
      "grad_norm": 1.229607999022085,
      "learning_rate": 5.559982631539405e-06,
      "loss": 1.2637,
      "step": 185
    },
    {
      "epoch": 0.4051172707889126,
      "grad_norm": 4.9301877176481606,
      "learning_rate": 5.383806455894783e-06,
      "loss": 1.2601,
      "step": 190
    },
    {
      "epoch": 0.4157782515991471,
      "grad_norm": 1.2303183328754892,
      "learning_rate": 5.212766213205117e-06,
      "loss": 1.1995,
      "step": 195
    },
    {
      "epoch": 0.42643923240938164,
      "grad_norm": 1.1512051049767256,
      "learning_rate": 5.046726756706699e-06,
      "loss": 1.2572,
      "step": 200
    },
    {
      "epoch": 0.42643923240938164,
      "eval_loss": 1.261182188987732,
      "eval_runtime": 87.341,
      "eval_samples_per_second": 19.086,
      "eval_steps_per_second": 0.309,
      "step": 200
    },
    {
      "epoch": 0.43710021321961623,
      "grad_norm": 1.2483299149897367,
      "learning_rate": 4.885556110041288e-06,
      "loss": 1.2665,
      "step": 205
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 1.1979266901021162,
      "learning_rate": 4.7291254019864345e-06,
      "loss": 1.1936,
      "step": 210
    },
    {
      "epoch": 0.4584221748400853,
      "grad_norm": 1.1685251787317164,
      "learning_rate": 4.577308802341063e-06,
      "loss": 1.3458,
      "step": 215
    },
    {
      "epoch": 0.4690831556503198,
      "grad_norm": 1.156653489171017,
      "learning_rate": 4.42998345894903e-06,
      "loss": 1.3039,
      "step": 220
    },
    {
      "epoch": 0.47974413646055436,
      "grad_norm": 1.2415273415640915,
      "learning_rate": 4.287029435843979e-06,
      "loss": 1.2127,
      "step": 225
    },
    {
      "epoch": 0.4904051172707889,
      "grad_norm": 1.0890064939371906,
      "learning_rate": 4.148329652498597e-06,
      "loss": 1.2844,
      "step": 230
    },
    {
      "epoch": 0.5010660980810234,
      "grad_norm": 1.2341998642747427,
      "learning_rate": 4.013769824161997e-06,
      "loss": 1.2989,
      "step": 235
    },
    {
      "epoch": 0.511727078891258,
      "grad_norm": 1.0800044414980956,
      "learning_rate": 3.883238403268737e-06,
      "loss": 1.18,
      "step": 240
    },
    {
      "epoch": 0.5223880597014925,
      "grad_norm": 1.1911502497460917,
      "learning_rate": 3.7566265219035852e-06,
      "loss": 1.2198,
      "step": 245
    },
    {
      "epoch": 0.5330490405117271,
      "grad_norm": 1.1875916676432052,
      "learning_rate": 3.633827935305925e-06,
      "loss": 1.2321,
      "step": 250
    },
    {
      "epoch": 0.5437100213219617,
      "grad_norm": 1.1285856989543368,
      "learning_rate": 3.5147389663983076e-06,
      "loss": 1.2063,
      "step": 255
    },
    {
      "epoch": 0.5543710021321961,
      "grad_norm": 1.2134517080609044,
      "learning_rate": 3.3992584513234327e-06,
      "loss": 1.2034,
      "step": 260
    },
    {
      "epoch": 0.5650319829424307,
      "grad_norm": 1.1497034722164001,
      "learning_rate": 3.2872876859744165e-06,
      "loss": 1.2478,
      "step": 265
    },
    {
      "epoch": 0.5756929637526652,
      "grad_norm": 1.1573953347115111,
      "learning_rate": 3.1787303735030416e-06,
      "loss": 1.217,
      "step": 270
    },
    {
      "epoch": 0.5863539445628998,
      "grad_norm": 1.1278493709315967,
      "learning_rate": 3.0734925727911652e-06,
      "loss": 1.2251,
      "step": 275
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 1.2020449597183245,
      "learning_rate": 2.971482647870452e-06,
      "loss": 1.1144,
      "step": 280
    },
    {
      "epoch": 0.6076759061833689,
      "grad_norm": 1.2203652918926715,
      "learning_rate": 2.8726112182758347e-06,
      "loss": 1.281,
      "step": 285
    },
    {
      "epoch": 0.6183368869936035,
      "grad_norm": 1.1541053058741007,
      "learning_rate": 2.7767911103183137e-06,
      "loss": 1.2344,
      "step": 290
    },
    {
      "epoch": 0.6289978678038379,
      "grad_norm": 1.170855117533204,
      "learning_rate": 2.6839373092628783e-06,
      "loss": 1.1737,
      "step": 295
    },
    {
      "epoch": 0.6396588486140725,
      "grad_norm": 1.1313568462008778,
      "learning_rate": 2.593966912397475e-06,
      "loss": 1.2395,
      "step": 300
    },
    {
      "epoch": 0.650319829424307,
      "grad_norm": 1.0884221660719775,
      "learning_rate": 2.5067990829791376e-06,
      "loss": 1.2274,
      "step": 305
    },
    {
      "epoch": 0.6609808102345416,
      "grad_norm": 1.1287129229662805,
      "learning_rate": 2.4223550050435816e-06,
      "loss": 1.2406,
      "step": 310
    },
    {
      "epoch": 0.6716417910447762,
      "grad_norm": 1.158009309923302,
      "learning_rate": 2.340557839064751e-06,
      "loss": 1.2206,
      "step": 315
    },
    {
      "epoch": 0.6823027718550106,
      "grad_norm": 1.120129915614599,
      "learning_rate": 2.261332678450931e-06,
      "loss": 1.1631,
      "step": 320
    },
    {
      "epoch": 0.6929637526652452,
      "grad_norm": 1.2657256378770898,
      "learning_rate": 2.184606506864227e-06,
      "loss": 1.2707,
      "step": 325
    },
    {
      "epoch": 0.7036247334754797,
      "grad_norm": 1.1637951032951959,
      "learning_rate": 2.11030815635039e-06,
      "loss": 1.2491,
      "step": 330
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 1.2287205505399468,
      "learning_rate": 2.0383682662661417e-06,
      "loss": 1.22,
      "step": 335
    },
    {
      "epoch": 0.7249466950959488,
      "grad_norm": 1.326574025149143,
      "learning_rate": 1.9687192429912924e-06,
      "loss": 1.2984,
      "step": 340
    },
    {
      "epoch": 0.7356076759061834,
      "grad_norm": 1.1554026106853619,
      "learning_rate": 1.9012952204130788e-06,
      "loss": 1.2208,
      "step": 345
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 1.0875768598668099,
      "learning_rate": 1.8360320211704227e-06,
      "loss": 1.2204,
      "step": 350
    },
    {
      "epoch": 0.7569296375266524,
      "grad_norm": 1.14982588486912,
      "learning_rate": 1.772867118645806e-06,
      "loss": 1.2091,
      "step": 355
    },
    {
      "epoch": 0.767590618336887,
      "grad_norm": 1.1836213945640173,
      "learning_rate": 1.7117395996927912e-06,
      "loss": 1.2247,
      "step": 360
    },
    {
      "epoch": 0.7782515991471215,
      "grad_norm": 1.1378213023095496,
      "learning_rate": 1.6525901280871983e-06,
      "loss": 1.1597,
      "step": 365
    },
    {
      "epoch": 0.7889125799573561,
      "grad_norm": 1.146991526368693,
      "learning_rate": 1.5953609086902758e-06,
      "loss": 1.165,
      "step": 370
    },
    {
      "epoch": 0.7995735607675906,
      "grad_norm": 1.0759826941866089,
      "learning_rate": 1.539995652312188e-06,
      "loss": 1.1934,
      "step": 375
    },
    {
      "epoch": 0.8102345415778252,
      "grad_norm": 1.140264301703701,
      "learning_rate": 1.486439541264454e-06,
      "loss": 1.2476,
      "step": 380
    },
    {
      "epoch": 0.8208955223880597,
      "grad_norm": 1.1750555438236339,
      "learning_rate": 1.434639195589973e-06,
      "loss": 1.2283,
      "step": 385
    },
    {
      "epoch": 0.8315565031982942,
      "grad_norm": 1.1027256002443178,
      "learning_rate": 1.384542639959556e-06,
      "loss": 1.2392,
      "step": 390
    },
    {
      "epoch": 0.8422174840085288,
      "grad_norm": 1.0969958993285855,
      "learning_rate": 1.336099271223909e-06,
      "loss": 1.2932,
      "step": 395
    },
    {
      "epoch": 0.8528784648187633,
      "grad_norm": 1.1632096677157433,
      "learning_rate": 1.28925982661026e-06,
      "loss": 1.1754,
      "step": 400
    },
    {
      "epoch": 0.8528784648187633,
      "eval_loss": 1.2225849628448486,
      "eval_runtime": 86.933,
      "eval_samples_per_second": 19.176,
      "eval_steps_per_second": 0.311,
      "step": 400
    },
    {
      "epoch": 0.8635394456289979,
      "grad_norm": 1.2018672364738783,
      "learning_rate": 1.243976352552906e-06,
      "loss": 1.1537,
      "step": 405
    },
    {
      "epoch": 0.8742004264392325,
      "grad_norm": 1.1964994592566494,
      "learning_rate": 1.2002021741471036e-06,
      "loss": 1.2153,
      "step": 410
    },
    {
      "epoch": 0.8848614072494669,
      "grad_norm": 1.2085715531383925,
      "learning_rate": 1.1578918652158905e-06,
      "loss": 1.2214,
      "step": 415
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 1.132268550904319,
      "learning_rate": 1.1170012189795644e-06,
      "loss": 1.1696,
      "step": 420
    },
    {
      "epoch": 0.906183368869936,
      "grad_norm": 1.1888002460068672,
      "learning_rate": 1.0774872193176617e-06,
      "loss": 1.2102,
      "step": 425
    },
    {
      "epoch": 0.9168443496801706,
      "grad_norm": 1.081884966553129,
      "learning_rate": 1.039308012613421e-06,
      "loss": 1.1888,
      "step": 430
    },
    {
      "epoch": 0.9275053304904051,
      "grad_norm": 1.1082024653231661,
      "learning_rate": 1.0024228801708736e-06,
      "loss": 1.3088,
      "step": 435
    },
    {
      "epoch": 0.9381663113006397,
      "grad_norm": 1.1424196623047278,
      "learning_rate": 9.667922111948187e-07,
      "loss": 1.2702,
      "step": 440
    },
    {
      "epoch": 0.9488272921108742,
      "grad_norm": 1.1515848618493325,
      "learning_rate": 9.323774763240816e-07,
      "loss": 1.2241,
      "step": 445
    },
    {
      "epoch": 0.9594882729211087,
      "grad_norm": 1.133219633313401,
      "learning_rate": 8.991412017085673e-07,
      "loss": 1.1689,
      "step": 450
    },
    {
      "epoch": 0.9701492537313433,
      "grad_norm": 1.1817034727115894,
      "learning_rate": 8.670469436207778e-07,
      "loss": 1.2429,
      "step": 455
    },
    {
      "epoch": 0.9808102345415778,
      "grad_norm": 1.082929968453826,
      "learning_rate": 8.360592635925826e-07,
      "loss": 1.223,
      "step": 460
    },
    {
      "epoch": 0.9914712153518124,
      "grad_norm": 1.0995506789476814,
      "learning_rate": 8.061437040681486e-07,
      "loss": 1.2493,
      "step": 465
    },
    {
      "epoch": 1.0021321961620469,
      "grad_norm": 1.4325931391365254,
      "learning_rate": 7.772667645640643e-07,
      "loss": 1.2061,
      "step": 470
    },
    {
      "epoch": 1.0127931769722816,
      "grad_norm": 1.3020611480277295,
      "learning_rate": 7.493958783278302e-07,
      "loss": 1.118,
      "step": 475
    },
    {
      "epoch": 1.023454157782516,
      "grad_norm": 1.166419988285865,
      "learning_rate": 7.22499389486003e-07,
      "loss": 1.0997,
      "step": 480
    },
    {
      "epoch": 1.0341151385927505,
      "grad_norm": 1.1738691779163593,
      "learning_rate": 6.965465306734048e-07,
      "loss": 1.0648,
      "step": 485
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 1.1702760912470462,
      "learning_rate": 6.715074011349123e-07,
      "loss": 1.125,
      "step": 490
    },
    {
      "epoch": 1.0554371002132197,
      "grad_norm": 1.1386250553771318,
      "learning_rate": 6.473529452915077e-07,
      "loss": 1.12,
      "step": 495
    },
    {
      "epoch": 1.0660980810234542,
      "grad_norm": 1.145825473750238,
      "learning_rate": 6.240549317623237e-07,
      "loss": 1.1182,
      "step": 500
    },
    {
      "epoch": 1.0767590618336886,
      "grad_norm": 1.1263260101281638,
      "learning_rate": 6.015859328346043e-07,
      "loss": 1.0333,
      "step": 505
    },
    {
      "epoch": 1.0874200426439233,
      "grad_norm": 1.099611354066211,
      "learning_rate": 5.799193043735526e-07,
      "loss": 1.0545,
      "step": 510
    },
    {
      "epoch": 1.0980810234541578,
      "grad_norm": 1.2122056875352134,
      "learning_rate": 5.590291661642085e-07,
      "loss": 1.1501,
      "step": 515
    },
    {
      "epoch": 1.1087420042643923,
      "grad_norm": 1.2108073336890413,
      "learning_rate": 5.388903826775655e-07,
      "loss": 1.1355,
      "step": 520
    },
    {
      "epoch": 1.1194029850746268,
      "grad_norm": 1.1869552548231974,
      "learning_rate": 5.194785442532827e-07,
      "loss": 1.0177,
      "step": 525
    },
    {
      "epoch": 1.1300639658848615,
      "grad_norm": 1.0725136686500363,
      "learning_rate": 5.007699486914442e-07,
      "loss": 1.0673,
      "step": 530
    },
    {
      "epoch": 1.140724946695096,
      "grad_norm": 1.1715291176817577,
      "learning_rate": 4.827415832459197e-07,
      "loss": 0.9923,
      "step": 535
    },
    {
      "epoch": 1.1513859275053304,
      "grad_norm": 1.0912818419343246,
      "learning_rate": 4.6537110701200646e-07,
      "loss": 1.0214,
      "step": 540
    },
    {
      "epoch": 1.1620469083155651,
      "grad_norm": 1.2075583423687697,
      "learning_rate": 4.486368337011321e-07,
      "loss": 1.0799,
      "step": 545
    },
    {
      "epoch": 1.1727078891257996,
      "grad_norm": 1.1711621672939032,
      "learning_rate": 4.3251771479550135e-07,
      "loss": 1.0649,
      "step": 550
    },
    {
      "epoch": 1.183368869936034,
      "grad_norm": 1.1379570510356936,
      "learning_rate": 4.169933230756739e-07,
      "loss": 1.0275,
      "step": 555
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 1.1805410178961084,
      "learning_rate": 4.020438365141708e-07,
      "loss": 1.0501,
      "step": 560
    },
    {
      "epoch": 1.2046908315565032,
      "grad_norm": 1.0319879103444831,
      "learning_rate": 3.876500225283097e-07,
      "loss": 1.0616,
      "step": 565
    },
    {
      "epoch": 1.2153518123667377,
      "grad_norm": 1.1205106100162852,
      "learning_rate": 3.7379322258556177e-07,
      "loss": 1.0679,
      "step": 570
    },
    {
      "epoch": 1.2260127931769722,
      "grad_norm": 1.1518728152224136,
      "learning_rate": 3.6045533715483096e-07,
      "loss": 1.1601,
      "step": 575
    },
    {
      "epoch": 1.236673773987207,
      "grad_norm": 1.10359450734736,
      "learning_rate": 3.4761881099715165e-07,
      "loss": 1.0466,
      "step": 580
    },
    {
      "epoch": 1.2473347547974414,
      "grad_norm": 1.1001281681468988,
      "learning_rate": 3.3526661878940757e-07,
      "loss": 1.114,
      "step": 585
    },
    {
      "epoch": 1.2579957356076759,
      "grad_norm": 1.1513825927177352,
      "learning_rate": 3.233822510747588e-07,
      "loss": 1.1314,
      "step": 590
    },
    {
      "epoch": 1.2686567164179103,
      "grad_norm": 1.193698165116124,
      "learning_rate": 3.119497005335667e-07,
      "loss": 1.0904,
      "step": 595
    },
    {
      "epoch": 1.279317697228145,
      "grad_norm": 1.1645268517569372,
      "learning_rate": 3.0095344856870186e-07,
      "loss": 1.1662,
      "step": 600
    },
    {
      "epoch": 1.279317697228145,
      "eval_loss": 1.220153570175171,
      "eval_runtime": 86.8011,
      "eval_samples_per_second": 19.205,
      "eval_steps_per_second": 0.311,
      "step": 600
    },
    {
      "epoch": 1.2899786780383795,
      "grad_norm": 1.1865685508079404,
      "learning_rate": 2.903784521992152e-07,
      "loss": 1.0543,
      "step": 605
    },
    {
      "epoch": 1.3006396588486142,
      "grad_norm": 1.0889008660105814,
      "learning_rate": 2.802101312564394e-07,
      "loss": 1.1059,
      "step": 610
    },
    {
      "epoch": 1.3113006396588487,
      "grad_norm": 1.1629452168406549,
      "learning_rate": 2.7043435587668195e-07,
      "loss": 1.0847,
      "step": 615
    },
    {
      "epoch": 1.3219616204690832,
      "grad_norm": 1.1225822453511047,
      "learning_rate": 2.6103743428476387e-07,
      "loss": 1.0547,
      "step": 620
    },
    {
      "epoch": 1.3326226012793176,
      "grad_norm": 1.068456227087864,
      "learning_rate": 2.520061008627476e-07,
      "loss": 1.1331,
      "step": 625
    },
    {
      "epoch": 1.3432835820895521,
      "grad_norm": 1.163340113599676,
      "learning_rate": 2.4332750449828163e-07,
      "loss": 1.0907,
      "step": 630
    },
    {
      "epoch": 1.3539445628997868,
      "grad_norm": 1.1623906096012029,
      "learning_rate": 2.3498919720707982e-07,
      "loss": 1.0788,
      "step": 635
    },
    {
      "epoch": 1.3646055437100213,
      "grad_norm": 1.1079974563308679,
      "learning_rate": 2.269791230241396e-07,
      "loss": 1.0701,
      "step": 640
    },
    {
      "epoch": 1.375266524520256,
      "grad_norm": 1.2149865496581391,
      "learning_rate": 2.1928560715838997e-07,
      "loss": 1.0376,
      "step": 645
    },
    {
      "epoch": 1.3859275053304905,
      "grad_norm": 1.1528488687976461,
      "learning_rate": 2.1189734540554226e-07,
      "loss": 1.1269,
      "step": 650
    },
    {
      "epoch": 1.396588486140725,
      "grad_norm": 1.1288464186180127,
      "learning_rate": 2.048033938139964e-07,
      "loss": 1.108,
      "step": 655
    },
    {
      "epoch": 1.4072494669509594,
      "grad_norm": 1.1817093681210782,
      "learning_rate": 1.979931585987499e-07,
      "loss": 1.0499,
      "step": 660
    },
    {
      "epoch": 1.417910447761194,
      "grad_norm": 1.1321827623646836,
      "learning_rate": 1.9145638629832107e-07,
      "loss": 1.041,
      "step": 665
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 1.15181492674688,
      "learning_rate": 1.8518315416979456e-07,
      "loss": 1.0994,
      "step": 670
    },
    {
      "epoch": 1.439232409381663,
      "grad_norm": 1.1127579923456647,
      "learning_rate": 1.7916386081716564e-07,
      "loss": 1.0894,
      "step": 675
    },
    {
      "epoch": 1.4498933901918978,
      "grad_norm": 1.1765219547043915,
      "learning_rate": 1.7338921704824148e-07,
      "loss": 1.0715,
      "step": 680
    },
    {
      "epoch": 1.4605543710021323,
      "grad_norm": 1.2252010284139379,
      "learning_rate": 1.678502369554362e-07,
      "loss": 1.1498,
      "step": 685
    },
    {
      "epoch": 1.4712153518123667,
      "grad_norm": 1.1121586595467274,
      "learning_rate": 1.6253822921587308e-07,
      "loss": 1.0357,
      "step": 690
    },
    {
      "epoch": 1.4818763326226012,
      "grad_norm": 1.2326653113155133,
      "learning_rate": 1.574447886062802e-07,
      "loss": 1.0426,
      "step": 695
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 1.2662375322966155,
      "learning_rate": 1.5256178772824204e-07,
      "loss": 1.1513,
      "step": 700
    },
    {
      "epoch": 1.5031982942430704,
      "grad_norm": 1.1537609553653134,
      "learning_rate": 1.4788136893944296e-07,
      "loss": 1.0303,
      "step": 705
    },
    {
      "epoch": 1.5138592750533049,
      "grad_norm": 1.0977344228272827,
      "learning_rate": 1.4339593648661315e-07,
      "loss": 1.0821,
      "step": 710
    },
    {
      "epoch": 1.5245202558635396,
      "grad_norm": 1.1284164855389234,
      "learning_rate": 1.3909814883595595e-07,
      "loss": 1.1313,
      "step": 715
    },
    {
      "epoch": 1.535181236673774,
      "grad_norm": 1.1938382106881487,
      "learning_rate": 1.3498091119690798e-07,
      "loss": 1.1446,
      "step": 720
    },
    {
      "epoch": 1.5458422174840085,
      "grad_norm": 1.1272930245655628,
      "learning_rate": 1.3103736823515384e-07,
      "loss": 1.1136,
      "step": 725
    },
    {
      "epoch": 1.556503198294243,
      "grad_norm": 1.1773751232659426,
      "learning_rate": 1.2726089697088728e-07,
      "loss": 1.1309,
      "step": 730
    },
    {
      "epoch": 1.5671641791044775,
      "grad_norm": 1.3277025055368337,
      "learning_rate": 1.2364509985837655e-07,
      "loss": 1.0239,
      "step": 735
    },
    {
      "epoch": 1.5778251599147122,
      "grad_norm": 1.230962189006197,
      "learning_rate": 1.201837980429599e-07,
      "loss": 1.058,
      "step": 740
    },
    {
      "epoch": 1.5884861407249466,
      "grad_norm": 1.2101453511321207,
      "learning_rate": 1.1687102479166401e-07,
      "loss": 1.1119,
      "step": 745
    },
    {
      "epoch": 1.5991471215351813,
      "grad_norm": 1.1936647152626743,
      "learning_rate": 1.1370101909370577e-07,
      "loss": 1.1556,
      "step": 750
    },
    {
      "epoch": 1.6098081023454158,
      "grad_norm": 1.1474035015035384,
      "learning_rate": 1.1066821942719791e-07,
      "loss": 1.0522,
      "step": 755
    },
    {
      "epoch": 1.6204690831556503,
      "grad_norm": 1.0637295000469429,
      "learning_rate": 1.0776725768844699e-07,
      "loss": 1.0549,
      "step": 760
    },
    {
      "epoch": 1.6311300639658848,
      "grad_norm": 1.10828124909061,
      "learning_rate": 1.0499295328029333e-07,
      "loss": 1.043,
      "step": 765
    },
    {
      "epoch": 1.6417910447761193,
      "grad_norm": 1.1283065722653947,
      "learning_rate": 1.0234030735600652e-07,
      "loss": 1.0731,
      "step": 770
    },
    {
      "epoch": 1.652452025586354,
      "grad_norm": 1.161403848045461,
      "learning_rate": 9.980449721530949e-08,
      "loss": 1.161,
      "step": 775
    },
    {
      "epoch": 1.6631130063965884,
      "grad_norm": 1.158445375876672,
      "learning_rate": 9.738087084916553e-08,
      "loss": 1.089,
      "step": 780
    },
    {
      "epoch": 1.6737739872068231,
      "grad_norm": 1.1752425329283938,
      "learning_rate": 9.506494163002325e-08,
      "loss": 1.0479,
      "step": 785
    },
    {
      "epoch": 1.6844349680170576,
      "grad_norm": 1.0813888422827629,
      "learning_rate": 9.285238314427303e-08,
      "loss": 1.0797,
      "step": 790
    },
    {
      "epoch": 1.695095948827292,
      "grad_norm": 1.2358809521231273,
      "learning_rate": 9.07390241637265e-08,
      "loss": 1.0483,
      "step": 795
    },
    {
      "epoch": 1.7057569296375266,
      "grad_norm": 1.1520098737400197,
      "learning_rate": 8.872084375298702e-08,
      "loss": 1.1182,
      "step": 800
    },
    {
      "epoch": 1.7057569296375266,
      "eval_loss": 1.2184005975723267,
      "eval_runtime": 86.9323,
      "eval_samples_per_second": 19.176,
      "eval_steps_per_second": 0.311,
      "step": 800
    },
    {
      "epoch": 1.716417910447761,
      "grad_norm": 1.1409424379253437,
      "learning_rate": 8.679396650963928e-08,
      "loss": 1.1356,
      "step": 805
    },
    {
      "epoch": 1.7270788912579957,
      "grad_norm": 1.1235815999344876,
      "learning_rate": 8.49546579342372e-08,
      "loss": 1.155,
      "step": 810
    },
    {
      "epoch": 1.7377398720682304,
      "grad_norm": 1.1073975656892132,
      "learning_rate": 8.319931992712862e-08,
      "loss": 1.1137,
      "step": 815
    },
    {
      "epoch": 1.748400852878465,
      "grad_norm": 1.2180389136635439,
      "learning_rate": 8.15244864092076e-08,
      "loss": 1.1422,
      "step": 820
    },
    {
      "epoch": 1.7590618336886994,
      "grad_norm": 1.1113266470685994,
      "learning_rate": 7.992681906373824e-08,
      "loss": 1.062,
      "step": 825
    },
    {
      "epoch": 1.7697228144989339,
      "grad_norm": 1.2072670273798571,
      "learning_rate": 7.84031031964486e-08,
      "loss": 1.0517,
      "step": 830
    },
    {
      "epoch": 1.7803837953091683,
      "grad_norm": 1.1803556915432551,
      "learning_rate": 7.69502437111444e-08,
      "loss": 1.0703,
      "step": 835
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 1.1844005429106652,
      "learning_rate": 7.556526119814338e-08,
      "loss": 1.111,
      "step": 840
    },
    {
      "epoch": 1.8017057569296375,
      "grad_norm": 1.1384485747440536,
      "learning_rate": 7.424528813288221e-08,
      "loss": 1.1914,
      "step": 845
    },
    {
      "epoch": 1.8123667377398722,
      "grad_norm": 1.2171711222670063,
      "learning_rate": 7.298756518209771e-08,
      "loss": 1.118,
      "step": 850
    },
    {
      "epoch": 1.8230277185501067,
      "grad_norm": 1.1361749350155368,
      "learning_rate": 7.178943761503399e-08,
      "loss": 1.2114,
      "step": 855
    },
    {
      "epoch": 1.8336886993603412,
      "grad_norm": 1.2202066733922106,
      "learning_rate": 7.064835181717455e-08,
      "loss": 1.0826,
      "step": 860
    },
    {
      "epoch": 1.8443496801705757,
      "grad_norm": 1.1962899610991478,
      "learning_rate": 6.956185190404699e-08,
      "loss": 1.0077,
      "step": 865
    },
    {
      "epoch": 1.8550106609808101,
      "grad_norm": 1.2308765981536185,
      "learning_rate": 6.85275764326948e-08,
      "loss": 1.0717,
      "step": 870
    },
    {
      "epoch": 1.8656716417910446,
      "grad_norm": 1.1658077017002657,
      "learning_rate": 6.75432552084579e-08,
      "loss": 1.0958,
      "step": 875
    },
    {
      "epoch": 1.8763326226012793,
      "grad_norm": 1.270006074619097,
      "learning_rate": 6.660670618474825e-08,
      "loss": 1.091,
      "step": 880
    },
    {
      "epoch": 1.886993603411514,
      "grad_norm": 1.1582088190520023,
      "learning_rate": 6.571583245355274e-08,
      "loss": 1.0301,
      "step": 885
    },
    {
      "epoch": 1.8976545842217485,
      "grad_norm": 1.0859957939920986,
      "learning_rate": 6.486861932443996e-08,
      "loss": 1.0629,
      "step": 890
    },
    {
      "epoch": 1.908315565031983,
      "grad_norm": 1.0879133073671061,
      "learning_rate": 6.406313148989106e-08,
      "loss": 1.0293,
      "step": 895
    },
    {
      "epoch": 1.9189765458422174,
      "grad_norm": 1.1419672670758283,
      "learning_rate": 6.329751027481845e-08,
      "loss": 1.1259,
      "step": 900
    },
    {
      "epoch": 1.929637526652452,
      "grad_norm": 1.147613866551749,
      "learning_rate": 6.256997096817788e-08,
      "loss": 1.0128,
      "step": 905
    },
    {
      "epoch": 1.9402985074626866,
      "grad_norm": 1.1753147588481525,
      "learning_rate": 6.187880023462194e-08,
      "loss": 1.0407,
      "step": 910
    },
    {
      "epoch": 1.950959488272921,
      "grad_norm": 1.128791859643809,
      "learning_rate": 6.122235360418442e-08,
      "loss": 1.0722,
      "step": 915
    },
    {
      "epoch": 1.9616204690831558,
      "grad_norm": 1.1601302162296214,
      "learning_rate": 6.059905303802484e-08,
      "loss": 1.1054,
      "step": 920
    },
    {
      "epoch": 1.9722814498933903,
      "grad_norm": 1.198563476242119,
      "learning_rate": 6.000738456830302e-08,
      "loss": 1.0568,
      "step": 925
    },
    {
      "epoch": 1.9829424307036247,
      "grad_norm": 1.2262616318390043,
      "learning_rate": 5.944589601029287e-08,
      "loss": 1.1251,
      "step": 930
    },
    {
      "epoch": 1.9936034115138592,
      "grad_norm": 1.191995506184399,
      "learning_rate": 5.89131947448833e-08,
      "loss": 1.1911,
      "step": 935
    },
    {
      "epoch": 2.0042643923240937,
      "grad_norm": 1.11244803877364,
      "learning_rate": 5.840794556965235e-08,
      "loss": 1.066,
      "step": 940
    },
    {
      "epoch": 2.014925373134328,
      "grad_norm": 1.0894580884709386,
      "learning_rate": 5.792886861673804e-08,
      "loss": 1.0406,
      "step": 945
    },
    {
      "epoch": 2.025586353944563,
      "grad_norm": 1.0653919782073382,
      "learning_rate": 5.747473733576709e-08,
      "loss": 1.0661,
      "step": 950
    },
    {
      "epoch": 2.0362473347547976,
      "grad_norm": 1.1280141717212646,
      "learning_rate": 5.704437654013792e-08,
      "loss": 1.1027,
      "step": 955
    },
    {
      "epoch": 2.046908315565032,
      "grad_norm": 1.2313007226700692,
      "learning_rate": 5.663666051499173e-08,
      "loss": 1.0421,
      "step": 960
    },
    {
      "epoch": 2.0575692963752665,
      "grad_norm": 1.153128267269016,
      "learning_rate": 5.6250511185239e-08,
      "loss": 1.0893,
      "step": 965
    },
    {
      "epoch": 2.068230277185501,
      "grad_norm": 1.1364871737157245,
      "learning_rate": 5.58848963420453e-08,
      "loss": 1.0918,
      "step": 970
    },
    {
      "epoch": 2.0788912579957355,
      "grad_norm": 1.1584140858402872,
      "learning_rate": 5.553882792621254e-08,
      "loss": 1.0183,
      "step": 975
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 1.1608258928350752,
      "learning_rate": 5.521136036692734e-08,
      "loss": 1.0322,
      "step": 980
    },
    {
      "epoch": 2.100213219616205,
      "grad_norm": 1.1873514358148416,
      "learning_rate": 5.490158897437947e-08,
      "loss": 1.0441,
      "step": 985
    },
    {
      "epoch": 2.1108742004264394,
      "grad_norm": 1.202520887860229,
      "learning_rate": 5.460864838478731e-08,
      "loss": 1.0996,
      "step": 990
    },
    {
      "epoch": 2.121535181236674,
      "grad_norm": 1.1559964888973397,
      "learning_rate": 5.4331711056398124e-08,
      "loss": 1.095,
      "step": 995
    },
    {
      "epoch": 2.1321961620469083,
      "grad_norm": 1.0455908941955974,
      "learning_rate": 5.4069985815063316e-08,
      "loss": 1.046,
      "step": 1000
    },
    {
      "epoch": 2.1321961620469083,
      "eval_loss": 1.2190189361572266,
      "eval_runtime": 86.8993,
      "eval_samples_per_second": 19.183,
      "eval_steps_per_second": 0.311,
      "step": 1000
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 1.1258833797291203,
      "learning_rate": 5.3822716448019046e-08,
      "loss": 1.049,
      "step": 1005
    },
    {
      "epoch": 2.1535181236673773,
      "grad_norm": 1.162003500965812,
      "learning_rate": 5.35891803445336e-08,
      "loss": 1.094,
      "step": 1010
    },
    {
      "epoch": 2.1641791044776117,
      "grad_norm": 1.1823939265510066,
      "learning_rate": 5.3368687182112245e-08,
      "loss": 1.0556,
      "step": 1015
    },
    {
      "epoch": 2.1748400852878467,
      "grad_norm": 1.1584053741977514,
      "learning_rate": 5.3160577656980396e-08,
      "loss": 1.1292,
      "step": 1020
    },
    {
      "epoch": 2.185501066098081,
      "grad_norm": 1.1507740539032782,
      "learning_rate": 5.296422225759416e-08,
      "loss": 1.1023,
      "step": 1025
    },
    {
      "epoch": 2.1961620469083156,
      "grad_norm": 1.1594756240093598,
      "learning_rate": 5.277902007995615e-08,
      "loss": 1.107,
      "step": 1030
    },
    {
      "epoch": 2.20682302771855,
      "grad_norm": 1.10929463120434,
      "learning_rate": 5.2604397683542377e-08,
      "loss": 1.1102,
      "step": 1035
    },
    {
      "epoch": 2.2174840085287846,
      "grad_norm": 1.1432334524639647,
      "learning_rate": 5.243980798667319e-08,
      "loss": 1.1,
      "step": 1040
    },
    {
      "epoch": 2.228144989339019,
      "grad_norm": 1.1904483860983543,
      "learning_rate": 5.228472920018869e-08,
      "loss": 1.09,
      "step": 1045
    },
    {
      "epoch": 2.2388059701492535,
      "grad_norm": 1.167622664957689,
      "learning_rate": 5.213866379831496e-08,
      "loss": 1.0814,
      "step": 1050
    },
    {
      "epoch": 2.2494669509594885,
      "grad_norm": 1.0895757958922552,
      "learning_rate": 5.200113752563408e-08,
      "loss": 1.1161,
      "step": 1055
    },
    {
      "epoch": 2.260127931769723,
      "grad_norm": 1.1558659882895932,
      "learning_rate": 5.187169843909606e-08,
      "loss": 1.0795,
      "step": 1060
    },
    {
      "epoch": 2.2707889125799574,
      "grad_norm": 1.1344999555588917,
      "learning_rate": 5.17499159840361e-08,
      "loss": 1.0706,
      "step": 1065
    },
    {
      "epoch": 2.281449893390192,
      "grad_norm": 1.1415261926993812,
      "learning_rate": 5.163538010318527e-08,
      "loss": 1.0967,
      "step": 1070
    },
    {
      "epoch": 2.2921108742004264,
      "grad_norm": 1.1381490852319662,
      "learning_rate": 5.152770037768703e-08,
      "loss": 1.056,
      "step": 1075
    },
    {
      "epoch": 2.302771855010661,
      "grad_norm": 1.1347196474571326,
      "learning_rate": 5.142650519915549e-08,
      "loss": 1.0654,
      "step": 1080
    },
    {
      "epoch": 2.3134328358208958,
      "grad_norm": 1.140691001232856,
      "learning_rate": 5.133144097183511e-08,
      "loss": 1.0555,
      "step": 1085
    },
    {
      "epoch": 2.3240938166311302,
      "grad_norm": 1.1321293656672715,
      "learning_rate": 5.1242171343944076e-08,
      "loss": 1.0617,
      "step": 1090
    },
    {
      "epoch": 2.3347547974413647,
      "grad_norm": 1.0730144292666668,
      "learning_rate": 5.115837646730644e-08,
      "loss": 1.0129,
      "step": 1095
    },
    {
      "epoch": 2.345415778251599,
      "grad_norm": 1.1568722548151742,
      "learning_rate": 5.107975228439987e-08,
      "loss": 1.1102,
      "step": 1100
    },
    {
      "epoch": 2.3560767590618337,
      "grad_norm": 1.1165138012512623,
      "learning_rate": 5.100600984196796e-08,
      "loss": 1.0316,
      "step": 1105
    },
    {
      "epoch": 2.366737739872068,
      "grad_norm": 1.1003671418770582,
      "learning_rate": 5.093687463036684e-08,
      "loss": 1.0387,
      "step": 1110
    },
    {
      "epoch": 2.3773987206823026,
      "grad_norm": 1.1858341121422311,
      "learning_rate": 5.087208594783712e-08,
      "loss": 1.1381,
      "step": 1115
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 1.1724486836357029,
      "learning_rate": 5.0811396288912494e-08,
      "loss": 1.1484,
      "step": 1120
    },
    {
      "epoch": 2.398720682302772,
      "grad_norm": 1.1477785681404549,
      "learning_rate": 5.075457075619642e-08,
      "loss": 1.0488,
      "step": 1125
    },
    {
      "epoch": 2.4093816631130065,
      "grad_norm": 1.1926168382576205,
      "learning_rate": 5.070138649475803e-08,
      "loss": 1.0335,
      "step": 1130
    },
    {
      "epoch": 2.420042643923241,
      "grad_norm": 1.1592591417618787,
      "learning_rate": 5.065163214841775e-08,
      "loss": 1.0867,
      "step": 1135
    },
    {
      "epoch": 2.4307036247334755,
      "grad_norm": 1.1492395746059412,
      "learning_rate": 5.0605107337212035e-08,
      "loss": 1.0934,
      "step": 1140
    },
    {
      "epoch": 2.44136460554371,
      "grad_norm": 1.1808862259382695,
      "learning_rate": 5.0561622155345265e-08,
      "loss": 1.0804,
      "step": 1145
    },
    {
      "epoch": 2.4520255863539444,
      "grad_norm": 1.2148315415559092,
      "learning_rate": 5.052099668895486e-08,
      "loss": 1.1264,
      "step": 1150
    },
    {
      "epoch": 2.4626865671641793,
      "grad_norm": 1.0677698199987926,
      "learning_rate": 5.0483060553033895e-08,
      "loss": 1.029,
      "step": 1155
    },
    {
      "epoch": 2.473347547974414,
      "grad_norm": 1.1698932019494643,
      "learning_rate": 5.0447652446872465e-08,
      "loss": 1.028,
      "step": 1160
    },
    {
      "epoch": 2.4840085287846483,
      "grad_norm": 1.1287646587327143,
      "learning_rate": 5.0414619727396756e-08,
      "loss": 1.0884,
      "step": 1165
    },
    {
      "epoch": 2.4946695095948828,
      "grad_norm": 1.16856773725038,
      "learning_rate": 5.0383817999801064e-08,
      "loss": 1.0204,
      "step": 1170
    },
    {
      "epoch": 2.5053304904051172,
      "grad_norm": 1.141453152240354,
      "learning_rate": 5.035511072488479e-08,
      "loss": 1.0523,
      "step": 1175
    },
    {
      "epoch": 2.5159914712153517,
      "grad_norm": 1.230429011852803,
      "learning_rate": 5.032836884252246e-08,
      "loss": 1.0617,
      "step": 1180
    },
    {
      "epoch": 2.526652452025586,
      "grad_norm": 1.151378135213026,
      "learning_rate": 5.03034704107104e-08,
      "loss": 1.048,
      "step": 1185
    },
    {
      "epoch": 2.5373134328358207,
      "grad_norm": 1.1297760039941285,
      "learning_rate": 5.028030025964957e-08,
      "loss": 1.0614,
      "step": 1190
    },
    {
      "epoch": 2.5479744136460556,
      "grad_norm": 1.149687991991871,
      "learning_rate": 5.025874966033864e-08,
      "loss": 1.1042,
      "step": 1195
    },
    {
      "epoch": 2.55863539445629,
      "grad_norm": 1.1935154857243262,
      "learning_rate": 5.023871600716678e-08,
      "loss": 1.0772,
      "step": 1200
    },
    {
      "epoch": 2.55863539445629,
      "eval_loss": 1.2190290689468384,
      "eval_runtime": 86.4625,
      "eval_samples_per_second": 19.28,
      "eval_steps_per_second": 0.312,
      "step": 1200
    },
    {
      "epoch": 2.5692963752665245,
      "grad_norm": 1.1485570257886584,
      "learning_rate": 5.022010251400949e-08,
      "loss": 1.1083,
      "step": 1205
    },
    {
      "epoch": 2.579957356076759,
      "grad_norm": 1.180867296316843,
      "learning_rate": 5.0202817923345707e-08,
      "loss": 1.062,
      "step": 1210
    },
    {
      "epoch": 2.5906183368869935,
      "grad_norm": 1.1667090859024647,
      "learning_rate": 5.0186776227927486e-08,
      "loss": 1.0414,
      "step": 1215
    },
    {
      "epoch": 2.6012793176972284,
      "grad_norm": 1.1582105008250059,
      "learning_rate": 5.017189640454793e-08,
      "loss": 1.0663,
      "step": 1220
    },
    {
      "epoch": 2.611940298507463,
      "grad_norm": 1.0799260851167845,
      "learning_rate": 5.01581021594656e-08,
      "loss": 1.102,
      "step": 1225
    },
    {
      "epoch": 2.6226012793176974,
      "grad_norm": 1.1531786446908774,
      "learning_rate": 5.01453216850572e-08,
      "loss": 1.1336,
      "step": 1230
    },
    {
      "epoch": 2.633262260127932,
      "grad_norm": 1.1432906025568284,
      "learning_rate": 5.0133487427282597e-08,
      "loss": 1.0256,
      "step": 1235
    },
    {
      "epoch": 2.6439232409381663,
      "grad_norm": 1.1673336163640968,
      "learning_rate": 5.0122535863558936e-08,
      "loss": 1.1471,
      "step": 1240
    },
    {
      "epoch": 2.654584221748401,
      "grad_norm": 1.1845243824938692,
      "learning_rate": 5.011240729065255e-08,
      "loss": 1.0642,
      "step": 1245
    },
    {
      "epoch": 2.6652452025586353,
      "grad_norm": 1.1256336089648642,
      "learning_rate": 5.010304562220935e-08,
      "loss": 1.1023,
      "step": 1250
    },
    {
      "epoch": 2.6759061833688698,
      "grad_norm": 1.1635762261874374,
      "learning_rate": 5.00943981955558e-08,
      "loss": 1.0605,
      "step": 1255
    },
    {
      "epoch": 2.6865671641791042,
      "grad_norm": 1.1747405452856283,
      "learning_rate": 5.0086415587414125e-08,
      "loss": 1.0897,
      "step": 1260
    },
    {
      "epoch": 2.697228144989339,
      "grad_norm": 1.086387379481192,
      "learning_rate": 5.00790514381863e-08,
      "loss": 1.0893,
      "step": 1265
    },
    {
      "epoch": 2.7078891257995736,
      "grad_norm": 1.1923401716553281,
      "learning_rate": 5.007226228447204e-08,
      "loss": 1.0473,
      "step": 1270
    },
    {
      "epoch": 2.718550106609808,
      "grad_norm": 1.205935131659565,
      "learning_rate": 5.006600739949697e-08,
      "loss": 1.0231,
      "step": 1275
    },
    {
      "epoch": 2.7292110874200426,
      "grad_norm": 1.1391126635720465,
      "learning_rate": 5.006024864113683e-08,
      "loss": 1.0403,
      "step": 1280
    },
    {
      "epoch": 2.739872068230277,
      "grad_norm": 1.1345121679379757,
      "learning_rate": 5.005495030723431e-08,
      "loss": 1.0764,
      "step": 1285
    },
    {
      "epoch": 2.750533049040512,
      "grad_norm": 1.1850716557115912,
      "learning_rate": 5.005007899791418e-08,
      "loss": 0.9945,
      "step": 1290
    },
    {
      "epoch": 2.7611940298507465,
      "grad_norm": 1.1874286830736782,
      "learning_rate": 5.00456034846126e-08,
      "loss": 1.085,
      "step": 1295
    },
    {
      "epoch": 2.771855010660981,
      "grad_norm": 1.1901764193670399,
      "learning_rate": 5.004149458554536e-08,
      "loss": 1.0092,
      "step": 1300
    },
    {
      "epoch": 2.7825159914712154,
      "grad_norm": 1.0961826150665088,
      "learning_rate": 5.0037725047349065e-08,
      "loss": 1.1071,
      "step": 1305
    },
    {
      "epoch": 2.79317697228145,
      "grad_norm": 1.1628718738526371,
      "learning_rate": 5.0034269432638164e-08,
      "loss": 1.1233,
      "step": 1310
    },
    {
      "epoch": 2.8038379530916844,
      "grad_norm": 1.1626738934739451,
      "learning_rate": 5.0031104013229344e-08,
      "loss": 1.1894,
      "step": 1315
    },
    {
      "epoch": 2.814498933901919,
      "grad_norm": 1.1011603049613505,
      "learning_rate": 5.002820666879318e-08,
      "loss": 1.1494,
      "step": 1320
    },
    {
      "epoch": 2.8251599147121533,
      "grad_norm": 1.1717668628457665,
      "learning_rate": 5.002555679070117e-08,
      "loss": 1.0514,
      "step": 1325
    },
    {
      "epoch": 2.835820895522388,
      "grad_norm": 1.1774225127445404,
      "learning_rate": 5.0023135190844316e-08,
      "loss": 1.1027,
      "step": 1330
    },
    {
      "epoch": 2.8464818763326227,
      "grad_norm": 1.1717401192939292,
      "learning_rate": 5.0020924015207164e-08,
      "loss": 1.063,
      "step": 1335
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 1.1587594052870365,
      "learning_rate": 5.001890666198876e-08,
      "loss": 1.1021,
      "step": 1340
    },
    {
      "epoch": 2.8678038379530917,
      "grad_norm": 1.1525118578792972,
      "learning_rate": 5.001706770406945e-08,
      "loss": 1.1019,
      "step": 1345
    },
    {
      "epoch": 2.878464818763326,
      "grad_norm": 1.37326829503098,
      "learning_rate": 5.001539281562949e-08,
      "loss": 1.0355,
      "step": 1350
    },
    {
      "epoch": 2.8891257995735606,
      "grad_norm": 1.1331048627302174,
      "learning_rate": 5.001386870273264e-08,
      "loss": 1.0899,
      "step": 1355
    },
    {
      "epoch": 2.8997867803837956,
      "grad_norm": 1.1984672662153428,
      "learning_rate": 5.0012483037694405e-08,
      "loss": 1.1161,
      "step": 1360
    },
    {
      "epoch": 2.91044776119403,
      "grad_norm": 1.1109425480042538,
      "learning_rate": 5.0011224397061566e-08,
      "loss": 1.0231,
      "step": 1365
    },
    {
      "epoch": 2.9211087420042645,
      "grad_norm": 1.1795722740533605,
      "learning_rate": 5.0010082203035725e-08,
      "loss": 1.1211,
      "step": 1370
    },
    {
      "epoch": 2.931769722814499,
      "grad_norm": 1.0937372807822898,
      "learning_rate": 5.0009046668180024e-08,
      "loss": 1.0735,
      "step": 1375
    },
    {
      "epoch": 2.9424307036247335,
      "grad_norm": 1.1405185259805586,
      "learning_rate": 5.000810874325427e-08,
      "loss": 1.0276,
      "step": 1380
    },
    {
      "epoch": 2.953091684434968,
      "grad_norm": 1.20416756854441,
      "learning_rate": 5.0007260068029507e-08,
      "loss": 1.0596,
      "step": 1385
    },
    {
      "epoch": 2.9637526652452024,
      "grad_norm": 1.1625071717485667,
      "learning_rate": 5.000649292493887e-08,
      "loss": 1.0697,
      "step": 1390
    },
    {
      "epoch": 2.974413646055437,
      "grad_norm": 1.1863714583502374,
      "learning_rate": 5.000580019542706e-08,
      "loss": 1.1505,
      "step": 1395
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 1.1942905354245756,
      "learning_rate": 5.000517531886622e-08,
      "loss": 1.0326,
      "step": 1400
    },
    {
      "epoch": 2.9850746268656714,
      "eval_loss": 1.2187978029251099,
      "eval_runtime": 86.8658,
      "eval_samples_per_second": 19.191,
      "eval_steps_per_second": 0.311,
      "step": 1400
    },
    {
      "epoch": 2.9957356076759063,
      "grad_norm": 1.1519642562669563,
      "learning_rate": 5.0004612253911274e-08,
      "loss": 1.0479,
      "step": 1405
    },
    {
      "epoch": 3.0063965884861408,
      "grad_norm": 1.1665923395094933,
      "learning_rate": 5.000410544217268e-08,
      "loss": 1.0174,
      "step": 1410
    },
    {
      "epoch": 3.0170575692963753,
      "grad_norm": 1.1796085918771502,
      "learning_rate": 5.0003649774089725e-08,
      "loss": 1.0737,
      "step": 1415
    },
    {
      "epoch": 3.0277185501066097,
      "grad_norm": 1.1757596964828239,
      "learning_rate": 5.0003240556892086e-08,
      "loss": 1.0738,
      "step": 1420
    },
    {
      "epoch": 3.038379530916844,
      "grad_norm": 1.1395407440291647,
      "learning_rate": 5.000287348454201e-08,
      "loss": 1.094,
      "step": 1425
    },
    {
      "epoch": 3.0490405117270787,
      "grad_norm": 1.1323357375626646,
      "learning_rate": 5.000254460955416e-08,
      "loss": 1.0684,
      "step": 1430
    },
    {
      "epoch": 3.0597014925373136,
      "grad_norm": 1.1177035566781262,
      "learning_rate": 5.0002250316594135e-08,
      "loss": 1.038,
      "step": 1435
    },
    {
      "epoch": 3.070362473347548,
      "grad_norm": 1.1118810589333648,
      "learning_rate": 5.0001987297761256e-08,
      "loss": 1.0114,
      "step": 1440
    },
    {
      "epoch": 3.0810234541577826,
      "grad_norm": 1.153651371893197,
      "learning_rate": 5.00017525294651e-08,
      "loss": 1.1564,
      "step": 1445
    },
    {
      "epoch": 3.091684434968017,
      "grad_norm": 1.117017616812222,
      "learning_rate": 5.000154325080907e-08,
      "loss": 1.0421,
      "step": 1450
    },
    {
      "epoch": 3.1023454157782515,
      "grad_norm": 1.1455567386664758,
      "learning_rate": 5.0001356943398416e-08,
      "loss": 1.0494,
      "step": 1455
    },
    {
      "epoch": 3.113006396588486,
      "grad_norm": 1.1745918907121704,
      "learning_rate": 5.000119131249349e-08,
      "loss": 1.0673,
      "step": 1460
    },
    {
      "epoch": 3.1236673773987205,
      "grad_norm": 1.0945498527723858,
      "learning_rate": 5.000104426943277e-08,
      "loss": 1.15,
      "step": 1465
    },
    {
      "epoch": 3.1343283582089554,
      "grad_norm": 1.2775793960973583,
      "learning_rate": 5.0000913915253465e-08,
      "loss": 1.0434,
      "step": 1470
    },
    {
      "epoch": 3.14498933901919,
      "grad_norm": 1.1275573880105167,
      "learning_rate": 5.000079852544101e-08,
      "loss": 1.0726,
      "step": 1475
    },
    {
      "epoch": 3.1556503198294243,
      "grad_norm": 1.0900224885696637,
      "learning_rate": 5.0000696535741776e-08,
      "loss": 1.0439,
      "step": 1480
    },
    {
      "epoch": 3.166311300639659,
      "grad_norm": 1.1943472978175196,
      "learning_rate": 5.0000606528976535e-08,
      "loss": 1.0778,
      "step": 1485
    },
    {
      "epoch": 3.1769722814498933,
      "grad_norm": 1.1656854953422446,
      "learning_rate": 5.000052722279511e-08,
      "loss": 1.0236,
      "step": 1490
    },
    {
      "epoch": 3.1876332622601278,
      "grad_norm": 1.7099219299352593,
      "learning_rate": 5.0000457458315606e-08,
      "loss": 1.048,
      "step": 1495
    },
    {
      "epoch": 3.1982942430703627,
      "grad_norm": 1.2368961315589513,
      "learning_rate": 5.0000396189594195e-08,
      "loss": 1.0601,
      "step": 1500
    },
    {
      "epoch": 3.208955223880597,
      "grad_norm": 1.1053541732810632,
      "learning_rate": 5.000034247387439e-08,
      "loss": 1.1574,
      "step": 1505
    },
    {
      "epoch": 3.2196162046908317,
      "grad_norm": 1.2088122913217223,
      "learning_rate": 5.0000295462567e-08,
      "loss": 1.021,
      "step": 1510
    },
    {
      "epoch": 3.230277185501066,
      "grad_norm": 1.1822420881980706,
      "learning_rate": 5.000025439291453e-08,
      "loss": 1.0577,
      "step": 1515
    },
    {
      "epoch": 3.2409381663113006,
      "grad_norm": 1.1982389728811844,
      "learning_rate": 5.0000218580296324e-08,
      "loss": 1.1241,
      "step": 1520
    },
    {
      "epoch": 3.251599147121535,
      "grad_norm": 1.1836468791922983,
      "learning_rate": 5.0000187411132714e-08,
      "loss": 1.106,
      "step": 1525
    },
    {
      "epoch": 3.2622601279317696,
      "grad_norm": 1.1758919894146582,
      "learning_rate": 5.0000160336348885e-08,
      "loss": 1.2091,
      "step": 1530
    },
    {
      "epoch": 3.272921108742004,
      "grad_norm": 1.1300862552793904,
      "learning_rate": 5.0000136865361153e-08,
      "loss": 1.0345,
      "step": 1535
    },
    {
      "epoch": 3.283582089552239,
      "grad_norm": 1.1470416574016102,
      "learning_rate": 5.000011656055037e-08,
      "loss": 1.0357,
      "step": 1540
    },
    {
      "epoch": 3.2942430703624734,
      "grad_norm": 1.18001188067462,
      "learning_rate": 5.000009903218913e-08,
      "loss": 1.0896,
      "step": 1545
    },
    {
      "epoch": 3.304904051172708,
      "grad_norm": 1.1316402765789904,
      "learning_rate": 5.0000083933791284e-08,
      "loss": 1.0557,
      "step": 1550
    },
    {
      "epoch": 3.3155650319829424,
      "grad_norm": 1.2685713000625365,
      "learning_rate": 5.0000070957854027e-08,
      "loss": 1.0442,
      "step": 1555
    },
    {
      "epoch": 3.326226012793177,
      "grad_norm": 1.3173532606855904,
      "learning_rate": 5.000005983196463e-08,
      "loss": 1.1031,
      "step": 1560
    },
    {
      "epoch": 3.3368869936034113,
      "grad_norm": 1.1645251930023506,
      "learning_rate": 5.0000050315245216e-08,
      "loss": 1.0327,
      "step": 1565
    },
    {
      "epoch": 3.3475479744136463,
      "grad_norm": 1.1526305321387222,
      "learning_rate": 5.000004219511104e-08,
      "loss": 1.1047,
      "step": 1570
    },
    {
      "epoch": 3.3582089552238807,
      "grad_norm": 1.1956529776682534,
      "learning_rate": 5.0000035284318585e-08,
      "loss": 1.0513,
      "step": 1575
    },
    {
      "epoch": 3.368869936034115,
      "grad_norm": 1.1723882568539148,
      "learning_rate": 5.00000294182818e-08,
      "loss": 1.19,
      "step": 1580
    },
    {
      "epoch": 3.3795309168443497,
      "grad_norm": 1.1986098281392215,
      "learning_rate": 5.00000244526357e-08,
      "loss": 1.077,
      "step": 1585
    },
    {
      "epoch": 3.390191897654584,
      "grad_norm": 1.1822052046386484,
      "learning_rate": 5.0000020261028096e-08,
      "loss": 0.9985,
      "step": 1590
    },
    {
      "epoch": 3.4008528784648187,
      "grad_norm": 1.1965544250131317,
      "learning_rate": 5.0000016733121405e-08,
      "loss": 1.1462,
      "step": 1595
    },
    {
      "epoch": 3.411513859275053,
      "grad_norm": 1.233729124712432,
      "learning_rate": 5.00000137727875e-08,
      "loss": 1.013,
      "step": 1600
    },
    {
      "epoch": 3.411513859275053,
      "eval_loss": 1.2190721035003662,
      "eval_runtime": 86.8673,
      "eval_samples_per_second": 19.19,
      "eval_steps_per_second": 0.311,
      "step": 1600
    },
    {
      "epoch": 3.4221748400852876,
      "grad_norm": 1.1940390976157107,
      "learning_rate": 5.00000112964799e-08,
      "loss": 1.0902,
      "step": 1605
    },
    {
      "epoch": 3.4328358208955225,
      "grad_norm": 1.1852233751297125,
      "learning_rate": 5.000000923176855e-08,
      "loss": 1.0622,
      "step": 1610
    },
    {
      "epoch": 3.443496801705757,
      "grad_norm": 1.174182970082022,
      "learning_rate": 5.0000007516023255e-08,
      "loss": 1.1077,
      "step": 1615
    },
    {
      "epoch": 3.4541577825159915,
      "grad_norm": 1.2175665950720118,
      "learning_rate": 5.0000006095233244e-08,
      "loss": 1.0463,
      "step": 1620
    },
    {
      "epoch": 3.464818763326226,
      "grad_norm": 1.1910402507833668,
      "learning_rate": 5.0000004922950696e-08,
      "loss": 1.0485,
      "step": 1625
    },
    {
      "epoch": 3.4754797441364604,
      "grad_norm": 1.1756720560395666,
      "learning_rate": 5.000000395934729e-08,
      "loss": 1.0855,
      "step": 1630
    },
    {
      "epoch": 3.486140724946695,
      "grad_norm": 1.1357035763961498,
      "learning_rate": 5.0000003170373506e-08,
      "loss": 1.0552,
      "step": 1635
    },
    {
      "epoch": 3.49680170575693,
      "grad_norm": 1.12995191245614,
      "learning_rate": 5.0000002527011166e-08,
      "loss": 1.1139,
      "step": 1640
    },
    {
      "epoch": 3.5074626865671643,
      "grad_norm": 1.2169670051086863,
      "learning_rate": 5.000000200461043e-08,
      "loss": 1.0746,
      "step": 1645
    },
    {
      "epoch": 3.518123667377399,
      "grad_norm": 1.1477244123007122,
      "learning_rate": 5.000000158230317e-08,
      "loss": 1.0328,
      "step": 1650
    },
    {
      "epoch": 3.5287846481876333,
      "grad_norm": 1.146160022657433,
      "learning_rate": 5.000000124248513e-08,
      "loss": 1.0818,
      "step": 1655
    },
    {
      "epoch": 3.5394456289978677,
      "grad_norm": 1.1725432339751227,
      "learning_rate": 5.0000000970360234e-08,
      "loss": 1.0631,
      "step": 1660
    },
    {
      "epoch": 3.550106609808102,
      "grad_norm": 1.1437856255430832,
      "learning_rate": 5.000000075354043e-08,
      "loss": 1.03,
      "step": 1665
    },
    {
      "epoch": 3.5607675906183367,
      "grad_norm": 1.1949291330530631,
      "learning_rate": 5.000000058169555e-08,
      "loss": 1.0766,
      "step": 1670
    },
    {
      "epoch": 3.571428571428571,
      "grad_norm": 1.1521311383486394,
      "learning_rate": 5.000000044624768e-08,
      "loss": 1.1008,
      "step": 1675
    },
    {
      "epoch": 3.582089552238806,
      "grad_norm": 1.1470924970059302,
      "learning_rate": 5.000000034010534e-08,
      "loss": 1.0803,
      "step": 1680
    },
    {
      "epoch": 3.5927505330490406,
      "grad_norm": 1.163957344487391,
      "learning_rate": 5.0000000257433004e-08,
      "loss": 1.0318,
      "step": 1685
    },
    {
      "epoch": 3.603411513859275,
      "grad_norm": 1.1289250902982475,
      "learning_rate": 5.000000019345195e-08,
      "loss": 1.0714,
      "step": 1690
    },
    {
      "epoch": 3.6140724946695095,
      "grad_norm": 1.1396900843621998,
      "learning_rate": 5.0000000144268826e-08,
      "loss": 1.0785,
      "step": 1695
    },
    {
      "epoch": 3.624733475479744,
      "grad_norm": 1.1928003735738484,
      "learning_rate": 5.000000010672866e-08,
      "loss": 1.0476,
      "step": 1700
    },
    {
      "epoch": 3.635394456289979,
      "grad_norm": 1.1813297878533093,
      "learning_rate": 5.000000007828923e-08,
      "loss": 1.1059,
      "step": 1705
    },
    {
      "epoch": 3.6460554371002134,
      "grad_norm": 1.1580070228375894,
      "learning_rate": 5.000000005691433e-08,
      "loss": 1.0804,
      "step": 1710
    },
    {
      "epoch": 3.656716417910448,
      "grad_norm": 1.1877112723964363,
      "learning_rate": 5.0000000040983296e-08,
      "loss": 1.0926,
      "step": 1715
    },
    {
      "epoch": 3.6673773987206824,
      "grad_norm": 1.162988885771696,
      "learning_rate": 5.0000000029214826e-08,
      "loss": 1.134,
      "step": 1720
    },
    {
      "epoch": 3.678038379530917,
      "grad_norm": 1.2167801349477705,
      "learning_rate": 5.000000002060308e-08,
      "loss": 1.1055,
      "step": 1725
    },
    {
      "epoch": 3.6886993603411513,
      "grad_norm": 1.1458862089420816,
      "learning_rate": 5.000000001436439e-08,
      "loss": 1.0211,
      "step": 1730
    },
    {
      "epoch": 3.699360341151386,
      "grad_norm": 1.0873296281789822,
      "learning_rate": 5.000000000989308e-08,
      "loss": 1.0072,
      "step": 1735
    },
    {
      "epoch": 3.7100213219616203,
      "grad_norm": 1.1575880345419727,
      "learning_rate": 5.000000000672499e-08,
      "loss": 1.1223,
      "step": 1740
    },
    {
      "epoch": 3.7206823027718547,
      "grad_norm": 1.1887408531399297,
      "learning_rate": 5.0000000004507695e-08,
      "loss": 1.0693,
      "step": 1745
    },
    {
      "epoch": 3.7313432835820897,
      "grad_norm": 1.1370169892656834,
      "learning_rate": 5.000000000297619e-08,
      "loss": 0.9994,
      "step": 1750
    },
    {
      "epoch": 3.742004264392324,
      "grad_norm": 1.1964357258341392,
      "learning_rate": 5.000000000193327e-08,
      "loss": 1.1289,
      "step": 1755
    },
    {
      "epoch": 3.7526652452025586,
      "grad_norm": 1.1471490215463733,
      "learning_rate": 5.000000000123389e-08,
      "loss": 1.0828,
      "step": 1760
    },
    {
      "epoch": 3.763326226012793,
      "grad_norm": 1.1835965960152477,
      "learning_rate": 5.0000000000772604e-08,
      "loss": 1.1045,
      "step": 1765
    },
    {
      "epoch": 3.7739872068230276,
      "grad_norm": 1.122980946834769,
      "learning_rate": 5.000000000047381e-08,
      "loss": 1.0586,
      "step": 1770
    },
    {
      "epoch": 3.7846481876332625,
      "grad_norm": 1.1509727983825737,
      "learning_rate": 5.000000000028403e-08,
      "loss": 1.1015,
      "step": 1775
    },
    {
      "epoch": 3.795309168443497,
      "grad_norm": 1.1806503665236887,
      "learning_rate": 5.000000000016606e-08,
      "loss": 1.1195,
      "step": 1780
    },
    {
      "epoch": 3.8059701492537314,
      "grad_norm": 1.1840945960267957,
      "learning_rate": 5.0000000000094455e-08,
      "loss": 1.0452,
      "step": 1785
    },
    {
      "epoch": 3.816631130063966,
      "grad_norm": 1.1373535444813456,
      "learning_rate": 5.000000000005211e-08,
      "loss": 1.0538,
      "step": 1790
    },
    {
      "epoch": 3.8272921108742004,
      "grad_norm": 1.1164448190518461,
      "learning_rate": 5.000000000002779e-08,
      "loss": 1.0796,
      "step": 1795
    },
    {
      "epoch": 3.837953091684435,
      "grad_norm": 1.2618135197926215,
      "learning_rate": 5.0000000000014265e-08,
      "loss": 1.1103,
      "step": 1800
    },
    {
      "epoch": 3.837953091684435,
      "eval_loss": 1.2187809944152832,
      "eval_runtime": 87.3426,
      "eval_samples_per_second": 19.086,
      "eval_steps_per_second": 0.309,
      "step": 1800
    },
    {
      "epoch": 3.8486140724946694,
      "grad_norm": 1.166670525467647,
      "learning_rate": 5.000000000000701e-08,
      "loss": 1.0504,
      "step": 1805
    },
    {
      "epoch": 3.859275053304904,
      "grad_norm": 1.1358229871637773,
      "learning_rate": 5.000000000000328e-08,
      "loss": 1.031,
      "step": 1810
    },
    {
      "epoch": 3.8699360341151388,
      "grad_norm": 1.1553072990278574,
      "learning_rate": 5.0000000000001454e-08,
      "loss": 1.0852,
      "step": 1815
    },
    {
      "epoch": 3.8805970149253732,
      "grad_norm": 1.1514532203092147,
      "learning_rate": 5.00000000000006e-08,
      "loss": 1.0682,
      "step": 1820
    },
    {
      "epoch": 3.8912579957356077,
      "grad_norm": 1.1884972946614185,
      "learning_rate": 5.000000000000023e-08,
      "loss": 1.0012,
      "step": 1825
    },
    {
      "epoch": 3.901918976545842,
      "grad_norm": 1.1704188735844283,
      "learning_rate": 5.000000000000008e-08,
      "loss": 1.0772,
      "step": 1830
    },
    {
      "epoch": 3.9125799573560767,
      "grad_norm": 1.128126951495901,
      "learning_rate": 5.0000000000000024e-08,
      "loss": 1.0497,
      "step": 1835
    },
    {
      "epoch": 3.923240938166311,
      "grad_norm": 1.14600117506389,
      "learning_rate": 5.0000000000000004e-08,
      "loss": 1.0813,
      "step": 1840
    },
    {
      "epoch": 3.933901918976546,
      "grad_norm": 1.1304423202109097,
      "learning_rate": 5e-08,
      "loss": 1.1111,
      "step": 1845
    },
    {
      "epoch": 3.9445628997867805,
      "grad_norm": 1.127824200062695,
      "learning_rate": 5e-08,
      "loss": 1.0618,
      "step": 1850
    },
    {
      "epoch": 3.955223880597015,
      "grad_norm": 1.1479631723327013,
      "learning_rate": 5e-08,
      "loss": 1.0155,
      "step": 1855
    },
    {
      "epoch": 3.9658848614072495,
      "grad_norm": 1.2107343511697226,
      "learning_rate": 5e-08,
      "loss": 1.1179,
      "step": 1860
    },
    {
      "epoch": 3.976545842217484,
      "grad_norm": 1.1701981970918207,
      "learning_rate": 5e-08,
      "loss": 1.0525,
      "step": 1865
    },
    {
      "epoch": 3.9872068230277184,
      "grad_norm": 1.2066704843958889,
      "learning_rate": 5e-08,
      "loss": 1.0012,
      "step": 1870
    },
    {
      "epoch": 3.997867803837953,
      "grad_norm": 1.1997617373616742,
      "learning_rate": 5e-08,
      "loss": 1.0131,
      "step": 1875
    },
    {
      "epoch": 4.0,
      "step": 1876,
      "total_flos": 391958715432960.0,
      "train_loss": 1.1227505109203395,
      "train_runtime": 12501.281,
      "train_samples_per_second": 4.798,
      "train_steps_per_second": 0.15
    }
  ],
  "logging_steps": 5,
  "max_steps": 1876,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 200,
  "total_flos": 391958715432960.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}