{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.954337899543379,
  "eval_steps": 500,
  "global_step": 1090,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0091324200913242,
      "grad_norm": 3.1570751667022705,
      "learning_rate": 1.8348623853211011e-06,
      "loss": 2.2349,
      "step": 1
    },
    {
      "epoch": 0.045662100456621,
      "grad_norm": 3.327242374420166,
      "learning_rate": 9.174311926605506e-06,
      "loss": 2.2947,
      "step": 5
    },
    {
      "epoch": 0.091324200913242,
      "grad_norm": 3.6639091968536377,
      "learning_rate": 1.834862385321101e-05,
      "loss": 2.1754,
      "step": 10
    },
    {
      "epoch": 0.136986301369863,
      "grad_norm": 6.006739616394043,
      "learning_rate": 2.7522935779816515e-05,
      "loss": 1.8388,
      "step": 15
    },
    {
      "epoch": 0.182648401826484,
      "grad_norm": 1.5798718929290771,
      "learning_rate": 3.669724770642202e-05,
      "loss": 1.4696,
      "step": 20
    },
    {
      "epoch": 0.228310502283105,
      "grad_norm": 1.038404107093811,
      "learning_rate": 4.587155963302753e-05,
      "loss": 1.2583,
      "step": 25
    },
    {
      "epoch": 0.273972602739726,
      "grad_norm": 0.5968286395072937,
      "learning_rate": 5.504587155963303e-05,
      "loss": 1.1721,
      "step": 30
    },
    {
      "epoch": 0.319634703196347,
      "grad_norm": 0.42477676272392273,
      "learning_rate": 6.422018348623854e-05,
      "loss": 1.0966,
      "step": 35
    },
    {
      "epoch": 0.365296803652968,
      "grad_norm": 0.46538373827934265,
      "learning_rate": 7.339449541284404e-05,
      "loss": 1.0378,
      "step": 40
    },
    {
      "epoch": 0.410958904109589,
      "grad_norm": 0.32725027203559875,
      "learning_rate": 8.256880733944955e-05,
      "loss": 0.996,
      "step": 45
    },
    {
      "epoch": 0.45662100456621,
      "grad_norm": 0.27102047204971313,
      "learning_rate": 9.174311926605506e-05,
      "loss": 0.9549,
      "step": 50
    },
    {
      "epoch": 0.502283105022831,
      "grad_norm": 0.20055657625198364,
      "learning_rate": 0.00010091743119266055,
      "loss": 0.9426,
      "step": 55
    },
    {
      "epoch": 0.547945205479452,
      "grad_norm": 0.24150966107845306,
      "learning_rate": 0.00011009174311926606,
      "loss": 0.918,
      "step": 60
    },
    {
      "epoch": 0.593607305936073,
      "grad_norm": 0.20545370876789093,
      "learning_rate": 0.00011926605504587157,
      "loss": 0.8995,
      "step": 65
    },
    {
      "epoch": 0.639269406392694,
      "grad_norm": 0.24165408313274384,
      "learning_rate": 0.00012844036697247707,
      "loss": 0.8851,
      "step": 70
    },
    {
      "epoch": 0.684931506849315,
      "grad_norm": 0.22626952826976776,
      "learning_rate": 0.00013761467889908258,
      "loss": 0.8729,
      "step": 75
    },
    {
      "epoch": 0.730593607305936,
      "grad_norm": 0.2153974175453186,
      "learning_rate": 0.0001467889908256881,
      "loss": 0.8588,
      "step": 80
    },
    {
      "epoch": 0.776255707762557,
      "grad_norm": 0.22552534937858582,
      "learning_rate": 0.0001559633027522936,
      "loss": 0.8453,
      "step": 85
    },
    {
      "epoch": 0.821917808219178,
      "grad_norm": 0.2299458086490631,
      "learning_rate": 0.0001651376146788991,
      "loss": 0.8491,
      "step": 90
    },
    {
      "epoch": 0.867579908675799,
      "grad_norm": 0.38727596402168274,
      "learning_rate": 0.00017431192660550458,
      "loss": 0.839,
      "step": 95
    },
    {
      "epoch": 0.91324200913242,
      "grad_norm": 0.5172630548477173,
      "learning_rate": 0.00018348623853211012,
      "loss": 0.8359,
      "step": 100
    },
    {
      "epoch": 0.958904109589041,
      "grad_norm": 0.3504508137702942,
      "learning_rate": 0.0001926605504587156,
      "loss": 0.8176,
      "step": 105
    },
    {
      "epoch": 0.9954337899543378,
      "eval_loss": 2.11497163772583,
      "eval_runtime": 0.7274,
      "eval_samples_per_second": 13.748,
      "eval_steps_per_second": 1.375,
      "step": 109
    },
    {
      "epoch": 1.004566210045662,
      "grad_norm": 0.21483348309993744,
      "learning_rate": 0.00019999948721966259,
      "loss": 0.812,
      "step": 110
    },
    {
      "epoch": 1.0502283105022832,
      "grad_norm": 0.25326865911483765,
      "learning_rate": 0.00019998154046002822,
      "loss": 0.8028,
      "step": 115
    },
    {
      "epoch": 1.095890410958904,
      "grad_norm": 0.23645517230033875,
      "learning_rate": 0.0001999379599421534,
      "loss": 0.8003,
      "step": 120
    },
    {
      "epoch": 1.1415525114155252,
      "grad_norm": 0.22504329681396484,
      "learning_rate": 0.00019986875683942535,
      "loss": 0.7939,
      "step": 125
    },
    {
      "epoch": 1.187214611872146,
      "grad_norm": 0.2312326580286026,
      "learning_rate": 0.00019977394889447524,
      "loss": 0.7847,
      "step": 130
    },
    {
      "epoch": 1.2328767123287672,
      "grad_norm": 0.21880114078521729,
      "learning_rate": 0.00019965356041462955,
      "loss": 0.7856,
      "step": 135
    },
    {
      "epoch": 1.278538812785388,
      "grad_norm": 0.19915428757667542,
      "learning_rate": 0.00019950762226567781,
      "loss": 0.7772,
      "step": 140
    },
    {
      "epoch": 1.3242009132420092,
      "grad_norm": 0.21661226451396942,
      "learning_rate": 0.00019933617186395917,
      "loss": 0.7698,
      "step": 145
    },
    {
      "epoch": 1.36986301369863,
      "grad_norm": 0.23418781161308289,
      "learning_rate": 0.00019913925316676945,
      "loss": 0.771,
      "step": 150
    },
    {
      "epoch": 1.4155251141552512,
      "grad_norm": 0.2712844908237457,
      "learning_rate": 0.00019891691666109113,
      "loss": 0.7754,
      "step": 155
    },
    {
      "epoch": 1.461187214611872,
      "grad_norm": 0.20658765733242035,
      "learning_rate": 0.00019866921935064906,
      "loss": 0.772,
      "step": 160
    },
    {
      "epoch": 1.5068493150684932,
      "grad_norm": 0.2858370542526245,
      "learning_rate": 0.00019839622474129596,
      "loss": 0.7648,
      "step": 165
    },
    {
      "epoch": 1.5525114155251143,
      "grad_norm": 0.2993509769439697,
      "learning_rate": 0.00019809800282473013,
      "loss": 0.7553,
      "step": 170
    },
    {
      "epoch": 1.5981735159817352,
      "grad_norm": 0.19971656799316406,
      "learning_rate": 0.0001977746300605507,
      "loss": 0.7591,
      "step": 175
    },
    {
      "epoch": 1.643835616438356,
      "grad_norm": 0.2134828269481659,
      "learning_rate": 0.00019742618935665476,
      "loss": 0.7549,
      "step": 180
    },
    {
      "epoch": 1.6894977168949772,
      "grad_norm": 0.18528909981250763,
      "learning_rate": 0.00019705277004798073,
      "loss": 0.7511,
      "step": 185
    },
    {
      "epoch": 1.7351598173515983,
      "grad_norm": 0.22641536593437195,
      "learning_rate": 0.0001966544678736044,
      "loss": 0.754,
      "step": 190
    },
    {
      "epoch": 1.7808219178082192,
      "grad_norm": 0.23213805258274078,
      "learning_rate": 0.00019623138495219292,
      "loss": 0.7476,
      "step": 195
    },
    {
      "epoch": 1.82648401826484,
      "grad_norm": 0.28454649448394775,
      "learning_rate": 0.00019578362975582292,
      "loss": 0.7535,
      "step": 200
    },
    {
      "epoch": 1.8721461187214612,
      "grad_norm": 0.3238324224948883,
      "learning_rate": 0.00019531131708217005,
      "loss": 0.7489,
      "step": 205
    },
    {
      "epoch": 1.9178082191780823,
      "grad_norm": 0.23858831822872162,
      "learning_rate": 0.0001948145680250766,
      "loss": 0.7514,
      "step": 210
    },
    {
      "epoch": 1.9634703196347032,
      "grad_norm": 0.21854574978351593,
      "learning_rate": 0.00019429350994350483,
      "loss": 0.7464,
      "step": 215
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.1313483715057373,
      "eval_runtime": 0.7223,
      "eval_samples_per_second": 13.845,
      "eval_steps_per_second": 1.384,
      "step": 219
    },
    {
      "epoch": 2.009132420091324,
      "grad_norm": 0.24195519089698792,
      "learning_rate": 0.00019374827642888398,
      "loss": 0.7367,
      "step": 220
    },
    {
      "epoch": 2.0547945205479454,
      "grad_norm": 0.2783088684082031,
      "learning_rate": 0.0001931790072708596,
      "loss": 0.7283,
      "step": 225
    },
    {
      "epoch": 2.1004566210045663,
      "grad_norm": 0.25543445348739624,
      "learning_rate": 0.00019258584842145343,
      "loss": 0.7298,
      "step": 230
    },
    {
      "epoch": 2.146118721461187,
      "grad_norm": 0.23951423168182373,
      "learning_rate": 0.00019196895195764362,
      "loss": 0.7296,
      "step": 235
    },
    {
      "epoch": 2.191780821917808,
      "grad_norm": 0.2517695426940918,
      "learning_rate": 0.0001913284760423745,
      "loss": 0.719,
      "step": 240
    },
    {
      "epoch": 2.237442922374429,
      "grad_norm": 0.22162802517414093,
      "learning_rate": 0.00019066458488400584,
      "loss": 0.7252,
      "step": 245
    },
    {
      "epoch": 2.2831050228310503,
      "grad_norm": 0.2749063968658447,
      "learning_rate": 0.00018997744869421246,
      "loss": 0.7253,
      "step": 250
    },
    {
      "epoch": 2.328767123287671,
      "grad_norm": 0.2638462483882904,
      "learning_rate": 0.00018926724364434446,
      "loss": 0.718,
      "step": 255
    },
    {
      "epoch": 2.374429223744292,
      "grad_norm": 0.23074951767921448,
      "learning_rate": 0.0001885341518202595,
      "loss": 0.7203,
      "step": 260
    },
    {
      "epoch": 2.4200913242009134,
      "grad_norm": 0.18681403994560242,
      "learning_rate": 0.00018777836117563892,
      "loss": 0.7253,
      "step": 265
    },
    {
      "epoch": 2.4657534246575343,
      "grad_norm": 0.19247153401374817,
      "learning_rate": 0.00018700006548379898,
      "loss": 0.7175,
      "step": 270
    },
    {
      "epoch": 2.5114155251141552,
      "grad_norm": 0.18738825619220734,
      "learning_rate": 0.0001861994642880105,
      "loss": 0.7206,
      "step": 275
    },
    {
      "epoch": 2.557077625570776,
      "grad_norm": 0.2461051046848297,
      "learning_rate": 0.00018537676285033887,
      "loss": 0.7134,
      "step": 280
    },
    {
      "epoch": 2.602739726027397,
      "grad_norm": 0.31902387738227844,
      "learning_rate": 0.0001845321720990181,
      "loss": 0.7135,
      "step": 285
    },
    {
      "epoch": 2.6484018264840183,
      "grad_norm": 0.19439548254013062,
      "learning_rate": 0.00018366590857437184,
      "loss": 0.7194,
      "step": 290
    },
    {
      "epoch": 2.6940639269406392,
      "grad_norm": 0.18397387862205505,
      "learning_rate": 0.00018277819437329576,
      "loss": 0.7172,
      "step": 295
    },
    {
      "epoch": 2.73972602739726,
      "grad_norm": 0.1935635656118393,
      "learning_rate": 0.00018186925709231532,
      "loss": 0.7188,
      "step": 300
    },
    {
      "epoch": 2.7853881278538815,
      "grad_norm": 0.20888376235961914,
      "learning_rate": 0.0001809393297692334,
      "loss": 0.71,
      "step": 305
    },
    {
      "epoch": 2.8310502283105023,
      "grad_norm": 0.20157787203788757,
      "learning_rate": 0.0001799886508233829,
      "loss": 0.7152,
      "step": 310
    },
    {
      "epoch": 2.8767123287671232,
      "grad_norm": 0.201010599732399,
      "learning_rate": 0.0001790174639944997,
      "loss": 0.7147,
      "step": 315
    },
    {
      "epoch": 2.922374429223744,
      "grad_norm": 0.20877932012081146,
      "learning_rate": 0.00017802601828023138,
      "loss": 0.7101,
      "step": 320
    },
    {
      "epoch": 2.968036529680365,
      "grad_norm": 0.19995956122875214,
      "learning_rate": 0.00017701456787229804,
      "loss": 0.7128,
      "step": 325
    },
    {
      "epoch": 2.9954337899543377,
      "eval_loss": 2.144402027130127,
      "eval_runtime": 0.7303,
      "eval_samples_per_second": 13.694,
      "eval_steps_per_second": 1.369,
      "step": 328
    },
    {
      "epoch": 3.0136986301369864,
      "grad_norm": 0.18146736919879913,
      "learning_rate": 0.0001759833720913214,
      "loss": 0.7071,
      "step": 330
    },
    {
      "epoch": 3.0593607305936072,
      "grad_norm": 0.20878320932388306,
      "learning_rate": 0.00017493269532033883,
      "loss": 0.6943,
      "step": 335
    },
    {
      "epoch": 3.105022831050228,
      "grad_norm": 0.24599188566207886,
      "learning_rate": 0.0001738628069370195,
      "loss": 0.6968,
      "step": 340
    },
    {
      "epoch": 3.1506849315068495,
      "grad_norm": 0.22413116693496704,
      "learning_rate": 0.00017277398124460023,
      "loss": 0.6949,
      "step": 345
    },
    {
      "epoch": 3.1963470319634704,
      "grad_norm": 0.22311843931674957,
      "learning_rate": 0.000171666497401558,
      "loss": 0.6988,
      "step": 350
    },
    {
      "epoch": 3.2420091324200913,
      "grad_norm": 0.19590473175048828,
      "learning_rate": 0.0001705406393500381,
      "loss": 0.692,
      "step": 355
    },
    {
      "epoch": 3.287671232876712,
      "grad_norm": 0.18890812993049622,
      "learning_rate": 0.00016939669574305566,
      "loss": 0.6923,
      "step": 360
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.19763173162937164,
      "learning_rate": 0.0001682349598704892,
      "loss": 0.6899,
      "step": 365
    },
    {
      "epoch": 3.3789954337899544,
      "grad_norm": 0.21059083938598633,
      "learning_rate": 0.00016705572958388576,
      "loss": 0.689,
      "step": 370
    },
    {
      "epoch": 3.4246575342465753,
      "grad_norm": 0.2090144008398056,
      "learning_rate": 0.00016585930722009601,
      "loss": 0.6909,
      "step": 375
    },
    {
      "epoch": 3.470319634703196,
      "grad_norm": 0.1937251091003418,
      "learning_rate": 0.00016464599952375998,
      "loss": 0.6943,
      "step": 380
    },
    {
      "epoch": 3.5159817351598175,
      "grad_norm": 0.19087150692939758,
      "learning_rate": 0.000163416117568662,
      "loss": 0.6936,
      "step": 385
    },
    {
      "epoch": 3.5616438356164384,
      "grad_norm": 0.21907366812229156,
      "learning_rate": 0.0001621699766779763,
      "loss": 0.6928,
      "step": 390
    },
    {
      "epoch": 3.6073059360730593,
      "grad_norm": 0.1987326741218567,
      "learning_rate": 0.00016090789634342278,
      "loss": 0.6913,
      "step": 395
    },
    {
      "epoch": 3.65296803652968,
      "grad_norm": 0.19414757192134857,
      "learning_rate": 0.00015963020014335438,
      "loss": 0.6873,
      "step": 400
    },
    {
      "epoch": 3.6986301369863015,
      "grad_norm": 0.19000251591205597,
      "learning_rate": 0.0001583372156597961,
      "loss": 0.6895,
      "step": 405
    },
    {
      "epoch": 3.7442922374429224,
      "grad_norm": 0.17563720047473907,
      "learning_rate": 0.00015702927439445826,
      "loss": 0.6905,
      "step": 410
    },
    {
      "epoch": 3.7899543378995433,
      "grad_norm": 0.22648802399635315,
      "learning_rate": 0.00015570671168374438,
      "loss": 0.685,
      "step": 415
    },
    {
      "epoch": 3.8356164383561646,
      "grad_norm": 0.1822802722454071,
      "learning_rate": 0.00015436986661277577,
      "loss": 0.6897,
      "step": 420
    },
    {
      "epoch": 3.8812785388127855,
      "grad_norm": 0.18406091630458832,
      "learning_rate": 0.0001530190819284555,
      "loss": 0.6849,
      "step": 425
    },
    {
      "epoch": 3.9269406392694064,
      "grad_norm": 0.1968332827091217,
      "learning_rate": 0.00015165470395159313,
      "loss": 0.6841,
      "step": 430
    },
    {
      "epoch": 3.9726027397260273,
      "grad_norm": 0.2520104646682739,
      "learning_rate": 0.0001502770824881133,
      "loss": 0.6924,
      "step": 435
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.1630699634552,
      "eval_runtime": 0.7164,
      "eval_samples_per_second": 13.958,
      "eval_steps_per_second": 1.396,
      "step": 438
    },
    {
      "epoch": 4.018264840182648,
      "grad_norm": 0.20841118693351746,
      "learning_rate": 0.00014888657073937076,
      "loss": 0.6786,
      "step": 440
    },
    {
      "epoch": 4.063926940639269,
      "grad_norm": 0.2335735559463501,
      "learning_rate": 0.00014748352521159493,
      "loss": 0.6687,
      "step": 445
    },
    {
      "epoch": 4.109589041095891,
      "grad_norm": 0.23169922828674316,
      "learning_rate": 0.0001460683056244869,
      "loss": 0.6701,
      "step": 450
    },
    {
      "epoch": 4.155251141552512,
      "grad_norm": 0.20742633938789368,
      "learning_rate": 0.00014464127481899312,
      "loss": 0.6678,
      "step": 455
    },
    {
      "epoch": 4.200913242009133,
      "grad_norm": 0.233662411570549,
      "learning_rate": 0.00014320279866427796,
      "loss": 0.669,
      "step": 460
    },
    {
      "epoch": 4.2465753424657535,
      "grad_norm": 0.22496378421783447,
      "learning_rate": 0.00014175324596392075,
      "loss": 0.6695,
      "step": 465
    },
    {
      "epoch": 4.292237442922374,
      "grad_norm": 0.1961798071861267,
      "learning_rate": 0.00014029298836135988,
      "loss": 0.6694,
      "step": 470
    },
    {
      "epoch": 4.337899543378995,
      "grad_norm": 0.21112968027591705,
      "learning_rate": 0.00013882240024460927,
      "loss": 0.6758,
      "step": 475
    },
    {
      "epoch": 4.383561643835616,
      "grad_norm": 0.21480615437030792,
      "learning_rate": 0.0001373418586502706,
      "loss": 0.6685,
      "step": 480
    },
    {
      "epoch": 4.429223744292237,
      "grad_norm": 0.24116870760917664,
      "learning_rate": 0.0001358517431668672,
      "loss": 0.6686,
      "step": 485
    },
    {
      "epoch": 4.474885844748858,
      "grad_norm": 0.20268571376800537,
      "learning_rate": 0.00013435243583752294,
      "loss": 0.6642,
      "step": 490
    },
    {
      "epoch": 4.52054794520548,
      "grad_norm": 0.19283969700336456,
      "learning_rate": 0.00013284432106201233,
      "loss": 0.6729,
      "step": 495
    },
    {
      "epoch": 4.566210045662101,
      "grad_norm": 0.18919415771961212,
      "learning_rate": 0.00013132778549820618,
      "loss": 0.6713,
      "step": 500
    },
    {
      "epoch": 4.6118721461187215,
      "grad_norm": 0.19388696551322937,
      "learning_rate": 0.00012980321796293836,
      "loss": 0.6698,
      "step": 505
    },
    {
      "epoch": 4.657534246575342,
      "grad_norm": 0.19509977102279663,
      "learning_rate": 0.00012827100933231905,
      "loss": 0.6732,
      "step": 510
    },
    {
      "epoch": 4.703196347031963,
      "grad_norm": 0.19472207129001617,
      "learning_rate": 0.00012673155244151985,
      "loss": 0.6689,
      "step": 515
    },
    {
      "epoch": 4.748858447488584,
      "grad_norm": 0.1974049210548401,
      "learning_rate": 0.000125185241984057,
      "loss": 0.6661,
      "step": 520
    },
    {
      "epoch": 4.794520547945205,
      "grad_norm": 0.18362760543823242,
      "learning_rate": 0.00012363247441059776,
      "loss": 0.6705,
      "step": 525
    },
    {
      "epoch": 4.840182648401827,
      "grad_norm": 0.1863856315612793,
      "learning_rate": 0.00012207364782731655,
      "loss": 0.6663,
      "step": 530
    },
    {
      "epoch": 4.885844748858448,
      "grad_norm": 0.19237390160560608,
      "learning_rate": 0.00012050916189382646,
      "loss": 0.6701,
      "step": 535
    },
    {
      "epoch": 4.931506849315069,
      "grad_norm": 0.1915617734193802,
      "learning_rate": 0.00011893941772071249,
      "loss": 0.6781,
      "step": 540
    },
    {
      "epoch": 4.9771689497716896,
      "grad_norm": 0.18157874047756195,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.6777,
      "step": 545
    },
    {
      "epoch": 4.995433789954338,
      "eval_loss": 2.182265520095825,
      "eval_runtime": 0.7269,
      "eval_samples_per_second": 13.757,
      "eval_steps_per_second": 1.376,
      "step": 547
    },
    {
      "epoch": 5.0228310502283104,
      "grad_norm": 0.19536516070365906,
      "learning_rate": 0.0001157857657354354,
      "loss": 0.6657,
      "step": 550
    },
    {
      "epoch": 5.068493150684931,
      "grad_norm": 0.22191356122493744,
      "learning_rate": 0.00011420266647205231,
      "loss": 0.6547,
      "step": 555
    },
    {
      "epoch": 5.114155251141552,
      "grad_norm": 0.2473655790090561,
      "learning_rate": 0.00011261592585930576,
      "loss": 0.649,
      "step": 560
    },
    {
      "epoch": 5.159817351598173,
      "grad_norm": 0.21013621985912323,
      "learning_rate": 0.00011102595071354472,
      "loss": 0.6493,
      "step": 565
    },
    {
      "epoch": 5.205479452054795,
      "grad_norm": 0.2081068456172943,
      "learning_rate": 0.00010943314868040364,
      "loss": 0.647,
      "step": 570
    },
    {
      "epoch": 5.251141552511416,
      "grad_norm": 0.23985308408737183,
      "learning_rate": 0.00010783792813028827,
      "loss": 0.6505,
      "step": 575
    },
    {
      "epoch": 5.296803652968037,
      "grad_norm": 0.19800134003162384,
      "learning_rate": 0.00010624069805367559,
      "loss": 0.6537,
      "step": 580
    },
    {
      "epoch": 5.342465753424658,
      "grad_norm": 0.21391679346561432,
      "learning_rate": 0.00010464186795625482,
      "loss": 0.6558,
      "step": 585
    },
    {
      "epoch": 5.3881278538812785,
      "grad_norm": 0.205993190407753,
      "learning_rate": 0.00010304184775393642,
      "loss": 0.6578,
      "step": 590
    },
    {
      "epoch": 5.433789954337899,
      "grad_norm": 0.20510774850845337,
      "learning_rate": 0.00010144104766775572,
      "loss": 0.6565,
      "step": 595
    },
    {
      "epoch": 5.47945205479452,
      "grad_norm": 0.1970670372247696,
      "learning_rate": 9.983987811869862e-05,
      "loss": 0.6515,
      "step": 600
    },
    {
      "epoch": 5.525114155251142,
      "grad_norm": 0.18836742639541626,
      "learning_rate": 9.823874962247564e-05,
      "loss": 0.6557,
      "step": 605
    },
    {
      "epoch": 5.570776255707763,
      "grad_norm": 0.20027703046798706,
      "learning_rate": 9.663807268427198e-05,
      "loss": 0.6522,
      "step": 610
    },
    {
      "epoch": 5.616438356164384,
      "grad_norm": 0.2243603765964508,
      "learning_rate": 9.503825769350017e-05,
      "loss": 0.6496,
      "step": 615
    },
    {
      "epoch": 5.662100456621005,
      "grad_norm": 0.18932883441448212,
      "learning_rate": 9.343971481858246e-05,
      "loss": 0.6532,
      "step": 620
    },
    {
      "epoch": 5.707762557077626,
      "grad_norm": 0.19952723383903503,
      "learning_rate": 9.184285390178978e-05,
      "loss": 0.6511,
      "step": 625
    },
    {
      "epoch": 5.7534246575342465,
      "grad_norm": 0.20302745699882507,
      "learning_rate": 9.024808435416434e-05,
      "loss": 0.6524,
      "step": 630
    },
    {
      "epoch": 5.799086757990867,
      "grad_norm": 0.1958286166191101,
      "learning_rate": 8.865581505055291e-05,
      "loss": 0.6543,
      "step": 635
    },
    {
      "epoch": 5.844748858447488,
      "grad_norm": 0.189736470580101,
      "learning_rate": 8.706645422477739e-05,
      "loss": 0.6529,
      "step": 640
    },
    {
      "epoch": 5.890410958904109,
      "grad_norm": 0.2069847285747528,
      "learning_rate": 8.548040936496989e-05,
      "loss": 0.6542,
      "step": 645
    },
    {
      "epoch": 5.936073059360731,
      "grad_norm": 0.2031773030757904,
      "learning_rate": 8.389808710909881e-05,
      "loss": 0.6515,
      "step": 650
    },
    {
      "epoch": 5.981735159817352,
      "grad_norm": 0.20669801533222198,
      "learning_rate": 8.231989314071317e-05,
      "loss": 0.6526,
      "step": 655
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.207833766937256,
      "eval_runtime": 0.7173,
      "eval_samples_per_second": 13.941,
      "eval_steps_per_second": 1.394,
      "step": 657
    },
    {
      "epoch": 6.027397260273973,
      "grad_norm": 0.2104310542345047,
      "learning_rate": 8.07462320849313e-05,
      "loss": 0.6444,
      "step": 660
    },
    {
      "epoch": 6.073059360730594,
      "grad_norm": 0.18349581956863403,
      "learning_rate": 7.917750740470117e-05,
      "loss": 0.6397,
      "step": 665
    },
    {
      "epoch": 6.1187214611872145,
      "grad_norm": 0.20204049348831177,
      "learning_rate": 7.761412129735852e-05,
      "loss": 0.6439,
      "step": 670
    },
    {
      "epoch": 6.164383561643835,
      "grad_norm": 0.20316743850708008,
      "learning_rate": 7.605647459150961e-05,
      "loss": 0.6392,
      "step": 675
    },
    {
      "epoch": 6.210045662100456,
      "grad_norm": 0.2150379866361618,
      "learning_rate": 7.450496664426477e-05,
      "loss": 0.634,
      "step": 680
    },
    {
      "epoch": 6.255707762557078,
      "grad_norm": 0.19887958467006683,
      "learning_rate": 7.295999523884921e-05,
      "loss": 0.6442,
      "step": 685
    },
    {
      "epoch": 6.301369863013699,
      "grad_norm": 0.1974460333585739,
      "learning_rate": 7.142195648261747e-05,
      "loss": 0.6396,
      "step": 690
    },
    {
      "epoch": 6.34703196347032,
      "grad_norm": 0.21475858986377716,
      "learning_rate": 6.989124470549745e-05,
      "loss": 0.6354,
      "step": 695
    },
    {
      "epoch": 6.392694063926941,
      "grad_norm": 0.19873230159282684,
      "learning_rate": 6.83682523588902e-05,
      "loss": 0.64,
      "step": 700
    },
    {
      "epoch": 6.438356164383562,
      "grad_norm": 0.20658674836158752,
      "learning_rate": 6.685336991505122e-05,
      "loss": 0.6367,
      "step": 705
    },
    {
      "epoch": 6.4840182648401825,
      "grad_norm": 0.2072938233613968,
      "learning_rate": 6.534698576697939e-05,
      "loss": 0.6388,
      "step": 710
    },
    {
      "epoch": 6.529680365296803,
      "grad_norm": 0.20482133328914642,
      "learning_rate": 6.384948612883873e-05,
      "loss": 0.6397,
      "step": 715
    },
    {
      "epoch": 6.575342465753424,
      "grad_norm": 0.20856335759162903,
      "learning_rate": 6.2361254936939e-05,
      "loss": 0.6407,
      "step": 720
    },
    {
      "epoch": 6.621004566210045,
      "grad_norm": 0.21489538252353668,
      "learning_rate": 6.088267375130023e-05,
      "loss": 0.6414,
      "step": 725
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.19792255759239197,
      "learning_rate": 5.941412165782645e-05,
      "loss": 0.634,
      "step": 730
    },
    {
      "epoch": 6.712328767123288,
      "grad_norm": 0.19678843021392822,
      "learning_rate": 5.79559751711138e-05,
      "loss": 0.6407,
      "step": 735
    },
    {
      "epoch": 6.757990867579909,
      "grad_norm": 0.18998436629772186,
      "learning_rate": 5.650860813791785e-05,
      "loss": 0.6346,
      "step": 740
    },
    {
      "epoch": 6.80365296803653,
      "grad_norm": 0.19847027957439423,
      "learning_rate": 5.507239164130501e-05,
      "loss": 0.6409,
      "step": 745
    },
    {
      "epoch": 6.8493150684931505,
      "grad_norm": 0.205197274684906,
      "learning_rate": 5.364769390551225e-05,
      "loss": 0.6404,
      "step": 750
    },
    {
      "epoch": 6.894977168949771,
      "grad_norm": 0.19521455466747284,
      "learning_rate": 5.2234880201540284e-05,
      "loss": 0.6386,
      "step": 755
    },
    {
      "epoch": 6.940639269406392,
      "grad_norm": 0.1955854445695877,
      "learning_rate": 5.0834312753503124e-05,
      "loss": 0.6349,
      "step": 760
    },
    {
      "epoch": 6.986301369863014,
      "grad_norm": 0.1964850276708603,
      "learning_rate": 4.9446350645759885e-05,
      "loss": 0.6326,
      "step": 765
    },
    {
      "epoch": 6.995433789954338,
      "eval_loss": 2.229551315307617,
      "eval_runtime": 0.7351,
      "eval_samples_per_second": 13.604,
      "eval_steps_per_second": 1.36,
      "step": 766
    },
    {
      "epoch": 7.031963470319635,
      "grad_norm": 0.1950986385345459,
      "learning_rate": 4.807134973085036e-05,
      "loss": 0.626,
      "step": 770
    },
    {
      "epoch": 7.077625570776256,
      "grad_norm": 0.20411163568496704,
      "learning_rate": 4.6709662538260267e-05,
      "loss": 0.6293,
      "step": 775
    },
    {
      "epoch": 7.123287671232877,
      "grad_norm": 0.21672751009464264,
      "learning_rate": 4.53616381840377e-05,
      "loss": 0.6255,
      "step": 780
    },
    {
      "epoch": 7.168949771689498,
      "grad_norm": 0.20474384725093842,
      "learning_rate": 4.402762228128531e-05,
      "loss": 0.6317,
      "step": 785
    },
    {
      "epoch": 7.2146118721461185,
      "grad_norm": 0.20330122113227844,
      "learning_rate": 4.2707956851550016e-05,
      "loss": 0.6297,
      "step": 790
    },
    {
      "epoch": 7.260273972602739,
      "grad_norm": 0.21286101639270782,
      "learning_rate": 4.140298023713416e-05,
      "loss": 0.6278,
      "step": 795
    },
    {
      "epoch": 7.30593607305936,
      "grad_norm": 0.20241223275661469,
      "learning_rate": 4.011302701434937e-05,
      "loss": 0.6223,
      "step": 800
    },
    {
      "epoch": 7.351598173515982,
      "grad_norm": 0.20820164680480957,
      "learning_rate": 3.8838427907736476e-05,
      "loss": 0.631,
      "step": 805
    },
    {
      "epoch": 7.397260273972603,
      "grad_norm": 0.2071073353290558,
      "learning_rate": 3.757950970527249e-05,
      "loss": 0.627,
      "step": 810
    },
    {
      "epoch": 7.442922374429224,
      "grad_norm": 0.20754040777683258,
      "learning_rate": 3.633659517458736e-05,
      "loss": 0.6284,
      "step": 815
    },
    {
      "epoch": 7.488584474885845,
      "grad_norm": 0.204667329788208,
      "learning_rate": 3.5110002980210975e-05,
      "loss": 0.6214,
      "step": 820
    },
    {
      "epoch": 7.534246575342466,
      "grad_norm": 0.204596146941185,
      "learning_rate": 3.3900047601872596e-05,
      "loss": 0.6241,
      "step": 825
    },
    {
      "epoch": 7.579908675799087,
      "grad_norm": 0.2120332568883896,
      "learning_rate": 3.270703925387279e-05,
      "loss": 0.6295,
      "step": 830
    },
    {
      "epoch": 7.6255707762557075,
      "grad_norm": 0.20006106793880463,
      "learning_rate": 3.153128380554941e-05,
      "loss": 0.6259,
      "step": 835
    },
    {
      "epoch": 7.671232876712329,
      "grad_norm": 0.19608016312122345,
      "learning_rate": 3.037308270285709e-05,
      "loss": 0.6234,
      "step": 840
    },
    {
      "epoch": 7.71689497716895,
      "grad_norm": 0.20698235929012299,
      "learning_rate": 2.923273289108115e-05,
      "loss": 0.6312,
      "step": 845
    },
    {
      "epoch": 7.762557077625571,
      "grad_norm": 0.20038080215454102,
      "learning_rate": 2.8110526738705344e-05,
      "loss": 0.6266,
      "step": 850
    },
    {
      "epoch": 7.808219178082192,
      "grad_norm": 0.2022303193807602,
      "learning_rate": 2.7006751962452882e-05,
      "loss": 0.631,
      "step": 855
    },
    {
      "epoch": 7.853881278538813,
      "grad_norm": 0.20605237782001495,
      "learning_rate": 2.592169155352031e-05,
      "loss": 0.6331,
      "step": 860
    },
    {
      "epoch": 7.899543378995434,
      "grad_norm": 0.2080409824848175,
      "learning_rate": 2.485562370502279e-05,
      "loss": 0.6313,
      "step": 865
    },
    {
      "epoch": 7.945205479452055,
      "grad_norm": 0.2000366896390915,
      "learning_rate": 2.3808821740669606e-05,
      "loss": 0.6285,
      "step": 870
    },
    {
      "epoch": 7.9908675799086755,
      "grad_norm": 0.20418736338615417,
      "learning_rate": 2.2781554044688015e-05,
      "loss": 0.6311,
      "step": 875
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.2484524250030518,
      "eval_runtime": 0.7257,
      "eval_samples_per_second": 13.779,
      "eval_steps_per_second": 1.378,
      "step": 876
    },
    {
      "epoch": 8.036529680365296,
      "grad_norm": 0.2047678381204605,
      "learning_rate": 2.1774083993013718e-05,
      "loss": 0.6168,
      "step": 880
    },
    {
      "epoch": 8.082191780821917,
      "grad_norm": 0.20055288076400757,
      "learning_rate": 2.078666988576504e-05,
      "loss": 0.6163,
      "step": 885
    },
    {
      "epoch": 8.127853881278538,
      "grad_norm": 0.2002391219139099,
      "learning_rate": 1.9819564881018983e-05,
      "loss": 0.6216,
      "step": 890
    },
    {
      "epoch": 8.173515981735159,
      "grad_norm": 0.19824841618537903,
      "learning_rate": 1.887301692990494e-05,
      "loss": 0.6215,
      "step": 895
    },
    {
      "epoch": 8.219178082191782,
      "grad_norm": 0.20471996068954468,
      "learning_rate": 1.7947268713034127e-05,
      "loss": 0.6168,
      "step": 900
    },
    {
      "epoch": 8.264840182648403,
      "grad_norm": 0.20544229447841644,
      "learning_rate": 1.7042557578279626e-05,
      "loss": 0.6188,
      "step": 905
    },
    {
      "epoch": 8.310502283105023,
      "grad_norm": 0.208919957280159,
      "learning_rate": 1.6159115479924257e-05,
      "loss": 0.6221,
      "step": 910
    },
    {
      "epoch": 8.356164383561644,
      "grad_norm": 0.20433735847473145,
      "learning_rate": 1.529716891919074e-05,
      "loss": 0.6187,
      "step": 915
    },
    {
      "epoch": 8.401826484018265,
      "grad_norm": 0.20233039557933807,
      "learning_rate": 1.4456938886170412e-05,
      "loss": 0.6224,
      "step": 920
    },
    {
      "epoch": 8.447488584474886,
      "grad_norm": 0.2014380395412445,
      "learning_rate": 1.3638640803164516e-05,
      "loss": 0.6186,
      "step": 925
    },
    {
      "epoch": 8.493150684931507,
      "grad_norm": 0.20106054842472076,
      "learning_rate": 1.2842484469453365e-05,
      "loss": 0.6229,
      "step": 930
    },
    {
      "epoch": 8.538812785388128,
      "grad_norm": 0.20424200594425201,
      "learning_rate": 1.2068674007506786e-05,
      "loss": 0.6225,
      "step": 935
    },
    {
      "epoch": 8.584474885844749,
      "grad_norm": 0.19919848442077637,
      "learning_rate": 1.1317407810650372e-05,
      "loss": 0.621,
      "step": 940
    },
    {
      "epoch": 8.63013698630137,
      "grad_norm": 0.21022921800613403,
      "learning_rate": 1.058887849220026e-05,
      "loss": 0.6208,
      "step": 945
    },
    {
      "epoch": 8.67579908675799,
      "grad_norm": 0.2036212980747223,
      "learning_rate": 9.883272836080116e-06,
      "loss": 0.6184,
      "step": 950
    },
    {
      "epoch": 8.721461187214611,
      "grad_norm": 0.19858643412590027,
      "learning_rate": 9.200771748932513e-06,
      "loss": 0.6224,
      "step": 955
    },
    {
      "epoch": 8.767123287671232,
      "grad_norm": 0.2013389766216278,
      "learning_rate": 8.541550213737171e-06,
      "loss": 0.6175,
      "step": 960
    },
    {
      "epoch": 8.812785388127853,
      "grad_norm": 0.20145344734191895,
      "learning_rate": 7.905777244947954e-06,
      "loss": 0.6204,
      "step": 965
    },
    {
      "epoch": 8.858447488584474,
      "grad_norm": 0.20189128816127777,
      "learning_rate": 7.293615845160196e-06,
      "loss": 0.6225,
      "step": 970
    },
    {
      "epoch": 8.904109589041095,
      "grad_norm": 0.2002427577972412,
      "learning_rate": 6.705222963319191e-06,
      "loss": 0.6197,
      "step": 975
    },
    {
      "epoch": 8.949771689497716,
      "grad_norm": 0.21808940172195435,
      "learning_rate": 6.140749454480932e-06,
      "loss": 0.6228,
      "step": 980
    },
    {
      "epoch": 8.995433789954339,
      "grad_norm": 0.20092381536960602,
      "learning_rate": 5.6003400411351325e-06,
      "loss": 0.6233,
      "step": 985
    },
    {
      "epoch": 8.995433789954339,
      "eval_loss": 2.258711099624634,
      "eval_runtime": 0.7286,
      "eval_samples_per_second": 13.726,
      "eval_steps_per_second": 1.373,
      "step": 985
    },
    {
      "epoch": 9.04109589041096,
      "grad_norm": 0.19878605008125305,
      "learning_rate": 5.0841332761005e-06,
      "loss": 0.6193,
      "step": 990
    },
    {
      "epoch": 9.08675799086758,
      "grad_norm": 0.20306305587291718,
      "learning_rate": 4.592261507001993e-06,
      "loss": 0.6177,
      "step": 995
    },
    {
      "epoch": 9.132420091324201,
      "grad_norm": 0.2019263654947281,
      "learning_rate": 4.124850842338779e-06,
      "loss": 0.6133,
      "step": 1000
    },
    {
      "epoch": 9.178082191780822,
      "grad_norm": 0.2005142867565155,
      "learning_rate": 3.6820211191520125e-06,
      "loss": 0.6141,
      "step": 1005
    },
    {
      "epoch": 9.223744292237443,
      "grad_norm": 0.20048119127750397,
      "learning_rate": 3.263885872300343e-06,
      "loss": 0.6224,
      "step": 1010
    },
    {
      "epoch": 9.269406392694064,
      "grad_norm": 0.20118780434131622,
      "learning_rate": 2.8705523053513816e-06,
      "loss": 0.6158,
      "step": 1015
    },
    {
      "epoch": 9.315068493150685,
      "grad_norm": 0.19994989037513733,
      "learning_rate": 2.502121263096224e-06,
      "loss": 0.6133,
      "step": 1020
    },
    {
      "epoch": 9.360730593607306,
      "grad_norm": 0.19647841155529022,
      "learning_rate": 2.1586872056944428e-06,
      "loss": 0.6172,
      "step": 1025
    },
    {
      "epoch": 9.406392694063927,
      "grad_norm": 0.20165683329105377,
      "learning_rate": 1.840338184455881e-06,
      "loss": 0.6193,
      "step": 1030
    },
    {
      "epoch": 9.452054794520548,
      "grad_norm": 0.20188592374324799,
      "learning_rate": 1.5471558192656777e-06,
      "loss": 0.6216,
      "step": 1035
    },
    {
      "epoch": 9.497716894977168,
      "grad_norm": 0.20157510042190552,
      "learning_rate": 1.2792152776580968e-06,
      "loss": 0.616,
      "step": 1040
    },
    {
      "epoch": 9.54337899543379,
      "grad_norm": 0.20351751148700714,
      "learning_rate": 1.036585255544764e-06,
      "loss": 0.6149,
      "step": 1045
    },
    {
      "epoch": 9.58904109589041,
      "grad_norm": 0.1994985044002533,
      "learning_rate": 8.193279596020121e-07,
      "loss": 0.6184,
      "step": 1050
    },
    {
      "epoch": 9.634703196347033,
      "grad_norm": 0.2011667788028717,
      "learning_rate": 6.274990913221035e-07,
      "loss": 0.6149,
      "step": 1055
    },
    {
      "epoch": 9.680365296803654,
      "grad_norm": 0.2020387500524521,
      "learning_rate": 4.6114783273213393e-07,
      "loss": 0.619,
      "step": 1060
    },
    {
      "epoch": 9.726027397260275,
      "grad_norm": 0.19661925733089447,
      "learning_rate": 3.203168337845508e-07,
      "loss": 0.6193,
      "step": 1065
    },
    {
      "epoch": 9.771689497716896,
      "grad_norm": 0.20111018419265747,
      "learning_rate": 2.05042201422323e-07,
      "loss": 0.615,
      "step": 1070
    },
    {
      "epoch": 9.817351598173516,
      "grad_norm": 0.20135089755058289,
      "learning_rate": 1.1535349032167908e-07,
      "loss": 0.6158,
      "step": 1075
    },
    {
      "epoch": 9.863013698630137,
      "grad_norm": 0.20196588337421417,
      "learning_rate": 5.127369531473525e-08,
      "loss": 0.614,
      "step": 1080
    },
    {
      "epoch": 9.908675799086758,
      "grad_norm": 0.19875915348529816,
      "learning_rate": 1.2819245493955744e-08,
      "loss": 0.6174,
      "step": 1085
    },
    {
      "epoch": 9.954337899543379,
      "grad_norm": 0.19983670115470886,
      "learning_rate": 0.0,
      "loss": 0.6194,
      "step": 1090
    },
    {
      "epoch": 9.954337899543379,
      "eval_loss": 2.260620594024658,
      "eval_runtime": 0.6934,
      "eval_samples_per_second": 14.422,
      "eval_steps_per_second": 1.442,
      "step": 1090
    },
    {
      "epoch": 9.954337899543379,
      "step": 1090,
      "total_flos": 6.456679991336763e+18,
      "train_loss": 0.7130365929472338,
      "train_runtime": 9668.9725,
      "train_samples_per_second": 14.452,
      "train_steps_per_second": 0.113
    }
  ],
  "logging_steps": 5,
  "max_steps": 1090,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.456679991336763e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}