{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 100,
  "global_step": 2853,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0010515247108307045, "grad_norm": 21.846562454867367, "learning_rate": 6.993006993006993e-08, "loss": 1.3669, "step": 1},
    {"epoch": 0.005257623554153523, "grad_norm": 21.132009448461105, "learning_rate": 3.496503496503497e-07, "loss": 1.3522, "step": 5},
    {"epoch": 0.010515247108307046, "grad_norm": 17.494352717419737, "learning_rate": 6.993006993006994e-07, "loss": 1.354, "step": 10},
    {"epoch": 0.015772870662460567, "grad_norm": 3.3387986746360583, "learning_rate": 1.0489510489510491e-06, "loss": 1.2867, "step": 15},
    {"epoch": 0.02103049421661409, "grad_norm": 2.205920355996413, "learning_rate": 1.3986013986013987e-06, "loss": 1.2275, "step": 20},
    {"epoch": 0.026288117770767613, "grad_norm": 1.369597839526372, "learning_rate": 1.7482517482517483e-06, "loss": 1.1889, "step": 25},
    {"epoch": 0.031545741324921134, "grad_norm": 1.1230719730143253, "learning_rate": 2.0979020979020983e-06, "loss": 1.1654, "step": 30},
    {"epoch": 0.03680336487907466, "grad_norm": 0.8609236106610554, "learning_rate": 2.4475524475524477e-06, "loss": 1.1648, "step": 35},
    {"epoch": 0.04206098843322818, "grad_norm": 0.797862698606503, "learning_rate": 2.7972027972027974e-06, "loss": 1.1519, "step": 40},
    {"epoch": 0.0473186119873817, "grad_norm": 0.8072361631632317, "learning_rate": 3.1468531468531472e-06, "loss": 1.1237, "step": 45},
    {"epoch": 0.052576235541535225, "grad_norm": 0.7629783040030311, "learning_rate": 3.4965034965034966e-06, "loss": 1.1325, "step": 50},
    {"epoch": 0.05783385909568875, "grad_norm": 0.6873017397880803, "learning_rate": 3.846153846153847e-06, "loss": 1.1026, "step": 55},
    {"epoch": 0.06309148264984227, "grad_norm": 0.696611266506068, "learning_rate": 4.195804195804197e-06, "loss": 1.1039, "step": 60},
    {"epoch": 0.0683491062039958, "grad_norm": 0.7029981157873147, "learning_rate": 4.5454545454545455e-06, "loss": 1.0926, "step": 65},
    {"epoch": 0.07360672975814932, "grad_norm": 0.7133610172269549, "learning_rate": 4.895104895104895e-06, "loss": 1.1062, "step": 70},
    {"epoch": 0.07886435331230283, "grad_norm": 0.7217968834872122, "learning_rate": 5.244755244755245e-06, "loss": 1.1003, "step": 75},
    {"epoch": 0.08412197686645637, "grad_norm": 0.7014523604685313, "learning_rate": 5.594405594405595e-06, "loss": 1.0944, "step": 80},
    {"epoch": 0.08937960042060988, "grad_norm": 0.7206581604903497, "learning_rate": 5.944055944055944e-06, "loss": 1.1056, "step": 85},
    {"epoch": 0.0946372239747634, "grad_norm": 0.7457066334039347, "learning_rate": 6.2937062937062944e-06, "loss": 1.1065, "step": 90},
    {"epoch": 0.09989484752891693, "grad_norm": 0.7208104956061856, "learning_rate": 6.643356643356644e-06, "loss": 1.0892, "step": 95},
    {"epoch": 0.10515247108307045, "grad_norm": 0.7717192179121782, "learning_rate": 6.993006993006993e-06, "loss": 1.103, "step": 100},
    {"epoch": 0.10515247108307045, "eval_loss": 1.0989242792129517, "eval_runtime": 734.3008, "eval_samples_per_second": 18.326, "eval_steps_per_second": 0.144, "step": 100},
    {"epoch": 0.11041009463722397, "grad_norm": 0.8259894569490014, "learning_rate": 7.342657342657343e-06, "loss": 1.0814, "step": 105},
    {"epoch": 0.1156677181913775, "grad_norm": 1.1085848778320089, "learning_rate": 7.692307692307694e-06, "loss": 1.1025, "step": 110},
    {"epoch": 0.12092534174553102, "grad_norm": 0.7455910381865771, "learning_rate": 8.041958041958042e-06, "loss": 1.1079, "step": 115},
    {"epoch": 0.12618296529968454, "grad_norm": 0.9674079050397694, "learning_rate": 8.391608391608393e-06, "loss": 1.0784, "step": 120},
    {"epoch": 0.13144058885383805, "grad_norm": 0.9988378537350968, "learning_rate": 8.741258741258743e-06, "loss": 1.0759, "step": 125},
    {"epoch": 0.1366982124079916, "grad_norm": 0.8719562477062238, "learning_rate": 9.090909090909091e-06, "loss": 1.1161, "step": 130},
    {"epoch": 0.14195583596214512, "grad_norm": 0.7675881427867184, "learning_rate": 9.44055944055944e-06, "loss": 1.0935, "step": 135},
    {"epoch": 0.14721345951629863, "grad_norm": 0.724214660257682, "learning_rate": 9.79020979020979e-06, "loss": 1.0905, "step": 140},
    {"epoch": 0.15247108307045215, "grad_norm": 0.7816188167257716, "learning_rate": 1.013986013986014e-05, "loss": 1.1015, "step": 145},
    {"epoch": 0.15772870662460567, "grad_norm": 0.8160659875512388, "learning_rate": 1.048951048951049e-05, "loss": 1.0841, "step": 150},
    {"epoch": 0.16298633017875921, "grad_norm": 0.7024355651373789, "learning_rate": 1.083916083916084e-05, "loss": 1.0944, "step": 155},
    {"epoch": 0.16824395373291273, "grad_norm": 0.8350948417258764, "learning_rate": 1.118881118881119e-05, "loss": 1.0865, "step": 160},
    {"epoch": 0.17350157728706625, "grad_norm": 0.7759430616830292, "learning_rate": 1.1538461538461538e-05, "loss": 1.0957, "step": 165},
    {"epoch": 0.17875920084121977, "grad_norm": 0.8253932883708276, "learning_rate": 1.1888111888111888e-05, "loss": 1.0862, "step": 170},
    {"epoch": 0.18401682439537329, "grad_norm": 0.727381322987075, "learning_rate": 1.2237762237762239e-05, "loss": 1.0531, "step": 175},
    {"epoch": 0.1892744479495268, "grad_norm": 0.7093901420339217, "learning_rate": 1.2587412587412589e-05, "loss": 1.0983, "step": 180},
    {"epoch": 0.19453207150368035, "grad_norm": 0.953147430950059, "learning_rate": 1.2937062937062939e-05, "loss": 1.0971, "step": 185},
    {"epoch": 0.19978969505783387, "grad_norm": 0.7716908173558169, "learning_rate": 1.3286713286713288e-05, "loss": 1.075, "step": 190},
    {"epoch": 0.20504731861198738, "grad_norm": 0.8672736054906722, "learning_rate": 1.3636363636363637e-05, "loss": 1.0793, "step": 195},
    {"epoch": 0.2103049421661409, "grad_norm": 0.8675965486083684, "learning_rate": 1.3986013986013986e-05, "loss": 1.0867, "step": 200},
    {"epoch": 0.2103049421661409, "eval_loss": 1.0965888500213623, "eval_runtime": 649.556, "eval_samples_per_second": 20.717, "eval_steps_per_second": 0.163, "step": 200},
    {"epoch": 0.21556256572029442, "grad_norm": 0.8977881972264273, "learning_rate": 1.4335664335664336e-05, "loss": 1.0954, "step": 205},
    {"epoch": 0.22082018927444794, "grad_norm": 0.7235719037878356, "learning_rate": 1.4685314685314686e-05, "loss": 1.0983, "step": 210},
    {"epoch": 0.22607781282860148, "grad_norm": 0.7296340934381736, "learning_rate": 1.5034965034965037e-05, "loss": 1.0782, "step": 215},
    {"epoch": 0.231335436382755, "grad_norm": 0.8547235635957527, "learning_rate": 1.5384615384615387e-05, "loss": 1.0924, "step": 220},
    {"epoch": 0.23659305993690852, "grad_norm": 0.7564410266828079, "learning_rate": 1.5734265734265734e-05, "loss": 1.0912, "step": 225},
    {"epoch": 0.24185068349106204, "grad_norm": 0.8525133534517718, "learning_rate": 1.6083916083916083e-05, "loss": 1.0954, "step": 230},
    {"epoch": 0.24710830704521555, "grad_norm": 0.805401759340136, "learning_rate": 1.6433566433566433e-05, "loss": 1.0749, "step": 235},
    {"epoch": 0.25236593059936907, "grad_norm": 0.7402154168479581, "learning_rate": 1.6783216783216786e-05, "loss": 1.0986, "step": 240},
    {"epoch": 0.2576235541535226, "grad_norm": 0.7474174775706688, "learning_rate": 1.7132867132867136e-05, "loss": 1.0869, "step": 245},
    {"epoch": 0.2628811777076761, "grad_norm": 0.7369490806417859, "learning_rate": 1.7482517482517486e-05, "loss": 1.0776, "step": 250},
    {"epoch": 0.26813880126182965, "grad_norm": 0.9287606428487797, "learning_rate": 1.7832167832167832e-05, "loss": 1.1021, "step": 255},
    {"epoch": 0.2733964248159832, "grad_norm": 0.6947297961768544, "learning_rate": 1.8181818181818182e-05, "loss": 1.1012, "step": 260},
    {"epoch": 0.2786540483701367, "grad_norm": 0.8123153065370199, "learning_rate": 1.8531468531468532e-05, "loss": 1.1074, "step": 265},
    {"epoch": 0.28391167192429023, "grad_norm": 0.8302401027015646, "learning_rate": 1.888111888111888e-05, "loss": 1.1083, "step": 270},
    {"epoch": 0.2891692954784437, "grad_norm": 0.6881962349423542, "learning_rate": 1.923076923076923e-05, "loss": 1.0937, "step": 275},
    {"epoch": 0.29442691903259727, "grad_norm": 0.7279189773202484, "learning_rate": 1.958041958041958e-05, "loss": 1.0913, "step": 280},
    {"epoch": 0.2996845425867508, "grad_norm": 1.0229788845896652, "learning_rate": 1.993006993006993e-05, "loss": 1.062, "step": 285},
    {"epoch": 0.3049421661409043, "grad_norm": 0.8087865464750971, "learning_rate": 1.9999880177844552e-05, "loss": 1.0943, "step": 290},
    {"epoch": 0.31019978969505785, "grad_norm": 0.7888576974082969, "learning_rate": 1.9999393405259354e-05, "loss": 1.0814, "step": 295},
    {"epoch": 0.31545741324921134, "grad_norm": 0.7387794672867187, "learning_rate": 1.9998532211572566e-05, "loss": 1.111, "step": 300},
    {"epoch": 0.31545741324921134, "eval_loss": 1.101216435432434, "eval_runtime": 582.6045, "eval_samples_per_second": 23.098, "eval_steps_per_second": 0.182, "step": 300},
    {"epoch": 0.3207150368033649, "grad_norm": 0.7908623885539283, "learning_rate": 1.999729662903106e-05, "loss": 1.0945, "step": 305},
    {"epoch": 0.32597266035751843, "grad_norm": 0.6771503993700702, "learning_rate": 1.999568670390045e-05, "loss": 1.0926, "step": 310},
    {"epoch": 0.3312302839116719, "grad_norm": 0.6841445829487095, "learning_rate": 1.9993702496463395e-05, "loss": 1.1157, "step": 315},
    {"epoch": 0.33648790746582546, "grad_norm": 0.6751678361124496, "learning_rate": 1.9991344081017312e-05, "loss": 1.1029, "step": 320},
    {"epoch": 0.34174553101997895, "grad_norm": 0.6430661618178782, "learning_rate": 1.9988611545871606e-05, "loss": 1.0914, "step": 325},
    {"epoch": 0.3470031545741325, "grad_norm": 0.6415970890637294, "learning_rate": 1.9985504993344375e-05, "loss": 1.095, "step": 330},
    {"epoch": 0.352260778128286, "grad_norm": 0.7730277501959658, "learning_rate": 1.9982024539758547e-05, "loss": 1.1047, "step": 335},
    {"epoch": 0.35751840168243953, "grad_norm": 0.697788892685463, "learning_rate": 1.997817031543756e-05, "loss": 1.0943, "step": 340},
    {"epoch": 0.3627760252365931, "grad_norm": 0.7222719849480133, "learning_rate": 1.9973942464700456e-05, "loss": 1.0723, "step": 345},
    {"epoch": 0.36803364879074657, "grad_norm": 0.7260613938093592, "learning_rate": 1.9969341145856493e-05, "loss": 1.0839, "step": 350},
    {"epoch": 0.3732912723449001, "grad_norm": 0.733047206796414, "learning_rate": 1.9964366531199205e-05, "loss": 1.1031, "step": 355},
    {"epoch": 0.3785488958990536, "grad_norm": 0.7026038822669134, "learning_rate": 1.995901880699997e-05, "loss": 1.0921, "step": 360},
    {"epoch": 0.38380651945320715, "grad_norm": 0.7130129636017671, "learning_rate": 1.9953298173501007e-05, "loss": 1.1082, "step": 365},
    {"epoch": 0.3890641430073607, "grad_norm": 0.6640121507704535, "learning_rate": 1.9947204844907903e-05, "loss": 1.0865, "step": 370},
    {"epoch": 0.3943217665615142, "grad_norm": 0.6489539943403665, "learning_rate": 1.994073904938157e-05, "loss": 1.1005, "step": 375},
    {"epoch": 0.39957939011566773, "grad_norm": 0.6442461845826825, "learning_rate": 1.9933901029029732e-05, "loss": 1.0723, "step": 380},
    {"epoch": 0.4048370136698212, "grad_norm": 0.7261445745563544, "learning_rate": 1.992669103989783e-05, "loss": 1.1011, "step": 385},
    {"epoch": 0.41009463722397477, "grad_norm": 0.7222317305626339, "learning_rate": 1.9919109351959444e-05, "loss": 1.0908, "step": 390},
    {"epoch": 0.4153522607781283, "grad_norm": 0.6323590176184729, "learning_rate": 1.9911156249106186e-05, "loss": 1.089, "step": 395},
    {"epoch": 0.4206098843322818, "grad_norm": 0.6782790061464099, "learning_rate": 1.9902832029137086e-05, "loss": 1.0974, "step": 400},
    {"epoch": 0.4206098843322818, "eval_loss": 1.0965957641601562, "eval_runtime": 611.6915, "eval_samples_per_second": 22.0, "eval_steps_per_second": 0.173, "step": 400},
    {"epoch": 0.42586750788643535, "grad_norm": 0.7130643351963079, "learning_rate": 1.9894137003747404e-05, "loss": 1.0863, "step": 405},
    {"epoch": 0.43112513144058884, "grad_norm": 0.7035673344638229, "learning_rate": 1.988507149851699e-05, "loss": 1.0928, "step": 410},
    {"epoch": 0.4363827549947424, "grad_norm": 0.6366880839024881, "learning_rate": 1.987563585289808e-05, "loss": 1.0876, "step": 415},
    {"epoch": 0.4416403785488959, "grad_norm": 0.6724589905602705, "learning_rate": 1.9865830420202587e-05, "loss": 1.0814, "step": 420},
    {"epoch": 0.4468980021030494, "grad_norm": 0.7296112006903912, "learning_rate": 1.9855655567588877e-05, "loss": 1.0849, "step": 425},
    {"epoch": 0.45215562565720296, "grad_norm": 0.7301202009190912, "learning_rate": 1.984511167604801e-05, "loss": 1.0943, "step": 430},
    {"epoch": 0.45741324921135645, "grad_norm": 0.6728038801425467, "learning_rate": 1.9834199140389485e-05, "loss": 1.0958, "step": 435},
    {"epoch": 0.46267087276551, "grad_norm": 0.6461291574762016, "learning_rate": 1.982291836922645e-05, "loss": 1.0814, "step": 440},
    {"epoch": 0.4679284963196635, "grad_norm": 0.7354488392025322, "learning_rate": 1.9811269784960404e-05, "loss": 1.1019, "step": 445},
    {"epoch": 0.47318611987381703, "grad_norm": 0.871639557338332, "learning_rate": 1.9799253823765383e-05, "loss": 1.1006, "step": 450},
    {"epoch": 0.4784437434279706, "grad_norm": 0.7250167929947016, "learning_rate": 1.9786870935571617e-05, "loss": 1.0976, "step": 455},
    {"epoch": 0.48370136698212407, "grad_norm": 0.7624377086650501, "learning_rate": 1.97741215840487e-05, "loss": 1.073, "step": 460},
    {"epoch": 0.4889589905362776, "grad_norm": 0.7335919595002304, "learning_rate": 1.9761006246588217e-05, "loss": 1.0928, "step": 465},
    {"epoch": 0.4942166140904311, "grad_norm": 0.6382852192610631, "learning_rate": 1.9747525414285863e-05, "loss": 1.0945, "step": 470},
    {"epoch": 0.49947423764458465, "grad_norm": 0.718180529210079, "learning_rate": 1.9733679591923062e-05, "loss": 1.0749, "step": 475},
    {"epoch": 0.5047318611987381, "grad_norm": 0.6600718457016724, "learning_rate": 1.9719469297948076e-05, "loss": 1.1181, "step": 480},
    {"epoch": 0.5099894847528917, "grad_norm": 0.6689062165685349, "learning_rate": 1.9704895064456573e-05, "loss": 1.0952, "step": 485},
    {"epoch": 0.5152471083070452, "grad_norm": 0.7143276328895771, "learning_rate": 1.968995743717171e-05, "loss": 1.0896, "step": 490},
    {"epoch": 0.5205047318611987, "grad_norm": 0.6221294359823765, "learning_rate": 1.9674656975423704e-05, "loss": 1.0742, "step": 495},
    {"epoch": 0.5257623554153522, "grad_norm": 0.7268351101096144, "learning_rate": 1.9658994252128884e-05, "loss": 1.0898, "step": 500},
    {"epoch": 0.5257623554153522, "eval_loss": 1.091992974281311, "eval_runtime": 577.2656, "eval_samples_per_second": 23.312, "eval_steps_per_second": 0.184, "step": 500},
    {"epoch": 0.5310199789695058, "grad_norm": 0.7409815849447423, "learning_rate": 1.964296985376823e-05, "loss": 1.0785, "step": 505},
    {"epoch": 0.5362776025236593, "grad_norm": 0.7136236155581998, "learning_rate": 1.962658438036543e-05, "loss": 1.0983, "step": 510},
    {"epoch": 0.5415352260778128, "grad_norm": 0.7215624141555339, "learning_rate": 1.9609838445464406e-05, "loss": 1.1007, "step": 515},
    {"epoch": 0.5467928496319664, "grad_norm": 0.6979369948772214, "learning_rate": 1.959273267610633e-05, "loss": 1.0806, "step": 520},
    {"epoch": 0.5520504731861199, "grad_norm": 0.7255670203711404, "learning_rate": 1.9575267712806152e-05, "loss": 1.0753, "step": 525},
    {"epoch": 0.5573080967402734, "grad_norm": 0.6378781651024482, "learning_rate": 1.955744420952863e-05, "loss": 1.1001, "step": 530},
    {"epoch": 0.562565720294427, "grad_norm": 0.6440842622036477, "learning_rate": 1.9539262833663813e-05, "loss": 1.0867, "step": 535},
    {"epoch": 0.5678233438485805, "grad_norm": 0.650711077304966, "learning_rate": 1.9520724266002078e-05, "loss": 1.0861, "step": 540},
    {"epoch": 0.573080967402734, "grad_norm": 0.9412839294952584, "learning_rate": 1.9501829200708627e-05, "loss": 1.066, "step": 545},
    {"epoch": 0.5783385909568874, "grad_norm": 0.7997373349509072, "learning_rate": 1.948257834529749e-05, "loss": 1.0804, "step": 550},
    {"epoch": 0.583596214511041, "grad_norm": 0.6632970321629863, "learning_rate": 1.9462972420605045e-05, "loss": 1.0796, "step": 555},
    {"epoch": 0.5888538380651945, "grad_norm": 0.6907348547616222, "learning_rate": 1.9443012160763014e-05, "loss": 1.0914, "step": 560},
    {"epoch": 0.594111461619348, "grad_norm": 0.7602392699866063, "learning_rate": 1.9422698313170982e-05, "loss": 1.0782, "step": 565},
    {"epoch": 0.5993690851735016, "grad_norm": 0.7425506195668518, "learning_rate": 1.9402031638468407e-05, "loss": 1.0728, "step": 570},
    {"epoch": 0.6046267087276551, "grad_norm": 0.6057134136385478, "learning_rate": 1.9381012910506146e-05, "loss": 1.0944, "step": 575},
    {"epoch": 0.6098843322818086, "grad_norm": 0.611926050399381, "learning_rate": 1.935964291631746e-05, "loss": 1.0887, "step": 580},
    {"epoch": 0.6151419558359621, "grad_norm": 0.6044521957797464, "learning_rate": 1.933792245608857e-05, "loss": 1.0653, "step": 585},
    {"epoch": 0.6203995793901157, "grad_norm": 0.6160859598416025, "learning_rate": 1.9315852343128677e-05, "loss": 1.0697, "step": 590},
    {"epoch": 0.6256572029442692, "grad_norm": 0.6454926848454089, "learning_rate": 1.9293433403839506e-05, "loss": 1.0835, "step": 595},
    {"epoch": 0.6309148264984227, "grad_norm": 0.6271287719549755, "learning_rate": 1.9270666477684375e-05, "loss": 1.0749, "step": 600},
    {"epoch": 0.6309148264984227, "eval_loss": 1.0876203775405884, "eval_runtime": 619.1152, "eval_samples_per_second": 21.736, "eval_steps_per_second": 0.171, "step": 600},
    {"epoch": 0.6361724500525763, "grad_norm": 0.634393838535348, "learning_rate": 1.9247552417156758e-05, "loss": 1.0729, "step": 605},
    {"epoch": 0.6414300736067298, "grad_norm": 0.6594690945271786, "learning_rate": 1.9224092087748344e-05, "loss": 1.0827, "step": 610},
    {"epoch": 0.6466876971608833, "grad_norm": 0.611714575208264, "learning_rate": 1.920028636791667e-05, "loss": 1.0882, "step": 615},
    {"epoch": 0.6519453207150369, "grad_norm": 0.7463577820820205, "learning_rate": 1.9176136149052184e-05, "loss": 1.0756, "step": 620},
    {"epoch": 0.6572029442691903, "grad_norm": 0.5943822071057456, "learning_rate": 1.9151642335444894e-05, "loss": 1.0781, "step": 625},
    {"epoch": 0.6624605678233438, "grad_norm": 0.6478466639224281, "learning_rate": 1.9126805844250507e-05, "loss": 1.0799, "step": 630},
    {"epoch": 0.6677181913774973, "grad_norm": 1.125407499631879, "learning_rate": 1.910162760545607e-05, "loss": 1.0863, "step": 635},
    {"epoch": 0.6729758149316509, "grad_norm": 0.6317836803464292, "learning_rate": 1.9076108561845167e-05, "loss": 1.068, "step": 640},
    {"epoch": 0.6782334384858044, "grad_norm": 0.6782741352289255, "learning_rate": 1.90502496689626e-05, "loss": 1.0717, "step": 645},
    {"epoch": 0.6834910620399579, "grad_norm": 0.6549048073170591, "learning_rate": 1.902405189507862e-05, "loss": 1.0729, "step": 650},
    {"epoch": 0.6887486855941115, "grad_norm": 0.5944400668808439, "learning_rate": 1.899751622115267e-05, "loss": 1.073, "step": 655},
    {"epoch": 0.694006309148265, "grad_norm": 0.6344443790559094, "learning_rate": 1.8970643640796642e-05, "loss": 1.0765, "step": 660},
    {"epoch": 0.6992639327024185, "grad_norm": 0.6066328657447971, "learning_rate": 1.8943435160237693e-05, "loss": 1.068, "step": 665},
    {"epoch": 0.704521556256572, "grad_norm": 0.7935810543521484, "learning_rate": 1.8915891798280545e-05, "loss": 1.075, "step": 670},
    {"epoch": 0.7097791798107256, "grad_norm": 0.6311479883642119, "learning_rate": 1.8888014586269353e-05, "loss": 1.0605, "step": 675},
    {"epoch": 0.7150368033648791, "grad_norm": 0.6247754068444527, "learning_rate": 1.8859804568049083e-05, "loss": 1.0853, "step": 680},
    {"epoch": 0.7202944269190326, "grad_norm": 0.6133863303859032, "learning_rate": 1.8831262799926412e-05, "loss": 1.0751, "step": 685},
    {"epoch": 0.7255520504731862, "grad_norm": 0.6378281851358015, "learning_rate": 1.88023903506302e-05, "loss": 1.086, "step": 690},
    {"epoch": 0.7308096740273397, "grad_norm": 0.6695843196133265, "learning_rate": 1.8773188301271458e-05, "loss": 1.0655, "step": 695},
    {"epoch": 0.7360672975814931, "grad_norm": 0.6310578043108518, "learning_rate": 1.874365774530285e-05, "loss": 1.0847, "step": 700},
    {"epoch": 0.7360672975814931, "eval_loss": 1.083134412765503, "eval_runtime": 594.902, "eval_samples_per_second": 22.621, "eval_steps_per_second": 0.178, "step": 700},
    {"epoch": 0.7413249211356467, "grad_norm": 0.7538683907974313, "learning_rate": 1.8713799788477794e-05, "loss": 1.0691, "step": 705},
    {"epoch": 0.7465825446898002, "grad_norm": 0.706371524563473, "learning_rate": 1.8683615548809007e-05, "loss": 1.0654, "step": 710},
    {"epoch": 0.7518401682439537, "grad_norm": 0.7089836009644308, "learning_rate": 1.865310615652668e-05, "loss": 1.0732, "step": 715},
    {"epoch": 0.7570977917981072, "grad_norm": 0.6253449282146815, "learning_rate": 1.862227275403614e-05, "loss": 1.0595, "step": 720},
    {"epoch": 0.7623554153522608, "grad_norm": 0.6352792231235775, "learning_rate": 1.8591116495875065e-05, "loss": 1.0611, "step": 725},
    {"epoch": 0.7676130389064143, "grad_norm": 0.6559807547521417, "learning_rate": 1.8559638548670276e-05, "loss": 1.0772, "step": 730},
    {"epoch": 0.7728706624605678, "grad_norm": 0.660949169309788, "learning_rate": 1.8527840091094038e-05, "loss": 1.0723, "step": 735},
    {"epoch": 0.7781282860147214, "grad_norm": 0.6485292004090661, "learning_rate": 1.849572231381993e-05, "loss": 1.0756, "step": 740},
    {"epoch": 0.7833859095688749, "grad_norm": 0.5894518164357108, "learning_rate": 1.8463286419478256e-05, "loss": 1.0878, "step": 745},
    {"epoch": 0.7886435331230284, "grad_norm": 0.6373909243160687, "learning_rate": 1.843053362261102e-05, "loss": 1.0698, "step": 750},
    {"epoch": 0.7939011566771819, "grad_norm": 0.6247774742453552, "learning_rate": 1.8397465149626438e-05, "loss": 1.0689, "step": 755},
    {"epoch": 0.7991587802313355, "grad_norm": 0.6702489085237104, "learning_rate": 1.836408223875303e-05, "loss": 1.0878, "step": 760},
    {"epoch": 0.804416403785489, "grad_norm": 0.5901778445639561, "learning_rate": 1.8330386139993253e-05, "loss": 1.0615, "step": 765},
    {"epoch": 0.8096740273396424, "grad_norm": 0.5690160698641555, "learning_rate": 1.8296378115076683e-05, "loss": 1.0627, "step": 770},
    {"epoch": 0.814931650893796, "grad_norm": 0.7286612536078287, "learning_rate": 1.826205943741277e-05, "loss": 1.0599, "step": 775},
    {"epoch": 0.8201892744479495, "grad_norm": 0.6255138205467193, "learning_rate": 1.8227431392043188e-05, "loss": 1.0738, "step": 780},
    {"epoch": 0.825446898002103, "grad_norm": 0.6089376456915286, "learning_rate": 1.8192495275593667e-05, "loss": 1.0682, "step": 785},
    {"epoch": 0.8307045215562566, "grad_norm": 0.6155868150283563, "learning_rate": 1.8157252396225487e-05, "loss": 1.065, "step": 790},
    {"epoch": 0.8359621451104101, "grad_norm": 0.7289316735890606, "learning_rate": 1.812170407358647e-05, "loss": 1.0577, "step": 795},
    {"epoch": 0.8412197686645636, "grad_norm": 0.6194611530873854, "learning_rate": 1.8085851638761564e-05, "loss": 1.0749, "step": 800},
    {"epoch": 0.8412197686645636, "eval_loss": 1.0777511596679688, "eval_runtime": 578.5287, "eval_samples_per_second": 23.261, "eval_steps_per_second": 0.183, "step": 800},
    {"epoch": 0.8464773922187171, "grad_norm": 0.5897179737564566, "learning_rate": 1.8049696434223018e-05, "loss": 1.064, "step": 805},
    {"epoch": 0.8517350157728707, "grad_norm": 0.6249138645283078, "learning_rate": 1.801323981378011e-05, "loss": 1.0689, "step": 810},
    {"epoch": 0.8569926393270242, "grad_norm": 0.6094536651967496, "learning_rate": 1.797648314252844e-05, "loss": 1.0547, "step": 815},
    {"epoch": 0.8622502628811777, "grad_norm": 0.6427649229281082, "learning_rate": 1.7939427796798835e-05, "loss": 1.0709, "step": 820},
    {"epoch": 0.8675078864353313, "grad_norm": 0.625645109760211, "learning_rate": 1.790207516410579e-05, "loss": 1.0711, "step": 825},
    {"epoch": 0.8727655099894848, "grad_norm": 0.6900102876237034, "learning_rate": 1.7864426643095537e-05, "loss": 1.0551, "step": 830},
    {"epoch": 0.8780231335436383, "grad_norm": 0.6633694160119932, "learning_rate": 1.7826483643493664e-05, "loss": 1.0647, "step": 835},
    {"epoch": 0.8832807570977917, "grad_norm": 0.6706740933862908, "learning_rate": 1.7788247586052324e-05, "loss": 1.068, "step": 840},
    {"epoch": 0.8885383806519453, "grad_norm": 0.6147588746912578, "learning_rate": 1.774971990249703e-05, "loss": 1.0675, "step": 845},
    {"epoch": 0.8937960042060988, "grad_norm": 0.650347913047383, "learning_rate": 1.7710902035473075e-05, "loss": 1.0563, "step": 850},
    {"epoch": 0.8990536277602523, "grad_norm": 0.5896501069060196, "learning_rate": 1.7671795438491476e-05, "loss": 1.0549, "step": 855},
    {"epoch": 0.9043112513144059, "grad_norm": 0.5865757288759952, "learning_rate": 1.763240157587457e-05, "loss": 1.074, "step": 860},
    {"epoch": 0.9095688748685594, "grad_norm": 0.6448523425472431, "learning_rate": 1.759272192270118e-05, "loss": 1.0406, "step": 865},
    {"epoch": 0.9148264984227129, "grad_norm": 0.628930087369231, "learning_rate": 1.7552757964751375e-05, "loss": 1.0604, "step": 870},
    {"epoch": 0.9200841219768665, "grad_norm": 0.5573844980993936, "learning_rate": 1.751251119845085e-05, "loss": 1.0712, "step": 875},
    {"epoch": 0.92534174553102, "grad_norm": 0.5760631844651097, "learning_rate": 1.7471983130814872e-05, "loss": 1.0677, "step": 880},
    {"epoch": 0.9305993690851735, "grad_norm": 0.6608474625527273, "learning_rate": 1.7431175279391864e-05, "loss": 1.0564, "step": 885},
    {"epoch": 0.935856992639327, "grad_norm": 0.6158122817932856, "learning_rate": 1.7390089172206594e-05, "loss": 1.0698, "step": 890},
    {"epoch": 0.9411146161934806, "grad_norm": 0.6348226976928315, "learning_rate": 1.7348726347702922e-05, "loss": 1.0541, "step": 895},
    {"epoch": 0.9463722397476341, "grad_norm": 0.5893951119046926, "learning_rate": 1.730708835468624e-05, "loss": 1.055, "step": 900},
    {"epoch": 0.9463722397476341, "eval_loss": 1.0719902515411377, "eval_runtime": 554.5404, "eval_samples_per_second": 24.267, "eval_steps_per_second": 0.191, "step": 900},
    {"epoch": 0.9516298633017876, "grad_norm": 0.6398319094636862, "learning_rate": 1.7265176752265437e-05, "loss": 1.0606, "step": 905},
    {"epoch": 0.9568874868559412, "grad_norm": 0.6048116978972946, "learning_rate": 1.7222993109794547e-05, "loss": 1.0602, "step": 910},
    {"epoch": 0.9621451104100947, "grad_norm": 0.5840246341713026, "learning_rate": 1.7180539006813973e-05, "loss": 1.0479, "step": 915},
    {"epoch": 0.9674027339642481, "grad_norm": 0.5778229669814231, "learning_rate": 1.7137816032991338e-05, "loss": 1.0552, "step": 920},
    {"epoch": 0.9726603575184016, "grad_norm": 0.599559903007225, "learning_rate": 1.7094825788061984e-05, "loss": 1.0602, "step": 925},
    {"epoch": 0.9779179810725552, "grad_norm": 0.6085935007813816, "learning_rate": 1.7051569881769033e-05, "loss": 1.0702, "step": 930},
    {"epoch": 0.9831756046267087, "grad_norm": 0.6210127216958851, "learning_rate": 1.7008049933803153e-05, "loss": 1.0562, "step": 935},
    {"epoch": 0.9884332281808622, "grad_norm": 0.5660970609343743, "learning_rate": 1.696426757374187e-05, "loss": 1.0488, "step": 940},
    {"epoch": 0.9936908517350158, "grad_norm": 0.6052820312725565, "learning_rate": 1.6920224440988578e-05, "loss": 1.0579, "step": 945},
    {"epoch": 0.9989484752891693, "grad_norm": 0.6336659141670167, "learning_rate": 1.6875922184711152e-05, "loss": 1.0391, "step": 950},
    {"epoch": 1.0042060988433228, "grad_norm": 0.8649311407022923, "learning_rate": 1.6831362463780173e-05, "loss": 0.9427, "step": 955},
    {"epoch": 1.0094637223974763, "grad_norm": 0.7906840430230622, "learning_rate": 1.6786546946706826e-05, "loss": 0.9093, "step": 960},
    {"epoch": 1.0147213459516298, "grad_norm": 0.7615451637281871, "learning_rate": 1.6741477311580442e-05, "loss": 0.9129, "step": 965},
    {"epoch": 1.0199789695057835, "grad_norm": 0.81395189037578, "learning_rate": 1.669615524600562e-05, "loss": 0.9116, "step": 970},
    {"epoch": 1.025236593059937, "grad_norm": 0.6675565867389684, "learning_rate": 1.6650582447039087e-05, "loss": 0.897, "step": 975},
    {"epoch": 1.0304942166140905, "grad_norm": 0.6558457233521835, "learning_rate": 1.6604760621126104e-05, "loss": 0.9059, "step": 980},
    {"epoch": 1.035751840168244, "grad_norm": 0.791116301575079, "learning_rate": 1.655869148403661e-05, "loss": 0.9123, "step": 985},
    {"epoch": 1.0410094637223974, "grad_norm": 0.6281691549427542, "learning_rate": 1.6512376760800943e-05, "loss": 0.9165, "step": 990},
    {"epoch": 1.046267087276551, "grad_norm": 0.722210053446233, "learning_rate": 1.646581818564528e-05, "loss": 0.8885, "step": 995},
    {"epoch": 1.0515247108307044, "grad_norm": 0.6566766982009167, "learning_rate": 1.641901750192666e-05, "loss": 0.9184, "step": 1000},
    {"epoch": 1.0515247108307044, "eval_loss": 1.0817060470581055, "eval_runtime": 548.8481, "eval_samples_per_second": 24.519, "eval_steps_per_second": 0.193, "step": 1000},
    {"epoch": 1.0567823343848581, "grad_norm": 0.7215682123240776, "learning_rate": 1.6371976462067744e-05, "loss": 0.9048, "step": 1005},
    {"epoch": 1.0620399579390116, "grad_norm": 0.5754913559382355, "learning_rate": 1.6324696827491178e-05, "loss": 0.9062, "step": 1010},
    {"epoch": 1.0672975814931651, "grad_norm": 0.7713724891213452, "learning_rate": 1.6277180368553637e-05, "loss": 0.9003, "step": 1015},
    {"epoch": 1.0725552050473186, "grad_norm": 0.6705202466831766, "learning_rate": 1.622942886447953e-05, "loss": 0.9076, "step": 1020},
    {"epoch": 1.077812828601472, "grad_norm": 0.7709385226269342, "learning_rate": 1.6181444103294405e-05, "loss": 0.9016, "step": 1025},
    {"epoch": 1.0830704521556256, "grad_norm": 0.6618094790250554, "learning_rate": 1.613322788175796e-05, "loss": 0.9087, "step": 1030},
    {"epoch": 1.088328075709779, "grad_norm": 0.7111642531915952, "learning_rate": 1.608478200529679e-05, "loss": 0.8993, "step": 1035},
    {"epoch": 1.0935856992639328, "grad_norm": 0.9967278615618546, "learning_rate": 1.6036108287936774e-05, "loss": 0.9053, "step": 1040},
    {"epoch": 1.0988433228180863, "grad_norm": 0.7211016358920939, "learning_rate": 1.598720855223516e-05, "loss": 0.8967, "step": 1045},
    {"epoch": 1.1041009463722398, "grad_norm": 0.681965857428634, "learning_rate": 1.5938084629212308e-05, "loss": 0.9069, "step": 1050},
    {"epoch": 1.1093585699263933, "grad_norm": 0.7296745556202008, "learning_rate": 1.5888738358283125e-05, "loss": 0.8918, "step": 1055},
    {"epoch": 1.1146161934805467, "grad_norm": 0.6472282910374098, "learning_rate": 1.5839171587188213e-05, "loss": 0.8953, "step": 1060},
    {"epoch": 1.1198738170347002, "grad_norm": 0.6420578981972046, "learning_rate": 1.5789386171924656e-05, "loss": 0.9185, "step": 1065},
    {"epoch": 1.125131440588854, "grad_norm": 0.6592365438130466, "learning_rate": 1.5739383976676538e-05, "loss": 0.9338, "step": 1070},
    {"epoch": 1.1303890641430074, "grad_norm": 0.6668713420054354, "learning_rate": 1.5689166873745133e-05, "loss": 0.9071, "step": 1075},
    {"epoch": 1.135646687697161, "grad_norm": 0.6314319656757978, "learning_rate": 1.5638736743478807e-05, "loss": 0.9094, "step": 1080},
    {"epoch": 1.1409043112513144, "grad_norm": 0.6557318538936868, "learning_rate": 1.5588095474202597e-05, "loss": 0.9056, "step": 1085},
    {"epoch": 1.146161934805468, "grad_norm": 0.6988942180423913, "learning_rate": 1.55372449621475e-05, "loss": 0.9093, "step": 1090},
    {"epoch": 1.1514195583596214, "grad_norm": 0.6288925365676942, "learning_rate": 1.54861871113795e-05, "loss": 0.8931, "step": 1095},
    {"epoch": 1.1566771819137749, "grad_norm": 0.6060978130757313, "learning_rate": 1.5434923833728238e-05, "loss": 0.8955, "step": 1100},
    {"epoch": 1.1566771819137749, "eval_loss": 1.0778801441192627, "eval_runtime": 560.7689, "eval_samples_per_second": 23.997, "eval_steps_per_second": 0.189, "step": 1100},
    {"epoch": 1.1619348054679284, "grad_norm": 0.636138975576772, "learning_rate": 1.538345704871544e-05, "loss": 0.9164, "step": 1105},
    {"epoch": 1.167192429022082, "grad_norm": 0.7813214708227075, "learning_rate": 1.533178868348304e-05, "loss": 0.9123, "step": 1110},
    {"epoch": 1.1724500525762356, "grad_norm": 0.6454922302300423, "learning_rate": 1.5279920672721014e-05, "loss": 0.9096, "step": 1115},
    {"epoch": 1.177707676130389, "grad_norm": 0.6684532969652581, "learning_rate": 1.522785495859495e-05, "loss": 0.913, "step": 1120},
    {"epoch": 1.1829652996845426, "grad_norm": 0.659104192691736, "learning_rate": 1.517559349067331e-05, "loss": 0.9127, "step": 1125},
    {"epoch": 1.188222923238696, "grad_norm": 0.6327096229416864, "learning_rate": 1.5123138225854437e-05, "loss": 0.9179, "step": 1130},
    {"epoch": 1.1934805467928495, "grad_norm": 0.6821427010599724, "learning_rate": 1.507049112829328e-05, "loss": 0.916, "step": 1135},
    {"epoch": 1.1987381703470033, "grad_norm": 0.6383663706263557, "learning_rate": 1.5017654169327847e-05, "loss": 0.9205, "step": 1140},
    {"epoch": 1.2039957939011567, "grad_norm": 0.6642751432840621, "learning_rate": 1.4964629327405385e-05, "loss": 0.9064, "step": 1145},
    {"epoch": 1.2092534174553102, "grad_norm": 0.6370926988086576, "learning_rate": 1.4911418588008302e-05, "loss": 0.9009, "step": 1150},
    {"epoch": 1.2145110410094637, "grad_norm": 0.6726809074089126, "learning_rate": 1.4858023943579831e-05, "loss": 0.9177, "step": 1155},
    {"epoch": 1.2197686645636172, "grad_norm": 0.6624168311883211, "learning_rate": 1.4804447393449408e-05, "loss": 0.9008, "step": 1160},
    {"epoch": 1.2250262881177707, "grad_norm": 0.6736191492385858, "learning_rate": 1.4750690943757815e-05, "loss": 0.9177, "step": 1165},
    {"epoch": 1.2302839116719242, "grad_norm": 0.6626164162916314, "learning_rate": 1.469675660738206e-05, "loss": 0.9125, "step": 1170},
    {"epoch": 1.235541535226078, "grad_norm": 0.6561095205909978, "learning_rate": 1.4642646403860017e-05, "loss": 0.9224, "step": 1175},
    {"epoch": 1.2407991587802314, "grad_norm": 0.6404857197573285, "learning_rate": 1.4588362359314787e-05, "loss": 0.9147, "step": 1180},
    {"epoch": 1.2460567823343849, "grad_norm": 0.6247458161762777, "learning_rate": 1.453390650637884e-05, "loss": 0.9055, "step": 1185},
    {"epoch": 1.2513144058885384, "grad_norm": 0.6205798650094878, "learning_rate": 1.4479280884117919e-05, "loss": 0.9098, "step": 1190},
    {"epoch": 1.2565720294426919, "grad_norm": 0.6171085702613818, "learning_rate": 1.4424487537954658e-05, "loss": 0.9086, "step": 1195},
    {"epoch": 1.2618296529968454, "grad_norm": 0.6817002284070426, "learning_rate": 1.4369528519592016e-05, "loss": 0.914, "step": 1200},
    {"epoch": 1.2618296529968454, "eval_loss": 1.0758436918258667, "eval_runtime": 554.9555, "eval_samples_per_second": 24.249, "eval_steps_per_second": 0.191, "step": 1200},
    {"epoch": 1.267087276550999, "grad_norm": 0.6556393089241064, "learning_rate": 1.4314405886936444e-05, "loss": 0.907, "step": 1205},
    {"epoch": 1.2723449001051526, "grad_norm": 0.6564247019338768, "learning_rate": 1.425912170402083e-05, "loss": 0.8947, "step": 1210},
    {"epoch": 1.277602523659306, "grad_norm": 0.6909745550376631, "learning_rate": 1.4203678040927211e-05, "loss": 0.9015, "step": 1215},
    {"epoch": 1.2828601472134595, "grad_norm": 0.6649938010634878, "learning_rate": 1.414807697370926e-05, "loss": 0.9147, "step": 1220},
    {"epoch": 1.288117770767613, "grad_norm": 0.6827602346821062, "learning_rate": 1.4092320584314552e-05, "loss": 0.9223, "step": 1225},
    {"epoch": 1.2933753943217665, "grad_norm": 0.6891969548538285, "learning_rate": 1.4036410960506601e-05, "loss": 0.909, "step": 1230},
    {"epoch": 1.29863301787592, "grad_norm": 0.7488612526253159, "learning_rate": 1.3980350195786691e-05, "loss": 0.9063, "step": 1235},
    {"epoch": 1.3038906414300735, "grad_norm": 0.8765777386899024, "learning_rate": 1.3924140389315488e-05, "loss": 0.8949, "step": 1240},
    {"epoch": 1.3091482649842272, "grad_norm": 0.6756135072464465, "learning_rate": 1.3867783645834428e-05, "loss": 0.9173, "step": 1245},
    {"epoch": 1.3144058885383807, "grad_norm": 0.6511543641668399, "learning_rate": 1.3811282075586916e-05, "loss": 0.9075, "step": 1250},
    {"epoch": 1.3196635120925342, "grad_norm": 0.6171780710166301, "learning_rate": 1.3754637794239303e-05, "loss": 0.8977, "step": 1255},
    {"epoch": 1.3249211356466877, "grad_norm": 0.658721220404947, "learning_rate": 1.3697852922801669e-05, "loss": 0.9072, "step": 1260},
    {"epoch": 1.3301787592008412, "grad_norm": 0.6417444192429201, "learning_rate": 1.3640929587548403e-05, "loss": 0.9091, "step": 1265},
    {"epoch": 1.3354363827549949, "grad_norm": 0.6187189724748463, "learning_rate": 1.3583869919938597e-05, "loss": 0.9129, "step": 1270},
    {"epoch": 1.3406940063091484, "grad_norm": 0.5843959371785157, "learning_rate": 1.3526676056536205e-05, "loss": 0.9092, "step": 1275},
    {"epoch": 1.3459516298633019, "grad_norm": 0.6932618289744372, "learning_rate": 1.3469350138930073e-05, "loss": 0.9079, "step": 1280},
    {"epoch": 1.3512092534174553, "grad_norm": 0.6598615985676897, "learning_rate": 1.3411894313653727e-05, "loss": 0.8944, "step": 1285},
    {"epoch": 1.3564668769716088, "grad_norm": 0.6427748827555393, "learning_rate": 1.3354310732105014e-05, "loss": 0.898, "step": 1290},
    {"epoch": 1.3617245005257623, "grad_norm": 0.6121349209877303, "learning_rate": 1.3296601550465525e-05, "loss": 0.909, "step": 1295},
    {"epoch": 1.3669821240799158, "grad_norm": 0.6575524447093695, "learning_rate": 1.3238768929619874e-05, "loss": 0.9098, "step": 1300},
    {"epoch": 1.3669821240799158, "eval_loss": 1.069818139076233, "eval_runtime": 559.1797, "eval_samples_per_second": 24.066, "eval_steps_per_second": 0.19, "step": 1300},
    {"epoch": 1.3722397476340693, "grad_norm": 0.6531825341664897, "learning_rate": 1.3180815035074786e-05, "loss": 0.9171, "step": 1305},
    {"epoch": 1.3774973711882228, "grad_norm": 0.6882987706313063, "learning_rate": 1.3122742036877994e-05, "loss": 0.8888, "step": 1310},
    {"epoch": 1.3827549947423765, "grad_norm": 0.724082633852385, "learning_rate": 1.3064552109537e-05, "loss": 0.896, "step": 1315},
    {"epoch": 1.38801261829653, "grad_norm": 0.6895669186673943, "learning_rate": 1.3006247431937644e-05, "loss": 0.925, "step": 1320},
    {"epoch": 1.3932702418506835, "grad_norm": 0.6718431536804129, "learning_rate": 1.2947830187262514e-05, "loss": 0.9099, "step": 1325},
    {"epoch": 1.398527865404837, "grad_norm": 0.688445352407702, "learning_rate": 1.2889302562909214e-05, "loss": 0.8949, "step": 1330},
    {"epoch": 1.4037854889589905, "grad_norm": 0.6016293866381901, "learning_rate": 1.2830666750408434e-05, "loss": 0.9015, "step": 1335},
    {"epoch": 1.4090431125131442, "grad_norm": 0.6182893633299666, "learning_rate": 1.2771924945341906e-05, "loss": 0.9075, "step": 1340},
    {"epoch": 1.4143007360672977, "grad_norm": 0.6593893582600123, "learning_rate": 1.2713079347260198e-05, "loss": 0.8963, "step": 1345},
    {"epoch": 1.4195583596214512, "grad_norm": 0.6688143172592789, "learning_rate": 1.2654132159600327e-05, "loss": 0.9021, "step": 1350},
    {"epoch": 1.4248159831756047, "grad_norm": 0.6250269029897194, "learning_rate": 1.2595085589603281e-05, "loss": 0.9001, "step": 1355},
    {"epoch": 1.4300736067297581, "grad_norm": 0.6184329559921266, "learning_rate": 1.2535941848231352e-05, "loss": 0.8931, "step": 1360},
    {"epoch": 1.4353312302839116, "grad_norm": 0.6598155701237914, "learning_rate": 1.2476703150085356e-05, "loss": 0.9046, "step": 1365},
    {"epoch": 1.4405888538380651, "grad_norm": 0.6728059285538895, "learning_rate": 1.2417371713321713e-05, "loss": 0.9081, "step": 1370},
    {"epoch": 1.4458464773922186, "grad_norm": 0.6795053004000011, "learning_rate": 1.2357949759569372e-05, "loss": 0.8935, "step": 1375},
    {"epoch": 1.4511041009463723, "grad_norm": 0.6370835079324721, "learning_rate": 1.2298439513846634e-05, "loss": 0.9134, "step": 1380},
    {"epoch": 1.4563617245005258, "grad_norm": 0.6511674325575209, "learning_rate": 1.2238843204477855e-05, "loss": 0.9025, "step": 1385},
    {"epoch": 1.4616193480546793, "grad_norm": 0.6486276822993603, "learning_rate": 1.2179163063009974e-05, "loss": 0.9084, "step": 1390},
    {"epoch": 1.4668769716088328, "grad_norm": 0.6375900541444521, "learning_rate": 1.2119401324128976e-05, "loss": 0.892, "step": 1395},
    {"epoch": 1.4721345951629863, "grad_norm": 0.6636437536958206, "learning_rate": 1.2059560225576212e-05, "loss": 0.9126, "step": 1400},
    {"epoch": 1.4721345951629863, "eval_loss": 1.066650629043579, "eval_runtime": 578.6632, "eval_samples_per_second": 23.255, "eval_steps_per_second": 0.183, "step": 1400},
    {"epoch": 1.4773922187171398, "grad_norm": 0.6880602268392096, "learning_rate": 1.1999642008064612e-05, "loss": 0.9133, "step": 1405},
    {"epoch": 1.4826498422712935, "grad_norm": 0.6439745800900593, "learning_rate": 1.1939648915194766e-05, "loss": 0.8956, "step": 1410},
    {"epoch": 1.487907465825447, "grad_norm": 0.6333947925789535, "learning_rate": 1.1879583193370934e-05, "loss": 0.8967, "step": 1415},
    {"epoch": 1.4931650893796005, "grad_norm": 0.6887095313857406, "learning_rate": 1.1819447091716918e-05, "loss": 0.8953, "step": 1420},
    {"epoch": 1.498422712933754, "grad_norm": 0.8314454423988585, "learning_rate": 1.1759242861991855e-05, "loss": 0.9061, "step": 1425},
    {"epoch": 1.5036803364879074, "grad_norm": 0.6207340757493971, "learning_rate": 1.1698972758505891e-05, "loss": 0.884, "step": 1430},
    {"epoch": 1.508937960042061, "grad_norm": 0.6356005817235517, "learning_rate": 1.1638639038035771e-05, "loss": 0.9056, "step": 1435},
    {"epoch": 1.5141955835962144, "grad_norm": 0.6341731273814719, "learning_rate": 1.1578243959740345e-05, "loss": 0.8926, "step": 1440},
    {"epoch": 1.519453207150368, "grad_norm": 0.6524260051325438, "learning_rate": 1.1517789785075965e-05, "loss": 0.8925, "step": 1445},
    {"epoch": 1.5247108307045214, "grad_norm": 0.6390976768866661, "learning_rate": 1.1457278777711816e-05, "loss": 0.896, "step": 1450},
    {"epoch": 1.5299684542586751, "grad_norm": 0.672745789784435, "learning_rate": 1.139671320344514e-05, "loss": 0.8919, "step": 1455},
    {"epoch": 1.5352260778128286, "grad_norm": 0.6849640495250097, "learning_rate": 1.1336095330116406e-05, "loss": 0.8908, "step": 1460},
    {"epoch": 1.540483701366982, "grad_norm": 0.6909452334309092, "learning_rate": 1.127542742752439e-05, "loss": 0.901, "step": 1465},
    {"epoch": 1.5457413249211358, "grad_norm": 0.6514347502639167, "learning_rate": 1.1214711767341184e-05, "loss": 0.8886, "step": 1470},
    {"epoch": 1.5509989484752893, "grad_norm": 0.7670979545467012, "learning_rate": 1.1153950623027127e-05, "loss": 0.8915, "step": 1475},
    {"epoch": 1.5562565720294428, "grad_norm": 0.7094429002966973, "learning_rate": 1.1093146269745694e-05, "loss": 0.8986, "step": 1480},
    {"epoch": 1.5615141955835963, "grad_norm": 0.7040092519773771, "learning_rate": 1.1032300984278286e-05, "loss": 0.8995, "step": 1485},
    {"epoch": 1.5667718191377498, "grad_norm": 0.6717747776159033, "learning_rate": 1.0971417044938984e-05, "loss": 0.8894, "step": 1490},
    {"epoch": 1.5720294426919033, "grad_norm": 0.6111734491076107, "learning_rate": 1.091049673148924e-05, "loss": 0.8903, "step": 1495},
    {"epoch": 1.5772870662460567, "grad_norm": 0.6339144886316356, "learning_rate": 1.0849542325052514e-05, "loss": 0.9032, "step": 1500},
    {"epoch": 1.5772870662460567, "eval_loss": 1.060400366783142, "eval_runtime": 553.3344, "eval_samples_per_second": 24.32, "eval_steps_per_second": 0.192, "step": 1500},
    {"epoch": 1.5825446898002102, "grad_norm": 0.6119889525138412, "learning_rate": 1.0788556108028854e-05, "loss": 0.9059, "step": 1505},
    {"epoch": 1.5878023133543637, "grad_norm": 0.6610719745391888, "learning_rate": 1.072754036400944e-05, "loss": 0.8845, "step": 1510},
    {"epoch": 1.5930599369085172, "grad_norm": 0.6334246363490683, "learning_rate": 1.0666497377691067e-05, "loss": 0.909, "step": 1515},
    {"epoch": 1.598317560462671, "grad_norm": 0.6600607162051635, "learning_rate": 1.0605429434790607e-05, "loss": 0.9101, "step": 1520},
    {"epoch": 1.6035751840168244, "grad_norm": 0.6624807422048473, "learning_rate": 1.0544338821959407e-05, "loss": 0.8918, "step": 1525},
    {"epoch": 1.608832807570978, "grad_norm": 0.6540415860179337, "learning_rate": 1.0483227826697686e-05, "loss": 0.902, "step": 1530},
    {"epoch": 1.6140904311251314, "grad_norm": 0.6339684794581751, "learning_rate": 1.0422098737268862e-05, "loss": 0.9047, "step": 1535},
    {"epoch": 1.619348054679285, "grad_norm": 0.63411282308358, "learning_rate": 1.0360953842613886e-05, "loss": 0.9106, "step": 1540},
    {"epoch": 1.6246056782334386, "grad_norm": 0.6246624939138397, "learning_rate": 1.0299795432265516e-05, "loss": 0.8941, "step": 1545},
    {"epoch": 1.629863301787592, "grad_norm": 0.6422075365217625, "learning_rate": 1.0238625796262604e-05, "loss": 0.8969, "step": 1550},
    {"epoch": 1.6351209253417456, "grad_norm": 0.641718675847965, "learning_rate": 1.0177447225064334e-05, "loss": 0.8932, "step": 1555},
    {"epoch": 1.640378548895899, "grad_norm": 0.6996379461819543, "learning_rate": 1.0116262009464475e-05, "loss": 0.8988, "step": 1560},
    {"epoch": 1.6456361724500526, "grad_norm": 0.6496660294162664, "learning_rate": 1.0055072440505576e-05, "loss": 0.8857, "step": 1565},
    {"epoch": 1.650893796004206, "grad_norm": 0.6913136358312865, "learning_rate": 9.993880809393203e-06, "loss": 0.8953, "step": 1570},
    {"epoch": 1.6561514195583595, "grad_norm": 0.6323428927883549, "learning_rate": 9.932689407410136e-06, "loss": 0.894, "step": 1575},
    {"epoch": 1.661409043112513, "grad_norm": 0.7165826659774039, "learning_rate": 9.871500525830581e-06, "loss": 0.8946, "step": 1580},
    {"epoch": 1.6666666666666665, "grad_norm": 0.6630355223364007, "learning_rate": 9.810316455834359e-06, "loss": 0.8907, "step": 1585},
    {"epoch": 1.6719242902208202, "grad_norm": 0.6096362135364939, "learning_rate": 9.749139488421133e-06, "loss": 0.893, "step": 1590},
    {"epoch": 1.6771819137749737, "grad_norm": 0.6414609478289887, "learning_rate": 9.687971914324607e-06, "loss": 0.897, "step": 1595},
    {"epoch": 1.6824395373291272, "grad_norm": 0.6909628111495161, "learning_rate": 9.626816023926771e-06, "loss": 0.8882, "step": 1600},
    {"epoch": 1.6824395373291272, "eval_loss": 1.054638385772705, "eval_runtime": 554.7908, "eval_samples_per_second": 24.256, "eval_steps_per_second": 0.191, "step": 1600},
    {"epoch": 1.687697160883281, "grad_norm": 0.658967287448874, "learning_rate": 9.565674107172109e-06, "loss": 0.8963, "step": 1605},
    {"epoch": 1.6929547844374344, "grad_norm": 0.671113099618244, "learning_rate": 9.504548453481875e-06, "loss": 0.9006, "step": 1610},
    {"epoch": 1.698212407991588, "grad_norm": 0.6356681511467472, "learning_rate": 9.443441351668375e-06, "loss": 0.8855, "step": 1615},
    {"epoch": 1.7034700315457414, "grad_norm": 0.6733155642148883, "learning_rate": 9.382355089849235e-06, "loss": 0.8918, "step": 1620},
    {"epoch": 1.7087276550998949, "grad_norm": 0.6923042640634246, "learning_rate": 9.321291955361756e-06, "loss": 0.8933, "step": 1625},
    {"epoch": 1.7139852786540484, "grad_norm": 0.6424747471753014, "learning_rate": 9.260254234677235e-06, "loss": 0.8816, "step": 1630},
    {"epoch": 1.7192429022082019, "grad_norm": 0.6144029954554266, "learning_rate": 9.199244213315377e-06, "loss": 0.8905, "step": 1635},
    {"epoch": 1.7245005257623554, "grad_norm": 0.6170077707358106, "learning_rate": 9.138264175758693e-06, "loss": 0.8863, "step": 1640},
    {"epoch": 1.7297581493165088, "grad_norm": 0.6197301026220858, "learning_rate": 9.07731640536698e-06, "loss": 0.8796, "step": 1645},
    {"epoch": 1.7350157728706623, "grad_norm": 0.6643068163348533, "learning_rate": 9.016403184291805e-06, "loss": 0.8908, "step": 1650},
    {"epoch": 1.7402733964248158, "grad_norm": 0.5938702426426252, "learning_rate": 8.955526793391049e-06, "loss": 0.8902, "step": 1655},
    {"epoch": 1.7455310199789695, "grad_norm": 0.6474013520993763, "learning_rate": 8.894689512143528e-06, "loss": 0.8862, "step": 1660},
    {"epoch": 1.750788643533123, "grad_norm": 0.6350300886593221, "learning_rate": 8.833893618563604e-06, "loss": 0.8847, "step": 1665},
    {"epoch": 1.7560462670872765, "grad_norm": 0.6377181777254709, "learning_rate": 8.773141389115914e-06, "loss": 0.8865, "step": 1670},
    {"epoch": 1.7613038906414302, "grad_norm": 0.6115861001350186, "learning_rate": 8.712435098630116e-06, "loss": 0.8863, "step": 1675},
    {"epoch": 1.7665615141955837, "grad_norm": 0.6631610912721477, "learning_rate": 8.651777020215713e-06, "loss": 0.8959, "step": 1680},
    {"epoch": 1.7718191377497372, "grad_norm": 0.6241016927327407, "learning_rate": 8.591169425176931e-06, "loss": 0.8726, "step": 1685},
    {"epoch": 1.7770767613038907, "grad_norm": 0.6207578864949994, "learning_rate": 8.53061458292768e-06, "loss": 0.8892, "step": 1690},
    {"epoch": 1.7823343848580442, "grad_norm": 0.6848519519116634, "learning_rate": 8.470114760906583e-06, "loss": 0.8943, "step": 1695},
    {"epoch": 1.7875920084121977, "grad_norm": 0.6571666376626863, "learning_rate": 8.40967224492051e-06, "loss": 0.8847, "step": 1700},
    {"epoch": 1.7875920084121977, "eval_loss": 1.049035906791687, "eval_runtime": 554.1715, "eval_samples_per_second": 24.283, "eval_steps_per_second": 0.191, "step": 1700},
    {"epoch": 1.7928496319663512, "grad_norm": 0.645616472902103, "learning_rate": 8.349289236917482e-06, "loss": 0.8816, "step": 1705},
    {"epoch": 1.7981072555205047, "grad_norm": 0.6574673506951342, "learning_rate": 8.28896805918649e-06, "loss": 0.8648, "step": 1710},
    {"epoch": 1.8033648790746581, "grad_norm": 0.6469048695832662, "learning_rate": 8.228710949988283e-06, "loss": 0.8844, "step": 1715},
    {"epoch": 1.8086225026288116, "grad_norm": 0.6503850752063266, "learning_rate": 8.168520165613035e-06, "loss": 0.8927, "step": 1720},
    {"epoch": 1.8138801261829653, "grad_norm": 0.6478121209226875, "learning_rate": 8.108397959867445e-06, "loss": 0.8973, "step": 1725},
|
{ |
|
"epoch": 1.8191377497371188, |
|
"grad_norm": 0.6456428948521569, |
|
"learning_rate": 8.04834658399032e-06, |
|
"loss": 0.8829, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.8243953732912723, |
|
"grad_norm": 0.6790517960706193, |
|
"learning_rate": 7.988368286568287e-06, |
|
"loss": 0.8756, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.8296529968454258, |
|
"grad_norm": 0.7206882773594423, |
|
"learning_rate": 7.928465313451603e-06, |
|
"loss": 0.9051, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.8349106203995795, |
|
"grad_norm": 0.6598808403648849, |
|
"learning_rate": 7.868639907670042e-06, |
|
"loss": 0.9019, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.840168243953733, |
|
"grad_norm": 0.8136122353035425, |
|
"learning_rate": 7.808894309348925e-06, |
|
"loss": 0.8814, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.8454258675078865, |
|
"grad_norm": 0.6383354015679575, |
|
"learning_rate": 7.749230755625228e-06, |
|
"loss": 0.8775, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.85068349106204, |
|
"grad_norm": 0.6270079438127367, |
|
"learning_rate": 7.689651480563824e-06, |
|
"loss": 0.8959, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.8559411146161935, |
|
"grad_norm": 0.6235081226194247, |
|
"learning_rate": 7.630158715073813e-06, |
|
"loss": 0.8871, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.861198738170347, |
|
"grad_norm": 0.6526009528156013, |
|
"learning_rate": 7.570754686825004e-06, |
|
"loss": 0.8867, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.8664563617245005, |
|
"grad_norm": 0.6321251834720393, |
|
"learning_rate": 7.511441620164499e-06, |
|
"loss": 0.9111, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.871713985278654, |
|
"grad_norm": 0.6579757228675541, |
|
"learning_rate": 7.452221736033387e-06, |
|
"loss": 0.8758, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.8769716088328074, |
|
"grad_norm": 0.6288476910531294, |
|
"learning_rate": 7.393097251883609e-06, |
|
"loss": 0.8848, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.882229232386961, |
|
"grad_norm": 0.6671896908639643, |
|
"learning_rate": 7.334070381594904e-06, |
|
"loss": 0.8879, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.8874868559411146, |
|
"grad_norm": 0.6375714540658346, |
|
"learning_rate": 7.275143335391927e-06, |
|
"loss": 0.8871, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.8927444794952681, |
|
"grad_norm": 0.6461378473926269, |
|
"learning_rate": 7.21631831976147e-06, |
|
"loss": 0.8831, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8927444794952681, |
|
"eval_loss": 1.0454537868499756, |
|
"eval_runtime": 554.7002, |
|
"eval_samples_per_second": 24.26, |
|
"eval_steps_per_second": 0.191, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8980021030494216, |
|
"grad_norm": 0.6266360175385085, |
|
"learning_rate": 7.157597537369866e-06, |
|
"loss": 0.8836, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.9032597266035753, |
|
"grad_norm": 0.7106762429735706, |
|
"learning_rate": 7.098983186980495e-06, |
|
"loss": 0.8894, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.9085173501577288, |
|
"grad_norm": 0.6449309860617594, |
|
"learning_rate": 7.040477463371449e-06, |
|
"loss": 0.8961, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.9137749737118823, |
|
"grad_norm": 0.6118460786718801, |
|
"learning_rate": 6.982082557253371e-06, |
|
"loss": 0.8898, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.9190325972660358, |
|
"grad_norm": 0.6200070078112132, |
|
"learning_rate": 6.9238006551873985e-06, |
|
"loss": 0.8993, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.9242902208201893, |
|
"grad_norm": 0.62946195709294, |
|
"learning_rate": 6.86563393950331e-06, |
|
"loss": 0.8746, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.9295478443743428, |
|
"grad_norm": 0.6894329752058552, |
|
"learning_rate": 6.807584588217798e-06, |
|
"loss": 0.8768, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.9348054679284963, |
|
"grad_norm": 0.6337025218810814, |
|
"learning_rate": 6.749654774952925e-06, |
|
"loss": 0.8774, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.9400630914826498, |
|
"grad_norm": 0.6061458342443647, |
|
"learning_rate": 6.691846668854709e-06, |
|
"loss": 0.8925, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.9453207150368033, |
|
"grad_norm": 0.6323722322620482, |
|
"learning_rate": 6.634162434511939e-06, |
|
"loss": 0.8878, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.9505783385909568, |
|
"grad_norm": 0.6295608770739457, |
|
"learning_rate": 6.57660423187509e-06, |
|
"loss": 0.8894, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.9558359621451105, |
|
"grad_norm": 0.7448236764255614, |
|
"learning_rate": 6.519174216175458e-06, |
|
"loss": 0.884, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.961093585699264, |
|
"grad_norm": 0.6643005564433259, |
|
"learning_rate": 6.461874537844465e-06, |
|
"loss": 0.8712, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.9663512092534174, |
|
"grad_norm": 0.6460805751831616, |
|
"learning_rate": 6.404707342433123e-06, |
|
"loss": 0.8794, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.971608832807571, |
|
"grad_norm": 0.6437260367816269, |
|
"learning_rate": 6.347674770531716e-06, |
|
"loss": 0.8913, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.9768664563617246, |
|
"grad_norm": 0.6422567155892785, |
|
"learning_rate": 6.2907789576896125e-06, |
|
"loss": 0.8722, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.9821240799158781, |
|
"grad_norm": 0.6631332611742206, |
|
"learning_rate": 6.2340220343353455e-06, |
|
"loss": 0.8747, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.9873817034700316, |
|
"grad_norm": 0.5923326352879508, |
|
"learning_rate": 6.177406125696804e-06, |
|
"loss": 0.8863, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.9926393270241851, |
|
"grad_norm": 0.6040038053093328, |
|
"learning_rate": 6.120933351721665e-06, |
|
"loss": 0.8822, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.9978969505783386, |
|
"grad_norm": 0.6155901401028533, |
|
"learning_rate": 6.064605826998031e-06, |
|
"loss": 0.8781, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.9978969505783386, |
|
"eval_loss": 1.0413092374801636, |
|
"eval_runtime": 562.2304, |
|
"eval_samples_per_second": 23.935, |
|
"eval_steps_per_second": 0.189, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.003154574132492, |
|
"grad_norm": 0.9856009462574625, |
|
"learning_rate": 6.00842566067522e-06, |
|
"loss": 0.7565, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 2.0084121976866456, |
|
"grad_norm": 1.1010436371290768, |
|
"learning_rate": 5.952394956384823e-06, |
|
"loss": 0.7157, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.013669821240799, |
|
"grad_norm": 0.7976498381871772, |
|
"learning_rate": 5.896515812161896e-06, |
|
"loss": 0.7125, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 2.0189274447949526, |
|
"grad_norm": 0.7307266469267819, |
|
"learning_rate": 5.840790320366444e-06, |
|
"loss": 0.7208, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.024185068349106, |
|
"grad_norm": 0.7601898382424687, |
|
"learning_rate": 5.7852205676050355e-06, |
|
"loss": 0.7079, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.0294426919032595, |
|
"grad_norm": 0.7636705961643997, |
|
"learning_rate": 5.7298086346527e-06, |
|
"loss": 0.7021, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.034700315457413, |
|
"grad_norm": 0.7600945150765135, |
|
"learning_rate": 5.674556596374993e-06, |
|
"loss": 0.698, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 2.039957939011567, |
|
"grad_norm": 0.7283661435999434, |
|
"learning_rate": 5.619466521650309e-06, |
|
"loss": 0.7135, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.0452155625657205, |
|
"grad_norm": 0.673772367415323, |
|
"learning_rate": 5.564540473292433e-06, |
|
"loss": 0.712, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 2.050473186119874, |
|
"grad_norm": 0.7274211486508272, |
|
"learning_rate": 5.509780507973266e-06, |
|
"loss": 0.7316, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.0557308096740274, |
|
"grad_norm": 0.7019707530514135, |
|
"learning_rate": 5.455188676145846e-06, |
|
"loss": 0.7178, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 2.060988433228181, |
|
"grad_norm": 0.727356607819457, |
|
"learning_rate": 5.40076702196755e-06, |
|
"loss": 0.6901, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.0662460567823344, |
|
"grad_norm": 0.7393497514045044, |
|
"learning_rate": 5.346517583223567e-06, |
|
"loss": 0.7091, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 2.071503680336488, |
|
"grad_norm": 0.6909005753061759, |
|
"learning_rate": 5.292442391250567e-06, |
|
"loss": 0.7103, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.0767613038906414, |
|
"grad_norm": 0.7199779190451211, |
|
"learning_rate": 5.238543470860677e-06, |
|
"loss": 0.7142, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.082018927444795, |
|
"grad_norm": 0.6986050924763797, |
|
"learning_rate": 5.184822840265635e-06, |
|
"loss": 0.719, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.0872765509989484, |
|
"grad_norm": 0.6873483374112779, |
|
"learning_rate": 5.131282511001221e-06, |
|
"loss": 0.7188, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 2.092534174553102, |
|
"grad_norm": 0.704017833699201, |
|
"learning_rate": 5.077924487851954e-06, |
|
"loss": 0.7206, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.0977917981072554, |
|
"grad_norm": 0.6869215244017003, |
|
"learning_rate": 5.024750768776011e-06, |
|
"loss": 0.7197, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 2.103049421661409, |
|
"grad_norm": 0.6616206251205331, |
|
"learning_rate": 4.971763344830419e-06, |
|
"loss": 0.7197, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.103049421661409, |
|
"eval_loss": 1.0822256803512573, |
|
"eval_runtime": 566.9236, |
|
"eval_samples_per_second": 23.737, |
|
"eval_steps_per_second": 0.187, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.108307045215563, |
|
"grad_norm": 0.747519024431639, |
|
"learning_rate": 4.91896420009649e-06, |
|
"loss": 0.7115, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 2.1135646687697163, |
|
"grad_norm": 0.7095283324919017, |
|
"learning_rate": 4.866355311605547e-06, |
|
"loss": 0.7215, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.1188222923238698, |
|
"grad_norm": 0.7245597363837365, |
|
"learning_rate": 4.813938649264881e-06, |
|
"loss": 0.7038, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 2.1240799158780233, |
|
"grad_norm": 0.7212203821120433, |
|
"learning_rate": 4.7617161757839895e-06, |
|
"loss": 0.715, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.1293375394321767, |
|
"grad_norm": 0.7027132940392441, |
|
"learning_rate": 4.7096898466010976e-06, |
|
"loss": 0.716, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.1345951629863302, |
|
"grad_norm": 0.6920491890608464, |
|
"learning_rate": 4.657861609809923e-06, |
|
"loss": 0.7027, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.1398527865404837, |
|
"grad_norm": 0.7246862757367895, |
|
"learning_rate": 4.6062334060867416e-06, |
|
"loss": 0.7211, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.145110410094637, |
|
"grad_norm": 0.6816731320053306, |
|
"learning_rate": 4.554807168617703e-06, |
|
"loss": 0.7127, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.1503680336487907, |
|
"grad_norm": 0.7013385203267727, |
|
"learning_rate": 4.5035848230264715e-06, |
|
"loss": 0.7158, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.155625657202944, |
|
"grad_norm": 0.7169543079018775, |
|
"learning_rate": 4.452568287302088e-06, |
|
"loss": 0.7071, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.1608832807570977, |
|
"grad_norm": 0.713248407044651, |
|
"learning_rate": 4.40175947172719e-06, |
|
"loss": 0.7068, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.166140904311251, |
|
"grad_norm": 0.6698951380098755, |
|
"learning_rate": 4.351160278806444e-06, |
|
"loss": 0.7169, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.1713985278654047, |
|
"grad_norm": 0.6926886822542322, |
|
"learning_rate": 4.300772603195335e-06, |
|
"loss": 0.7097, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.176656151419558, |
|
"grad_norm": 0.7101604887955768, |
|
"learning_rate": 4.250598331629215e-06, |
|
"loss": 0.7199, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.181913774973712, |
|
"grad_norm": 0.6817786841786956, |
|
"learning_rate": 4.200639342852648e-06, |
|
"loss": 0.709, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.1871713985278656, |
|
"grad_norm": 0.670024634466742, |
|
"learning_rate": 4.150897507549076e-06, |
|
"loss": 0.7031, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.192429022082019, |
|
"grad_norm": 0.704511383930273, |
|
"learning_rate": 4.101374688270751e-06, |
|
"loss": 0.716, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.1976866456361726, |
|
"grad_norm": 0.6737857814580686, |
|
"learning_rate": 4.052072739369015e-06, |
|
"loss": 0.7151, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.202944269190326, |
|
"grad_norm": 0.7004818342552892, |
|
"learning_rate": 4.0029935069248494e-06, |
|
"loss": 0.7084, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.2082018927444795, |
|
"grad_norm": 0.6938485406548258, |
|
"learning_rate": 3.954138828679762e-06, |
|
"loss": 0.7137, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.2082018927444795, |
|
"eval_loss": 1.0840835571289062, |
|
"eval_runtime": 554.526, |
|
"eval_samples_per_second": 24.268, |
|
"eval_steps_per_second": 0.191, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.213459516298633, |
|
"grad_norm": 0.6902078976776752, |
|
"learning_rate": 3.905510533966959e-06, |
|
"loss": 0.7096, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.2187171398527865, |
|
"grad_norm": 0.7110522716973304, |
|
"learning_rate": 3.857110443642864e-06, |
|
"loss": 0.6949, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.22397476340694, |
|
"grad_norm": 0.7247408104466715, |
|
"learning_rate": 3.8089403700189254e-06, |
|
"loss": 0.7187, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.2292323869610935, |
|
"grad_norm": 0.7097288878868501, |
|
"learning_rate": 3.7610021167937526e-06, |
|
"loss": 0.7036, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.234490010515247, |
|
"grad_norm": 0.7612906599424331, |
|
"learning_rate": 3.713297478985595e-06, |
|
"loss": 0.7205, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.2397476340694005, |
|
"grad_norm": 0.7985865232124967, |
|
"learning_rate": 3.6658282428651026e-06, |
|
"loss": 0.7018, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.245005257623554, |
|
"grad_norm": 0.6445514804150951, |
|
"learning_rate": 3.618596185888471e-06, |
|
"loss": 0.6983, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.250262881177708, |
|
"grad_norm": 0.6788252376343907, |
|
"learning_rate": 3.5716030766308553e-06, |
|
"loss": 0.6963, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.2555205047318614, |
|
"grad_norm": 0.6558652902911214, |
|
"learning_rate": 3.5248506747201694e-06, |
|
"loss": 0.6988, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.260778128286015, |
|
"grad_norm": 0.727190238646923, |
|
"learning_rate": 3.4783407307711913e-06, |
|
"loss": 0.701, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.2660357518401684, |
|
"grad_norm": 0.7053251271830925, |
|
"learning_rate": 3.4320749863199987e-06, |
|
"loss": 0.7038, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.271293375394322, |
|
"grad_norm": 0.691685408706534, |
|
"learning_rate": 3.3860551737587857e-06, |
|
"loss": 0.7068, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.2765509989484753, |
|
"grad_norm": 0.6897266118308167, |
|
"learning_rate": 3.3402830162709644e-06, |
|
"loss": 0.703, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.281808622502629, |
|
"grad_norm": 0.6917521598477109, |
|
"learning_rate": 3.2947602277666678e-06, |
|
"loss": 0.7136, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.2870662460567823, |
|
"grad_norm": 0.6899343095386444, |
|
"learning_rate": 3.2494885128185517e-06, |
|
"loss": 0.6984, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.292323869610936, |
|
"grad_norm": 0.6869089208872174, |
|
"learning_rate": 3.2044695665979865e-06, |
|
"loss": 0.724, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.2975814931650893, |
|
"grad_norm": 0.7005346292608602, |
|
"learning_rate": 3.1597050748115655e-06, |
|
"loss": 0.7035, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.302839116719243, |
|
"grad_norm": 0.7061499912056902, |
|
"learning_rate": 3.115196713638e-06, |
|
"loss": 0.6865, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.3080967402733963, |
|
"grad_norm": 0.6815319705079519, |
|
"learning_rate": 3.0709461496653504e-06, |
|
"loss": 0.7156, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.3133543638275498, |
|
"grad_norm": 0.7049825225126681, |
|
"learning_rate": 3.0269550398286096e-06, |
|
"loss": 0.7115, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.3133543638275498, |
|
"eval_loss": 1.0800352096557617, |
|
"eval_runtime": 568.5479, |
|
"eval_samples_per_second": 23.669, |
|
"eval_steps_per_second": 0.186, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.3186119873817033, |
|
"grad_norm": 0.6675183707377966, |
|
"learning_rate": 2.983225031347683e-06, |
|
"loss": 0.7087, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.3238696109358568, |
|
"grad_norm": 0.7114348169331429, |
|
"learning_rate": 2.939757761665686e-06, |
|
"loss": 0.7077, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.3291272344900107, |
|
"grad_norm": 0.7191874914216904, |
|
"learning_rate": 2.8965548583876534e-06, |
|
"loss": 0.7201, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.334384858044164, |
|
"grad_norm": 0.6766258501238187, |
|
"learning_rate": 2.853617939219574e-06, |
|
"loss": 0.7072, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.3396424815983177, |
|
"grad_norm": 0.7028752741574394, |
|
"learning_rate": 2.810948611907832e-06, |
|
"loss": 0.6955, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.344900105152471, |
|
"grad_norm": 0.7210493538085075, |
|
"learning_rate": 2.7685484741790023e-06, |
|
"loss": 0.7129, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.3501577287066246, |
|
"grad_norm": 0.6928964162595481, |
|
"learning_rate": 2.7264191136800112e-06, |
|
"loss": 0.6873, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.355415352260778, |
|
"grad_norm": 0.6949752358383088, |
|
"learning_rate": 2.6845621079187122e-06, |
|
"loss": 0.7207, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.3606729758149316, |
|
"grad_norm": 0.7000497878298911, |
|
"learning_rate": 2.6429790242047927e-06, |
|
"loss": 0.7019, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.365930599369085, |
|
"grad_norm": 0.6655488986940491, |
|
"learning_rate": 2.6016714195911085e-06, |
|
"loss": 0.6909, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.3711882229232386, |
|
"grad_norm": 0.6946100724369102, |
|
"learning_rate": 2.560640840815363e-06, |
|
"loss": 0.703, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.376445846477392, |
|
"grad_norm": 0.6799665527381428, |
|
"learning_rate": 2.5198888242422014e-06, |
|
"loss": 0.7029, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.3817034700315456, |
|
"grad_norm": 0.698092499847167, |
|
"learning_rate": 2.4794168958056854e-06, |
|
"loss": 0.706, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.386961093585699, |
|
"grad_norm": 0.6725956864860293, |
|
"learning_rate": 2.439226570952137e-06, |
|
"loss": 0.7087, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.392218717139853, |
|
"grad_norm": 0.7109494323803826, |
|
"learning_rate": 2.3993193545834182e-06, |
|
"loss": 0.7125, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.3974763406940065, |
|
"grad_norm": 0.7088160313512611, |
|
"learning_rate": 2.35969674100056e-06, |
|
"loss": 0.6979, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.40273396424816, |
|
"grad_norm": 0.6826523489540324, |
|
"learning_rate": 2.3203602138478264e-06, |
|
"loss": 0.7055, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.4079915878023135, |
|
"grad_norm": 0.6930882874841964, |
|
"learning_rate": 2.281311246057143e-06, |
|
"loss": 0.7201, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.413249211356467, |
|
"grad_norm": 0.6782194389254947, |
|
"learning_rate": 2.242551299792962e-06, |
|
"loss": 0.7278, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.4185068349106205, |
|
"grad_norm": 0.6611886260527141, |
|
"learning_rate": 2.204081826397494e-06, |
|
"loss": 0.7178, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.4185068349106205, |
|
"eval_loss": 1.0789012908935547, |
|
"eval_runtime": 548.9059, |
|
"eval_samples_per_second": 24.516, |
|
"eval_steps_per_second": 0.193, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.423764458464774, |
|
"grad_norm": 0.6913748928617807, |
|
"learning_rate": 2.1659042663363795e-06, |
|
"loss": 0.7031, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.4290220820189274, |
|
"grad_norm": 0.68971986235768, |
|
"learning_rate": 2.1280200491447465e-06, |
|
"loss": 0.6902, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.434279705573081, |
|
"grad_norm": 0.7068453091320502, |
|
"learning_rate": 2.0904305933736714e-06, |
|
"loss": 0.7064, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.4395373291272344, |
|
"grad_norm": 0.7009937280786678, |
|
"learning_rate": 2.053137306537082e-06, |
|
"loss": 0.702, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.444794952681388, |
|
"grad_norm": 0.7009541498050648, |
|
"learning_rate": 2.0161415850590327e-06, |
|
"loss": 0.7072, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.4500525762355414, |
|
"grad_norm": 0.6679413662712783, |
|
"learning_rate": 1.9794448142214396e-06, |
|
"loss": 0.7121, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.455310199789695, |
|
"grad_norm": 0.6929272185822167, |
|
"learning_rate": 1.9430483681121836e-06, |
|
"loss": 0.7164, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.4605678233438484, |
|
"grad_norm": 0.7778000958451866, |
|
"learning_rate": 1.9069536095736817e-06, |
|
"loss": 0.7091, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.465825446898002, |
|
"grad_norm": 0.6672776696135466, |
|
"learning_rate": 1.8711618901518446e-06, |
|
"loss": 0.7132, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.471083070452156, |
|
"grad_norm": 0.6949140160619673, |
|
"learning_rate": 1.8356745500454699e-06, |
|
"loss": 0.6974, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.4763406940063093, |
|
"grad_norm": 0.6950911698278153, |
|
"learning_rate": 1.8004929180560582e-06, |
|
"loss": 0.6894, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.481598317560463, |
|
"grad_norm": 0.6826148060946653, |
|
"learning_rate": 1.7656183115380577e-06, |
|
"loss": 0.7043, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.4868559411146163, |
|
"grad_norm": 0.7310354415413428, |
|
"learning_rate": 1.7310520363495454e-06, |
|
"loss": 0.7021, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.4921135646687698, |
|
"grad_norm": 0.6754671470342107, |
|
"learning_rate": 1.6967953868033104e-06, |
|
"loss": 0.7043, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.4973711882229233, |
|
"grad_norm": 0.6935442287350769, |
|
"learning_rate": 1.6628496456184107e-06, |
|
"loss": 0.6994, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.5026288117770767, |
|
"grad_norm": 0.690259266155438, |
|
"learning_rate": 1.6292160838721316e-06, |
|
"loss": 0.6946, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.5078864353312302, |
|
"grad_norm": 0.6934285014568452, |
|
"learning_rate": 1.5958959609523905e-06, |
|
"loss": 0.719, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.5131440588853837, |
|
"grad_norm": 0.706595235609839, |
|
"learning_rate": 1.562890524510583e-06, |
|
"loss": 0.699, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.518401682439537, |
|
"grad_norm": 0.7031045404384867, |
|
"learning_rate": 1.530201010414859e-06, |
|
"loss": 0.7019, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.5236593059936907, |
|
"grad_norm": 0.6611225731580428, |
|
"learning_rate": 1.4978286427038602e-06, |
|
"loss": 0.7063, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.5236593059936907, |
|
"eval_loss": 1.0776675939559937, |
|
"eval_runtime": 549.0786, |
|
"eval_samples_per_second": 24.508, |
|
"eval_steps_per_second": 0.193, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.5289169295478446, |
|
"grad_norm": 0.6876289627741422, |
|
"learning_rate": 1.4657746335408695e-06, |
|
"loss": 0.7068, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.534174553101998, |
|
"grad_norm": 0.680233555417602, |
|
"learning_rate": 1.4340401831684413e-06, |
|
"loss": 0.6807, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.5394321766561516, |
|
"grad_norm": 0.6654932547762412, |
|
"learning_rate": 1.4026264798634359e-06, |
|
"loss": 0.7179, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.544689800210305, |
|
"grad_norm": 0.6945732690751362, |
|
"learning_rate": 1.371534699892547e-06, |
|
"loss": 0.7086, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.5499474237644586, |
|
"grad_norm": 0.6862420273962914, |
|
"learning_rate": 1.3407660074682472e-06, |
|
"loss": 0.7028, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.555205047318612, |
|
"grad_norm": 0.651460129300283, |
|
"learning_rate": 1.3103215547051962e-06, |
|
"loss": 0.6975, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.5604626708727656, |
|
"grad_norm": 0.6970590762896678, |
|
"learning_rate": 1.2802024815770942e-06, |
|
"loss": 0.7115, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.565720294426919, |
|
"grad_norm": 0.6744240212503375, |
|
"learning_rate": 1.250409915874007e-06, |
|
"loss": 0.7057, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.5709779179810726, |
|
"grad_norm": 0.6699733139877856, |
|
"learning_rate": 1.220944973160133e-06, |
|
"loss": 0.6884, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.576235541535226, |
|
"grad_norm": 0.6915305368046275, |
|
"learning_rate": 1.1918087567320257e-06, |
|
"loss": 0.7026, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.5814931650893795, |
|
"grad_norm": 0.6755768658668228, |
|
"learning_rate": 1.1630023575772908e-06, |
|
"loss": 0.6966, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.586750788643533, |
|
"grad_norm": 0.705779731938613, |
|
"learning_rate": 1.1345268543337283e-06, |
|
"loss": 0.6988, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.5920084121976865, |
|
"grad_norm": 0.7111985726538933, |
|
"learning_rate": 1.1063833132489477e-06, |
|
"loss": 0.696, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.59726603575184, |
|
"grad_norm": 0.6539744158999056, |
|
"learning_rate": 1.0785727881404329e-06, |
|
"loss": 0.6961, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.6025236593059935, |
|
"grad_norm": 0.6848492944946433, |
|
"learning_rate": 1.051096320356103e-06, |
|
"loss": 0.7046, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.607781282860147, |
|
"grad_norm": 0.7032823101149783, |
|
"learning_rate": 1.0239549387352954e-06, |
|
"loss": 0.7201, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.6130389064143005, |
|
"grad_norm": 0.6762173164818084, |
|
"learning_rate": 9.97149659570259e-07, |
|
"loss": 0.7116, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.6182965299684544, |
|
"grad_norm": 0.6806035208648271, |
|
"learning_rate": 9.706814865680957e-07, |
|
"loss": 0.7045, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.623554153522608, |
|
"grad_norm": 0.6776503088053696, |
|
"learning_rate": 9.445514108131693e-07, |
|
"loss": 0.6888, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.6288117770767614, |
|
"grad_norm": 0.6836339268439919, |
|
"learning_rate": 9.187604107300107e-07, |
|
"loss": 0.6964, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.6288117770767614, |
|
"eval_loss": 1.0754879713058472, |
|
"eval_runtime": 544.4972, |
|
"eval_samples_per_second": 24.715, |
|
"eval_steps_per_second": 0.195, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.634069400630915, |
|
"grad_norm": 0.6761130619047382, |
|
"learning_rate": 8.933094520466634e-07, |
|
"loss": 0.7058, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.6393270241850684, |
|
"grad_norm": 0.6672694366752451, |
|
"learning_rate": 8.681994877585365e-07, |
|
"loss": 0.7054, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.644584647739222, |
|
"grad_norm": 0.7017173692899314, |
|
"learning_rate": 8.434314580927105e-07, |
|
"loss": 0.7003, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.6498422712933754, |
|
"grad_norm": 0.6828167224204641, |
|
"learning_rate": 8.19006290472737e-07, |
|
"loss": 0.7134, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.655099894847529, |
|
"grad_norm": 0.6887161892823586, |
|
"learning_rate": 7.949248994839131e-07, |
|
"loss": 0.7107, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.6603575184016823, |
|
"grad_norm": 0.6858305599284509, |
|
"learning_rate": 7.711881868390292e-07, |
|
"loss": 0.7185, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.665615141955836, |
|
"grad_norm": 0.6919951634850794, |
|
"learning_rate": 7.477970413446089e-07, |
|
"loss": 0.7038, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.6708727655099898, |
|
"grad_norm": 0.7059421711173827, |
|
"learning_rate": 7.247523388676292e-07, |
|
"loss": 0.6934, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.6761303890641432, |
|
"grad_norm": 0.697370543891664, |
|
"learning_rate": 7.020549423027223e-07, |
|
"loss": 0.6874, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.6813880126182967, |
|
"grad_norm": 0.6851210955122395, |
|
"learning_rate": 6.797057015398634e-07, |
|
"loss": 0.7091, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.6866456361724502, |
|
"grad_norm": 0.6810814971271851, |
|
"learning_rate": 6.577054534325511e-07, |
|
"loss": 0.6935, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.6919032597266037, |
|
"grad_norm": 0.6676833725760639, |
|
"learning_rate": 6.360550217664685e-07, |
|
"loss": 0.7088, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.697160883280757, |
|
"grad_norm": 0.7148977742599517, |
|
"learning_rate": 6.147552172286375e-07, |
|
"loss": 0.6987, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.7024185068349107, |
|
"grad_norm": 0.6475197510665502, |
|
"learning_rate": 5.938068373770667e-07, |
|
"loss": 0.6864, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.707676130389064, |
|
"grad_norm": 0.685110898697612, |
|
"learning_rate": 5.732106666108827e-07, |
|
"loss": 0.6937, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.7129337539432177, |
|
"grad_norm": 0.6850644373487722, |
|
"learning_rate": 5.529674761409643e-07, |
|
"loss": 0.701, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.718191377497371, |
|
"grad_norm": 0.6619622645326332, |
|
"learning_rate": 5.330780239610534e-07, |
|
"loss": 0.705, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.7234490010515247, |
|
"grad_norm": 0.6779887305496379, |
|
"learning_rate": 5.135430548193909e-07, |
|
"loss": 0.6912, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.728706624605678, |
|
"grad_norm": 0.6695357873979283, |
|
"learning_rate": 4.943633001908111e-07, |
|
"loss": 0.7007, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.7339642481598316, |
|
"grad_norm": 0.6851094475471325, |
|
"learning_rate": 4.7553947824936496e-07, |
|
"loss": 0.7121, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.7339642481598316, |
|
"eval_loss": 1.0742169618606567, |
|
"eval_runtime": 543.8651, |
|
"eval_samples_per_second": 24.743, |
|
"eval_steps_per_second": 0.195, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.739221871713985, |
|
"grad_norm": 0.6798881286754066, |
|
"learning_rate": 4.5707229384142184e-07, |
|
"loss": 0.7043, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.7444794952681386, |
|
"grad_norm": 0.6627199879579073, |
|
"learning_rate": 4.3896243845927943e-07, |
|
"loss": 0.7083, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.749737118822292, |
|
"grad_norm": 0.6911107462785068, |
|
"learning_rate": 4.21210590215273e-07, |
|
"loss": 0.7062, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.7549947423764456, |
|
"grad_norm": 0.6538298159253733, |
|
"learning_rate": 4.0381741381638085e-07, |
|
"loss": 0.6919, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.7602523659305995, |
|
"grad_norm": 0.6913261772512153, |
|
"learning_rate": 3.8678356053933666e-07, |
|
"loss": 0.6899, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.765509989484753, |
|
"grad_norm": 0.6731586319154937, |
|
"learning_rate": 3.7010966820623996e-07, |
|
"loss": 0.7115, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.7707676130389065, |
|
"grad_norm": 0.6739111157184594, |
|
"learning_rate": 3.5379636116067764e-07, |
|
"loss": 0.6938, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.77602523659306, |
|
"grad_norm": 0.6775894239204638, |
|
"learning_rate": 3.378442502443424e-07, |
|
"loss": 0.7018, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.7812828601472135, |
|
"grad_norm": 0.6630535974515509, |
|
"learning_rate": 3.222539327741592e-07, |
|
"loss": 0.7108, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.786540483701367, |
|
"grad_norm": 0.6476313251006354, |
|
"learning_rate": 3.070259925199248e-07, |
|
"loss": 0.7064, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.7917981072555205, |
|
"grad_norm": 0.6793550821713811, |
|
"learning_rate": 2.921609996824437e-07, |
|
"loss": 0.686, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.797055730809674, |
|
"grad_norm": 0.6950659181503308, |
|
"learning_rate": 2.7765951087218134e-07, |
|
"loss": 0.6922, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.8023133543638274, |
|
"grad_norm": 0.6759277309855073, |
|
"learning_rate": 2.6352206908841325e-07, |
|
"loss": 0.7123, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.807570977917981, |
|
"grad_norm": 0.6871290912583685, |
|
"learning_rate": 2.497492036989058e-07, |
|
"loss": 0.7071, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.812828601472135, |
|
"grad_norm": 0.6672178424750838, |
|
"learning_rate": 2.3634143042008396e-07, |
|
"loss": 0.7055, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.8180862250262884, |
|
"grad_norm": 0.6871427641549465, |
|
"learning_rate": 2.2329925129772613e-07, |
|
"loss": 0.7162, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.823343848580442, |
|
"grad_norm": 0.6996639531083144, |
|
"learning_rate": 2.1062315468816318e-07, |
|
"loss": 0.7116, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.8286014721345953, |
|
"grad_norm": 0.7057461914462779, |
|
"learning_rate": 1.9831361523999227e-07, |
|
"loss": 0.6978, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.833859095688749, |
|
"grad_norm": 0.6606180852855636, |
|
"learning_rate": 1.8637109387630637e-07, |
|
"loss": 0.6872, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.8391167192429023, |
|
"grad_norm": 0.6603518954437334, |
|
"learning_rate": 1.7479603777742937e-07, |
|
"loss": 0.7049, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.8391167192429023, |
|
"eval_loss": 1.074755311012268, |
|
"eval_runtime": 548.3041, |
|
"eval_samples_per_second": 24.543, |
|
"eval_steps_per_second": 0.193, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.844374342797056, |
|
"grad_norm": 0.7039186631952389, |
|
"learning_rate": 1.6358888036418053e-07, |
|
"loss": 0.7076, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.8496319663512093, |
|
"grad_norm": 0.6613941861667958, |
|
"learning_rate": 1.5275004128163407e-07, |
|
"loss": 0.7022, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.854889589905363, |
|
"grad_norm": 0.6784432805911156, |
|
"learning_rate": 1.422799263834196e-07, |
|
"loss": 0.7018, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.8601472134595163, |
|
"grad_norm": 0.662880920108081, |
|
"learning_rate": 1.3217892771651087e-07, |
|
"loss": 0.7039, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.8654048370136698, |
|
"grad_norm": 0.674177068306156, |
|
"learning_rate": 1.224474235065587e-07, |
|
"loss": 0.6948, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.8706624605678233, |
|
"grad_norm": 0.6576941034750949, |
|
"learning_rate": 1.1308577814371669e-07, |
|
"loss": 0.6959, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.8759200841219767, |
|
"grad_norm": 0.6877738227702634, |
|
"learning_rate": 1.040943421690055e-07, |
|
"loss": 0.7016, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.8811777076761302, |
|
"grad_norm": 0.6570796449184478, |
|
"learning_rate": 9.547345226118666e-08, |
|
"loss": 0.7008, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.8864353312302837, |
|
"grad_norm": 0.6556870027002477, |
|
"learning_rate": 8.722343122414823e-08, |
|
"loss": 0.7114, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.891692954784437, |
|
"grad_norm": 0.6525356309193387, |
|
"learning_rate": 7.93445879748267e-08, |
|
"loss": 0.705, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.8969505783385907, |
|
"grad_norm": 0.6979809421888648, |
|
"learning_rate": 7.183721753163508e-08, |
|
"loss": 0.705, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 2.9022082018927446, |
|
"grad_norm": 0.6680505376816218, |
|
"learning_rate": 6.470160100341516e-08, |
|
"loss": 0.7028, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.907465825446898, |
|
"grad_norm": 0.6754425700333265, |
|
"learning_rate": 5.793800557891471e-08, |
|
"loss": 0.6969, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.9127234490010516, |
|
"grad_norm": 0.6770770823855421, |
|
"learning_rate": 5.154668451678224e-08, |
|
"loss": 0.709, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.917981072555205, |
|
"grad_norm": 0.6880130710385723, |
|
"learning_rate": 4.552787713608231e-08, |
|
"loss": 0.69, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.9232386961093586, |
|
"grad_norm": 0.6625414510833385, |
|
"learning_rate": 3.988180880733161e-08, |
|
"loss": 0.6962, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.928496319663512, |
|
"grad_norm": 0.6643252155800653, |
|
"learning_rate": 3.460869094407127e-08, |
|
"loss": 0.7037, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.9337539432176656, |
|
"grad_norm": 0.6897645676504198, |
|
"learning_rate": 2.9708720994934272e-08, |
|
"loss": 0.6896, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.939011566771819, |
|
"grad_norm": 0.7113672933129457, |
|
"learning_rate": 2.5182082436266963e-08, |
|
"loss": 0.7165, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.9442691903259726, |
|
"grad_norm": 0.6781710312687059, |
|
"learning_rate": 2.1028944765251193e-08, |
|
"loss": 0.7024, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.9442691903259726, |
|
"eval_loss": 1.074735403060913, |
|
"eval_runtime": 544.9092, |
|
"eval_samples_per_second": 24.696, |
|
"eval_steps_per_second": 0.195, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.949526813880126, |
|
"grad_norm": 0.7502190973801118, |
|
"learning_rate": 1.724946349355605e-08, |
|
"loss": 0.6952, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.9547844374342795, |
|
"grad_norm": 0.6554060805074167, |
|
"learning_rate": 1.3843780141521435e-08, |
|
"loss": 0.7095, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.9600420609884335, |
|
"grad_norm": 0.6884790361695539, |
|
"learning_rate": 1.081202223285449e-08, |
|
"loss": 0.7096, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 2.965299684542587, |
|
"grad_norm": 0.6687316519292371, |
|
"learning_rate": 8.154303289854559e-09, |
|
"loss": 0.7071, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.9705573080967405, |
|
"grad_norm": 0.6719077380861403, |
|
"learning_rate": 5.870722829164344e-09, |
|
"loss": 0.6954, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.975814931650894, |
|
"grad_norm": 0.6445219670997994, |
|
"learning_rate": 3.9613663580406745e-09, |
|
"loss": 0.6844, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.9810725552050474, |
|
"grad_norm": 0.6702818163839258, |
|
"learning_rate": 2.426305371155957e-09, |
|
"loss": 0.6924, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.986330178759201, |
|
"grad_norm": 0.6546313538456479, |
|
"learning_rate": 1.265597347920311e-09, |
|
"loss": 0.7013, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.9915878023133544, |
|
"grad_norm": 0.6790610179426215, |
|
"learning_rate": 4.792857503266301e-10, |
|
"loss": 0.7013, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 2.996845425867508, |
|
"grad_norm": 0.6610872038208641, |
|
"learning_rate": 6.740002132743506e-11, |
|
"loss": 0.708, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 2853, |
|
"total_flos": 1194720315310080.0, |
|
"train_loss": 0.8973418972260736, |
|
"train_runtime": 76133.7056, |
|
"train_samples_per_second": 4.793, |
|
"train_steps_per_second": 0.037 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2853, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1194720315310080.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|