{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1047, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028653295128939827, "grad_norm": 257.7583052380938, "learning_rate": 1.904761904761905e-07, "loss": 9.25, "step": 1 }, { "epoch": 0.014326647564469915, "grad_norm": 241.29073677978286, "learning_rate": 9.523809523809525e-07, "loss": 9.2969, "step": 5 }, { "epoch": 0.02865329512893983, "grad_norm": 225.58659295977776, "learning_rate": 1.904761904761905e-06, "loss": 9.0344, "step": 10 }, { "epoch": 0.04297994269340974, "grad_norm": 87.70853454265482, "learning_rate": 2.8571428571428573e-06, "loss": 7.9078, "step": 15 }, { "epoch": 0.05730659025787966, "grad_norm": 43.9183222857209, "learning_rate": 3.80952380952381e-06, "loss": 6.8359, "step": 20 }, { "epoch": 0.07163323782234957, "grad_norm": 31.094485209793813, "learning_rate": 4.761904761904762e-06, "loss": 5.4719, "step": 25 }, { "epoch": 0.08595988538681948, "grad_norm": 13.666017374081363, "learning_rate": 5.7142857142857145e-06, "loss": 4.1055, "step": 30 }, { "epoch": 0.10028653295128939, "grad_norm": 10.161245236008533, "learning_rate": 6.666666666666667e-06, "loss": 3.25, "step": 35 }, { "epoch": 0.11461318051575932, "grad_norm": 3.2026690104839344, "learning_rate": 7.61904761904762e-06, "loss": 2.3469, "step": 40 }, { "epoch": 0.12893982808022922, "grad_norm": 1.9300134153720614, "learning_rate": 8.571428571428571e-06, "loss": 1.9957, "step": 45 }, { "epoch": 0.14326647564469913, "grad_norm": 0.868748661209738, "learning_rate": 9.523809523809525e-06, "loss": 1.7973, "step": 50 }, { "epoch": 0.15759312320916904, "grad_norm": 0.8403053660761957, "learning_rate": 1.0476190476190477e-05, "loss": 1.777, "step": 55 }, { "epoch": 0.17191977077363896, "grad_norm": 0.6204488839484352, "learning_rate": 1.1428571428571429e-05, "loss": 1.6609, "step": 60 }, { "epoch": 0.18624641833810887, "grad_norm": 0.46072894517704527, "learning_rate": 1.2380952380952383e-05, "loss": 1.5734, "step": 65 }, { "epoch": 0.20057306590257878, "grad_norm": 0.40370134421433296, "learning_rate": 1.3333333333333333e-05, "loss": 1.568, "step": 70 }, { "epoch": 0.2148997134670487, "grad_norm": 0.4261850895407627, "learning_rate": 1.4285714285714287e-05, "loss": 1.5102, "step": 75 }, { "epoch": 0.22922636103151864, "grad_norm": 0.385686551593444, "learning_rate": 1.523809523809524e-05, "loss": 1.5504, "step": 80 }, { "epoch": 0.24355300859598855, "grad_norm": 0.3521586438938912, "learning_rate": 1.6190476190476193e-05, "loss": 1.4641, "step": 85 }, { "epoch": 0.25787965616045844, "grad_norm": 0.37387478168247884, "learning_rate": 1.7142857142857142e-05, "loss": 1.5211, "step": 90 }, { "epoch": 0.2722063037249284, "grad_norm": 0.3334520851844191, "learning_rate": 1.8095238095238097e-05, "loss": 1.4211, "step": 95 }, { "epoch": 0.28653295128939826, "grad_norm": 0.3374208227712567, "learning_rate": 1.904761904761905e-05, "loss": 1.4105, "step": 100 }, { "epoch": 0.3008595988538682, "grad_norm": 0.3494984015612909, "learning_rate": 2e-05, "loss": 1.3641, "step": 105 }, { "epoch": 0.3151862464183381, "grad_norm": 0.3291034349461991, "learning_rate": 1.999860973403976e-05, "loss": 1.4148, "step": 110 }, { "epoch": 0.32951289398280803, "grad_norm": 0.3435680154282283, "learning_rate": 1.999443932272694e-05, "loss": 1.4477, "step": 115 }, { "epoch": 0.3438395415472779, "grad_norm": 0.3218335176321063, "learning_rate": 1.99874899256577e-05, "loss": 1.3348, "step": 120 }, { "epoch": 0.35816618911174786, "grad_norm": 0.3169060110914379, "learning_rate": 1.997776347513409e-05, "loss": 1.3887, "step": 125 }, { "epoch": 0.37249283667621774, "grad_norm": 0.31497640525358284, "learning_rate": 1.9965262675626726e-05, "loss": 1.348, "step": 130 }, { "epoch": 0.3868194842406877, "grad_norm": 0.31512378020322396, "learning_rate": 1.994999100302281e-05, "loss": 1.3641, "step": 135 }, { "epoch": 0.40114613180515757, "grad_norm": 0.33739494340791354, "learning_rate": 1.9931952703659655e-05, "loss": 1.3059, "step": 140 }, { "epoch": 0.4154727793696275, "grad_norm": 0.31137992113387536, "learning_rate": 1.991115279314398e-05, "loss": 1.3754, "step": 145 }, { "epoch": 0.4297994269340974, "grad_norm": 0.36579857260459275, "learning_rate": 1.9887597054957304e-05, "loss": 1.3375, "step": 150 }, { "epoch": 0.44412607449856734, "grad_norm": 0.33522933646678277, "learning_rate": 1.9861292038847818e-05, "loss": 1.3645, "step": 155 }, { "epoch": 0.4584527220630373, "grad_norm": 0.32743853323105904, "learning_rate": 1.983224505900921e-05, "loss": 1.3012, "step": 160 }, { "epoch": 0.47277936962750716, "grad_norm": 0.3294434325423303, "learning_rate": 1.9800464192046956e-05, "loss": 1.368, "step": 165 }, { "epoch": 0.4871060171919771, "grad_norm": 0.3258072749981886, "learning_rate": 1.976595827473255e-05, "loss": 1.3148, "step": 170 }, { "epoch": 0.501432664756447, "grad_norm": 0.3864088044514439, "learning_rate": 1.9728736901546454e-05, "loss": 1.3098, "step": 175 }, { "epoch": 0.5157593123209169, "grad_norm": 0.3364423513855298, "learning_rate": 1.968881042201029e-05, "loss": 1.3059, "step": 180 }, { "epoch": 0.5300859598853869, "grad_norm": 0.34214151520580405, "learning_rate": 1.9646189937809145e-05, "loss": 1.3352, "step": 185 }, { "epoch": 0.5444126074498568, "grad_norm": 0.3053274945471879, "learning_rate": 1.9600887299704694e-05, "loss": 1.3387, "step": 190 }, { "epoch": 0.5587392550143266, "grad_norm": 0.3041769711953325, "learning_rate": 1.9552915104240067e-05, "loss": 1.3188, "step": 195 }, { "epoch": 0.5730659025787965, "grad_norm": 0.31747827349059277, "learning_rate": 1.950228669023735e-05, "loss": 1.343, "step": 200 }, { "epoch": 0.5873925501432665, "grad_norm": 0.30247749181312306, "learning_rate": 1.9449016135088657e-05, "loss": 1.3676, "step": 205 }, { "epoch": 0.6017191977077364, "grad_norm": 0.31090923349754856, "learning_rate": 1.9393118250841897e-05, "loss": 1.3371, "step": 210 }, { "epoch": 0.6160458452722063, "grad_norm": 0.3058957610153376, "learning_rate": 1.9334608580082204e-05, "loss": 1.3062, "step": 215 }, { "epoch": 0.6303724928366762, "grad_norm": 0.3201938803930117, "learning_rate": 1.9273503391610307e-05, "loss": 1.309, "step": 220 }, { "epoch": 0.6446991404011462, "grad_norm": 0.30643913797653993, "learning_rate": 1.920981967591891e-05, "loss": 1.3035, "step": 225 }, { "epoch": 0.6590257879656161, "grad_norm": 0.3091049342115299, "learning_rate": 1.914357514046844e-05, "loss": 1.3672, "step": 230 }, { "epoch": 0.673352435530086, "grad_norm": 0.3211019921093764, "learning_rate": 1.9074788204763438e-05, "loss": 1.3309, "step": 235 }, { "epoch": 0.6876790830945558, "grad_norm": 0.3142136640616353, "learning_rate": 1.9003477995230942e-05, "loss": 1.3301, "step": 240 }, { "epoch": 0.7020057306590258, "grad_norm": 0.2969129447372341, "learning_rate": 1.8929664339902342e-05, "loss": 1.2844, "step": 245 }, { "epoch": 0.7163323782234957, "grad_norm": 0.32166405234870055, "learning_rate": 1.8853367762900117e-05, "loss": 1.2605, "step": 250 }, { "epoch": 0.7306590257879656, "grad_norm": 0.3214367335695634, "learning_rate": 1.8774609478731048e-05, "loss": 1.2793, "step": 255 }, { "epoch": 0.7449856733524355, "grad_norm": 0.31356330763968654, "learning_rate": 1.8693411386387445e-05, "loss": 1.3105, "step": 260 }, { "epoch": 0.7593123209169055, "grad_norm": 0.28277004921839216, "learning_rate": 1.8609796063258076e-05, "loss": 1.3352, "step": 265 }, { "epoch": 0.7736389684813754, "grad_norm": 0.29899645005685277, "learning_rate": 1.8523786758850436e-05, "loss": 1.2777, "step": 270 }, { "epoch": 0.7879656160458453, "grad_norm": 0.2904988184310563, "learning_rate": 1.8435407388326167e-05, "loss": 1.2992, "step": 275 }, { "epoch": 0.8022922636103151, "grad_norm": 0.3003438883173033, "learning_rate": 1.834468252585135e-05, "loss": 1.3004, "step": 280 }, { "epoch": 0.8166189111747851, "grad_norm": 0.3125023034962173, "learning_rate": 1.8251637397763597e-05, "loss": 1.227, "step": 285 }, { "epoch": 0.830945558739255, "grad_norm": 0.30438015798153173, "learning_rate": 1.8156297875557777e-05, "loss": 1.259, "step": 290 }, { "epoch": 0.8452722063037249, "grad_norm": 0.30190146232606113, "learning_rate": 1.8058690468692366e-05, "loss": 1.2824, "step": 295 }, { "epoch": 0.8595988538681948, "grad_norm": 0.3023797380550246, "learning_rate": 1.7958842317218413e-05, "loss": 1.277, "step": 300 }, { "epoch": 0.8739255014326648, "grad_norm": 0.3102084836884206, "learning_rate": 1.7856781184233152e-05, "loss": 1.1988, "step": 305 }, { "epoch": 0.8882521489971347, "grad_norm": 0.2970584447964946, "learning_rate": 1.7752535448160395e-05, "loss": 1.2727, "step": 310 }, { "epoch": 0.9025787965616046, "grad_norm": 0.30487145100844953, "learning_rate": 1.7646134094859816e-05, "loss": 1.2566, "step": 315 }, { "epoch": 0.9169054441260746, "grad_norm": 0.3048375178754921, "learning_rate": 1.7537606709567336e-05, "loss": 1.2457, "step": 320 }, { "epoch": 0.9312320916905444, "grad_norm": 0.2948102666221556, "learning_rate": 1.742698346866886e-05, "loss": 1.2965, "step": 325 }, { "epoch": 0.9455587392550143, "grad_norm": 0.31264839407509326, "learning_rate": 1.731429513130964e-05, "loss": 1.2801, "step": 330 }, { "epoch": 0.9598853868194842, "grad_norm": 0.29735923550950955, "learning_rate": 1.7199573030841577e-05, "loss": 1.2605, "step": 335 }, { "epoch": 0.9742120343839542, "grad_norm": 0.3039565840031905, "learning_rate": 1.708284906611091e-05, "loss": 1.234, "step": 340 }, { "epoch": 0.9885386819484241, "grad_norm": 0.3073836805156935, "learning_rate": 1.696415569258862e-05, "loss": 1.259, "step": 345 }, { "epoch": 1.002865329512894, "grad_norm": 0.31664041219314865, "learning_rate": 1.6843525913346087e-05, "loss": 1.2664, "step": 350 }, { "epoch": 1.0171919770773639, "grad_norm": 0.29695347553484913, "learning_rate": 1.6720993269878486e-05, "loss": 1.217, "step": 355 }, { "epoch": 1.0315186246418337, "grad_norm": 0.300646007452569, "learning_rate": 1.659659183277847e-05, "loss": 1.2168, "step": 360 }, { "epoch": 1.0458452722063036, "grad_norm": 0.3055551516874735, "learning_rate": 1.647035619226271e-05, "loss": 1.1906, "step": 365 }, { "epoch": 1.0601719197707737, "grad_norm": 0.30061799520952165, "learning_rate": 1.634232144855401e-05, "loss": 1.2289, "step": 370 }, { "epoch": 1.0744985673352436, "grad_norm": 0.29060382670998225, "learning_rate": 1.6212523202121547e-05, "loss": 1.2109, "step": 375 }, { "epoch": 1.0888252148997135, "grad_norm": 0.30881741047203054, "learning_rate": 1.6080997543782063e-05, "loss": 1.2297, "step": 380 }, { "epoch": 1.1031518624641834, "grad_norm": 0.29077207858842863, "learning_rate": 1.5947781044664696e-05, "loss": 1.2512, "step": 385 }, { "epoch": 1.1174785100286533, "grad_norm": 0.2984009106567584, "learning_rate": 1.581291074604226e-05, "loss": 1.1762, "step": 390 }, { "epoch": 1.1318051575931232, "grad_norm": 0.28683159244265233, "learning_rate": 1.5676424149031798e-05, "loss": 1.1719, "step": 395 }, { "epoch": 1.146131805157593, "grad_norm": 0.3016666749794573, "learning_rate": 1.5538359204167285e-05, "loss": 1.2754, "step": 400 }, { "epoch": 1.1604584527220632, "grad_norm": 0.2861211530074967, "learning_rate": 1.5398754300847346e-05, "loss": 1.2566, "step": 405 }, { "epoch": 1.174785100286533, "grad_norm": 0.3013734628684354, "learning_rate": 1.525764825666097e-05, "loss": 1.1691, "step": 410 }, { "epoch": 1.189111747851003, "grad_norm": 0.29971634545543485, "learning_rate": 1.5115080306594172e-05, "loss": 1.1811, "step": 415 }, { "epoch": 1.2034383954154728, "grad_norm": 0.31994782495783924, "learning_rate": 1.4971090092120544e-05, "loss": 1.2414, "step": 420 }, { "epoch": 1.2177650429799427, "grad_norm": 0.29658597163323713, "learning_rate": 1.4825717650178846e-05, "loss": 1.1973, "step": 425 }, { "epoch": 1.2320916905444126, "grad_norm": 0.30035988308067263, "learning_rate": 1.4679003402040593e-05, "loss": 1.2164, "step": 430 }, { "epoch": 1.2464183381088825, "grad_norm": 0.30992527789489627, "learning_rate": 1.4530988142070802e-05, "loss": 1.1625, "step": 435 }, { "epoch": 1.2607449856733524, "grad_norm": 0.30039271345012736, "learning_rate": 1.438171302638498e-05, "loss": 1.2523, "step": 440 }, { "epoch": 1.2750716332378222, "grad_norm": 0.301258281954922, "learning_rate": 1.4231219561405533e-05, "loss": 1.2164, "step": 445 }, { "epoch": 1.2893982808022924, "grad_norm": 0.30070972022384745, "learning_rate": 1.4079549592320782e-05, "loss": 1.2371, "step": 450 }, { "epoch": 1.3037249283667622, "grad_norm": 0.2937945088213742, "learning_rate": 1.3926745291449773e-05, "loss": 1.2227, "step": 455 }, { "epoch": 1.3180515759312321, "grad_norm": 0.2927965843522735, "learning_rate": 1.3772849146516114e-05, "loss": 1.2098, "step": 460 }, { "epoch": 1.332378223495702, "grad_norm": 0.29991379304860427, "learning_rate": 1.3617903948834155e-05, "loss": 1.1414, "step": 465 }, { "epoch": 1.346704871060172, "grad_norm": 0.2895474514716426, "learning_rate": 1.34619527814107e-05, "loss": 1.2188, "step": 470 }, { "epoch": 1.3610315186246418, "grad_norm": 0.30135691213637283, "learning_rate": 1.3305039006965657e-05, "loss": 1.2746, "step": 475 }, { "epoch": 1.3753581661891117, "grad_norm": 0.2808237365976051, "learning_rate": 1.3147206255874886e-05, "loss": 1.1936, "step": 480 }, { "epoch": 1.3896848137535818, "grad_norm": 0.29073762199706943, "learning_rate": 1.2988498414038635e-05, "loss": 1.1734, "step": 485 }, { "epoch": 1.4040114613180517, "grad_norm": 0.3166900967170064, "learning_rate": 1.282895961067893e-05, "loss": 1.1973, "step": 490 }, { "epoch": 1.4183381088825215, "grad_norm": 0.3011306233980793, "learning_rate": 1.2668634206069305e-05, "loss": 1.2238, "step": 495 }, { "epoch": 1.4326647564469914, "grad_norm": 0.2901803615383375, "learning_rate": 1.2507566779200273e-05, "loss": 1.2496, "step": 500 }, { "epoch": 1.4469914040114613, "grad_norm": 0.29696637870766024, "learning_rate": 1.2345802115384014e-05, "loss": 1.1768, "step": 505 }, { "epoch": 1.4613180515759312, "grad_norm": 0.2863270222332892, "learning_rate": 1.2183385193801655e-05, "loss": 1.2156, "step": 510 }, { "epoch": 1.475644699140401, "grad_norm": 0.3025458017583608, "learning_rate": 1.2020361174996694e-05, "loss": 1.173, "step": 515 }, { "epoch": 1.4899713467048712, "grad_norm": 0.287124352936146, "learning_rate": 1.1856775388317936e-05, "loss": 1.1773, "step": 520 }, { "epoch": 1.5042979942693409, "grad_norm": 0.30845106692202556, "learning_rate": 1.1692673319315541e-05, "loss": 1.2316, "step": 525 }, { "epoch": 1.518624641833811, "grad_norm": 0.2934898590653195, "learning_rate": 1.1528100597093617e-05, "loss": 1.1652, "step": 530 }, { "epoch": 1.5329512893982808, "grad_norm": 0.5593622869918241, "learning_rate": 1.13631029816229e-05, "loss": 1.2328, "step": 535 }, { "epoch": 1.5472779369627507, "grad_norm": 0.29465220623927774, "learning_rate": 1.1197726351017052e-05, "loss": 1.1785, "step": 540 }, { "epoch": 1.5616045845272206, "grad_norm": 0.2914162673673657, "learning_rate": 1.1032016688776106e-05, "loss": 1.2613, "step": 545 }, { "epoch": 1.5759312320916905, "grad_norm": 0.2990525095465911, "learning_rate": 1.0866020071000597e-05, "loss": 1.2006, "step": 550 }, { "epoch": 1.5902578796561606, "grad_norm": 0.2855200603052628, "learning_rate": 1.0699782653579973e-05, "loss": 1.2094, "step": 555 }, { "epoch": 1.6045845272206303, "grad_norm": 0.2909907948504594, "learning_rate": 1.0533350659358779e-05, "loss": 1.2035, "step": 560 }, { "epoch": 1.6189111747851004, "grad_norm": 0.289103776445201, "learning_rate": 1.0366770365284271e-05, "loss": 1.1848, "step": 565 }, { "epoch": 1.63323782234957, "grad_norm": 0.2825139343735471, "learning_rate": 1.0200088089538944e-05, "loss": 1.2031, "step": 570 }, { "epoch": 1.6475644699140402, "grad_norm": 0.27750397580689223, "learning_rate": 1.0033350178661633e-05, "loss": 1.1998, "step": 575 }, { "epoch": 1.66189111747851, "grad_norm": 0.2845108454176054, "learning_rate": 9.866602994660688e-06, "loss": 1.1523, "step": 580 }, { "epoch": 1.67621776504298, "grad_norm": 0.2903678786080609, "learning_rate": 9.699892902122887e-06, "loss": 1.1922, "step": 585 }, { "epoch": 1.6905444126074498, "grad_norm": 0.2935036283873705, "learning_rate": 9.53326625532161e-06, "loss": 1.2277, "step": 590 }, { "epoch": 1.7048710601719197, "grad_norm": 0.29136738669186435, "learning_rate": 9.366769385327875e-06, "loss": 1.1641, "step": 595 }, { "epoch": 1.7191977077363898, "grad_norm": 0.29992898897918896, "learning_rate": 9.200448587127852e-06, "loss": 1.1887, "step": 600 }, { "epoch": 1.7335243553008595, "grad_norm": 0.2980387182263642, "learning_rate": 9.034350106750383e-06, "loss": 1.2117, "step": 605 }, { "epoch": 1.7478510028653296, "grad_norm": 0.2973187539429791, "learning_rate": 8.868520128408134e-06, "loss": 1.2273, "step": 610 }, { "epoch": 1.7621776504297995, "grad_norm": 0.2939664139251852, "learning_rate": 8.703004761655918e-06, "loss": 1.2121, "step": 615 }, { "epoch": 1.7765042979942693, "grad_norm": 0.2948724111626734, "learning_rate": 8.537850028569796e-06, "loss": 1.1727, "step": 620 }, { "epoch": 1.7908309455587392, "grad_norm": 0.2900310921230476, "learning_rate": 8.37310185095048e-06, "loss": 1.1705, "step": 625 }, { "epoch": 1.8051575931232091, "grad_norm": 0.2878990205353967, "learning_rate": 8.208806037554645e-06, "loss": 1.1781, "step": 630 }, { "epoch": 1.8194842406876792, "grad_norm": 0.29396661211641484, "learning_rate": 8.045008271357644e-06, "loss": 1.2625, "step": 635 }, { "epoch": 1.8338108882521489, "grad_norm": 0.2853159419583723, "learning_rate": 7.88175409685122e-06, "loss": 1.1562, "step": 640 }, { "epoch": 1.848137535816619, "grad_norm": 0.2924378192139063, "learning_rate": 7.719088907379705e-06, "loss": 1.2141, "step": 645 }, { "epoch": 1.8624641833810889, "grad_norm": 0.2835974358624726, "learning_rate": 7.557057932518274e-06, "loss": 1.1344, "step": 650 }, { "epoch": 1.8767908309455588, "grad_norm": 0.2852512548851884, "learning_rate": 7.39570622549669e-06, "loss": 1.2395, "step": 655 }, { "epoch": 1.8911174785100286, "grad_norm": 0.29264683297829125, "learning_rate": 7.235078650672141e-06, "loss": 1.1797, "step": 660 }, { "epoch": 1.9054441260744985, "grad_norm": 0.2898654877917546, "learning_rate": 7.075219871054528e-06, "loss": 1.2227, "step": 665 }, { "epoch": 1.9197707736389686, "grad_norm": 0.29421755996555327, "learning_rate": 6.91617433588781e-06, "loss": 1.1711, "step": 670 }, { "epoch": 1.9340974212034383, "grad_norm": 0.2800223774944341, "learning_rate": 6.757986268290713e-06, "loss": 1.2025, "step": 675 }, { "epoch": 1.9484240687679084, "grad_norm": 0.28987949528203105, "learning_rate": 6.600699652960383e-06, "loss": 1.1891, "step": 680 }, { "epoch": 1.962750716332378, "grad_norm": 0.2845787595193169, "learning_rate": 6.4443582239422744e-06, "loss": 1.1602, "step": 685 }, { "epoch": 1.9770773638968482, "grad_norm": 0.27697906700119446, "learning_rate": 6.289005452469778e-06, "loss": 1.2195, "step": 690 }, { "epoch": 1.991404011461318, "grad_norm": 0.2864230953591413, "learning_rate": 6.134684534876892e-06, "loss": 1.1859, "step": 695 }, { "epoch": 2.005730659025788, "grad_norm": 0.3004607532045262, "learning_rate": 5.981438380587355e-06, "loss": 1.2074, "step": 700 }, { "epoch": 2.020057306590258, "grad_norm": 0.29024736883757196, "learning_rate": 5.829309600183536e-06, "loss": 1.1371, "step": 705 }, { "epoch": 2.0343839541547277, "grad_norm": 0.285433189631626, "learning_rate": 5.678340493558427e-06, "loss": 1.2063, "step": 710 }, { "epoch": 2.048710601719198, "grad_norm": 0.2957416068435454, "learning_rate": 5.528573038154028e-06, "loss": 1.1945, "step": 715 }, { "epoch": 2.0630372492836675, "grad_norm": 0.2948232121670883, "learning_rate": 5.380048877289381e-06, "loss": 1.1439, "step": 720 }, { "epoch": 2.0773638968481376, "grad_norm": 0.2919439807242836, "learning_rate": 5.232809308581504e-06, "loss": 1.1496, "step": 725 }, { "epoch": 2.0916905444126073, "grad_norm": 0.2867698512059567, "learning_rate": 5.086895272462475e-06, "loss": 1.1186, "step": 730 }, { "epoch": 2.1060171919770774, "grad_norm": 0.2917668731887502, "learning_rate": 4.942347340795803e-06, "loss": 1.125, "step": 735 }, { "epoch": 2.1203438395415475, "grad_norm": 0.2870682782334859, "learning_rate": 4.799205705595294e-06, "loss": 1.0992, "step": 740 }, { "epoch": 2.134670487106017, "grad_norm": 0.2869641039546839, "learning_rate": 4.657510167849525e-06, "loss": 1.141, "step": 745 }, { "epoch": 2.1489971346704873, "grad_norm": 0.28154741238148256, "learning_rate": 4.5173001264550665e-06, "loss": 1.0984, "step": 750 }, { "epoch": 2.163323782234957, "grad_norm": 0.29022223670208136, "learning_rate": 4.378614567261487e-06, "loss": 1.1313, "step": 755 }, { "epoch": 2.177650429799427, "grad_norm": 0.28112870621050123, "learning_rate": 4.241492052231213e-06, "loss": 1.1865, "step": 760 }, { "epoch": 2.1919770773638967, "grad_norm": 0.27708366870966095, "learning_rate": 4.105970708717244e-06, "loss": 1.1467, "step": 765 }, { "epoch": 2.206303724928367, "grad_norm": 0.28678919627949717, "learning_rate": 3.972088218861738e-06, "loss": 1.1592, "step": 770 }, { "epoch": 2.2206303724928365, "grad_norm": 0.2813821171114588, "learning_rate": 3.83988180911836e-06, "loss": 1.1549, "step": 775 }, { "epoch": 2.2349570200573066, "grad_norm": 0.29052831071379115, "learning_rate": 3.7093882399013504e-06, "loss": 1.1742, "step": 780 }, { "epoch": 2.2492836676217767, "grad_norm": 0.28664925785362466, "learning_rate": 3.580643795364166e-06, "loss": 1.1883, "step": 785 }, { "epoch": 2.2636103151862463, "grad_norm": 0.28360378572017647, "learning_rate": 3.4536842733105702e-06, "loss": 1.1783, "step": 790 }, { "epoch": 2.2779369627507164, "grad_norm": 0.2784641405735339, "learning_rate": 3.3285449752409315e-06, "loss": 1.1119, "step": 795 }, { "epoch": 2.292263610315186, "grad_norm": 0.28470450318470536, "learning_rate": 3.205260696536534e-06, "loss": 1.15, "step": 800 }, { "epoch": 2.306590257879656, "grad_norm": 0.27908057259095853, "learning_rate": 3.083865716784592e-06, "loss": 1.1469, "step": 805 }, { "epoch": 2.3209169054441263, "grad_norm": 0.28684050381253245, "learning_rate": 2.964393790246728e-06, "loss": 1.0791, "step": 810 }, { "epoch": 2.335243553008596, "grad_norm": 0.2833942155479012, "learning_rate": 2.846878136473472e-06, "loss": 1.1854, "step": 815 }, { "epoch": 2.349570200573066, "grad_norm": 0.2813005058217904, "learning_rate": 2.7313514310674826e-06, "loss": 1.1102, "step": 820 }, { "epoch": 2.3638968481375358, "grad_norm": 0.27630773934160313, "learning_rate": 2.6178457965979543e-06, "loss": 1.1111, "step": 825 }, { "epoch": 2.378223495702006, "grad_norm": 0.28322453198420466, "learning_rate": 2.506392793668869e-06, "loss": 1.1262, "step": 830 }, { "epoch": 2.3925501432664755, "grad_norm": 0.27434180065283353, "learning_rate": 2.3970234121434555e-06, "loss": 1.158, "step": 835 }, { "epoch": 2.4068767908309456, "grad_norm": 0.28158095452878135, "learning_rate": 2.2897680625273623e-06, "loss": 1.1152, "step": 840 }, { "epoch": 2.4212034383954153, "grad_norm": 0.29278876714519303, "learning_rate": 2.1846565675129074e-06, "loss": 1.1395, "step": 845 }, { "epoch": 2.4355300859598854, "grad_norm": 0.27557568657805176, "learning_rate": 2.0817181536868035e-06, "loss": 1.1469, "step": 850 }, { "epoch": 2.4498567335243555, "grad_norm": 0.2754040319685035, "learning_rate": 1.9809814434036e-06, "loss": 1.1393, "step": 855 }, { "epoch": 2.464183381088825, "grad_norm": 0.2810022564605419, "learning_rate": 1.8824744468271506e-06, "loss": 1.1043, "step": 860 }, { "epoch": 2.4785100286532953, "grad_norm": 0.27969700189605484, "learning_rate": 1.786224554142285e-06, "loss": 1.116, "step": 865 }, { "epoch": 2.492836676217765, "grad_norm": 0.27897638050493495, "learning_rate": 1.6922585279389037e-06, "loss": 1.1367, "step": 870 }, { "epoch": 2.507163323782235, "grad_norm": 0.2815481626728245, "learning_rate": 1.6006024957705357e-06, "loss": 1.1365, "step": 875 }, { "epoch": 2.5214899713467047, "grad_norm": 0.2769674435573708, "learning_rate": 1.5112819428894976e-06, "loss": 1.1832, "step": 880 }, { "epoch": 2.535816618911175, "grad_norm": 0.28423912540069224, "learning_rate": 1.4243217051606285e-06, "loss": 1.2, "step": 885 }, { "epoch": 2.5501432664756445, "grad_norm": 0.28433511602868367, "learning_rate": 1.339745962155613e-06, "loss": 1.1258, "step": 890 }, { "epoch": 2.5644699140401146, "grad_norm": 0.27492757921736494, "learning_rate": 1.2575782304297647e-06, "loss": 1.1631, "step": 895 }, { "epoch": 2.5787965616045847, "grad_norm": 0.28357440374503834, "learning_rate": 1.1778413569831726e-06, "loss": 1.1508, "step": 900 }, { "epoch": 2.5931232091690544, "grad_norm": 0.28309913251371766, "learning_rate": 1.1005575129080203e-06, "loss": 1.1596, "step": 905 }, { "epoch": 2.6074498567335245, "grad_norm": 0.2830020221949328, "learning_rate": 1.0257481872238483e-06, "loss": 1.1809, "step": 910 }, { "epoch": 2.621776504297994, "grad_norm": 0.2738198006004891, "learning_rate": 9.534341809024583e-07, "loss": 1.0836, "step": 915 }, { "epoch": 2.6361031518624642, "grad_norm": 0.27780926764956043, "learning_rate": 8.836356010841385e-07, "loss": 1.1633, "step": 920 }, { "epoch": 2.6504297994269344, "grad_norm": 0.28167674862359654, "learning_rate": 8.16371855486805e-07, "loss": 1.152, "step": 925 }, { "epoch": 2.664756446991404, "grad_norm": 0.28736011956752383, "learning_rate": 7.516616470096317e-07, "loss": 1.1127, "step": 930 }, { "epoch": 2.6790830945558737, "grad_norm": 0.27167353567975117, "learning_rate": 6.895229685326443e-07, "loss": 1.1574, "step": 935 }, { "epoch": 2.693409742120344, "grad_norm": 0.29082932902589526, "learning_rate": 6.299730979137419e-07, "loss": 1.1426, "step": 940 }, { "epoch": 2.707736389684814, "grad_norm": 0.27278739042779193, "learning_rate": 5.730285931845381e-07, "loss": 1.1113, "step": 945 }, { "epoch": 2.7220630372492836, "grad_norm": 0.2695715592534727, "learning_rate": 5.187052879463394e-07, "loss": 1.1182, "step": 950 }, { "epoch": 2.7363896848137537, "grad_norm": 0.2767698424495714, "learning_rate": 4.6701828696757213e-07, "loss": 1.1264, "step": 955 }, { "epoch": 2.7507163323782233, "grad_norm": 0.2862004769810508, "learning_rate": 4.1798196198384545e-07, "loss": 1.1766, "step": 960 }, { "epoch": 2.7650429799426934, "grad_norm": 0.27956749462161845, "learning_rate": 3.716099477018475e-07, "loss": 1.1463, "step": 965 }, { "epoch": 2.7793696275071635, "grad_norm": 0.28226367416458953, "learning_rate": 3.279151380081691e-07, "loss": 1.1898, "step": 970 }, { "epoch": 2.793696275071633, "grad_norm": 0.2759717986692799, "learning_rate": 2.8690968238412444e-07, "loss": 1.1193, "step": 975 }, { "epoch": 2.8080229226361033, "grad_norm": 0.2913913242847074, "learning_rate": 2.4860498252753827e-07, "loss": 1.2113, "step": 980 }, { "epoch": 2.822349570200573, "grad_norm": 0.28827782734894336, "learning_rate": 2.130116891824796e-07, "loss": 1.1344, "step": 985 }, { "epoch": 2.836676217765043, "grad_norm": 0.2726866974754793, "learning_rate": 1.8013969917777484e-07, "loss": 1.1385, "step": 990 }, { "epoch": 2.8510028653295127, "grad_norm": 0.2818762723051636, "learning_rate": 1.4999815267517593e-07, "loss": 1.1732, "step": 995 }, { "epoch": 2.865329512893983, "grad_norm": 0.2806296409646362, "learning_rate": 1.225954306279009e-07, "loss": 1.1609, "step": 1000 }, { "epoch": 2.8796561604584525, "grad_norm": 0.27946799660404814, "learning_rate": 9.793915245028595e-08, "loss": 1.1875, "step": 1005 }, { "epoch": 2.8939828080229226, "grad_norm": 0.2854513869083756, "learning_rate": 7.603617389918106e-08, "loss": 1.1342, "step": 1010 }, { "epoch": 2.9083094555873927, "grad_norm": 0.2764976233678434, "learning_rate": 5.689258516768825e-08, "loss": 1.1789, "step": 1015 }, { "epoch": 2.9226361031518624, "grad_norm": 0.2843787086373923, "learning_rate": 4.05137091917629e-08, "loss": 1.177, "step": 1020 }, { "epoch": 2.9369627507163325, "grad_norm": 0.2793191895679787, "learning_rate": 2.6904100170150883e-08, "loss": 1.1459, "step": 1025 }, { "epoch": 2.951289398280802, "grad_norm": 0.2820836844171885, "learning_rate": 1.6067542298083826e-08, "loss": 1.2051, "step": 1030 }, { "epoch": 2.9656160458452723, "grad_norm": 0.280116788571808, "learning_rate": 8.007048715068522e-09, "loss": 1.1293, "step": 1035 }, { "epoch": 2.9799426934097424, "grad_norm": 0.2760942837882742, "learning_rate": 2.7248606670760012e-09, "loss": 1.1348, "step": 1040 }, { "epoch": 2.994269340974212, "grad_norm": 0.2785194035391107, "learning_rate": 2.2244688335226749e-10, "loss": 1.1268, "step": 1045 }, { "epoch": 3.0, "step": 1047, "total_flos": 99070802657280.0, "train_loss": 1.4185899444842407, "train_runtime": 3548.2371, "train_samples_per_second": 18.883, "train_steps_per_second": 0.295 } ], "logging_steps": 5, "max_steps": 1047, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 99070802657280.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }