{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9993181043300376, |
|
"eval_steps": 500, |
|
"global_step": 1466, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0013637913399249914, |
|
"grad_norm": 1.598020721574378, |
|
"learning_rate": 6.8027210884353745e-06, |
|
"loss": 1.3936, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006818956699624957, |
|
"grad_norm": 1.582179170492486, |
|
"learning_rate": 3.4013605442176877e-05, |
|
"loss": 1.3732, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.013637913399249914, |
|
"grad_norm": 0.5247209097240487, |
|
"learning_rate": 6.802721088435375e-05, |
|
"loss": 1.3443, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.020456870098874872, |
|
"grad_norm": 0.8017522199019398, |
|
"learning_rate": 0.00010204081632653062, |
|
"loss": 1.2617, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02727582679849983, |
|
"grad_norm": 0.4014040737926046, |
|
"learning_rate": 0.0001360544217687075, |
|
"loss": 1.1857, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03409478349812479, |
|
"grad_norm": 0.32790437534680206, |
|
"learning_rate": 0.00017006802721088434, |
|
"loss": 1.1436, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.040913740197749744, |
|
"grad_norm": 0.2966698205141879, |
|
"learning_rate": 0.00020408163265306123, |
|
"loss": 1.0996, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0477326968973747, |
|
"grad_norm": 0.17808447662676022, |
|
"learning_rate": 0.0002380952380952381, |
|
"loss": 1.0757, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.05455165359699966, |
|
"grad_norm": 0.14553030270593192, |
|
"learning_rate": 0.000272108843537415, |
|
"loss": 1.07, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06137061029662462, |
|
"grad_norm": 0.12365194167763814, |
|
"learning_rate": 0.0003061224489795919, |
|
"loss": 1.0494, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06818956699624958, |
|
"grad_norm": 0.19450168087415454, |
|
"learning_rate": 0.0003401360544217687, |
|
"loss": 1.0515, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07500852369587453, |
|
"grad_norm": 0.1207148840427689, |
|
"learning_rate": 0.0003741496598639456, |
|
"loss": 1.0253, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.08182748039549949, |
|
"grad_norm": 0.10854420757501072, |
|
"learning_rate": 0.00040816326530612246, |
|
"loss": 1.0353, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08864643709512444, |
|
"grad_norm": 0.16561709445658757, |
|
"learning_rate": 0.0004421768707482993, |
|
"loss": 1.0285, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0954653937947494, |
|
"grad_norm": 0.524524123408497, |
|
"learning_rate": 0.0004761904761904762, |
|
"loss": 1.0379, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.10228435049437436, |
|
"grad_norm": 0.12853108179786138, |
|
"learning_rate": 0.0005102040816326531, |
|
"loss": 1.0186, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10910330719399931, |
|
"grad_norm": 0.2734451972437396, |
|
"learning_rate": 0.00054421768707483, |
|
"loss": 1.0063, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.11592226389362427, |
|
"grad_norm": 0.14711225612929515, |
|
"learning_rate": 0.0005782312925170068, |
|
"loss": 1.0132, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.12274122059324924, |
|
"grad_norm": 0.11857109812836017, |
|
"learning_rate": 0.0006122448979591838, |
|
"loss": 1.0131, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1295601772928742, |
|
"grad_norm": 0.16843211920179874, |
|
"learning_rate": 0.0006462585034013606, |
|
"loss": 0.9969, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.13637913399249915, |
|
"grad_norm": 0.15975057567245052, |
|
"learning_rate": 0.0006802721088435374, |
|
"loss": 0.9909, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1431980906921241, |
|
"grad_norm": 0.15632985298714983, |
|
"learning_rate": 0.0007142857142857143, |
|
"loss": 0.9966, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.15001704739174906, |
|
"grad_norm": 0.1621684383980749, |
|
"learning_rate": 0.0007482993197278912, |
|
"loss": 0.9915, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.15683600409137402, |
|
"grad_norm": 0.12303095362829028, |
|
"learning_rate": 0.000782312925170068, |
|
"loss": 0.9787, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.16365496079099898, |
|
"grad_norm": 0.14599562372712946, |
|
"learning_rate": 0.0008163265306122449, |
|
"loss": 0.9882, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.17047391749062393, |
|
"grad_norm": 0.14693955034453152, |
|
"learning_rate": 0.0008503401360544217, |
|
"loss": 0.9801, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.1772928741902489, |
|
"grad_norm": 0.22554930118958344, |
|
"learning_rate": 0.0008843537414965987, |
|
"loss": 0.9767, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.18411183088987385, |
|
"grad_norm": 0.15209662483639966, |
|
"learning_rate": 0.0009183673469387756, |
|
"loss": 0.9827, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1909307875894988, |
|
"grad_norm": 0.1338153825791751, |
|
"learning_rate": 0.0009523809523809524, |
|
"loss": 0.9725, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.19774974428912376, |
|
"grad_norm": 0.16873273473563985, |
|
"learning_rate": 0.0009863945578231293, |
|
"loss": 0.9747, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.20456870098874871, |
|
"grad_norm": 0.18631348995548455, |
|
"learning_rate": 0.000999987235881584, |
|
"loss": 0.9737, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.21138765768837367, |
|
"grad_norm": 0.1398209387998964, |
|
"learning_rate": 0.0009999092352957284, |
|
"loss": 0.9848, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.21820661438799863, |
|
"grad_norm": 0.13947559562857936, |
|
"learning_rate": 0.0009997603363497414, |
|
"loss": 0.9643, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.22502557108762358, |
|
"grad_norm": 0.14313053783616825, |
|
"learning_rate": 0.000999540560160838, |
|
"loss": 0.974, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.23184452778724854, |
|
"grad_norm": 0.13580380953886942, |
|
"learning_rate": 0.0009992499378982194, |
|
"loss": 0.9761, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2386634844868735, |
|
"grad_norm": 0.14428497093865783, |
|
"learning_rate": 0.0009988885107786517, |
|
"loss": 0.9652, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.24548244118649848, |
|
"grad_norm": 0.12382971206501955, |
|
"learning_rate": 0.0009984563300606192, |
|
"loss": 0.9678, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.25230139788612344, |
|
"grad_norm": 0.14065780649342097, |
|
"learning_rate": 0.0009979534570370575, |
|
"loss": 0.969, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.2591203545857484, |
|
"grad_norm": 0.11357268655774008, |
|
"learning_rate": 0.000997379963026658, |
|
"loss": 0.9692, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.26593931128537335, |
|
"grad_norm": 0.1622202809625719, |
|
"learning_rate": 0.0009967359293637553, |
|
"loss": 0.9588, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.2727582679849983, |
|
"grad_norm": 0.1776637907272292, |
|
"learning_rate": 0.0009960214473867907, |
|
"loss": 0.9613, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27957722468462326, |
|
"grad_norm": 0.1215741064620054, |
|
"learning_rate": 0.0009952366184253602, |
|
"loss": 0.963, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.2863961813842482, |
|
"grad_norm": 0.1302207261176898, |
|
"learning_rate": 0.0009943815537858415, |
|
"loss": 0.9551, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2932151380838732, |
|
"grad_norm": 0.11682895510290592, |
|
"learning_rate": 0.00099345637473561, |
|
"loss": 0.9598, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.30003409478349813, |
|
"grad_norm": 0.15188852916347045, |
|
"learning_rate": 0.0009924612124858389, |
|
"loss": 0.9583, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3068530514831231, |
|
"grad_norm": 0.13240495035670563, |
|
"learning_rate": 0.0009913962081728918, |
|
"loss": 0.9538, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.31367200818274804, |
|
"grad_norm": 0.16120589037533914, |
|
"learning_rate": 0.0009902615128383062, |
|
"loss": 0.9464, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.320490964882373, |
|
"grad_norm": 0.1382061477313828, |
|
"learning_rate": 0.0009890572874073713, |
|
"loss": 0.9434, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.32730992158199795, |
|
"grad_norm": 0.11549986617558586, |
|
"learning_rate": 0.0009877837026663068, |
|
"loss": 0.9441, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3341288782816229, |
|
"grad_norm": 0.10980499757109458, |
|
"learning_rate": 0.00098644093923804, |
|
"loss": 0.96, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.34094783498124787, |
|
"grad_norm": 0.19062656435818273, |
|
"learning_rate": 0.0009850291875565908, |
|
"loss": 0.9577, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3477667916808728, |
|
"grad_norm": 0.12141161965377588, |
|
"learning_rate": 0.0009835486478400625, |
|
"loss": 0.9456, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.3545857483804978, |
|
"grad_norm": 0.11932394138168441, |
|
"learning_rate": 0.000981999530062248, |
|
"loss": 0.9546, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.36140470508012273, |
|
"grad_norm": 0.1277558444188088, |
|
"learning_rate": 0.0009803820539228492, |
|
"loss": 0.9375, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.3682236617797477, |
|
"grad_norm": 0.11354346745136285, |
|
"learning_rate": 0.0009786964488163194, |
|
"loss": 0.9473, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.37504261847937265, |
|
"grad_norm": 0.10871503403228854, |
|
"learning_rate": 0.000976942953799331, |
|
"loss": 0.9321, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3818615751789976, |
|
"grad_norm": 0.14107095714136358, |
|
"learning_rate": 0.0009751218175568688, |
|
"loss": 0.9346, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.38868053187862256, |
|
"grad_norm": 0.11468378232954199, |
|
"learning_rate": 0.0009732332983669651, |
|
"loss": 0.9333, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3954994885782475, |
|
"grad_norm": 0.11614073571579422, |
|
"learning_rate": 0.0009712776640640671, |
|
"loss": 0.933, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.40231844527787247, |
|
"grad_norm": 0.14233058701254916, |
|
"learning_rate": 0.0009692551920010519, |
|
"loss": 0.9365, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.40913740197749743, |
|
"grad_norm": 0.16365059288233255, |
|
"learning_rate": 0.0009671661690098941, |
|
"loss": 0.9345, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4159563586771224, |
|
"grad_norm": 0.2778458797011963, |
|
"learning_rate": 0.0009650108913609837, |
|
"loss": 0.9243, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.42277531537674734, |
|
"grad_norm": 0.12497969488106986, |
|
"learning_rate": 0.0009627896647211103, |
|
"loss": 0.915, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.4295942720763723, |
|
"grad_norm": 0.12132320992585822, |
|
"learning_rate": 0.0009605028041101116, |
|
"loss": 0.9306, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.43641322877599725, |
|
"grad_norm": 0.15422956629129714, |
|
"learning_rate": 0.0009581506338561974, |
|
"loss": 0.9229, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4432321854756222, |
|
"grad_norm": 0.10857379661878619, |
|
"learning_rate": 0.0009557334875499513, |
|
"loss": 0.9295, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.45005114217524717, |
|
"grad_norm": 0.09700456702304185, |
|
"learning_rate": 0.0009532517079970214, |
|
"loss": 0.9144, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4568700988748721, |
|
"grad_norm": 0.1574301623396698, |
|
"learning_rate": 0.000950705647169502, |
|
"loss": 0.9303, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.4636890555744971, |
|
"grad_norm": 0.12653500024100792, |
|
"learning_rate": 0.000948095666156016, |
|
"loss": 0.9145, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.47050801227412203, |
|
"grad_norm": 0.12348356194845742, |
|
"learning_rate": 0.0009454221351105055, |
|
"loss": 0.9115, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.477326968973747, |
|
"grad_norm": 0.1307740739484115, |
|
"learning_rate": 0.0009426854331997334, |
|
"loss": 0.9188, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.484145925673372, |
|
"grad_norm": 0.10835447562048035, |
|
"learning_rate": 0.0009398859485495119, |
|
"loss": 0.9247, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.49096488237299696, |
|
"grad_norm": 0.11863788647430744, |
|
"learning_rate": 0.0009370240781896553, |
|
"loss": 0.9102, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.4977838390726219, |
|
"grad_norm": 0.14783785955650325, |
|
"learning_rate": 0.0009341002279976728, |
|
"loss": 0.9136, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.5046027957722469, |
|
"grad_norm": 0.11185935178347504, |
|
"learning_rate": 0.0009311148126412067, |
|
"loss": 0.9108, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5114217524718718, |
|
"grad_norm": 0.11360824911697796, |
|
"learning_rate": 0.0009280682555192229, |
|
"loss": 0.9167, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5182407091714968, |
|
"grad_norm": 0.10219440421235579, |
|
"learning_rate": 0.0009249609887019624, |
|
"loss": 0.9125, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5250596658711217, |
|
"grad_norm": 0.1063437764241102, |
|
"learning_rate": 0.0009217934528696652, |
|
"loss": 0.9135, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.5318786225707467, |
|
"grad_norm": 0.12924256438154388, |
|
"learning_rate": 0.000918566097250072, |
|
"loss": 0.9044, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5386975792703717, |
|
"grad_norm": 0.1672822885194737, |
|
"learning_rate": 0.0009152793795547129, |
|
"loss": 0.905, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.5455165359699966, |
|
"grad_norm": 0.18170913356641202, |
|
"learning_rate": 0.0009119337659139939, |
|
"loss": 0.9077, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5523354926696216, |
|
"grad_norm": 0.14583299571955888, |
|
"learning_rate": 0.0009085297308110889, |
|
"loss": 0.9013, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5591544493692465, |
|
"grad_norm": 0.1328861894472585, |
|
"learning_rate": 0.0009050677570146482, |
|
"loss": 0.8972, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5659734060688715, |
|
"grad_norm": 0.12732498687417756, |
|
"learning_rate": 0.0009015483355103298, |
|
"loss": 0.8948, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5727923627684964, |
|
"grad_norm": 0.10225924635313659, |
|
"learning_rate": 0.0008979719654311677, |
|
"loss": 0.8977, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5796113194681214, |
|
"grad_norm": 0.11222734733135577, |
|
"learning_rate": 0.0008943391539867831, |
|
"loss": 0.9086, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5864302761677463, |
|
"grad_norm": 0.10007024319396716, |
|
"learning_rate": 0.0008906504163914506, |
|
"loss": 0.9047, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5932492328673713, |
|
"grad_norm": 0.11019466700605089, |
|
"learning_rate": 0.0008869062757910296, |
|
"loss": 0.8998, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6000681895669963, |
|
"grad_norm": 0.12131434093089721, |
|
"learning_rate": 0.00088310726318877, |
|
"loss": 0.9073, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6068871462666212, |
|
"grad_norm": 0.09939916214847802, |
|
"learning_rate": 0.0008792539173700046, |
|
"loss": 0.9056, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.6137061029662462, |
|
"grad_norm": 0.11240353319322192, |
|
"learning_rate": 0.0008753467848257366, |
|
"loss": 0.9023, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6205250596658711, |
|
"grad_norm": 0.12884203346821665, |
|
"learning_rate": 0.0008713864196751353, |
|
"loss": 0.891, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.6273440163654961, |
|
"grad_norm": 0.13107538778471978, |
|
"learning_rate": 0.0008673733835869496, |
|
"loss": 0.9053, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.634162973065121, |
|
"grad_norm": 0.1265128902066715, |
|
"learning_rate": 0.0008633082456998505, |
|
"loss": 0.8765, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.640981929764746, |
|
"grad_norm": 0.11168557466408092, |
|
"learning_rate": 0.0008591915825417144, |
|
"loss": 0.8937, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.647800886464371, |
|
"grad_norm": 0.10749488343940884, |
|
"learning_rate": 0.0008550239779478592, |
|
"loss": 0.8936, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6546198431639959, |
|
"grad_norm": 0.12116636988129866, |
|
"learning_rate": 0.0008508060229782422, |
|
"loss": 0.8985, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6614387998636209, |
|
"grad_norm": 0.11248409191993927, |
|
"learning_rate": 0.0008465383158336352, |
|
"loss": 0.9068, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6682577565632458, |
|
"grad_norm": 0.09616110942207395, |
|
"learning_rate": 0.0008422214617707864, |
|
"loss": 0.893, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6750767132628708, |
|
"grad_norm": 0.10944978412158525, |
|
"learning_rate": 0.000837856073016581, |
|
"loss": 0.9015, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6818956699624957, |
|
"grad_norm": 0.12373353259009305, |
|
"learning_rate": 0.0008334427686812137, |
|
"loss": 0.8805, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6887146266621207, |
|
"grad_norm": 0.10998182720982563, |
|
"learning_rate": 0.000828982174670385, |
|
"loss": 0.8764, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.6955335833617456, |
|
"grad_norm": 0.11981879919775634, |
|
"learning_rate": 0.0008244749235965338, |
|
"loss": 0.8972, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7023525400613706, |
|
"grad_norm": 0.11009515701872061, |
|
"learning_rate": 0.000819921654689119, |
|
"loss": 0.883, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.7091714967609956, |
|
"grad_norm": 0.1206592530642094, |
|
"learning_rate": 0.0008153230137039615, |
|
"loss": 0.8887, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7159904534606205, |
|
"grad_norm": 0.10679229804484212, |
|
"learning_rate": 0.0008106796528316626, |
|
"loss": 0.8894, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7228094101602455, |
|
"grad_norm": 0.11030470444996518, |
|
"learning_rate": 0.000805992230605108, |
|
"loss": 0.8881, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.7296283668598704, |
|
"grad_norm": 0.14989020257644473, |
|
"learning_rate": 0.0008012614118060733, |
|
"loss": 0.8758, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.7364473235594954, |
|
"grad_norm": 0.19608291727965096, |
|
"learning_rate": 0.0007964878673709432, |
|
"loss": 0.873, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.7432662802591203, |
|
"grad_norm": 0.1357305661381077, |
|
"learning_rate": 0.0007916722742955573, |
|
"loss": 0.878, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.7500852369587453, |
|
"grad_norm": 0.09279906563803725, |
|
"learning_rate": 0.0007868153155391968, |
|
"loss": 0.8844, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7569041936583703, |
|
"grad_norm": 0.1003975725083102, |
|
"learning_rate": 0.0007819176799277262, |
|
"loss": 0.8875, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.7637231503579952, |
|
"grad_norm": 0.09978871199012411, |
|
"learning_rate": 0.0007769800620559015, |
|
"loss": 0.8866, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.7705421070576202, |
|
"grad_norm": 0.13532770031312716, |
|
"learning_rate": 0.0007720031621888615, |
|
"loss": 0.879, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.7773610637572451, |
|
"grad_norm": 0.10118664495666357, |
|
"learning_rate": 0.0007669876861628144, |
|
"loss": 0.87, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7841800204568701, |
|
"grad_norm": 0.09994831940420743, |
|
"learning_rate": 0.0007619343452849349, |
|
"loss": 0.8759, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.790998977156495, |
|
"grad_norm": 0.12994411471721243, |
|
"learning_rate": 0.0007568438562324833, |
|
"loss": 0.8783, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.79781793385612, |
|
"grad_norm": 0.1169831966799223, |
|
"learning_rate": 0.0007517169409511664, |
|
"loss": 0.8672, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.8046368905557449, |
|
"grad_norm": 0.10141671278849922, |
|
"learning_rate": 0.0007465543265527482, |
|
"loss": 0.8695, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.8114558472553699, |
|
"grad_norm": 0.11783176873723326, |
|
"learning_rate": 0.0007413567452119298, |
|
"loss": 0.8689, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.8182748039549949, |
|
"grad_norm": 0.12718960852527547, |
|
"learning_rate": 0.00073612493406251, |
|
"loss": 0.8708, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8250937606546198, |
|
"grad_norm": 0.12649370297868867, |
|
"learning_rate": 0.0007308596350928434, |
|
"loss": 0.8759, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.8319127173542448, |
|
"grad_norm": 0.1009142565676403, |
|
"learning_rate": 0.0007255615950406102, |
|
"loss": 0.862, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.8387316740538697, |
|
"grad_norm": 0.10490423262148889, |
|
"learning_rate": 0.0007202315652869112, |
|
"loss": 0.87, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.8455506307534947, |
|
"grad_norm": 0.09903668718402683, |
|
"learning_rate": 0.0007148703017497058, |
|
"loss": 0.8705, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.8523695874531196, |
|
"grad_norm": 0.14442585966687965, |
|
"learning_rate": 0.0007094785647766055, |
|
"loss": 0.8681, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.8591885441527446, |
|
"grad_norm": 0.16498073796059864, |
|
"learning_rate": 0.0007040571190370397, |
|
"loss": 0.8656, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.8660075008523695, |
|
"grad_norm": 0.1129999366359928, |
|
"learning_rate": 0.0006986067334138079, |
|
"loss": 0.8686, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.8728264575519945, |
|
"grad_norm": 0.09929311073474267, |
|
"learning_rate": 0.0006931281808940361, |
|
"loss": 0.8687, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8796454142516195, |
|
"grad_norm": 0.11972041401629113, |
|
"learning_rate": 0.0006876222384595477, |
|
"loss": 0.8652, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.8864643709512444, |
|
"grad_norm": 0.09934940714328987, |
|
"learning_rate": 0.0006820896869766725, |
|
"loss": 0.8709, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8932833276508694, |
|
"grad_norm": 0.10973507676224258, |
|
"learning_rate": 0.0006765313110855009, |
|
"loss": 0.8652, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.9001022843504943, |
|
"grad_norm": 0.09165992876326395, |
|
"learning_rate": 0.0006709478990886039, |
|
"loss": 0.8611, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.9069212410501193, |
|
"grad_norm": 0.09988220618681196, |
|
"learning_rate": 0.0006653402428392354, |
|
"loss": 0.8556, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.9137401977497442, |
|
"grad_norm": 0.09446350263562757, |
|
"learning_rate": 0.0006597091376290288, |
|
"loss": 0.8528, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.9205591544493692, |
|
"grad_norm": 0.08902820350532409, |
|
"learning_rate": 0.0006540553820752069, |
|
"loss": 0.8609, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.9273781111489942, |
|
"grad_norm": 0.10011816954581992, |
|
"learning_rate": 0.00064837977800732, |
|
"loss": 0.8625, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.9341970678486191, |
|
"grad_norm": 0.08775726631732685, |
|
"learning_rate": 0.0006426831303535284, |
|
"loss": 0.8576, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.9410160245482441, |
|
"grad_norm": 0.11117730920395297, |
|
"learning_rate": 0.0006369662470264462, |
|
"loss": 0.8704, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.947834981247869, |
|
"grad_norm": 0.139154974071725, |
|
"learning_rate": 0.0006312299388085596, |
|
"loss": 0.8584, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.954653937947494, |
|
"grad_norm": 0.10346972946518294, |
|
"learning_rate": 0.0006254750192372418, |
|
"loss": 0.8709, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.9614728946471189, |
|
"grad_norm": 0.09900042387283375, |
|
"learning_rate": 0.0006197023044893734, |
|
"loss": 0.8482, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.968291851346744, |
|
"grad_norm": 0.09908280186750126, |
|
"learning_rate": 0.0006139126132655905, |
|
"loss": 0.8597, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.975110808046369, |
|
"grad_norm": 0.13127044402154367, |
|
"learning_rate": 0.0006081067666741757, |
|
"loss": 0.8483, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.9819297647459939, |
|
"grad_norm": 0.09623829465725477, |
|
"learning_rate": 0.0006022855881146053, |
|
"loss": 0.8589, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.9887487214456189, |
|
"grad_norm": 0.09762948252187646, |
|
"learning_rate": 0.0005964499031607727, |
|
"loss": 0.8479, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.9955676781452438, |
|
"grad_norm": 0.11109345413298818, |
|
"learning_rate": 0.0005906005394439044, |
|
"loss": 0.8625, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9996590521650187, |
|
"eval_loss": 0.8650394678115845, |
|
"eval_runtime": 60.297, |
|
"eval_samples_per_second": 184.404, |
|
"eval_steps_per_second": 5.771, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 1.0023866348448687, |
|
"grad_norm": 0.150003136868036, |
|
"learning_rate": 0.0005847383265351828, |
|
"loss": 0.8218, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.0092055915444937, |
|
"grad_norm": 0.1668337683466409, |
|
"learning_rate": 0.0005788640958280941, |
|
"loss": 0.7516, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.0160245482441186, |
|
"grad_norm": 0.14142825658305677, |
|
"learning_rate": 0.0005729786804205181, |
|
"loss": 0.7548, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 1.0228435049437437, |
|
"grad_norm": 0.12528810569521878, |
|
"learning_rate": 0.0005670829149965773, |
|
"loss": 0.7473, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.0296624616433685, |
|
"grad_norm": 0.11736651721866655, |
|
"learning_rate": 0.0005611776357082579, |
|
"loss": 0.747, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 1.0364814183429936, |
|
"grad_norm": 0.09403020504249461, |
|
"learning_rate": 0.0005552636800568266, |
|
"loss": 0.7621, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.0433003750426184, |
|
"grad_norm": 0.11315198038951943, |
|
"learning_rate": 0.0005493418867740529, |
|
"loss": 0.745, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 1.0501193317422435, |
|
"grad_norm": 0.1022560757959738, |
|
"learning_rate": 0.0005434130957032589, |
|
"loss": 0.7459, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.0569382884418683, |
|
"grad_norm": 0.10922375995449658, |
|
"learning_rate": 0.0005374781476802096, |
|
"loss": 0.7466, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.0637572451414934, |
|
"grad_norm": 0.5193070680732547, |
|
"learning_rate": 0.0005315378844138647, |
|
"loss": 0.7627, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.0705762018411182, |
|
"grad_norm": 0.12177144687559627, |
|
"learning_rate": 0.0005255931483670049, |
|
"loss": 0.7546, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 1.0773951585407433, |
|
"grad_norm": 0.7694014802591276, |
|
"learning_rate": 0.000519644782636751, |
|
"loss": 0.7528, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.0842141152403681, |
|
"grad_norm": 0.11357197300498553, |
|
"learning_rate": 0.000513693630834995, |
|
"loss": 0.7533, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 1.0910330719399932, |
|
"grad_norm": 0.09386251189985013, |
|
"learning_rate": 0.0005077405369687564, |
|
"loss": 0.7542, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.097852028639618, |
|
"grad_norm": 0.09656492984059917, |
|
"learning_rate": 0.0005017863453204828, |
|
"loss": 0.7453, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 1.1046709853392431, |
|
"grad_norm": 0.09988629196548339, |
|
"learning_rate": 0.0004958319003283121, |
|
"loss": 0.7567, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.111489942038868, |
|
"grad_norm": 0.1340291294865019, |
|
"learning_rate": 0.0004898780464663119, |
|
"loss": 0.7572, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.118308898738493, |
|
"grad_norm": 0.12216818040509048, |
|
"learning_rate": 0.00048392562812471485, |
|
"loss": 0.7386, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.1251278554381179, |
|
"grad_norm": 0.10511324731067179, |
|
"learning_rate": 0.0004779754894901638, |
|
"loss": 0.7551, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.131946812137743, |
|
"grad_norm": 0.08878475250721538, |
|
"learning_rate": 0.00047202847442598845, |
|
"loss": 0.7476, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.1387657688373678, |
|
"grad_norm": 0.10961153167460981, |
|
"learning_rate": 0.0004660854263525255, |
|
"loss": 0.7501, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.1455847255369929, |
|
"grad_norm": 0.11227235770228143, |
|
"learning_rate": 0.0004601471881275041, |
|
"loss": 0.7468, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.1524036822366177, |
|
"grad_norm": 0.14319195588727593, |
|
"learning_rate": 0.00045421460192650786, |
|
"loss": 0.7459, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.1592226389362428, |
|
"grad_norm": 0.12581674820255845, |
|
"learning_rate": 0.00044828850912353703, |
|
"loss": 0.7496, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.1660415956358676, |
|
"grad_norm": 0.10879564209007402, |
|
"learning_rate": 0.0004423697501716823, |
|
"loss": 0.7377, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.1728605523354927, |
|
"grad_norm": 0.10910700864864685, |
|
"learning_rate": 0.00043645916448392885, |
|
"loss": 0.7569, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.1796795090351175, |
|
"grad_norm": 0.12233584779335835, |
|
"learning_rate": 0.00043055759031411007, |
|
"loss": 0.7451, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.1864984657347426, |
|
"grad_norm": 0.09336996907684925, |
|
"learning_rate": 0.0004246658646380229, |
|
"loss": 0.7486, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.1933174224343674, |
|
"grad_norm": 0.09297895821672336, |
|
"learning_rate": 0.00041878482303472745, |
|
"loss": 0.7522, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.2001363791339925, |
|
"grad_norm": 0.09998221831882348, |
|
"learning_rate": 0.00041291529956804195, |
|
"loss": 0.756, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.2069553358336174, |
|
"grad_norm": 0.09246595578930603, |
|
"learning_rate": 0.0004070581266682539, |
|
"loss": 0.7564, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.2137742925332424, |
|
"grad_norm": 0.09888677663554202, |
|
"learning_rate": 0.000401214135014063, |
|
"loss": 0.7406, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.2205932492328673, |
|
"grad_norm": 0.09840879351277865, |
|
"learning_rate": 0.0003953841534147725, |
|
"loss": 0.7444, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.2274122059324923, |
|
"grad_norm": 0.09608186181313223, |
|
"learning_rate": 0.000389569008692745, |
|
"loss": 0.7434, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2342311626321174, |
|
"grad_norm": 0.12027155555266326, |
|
"learning_rate": 0.0003837695255661403, |
|
"loss": 0.7505, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.2410501193317423, |
|
"grad_norm": 0.09938398385533825, |
|
"learning_rate": 0.00037798652653195266, |
|
"loss": 0.7524, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.247869076031367, |
|
"grad_norm": 0.0963922155626505, |
|
"learning_rate": 0.00037222083174936137, |
|
"loss": 0.7481, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.2546880327309922, |
|
"grad_norm": 0.0959346234671189, |
|
"learning_rate": 0.00036647325892341393, |
|
"loss": 0.7398, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.2615069894306172, |
|
"grad_norm": 0.08820345622198668, |
|
"learning_rate": 0.0003607446231890575, |
|
"loss": 0.7405, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.268325946130242, |
|
"grad_norm": 0.11589956732636802, |
|
"learning_rate": 0.0003550357369955347, |
|
"loss": 0.7509, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.275144902829867, |
|
"grad_norm": 0.10547612256066802, |
|
"learning_rate": 0.00034934740999115866, |
|
"loss": 0.7502, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.281963859529492, |
|
"grad_norm": 0.11823191257134831, |
|
"learning_rate": 0.00034368044890848814, |
|
"loss": 0.7661, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.288782816229117, |
|
"grad_norm": 0.09346920096074791, |
|
"learning_rate": 0.0003380356574499141, |
|
"loss": 0.7409, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.295601772928742, |
|
"grad_norm": 0.09247106405202875, |
|
"learning_rate": 0.00033241383617367706, |
|
"loss": 0.7448, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.3024207296283667, |
|
"grad_norm": 0.10126097744075767, |
|
"learning_rate": 0.00032681578238032914, |
|
"loss": 0.7455, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.3092396863279918, |
|
"grad_norm": 0.10957664402150283, |
|
"learning_rate": 0.0003212422899996599, |
|
"loss": 0.7442, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.3160586430276169, |
|
"grad_norm": 0.08997598908036177, |
|
"learning_rate": 0.0003156941494780983, |
|
"loss": 0.7399, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.3228775997272417, |
|
"grad_norm": 0.09339106504051306, |
|
"learning_rate": 0.0003101721476666106, |
|
"loss": 0.7473, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.3296965564268666, |
|
"grad_norm": 0.09094074237158431, |
|
"learning_rate": 0.00030467706770910687, |
|
"loss": 0.7349, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.3365155131264916, |
|
"grad_norm": 0.08949096519435662, |
|
"learning_rate": 0.00029920968893137277, |
|
"loss": 0.7335, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.3433344698261167, |
|
"grad_norm": 0.10666408169466289, |
|
"learning_rate": 0.00029377078673054524, |
|
"loss": 0.7431, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.3501534265257416, |
|
"grad_norm": 0.09360456030533196, |
|
"learning_rate": 0.00028836113246514215, |
|
"loss": 0.7504, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.3569723832253664, |
|
"grad_norm": 0.0857322076398207, |
|
"learning_rate": 0.00028298149334566745, |
|
"loss": 0.7461, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.3637913399249915, |
|
"grad_norm": 0.08664863370331233, |
|
"learning_rate": 0.0002776326323258029, |
|
"loss": 0.7347, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.3706102966246165, |
|
"grad_norm": 0.09553160941014031, |
|
"learning_rate": 0.0002723153079942047, |
|
"loss": 0.7432, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.3774292533242414, |
|
"grad_norm": 0.09757917450665214, |
|
"learning_rate": 0.00026703027446691753, |
|
"loss": 0.7319, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.3842482100238662, |
|
"grad_norm": 0.13001681090880932, |
|
"learning_rate": 0.0002617782812804252, |
|
"loss": 0.731, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.3910671667234913, |
|
"grad_norm": 0.09842205689754281, |
|
"learning_rate": 0.00025656007328534857, |
|
"loss": 0.7377, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.3978861234231164, |
|
"grad_norm": 0.08966471368100484, |
|
"learning_rate": 0.00025137639054080975, |
|
"loss": 0.7393, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.4047050801227412, |
|
"grad_norm": 0.09544505949561016, |
|
"learning_rate": 0.0002462279682094742, |
|
"loss": 0.7448, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.411524036822366, |
|
"grad_norm": 0.09721892448809809, |
|
"learning_rate": 0.00024111553645328872, |
|
"loss": 0.7383, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.4183429935219911, |
|
"grad_norm": 0.09528944859478844, |
|
"learning_rate": 0.00023603982032992861, |
|
"loss": 0.743, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.4251619502216162, |
|
"grad_norm": 0.09295756990644403, |
|
"learning_rate": 0.00023100153968996678, |
|
"loss": 0.7354, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.431980906921241, |
|
"grad_norm": 0.09085933894422743, |
|
"learning_rate": 0.0002260014090747845, |
|
"loss": 0.7363, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.4387998636208659, |
|
"grad_norm": 0.09685032026741523, |
|
"learning_rate": 0.00022104013761523156, |
|
"loss": 0.7457, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.445618820320491, |
|
"grad_norm": 0.08857516230100924, |
|
"learning_rate": 0.00021611842893105726, |
|
"loss": 0.742, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.452437777020116, |
|
"grad_norm": 0.08773056464470375, |
|
"learning_rate": 0.0002112369810311201, |
|
"loss": 0.7332, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.4592567337197409, |
|
"grad_norm": 0.0900763405573559, |
|
"learning_rate": 0.00020639648621439488, |
|
"loss": 0.7449, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.466075690419366, |
|
"grad_norm": 0.09851993938416971, |
|
"learning_rate": 0.00020159763097178952, |
|
"loss": 0.7402, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.4728946471189908, |
|
"grad_norm": 0.08759892676999576, |
|
"learning_rate": 0.0001968410958887849, |
|
"loss": 0.7445, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.4797136038186158, |
|
"grad_norm": 0.0907347079072196, |
|
"learning_rate": 0.0001921275555489122, |
|
"loss": 0.7399, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.4865325605182407, |
|
"grad_norm": 0.08112607532285956, |
|
"learning_rate": 0.00018745767843808209, |
|
"loss": 0.7304, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.4933515172178657, |
|
"grad_norm": 0.09161753107862378, |
|
"learning_rate": 0.000182832126849779, |
|
"loss": 0.7285, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.5001704739174906, |
|
"grad_norm": 0.08358605650538908, |
|
"learning_rate": 0.00017825155679113204, |
|
"loss": 0.7348, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.5069894306171157, |
|
"grad_norm": 0.08675686822052969, |
|
"learning_rate": 0.00017371661788987875, |
|
"loss": 0.7409, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.5138083873167405, |
|
"grad_norm": 0.08810226317282835, |
|
"learning_rate": 0.0001692279533022339, |
|
"loss": 0.7384, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.5206273440163653, |
|
"grad_norm": 0.08950130348184156, |
|
"learning_rate": 0.00016478619962167495, |
|
"loss": 0.7492, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.5274463007159904, |
|
"grad_norm": 0.08034358957544274, |
|
"learning_rate": 0.00016039198678865861, |
|
"loss": 0.7332, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.5342652574156155, |
|
"grad_norm": 0.09875564116278145, |
|
"learning_rate": 0.00015604593800128157, |
|
"loss": 0.7245, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.5410842141152403, |
|
"grad_norm": 0.08654215732134887, |
|
"learning_rate": 0.00015174866962689655, |
|
"loss": 0.735, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.5479031708148652, |
|
"grad_norm": 0.08592984519220763, |
|
"learning_rate": 0.00014750079111469844, |
|
"loss": 0.7409, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.5547221275144902, |
|
"grad_norm": 0.09146622488124098, |
|
"learning_rate": 0.00014330290490928936, |
|
"loss": 0.7307, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.5615410842141153, |
|
"grad_norm": 0.09107009139517197, |
|
"learning_rate": 0.00013915560636524005, |
|
"loss": 0.7298, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.5683600409137401, |
|
"grad_norm": 0.08290920543587224, |
|
"learning_rate": 0.0001350594836626537, |
|
"loss": 0.7288, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.575178997613365, |
|
"grad_norm": 0.089017743866575, |
|
"learning_rate": 0.00013101511772375002, |
|
"loss": 0.721, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.58199795431299, |
|
"grad_norm": 0.08395496088906577, |
|
"learning_rate": 0.00012702308213047653, |
|
"loss": 0.7369, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.5888169110126151, |
|
"grad_norm": 0.09332825258379267, |
|
"learning_rate": 0.00012308394304316224, |
|
"loss": 0.7331, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.59563586771224, |
|
"grad_norm": 0.08663551849922511, |
|
"learning_rate": 0.00011919825912022336, |
|
"loss": 0.7284, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.6024548244118648, |
|
"grad_norm": 0.08467643435872389, |
|
"learning_rate": 0.00011536658143893309, |
|
"loss": 0.73, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.6092737811114899, |
|
"grad_norm": 0.10792296188935317, |
|
"learning_rate": 0.0001115894534172659, |
|
"loss": 0.7304, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.616092737811115, |
|
"grad_norm": 0.08665506882217053, |
|
"learning_rate": 0.0001078674107368291, |
|
"loss": 0.7365, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.6229116945107398, |
|
"grad_norm": 0.0869258591144236, |
|
"learning_rate": 0.00010420098126689159, |
|
"loss": 0.7271, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.6297306512103649, |
|
"grad_norm": 0.08082381008828005, |
|
"learning_rate": 0.00010059068498951912, |
|
"loss": 0.7334, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.63654960790999, |
|
"grad_norm": 0.09653014658305435, |
|
"learning_rate": 9.70370339258298e-05, |
|
"loss": 0.733, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.6433685646096148, |
|
"grad_norm": 0.08209631476731684, |
|
"learning_rate": 9.354053206337803e-05, |
|
"loss": 0.7295, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.6501875213092396, |
|
"grad_norm": 0.08281543379309102, |
|
"learning_rate": 9.010167528467784e-05, |
|
"loss": 0.7282, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.6570064780088647, |
|
"grad_norm": 0.09079242657548177, |
|
"learning_rate": 8.672095129687485e-05, |
|
"loss": 0.7332, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.6638254347084898, |
|
"grad_norm": 0.08732562104521625, |
|
"learning_rate": 8.33988395625791e-05, |
|
"loss": 0.7196, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.6706443914081146, |
|
"grad_norm": 0.09634799774642026, |
|
"learning_rate": 8.013581123186675e-05, |
|
"loss": 0.7243, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.6774633481077394, |
|
"grad_norm": 0.0892053596550817, |
|
"learning_rate": 7.693232907545955e-05, |
|
"loss": 0.7284, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.6842823048073645, |
|
"grad_norm": 0.08080152740232713, |
|
"learning_rate": 7.378884741909409e-05, |
|
"loss": 0.7226, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.6911012615069896, |
|
"grad_norm": 0.08211682147824328, |
|
"learning_rate": 7.070581207908832e-05, |
|
"loss": 0.7293, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.6979202182066144, |
|
"grad_norm": 0.08125416074026208, |
|
"learning_rate": 6.76836602991146e-05, |
|
"loss": 0.7241, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.7047391749062393, |
|
"grad_norm": 0.08118597843483698, |
|
"learning_rate": 6.472282068818857e-05, |
|
"loss": 0.7176, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.7115581316058643, |
|
"grad_norm": 0.08444459255476687, |
|
"learning_rate": 6.182371315988283e-05, |
|
"loss": 0.7345, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.7183770883054894, |
|
"grad_norm": 0.08993862852560496, |
|
"learning_rate": 5.898674887277394e-05, |
|
"loss": 0.7287, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.7251960450051143, |
|
"grad_norm": 0.09172625446804755, |
|
"learning_rate": 5.621233017213095e-05, |
|
"loss": 0.728, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.732015001704739, |
|
"grad_norm": 0.09728192339349417, |
|
"learning_rate": 5.3500850532853477e-05, |
|
"loss": 0.7252, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.7388339584043642, |
|
"grad_norm": 0.08183637004716483, |
|
"learning_rate": 5.085269450366836e-05, |
|
"loss": 0.7249, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.7456529151039892, |
|
"grad_norm": 0.08417398467645139, |
|
"learning_rate": 4.8268237652591805e-05, |
|
"loss": 0.7263, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.752471871803614, |
|
"grad_norm": 0.08559525343966955, |
|
"learning_rate": 4.574784651366581e-05, |
|
"loss": 0.7307, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.759290828503239, |
|
"grad_norm": 0.08652462817281027, |
|
"learning_rate": 4.329187853497491e-05, |
|
"loss": 0.7321, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.766109785202864, |
|
"grad_norm": 0.07780149732014437, |
|
"learning_rate": 4.0900682027952274e-05, |
|
"loss": 0.7259, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.772928741902489, |
|
"grad_norm": 0.08073420987197251, |
|
"learning_rate": 3.8574596117981367e-05, |
|
"loss": 0.7318, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.779747698602114, |
|
"grad_norm": 0.09399125884464814, |
|
"learning_rate": 3.631395069630039e-05, |
|
"loss": 0.7286, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.7865666553017387, |
|
"grad_norm": 0.08683728371693161, |
|
"learning_rate": 3.411906637321588e-05, |
|
"loss": 0.7281, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.7933856120013638, |
|
"grad_norm": 0.08220865889050938, |
|
"learning_rate": 3.199025443263331e-05, |
|
"loss": 0.7214, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.8002045687009889, |
|
"grad_norm": 0.08394991114226853, |
|
"learning_rate": 2.9927816787910233e-05, |
|
"loss": 0.7274, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.8070235254006137, |
|
"grad_norm": 0.08648979166756451, |
|
"learning_rate": 2.79320459390377e-05, |
|
"loss": 0.728, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.8138424821002386, |
|
"grad_norm": 0.0809919030945408, |
|
"learning_rate": 2.600322493115742e-05, |
|
"loss": 0.7207, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.8206614387998636, |
|
"grad_norm": 0.08146420956821185, |
|
"learning_rate": 2.414162731441971e-05, |
|
"loss": 0.7279, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.8274803954994887, |
|
"grad_norm": 0.08991080627514368, |
|
"learning_rate": 2.2347517105187952e-05, |
|
"loss": 0.7285, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.8342993521991136, |
|
"grad_norm": 0.08113174604843235, |
|
"learning_rate": 2.062114874859483e-05, |
|
"loss": 0.7212, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.8411183088987384, |
|
"grad_norm": 0.0890991046050432, |
|
"learning_rate": 1.8962767082456368e-05, |
|
"loss": 0.7238, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.8479372655983635, |
|
"grad_norm": 0.08194468587127682, |
|
"learning_rate": 1.7372607302548916e-05, |
|
"loss": 0.7278, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.8547562222979885, |
|
"grad_norm": 0.0796060532873072, |
|
"learning_rate": 1.58508949292524e-05, |
|
"loss": 0.7303, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.8615751789976134, |
|
"grad_norm": 0.08325978355636353, |
|
"learning_rate": 1.439784577556702e-05, |
|
"loss": 0.7259, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.8683941356972382, |
|
"grad_norm": 0.08101951703165321, |
|
"learning_rate": 1.3013665916505824e-05, |
|
"loss": 0.7213, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.8752130923968633, |
|
"grad_norm": 0.07474258489762015, |
|
"learning_rate": 1.1698551659868716e-05, |
|
"loss": 0.7251, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.8820320490964884, |
|
"grad_norm": 0.07988354824659524, |
|
"learning_rate": 1.0452689518401615e-05, |
|
"loss": 0.7336, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.8888510057961132, |
|
"grad_norm": 0.0869204045893242, |
|
"learning_rate": 9.276256183344767e-06, |
|
"loss": 0.7225, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.895669962495738, |
|
"grad_norm": 0.08073129640276419, |
|
"learning_rate": 8.169418499373749e-06, |
|
"loss": 0.719, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.9024889191953631, |
|
"grad_norm": 0.09277363778878074, |
|
"learning_rate": 7.132333440937666e-06, |
|
"loss": 0.7163, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.9093078758949882, |
|
"grad_norm": 0.08156654600969289, |
|
"learning_rate": 6.165148089996075e-06, |
|
"loss": 0.7294, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.916126832594613, |
|
"grad_norm": 0.07931208656000137, |
|
"learning_rate": 5.267999615159724e-06, |
|
"loss": 0.7188, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.9229457892942379, |
|
"grad_norm": 0.08734085147118956, |
|
"learning_rate": 4.441015252237113e-06, |
|
"loss": 0.7239, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.929764745993863, |
|
"grad_norm": 0.07927625348774799, |
|
"learning_rate": 3.684312286189151e-06, |
|
"loss": 0.737, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.936583702693488, |
|
"grad_norm": 0.08278514720419748, |
|
"learning_rate": 2.997998034495908e-06, |
|
"loss": 0.7232, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.9434026593931129, |
|
"grad_norm": 0.09337537685419582, |
|
"learning_rate": 2.382169831936565e-06, |
|
"loss": 0.7327, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.9502216160927377, |
|
"grad_norm": 0.07802348568191127, |
|
"learning_rate": 1.8369150167848459e-06, |
|
"loss": 0.7168, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.9570405727923628, |
|
"grad_norm": 0.07582457656438514, |
|
"learning_rate": 1.3623109184228711e-06, |
|
"loss": 0.7111, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.9638595294919878, |
|
"grad_norm": 0.07999706762970123, |
|
"learning_rate": 9.584248463739288e-07, |
|
"loss": 0.7251, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.9706784861916127, |
|
"grad_norm": 0.08421307868632791, |
|
"learning_rate": 6.253140807562785e-07, |
|
"loss": 0.7172, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.9774974428912375, |
|
"grad_norm": 0.08148471965897129, |
|
"learning_rate": 3.630258641600381e-07, |
|
"loss": 0.7198, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.9843163995908626, |
|
"grad_norm": 0.0748791624620726, |
|
"learning_rate": 1.7159739494654325e-07, |
|
"loss": 0.7267, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.9911353562904877, |
|
"grad_norm": 0.08613985665976748, |
|
"learning_rate": 5.1055821973289286e-08, |
|
"loss": 0.7258, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.9979543129901125, |
|
"grad_norm": 0.07749130518414991, |
|
"learning_rate": 1.418240743289445e-09, |
|
"loss": 0.7335, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.9993181043300376, |
|
"eval_loss": 0.8305559158325195, |
|
"eval_runtime": 60.2883, |
|
"eval_samples_per_second": 184.43, |
|
"eval_steps_per_second": 5.772, |
|
"step": 1466 |
|
}, |
|
{ |
|
"epoch": 1.9993181043300376, |
|
"step": 1466, |
|
"total_flos": 143858545459200.0, |
|
"train_loss": 0.8359313647060732, |
|
"train_runtime": 3324.125, |
|
"train_samples_per_second": 56.452, |
|
"train_steps_per_second": 0.441 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1466, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 143858545459200.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |