{ "best_metric": 1.2184005975723267, "best_model_checkpoint": "./output/training_results/C021_random_sample_llama3-8b-base_pretrain_20240505_135320/checkpoint-800", "epoch": 4.0, "eval_steps": 200, "global_step": 1876, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021321961620469083, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 1.3827, "step": 1 }, { "epoch": 0.010660980810234541, "grad_norm": 2.0635243899751066, "learning_rate": 2.25e-06, "loss": 1.4616, "step": 5 }, { "epoch": 0.021321961620469083, "grad_norm": 1.903018233428319, "learning_rate": 6e-06, "loss": 1.2299, "step": 10 }, { "epoch": 0.031982942430703626, "grad_norm": 1.5467734081397355, "learning_rate": 9.75e-06, "loss": 1.2843, "step": 15 }, { "epoch": 0.042643923240938165, "grad_norm": 1.995653146329813, "learning_rate": 1.3500000000000001e-05, "loss": 1.31, "step": 20 }, { "epoch": 0.053304904051172705, "grad_norm": 2.410826051077061, "learning_rate": 1.473632431655486e-05, "loss": 1.3681, "step": 25 }, { "epoch": 0.06396588486140725, "grad_norm": 1.765947093190964, "learning_rate": 1.4306244585426726e-05, "loss": 1.3228, "step": 30 }, { "epoch": 0.07462686567164178, "grad_norm": 1.8242122051647833, "learning_rate": 1.3887645147313336e-05, "loss": 1.2664, "step": 35 }, { "epoch": 0.08528784648187633, "grad_norm": 1.8945258342407882, "learning_rate": 1.3560845370175857e-05, "loss": 1.3073, "step": 40 }, { "epoch": 0.09594882729211088, "grad_norm": 1.4272941224837, "learning_rate": 1.3162217563557544e-05, "loss": 1.3019, "step": 45 }, { "epoch": 0.10660980810234541, "grad_norm": 1.4171437963555575, "learning_rate": 1.2774310614975657e-05, "loss": 1.3613, "step": 50 }, { "epoch": 0.11727078891257996, "grad_norm": 1.4495644289679146, "learning_rate": 1.2396864324754692e-05, "loss": 1.4084, "step": 55 }, { "epoch": 0.1279317697228145, "grad_norm": 1.34494402339979, "learning_rate": 1.2029624121909564e-05, "loss": 1.3002, "step": 60 }, { "epoch": 0.13859275053304904, "grad_norm": 1.438232163080863, "learning_rate": 1.1672340957315019e-05, "loss": 1.3643, "step": 65 }, { "epoch": 0.14925373134328357, "grad_norm": 1.5674663877824588, "learning_rate": 1.1324771198617955e-05, "loss": 1.2599, "step": 70 }, { "epoch": 0.15991471215351813, "grad_norm": 1.3266580498364144, "learning_rate": 1.0986676526868454e-05, "loss": 1.307, "step": 75 }, { "epoch": 0.17057569296375266, "grad_norm": 3.7513449392256173, "learning_rate": 1.0657823834846472e-05, "loss": 1.3701, "step": 80 }, { "epoch": 0.1812366737739872, "grad_norm": 5.184018665584405, "learning_rate": 1.0337985127060489e-05, "loss": 1.37, "step": 85 }, { "epoch": 0.19189765458422176, "grad_norm": 1.848934855985837, "learning_rate": 1.0026937421395617e-05, "loss": 1.2708, "step": 90 }, { "epoch": 0.2025586353944563, "grad_norm": 1.579847110813568, "learning_rate": 9.724462652387962e-06, "loss": 1.3402, "step": 95 }, { "epoch": 0.21321961620469082, "grad_norm": 1.5615282561022026, "learning_rate": 9.430347576103212e-06, "loss": 1.2961, "step": 100 }, { "epoch": 0.22388059701492538, "grad_norm": 1.3659017705844316, "learning_rate": 9.1443836765968e-06, "loss": 1.2203, "step": 105 }, { "epoch": 0.2345415778251599, "grad_norm": 1.4562735931359865, "learning_rate": 8.921344243503367e-06, "loss": 1.2945, "step": 110 }, { "epoch": 0.24520255863539445, "grad_norm": 1.2855121058923065, "learning_rate": 8.64954174238431e-06, "loss": 1.295, "step": 115 }, { "epoch": 0.255863539445629, "grad_norm": 1.3082074964800576, 
"learning_rate": 8.385330949249671e-06, "loss": 1.3053, "step": 120 }, { "epoch": 0.26652452025586354, "grad_norm": 1.2815031342227978, "learning_rate": 8.128520488668967e-06, "loss": 1.2994, "step": 125 }, { "epoch": 0.2771855010660981, "grad_norm": 1.3107524150250855, "learning_rate": 7.878923285543024e-06, "loss": 1.3281, "step": 130 }, { "epoch": 0.2878464818763326, "grad_norm": 1.1907535460514518, "learning_rate": 7.636356480312689e-06, "loss": 1.2579, "step": 135 }, { "epoch": 0.29850746268656714, "grad_norm": 1.2284678609494983, "learning_rate": 7.400641345604814e-06, "loss": 1.1634, "step": 140 }, { "epoch": 0.3091684434968017, "grad_norm": 1.3826281905120297, "learning_rate": 7.171603204294806e-06, "loss": 1.3058, "step": 145 }, { "epoch": 0.31982942430703626, "grad_norm": 2.431901009888026, "learning_rate": 6.949071348965877e-06, "loss": 1.3096, "step": 150 }, { "epoch": 0.3304904051172708, "grad_norm": 1.4504346901041576, "learning_rate": 6.732878962744999e-06, "loss": 1.2943, "step": 155 }, { "epoch": 0.3411513859275053, "grad_norm": 1.2723296431138542, "learning_rate": 6.5228630414958555e-06, "loss": 1.351, "step": 160 }, { "epoch": 0.35181236673773986, "grad_norm": 1.2139070950942783, "learning_rate": 6.318864317349401e-06, "loss": 1.2937, "step": 165 }, { "epoch": 0.3624733475479744, "grad_norm": 5.348465582649999, "learning_rate": 6.120727183552839e-06, "loss": 1.2726, "step": 170 }, { "epoch": 0.373134328358209, "grad_norm": 1.6504356499880541, "learning_rate": 5.928299620617992e-06, "loss": 1.3039, "step": 175 }, { "epoch": 0.3837953091684435, "grad_norm": 1.1981923340348997, "learning_rate": 5.7414331237502024e-06, "loss": 1.2027, "step": 180 }, { "epoch": 0.39445628997867804, "grad_norm": 1.229607999022085, "learning_rate": 5.559982631539405e-06, "loss": 1.2637, "step": 185 }, { "epoch": 0.4051172707889126, "grad_norm": 4.9301877176481606, "learning_rate": 5.383806455894783e-06, "loss": 1.2601, "step": 190 }, { "epoch": 0.4157782515991471, "grad_norm": 1.2303183328754892, "learning_rate": 5.212766213205117e-06, "loss": 1.1995, "step": 195 }, { "epoch": 0.42643923240938164, "grad_norm": 1.1512051049767256, "learning_rate": 5.046726756706699e-06, "loss": 1.2572, "step": 200 }, { "epoch": 0.42643923240938164, "eval_loss": 1.261182188987732, "eval_runtime": 87.341, "eval_samples_per_second": 19.086, "eval_steps_per_second": 0.309, "step": 200 }, { "epoch": 0.43710021321961623, "grad_norm": 1.2483299149897367, "learning_rate": 4.885556110041288e-06, "loss": 1.2665, "step": 205 }, { "epoch": 0.44776119402985076, "grad_norm": 1.1979266901021162, "learning_rate": 4.7291254019864345e-06, "loss": 1.1936, "step": 210 }, { "epoch": 0.4584221748400853, "grad_norm": 1.1685251787317164, "learning_rate": 4.577308802341063e-06, "loss": 1.3458, "step": 215 }, { "epoch": 0.4690831556503198, "grad_norm": 1.156653489171017, "learning_rate": 4.42998345894903e-06, "loss": 1.3039, "step": 220 }, { "epoch": 0.47974413646055436, "grad_norm": 1.2415273415640915, "learning_rate": 4.287029435843979e-06, "loss": 1.2127, "step": 225 }, { "epoch": 0.4904051172707889, "grad_norm": 1.0890064939371906, "learning_rate": 4.148329652498597e-06, "loss": 1.2844, "step": 230 }, { "epoch": 0.5010660980810234, "grad_norm": 1.2341998642747427, "learning_rate": 4.013769824161997e-06, "loss": 1.2989, "step": 235 }, { "epoch": 0.511727078891258, "grad_norm": 1.0800044414980956, "learning_rate": 3.883238403268737e-06, "loss": 1.18, "step": 240 }, { "epoch": 0.5223880597014925, "grad_norm": 1.1911502497460917, 
"learning_rate": 3.7566265219035852e-06, "loss": 1.2198, "step": 245 }, { "epoch": 0.5330490405117271, "grad_norm": 1.1875916676432052, "learning_rate": 3.633827935305925e-06, "loss": 1.2321, "step": 250 }, { "epoch": 0.5437100213219617, "grad_norm": 1.1285856989543368, "learning_rate": 3.5147389663983076e-06, "loss": 1.2063, "step": 255 }, { "epoch": 0.5543710021321961, "grad_norm": 1.2134517080609044, "learning_rate": 3.3992584513234327e-06, "loss": 1.2034, "step": 260 }, { "epoch": 0.5650319829424307, "grad_norm": 1.1497034722164001, "learning_rate": 3.2872876859744165e-06, "loss": 1.2478, "step": 265 }, { "epoch": 0.5756929637526652, "grad_norm": 1.1573953347115111, "learning_rate": 3.1787303735030416e-06, "loss": 1.217, "step": 270 }, { "epoch": 0.5863539445628998, "grad_norm": 1.1278493709315967, "learning_rate": 3.0734925727911652e-06, "loss": 1.2251, "step": 275 }, { "epoch": 0.5970149253731343, "grad_norm": 1.2020449597183245, "learning_rate": 2.971482647870452e-06, "loss": 1.1144, "step": 280 }, { "epoch": 0.6076759061833689, "grad_norm": 1.2203652918926715, "learning_rate": 2.8726112182758347e-06, "loss": 1.281, "step": 285 }, { "epoch": 0.6183368869936035, "grad_norm": 1.1541053058741007, "learning_rate": 2.7767911103183137e-06, "loss": 1.2344, "step": 290 }, { "epoch": 0.6289978678038379, "grad_norm": 1.170855117533204, "learning_rate": 2.6839373092628783e-06, "loss": 1.1737, "step": 295 }, { "epoch": 0.6396588486140725, "grad_norm": 1.1313568462008778, "learning_rate": 2.593966912397475e-06, "loss": 1.2395, "step": 300 }, { "epoch": 0.650319829424307, "grad_norm": 1.0884221660719775, "learning_rate": 2.5067990829791376e-06, "loss": 1.2274, "step": 305 }, { "epoch": 0.6609808102345416, "grad_norm": 1.1287129229662805, "learning_rate": 2.4223550050435816e-06, "loss": 1.2406, "step": 310 }, { "epoch": 0.6716417910447762, "grad_norm": 1.158009309923302, "learning_rate": 2.340557839064751e-06, "loss": 1.2206, "step": 315 }, { "epoch": 0.6823027718550106, "grad_norm": 1.120129915614599, "learning_rate": 2.261332678450931e-06, "loss": 1.1631, "step": 320 }, { "epoch": 0.6929637526652452, "grad_norm": 1.2657256378770898, "learning_rate": 2.184606506864227e-06, "loss": 1.2707, "step": 325 }, { "epoch": 0.7036247334754797, "grad_norm": 1.1637951032951959, "learning_rate": 2.11030815635039e-06, "loss": 1.2491, "step": 330 }, { "epoch": 0.7142857142857143, "grad_norm": 1.2287205505399468, "learning_rate": 2.0383682662661417e-06, "loss": 1.22, "step": 335 }, { "epoch": 0.7249466950959488, "grad_norm": 1.326574025149143, "learning_rate": 1.9687192429912924e-06, "loss": 1.2984, "step": 340 }, { "epoch": 0.7356076759061834, "grad_norm": 1.1554026106853619, "learning_rate": 1.9012952204130788e-06, "loss": 1.2208, "step": 345 }, { "epoch": 0.746268656716418, "grad_norm": 1.0875768598668099, "learning_rate": 1.8360320211704227e-06, "loss": 1.2204, "step": 350 }, { "epoch": 0.7569296375266524, "grad_norm": 1.14982588486912, "learning_rate": 1.772867118645806e-06, "loss": 1.2091, "step": 355 }, { "epoch": 0.767590618336887, "grad_norm": 1.1836213945640173, "learning_rate": 1.7117395996927912e-06, "loss": 1.2247, "step": 360 }, { "epoch": 0.7782515991471215, "grad_norm": 1.1378213023095496, "learning_rate": 1.6525901280871983e-06, "loss": 1.1597, "step": 365 }, { "epoch": 0.7889125799573561, "grad_norm": 1.146991526368693, "learning_rate": 1.5953609086902758e-06, "loss": 1.165, "step": 370 }, { "epoch": 0.7995735607675906, "grad_norm": 1.0759826941866089, "learning_rate": 1.539995652312188e-06, 
"loss": 1.1934, "step": 375 }, { "epoch": 0.8102345415778252, "grad_norm": 1.140264301703701, "learning_rate": 1.486439541264454e-06, "loss": 1.2476, "step": 380 }, { "epoch": 0.8208955223880597, "grad_norm": 1.1750555438236339, "learning_rate": 1.434639195589973e-06, "loss": 1.2283, "step": 385 }, { "epoch": 0.8315565031982942, "grad_norm": 1.1027256002443178, "learning_rate": 1.384542639959556e-06, "loss": 1.2392, "step": 390 }, { "epoch": 0.8422174840085288, "grad_norm": 1.0969958993285855, "learning_rate": 1.336099271223909e-06, "loss": 1.2932, "step": 395 }, { "epoch": 0.8528784648187633, "grad_norm": 1.1632096677157433, "learning_rate": 1.28925982661026e-06, "loss": 1.1754, "step": 400 }, { "epoch": 0.8528784648187633, "eval_loss": 1.2225849628448486, "eval_runtime": 86.933, "eval_samples_per_second": 19.176, "eval_steps_per_second": 0.311, "step": 400 }, { "epoch": 0.8635394456289979, "grad_norm": 1.2018672364738783, "learning_rate": 1.243976352552906e-06, "loss": 1.1537, "step": 405 }, { "epoch": 0.8742004264392325, "grad_norm": 1.1964994592566494, "learning_rate": 1.2002021741471036e-06, "loss": 1.2153, "step": 410 }, { "epoch": 0.8848614072494669, "grad_norm": 1.2085715531383925, "learning_rate": 1.1578918652158905e-06, "loss": 1.2214, "step": 415 }, { "epoch": 0.8955223880597015, "grad_norm": 1.132268550904319, "learning_rate": 1.1170012189795644e-06, "loss": 1.1696, "step": 420 }, { "epoch": 0.906183368869936, "grad_norm": 1.1888002460068672, "learning_rate": 1.0774872193176617e-06, "loss": 1.2102, "step": 425 }, { "epoch": 0.9168443496801706, "grad_norm": 1.081884966553129, "learning_rate": 1.039308012613421e-06, "loss": 1.1888, "step": 430 }, { "epoch": 0.9275053304904051, "grad_norm": 1.1082024653231661, "learning_rate": 1.0024228801708736e-06, "loss": 1.3088, "step": 435 }, { "epoch": 0.9381663113006397, "grad_norm": 1.1424196623047278, "learning_rate": 9.667922111948187e-07, "loss": 1.2702, "step": 440 }, { "epoch": 0.9488272921108742, "grad_norm": 1.1515848618493325, "learning_rate": 9.323774763240816e-07, "loss": 1.2241, "step": 445 }, { "epoch": 0.9594882729211087, "grad_norm": 1.133219633313401, "learning_rate": 8.991412017085673e-07, "loss": 1.1689, "step": 450 }, { "epoch": 0.9701492537313433, "grad_norm": 1.1817034727115894, "learning_rate": 8.670469436207778e-07, "loss": 1.2429, "step": 455 }, { "epoch": 0.9808102345415778, "grad_norm": 1.082929968453826, "learning_rate": 8.360592635925826e-07, "loss": 1.223, "step": 460 }, { "epoch": 0.9914712153518124, "grad_norm": 1.0995506789476814, "learning_rate": 8.061437040681486e-07, "loss": 1.2493, "step": 465 }, { "epoch": 1.0021321961620469, "grad_norm": 1.4325931391365254, "learning_rate": 7.772667645640643e-07, "loss": 1.2061, "step": 470 }, { "epoch": 1.0127931769722816, "grad_norm": 1.3020611480277295, "learning_rate": 7.493958783278302e-07, "loss": 1.118, "step": 475 }, { "epoch": 1.023454157782516, "grad_norm": 1.166419988285865, "learning_rate": 7.22499389486003e-07, "loss": 1.0997, "step": 480 }, { "epoch": 1.0341151385927505, "grad_norm": 1.1738691779163593, "learning_rate": 6.965465306734048e-07, "loss": 1.0648, "step": 485 }, { "epoch": 1.044776119402985, "grad_norm": 1.1702760912470462, "learning_rate": 6.715074011349123e-07, "loss": 1.125, "step": 490 }, { "epoch": 1.0554371002132197, "grad_norm": 1.1386250553771318, "learning_rate": 6.473529452915077e-07, "loss": 1.12, "step": 495 }, { "epoch": 1.0660980810234542, "grad_norm": 1.145825473750238, "learning_rate": 6.240549317623237e-07, "loss": 1.1182, 
"step": 500 }, { "epoch": 1.0767590618336886, "grad_norm": 1.1263260101281638, "learning_rate": 6.015859328346043e-07, "loss": 1.0333, "step": 505 }, { "epoch": 1.0874200426439233, "grad_norm": 1.099611354066211, "learning_rate": 5.799193043735526e-07, "loss": 1.0545, "step": 510 }, { "epoch": 1.0980810234541578, "grad_norm": 1.2122056875352134, "learning_rate": 5.590291661642085e-07, "loss": 1.1501, "step": 515 }, { "epoch": 1.1087420042643923, "grad_norm": 1.2108073336890413, "learning_rate": 5.388903826775655e-07, "loss": 1.1355, "step": 520 }, { "epoch": 1.1194029850746268, "grad_norm": 1.1869552548231974, "learning_rate": 5.194785442532827e-07, "loss": 1.0177, "step": 525 }, { "epoch": 1.1300639658848615, "grad_norm": 1.0725136686500363, "learning_rate": 5.007699486914442e-07, "loss": 1.0673, "step": 530 }, { "epoch": 1.140724946695096, "grad_norm": 1.1715291176817577, "learning_rate": 4.827415832459197e-07, "loss": 0.9923, "step": 535 }, { "epoch": 1.1513859275053304, "grad_norm": 1.0912818419343246, "learning_rate": 4.6537110701200646e-07, "loss": 1.0214, "step": 540 }, { "epoch": 1.1620469083155651, "grad_norm": 1.2075583423687697, "learning_rate": 4.486368337011321e-07, "loss": 1.0799, "step": 545 }, { "epoch": 1.1727078891257996, "grad_norm": 1.1711621672939032, "learning_rate": 4.3251771479550135e-07, "loss": 1.0649, "step": 550 }, { "epoch": 1.183368869936034, "grad_norm": 1.1379570510356936, "learning_rate": 4.169933230756739e-07, "loss": 1.0275, "step": 555 }, { "epoch": 1.1940298507462686, "grad_norm": 1.1805410178961084, "learning_rate": 4.020438365141708e-07, "loss": 1.0501, "step": 560 }, { "epoch": 1.2046908315565032, "grad_norm": 1.0319879103444831, "learning_rate": 3.876500225283097e-07, "loss": 1.0616, "step": 565 }, { "epoch": 1.2153518123667377, "grad_norm": 1.1205106100162852, "learning_rate": 3.7379322258556177e-07, "loss": 1.0679, "step": 570 }, { "epoch": 1.2260127931769722, "grad_norm": 1.1518728152224136, "learning_rate": 3.6045533715483096e-07, "loss": 1.1601, "step": 575 }, { "epoch": 1.236673773987207, "grad_norm": 1.10359450734736, "learning_rate": 3.4761881099715165e-07, "loss": 1.0466, "step": 580 }, { "epoch": 1.2473347547974414, "grad_norm": 1.1001281681468988, "learning_rate": 3.3526661878940757e-07, "loss": 1.114, "step": 585 }, { "epoch": 1.2579957356076759, "grad_norm": 1.1513825927177352, "learning_rate": 3.233822510747588e-07, "loss": 1.1314, "step": 590 }, { "epoch": 1.2686567164179103, "grad_norm": 1.193698165116124, "learning_rate": 3.119497005335667e-07, "loss": 1.0904, "step": 595 }, { "epoch": 1.279317697228145, "grad_norm": 1.1645268517569372, "learning_rate": 3.0095344856870186e-07, "loss": 1.1662, "step": 600 }, { "epoch": 1.279317697228145, "eval_loss": 1.220153570175171, "eval_runtime": 86.8011, "eval_samples_per_second": 19.205, "eval_steps_per_second": 0.311, "step": 600 }, { "epoch": 1.2899786780383795, "grad_norm": 1.1865685508079404, "learning_rate": 2.903784521992152e-07, "loss": 1.0543, "step": 605 }, { "epoch": 1.3006396588486142, "grad_norm": 1.0889008660105814, "learning_rate": 2.802101312564394e-07, "loss": 1.1059, "step": 610 }, { "epoch": 1.3113006396588487, "grad_norm": 1.1629452168406549, "learning_rate": 2.7043435587668195e-07, "loss": 1.0847, "step": 615 }, { "epoch": 1.3219616204690832, "grad_norm": 1.1225822453511047, "learning_rate": 2.6103743428476387e-07, "loss": 1.0547, "step": 620 }, { "epoch": 1.3326226012793176, "grad_norm": 1.068456227087864, "learning_rate": 2.520061008627476e-07, "loss": 1.1331, "step": 625 
}, { "epoch": 1.3432835820895521, "grad_norm": 1.163340113599676, "learning_rate": 2.4332750449828163e-07, "loss": 1.0907, "step": 630 }, { "epoch": 1.3539445628997868, "grad_norm": 1.1623906096012029, "learning_rate": 2.3498919720707982e-07, "loss": 1.0788, "step": 635 }, { "epoch": 1.3646055437100213, "grad_norm": 1.1079974563308679, "learning_rate": 2.269791230241396e-07, "loss": 1.0701, "step": 640 }, { "epoch": 1.375266524520256, "grad_norm": 1.2149865496581391, "learning_rate": 2.1928560715838997e-07, "loss": 1.0376, "step": 645 }, { "epoch": 1.3859275053304905, "grad_norm": 1.1528488687976461, "learning_rate": 2.1189734540554226e-07, "loss": 1.1269, "step": 650 }, { "epoch": 1.396588486140725, "grad_norm": 1.1288464186180127, "learning_rate": 2.048033938139964e-07, "loss": 1.108, "step": 655 }, { "epoch": 1.4072494669509594, "grad_norm": 1.1817093681210782, "learning_rate": 1.979931585987499e-07, "loss": 1.0499, "step": 660 }, { "epoch": 1.417910447761194, "grad_norm": 1.1321827623646836, "learning_rate": 1.9145638629832107e-07, "loss": 1.041, "step": 665 }, { "epoch": 1.4285714285714286, "grad_norm": 1.15181492674688, "learning_rate": 1.8518315416979456e-07, "loss": 1.0994, "step": 670 }, { "epoch": 1.439232409381663, "grad_norm": 1.1127579923456647, "learning_rate": 1.7916386081716564e-07, "loss": 1.0894, "step": 675 }, { "epoch": 1.4498933901918978, "grad_norm": 1.1765219547043915, "learning_rate": 1.7338921704824148e-07, "loss": 1.0715, "step": 680 }, { "epoch": 1.4605543710021323, "grad_norm": 1.2252010284139379, "learning_rate": 1.678502369554362e-07, "loss": 1.1498, "step": 685 }, { "epoch": 1.4712153518123667, "grad_norm": 1.1121586595467274, "learning_rate": 1.6253822921587308e-07, "loss": 1.0357, "step": 690 }, { "epoch": 1.4818763326226012, "grad_norm": 1.2326653113155133, "learning_rate": 1.574447886062802e-07, "loss": 1.0426, "step": 695 }, { "epoch": 1.4925373134328357, "grad_norm": 1.2662375322966155, "learning_rate": 1.5256178772824204e-07, "loss": 1.1513, "step": 700 }, { "epoch": 1.5031982942430704, "grad_norm": 1.1537609553653134, "learning_rate": 1.4788136893944296e-07, "loss": 1.0303, "step": 705 }, { "epoch": 1.5138592750533049, "grad_norm": 1.0977344228272827, "learning_rate": 1.4339593648661315e-07, "loss": 1.0821, "step": 710 }, { "epoch": 1.5245202558635396, "grad_norm": 1.1284164855389234, "learning_rate": 1.3909814883595595e-07, "loss": 1.1313, "step": 715 }, { "epoch": 1.535181236673774, "grad_norm": 1.1938382106881487, "learning_rate": 1.3498091119690798e-07, "loss": 1.1446, "step": 720 }, { "epoch": 1.5458422174840085, "grad_norm": 1.1272930245655628, "learning_rate": 1.3103736823515384e-07, "loss": 1.1136, "step": 725 }, { "epoch": 1.556503198294243, "grad_norm": 1.1773751232659426, "learning_rate": 1.2726089697088728e-07, "loss": 1.1309, "step": 730 }, { "epoch": 1.5671641791044775, "grad_norm": 1.3277025055368337, "learning_rate": 1.2364509985837655e-07, "loss": 1.0239, "step": 735 }, { "epoch": 1.5778251599147122, "grad_norm": 1.230962189006197, "learning_rate": 1.201837980429599e-07, "loss": 1.058, "step": 740 }, { "epoch": 1.5884861407249466, "grad_norm": 1.2101453511321207, "learning_rate": 1.1687102479166401e-07, "loss": 1.1119, "step": 745 }, { "epoch": 1.5991471215351813, "grad_norm": 1.1936647152626743, "learning_rate": 1.1370101909370577e-07, "loss": 1.1556, "step": 750 }, { "epoch": 1.6098081023454158, "grad_norm": 1.1474035015035384, "learning_rate": 1.1066821942719791e-07, "loss": 1.0522, "step": 755 }, { "epoch": 1.6204690831556503, 
"grad_norm": 1.0637295000469429, "learning_rate": 1.0776725768844699e-07, "loss": 1.0549, "step": 760 }, { "epoch": 1.6311300639658848, "grad_norm": 1.10828124909061, "learning_rate": 1.0499295328029333e-07, "loss": 1.043, "step": 765 }, { "epoch": 1.6417910447761193, "grad_norm": 1.1283065722653947, "learning_rate": 1.0234030735600652e-07, "loss": 1.0731, "step": 770 }, { "epoch": 1.652452025586354, "grad_norm": 1.161403848045461, "learning_rate": 9.980449721530949e-08, "loss": 1.161, "step": 775 }, { "epoch": 1.6631130063965884, "grad_norm": 1.158445375876672, "learning_rate": 9.738087084916553e-08, "loss": 1.089, "step": 780 }, { "epoch": 1.6737739872068231, "grad_norm": 1.1752425329283938, "learning_rate": 9.506494163002325e-08, "loss": 1.0479, "step": 785 }, { "epoch": 1.6844349680170576, "grad_norm": 1.0813888422827629, "learning_rate": 9.285238314427303e-08, "loss": 1.0797, "step": 790 }, { "epoch": 1.695095948827292, "grad_norm": 1.2358809521231273, "learning_rate": 9.07390241637265e-08, "loss": 1.0483, "step": 795 }, { "epoch": 1.7057569296375266, "grad_norm": 1.1520098737400197, "learning_rate": 8.872084375298702e-08, "loss": 1.1182, "step": 800 }, { "epoch": 1.7057569296375266, "eval_loss": 1.2184005975723267, "eval_runtime": 86.9323, "eval_samples_per_second": 19.176, "eval_steps_per_second": 0.311, "step": 800 }, { "epoch": 1.716417910447761, "grad_norm": 1.1409424379253437, "learning_rate": 8.679396650963928e-08, "loss": 1.1356, "step": 805 }, { "epoch": 1.7270788912579957, "grad_norm": 1.1235815999344876, "learning_rate": 8.49546579342372e-08, "loss": 1.155, "step": 810 }, { "epoch": 1.7377398720682304, "grad_norm": 1.1073975656892132, "learning_rate": 8.319931992712862e-08, "loss": 1.1137, "step": 815 }, { "epoch": 1.748400852878465, "grad_norm": 1.2180389136635439, "learning_rate": 8.15244864092076e-08, "loss": 1.1422, "step": 820 }, { "epoch": 1.7590618336886994, "grad_norm": 1.1113266470685994, "learning_rate": 7.992681906373824e-08, "loss": 1.062, "step": 825 }, { "epoch": 1.7697228144989339, "grad_norm": 1.2072670273798571, "learning_rate": 7.84031031964486e-08, "loss": 1.0517, "step": 830 }, { "epoch": 1.7803837953091683, "grad_norm": 1.1803556915432551, "learning_rate": 7.69502437111444e-08, "loss": 1.0703, "step": 835 }, { "epoch": 1.7910447761194028, "grad_norm": 1.1844005429106652, "learning_rate": 7.556526119814338e-08, "loss": 1.111, "step": 840 }, { "epoch": 1.8017057569296375, "grad_norm": 1.1384485747440536, "learning_rate": 7.424528813288221e-08, "loss": 1.1914, "step": 845 }, { "epoch": 1.8123667377398722, "grad_norm": 1.2171711222670063, "learning_rate": 7.298756518209771e-08, "loss": 1.118, "step": 850 }, { "epoch": 1.8230277185501067, "grad_norm": 1.1361749350155368, "learning_rate": 7.178943761503399e-08, "loss": 1.2114, "step": 855 }, { "epoch": 1.8336886993603412, "grad_norm": 1.2202066733922106, "learning_rate": 7.064835181717455e-08, "loss": 1.0826, "step": 860 }, { "epoch": 1.8443496801705757, "grad_norm": 1.1962899610991478, "learning_rate": 6.956185190404699e-08, "loss": 1.0077, "step": 865 }, { "epoch": 1.8550106609808101, "grad_norm": 1.2308765981536185, "learning_rate": 6.85275764326948e-08, "loss": 1.0717, "step": 870 }, { "epoch": 1.8656716417910446, "grad_norm": 1.1658077017002657, "learning_rate": 6.75432552084579e-08, "loss": 1.0958, "step": 875 }, { "epoch": 1.8763326226012793, "grad_norm": 1.270006074619097, "learning_rate": 6.660670618474825e-08, "loss": 1.091, "step": 880 }, { "epoch": 1.886993603411514, "grad_norm": 
1.1582088190520023, "learning_rate": 6.571583245355274e-08, "loss": 1.0301, "step": 885 }, { "epoch": 1.8976545842217485, "grad_norm": 1.0859957939920986, "learning_rate": 6.486861932443996e-08, "loss": 1.0629, "step": 890 }, { "epoch": 1.908315565031983, "grad_norm": 1.0879133073671061, "learning_rate": 6.406313148989106e-08, "loss": 1.0293, "step": 895 }, { "epoch": 1.9189765458422174, "grad_norm": 1.1419672670758283, "learning_rate": 6.329751027481845e-08, "loss": 1.1259, "step": 900 }, { "epoch": 1.929637526652452, "grad_norm": 1.147613866551749, "learning_rate": 6.256997096817788e-08, "loss": 1.0128, "step": 905 }, { "epoch": 1.9402985074626866, "grad_norm": 1.1753147588481525, "learning_rate": 6.187880023462194e-08, "loss": 1.0407, "step": 910 }, { "epoch": 1.950959488272921, "grad_norm": 1.128791859643809, "learning_rate": 6.122235360418442e-08, "loss": 1.0722, "step": 915 }, { "epoch": 1.9616204690831558, "grad_norm": 1.1601302162296214, "learning_rate": 6.059905303802484e-08, "loss": 1.1054, "step": 920 }, { "epoch": 1.9722814498933903, "grad_norm": 1.198563476242119, "learning_rate": 6.000738456830302e-08, "loss": 1.0568, "step": 925 }, { "epoch": 1.9829424307036247, "grad_norm": 1.2262616318390043, "learning_rate": 5.944589601029287e-08, "loss": 1.1251, "step": 930 }, { "epoch": 1.9936034115138592, "grad_norm": 1.191995506184399, "learning_rate": 5.89131947448833e-08, "loss": 1.1911, "step": 935 }, { "epoch": 2.0042643923240937, "grad_norm": 1.11244803877364, "learning_rate": 5.840794556965235e-08, "loss": 1.066, "step": 940 }, { "epoch": 2.014925373134328, "grad_norm": 1.0894580884709386, "learning_rate": 5.792886861673804e-08, "loss": 1.0406, "step": 945 }, { "epoch": 2.025586353944563, "grad_norm": 1.0653919782073382, "learning_rate": 5.747473733576709e-08, "loss": 1.0661, "step": 950 }, { "epoch": 2.0362473347547976, "grad_norm": 1.1280141717212646, "learning_rate": 5.704437654013792e-08, "loss": 1.1027, "step": 955 }, { "epoch": 2.046908315565032, "grad_norm": 1.2313007226700692, "learning_rate": 5.663666051499173e-08, "loss": 1.0421, "step": 960 }, { "epoch": 2.0575692963752665, "grad_norm": 1.153128267269016, "learning_rate": 5.6250511185239e-08, "loss": 1.0893, "step": 965 }, { "epoch": 2.068230277185501, "grad_norm": 1.1364871737157245, "learning_rate": 5.58848963420453e-08, "loss": 1.0918, "step": 970 }, { "epoch": 2.0788912579957355, "grad_norm": 1.1584140858402872, "learning_rate": 5.553882792621254e-08, "loss": 1.0183, "step": 975 }, { "epoch": 2.08955223880597, "grad_norm": 1.1608258928350752, "learning_rate": 5.521136036692734e-08, "loss": 1.0322, "step": 980 }, { "epoch": 2.100213219616205, "grad_norm": 1.1873514358148416, "learning_rate": 5.490158897437947e-08, "loss": 1.0441, "step": 985 }, { "epoch": 2.1108742004264394, "grad_norm": 1.202520887860229, "learning_rate": 5.460864838478731e-08, "loss": 1.0996, "step": 990 }, { "epoch": 2.121535181236674, "grad_norm": 1.1559964888973397, "learning_rate": 5.4331711056398124e-08, "loss": 1.095, "step": 995 }, { "epoch": 2.1321961620469083, "grad_norm": 1.0455908941955974, "learning_rate": 5.4069985815063316e-08, "loss": 1.046, "step": 1000 }, { "epoch": 2.1321961620469083, "eval_loss": 1.2190189361572266, "eval_runtime": 86.8993, "eval_samples_per_second": 19.183, "eval_steps_per_second": 0.311, "step": 1000 }, { "epoch": 2.142857142857143, "grad_norm": 1.1258833797291203, "learning_rate": 5.3822716448019046e-08, "loss": 1.049, "step": 1005 }, { "epoch": 2.1535181236673773, "grad_norm": 1.162003500965812, 
"learning_rate": 5.35891803445336e-08, "loss": 1.094, "step": 1010 }, { "epoch": 2.1641791044776117, "grad_norm": 1.1823939265510066, "learning_rate": 5.3368687182112245e-08, "loss": 1.0556, "step": 1015 }, { "epoch": 2.1748400852878467, "grad_norm": 1.1584053741977514, "learning_rate": 5.3160577656980396e-08, "loss": 1.1292, "step": 1020 }, { "epoch": 2.185501066098081, "grad_norm": 1.1507740539032782, "learning_rate": 5.296422225759416e-08, "loss": 1.1023, "step": 1025 }, { "epoch": 2.1961620469083156, "grad_norm": 1.1594756240093598, "learning_rate": 5.277902007995615e-08, "loss": 1.107, "step": 1030 }, { "epoch": 2.20682302771855, "grad_norm": 1.10929463120434, "learning_rate": 5.2604397683542377e-08, "loss": 1.1102, "step": 1035 }, { "epoch": 2.2174840085287846, "grad_norm": 1.1432334524639647, "learning_rate": 5.243980798667319e-08, "loss": 1.1, "step": 1040 }, { "epoch": 2.228144989339019, "grad_norm": 1.1904483860983543, "learning_rate": 5.228472920018869e-08, "loss": 1.09, "step": 1045 }, { "epoch": 2.2388059701492535, "grad_norm": 1.167622664957689, "learning_rate": 5.213866379831496e-08, "loss": 1.0814, "step": 1050 }, { "epoch": 2.2494669509594885, "grad_norm": 1.0895757958922552, "learning_rate": 5.200113752563408e-08, "loss": 1.1161, "step": 1055 }, { "epoch": 2.260127931769723, "grad_norm": 1.1558659882895932, "learning_rate": 5.187169843909606e-08, "loss": 1.0795, "step": 1060 }, { "epoch": 2.2707889125799574, "grad_norm": 1.1344999555588917, "learning_rate": 5.17499159840361e-08, "loss": 1.0706, "step": 1065 }, { "epoch": 2.281449893390192, "grad_norm": 1.1415261926993812, "learning_rate": 5.163538010318527e-08, "loss": 1.0967, "step": 1070 }, { "epoch": 2.2921108742004264, "grad_norm": 1.1381490852319662, "learning_rate": 5.152770037768703e-08, "loss": 1.056, "step": 1075 }, { "epoch": 2.302771855010661, "grad_norm": 1.1347196474571326, "learning_rate": 5.142650519915549e-08, "loss": 1.0654, "step": 1080 }, { "epoch": 2.3134328358208958, "grad_norm": 1.140691001232856, "learning_rate": 5.133144097183511e-08, "loss": 1.0555, "step": 1085 }, { "epoch": 2.3240938166311302, "grad_norm": 1.1321293656672715, "learning_rate": 5.1242171343944076e-08, "loss": 1.0617, "step": 1090 }, { "epoch": 2.3347547974413647, "grad_norm": 1.0730144292666668, "learning_rate": 5.115837646730644e-08, "loss": 1.0129, "step": 1095 }, { "epoch": 2.345415778251599, "grad_norm": 1.1568722548151742, "learning_rate": 5.107975228439987e-08, "loss": 1.1102, "step": 1100 }, { "epoch": 2.3560767590618337, "grad_norm": 1.1165138012512623, "learning_rate": 5.100600984196796e-08, "loss": 1.0316, "step": 1105 }, { "epoch": 2.366737739872068, "grad_norm": 1.1003671418770582, "learning_rate": 5.093687463036684e-08, "loss": 1.0387, "step": 1110 }, { "epoch": 2.3773987206823026, "grad_norm": 1.1858341121422311, "learning_rate": 5.087208594783712e-08, "loss": 1.1381, "step": 1115 }, { "epoch": 2.388059701492537, "grad_norm": 1.1724486836357029, "learning_rate": 5.0811396288912494e-08, "loss": 1.1484, "step": 1120 }, { "epoch": 2.398720682302772, "grad_norm": 1.1477785681404549, "learning_rate": 5.075457075619642e-08, "loss": 1.0488, "step": 1125 }, { "epoch": 2.4093816631130065, "grad_norm": 1.1926168382576205, "learning_rate": 5.070138649475803e-08, "loss": 1.0335, "step": 1130 }, { "epoch": 2.420042643923241, "grad_norm": 1.1592591417618787, "learning_rate": 5.065163214841775e-08, "loss": 1.0867, "step": 1135 }, { "epoch": 2.4307036247334755, "grad_norm": 1.1492395746059412, "learning_rate": 
5.0605107337212035e-08, "loss": 1.0934, "step": 1140 }, { "epoch": 2.44136460554371, "grad_norm": 1.1808862259382695, "learning_rate": 5.0561622155345265e-08, "loss": 1.0804, "step": 1145 }, { "epoch": 2.4520255863539444, "grad_norm": 1.2148315415559092, "learning_rate": 5.052099668895486e-08, "loss": 1.1264, "step": 1150 }, { "epoch": 2.4626865671641793, "grad_norm": 1.0677698199987926, "learning_rate": 5.0483060553033895e-08, "loss": 1.029, "step": 1155 }, { "epoch": 2.473347547974414, "grad_norm": 1.1698932019494643, "learning_rate": 5.0447652446872465e-08, "loss": 1.028, "step": 1160 }, { "epoch": 2.4840085287846483, "grad_norm": 1.1287646587327143, "learning_rate": 5.0414619727396756e-08, "loss": 1.0884, "step": 1165 }, { "epoch": 2.4946695095948828, "grad_norm": 1.16856773725038, "learning_rate": 5.0383817999801064e-08, "loss": 1.0204, "step": 1170 }, { "epoch": 2.5053304904051172, "grad_norm": 1.141453152240354, "learning_rate": 5.035511072488479e-08, "loss": 1.0523, "step": 1175 }, { "epoch": 2.5159914712153517, "grad_norm": 1.230429011852803, "learning_rate": 5.032836884252246e-08, "loss": 1.0617, "step": 1180 }, { "epoch": 2.526652452025586, "grad_norm": 1.151378135213026, "learning_rate": 5.03034704107104e-08, "loss": 1.048, "step": 1185 }, { "epoch": 2.5373134328358207, "grad_norm": 1.1297760039941285, "learning_rate": 5.028030025964957e-08, "loss": 1.0614, "step": 1190 }, { "epoch": 2.5479744136460556, "grad_norm": 1.149687991991871, "learning_rate": 5.025874966033864e-08, "loss": 1.1042, "step": 1195 }, { "epoch": 2.55863539445629, "grad_norm": 1.1935154857243262, "learning_rate": 5.023871600716678e-08, "loss": 1.0772, "step": 1200 }, { "epoch": 2.55863539445629, "eval_loss": 1.2190290689468384, "eval_runtime": 86.4625, "eval_samples_per_second": 19.28, "eval_steps_per_second": 0.312, "step": 1200 }, { "epoch": 2.5692963752665245, "grad_norm": 1.1485570257886584, "learning_rate": 5.022010251400949e-08, "loss": 1.1083, "step": 1205 }, { "epoch": 2.579957356076759, "grad_norm": 1.180867296316843, "learning_rate": 5.0202817923345707e-08, "loss": 1.062, "step": 1210 }, { "epoch": 2.5906183368869935, "grad_norm": 1.1667090859024647, "learning_rate": 5.0186776227927486e-08, "loss": 1.0414, "step": 1215 }, { "epoch": 2.6012793176972284, "grad_norm": 1.1582105008250059, "learning_rate": 5.017189640454793e-08, "loss": 1.0663, "step": 1220 }, { "epoch": 2.611940298507463, "grad_norm": 1.0799260851167845, "learning_rate": 5.01581021594656e-08, "loss": 1.102, "step": 1225 }, { "epoch": 2.6226012793176974, "grad_norm": 1.1531786446908774, "learning_rate": 5.01453216850572e-08, "loss": 1.1336, "step": 1230 }, { "epoch": 2.633262260127932, "grad_norm": 1.1432906025568284, "learning_rate": 5.0133487427282597e-08, "loss": 1.0256, "step": 1235 }, { "epoch": 2.6439232409381663, "grad_norm": 1.1673336163640968, "learning_rate": 5.0122535863558936e-08, "loss": 1.1471, "step": 1240 }, { "epoch": 2.654584221748401, "grad_norm": 1.1845243824938692, "learning_rate": 5.011240729065255e-08, "loss": 1.0642, "step": 1245 }, { "epoch": 2.6652452025586353, "grad_norm": 1.1256336089648642, "learning_rate": 5.010304562220935e-08, "loss": 1.1023, "step": 1250 }, { "epoch": 2.6759061833688698, "grad_norm": 1.1635762261874374, "learning_rate": 5.00943981955558e-08, "loss": 1.0605, "step": 1255 }, { "epoch": 2.6865671641791042, "grad_norm": 1.1747405452856283, "learning_rate": 5.0086415587414125e-08, "loss": 1.0897, "step": 1260 }, { "epoch": 2.697228144989339, "grad_norm": 1.086387379481192, "learning_rate": 
5.00790514381863e-08, "loss": 1.0893, "step": 1265 }, { "epoch": 2.7078891257995736, "grad_norm": 1.1923401716553281, "learning_rate": 5.007226228447204e-08, "loss": 1.0473, "step": 1270 }, { "epoch": 2.718550106609808, "grad_norm": 1.205935131659565, "learning_rate": 5.006600739949697e-08, "loss": 1.0231, "step": 1275 }, { "epoch": 2.7292110874200426, "grad_norm": 1.1391126635720465, "learning_rate": 5.006024864113683e-08, "loss": 1.0403, "step": 1280 }, { "epoch": 2.739872068230277, "grad_norm": 1.1345121679379757, "learning_rate": 5.005495030723431e-08, "loss": 1.0764, "step": 1285 }, { "epoch": 2.750533049040512, "grad_norm": 1.1850716557115912, "learning_rate": 5.005007899791418e-08, "loss": 0.9945, "step": 1290 }, { "epoch": 2.7611940298507465, "grad_norm": 1.1874286830736782, "learning_rate": 5.00456034846126e-08, "loss": 1.085, "step": 1295 }, { "epoch": 2.771855010660981, "grad_norm": 1.1901764193670399, "learning_rate": 5.004149458554536e-08, "loss": 1.0092, "step": 1300 }, { "epoch": 2.7825159914712154, "grad_norm": 1.0961826150665088, "learning_rate": 5.0037725047349065e-08, "loss": 1.1071, "step": 1305 }, { "epoch": 2.79317697228145, "grad_norm": 1.1628718738526371, "learning_rate": 5.0034269432638164e-08, "loss": 1.1233, "step": 1310 }, { "epoch": 2.8038379530916844, "grad_norm": 1.1626738934739451, "learning_rate": 5.0031104013229344e-08, "loss": 1.1894, "step": 1315 }, { "epoch": 2.814498933901919, "grad_norm": 1.1011603049613505, "learning_rate": 5.002820666879318e-08, "loss": 1.1494, "step": 1320 }, { "epoch": 2.8251599147121533, "grad_norm": 1.1717668628457665, "learning_rate": 5.002555679070117e-08, "loss": 1.0514, "step": 1325 }, { "epoch": 2.835820895522388, "grad_norm": 1.1774225127445404, "learning_rate": 5.0023135190844316e-08, "loss": 1.1027, "step": 1330 }, { "epoch": 2.8464818763326227, "grad_norm": 1.1717401192939292, "learning_rate": 5.0020924015207164e-08, "loss": 1.063, "step": 1335 }, { "epoch": 2.857142857142857, "grad_norm": 1.1587594052870365, "learning_rate": 5.001890666198876e-08, "loss": 1.1021, "step": 1340 }, { "epoch": 2.8678038379530917, "grad_norm": 1.1525118578792972, "learning_rate": 5.001706770406945e-08, "loss": 1.1019, "step": 1345 }, { "epoch": 2.878464818763326, "grad_norm": 1.37326829503098, "learning_rate": 5.001539281562949e-08, "loss": 1.0355, "step": 1350 }, { "epoch": 2.8891257995735606, "grad_norm": 1.1331048627302174, "learning_rate": 5.001386870273264e-08, "loss": 1.0899, "step": 1355 }, { "epoch": 2.8997867803837956, "grad_norm": 1.1984672662153428, "learning_rate": 5.0012483037694405e-08, "loss": 1.1161, "step": 1360 }, { "epoch": 2.91044776119403, "grad_norm": 1.1109425480042538, "learning_rate": 5.0011224397061566e-08, "loss": 1.0231, "step": 1365 }, { "epoch": 2.9211087420042645, "grad_norm": 1.1795722740533605, "learning_rate": 5.0010082203035725e-08, "loss": 1.1211, "step": 1370 }, { "epoch": 2.931769722814499, "grad_norm": 1.0937372807822898, "learning_rate": 5.0009046668180024e-08, "loss": 1.0735, "step": 1375 }, { "epoch": 2.9424307036247335, "grad_norm": 1.1405185259805586, "learning_rate": 5.000810874325427e-08, "loss": 1.0276, "step": 1380 }, { "epoch": 2.953091684434968, "grad_norm": 1.20416756854441, "learning_rate": 5.0007260068029507e-08, "loss": 1.0596, "step": 1385 }, { "epoch": 2.9637526652452024, "grad_norm": 1.1625071717485667, "learning_rate": 5.000649292493887e-08, "loss": 1.0697, "step": 1390 }, { "epoch": 2.974413646055437, "grad_norm": 1.1863714583502374, "learning_rate": 5.000580019542706e-08, "loss": 
1.1505, "step": 1395 }, { "epoch": 2.9850746268656714, "grad_norm": 1.1942905354245756, "learning_rate": 5.000517531886622e-08, "loss": 1.0326, "step": 1400 }, { "epoch": 2.9850746268656714, "eval_loss": 1.2187978029251099, "eval_runtime": 86.8658, "eval_samples_per_second": 19.191, "eval_steps_per_second": 0.311, "step": 1400 }, { "epoch": 2.9957356076759063, "grad_norm": 1.1519642562669563, "learning_rate": 5.0004612253911274e-08, "loss": 1.0479, "step": 1405 }, { "epoch": 3.0063965884861408, "grad_norm": 1.1665923395094933, "learning_rate": 5.000410544217268e-08, "loss": 1.0174, "step": 1410 }, { "epoch": 3.0170575692963753, "grad_norm": 1.1796085918771502, "learning_rate": 5.0003649774089725e-08, "loss": 1.0737, "step": 1415 }, { "epoch": 3.0277185501066097, "grad_norm": 1.1757596964828239, "learning_rate": 5.0003240556892086e-08, "loss": 1.0738, "step": 1420 }, { "epoch": 3.038379530916844, "grad_norm": 1.1395407440291647, "learning_rate": 5.000287348454201e-08, "loss": 1.094, "step": 1425 }, { "epoch": 3.0490405117270787, "grad_norm": 1.1323357375626646, "learning_rate": 5.000254460955416e-08, "loss": 1.0684, "step": 1430 }, { "epoch": 3.0597014925373136, "grad_norm": 1.1177035566781262, "learning_rate": 5.0002250316594135e-08, "loss": 1.038, "step": 1435 }, { "epoch": 3.070362473347548, "grad_norm": 1.1118810589333648, "learning_rate": 5.0001987297761256e-08, "loss": 1.0114, "step": 1440 }, { "epoch": 3.0810234541577826, "grad_norm": 1.153651371893197, "learning_rate": 5.00017525294651e-08, "loss": 1.1564, "step": 1445 }, { "epoch": 3.091684434968017, "grad_norm": 1.117017616812222, "learning_rate": 5.000154325080907e-08, "loss": 1.0421, "step": 1450 }, { "epoch": 3.1023454157782515, "grad_norm": 1.1455567386664758, "learning_rate": 5.0001356943398416e-08, "loss": 1.0494, "step": 1455 }, { "epoch": 3.113006396588486, "grad_norm": 1.1745918907121704, "learning_rate": 5.000119131249349e-08, "loss": 1.0673, "step": 1460 }, { "epoch": 3.1236673773987205, "grad_norm": 1.0945498527723858, "learning_rate": 5.000104426943277e-08, "loss": 1.15, "step": 1465 }, { "epoch": 3.1343283582089554, "grad_norm": 1.2775793960973583, "learning_rate": 5.0000913915253465e-08, "loss": 1.0434, "step": 1470 }, { "epoch": 3.14498933901919, "grad_norm": 1.1275573880105167, "learning_rate": 5.000079852544101e-08, "loss": 1.0726, "step": 1475 }, { "epoch": 3.1556503198294243, "grad_norm": 1.0900224885696637, "learning_rate": 5.0000696535741776e-08, "loss": 1.0439, "step": 1480 }, { "epoch": 3.166311300639659, "grad_norm": 1.1943472978175196, "learning_rate": 5.0000606528976535e-08, "loss": 1.0778, "step": 1485 }, { "epoch": 3.1769722814498933, "grad_norm": 1.1656854953422446, "learning_rate": 5.000052722279511e-08, "loss": 1.0236, "step": 1490 }, { "epoch": 3.1876332622601278, "grad_norm": 1.7099219299352593, "learning_rate": 5.0000457458315606e-08, "loss": 1.048, "step": 1495 }, { "epoch": 3.1982942430703627, "grad_norm": 1.2368961315589513, "learning_rate": 5.0000396189594195e-08, "loss": 1.0601, "step": 1500 }, { "epoch": 3.208955223880597, "grad_norm": 1.1053541732810632, "learning_rate": 5.000034247387439e-08, "loss": 1.1574, "step": 1505 }, { "epoch": 3.2196162046908317, "grad_norm": 1.2088122913217223, "learning_rate": 5.0000295462567e-08, "loss": 1.021, "step": 1510 }, { "epoch": 3.230277185501066, "grad_norm": 1.1822420881980706, "learning_rate": 5.000025439291453e-08, "loss": 1.0577, "step": 1515 }, { "epoch": 3.2409381663113006, "grad_norm": 1.1982389728811844, "learning_rate": 
5.0000218580296324e-08, "loss": 1.1241, "step": 1520 }, { "epoch": 3.251599147121535, "grad_norm": 1.1836468791922983, "learning_rate": 5.0000187411132714e-08, "loss": 1.106, "step": 1525 }, { "epoch": 3.2622601279317696, "grad_norm": 1.1758919894146582, "learning_rate": 5.0000160336348885e-08, "loss": 1.2091, "step": 1530 }, { "epoch": 3.272921108742004, "grad_norm": 1.1300862552793904, "learning_rate": 5.0000136865361153e-08, "loss": 1.0345, "step": 1535 }, { "epoch": 3.283582089552239, "grad_norm": 1.1470416574016102, "learning_rate": 5.000011656055037e-08, "loss": 1.0357, "step": 1540 }, { "epoch": 3.2942430703624734, "grad_norm": 1.18001188067462, "learning_rate": 5.000009903218913e-08, "loss": 1.0896, "step": 1545 }, { "epoch": 3.304904051172708, "grad_norm": 1.1316402765789904, "learning_rate": 5.0000083933791284e-08, "loss": 1.0557, "step": 1550 }, { "epoch": 3.3155650319829424, "grad_norm": 1.2685713000625365, "learning_rate": 5.0000070957854027e-08, "loss": 1.0442, "step": 1555 }, { "epoch": 3.326226012793177, "grad_norm": 1.3173532606855904, "learning_rate": 5.000005983196463e-08, "loss": 1.1031, "step": 1560 }, { "epoch": 3.3368869936034113, "grad_norm": 1.1645251930023506, "learning_rate": 5.0000050315245216e-08, "loss": 1.0327, "step": 1565 }, { "epoch": 3.3475479744136463, "grad_norm": 1.1526305321387222, "learning_rate": 5.000004219511104e-08, "loss": 1.1047, "step": 1570 }, { "epoch": 3.3582089552238807, "grad_norm": 1.1956529776682534, "learning_rate": 5.0000035284318585e-08, "loss": 1.0513, "step": 1575 }, { "epoch": 3.368869936034115, "grad_norm": 1.1723882568539148, "learning_rate": 5.00000294182818e-08, "loss": 1.19, "step": 1580 }, { "epoch": 3.3795309168443497, "grad_norm": 1.1986098281392215, "learning_rate": 5.00000244526357e-08, "loss": 1.077, "step": 1585 }, { "epoch": 3.390191897654584, "grad_norm": 1.1822052046386484, "learning_rate": 5.0000020261028096e-08, "loss": 0.9985, "step": 1590 }, { "epoch": 3.4008528784648187, "grad_norm": 1.1965544250131317, "learning_rate": 5.0000016733121405e-08, "loss": 1.1462, "step": 1595 }, { "epoch": 3.411513859275053, "grad_norm": 1.233729124712432, "learning_rate": 5.00000137727875e-08, "loss": 1.013, "step": 1600 }, { "epoch": 3.411513859275053, "eval_loss": 1.2190721035003662, "eval_runtime": 86.8673, "eval_samples_per_second": 19.19, "eval_steps_per_second": 0.311, "step": 1600 }, { "epoch": 3.4221748400852876, "grad_norm": 1.1940390976157107, "learning_rate": 5.00000112964799e-08, "loss": 1.0902, "step": 1605 }, { "epoch": 3.4328358208955225, "grad_norm": 1.1852233751297125, "learning_rate": 5.000000923176855e-08, "loss": 1.0622, "step": 1610 }, { "epoch": 3.443496801705757, "grad_norm": 1.174182970082022, "learning_rate": 5.0000007516023255e-08, "loss": 1.1077, "step": 1615 }, { "epoch": 3.4541577825159915, "grad_norm": 1.2175665950720118, "learning_rate": 5.0000006095233244e-08, "loss": 1.0463, "step": 1620 }, { "epoch": 3.464818763326226, "grad_norm": 1.1910402507833668, "learning_rate": 5.0000004922950696e-08, "loss": 1.0485, "step": 1625 }, { "epoch": 3.4754797441364604, "grad_norm": 1.1756720560395666, "learning_rate": 5.000000395934729e-08, "loss": 1.0855, "step": 1630 }, { "epoch": 3.486140724946695, "grad_norm": 1.1357035763961498, "learning_rate": 5.0000003170373506e-08, "loss": 1.0552, "step": 1635 }, { "epoch": 3.49680170575693, "grad_norm": 1.12995191245614, "learning_rate": 5.0000002527011166e-08, "loss": 1.1139, "step": 1640 }, { "epoch": 3.5074626865671643, "grad_norm": 1.2169670051086863, 
"learning_rate": 5.000000200461043e-08, "loss": 1.0746, "step": 1645 }, { "epoch": 3.518123667377399, "grad_norm": 1.1477244123007122, "learning_rate": 5.000000158230317e-08, "loss": 1.0328, "step": 1650 }, { "epoch": 3.5287846481876333, "grad_norm": 1.146160022657433, "learning_rate": 5.000000124248513e-08, "loss": 1.0818, "step": 1655 }, { "epoch": 3.5394456289978677, "grad_norm": 1.1725432339751227, "learning_rate": 5.0000000970360234e-08, "loss": 1.0631, "step": 1660 }, { "epoch": 3.550106609808102, "grad_norm": 1.1437856255430832, "learning_rate": 5.000000075354043e-08, "loss": 1.03, "step": 1665 }, { "epoch": 3.5607675906183367, "grad_norm": 1.1949291330530631, "learning_rate": 5.000000058169555e-08, "loss": 1.0766, "step": 1670 }, { "epoch": 3.571428571428571, "grad_norm": 1.1521311383486394, "learning_rate": 5.000000044624768e-08, "loss": 1.1008, "step": 1675 }, { "epoch": 3.582089552238806, "grad_norm": 1.1470924970059302, "learning_rate": 5.000000034010534e-08, "loss": 1.0803, "step": 1680 }, { "epoch": 3.5927505330490406, "grad_norm": 1.163957344487391, "learning_rate": 5.0000000257433004e-08, "loss": 1.0318, "step": 1685 }, { "epoch": 3.603411513859275, "grad_norm": 1.1289250902982475, "learning_rate": 5.000000019345195e-08, "loss": 1.0714, "step": 1690 }, { "epoch": 3.6140724946695095, "grad_norm": 1.1396900843621998, "learning_rate": 5.0000000144268826e-08, "loss": 1.0785, "step": 1695 }, { "epoch": 3.624733475479744, "grad_norm": 1.1928003735738484, "learning_rate": 5.000000010672866e-08, "loss": 1.0476, "step": 1700 }, { "epoch": 3.635394456289979, "grad_norm": 1.1813297878533093, "learning_rate": 5.000000007828923e-08, "loss": 1.1059, "step": 1705 }, { "epoch": 3.6460554371002134, "grad_norm": 1.1580070228375894, "learning_rate": 5.000000005691433e-08, "loss": 1.0804, "step": 1710 }, { "epoch": 3.656716417910448, "grad_norm": 1.1877112723964363, "learning_rate": 5.0000000040983296e-08, "loss": 1.0926, "step": 1715 }, { "epoch": 3.6673773987206824, "grad_norm": 1.162988885771696, "learning_rate": 5.0000000029214826e-08, "loss": 1.134, "step": 1720 }, { "epoch": 3.678038379530917, "grad_norm": 1.2167801349477705, "learning_rate": 5.000000002060308e-08, "loss": 1.1055, "step": 1725 }, { "epoch": 3.6886993603411513, "grad_norm": 1.1458862089420816, "learning_rate": 5.000000001436439e-08, "loss": 1.0211, "step": 1730 }, { "epoch": 3.699360341151386, "grad_norm": 1.0873296281789822, "learning_rate": 5.000000000989308e-08, "loss": 1.0072, "step": 1735 }, { "epoch": 3.7100213219616203, "grad_norm": 1.1575880345419727, "learning_rate": 5.000000000672499e-08, "loss": 1.1223, "step": 1740 }, { "epoch": 3.7206823027718547, "grad_norm": 1.1887408531399297, "learning_rate": 5.0000000004507695e-08, "loss": 1.0693, "step": 1745 }, { "epoch": 3.7313432835820897, "grad_norm": 1.1370169892656834, "learning_rate": 5.000000000297619e-08, "loss": 0.9994, "step": 1750 }, { "epoch": 3.742004264392324, "grad_norm": 1.1964357258341392, "learning_rate": 5.000000000193327e-08, "loss": 1.1289, "step": 1755 }, { "epoch": 3.7526652452025586, "grad_norm": 1.1471490215463733, "learning_rate": 5.000000000123389e-08, "loss": 1.0828, "step": 1760 }, { "epoch": 3.763326226012793, "grad_norm": 1.1835965960152477, "learning_rate": 5.0000000000772604e-08, "loss": 1.1045, "step": 1765 }, { "epoch": 3.7739872068230276, "grad_norm": 1.122980946834769, "learning_rate": 5.000000000047381e-08, "loss": 1.0586, "step": 1770 }, { "epoch": 3.7846481876332625, "grad_norm": 1.1509727983825737, "learning_rate": 
5.000000000028403e-08, "loss": 1.1015, "step": 1775 }, { "epoch": 3.795309168443497, "grad_norm": 1.1806503665236887, "learning_rate": 5.000000000016606e-08, "loss": 1.1195, "step": 1780 }, { "epoch": 3.8059701492537314, "grad_norm": 1.1840945960267957, "learning_rate": 5.0000000000094455e-08, "loss": 1.0452, "step": 1785 }, { "epoch": 3.816631130063966, "grad_norm": 1.1373535444813456, "learning_rate": 5.000000000005211e-08, "loss": 1.0538, "step": 1790 }, { "epoch": 3.8272921108742004, "grad_norm": 1.1164448190518461, "learning_rate": 5.000000000002779e-08, "loss": 1.0796, "step": 1795 }, { "epoch": 3.837953091684435, "grad_norm": 1.2618135197926215, "learning_rate": 5.0000000000014265e-08, "loss": 1.1103, "step": 1800 }, { "epoch": 3.837953091684435, "eval_loss": 1.2187809944152832, "eval_runtime": 87.3426, "eval_samples_per_second": 19.086, "eval_steps_per_second": 0.309, "step": 1800 }, { "epoch": 3.8486140724946694, "grad_norm": 1.166670525467647, "learning_rate": 5.000000000000701e-08, "loss": 1.0504, "step": 1805 }, { "epoch": 3.859275053304904, "grad_norm": 1.1358229871637773, "learning_rate": 5.000000000000328e-08, "loss": 1.031, "step": 1810 }, { "epoch": 3.8699360341151388, "grad_norm": 1.1553072990278574, "learning_rate": 5.0000000000001454e-08, "loss": 1.0852, "step": 1815 }, { "epoch": 3.8805970149253732, "grad_norm": 1.1514532203092147, "learning_rate": 5.00000000000006e-08, "loss": 1.0682, "step": 1820 }, { "epoch": 3.8912579957356077, "grad_norm": 1.1884972946614185, "learning_rate": 5.000000000000023e-08, "loss": 1.0012, "step": 1825 }, { "epoch": 3.901918976545842, "grad_norm": 1.1704188735844283, "learning_rate": 5.000000000000008e-08, "loss": 1.0772, "step": 1830 }, { "epoch": 3.9125799573560767, "grad_norm": 1.128126951495901, "learning_rate": 5.0000000000000024e-08, "loss": 1.0497, "step": 1835 }, { "epoch": 3.923240938166311, "grad_norm": 1.14600117506389, "learning_rate": 5.0000000000000004e-08, "loss": 1.0813, "step": 1840 }, { "epoch": 3.933901918976546, "grad_norm": 1.1304423202109097, "learning_rate": 5e-08, "loss": 1.1111, "step": 1845 }, { "epoch": 3.9445628997867805, "grad_norm": 1.127824200062695, "learning_rate": 5e-08, "loss": 1.0618, "step": 1850 }, { "epoch": 3.955223880597015, "grad_norm": 1.1479631723327013, "learning_rate": 5e-08, "loss": 1.0155, "step": 1855 }, { "epoch": 3.9658848614072495, "grad_norm": 1.2107343511697226, "learning_rate": 5e-08, "loss": 1.1179, "step": 1860 }, { "epoch": 3.976545842217484, "grad_norm": 1.1701981970918207, "learning_rate": 5e-08, "loss": 1.0525, "step": 1865 }, { "epoch": 3.9872068230277184, "grad_norm": 1.2066704843958889, "learning_rate": 5e-08, "loss": 1.0012, "step": 1870 }, { "epoch": 3.997867803837953, "grad_norm": 1.1997617373616742, "learning_rate": 5e-08, "loss": 1.0131, "step": 1875 }, { "epoch": 4.0, "step": 1876, "total_flos": 391958715432960.0, "train_loss": 1.1227505109203395, "train_runtime": 12501.281, "train_samples_per_second": 4.798, "train_steps_per_second": 0.15 } ], "logging_steps": 5, "max_steps": 1876, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "total_flos": 391958715432960.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }
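
The record above is the complete trainer state; nothing in it explains how to consume the data, so the following is a minimal sketch (not part of the original trainer output) of one way to read it back. It assumes the JSON is saved as "trainer_state.json" in the working directory and that matplotlib is available; the file name, output name, and plot layout are illustrative only.

# Sketch: load trainer_state.json and plot the train/eval loss curves.
# Assumptions: file path "trainer_state.json" and output "loss_curves.png"
# are placeholders, not taken from the original run.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training records (key "loss") with periodic
# evaluation records (key "eval_loss"); split them by which key is present.
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train_points), label="train loss")
plt.plot(*zip(*eval_points), marker="o", label="eval loss")
plt.axhline(state["best_metric"], linestyle="--",
            label=f'best eval loss = {state["best_metric"]:.4f}')
plt.xlabel("global step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")

With this run's data, the eval curve drops from 1.2612 at step 200 to its minimum of 1.2184 at step 800 (the saved best checkpoint) and stays essentially flat through step 1800.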