{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2886961298148445, "eval_steps": 500000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011443480649074222, "grad_norm": 603.6091918945312, "learning_rate": 1.1666666666666667e-07, "loss": 10.731, "step": 500 }, { "epoch": 0.022886961298148444, "grad_norm": 718.9038696289062, "learning_rate": 2.3333333333333333e-07, "loss": 9.2432, "step": 1000 }, { "epoch": 0.034330441947222665, "grad_norm": 38.804962158203125, "learning_rate": 3.5000000000000004e-07, "loss": 9.1395, "step": 1500 }, { "epoch": 0.04577392259629689, "grad_norm": 619.4507446289062, "learning_rate": 4.6666666666666666e-07, "loss": 11.2638, "step": 2000 }, { "epoch": 0.05721740324537111, "grad_norm": 0.11708039790391922, "learning_rate": 5.833333333333333e-07, "loss": 9.1618, "step": 2500 }, { "epoch": 0.06866088389444533, "grad_norm": 206.2231903076172, "learning_rate": 7.000000000000001e-07, "loss": 12.4357, "step": 3000 }, { "epoch": 0.08010436454351956, "grad_norm": 26.35240936279297, "learning_rate": 8.166666666666666e-07, "loss": 9.4019, "step": 3500 }, { "epoch": 0.09154784519259378, "grad_norm": 390.7612609863281, "learning_rate": 9.333333333333333e-07, "loss": 10.9883, "step": 4000 }, { "epoch": 0.10299132584166801, "grad_norm": 560.8733520507812, "learning_rate": 1.05e-06, "loss": 11.4073, "step": 4500 }, { "epoch": 0.11443480649074222, "grad_norm": 733.6063232421875, "learning_rate": 1.1666666666666666e-06, "loss": 9.4747, "step": 5000 }, { "epoch": 0.12587828713981644, "grad_norm": 176.73806762695312, "learning_rate": 1.2833333333333333e-06, "loss": 11.2038, "step": 5500 }, { "epoch": 0.13732176778889066, "grad_norm": 40.61717224121094, "learning_rate": 1.4000000000000001e-06, "loss": 11.3759, "step": 6000 }, { "epoch": 0.1487652484379649, "grad_norm": 2191.57470703125, "learning_rate": 1.5166666666666666e-06, "loss": 10.7893, "step": 6500 }, { "epoch": 0.16020872908703912, "grad_norm": 10.336786270141602, "learning_rate": 1.6333333333333333e-06, "loss": 11.6208, "step": 7000 }, { "epoch": 0.17165220973611334, "grad_norm": 0.11516555398702621, "learning_rate": 1.75e-06, "loss": 10.8105, "step": 7500 }, { "epoch": 0.18309569038518755, "grad_norm": 0.2883591651916504, "learning_rate": 1.8666666666666667e-06, "loss": 11.147, "step": 8000 }, { "epoch": 0.19453917103426177, "grad_norm": 862.2401123046875, "learning_rate": 1.983333333333333e-06, "loss": 10.8087, "step": 8500 }, { "epoch": 0.20598265168333602, "grad_norm": 0.004448544699698687, "learning_rate": 2.1e-06, "loss": 11.3934, "step": 9000 }, { "epoch": 0.21742613233241023, "grad_norm": 0.008581358008086681, "learning_rate": 2.2166666666666665e-06, "loss": 13.5201, "step": 9500 }, { "epoch": 0.22886961298148445, "grad_norm": 0.170095756649971, "learning_rate": 2.333333333333333e-06, "loss": 10.7812, "step": 10000 }, { "epoch": 0.24031309363055867, "grad_norm": 0.8723887801170349, "learning_rate": 2.45e-06, "loss": 10.8569, "step": 10500 }, { "epoch": 0.2517565742796329, "grad_norm": 1424.5357666015625, "learning_rate": 2.5666666666666665e-06, "loss": 13.1824, "step": 11000 }, { "epoch": 0.26320005492870713, "grad_norm": 2.006991386413574, "learning_rate": 2.6833333333333336e-06, "loss": 12.9087, "step": 11500 }, { "epoch": 0.2746435355777813, "grad_norm": 492.7056884765625, "learning_rate": 2.8000000000000003e-06, "loss": 11.9267, "step": 12000 }, { "epoch": 0.28608701622685556, "grad_norm": 
6.009059906005859, "learning_rate": 2.9166666666666666e-06, "loss": 12.8387, "step": 12500 }, { "epoch": 0.2975304968759298, "grad_norm": 1738.22900390625, "learning_rate": 3.0333333333333332e-06, "loss": 13.1134, "step": 13000 }, { "epoch": 0.308973977525004, "grad_norm": 1644.960205078125, "learning_rate": 3.15e-06, "loss": 10.5768, "step": 13500 }, { "epoch": 0.32041745817407824, "grad_norm": 2049.489501953125, "learning_rate": 3.2666666666666666e-06, "loss": 12.9949, "step": 14000 }, { "epoch": 0.33186093882315243, "grad_norm": 2.0113472938537598, "learning_rate": 3.3833333333333333e-06, "loss": 12.5091, "step": 14500 }, { "epoch": 0.3433044194722267, "grad_norm": 767.9404296875, "learning_rate": 3.5e-06, "loss": 12.5176, "step": 15000 }, { "epoch": 0.3547479001213009, "grad_norm": 342.45513916015625, "learning_rate": 3.616666666666667e-06, "loss": 12.6625, "step": 15500 }, { "epoch": 0.3661913807703751, "grad_norm": 771.4798583984375, "learning_rate": 3.7333333333333333e-06, "loss": 14.2435, "step": 16000 }, { "epoch": 0.37763486141944935, "grad_norm": 4853.9267578125, "learning_rate": 3.85e-06, "loss": 15.1053, "step": 16500 }, { "epoch": 0.38907834206852354, "grad_norm": 0.0017908277222886682, "learning_rate": 3.966666666666666e-06, "loss": 12.6706, "step": 17000 }, { "epoch": 0.4005218227175978, "grad_norm": 0.2733666002750397, "learning_rate": 4.083333333333334e-06, "loss": 12.4815, "step": 17500 }, { "epoch": 0.41196530336667203, "grad_norm": 0.0, "learning_rate": 4.2e-06, "loss": 12.5318, "step": 18000 }, { "epoch": 0.4234087840157462, "grad_norm": 1851.682861328125, "learning_rate": 4.316666666666667e-06, "loss": 13.7826, "step": 18500 }, { "epoch": 0.43485226466482046, "grad_norm": 157.1131591796875, "learning_rate": 4.433333333333333e-06, "loss": 12.2737, "step": 19000 }, { "epoch": 0.44629574531389465, "grad_norm": 1271.1109619140625, "learning_rate": 4.5500000000000005e-06, "loss": 10.5671, "step": 19500 }, { "epoch": 0.4577392259629689, "grad_norm": 480.1098937988281, "learning_rate": 4.666666666666666e-06, "loss": 13.5673, "step": 20000 }, { "epoch": 0.46918270661204314, "grad_norm": 1566.4591064453125, "learning_rate": 4.783333333333333e-06, "loss": 14.4705, "step": 20500 }, { "epoch": 0.48062618726111733, "grad_norm": 0.0007110056467354298, "learning_rate": 4.9e-06, "loss": 11.6491, "step": 21000 }, { "epoch": 0.4920696679101916, "grad_norm": 4.593759059906006, "learning_rate": 5.016666666666666e-06, "loss": 12.8944, "step": 21500 }, { "epoch": 0.5035131485592658, "grad_norm": 0.0, "learning_rate": 5.133333333333333e-06, "loss": 13.6318, "step": 22000 }, { "epoch": 0.51495662920834, "grad_norm": 132.595458984375, "learning_rate": 5.25e-06, "loss": 12.8382, "step": 22500 }, { "epoch": 0.5264001098574143, "grad_norm": 347.5989074707031, "learning_rate": 5.366666666666667e-06, "loss": 11.8936, "step": 23000 }, { "epoch": 0.5378435905064884, "grad_norm": 0.009264913387596607, "learning_rate": 5.483333333333333e-06, "loss": 14.0151, "step": 23500 }, { "epoch": 0.5492870711555626, "grad_norm": 245.79705810546875, "learning_rate": 5.600000000000001e-06, "loss": 11.9611, "step": 24000 }, { "epoch": 0.5607305518046369, "grad_norm": 1345.8895263671875, "learning_rate": 5.7166666666666664e-06, "loss": 13.2515, "step": 24500 }, { "epoch": 0.5721740324537111, "grad_norm": 0.26148831844329834, "learning_rate": 5.833333333333333e-06, "loss": 13.5492, "step": 25000 }, { "epoch": 0.5836175131027853, "grad_norm": 4638.9169921875, "learning_rate": 5.95e-06, "loss": 14.1596, "step": 25500 
}, { "epoch": 0.5950609937518596, "grad_norm": 184.8077392578125, "learning_rate": 6.0666666666666665e-06, "loss": 11.8104, "step": 26000 }, { "epoch": 0.6065044744009338, "grad_norm": 140.49444580078125, "learning_rate": 6.183333333333333e-06, "loss": 11.2624, "step": 26500 }, { "epoch": 0.617947955050008, "grad_norm": 0.0011962183052673936, "learning_rate": 6.3e-06, "loss": 12.7681, "step": 27000 }, { "epoch": 0.6293914356990822, "grad_norm": 0.057224027812480927, "learning_rate": 6.4166666666666665e-06, "loss": 15.7726, "step": 27500 }, { "epoch": 0.6408349163481565, "grad_norm": 0.04104612395167351, "learning_rate": 6.533333333333333e-06, "loss": 15.2471, "step": 28000 }, { "epoch": 0.6522783969972307, "grad_norm": 0.003121469169855118, "learning_rate": 6.65e-06, "loss": 16.5911, "step": 28500 }, { "epoch": 0.6637218776463049, "grad_norm": 0.0, "learning_rate": 6.7666666666666665e-06, "loss": 11.7935, "step": 29000 }, { "epoch": 0.6751653582953792, "grad_norm": 1068.4923095703125, "learning_rate": 6.883333333333333e-06, "loss": 12.6445, "step": 29500 }, { "epoch": 0.6866088389444533, "grad_norm": 49.82635498046875, "learning_rate": 7e-06, "loss": 13.4921, "step": 30000 }, { "epoch": 0.6980523195935275, "grad_norm": 0.021331172436475754, "learning_rate": 6.987037037037037e-06, "loss": 13.8913, "step": 30500 }, { "epoch": 0.7094958002426018, "grad_norm": 390.8102111816406, "learning_rate": 6.974074074074074e-06, "loss": 14.1043, "step": 31000 }, { "epoch": 0.720939280891676, "grad_norm": 464.5948791503906, "learning_rate": 6.9611111111111116e-06, "loss": 14.1976, "step": 31500 }, { "epoch": 0.7323827615407502, "grad_norm": 383.39373779296875, "learning_rate": 6.948148148148148e-06, "loss": 13.338, "step": 32000 }, { "epoch": 0.7438262421898244, "grad_norm": 0.005027694161981344, "learning_rate": 6.935185185185185e-06, "loss": 13.6854, "step": 32500 }, { "epoch": 0.7552697228388987, "grad_norm": 1620.7523193359375, "learning_rate": 6.922222222222222e-06, "loss": 11.7295, "step": 33000 }, { "epoch": 0.7667132034879729, "grad_norm": 959.1514282226562, "learning_rate": 6.90925925925926e-06, "loss": 14.5347, "step": 33500 }, { "epoch": 0.7781566841370471, "grad_norm": 778.1695556640625, "learning_rate": 6.896296296296296e-06, "loss": 12.9294, "step": 34000 }, { "epoch": 0.7896001647861214, "grad_norm": 18.240928649902344, "learning_rate": 6.883333333333333e-06, "loss": 12.5578, "step": 34500 }, { "epoch": 0.8010436454351956, "grad_norm": 0.060547858476638794, "learning_rate": 6.8703703703703704e-06, "loss": 11.7427, "step": 35000 }, { "epoch": 0.8124871260842698, "grad_norm": 0.03452019393444061, "learning_rate": 6.857407407407408e-06, "loss": 13.3464, "step": 35500 }, { "epoch": 0.8239306067333441, "grad_norm": 0.0, "learning_rate": 6.844444444444444e-06, "loss": 12.3813, "step": 36000 }, { "epoch": 0.8353740873824183, "grad_norm": 0.01644955575466156, "learning_rate": 6.831481481481482e-06, "loss": 15.1999, "step": 36500 }, { "epoch": 0.8468175680314924, "grad_norm": 0.041852571070194244, "learning_rate": 6.8185185185185185e-06, "loss": 14.611, "step": 37000 }, { "epoch": 0.8582610486805666, "grad_norm": 0.03489886596798897, "learning_rate": 6.805555555555556e-06, "loss": 12.7151, "step": 37500 }, { "epoch": 0.8697045293296409, "grad_norm": 1784.8714599609375, "learning_rate": 6.792592592592592e-06, "loss": 13.324, "step": 38000 }, { "epoch": 0.8811480099787151, "grad_norm": 325.86151123046875, "learning_rate": 6.77962962962963e-06, "loss": 13.8547, "step": 38500 }, { "epoch": 
0.8925914906277893, "grad_norm": 209.41114807128906, "learning_rate": 6.7666666666666665e-06, "loss": 13.9472, "step": 39000 }, { "epoch": 0.9040349712768636, "grad_norm": 0.0, "learning_rate": 6.753703703703704e-06, "loss": 14.3618, "step": 39500 }, { "epoch": 0.9154784519259378, "grad_norm": 1665.116943359375, "learning_rate": 6.74074074074074e-06, "loss": 15.8669, "step": 40000 }, { "epoch": 0.926921932575012, "grad_norm": 0.0066319056786596775, "learning_rate": 6.727777777777778e-06, "loss": 14.8973, "step": 40500 }, { "epoch": 0.9383654132240863, "grad_norm": 738.2325439453125, "learning_rate": 6.714814814814815e-06, "loss": 13.7844, "step": 41000 }, { "epoch": 0.9498088938731605, "grad_norm": 276.5779113769531, "learning_rate": 6.701851851851852e-06, "loss": 12.9653, "step": 41500 }, { "epoch": 0.9612523745222347, "grad_norm": 1572.8726806640625, "learning_rate": 6.688888888888889e-06, "loss": 12.8457, "step": 42000 }, { "epoch": 0.9726958551713089, "grad_norm": 1.0475573539733887, "learning_rate": 6.675925925925926e-06, "loss": 12.2727, "step": 42500 }, { "epoch": 0.9841393358203832, "grad_norm": 12.392683029174805, "learning_rate": 6.662962962962963e-06, "loss": 14.6495, "step": 43000 }, { "epoch": 0.9955828164694573, "grad_norm": 0.0, "learning_rate": 6.65e-06, "loss": 13.6766, "step": 43500 }, { "epoch": 1.0070262971185315, "grad_norm": 178.05191040039062, "learning_rate": 6.637037037037037e-06, "loss": 12.0881, "step": 44000 }, { "epoch": 1.0184697777676057, "grad_norm": 287.11749267578125, "learning_rate": 6.624074074074074e-06, "loss": 10.3373, "step": 44500 }, { "epoch": 1.02991325841668, "grad_norm": 229.68655395507812, "learning_rate": 6.611111111111111e-06, "loss": 12.0917, "step": 45000 }, { "epoch": 1.0413567390657543, "grad_norm": 0.005943186115473509, "learning_rate": 6.598148148148148e-06, "loss": 11.5574, "step": 45500 }, { "epoch": 1.0528002197148285, "grad_norm": 0.00034078938188031316, "learning_rate": 6.585185185185185e-06, "loss": 13.1375, "step": 46000 }, { "epoch": 1.0642437003639027, "grad_norm": 0.3885180354118347, "learning_rate": 6.572222222222222e-06, "loss": 11.2181, "step": 46500 }, { "epoch": 1.075687181012977, "grad_norm": 1127.3404541015625, "learning_rate": 6.559259259259259e-06, "loss": 12.5768, "step": 47000 }, { "epoch": 1.087130661662051, "grad_norm": 1220.1749267578125, "learning_rate": 6.546296296296297e-06, "loss": 13.4144, "step": 47500 }, { "epoch": 1.0985741423111253, "grad_norm": 0.0005302530480548739, "learning_rate": 6.533333333333333e-06, "loss": 11.0923, "step": 48000 }, { "epoch": 1.1100176229601995, "grad_norm": 661.6603393554688, "learning_rate": 6.52037037037037e-06, "loss": 11.0951, "step": 48500 }, { "epoch": 1.1214611036092739, "grad_norm": 0.08705360442399979, "learning_rate": 6.507407407407407e-06, "loss": 11.5883, "step": 49000 }, { "epoch": 1.132904584258348, "grad_norm": 171.4398651123047, "learning_rate": 6.494444444444445e-06, "loss": 13.5062, "step": 49500 }, { "epoch": 1.1443480649074222, "grad_norm": 0.04586634412407875, "learning_rate": 6.481481481481481e-06, "loss": 12.4203, "step": 50000 }, { "epoch": 1.1557915455564964, "grad_norm": 0.0418708510696888, "learning_rate": 6.4685185185185185e-06, "loss": 12.8418, "step": 50500 }, { "epoch": 1.1672350262055706, "grad_norm": 635.2694091796875, "learning_rate": 6.455555555555556e-06, "loss": 11.7328, "step": 51000 }, { "epoch": 1.1786785068546448, "grad_norm": 216.6066131591797, "learning_rate": 6.442592592592593e-06, "loss": 13.6954, "step": 51500 }, { "epoch": 
1.1901219875037192, "grad_norm": 120.59862518310547, "learning_rate": 6.429629629629629e-06, "loss": 11.0234, "step": 52000 }, { "epoch": 1.2015654681527934, "grad_norm": 647.171875, "learning_rate": 6.4166666666666665e-06, "loss": 14.4181, "step": 52500 }, { "epoch": 1.2130089488018676, "grad_norm": 0.8411855697631836, "learning_rate": 6.403703703703704e-06, "loss": 13.5984, "step": 53000 }, { "epoch": 1.2244524294509418, "grad_norm": 0.0012393624056130648, "learning_rate": 6.390740740740741e-06, "loss": 12.1035, "step": 53500 }, { "epoch": 1.235895910100016, "grad_norm": 0.0, "learning_rate": 6.377777777777777e-06, "loss": 12.8851, "step": 54000 }, { "epoch": 1.2473393907490902, "grad_norm": 0.0, "learning_rate": 6.3648148148148145e-06, "loss": 15.1293, "step": 54500 }, { "epoch": 1.2587828713981644, "grad_norm": 2496.68505859375, "learning_rate": 6.351851851851852e-06, "loss": 11.7267, "step": 55000 }, { "epoch": 1.2702263520472385, "grad_norm": 603.0614013671875, "learning_rate": 6.338888888888889e-06, "loss": 12.6656, "step": 55500 }, { "epoch": 1.281669832696313, "grad_norm": 0.1624222695827484, "learning_rate": 6.325925925925925e-06, "loss": 12.3149, "step": 56000 }, { "epoch": 1.2931133133453871, "grad_norm": 0.0846194475889206, "learning_rate": 6.3129629629629634e-06, "loss": 14.1863, "step": 56500 }, { "epoch": 1.3045567939944613, "grad_norm": 0.0, "learning_rate": 6.3e-06, "loss": 13.3858, "step": 57000 }, { "epoch": 1.3160002746435355, "grad_norm": 0.0017423485405743122, "learning_rate": 6.287037037037037e-06, "loss": 10.2554, "step": 57500 }, { "epoch": 1.3274437552926097, "grad_norm": 1450.389892578125, "learning_rate": 6.274074074074073e-06, "loss": 12.7777, "step": 58000 }, { "epoch": 1.3388872359416841, "grad_norm": 0.043471284210681915, "learning_rate": 6.2611111111111115e-06, "loss": 11.1225, "step": 58500 }, { "epoch": 1.3503307165907583, "grad_norm": 0.0038315041456371546, "learning_rate": 6.248148148148148e-06, "loss": 10.2773, "step": 59000 }, { "epoch": 1.3617741972398325, "grad_norm": 2073.392578125, "learning_rate": 6.235185185185185e-06, "loss": 13.5407, "step": 59500 }, { "epoch": 1.3732176778889067, "grad_norm": 4.363699913024902, "learning_rate": 6.2222222222222215e-06, "loss": 14.8187, "step": 60000 }, { "epoch": 1.3846611585379809, "grad_norm": 0.010874895378947258, "learning_rate": 6.2092592592592595e-06, "loss": 11.7766, "step": 60500 }, { "epoch": 1.396104639187055, "grad_norm": 5.799459457397461, "learning_rate": 6.196296296296296e-06, "loss": 12.6413, "step": 61000 }, { "epoch": 1.4075481198361293, "grad_norm": 0.0036441339179873466, "learning_rate": 6.183333333333333e-06, "loss": 11.2557, "step": 61500 }, { "epoch": 1.4189916004852035, "grad_norm": 1275.374755859375, "learning_rate": 6.17037037037037e-06, "loss": 11.7272, "step": 62000 }, { "epoch": 1.4304350811342779, "grad_norm": 4855.68212890625, "learning_rate": 6.157407407407408e-06, "loss": 12.1518, "step": 62500 }, { "epoch": 1.441878561783352, "grad_norm": 523.7196655273438, "learning_rate": 6.144444444444444e-06, "loss": 12.6978, "step": 63000 }, { "epoch": 1.4533220424324262, "grad_norm": 1098.2850341796875, "learning_rate": 6.131481481481481e-06, "loss": 11.8028, "step": 63500 }, { "epoch": 1.4647655230815004, "grad_norm": 1153.9100341796875, "learning_rate": 6.118518518518518e-06, "loss": 11.3937, "step": 64000 }, { "epoch": 1.4762090037305746, "grad_norm": 1573.587890625, "learning_rate": 6.105555555555556e-06, "loss": 13.9497, "step": 64500 }, { "epoch": 1.487652484379649, "grad_norm": 
908.9359130859375, "learning_rate": 6.092592592592592e-06, "loss": 11.6579, "step": 65000 }, { "epoch": 1.4990959650287232, "grad_norm": 1250.6552734375, "learning_rate": 6.079629629629629e-06, "loss": 12.2497, "step": 65500 }, { "epoch": 1.5105394456777974, "grad_norm": 0.030488021671772003, "learning_rate": 6.0666666666666665e-06, "loss": 13.4229, "step": 66000 }, { "epoch": 1.5219829263268716, "grad_norm": 3663.417724609375, "learning_rate": 6.053703703703704e-06, "loss": 11.9166, "step": 66500 }, { "epoch": 1.5334264069759458, "grad_norm": 0.10368392616510391, "learning_rate": 6.04074074074074e-06, "loss": 11.7133, "step": 67000 }, { "epoch": 1.54486988762502, "grad_norm": 23.950149536132812, "learning_rate": 6.027777777777778e-06, "loss": 10.2979, "step": 67500 }, { "epoch": 1.5563133682740942, "grad_norm": 1314.4285888671875, "learning_rate": 6.0148148148148145e-06, "loss": 11.5517, "step": 68000 }, { "epoch": 1.5677568489231684, "grad_norm": 0.0, "learning_rate": 6.001851851851852e-06, "loss": 12.2986, "step": 68500 }, { "epoch": 1.5792003295722425, "grad_norm": 558.1203002929688, "learning_rate": 5.988888888888888e-06, "loss": 11.5298, "step": 69000 }, { "epoch": 1.590643810221317, "grad_norm": 0.5483641624450684, "learning_rate": 5.975925925925926e-06, "loss": 11.2464, "step": 69500 }, { "epoch": 1.6020872908703911, "grad_norm": 0.009755097329616547, "learning_rate": 5.9629629629629626e-06, "loss": 13.0815, "step": 70000 }, { "epoch": 1.6135307715194653, "grad_norm": 0.007829127833247185, "learning_rate": 5.95e-06, "loss": 10.083, "step": 70500 }, { "epoch": 1.6249742521685397, "grad_norm": 435.7135314941406, "learning_rate": 5.937037037037036e-06, "loss": 11.4477, "step": 71000 }, { "epoch": 1.636417732817614, "grad_norm": 5.702239513397217, "learning_rate": 5.924074074074074e-06, "loss": 11.0763, "step": 71500 }, { "epoch": 1.6478612134666881, "grad_norm": 0.10179603844881058, "learning_rate": 5.911111111111111e-06, "loss": 12.8192, "step": 72000 }, { "epoch": 1.6593046941157623, "grad_norm": 681.2998046875, "learning_rate": 5.898148148148148e-06, "loss": 11.8389, "step": 72500 }, { "epoch": 1.6707481747648365, "grad_norm": 168.10958862304688, "learning_rate": 5.885185185185185e-06, "loss": 9.9504, "step": 73000 }, { "epoch": 1.6821916554139107, "grad_norm": 0.004441286437213421, "learning_rate": 5.872222222222222e-06, "loss": 14.6829, "step": 73500 }, { "epoch": 1.6936351360629849, "grad_norm": 0.020898401737213135, "learning_rate": 5.859259259259259e-06, "loss": 11.1156, "step": 74000 }, { "epoch": 1.705078616712059, "grad_norm": 0.021843064576387405, "learning_rate": 5.846296296296296e-06, "loss": 10.6854, "step": 74500 }, { "epoch": 1.7165220973611333, "grad_norm": 33.806251525878906, "learning_rate": 5.833333333333333e-06, "loss": 13.1055, "step": 75000 }, { "epoch": 1.7279655780102074, "grad_norm": 29.790348052978516, "learning_rate": 5.82037037037037e-06, "loss": 12.1967, "step": 75500 }, { "epoch": 1.7394090586592819, "grad_norm": 2041.1212158203125, "learning_rate": 5.8074074074074076e-06, "loss": 12.7192, "step": 76000 }, { "epoch": 1.750852539308356, "grad_norm": 1129.54833984375, "learning_rate": 5.794444444444444e-06, "loss": 12.4452, "step": 76500 }, { "epoch": 1.7622960199574302, "grad_norm": 0.3305358290672302, "learning_rate": 5.781481481481482e-06, "loss": 12.6818, "step": 77000 }, { "epoch": 1.7737395006065044, "grad_norm": 5318.0498046875, "learning_rate": 5.768518518518518e-06, "loss": 11.3843, "step": 77500 }, { "epoch": 1.7851829812555788, "grad_norm": 
1354.53466796875, "learning_rate": 5.755555555555556e-06, "loss": 11.9361, "step": 78000 }, { "epoch": 1.796626461904653, "grad_norm": 0.01847067102789879, "learning_rate": 5.742592592592593e-06, "loss": 14.4414, "step": 78500 }, { "epoch": 1.8080699425537272, "grad_norm": 0.0, "learning_rate": 5.72962962962963e-06, "loss": 12.2809, "step": 79000 }, { "epoch": 1.8195134232028014, "grad_norm": 0.0010120772058144212, "learning_rate": 5.7166666666666664e-06, "loss": 11.9991, "step": 79500 }, { "epoch": 1.8309569038518756, "grad_norm": 0.0, "learning_rate": 5.703703703703704e-06, "loss": 10.9662, "step": 80000 }, { "epoch": 1.8424003845009498, "grad_norm": 279.47137451171875, "learning_rate": 5.690740740740741e-06, "loss": 11.1481, "step": 80500 }, { "epoch": 1.853843865150024, "grad_norm": 0.0, "learning_rate": 5.677777777777778e-06, "loss": 12.123, "step": 81000 }, { "epoch": 1.8652873457990982, "grad_norm": 807.4469604492188, "learning_rate": 5.6648148148148145e-06, "loss": 13.1075, "step": 81500 }, { "epoch": 1.8767308264481724, "grad_norm": 173.2933349609375, "learning_rate": 5.6518518518518525e-06, "loss": 12.5405, "step": 82000 }, { "epoch": 1.8881743070972465, "grad_norm": 0.0, "learning_rate": 5.638888888888889e-06, "loss": 13.9821, "step": 82500 }, { "epoch": 1.899617787746321, "grad_norm": 2014.615478515625, "learning_rate": 5.625925925925926e-06, "loss": 9.4421, "step": 83000 }, { "epoch": 1.9110612683953951, "grad_norm": 0.0, "learning_rate": 5.6129629629629625e-06, "loss": 10.8762, "step": 83500 }, { "epoch": 1.9225047490444693, "grad_norm": 70.37690734863281, "learning_rate": 5.600000000000001e-06, "loss": 12.3405, "step": 84000 }, { "epoch": 1.9339482296935437, "grad_norm": 0.5092810392379761, "learning_rate": 5.587037037037037e-06, "loss": 10.5346, "step": 84500 }, { "epoch": 1.945391710342618, "grad_norm": 1005.1751098632812, "learning_rate": 5.574074074074074e-06, "loss": 13.8286, "step": 85000 }, { "epoch": 1.9568351909916921, "grad_norm": 0.003633589716628194, "learning_rate": 5.5611111111111106e-06, "loss": 12.5279, "step": 85500 }, { "epoch": 1.9682786716407663, "grad_norm": 398.6470642089844, "learning_rate": 5.548148148148149e-06, "loss": 10.5745, "step": 86000 }, { "epoch": 1.9797221522898405, "grad_norm": 538.2572021484375, "learning_rate": 5.535185185185185e-06, "loss": 14.3954, "step": 86500 }, { "epoch": 1.9911656329389147, "grad_norm": 656.5657348632812, "learning_rate": 5.522222222222222e-06, "loss": 13.2243, "step": 87000 }, { "epoch": 2.002609113587989, "grad_norm": 195.96044921875, "learning_rate": 5.5092592592592595e-06, "loss": 13.1255, "step": 87500 }, { "epoch": 2.014052594237063, "grad_norm": 1494.8990478515625, "learning_rate": 5.496296296296297e-06, "loss": 8.6693, "step": 88000 }, { "epoch": 2.0254960748861373, "grad_norm": 0.0, "learning_rate": 5.483333333333333e-06, "loss": 10.1512, "step": 88500 }, { "epoch": 2.0369395555352114, "grad_norm": 3263.656982421875, "learning_rate": 5.47037037037037e-06, "loss": 11.7968, "step": 89000 }, { "epoch": 2.0483830361842856, "grad_norm": 0.5306602716445923, "learning_rate": 5.4574074074074075e-06, "loss": 10.1574, "step": 89500 }, { "epoch": 2.05982651683336, "grad_norm": 955.903076171875, "learning_rate": 5.444444444444445e-06, "loss": 8.7719, "step": 90000 }, { "epoch": 2.0712699974824345, "grad_norm": 28.916826248168945, "learning_rate": 5.431481481481481e-06, "loss": 7.6421, "step": 90500 }, { "epoch": 2.0827134781315086, "grad_norm": 0.018332993611693382, "learning_rate": 5.418518518518518e-06, "loss": 
9.0135, "step": 91000 }, { "epoch": 2.094156958780583, "grad_norm": 240.7179412841797, "learning_rate": 5.4055555555555556e-06, "loss": 8.8454, "step": 91500 }, { "epoch": 2.105600439429657, "grad_norm": 237.4253692626953, "learning_rate": 5.392592592592593e-06, "loss": 9.7903, "step": 92000 }, { "epoch": 2.117043920078731, "grad_norm": 2.053232192993164, "learning_rate": 5.379629629629629e-06, "loss": 11.0608, "step": 92500 }, { "epoch": 2.1284874007278054, "grad_norm": 862.31201171875, "learning_rate": 5.366666666666667e-06, "loss": 10.3125, "step": 93000 }, { "epoch": 2.1399308813768796, "grad_norm": 0.03503242880105972, "learning_rate": 5.353703703703704e-06, "loss": 12.3845, "step": 93500 }, { "epoch": 2.151374362025954, "grad_norm": 0.035824213176965714, "learning_rate": 5.340740740740741e-06, "loss": 9.1687, "step": 94000 }, { "epoch": 2.162817842675028, "grad_norm": 0.0, "learning_rate": 5.327777777777777e-06, "loss": 9.9659, "step": 94500 }, { "epoch": 2.174261323324102, "grad_norm": 87.5394515991211, "learning_rate": 5.314814814814815e-06, "loss": 9.6173, "step": 95000 }, { "epoch": 2.1857048039731763, "grad_norm": 0.0, "learning_rate": 5.301851851851852e-06, "loss": 9.4475, "step": 95500 }, { "epoch": 2.1971482846222505, "grad_norm": 1670.8548583984375, "learning_rate": 5.288888888888889e-06, "loss": 9.944, "step": 96000 }, { "epoch": 2.2085917652713247, "grad_norm": 0.0, "learning_rate": 5.275925925925925e-06, "loss": 9.0547, "step": 96500 }, { "epoch": 2.220035245920399, "grad_norm": 0.06572415679693222, "learning_rate": 5.262962962962963e-06, "loss": 8.8515, "step": 97000 }, { "epoch": 2.2314787265694735, "grad_norm": 0.009067896753549576, "learning_rate": 5.25e-06, "loss": 10.3228, "step": 97500 }, { "epoch": 2.2429222072185477, "grad_norm": 0.0, "learning_rate": 5.237037037037037e-06, "loss": 10.091, "step": 98000 }, { "epoch": 2.254365687867622, "grad_norm": 12197.4453125, "learning_rate": 5.224074074074074e-06, "loss": 10.6507, "step": 98500 }, { "epoch": 2.265809168516696, "grad_norm": 0.006908933632075787, "learning_rate": 5.211111111111111e-06, "loss": 9.9997, "step": 99000 }, { "epoch": 2.2772526491657703, "grad_norm": 786.2550659179688, "learning_rate": 5.198148148148148e-06, "loss": 10.4626, "step": 99500 }, { "epoch": 2.2886961298148445, "grad_norm": 206.81629943847656, "learning_rate": 5.185185185185185e-06, "loss": 9.6626, "step": 100000 } ], "logging_steps": 500, "max_steps": 300000, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }