{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.881422924901186, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003952569169960474, "grad_norm": 10.89438533782959, "learning_rate": 5e-06, "loss": 4.1054, "step": 1 }, { "epoch": 0.007905138339920948, "grad_norm": 10.555335998535156, "learning_rate": 1e-05, "loss": 4.1308, "step": 2 }, { "epoch": 0.011857707509881422, "grad_norm": 10.703575134277344, "learning_rate": 1.5e-05, "loss": 4.3599, "step": 3 }, { "epoch": 0.015810276679841896, "grad_norm": 5.956073760986328, "learning_rate": 2e-05, "loss": 3.8205, "step": 4 }, { "epoch": 0.019762845849802372, "grad_norm": 6.856590270996094, "learning_rate": 2.5e-05, "loss": 3.5323, "step": 5 }, { "epoch": 0.023715415019762844, "grad_norm": 7.316745281219482, "learning_rate": 3e-05, "loss": 3.7808, "step": 6 }, { "epoch": 0.02766798418972332, "grad_norm": 4.9175705909729, "learning_rate": 3.5e-05, "loss": 3.4615, "step": 7 }, { "epoch": 0.03162055335968379, "grad_norm": 4.404887676239014, "learning_rate": 4e-05, "loss": 3.3232, "step": 8 }, { "epoch": 0.03557312252964427, "grad_norm": 4.211629390716553, "learning_rate": 4.5e-05, "loss": 3.2175, "step": 9 }, { "epoch": 0.039525691699604744, "grad_norm": 4.854459762573242, "learning_rate": 5e-05, "loss": 3.3368, "step": 10 }, { "epoch": 0.043478260869565216, "grad_norm": 3.71142840385437, "learning_rate": 4.998015873015873e-05, "loss": 3.2278, "step": 11 }, { "epoch": 0.04743083003952569, "grad_norm": 2.8285276889801025, "learning_rate": 4.996031746031746e-05, "loss": 2.9991, "step": 12 }, { "epoch": 0.05138339920948617, "grad_norm": 2.7281620502471924, "learning_rate": 4.994047619047619e-05, "loss": 3.2112, "step": 13 }, { "epoch": 0.05533596837944664, "grad_norm": 2.6310040950775146, "learning_rate": 4.9920634920634924e-05, "loss": 3.3289, "step": 14 }, { "epoch": 0.05928853754940711, "grad_norm": 2.928968667984009, "learning_rate": 4.990079365079365e-05, "loss": 3.3052, "step": 15 }, { "epoch": 0.06324110671936758, "grad_norm": 2.7853801250457764, "learning_rate": 4.9880952380952385e-05, "loss": 3.0065, "step": 16 }, { "epoch": 0.06719367588932806, "grad_norm": 2.0545976161956787, "learning_rate": 4.986111111111111e-05, "loss": 3.0645, "step": 17 }, { "epoch": 0.07114624505928854, "grad_norm": 2.3543589115142822, "learning_rate": 4.9841269841269845e-05, "loss": 3.0591, "step": 18 }, { "epoch": 0.07509881422924901, "grad_norm": 2.1679906845092773, "learning_rate": 4.982142857142857e-05, "loss": 2.9374, "step": 19 }, { "epoch": 0.07905138339920949, "grad_norm": 2.5650746822357178, "learning_rate": 4.9801587301587306e-05, "loss": 2.9292, "step": 20 }, { "epoch": 0.08300395256916997, "grad_norm": 2.4944090843200684, "learning_rate": 4.978174603174603e-05, "loss": 3.0967, "step": 21 }, { "epoch": 0.08695652173913043, "grad_norm": 2.8087055683135986, "learning_rate": 4.976190476190477e-05, "loss": 3.076, "step": 22 }, { "epoch": 0.09090909090909091, "grad_norm": 2.570450782775879, "learning_rate": 4.9742063492063494e-05, "loss": 2.8585, "step": 23 }, { "epoch": 0.09486166007905138, "grad_norm": 2.3296656608581543, "learning_rate": 4.972222222222223e-05, "loss": 2.7182, "step": 24 }, { "epoch": 0.09881422924901186, "grad_norm": 2.1972436904907227, "learning_rate": 4.9702380952380955e-05, "loss": 2.8991, "step": 25 }, { "epoch": 0.10276679841897234, "grad_norm": 2.7606372833251953, "learning_rate": 4.968253968253969e-05, "loss": 2.912, "step": 26 }, { "epoch": 0.1067193675889328, "grad_norm": 2.7917637825012207, "learning_rate": 4.9662698412698415e-05, "loss": 2.8944, "step": 27 }, { "epoch": 0.11067193675889328, "grad_norm": 2.271846055984497, "learning_rate": 4.964285714285715e-05, "loss": 2.7234, "step": 28 }, { "epoch": 0.11462450592885376, "grad_norm": 2.8581902980804443, "learning_rate": 4.9623015873015876e-05, "loss": 2.662, "step": 29 }, { "epoch": 0.11857707509881422, "grad_norm": 2.4018585681915283, "learning_rate": 4.960317460317461e-05, "loss": 2.554, "step": 30 }, { "epoch": 0.1225296442687747, "grad_norm": 3.0106191635131836, "learning_rate": 4.958333333333334e-05, "loss": 2.6187, "step": 31 }, { "epoch": 0.12648221343873517, "grad_norm": 2.4871628284454346, "learning_rate": 4.956349206349207e-05, "loss": 2.5486, "step": 32 }, { "epoch": 0.13043478260869565, "grad_norm": 3.1579999923706055, "learning_rate": 4.95436507936508e-05, "loss": 2.4142, "step": 33 }, { "epoch": 0.13438735177865613, "grad_norm": 2.4816365242004395, "learning_rate": 4.9523809523809525e-05, "loss": 2.8092, "step": 34 }, { "epoch": 0.1383399209486166, "grad_norm": 2.4598770141601562, "learning_rate": 4.950396825396826e-05, "loss": 2.5606, "step": 35 }, { "epoch": 0.1422924901185771, "grad_norm": 2.9210352897644043, "learning_rate": 4.9484126984126985e-05, "loss": 2.6477, "step": 36 }, { "epoch": 0.14624505928853754, "grad_norm": 2.082595109939575, "learning_rate": 4.946428571428572e-05, "loss": 2.6539, "step": 37 }, { "epoch": 0.15019762845849802, "grad_norm": 2.171468734741211, "learning_rate": 4.9444444444444446e-05, "loss": 2.8466, "step": 38 }, { "epoch": 0.1541501976284585, "grad_norm": 2.395892858505249, "learning_rate": 4.942460317460318e-05, "loss": 2.4224, "step": 39 }, { "epoch": 0.15810276679841898, "grad_norm": 3.123197317123413, "learning_rate": 4.940476190476191e-05, "loss": 2.9204, "step": 40 }, { "epoch": 0.16205533596837945, "grad_norm": 2.659156084060669, "learning_rate": 4.938492063492064e-05, "loss": 2.43, "step": 41 }, { "epoch": 0.16600790513833993, "grad_norm": 2.520598888397217, "learning_rate": 4.936507936507937e-05, "loss": 2.4932, "step": 42 }, { "epoch": 0.16996047430830039, "grad_norm": 2.3670473098754883, "learning_rate": 4.93452380952381e-05, "loss": 2.5526, "step": 43 }, { "epoch": 0.17391304347826086, "grad_norm": 2.4440720081329346, "learning_rate": 4.932539682539683e-05, "loss": 2.7987, "step": 44 }, { "epoch": 0.17786561264822134, "grad_norm": 2.4289233684539795, "learning_rate": 4.930555555555556e-05, "loss": 2.5825, "step": 45 }, { "epoch": 0.18181818181818182, "grad_norm": 2.3470001220703125, "learning_rate": 4.928571428571429e-05, "loss": 2.6192, "step": 46 }, { "epoch": 0.1857707509881423, "grad_norm": 2.0881083011627197, "learning_rate": 4.926587301587302e-05, "loss": 2.5765, "step": 47 }, { "epoch": 0.18972332015810275, "grad_norm": 2.184652328491211, "learning_rate": 4.924603174603175e-05, "loss": 2.5345, "step": 48 }, { "epoch": 0.19367588932806323, "grad_norm": 2.1539316177368164, "learning_rate": 4.9226190476190484e-05, "loss": 2.7078, "step": 49 }, { "epoch": 0.1976284584980237, "grad_norm": 2.476987361907959, "learning_rate": 4.9206349206349204e-05, "loss": 2.5304, "step": 50 }, { "epoch": 0.2015810276679842, "grad_norm": 2.591413974761963, "learning_rate": 4.918650793650794e-05, "loss": 2.2591, "step": 51 }, { "epoch": 0.20553359683794467, "grad_norm": 6.341241836547852, "learning_rate": 4.9166666666666665e-05, "loss": 2.4837, "step": 52 }, { "epoch": 0.20948616600790515, "grad_norm": 2.9513611793518066, "learning_rate": 4.91468253968254e-05, "loss": 2.6514, "step": 53 }, { "epoch": 0.2134387351778656, "grad_norm": 2.2543134689331055, "learning_rate": 4.9126984126984125e-05, "loss": 2.657, "step": 54 }, { "epoch": 0.21739130434782608, "grad_norm": 2.0405731201171875, "learning_rate": 4.910714285714286e-05, "loss": 2.5088, "step": 55 }, { "epoch": 0.22134387351778656, "grad_norm": 2.6079225540161133, "learning_rate": 4.9087301587301586e-05, "loss": 2.2703, "step": 56 }, { "epoch": 0.22529644268774704, "grad_norm": 2.587153673171997, "learning_rate": 4.906746031746032e-05, "loss": 2.5027, "step": 57 }, { "epoch": 0.22924901185770752, "grad_norm": 2.083115577697754, "learning_rate": 4.904761904761905e-05, "loss": 2.7224, "step": 58 }, { "epoch": 0.233201581027668, "grad_norm": 2.291238784790039, "learning_rate": 4.902777777777778e-05, "loss": 2.7383, "step": 59 }, { "epoch": 0.23715415019762845, "grad_norm": 2.4876511096954346, "learning_rate": 4.900793650793651e-05, "loss": 2.3814, "step": 60 }, { "epoch": 0.24110671936758893, "grad_norm": 2.6496829986572266, "learning_rate": 4.898809523809524e-05, "loss": 2.2271, "step": 61 }, { "epoch": 0.2450592885375494, "grad_norm": 2.8697926998138428, "learning_rate": 4.896825396825397e-05, "loss": 2.5395, "step": 62 }, { "epoch": 0.2490118577075099, "grad_norm": 2.5211477279663086, "learning_rate": 4.89484126984127e-05, "loss": 2.0096, "step": 63 }, { "epoch": 0.25296442687747034, "grad_norm": 2.0613839626312256, "learning_rate": 4.892857142857143e-05, "loss": 2.1318, "step": 64 }, { "epoch": 0.25691699604743085, "grad_norm": 2.4938395023345947, "learning_rate": 4.8908730158730156e-05, "loss": 2.4198, "step": 65 }, { "epoch": 0.2608695652173913, "grad_norm": 2.5162112712860107, "learning_rate": 4.888888888888889e-05, "loss": 2.2697, "step": 66 }, { "epoch": 0.2648221343873518, "grad_norm": 2.361053943634033, "learning_rate": 4.886904761904762e-05, "loss": 2.6438, "step": 67 }, { "epoch": 0.26877470355731226, "grad_norm": 2.257228374481201, "learning_rate": 4.884920634920635e-05, "loss": 2.4411, "step": 68 }, { "epoch": 0.2727272727272727, "grad_norm": 3.0840468406677246, "learning_rate": 4.882936507936508e-05, "loss": 2.1741, "step": 69 }, { "epoch": 0.2766798418972332, "grad_norm": 2.4883201122283936, "learning_rate": 4.880952380952381e-05, "loss": 2.2851, "step": 70 }, { "epoch": 0.28063241106719367, "grad_norm": 2.0108885765075684, "learning_rate": 4.878968253968254e-05, "loss": 2.3053, "step": 71 }, { "epoch": 0.2845849802371542, "grad_norm": 2.4046590328216553, "learning_rate": 4.876984126984127e-05, "loss": 2.2728, "step": 72 }, { "epoch": 0.2885375494071146, "grad_norm": 2.492821216583252, "learning_rate": 4.875e-05, "loss": 2.463, "step": 73 }, { "epoch": 0.2924901185770751, "grad_norm": 1.9024349451065063, "learning_rate": 4.873015873015873e-05, "loss": 2.5196, "step": 74 }, { "epoch": 0.2964426877470356, "grad_norm": 2.240678310394287, "learning_rate": 4.871031746031746e-05, "loss": 2.2548, "step": 75 }, { "epoch": 0.30039525691699603, "grad_norm": 2.447615146636963, "learning_rate": 4.8690476190476194e-05, "loss": 2.9293, "step": 76 }, { "epoch": 0.30434782608695654, "grad_norm": 1.805576205253601, "learning_rate": 4.867063492063492e-05, "loss": 2.3533, "step": 77 }, { "epoch": 0.308300395256917, "grad_norm": 2.2282891273498535, "learning_rate": 4.8650793650793654e-05, "loss": 2.5451, "step": 78 }, { "epoch": 0.31225296442687744, "grad_norm": 2.568352699279785, "learning_rate": 4.863095238095238e-05, "loss": 1.9444, "step": 79 }, { "epoch": 0.31620553359683795, "grad_norm": 1.9792273044586182, "learning_rate": 4.8611111111111115e-05, "loss": 2.4367, "step": 80 }, { "epoch": 0.3201581027667984, "grad_norm": 2.3930373191833496, "learning_rate": 4.859126984126984e-05, "loss": 2.3669, "step": 81 }, { "epoch": 0.3241106719367589, "grad_norm": 2.2622337341308594, "learning_rate": 4.8571428571428576e-05, "loss": 2.4811, "step": 82 }, { "epoch": 0.32806324110671936, "grad_norm": 2.4643726348876953, "learning_rate": 4.85515873015873e-05, "loss": 2.3285, "step": 83 }, { "epoch": 0.33201581027667987, "grad_norm": 2.4243311882019043, "learning_rate": 4.853174603174604e-05, "loss": 2.579, "step": 84 }, { "epoch": 0.3359683794466403, "grad_norm": 3.0440781116485596, "learning_rate": 4.8511904761904764e-05, "loss": 2.0484, "step": 85 }, { "epoch": 0.33992094861660077, "grad_norm": 2.282578706741333, "learning_rate": 4.84920634920635e-05, "loss": 2.2485, "step": 86 }, { "epoch": 0.3438735177865613, "grad_norm": 2.2174232006073, "learning_rate": 4.8472222222222224e-05, "loss": 2.6938, "step": 87 }, { "epoch": 0.34782608695652173, "grad_norm": 2.4350028038024902, "learning_rate": 4.845238095238095e-05, "loss": 2.1194, "step": 88 }, { "epoch": 0.35177865612648224, "grad_norm": 2.424428939819336, "learning_rate": 4.8432539682539685e-05, "loss": 2.1156, "step": 89 }, { "epoch": 0.3557312252964427, "grad_norm": 2.444096326828003, "learning_rate": 4.841269841269841e-05, "loss": 2.2003, "step": 90 }, { "epoch": 0.35968379446640314, "grad_norm": 2.2972753047943115, "learning_rate": 4.8392857142857146e-05, "loss": 2.273, "step": 91 }, { "epoch": 0.36363636363636365, "grad_norm": 2.599595546722412, "learning_rate": 4.837301587301587e-05, "loss": 2.1506, "step": 92 }, { "epoch": 0.3675889328063241, "grad_norm": 2.1127512454986572, "learning_rate": 4.835317460317461e-05, "loss": 2.2395, "step": 93 }, { "epoch": 0.3715415019762846, "grad_norm": 1.8674228191375732, "learning_rate": 4.8333333333333334e-05, "loss": 2.4755, "step": 94 }, { "epoch": 0.37549407114624506, "grad_norm": 2.062201499938965, "learning_rate": 4.831349206349207e-05, "loss": 2.8275, "step": 95 }, { "epoch": 0.3794466403162055, "grad_norm": 2.196667432785034, "learning_rate": 4.8293650793650794e-05, "loss": 2.3944, "step": 96 }, { "epoch": 0.383399209486166, "grad_norm": 2.0713300704956055, "learning_rate": 4.827380952380953e-05, "loss": 2.4696, "step": 97 }, { "epoch": 0.38735177865612647, "grad_norm": 2.0104503631591797, "learning_rate": 4.8253968253968255e-05, "loss": 2.334, "step": 98 }, { "epoch": 0.391304347826087, "grad_norm": 2.1917121410369873, "learning_rate": 4.823412698412699e-05, "loss": 2.2834, "step": 99 }, { "epoch": 0.3952569169960474, "grad_norm": 2.4141860008239746, "learning_rate": 4.8214285714285716e-05, "loss": 2.3086, "step": 100 }, { "epoch": 0.39920948616600793, "grad_norm": 2.384432792663574, "learning_rate": 4.819444444444445e-05, "loss": 2.1402, "step": 101 }, { "epoch": 0.4031620553359684, "grad_norm": 2.4520158767700195, "learning_rate": 4.817460317460318e-05, "loss": 2.033, "step": 102 }, { "epoch": 0.40711462450592883, "grad_norm": 2.809053659439087, "learning_rate": 4.815476190476191e-05, "loss": 2.0992, "step": 103 }, { "epoch": 0.41106719367588934, "grad_norm": 2.3782355785369873, "learning_rate": 4.813492063492064e-05, "loss": 2.3137, "step": 104 }, { "epoch": 0.4150197628458498, "grad_norm": 1.9347542524337769, "learning_rate": 4.811507936507937e-05, "loss": 2.3339, "step": 105 }, { "epoch": 0.4189723320158103, "grad_norm": 2.0949320793151855, "learning_rate": 4.80952380952381e-05, "loss": 2.4536, "step": 106 }, { "epoch": 0.42292490118577075, "grad_norm": 1.8690857887268066, "learning_rate": 4.807539682539683e-05, "loss": 2.0155, "step": 107 }, { "epoch": 0.4268774703557312, "grad_norm": 2.1170482635498047, "learning_rate": 4.805555555555556e-05, "loss": 2.2891, "step": 108 }, { "epoch": 0.4308300395256917, "grad_norm": 2.1500346660614014, "learning_rate": 4.803571428571429e-05, "loss": 2.4996, "step": 109 }, { "epoch": 0.43478260869565216, "grad_norm": 1.9133753776550293, "learning_rate": 4.801587301587302e-05, "loss": 2.57, "step": 110 }, { "epoch": 0.43873517786561267, "grad_norm": 2.0104360580444336, "learning_rate": 4.799603174603175e-05, "loss": 2.1764, "step": 111 }, { "epoch": 0.4426877470355731, "grad_norm": 2.2259364128112793, "learning_rate": 4.797619047619048e-05, "loss": 2.4447, "step": 112 }, { "epoch": 0.44664031620553357, "grad_norm": 1.6618857383728027, "learning_rate": 4.795634920634921e-05, "loss": 2.3628, "step": 113 }, { "epoch": 0.4505928853754941, "grad_norm": 2.0611374378204346, "learning_rate": 4.793650793650794e-05, "loss": 2.7009, "step": 114 }, { "epoch": 0.45454545454545453, "grad_norm": 2.3287601470947266, "learning_rate": 4.791666666666667e-05, "loss": 2.5049, "step": 115 }, { "epoch": 0.45849802371541504, "grad_norm": 2.238161563873291, "learning_rate": 4.78968253968254e-05, "loss": 2.3077, "step": 116 }, { "epoch": 0.4624505928853755, "grad_norm": 2.3234920501708984, "learning_rate": 4.787698412698413e-05, "loss": 2.3173, "step": 117 }, { "epoch": 0.466403162055336, "grad_norm": 2.1455657482147217, "learning_rate": 4.785714285714286e-05, "loss": 2.329, "step": 118 }, { "epoch": 0.47035573122529645, "grad_norm": 2.4944536685943604, "learning_rate": 4.783730158730159e-05, "loss": 2.1652, "step": 119 }, { "epoch": 0.4743083003952569, "grad_norm": 1.9373809099197388, "learning_rate": 4.781746031746032e-05, "loss": 2.3349, "step": 120 }, { "epoch": 0.4782608695652174, "grad_norm": 2.0994389057159424, "learning_rate": 4.779761904761905e-05, "loss": 2.27, "step": 121 }, { "epoch": 0.48221343873517786, "grad_norm": 1.697857141494751, "learning_rate": 4.7777777777777784e-05, "loss": 2.3405, "step": 122 }, { "epoch": 0.48616600790513836, "grad_norm": 2.085141658782959, "learning_rate": 4.775793650793651e-05, "loss": 1.9531, "step": 123 }, { "epoch": 0.4901185770750988, "grad_norm": 2.2885854244232178, "learning_rate": 4.7738095238095245e-05, "loss": 2.2837, "step": 124 }, { "epoch": 0.49407114624505927, "grad_norm": 2.0135693550109863, "learning_rate": 4.771825396825397e-05, "loss": 2.3009, "step": 125 }, { "epoch": 0.4980237154150198, "grad_norm": 2.485074520111084, "learning_rate": 4.7698412698412706e-05, "loss": 1.9311, "step": 126 }, { "epoch": 0.5019762845849802, "grad_norm": 2.2360167503356934, "learning_rate": 4.767857142857143e-05, "loss": 2.426, "step": 127 }, { "epoch": 0.5059288537549407, "grad_norm": 2.3205668926239014, "learning_rate": 4.7658730158730166e-05, "loss": 1.9649, "step": 128 }, { "epoch": 0.5098814229249012, "grad_norm": 2.3518431186676025, "learning_rate": 4.7638888888888887e-05, "loss": 2.5223, "step": 129 }, { "epoch": 0.5138339920948617, "grad_norm": 2.197111129760742, "learning_rate": 4.761904761904762e-05, "loss": 2.303, "step": 130 }, { "epoch": 0.5177865612648221, "grad_norm": 2.4568827152252197, "learning_rate": 4.759920634920635e-05, "loss": 2.0753, "step": 131 }, { "epoch": 0.5217391304347826, "grad_norm": 2.004725217819214, "learning_rate": 4.757936507936508e-05, "loss": 2.3092, "step": 132 }, { "epoch": 0.525691699604743, "grad_norm": 2.0977554321289062, "learning_rate": 4.755952380952381e-05, "loss": 2.4393, "step": 133 }, { "epoch": 0.5296442687747036, "grad_norm": 1.903745412826538, "learning_rate": 4.753968253968254e-05, "loss": 2.0935, "step": 134 }, { "epoch": 0.5335968379446641, "grad_norm": 2.0144717693328857, "learning_rate": 4.751984126984127e-05, "loss": 2.2653, "step": 135 }, { "epoch": 0.5375494071146245, "grad_norm": 2.0766761302948, "learning_rate": 4.75e-05, "loss": 2.2034, "step": 136 }, { "epoch": 0.541501976284585, "grad_norm": 2.109747886657715, "learning_rate": 4.748015873015873e-05, "loss": 2.2778, "step": 137 }, { "epoch": 0.5454545454545454, "grad_norm": 8.714595794677734, "learning_rate": 4.746031746031746e-05, "loss": 1.945, "step": 138 }, { "epoch": 0.549407114624506, "grad_norm": 2.571970224380493, "learning_rate": 4.744047619047619e-05, "loss": 2.0047, "step": 139 }, { "epoch": 0.5533596837944664, "grad_norm": 2.0782594680786133, "learning_rate": 4.7420634920634924e-05, "loss": 2.2821, "step": 140 }, { "epoch": 0.5573122529644269, "grad_norm": 1.9552282094955444, "learning_rate": 4.740079365079365e-05, "loss": 1.9381, "step": 141 }, { "epoch": 0.5612648221343873, "grad_norm": 2.5030784606933594, "learning_rate": 4.738095238095238e-05, "loss": 1.9552, "step": 142 }, { "epoch": 0.5652173913043478, "grad_norm": 1.996016263961792, "learning_rate": 4.736111111111111e-05, "loss": 2.3815, "step": 143 }, { "epoch": 0.5691699604743083, "grad_norm": 2.2575206756591797, "learning_rate": 4.734126984126984e-05, "loss": 2.3244, "step": 144 }, { "epoch": 0.5731225296442688, "grad_norm": 2.067824363708496, "learning_rate": 4.732142857142857e-05, "loss": 2.338, "step": 145 }, { "epoch": 0.5770750988142292, "grad_norm": 1.8195747137069702, "learning_rate": 4.73015873015873e-05, "loss": 2.4185, "step": 146 }, { "epoch": 0.5810276679841897, "grad_norm": 2.230973958969116, "learning_rate": 4.728174603174603e-05, "loss": 2.2955, "step": 147 }, { "epoch": 0.5849802371541502, "grad_norm": 2.303642749786377, "learning_rate": 4.726190476190476e-05, "loss": 2.6163, "step": 148 }, { "epoch": 0.5889328063241107, "grad_norm": 2.1462321281433105, "learning_rate": 4.7242063492063494e-05, "loss": 2.1224, "step": 149 }, { "epoch": 0.5928853754940712, "grad_norm": 1.8279284238815308, "learning_rate": 4.722222222222222e-05, "loss": 2.1556, "step": 150 }, { "epoch": 0.5968379446640316, "grad_norm": 2.0945279598236084, "learning_rate": 4.7202380952380955e-05, "loss": 2.2004, "step": 151 }, { "epoch": 0.6007905138339921, "grad_norm": 2.0842084884643555, "learning_rate": 4.718253968253968e-05, "loss": 1.9211, "step": 152 }, { "epoch": 0.6047430830039525, "grad_norm": 1.9995882511138916, "learning_rate": 4.7162698412698416e-05, "loss": 2.116, "step": 153 }, { "epoch": 0.6086956521739131, "grad_norm": 2.471076250076294, "learning_rate": 4.714285714285714e-05, "loss": 2.035, "step": 154 }, { "epoch": 0.6126482213438735, "grad_norm": 2.1906332969665527, "learning_rate": 4.7123015873015876e-05, "loss": 1.8779, "step": 155 }, { "epoch": 0.616600790513834, "grad_norm": 2.511838674545288, "learning_rate": 4.71031746031746e-05, "loss": 1.9335, "step": 156 }, { "epoch": 0.6205533596837944, "grad_norm": 1.754230260848999, "learning_rate": 4.708333333333334e-05, "loss": 2.1955, "step": 157 }, { "epoch": 0.6245059288537549, "grad_norm": 1.9571726322174072, "learning_rate": 4.7063492063492064e-05, "loss": 2.0802, "step": 158 }, { "epoch": 0.6284584980237155, "grad_norm": 2.271517038345337, "learning_rate": 4.70436507936508e-05, "loss": 2.2352, "step": 159 }, { "epoch": 0.6324110671936759, "grad_norm": 1.9034878015518188, "learning_rate": 4.7023809523809525e-05, "loss": 2.2499, "step": 160 }, { "epoch": 0.6363636363636364, "grad_norm": 1.9912493228912354, "learning_rate": 4.700396825396826e-05, "loss": 2.6197, "step": 161 }, { "epoch": 0.6403162055335968, "grad_norm": 2.255777597427368, "learning_rate": 4.6984126984126986e-05, "loss": 2.36, "step": 162 }, { "epoch": 0.6442687747035574, "grad_norm": 2.2219536304473877, "learning_rate": 4.696428571428572e-05, "loss": 1.9835, "step": 163 }, { "epoch": 0.6482213438735178, "grad_norm": 2.551605224609375, "learning_rate": 4.6944444444444446e-05, "loss": 2.2187, "step": 164 }, { "epoch": 0.6521739130434783, "grad_norm": 2.3967275619506836, "learning_rate": 4.692460317460317e-05, "loss": 1.8407, "step": 165 }, { "epoch": 0.6561264822134387, "grad_norm": 2.194493293762207, "learning_rate": 4.690476190476191e-05, "loss": 2.3458, "step": 166 }, { "epoch": 0.6600790513833992, "grad_norm": 1.9432865381240845, "learning_rate": 4.6884920634920634e-05, "loss": 2.2172, "step": 167 }, { "epoch": 0.6640316205533597, "grad_norm": 2.18040132522583, "learning_rate": 4.686507936507937e-05, "loss": 2.1371, "step": 168 }, { "epoch": 0.6679841897233202, "grad_norm": 2.4075417518615723, "learning_rate": 4.6845238095238095e-05, "loss": 2.2766, "step": 169 }, { "epoch": 0.6719367588932806, "grad_norm": 1.8353605270385742, "learning_rate": 4.682539682539683e-05, "loss": 1.809, "step": 170 }, { "epoch": 0.6758893280632411, "grad_norm": 2.305044651031494, "learning_rate": 4.6805555555555556e-05, "loss": 1.9724, "step": 171 }, { "epoch": 0.6798418972332015, "grad_norm": 2.7235798835754395, "learning_rate": 4.678571428571429e-05, "loss": 1.6704, "step": 172 }, { "epoch": 0.6837944664031621, "grad_norm": 2.5702693462371826, "learning_rate": 4.6765873015873016e-05, "loss": 2.0057, "step": 173 }, { "epoch": 0.6877470355731226, "grad_norm": 1.952614188194275, "learning_rate": 4.674603174603175e-05, "loss": 2.3162, "step": 174 }, { "epoch": 0.691699604743083, "grad_norm": 2.334252119064331, "learning_rate": 4.672619047619048e-05, "loss": 2.2679, "step": 175 }, { "epoch": 0.6956521739130435, "grad_norm": 2.064568519592285, "learning_rate": 4.670634920634921e-05, "loss": 2.2213, "step": 176 }, { "epoch": 0.6996047430830039, "grad_norm": 1.8959503173828125, "learning_rate": 4.668650793650794e-05, "loss": 2.4685, "step": 177 }, { "epoch": 0.7035573122529645, "grad_norm": 2.5481746196746826, "learning_rate": 4.666666666666667e-05, "loss": 2.2007, "step": 178 }, { "epoch": 0.7075098814229249, "grad_norm": 1.9843651056289673, "learning_rate": 4.66468253968254e-05, "loss": 2.2955, "step": 179 }, { "epoch": 0.7114624505928854, "grad_norm": 1.9685429334640503, "learning_rate": 4.662698412698413e-05, "loss": 2.0954, "step": 180 }, { "epoch": 0.7154150197628458, "grad_norm": 2.216379165649414, "learning_rate": 4.660714285714286e-05, "loss": 2.3786, "step": 181 }, { "epoch": 0.7193675889328063, "grad_norm": 2.144599437713623, "learning_rate": 4.658730158730159e-05, "loss": 2.1757, "step": 182 }, { "epoch": 0.7233201581027668, "grad_norm": 1.829734206199646, "learning_rate": 4.656746031746032e-05, "loss": 2.2259, "step": 183 }, { "epoch": 0.7272727272727273, "grad_norm": 1.8877259492874146, "learning_rate": 4.6547619047619054e-05, "loss": 2.1431, "step": 184 }, { "epoch": 0.7312252964426877, "grad_norm": 1.9971317052841187, "learning_rate": 4.652777777777778e-05, "loss": 2.263, "step": 185 }, { "epoch": 0.7351778656126482, "grad_norm": 2.129324436187744, "learning_rate": 4.6507936507936515e-05, "loss": 2.1523, "step": 186 }, { "epoch": 0.7391304347826086, "grad_norm": 2.2705445289611816, "learning_rate": 4.648809523809524e-05, "loss": 2.0131, "step": 187 }, { "epoch": 0.7430830039525692, "grad_norm": 1.9856712818145752, "learning_rate": 4.646825396825397e-05, "loss": 1.8903, "step": 188 }, { "epoch": 0.7470355731225297, "grad_norm": 2.1891982555389404, "learning_rate": 4.64484126984127e-05, "loss": 2.2185, "step": 189 }, { "epoch": 0.7509881422924901, "grad_norm": 2.514817476272583, "learning_rate": 4.642857142857143e-05, "loss": 2.3529, "step": 190 }, { "epoch": 0.7549407114624506, "grad_norm": 1.9026365280151367, "learning_rate": 4.640873015873016e-05, "loss": 2.4147, "step": 191 }, { "epoch": 0.758893280632411, "grad_norm": 2.04667329788208, "learning_rate": 4.638888888888889e-05, "loss": 1.9511, "step": 192 }, { "epoch": 0.7628458498023716, "grad_norm": 1.8381803035736084, "learning_rate": 4.6369047619047624e-05, "loss": 2.1575, "step": 193 }, { "epoch": 0.766798418972332, "grad_norm": 2.05672550201416, "learning_rate": 4.634920634920635e-05, "loss": 2.0799, "step": 194 }, { "epoch": 0.7707509881422925, "grad_norm": 1.973151445388794, "learning_rate": 4.6329365079365085e-05, "loss": 2.2552, "step": 195 }, { "epoch": 0.7747035573122529, "grad_norm": 1.8159013986587524, "learning_rate": 4.630952380952381e-05, "loss": 1.8409, "step": 196 }, { "epoch": 0.7786561264822134, "grad_norm": 2.1178860664367676, "learning_rate": 4.6289682539682545e-05, "loss": 2.1587, "step": 197 }, { "epoch": 0.782608695652174, "grad_norm": 2.200260877609253, "learning_rate": 4.626984126984127e-05, "loss": 1.5963, "step": 198 }, { "epoch": 0.7865612648221344, "grad_norm": 2.7356910705566406, "learning_rate": 4.6250000000000006e-05, "loss": 1.7795, "step": 199 }, { "epoch": 0.7905138339920948, "grad_norm": 2.6005921363830566, "learning_rate": 4.623015873015873e-05, "loss": 2.07, "step": 200 }, { "epoch": 0.7944664031620553, "grad_norm": 2.155571937561035, "learning_rate": 4.621031746031747e-05, "loss": 1.9797, "step": 201 }, { "epoch": 0.7984189723320159, "grad_norm": 1.8139017820358276, "learning_rate": 4.6190476190476194e-05, "loss": 2.0915, "step": 202 }, { "epoch": 0.8023715415019763, "grad_norm": 2.1978001594543457, "learning_rate": 4.617063492063493e-05, "loss": 2.0828, "step": 203 }, { "epoch": 0.8063241106719368, "grad_norm": 2.4255006313323975, "learning_rate": 4.6150793650793655e-05, "loss": 1.8046, "step": 204 }, { "epoch": 0.8102766798418972, "grad_norm": 2.3868203163146973, "learning_rate": 4.613095238095239e-05, "loss": 2.0577, "step": 205 }, { "epoch": 0.8142292490118577, "grad_norm": 1.7391860485076904, "learning_rate": 4.6111111111111115e-05, "loss": 2.292, "step": 206 }, { "epoch": 0.8181818181818182, "grad_norm": 1.9303078651428223, "learning_rate": 4.609126984126984e-05, "loss": 1.8133, "step": 207 }, { "epoch": 0.8221343873517787, "grad_norm": 1.9964845180511475, "learning_rate": 4.607142857142857e-05, "loss": 2.1007, "step": 208 }, { "epoch": 0.8260869565217391, "grad_norm": 2.102794885635376, "learning_rate": 4.60515873015873e-05, "loss": 2.2607, "step": 209 }, { "epoch": 0.8300395256916996, "grad_norm": 2.2853715419769287, "learning_rate": 4.603174603174603e-05, "loss": 2.2519, "step": 210 }, { "epoch": 0.83399209486166, "grad_norm": 1.8562860488891602, "learning_rate": 4.6011904761904764e-05, "loss": 2.1676, "step": 211 }, { "epoch": 0.8379446640316206, "grad_norm": 2.448793649673462, "learning_rate": 4.599206349206349e-05, "loss": 2.1657, "step": 212 }, { "epoch": 0.841897233201581, "grad_norm": 1.8922572135925293, "learning_rate": 4.5972222222222225e-05, "loss": 1.7849, "step": 213 }, { "epoch": 0.8458498023715415, "grad_norm": 2.190263509750366, "learning_rate": 4.595238095238095e-05, "loss": 2.1807, "step": 214 }, { "epoch": 0.849802371541502, "grad_norm": 2.0355405807495117, "learning_rate": 4.5932539682539685e-05, "loss": 2.163, "step": 215 }, { "epoch": 0.8537549407114624, "grad_norm": 2.3606069087982178, "learning_rate": 4.591269841269841e-05, "loss": 2.3872, "step": 216 }, { "epoch": 0.857707509881423, "grad_norm": 2.091801166534424, "learning_rate": 4.5892857142857146e-05, "loss": 2.0303, "step": 217 }, { "epoch": 0.8616600790513834, "grad_norm": 2.5148980617523193, "learning_rate": 4.587301587301587e-05, "loss": 1.7546, "step": 218 }, { "epoch": 0.8656126482213439, "grad_norm": 2.172477960586548, "learning_rate": 4.58531746031746e-05, "loss": 2.0569, "step": 219 }, { "epoch": 0.8695652173913043, "grad_norm": 1.820509433746338, "learning_rate": 4.5833333333333334e-05, "loss": 2.2322, "step": 220 }, { "epoch": 0.8735177865612648, "grad_norm": 1.5621439218521118, "learning_rate": 4.581349206349206e-05, "loss": 2.0088, "step": 221 }, { "epoch": 0.8774703557312253, "grad_norm": 2.2147130966186523, "learning_rate": 4.5793650793650795e-05, "loss": 1.9092, "step": 222 }, { "epoch": 0.8814229249011858, "grad_norm": 2.103334426879883, "learning_rate": 4.577380952380952e-05, "loss": 1.9992, "step": 223 }, { "epoch": 0.8853754940711462, "grad_norm": 1.926761507987976, "learning_rate": 4.5753968253968255e-05, "loss": 1.8496, "step": 224 }, { "epoch": 0.8893280632411067, "grad_norm": 2.040013313293457, "learning_rate": 4.573412698412698e-05, "loss": 1.9573, "step": 225 }, { "epoch": 0.8932806324110671, "grad_norm": 1.969488263130188, "learning_rate": 4.5714285714285716e-05, "loss": 2.1783, "step": 226 }, { "epoch": 0.8972332015810277, "grad_norm": 2.0519626140594482, "learning_rate": 4.569444444444444e-05, "loss": 1.9827, "step": 227 }, { "epoch": 0.9011857707509882, "grad_norm": 2.2971911430358887, "learning_rate": 4.567460317460318e-05, "loss": 1.9498, "step": 228 }, { "epoch": 0.9051383399209486, "grad_norm": 2.0618982315063477, "learning_rate": 4.5654761904761904e-05, "loss": 1.8099, "step": 229 }, { "epoch": 0.9090909090909091, "grad_norm": 1.670811414718628, "learning_rate": 4.563492063492064e-05, "loss": 2.1808, "step": 230 }, { "epoch": 0.9130434782608695, "grad_norm": 3.3800909519195557, "learning_rate": 4.5615079365079365e-05, "loss": 1.458, "step": 231 }, { "epoch": 0.9169960474308301, "grad_norm": 2.106755495071411, "learning_rate": 4.55952380952381e-05, "loss": 2.1909, "step": 232 }, { "epoch": 0.9209486166007905, "grad_norm": 2.3558948040008545, "learning_rate": 4.5575396825396825e-05, "loss": 2.1529, "step": 233 }, { "epoch": 0.924901185770751, "grad_norm": 2.1475250720977783, "learning_rate": 4.555555555555556e-05, "loss": 2.0272, "step": 234 }, { "epoch": 0.9288537549407114, "grad_norm": 2.1931400299072266, "learning_rate": 4.5535714285714286e-05, "loss": 2.0567, "step": 235 }, { "epoch": 0.932806324110672, "grad_norm": 2.34619140625, "learning_rate": 4.551587301587302e-05, "loss": 2.36, "step": 236 }, { "epoch": 0.9367588932806324, "grad_norm": 1.8363399505615234, "learning_rate": 4.549603174603175e-05, "loss": 2.2903, "step": 237 }, { "epoch": 0.9407114624505929, "grad_norm": 2.0586445331573486, "learning_rate": 4.547619047619048e-05, "loss": 1.8766, "step": 238 }, { "epoch": 0.9446640316205533, "grad_norm": 1.7671842575073242, "learning_rate": 4.545634920634921e-05, "loss": 1.7877, "step": 239 }, { "epoch": 0.9486166007905138, "grad_norm": 2.8485286235809326, "learning_rate": 4.543650793650794e-05, "loss": 1.9066, "step": 240 }, { "epoch": 0.9525691699604744, "grad_norm": 2.1801576614379883, "learning_rate": 4.541666666666667e-05, "loss": 2.2792, "step": 241 }, { "epoch": 0.9565217391304348, "grad_norm": 1.9892218112945557, "learning_rate": 4.5396825396825395e-05, "loss": 2.1138, "step": 242 }, { "epoch": 0.9604743083003953, "grad_norm": 2.0810766220092773, "learning_rate": 4.537698412698413e-05, "loss": 1.8441, "step": 243 }, { "epoch": 0.9644268774703557, "grad_norm": 2.243373155593872, "learning_rate": 4.5357142857142856e-05, "loss": 2.0694, "step": 244 }, { "epoch": 0.9683794466403162, "grad_norm": 2.5647904872894287, "learning_rate": 4.533730158730159e-05, "loss": 1.861, "step": 245 }, { "epoch": 0.9723320158102767, "grad_norm": 1.625081181526184, "learning_rate": 4.531746031746032e-05, "loss": 2.1496, "step": 246 }, { "epoch": 0.9762845849802372, "grad_norm": 1.7020546197891235, "learning_rate": 4.529761904761905e-05, "loss": 1.9332, "step": 247 }, { "epoch": 0.9802371541501976, "grad_norm": 1.802681565284729, "learning_rate": 4.527777777777778e-05, "loss": 1.8204, "step": 248 }, { "epoch": 0.9841897233201581, "grad_norm": 1.7810888290405273, "learning_rate": 4.525793650793651e-05, "loss": 1.8915, "step": 249 }, { "epoch": 0.9881422924901185, "grad_norm": 1.6715744733810425, "learning_rate": 4.523809523809524e-05, "loss": 2.0593, "step": 250 }, { "epoch": 0.9920948616600791, "grad_norm": 2.5212719440460205, "learning_rate": 4.521825396825397e-05, "loss": 2.0725, "step": 251 }, { "epoch": 0.9960474308300395, "grad_norm": 2.189178943634033, "learning_rate": 4.51984126984127e-05, "loss": 2.0336, "step": 252 }, { "epoch": 1.0, "grad_norm": 2.2733006477355957, "learning_rate": 4.517857142857143e-05, "loss": 1.9606, "step": 253 }, { "epoch": 1.0039525691699605, "grad_norm": 1.8183317184448242, "learning_rate": 4.515873015873016e-05, "loss": 1.9106, "step": 254 }, { "epoch": 1.007905138339921, "grad_norm": 2.471332550048828, "learning_rate": 4.5138888888888894e-05, "loss": 1.9963, "step": 255 }, { "epoch": 1.0118577075098814, "grad_norm": 1.8422023057937622, "learning_rate": 4.511904761904762e-05, "loss": 2.0978, "step": 256 }, { "epoch": 1.0158102766798418, "grad_norm": 1.79561185836792, "learning_rate": 4.5099206349206354e-05, "loss": 1.4994, "step": 257 }, { "epoch": 1.0197628458498025, "grad_norm": 2.271358013153076, "learning_rate": 4.507936507936508e-05, "loss": 1.7649, "step": 258 }, { "epoch": 1.023715415019763, "grad_norm": 2.6813673973083496, "learning_rate": 4.5059523809523815e-05, "loss": 1.8252, "step": 259 }, { "epoch": 1.0276679841897234, "grad_norm": 1.8112016916275024, "learning_rate": 4.503968253968254e-05, "loss": 1.7883, "step": 260 }, { "epoch": 1.0316205533596838, "grad_norm": 2.4050188064575195, "learning_rate": 4.5019841269841276e-05, "loss": 1.9855, "step": 261 }, { "epoch": 1.0355731225296443, "grad_norm": 2.38291072845459, "learning_rate": 4.5e-05, "loss": 1.8171, "step": 262 }, { "epoch": 1.0395256916996047, "grad_norm": 2.221317768096924, "learning_rate": 4.4980158730158737e-05, "loss": 1.5565, "step": 263 }, { "epoch": 1.0434782608695652, "grad_norm": 2.3104920387268066, "learning_rate": 4.4960317460317464e-05, "loss": 1.926, "step": 264 }, { "epoch": 1.0474308300395256, "grad_norm": 2.0159404277801514, "learning_rate": 4.494047619047619e-05, "loss": 1.6123, "step": 265 }, { "epoch": 1.051383399209486, "grad_norm": 2.0315968990325928, "learning_rate": 4.4920634920634924e-05, "loss": 1.9577, "step": 266 }, { "epoch": 1.0553359683794465, "grad_norm": 2.021657943725586, "learning_rate": 4.490079365079365e-05, "loss": 1.7845, "step": 267 }, { "epoch": 1.0592885375494072, "grad_norm": 2.1020936965942383, "learning_rate": 4.4880952380952385e-05, "loss": 1.8233, "step": 268 }, { "epoch": 1.0632411067193677, "grad_norm": 2.0355613231658936, "learning_rate": 4.486111111111111e-05, "loss": 1.9241, "step": 269 }, { "epoch": 1.0671936758893281, "grad_norm": 1.797027349472046, "learning_rate": 4.4841269841269846e-05, "loss": 1.7056, "step": 270 }, { "epoch": 1.0711462450592886, "grad_norm": 2.725644826889038, "learning_rate": 4.482142857142857e-05, "loss": 1.4221, "step": 271 }, { "epoch": 1.075098814229249, "grad_norm": 2.5590972900390625, "learning_rate": 4.4801587301587307e-05, "loss": 1.7194, "step": 272 }, { "epoch": 1.0790513833992095, "grad_norm": 2.4488916397094727, "learning_rate": 4.4781746031746034e-05, "loss": 1.3487, "step": 273 }, { "epoch": 1.08300395256917, "grad_norm": 2.245779037475586, "learning_rate": 4.476190476190477e-05, "loss": 1.6895, "step": 274 }, { "epoch": 1.0869565217391304, "grad_norm": 2.3138208389282227, "learning_rate": 4.4742063492063494e-05, "loss": 2.0797, "step": 275 }, { "epoch": 1.0909090909090908, "grad_norm": 2.106358528137207, "learning_rate": 4.472222222222223e-05, "loss": 1.7057, "step": 276 }, { "epoch": 1.0948616600790513, "grad_norm": 2.0810418128967285, "learning_rate": 4.4702380952380955e-05, "loss": 1.6139, "step": 277 }, { "epoch": 1.098814229249012, "grad_norm": 2.3179874420166016, "learning_rate": 4.468253968253969e-05, "loss": 1.7748, "step": 278 }, { "epoch": 1.1027667984189724, "grad_norm": 2.4948816299438477, "learning_rate": 4.4662698412698416e-05, "loss": 2.0072, "step": 279 }, { "epoch": 1.1067193675889329, "grad_norm": 2.1853582859039307, "learning_rate": 4.464285714285715e-05, "loss": 2.017, "step": 280 }, { "epoch": 1.1106719367588933, "grad_norm": 2.066575288772583, "learning_rate": 4.4623015873015877e-05, "loss": 1.7979, "step": 281 }, { "epoch": 1.1146245059288538, "grad_norm": 2.454230785369873, "learning_rate": 4.460317460317461e-05, "loss": 1.6153, "step": 282 }, { "epoch": 1.1185770750988142, "grad_norm": 2.088303804397583, "learning_rate": 4.458333333333334e-05, "loss": 1.7366, "step": 283 }, { "epoch": 1.1225296442687747, "grad_norm": 2.0203330516815186, "learning_rate": 4.456349206349207e-05, "loss": 1.8878, "step": 284 }, { "epoch": 1.1264822134387351, "grad_norm": 1.8328367471694946, "learning_rate": 4.45436507936508e-05, "loss": 1.4853, "step": 285 }, { "epoch": 1.1304347826086956, "grad_norm": 2.1855709552764893, "learning_rate": 4.4523809523809525e-05, "loss": 2.1256, "step": 286 }, { "epoch": 1.1343873517786562, "grad_norm": 2.0757203102111816, "learning_rate": 4.450396825396825e-05, "loss": 2.1395, "step": 287 }, { "epoch": 1.1383399209486167, "grad_norm": 2.217658519744873, "learning_rate": 4.4484126984126986e-05, "loss": 1.6217, "step": 288 }, { "epoch": 1.1422924901185771, "grad_norm": 2.346212387084961, "learning_rate": 4.446428571428571e-05, "loss": 1.6561, "step": 289 }, { "epoch": 1.1462450592885376, "grad_norm": 2.340932607650757, "learning_rate": 4.4444444444444447e-05, "loss": 1.5699, "step": 290 }, { "epoch": 1.150197628458498, "grad_norm": 2.2137064933776855, "learning_rate": 4.4424603174603174e-05, "loss": 1.9642, "step": 291 }, { "epoch": 1.1541501976284585, "grad_norm": 1.6653351783752441, "learning_rate": 4.440476190476191e-05, "loss": 1.8973, "step": 292 }, { "epoch": 1.158102766798419, "grad_norm": 2.238419771194458, "learning_rate": 4.4384920634920634e-05, "loss": 1.7091, "step": 293 }, { "epoch": 1.1620553359683794, "grad_norm": 1.8844563961029053, "learning_rate": 4.436507936507937e-05, "loss": 1.8355, "step": 294 }, { "epoch": 1.1660079051383399, "grad_norm": 1.5482860803604126, "learning_rate": 4.4345238095238095e-05, "loss": 1.8152, "step": 295 }, { "epoch": 1.1699604743083003, "grad_norm": 2.046618700027466, "learning_rate": 4.432539682539683e-05, "loss": 1.5727, "step": 296 }, { "epoch": 1.1739130434782608, "grad_norm": 1.9557174444198608, "learning_rate": 4.4305555555555556e-05, "loss": 1.2086, "step": 297 }, { "epoch": 1.1778656126482214, "grad_norm": 2.5918216705322266, "learning_rate": 4.428571428571428e-05, "loss": 1.8132, "step": 298 }, { "epoch": 1.1818181818181819, "grad_norm": 2.1008517742156982, "learning_rate": 4.4265873015873017e-05, "loss": 1.7421, "step": 299 }, { "epoch": 1.1857707509881423, "grad_norm": 2.3811569213867188, "learning_rate": 4.4246031746031744e-05, "loss": 1.9358, "step": 300 }, { "epoch": 1.1897233201581028, "grad_norm": 2.317112922668457, "learning_rate": 4.422619047619048e-05, "loss": 1.8341, "step": 301 }, { "epoch": 1.1936758893280632, "grad_norm": 1.8912357091903687, "learning_rate": 4.4206349206349204e-05, "loss": 1.7062, "step": 302 }, { "epoch": 1.1976284584980237, "grad_norm": 1.7493529319763184, "learning_rate": 4.418650793650794e-05, "loss": 1.9519, "step": 303 }, { "epoch": 1.2015810276679841, "grad_norm": 2.0161383152008057, "learning_rate": 4.4166666666666665e-05, "loss": 1.6137, "step": 304 }, { "epoch": 1.2055335968379446, "grad_norm": 1.7949028015136719, "learning_rate": 4.41468253968254e-05, "loss": 1.7514, "step": 305 }, { "epoch": 1.2094861660079053, "grad_norm": 2.1296987533569336, "learning_rate": 4.4126984126984126e-05, "loss": 1.6876, "step": 306 }, { "epoch": 1.2134387351778657, "grad_norm": 2.3091883659362793, "learning_rate": 4.410714285714286e-05, "loss": 1.9752, "step": 307 }, { "epoch": 1.2173913043478262, "grad_norm": 2.804625988006592, "learning_rate": 4.4087301587301587e-05, "loss": 1.7697, "step": 308 }, { "epoch": 1.2213438735177866, "grad_norm": 1.7799369096755981, "learning_rate": 4.406746031746032e-05, "loss": 1.8301, "step": 309 }, { "epoch": 1.225296442687747, "grad_norm": 2.0240299701690674, "learning_rate": 4.404761904761905e-05, "loss": 1.6604, "step": 310 }, { "epoch": 1.2292490118577075, "grad_norm": 2.3005123138427734, "learning_rate": 4.402777777777778e-05, "loss": 1.8875, "step": 311 }, { "epoch": 1.233201581027668, "grad_norm": 2.1108436584472656, "learning_rate": 4.400793650793651e-05, "loss": 1.8508, "step": 312 }, { "epoch": 1.2371541501976284, "grad_norm": 2.7670435905456543, "learning_rate": 4.398809523809524e-05, "loss": 1.8427, "step": 313 }, { "epoch": 1.2411067193675889, "grad_norm": 2.1103644371032715, "learning_rate": 4.396825396825397e-05, "loss": 1.5623, "step": 314 }, { "epoch": 1.2450592885375493, "grad_norm": 2.517120838165283, "learning_rate": 4.39484126984127e-05, "loss": 1.688, "step": 315 }, { "epoch": 1.2490118577075098, "grad_norm": 1.799034595489502, "learning_rate": 4.392857142857143e-05, "loss": 1.7609, "step": 316 }, { "epoch": 1.2529644268774702, "grad_norm": 2.450300455093384, "learning_rate": 4.390873015873016e-05, "loss": 1.6296, "step": 317 }, { "epoch": 1.256916996047431, "grad_norm": 2.148935317993164, "learning_rate": 4.388888888888889e-05, "loss": 1.849, "step": 318 }, { "epoch": 1.2608695652173914, "grad_norm": 2.3157055377960205, "learning_rate": 4.3869047619047624e-05, "loss": 1.62, "step": 319 }, { "epoch": 1.2648221343873518, "grad_norm": 1.6752326488494873, "learning_rate": 4.384920634920635e-05, "loss": 1.594, "step": 320 }, { "epoch": 1.2687747035573123, "grad_norm": 2.3844189643859863, "learning_rate": 4.382936507936508e-05, "loss": 2.033, "step": 321 }, { "epoch": 1.2727272727272727, "grad_norm": 2.3986222743988037, "learning_rate": 4.380952380952381e-05, "loss": 1.4159, "step": 322 }, { "epoch": 1.2766798418972332, "grad_norm": 1.919364094734192, "learning_rate": 4.378968253968254e-05, "loss": 1.9236, "step": 323 }, { "epoch": 1.2806324110671936, "grad_norm": 2.3783788681030273, "learning_rate": 4.376984126984127e-05, "loss": 1.8965, "step": 324 }, { "epoch": 1.2845849802371543, "grad_norm": 2.122201919555664, "learning_rate": 4.375e-05, "loss": 1.7124, "step": 325 }, { "epoch": 1.2885375494071147, "grad_norm": 1.829590082168579, "learning_rate": 4.373015873015873e-05, "loss": 1.7605, "step": 326 }, { "epoch": 1.2924901185770752, "grad_norm": 1.9862433671951294, "learning_rate": 4.371031746031746e-05, "loss": 1.6903, "step": 327 }, { "epoch": 1.2964426877470356, "grad_norm": 2.1671435832977295, "learning_rate": 4.3690476190476194e-05, "loss": 1.9038, "step": 328 }, { "epoch": 1.300395256916996, "grad_norm": 2.0836524963378906, "learning_rate": 4.367063492063492e-05, "loss": 1.8675, "step": 329 }, { "epoch": 1.3043478260869565, "grad_norm": 2.2062249183654785, "learning_rate": 4.3650793650793655e-05, "loss": 1.7196, "step": 330 }, { "epoch": 1.308300395256917, "grad_norm": 2.018446683883667, "learning_rate": 4.363095238095238e-05, "loss": 1.6995, "step": 331 }, { "epoch": 1.3122529644268774, "grad_norm": 2.521754026412964, "learning_rate": 4.3611111111111116e-05, "loss": 1.649, "step": 332 }, { "epoch": 1.316205533596838, "grad_norm": 2.223747968673706, "learning_rate": 4.359126984126984e-05, "loss": 1.5682, "step": 333 }, { "epoch": 1.3201581027667983, "grad_norm": 2.356834650039673, "learning_rate": 4.3571428571428576e-05, "loss": 1.7477, "step": 334 }, { "epoch": 1.3241106719367588, "grad_norm": 1.9272010326385498, "learning_rate": 4.35515873015873e-05, "loss": 1.8585, "step": 335 }, { "epoch": 1.3280632411067192, "grad_norm": 2.0459604263305664, "learning_rate": 4.353174603174604e-05, "loss": 1.5119, "step": 336 }, { "epoch": 1.33201581027668, "grad_norm": 1.9021974802017212, "learning_rate": 4.3511904761904764e-05, "loss": 1.9379, "step": 337 }, { "epoch": 1.3359683794466404, "grad_norm": 2.5137360095977783, "learning_rate": 4.34920634920635e-05, "loss": 1.8224, "step": 338 }, { "epoch": 1.3399209486166008, "grad_norm": 1.5489245653152466, "learning_rate": 4.3472222222222225e-05, "loss": 1.8488, "step": 339 }, { "epoch": 1.3438735177865613, "grad_norm": 1.811347484588623, "learning_rate": 4.345238095238096e-05, "loss": 1.5281, "step": 340 }, { "epoch": 1.3478260869565217, "grad_norm": 2.5799477100372314, "learning_rate": 4.3432539682539686e-05, "loss": 1.7175, "step": 341 }, { "epoch": 1.3517786561264822, "grad_norm": 2.4223809242248535, "learning_rate": 4.341269841269842e-05, "loss": 1.6518, "step": 342 }, { "epoch": 1.3557312252964426, "grad_norm": 2.713998556137085, "learning_rate": 4.3392857142857146e-05, "loss": 1.6456, "step": 343 }, { "epoch": 1.359683794466403, "grad_norm": 2.6156938076019287, "learning_rate": 4.337301587301587e-05, "loss": 1.6932, "step": 344 }, { "epoch": 1.3636363636363638, "grad_norm": 2.451413154602051, "learning_rate": 4.335317460317461e-05, "loss": 1.7987, "step": 345 }, { "epoch": 1.3675889328063242, "grad_norm": 1.895689845085144, "learning_rate": 4.3333333333333334e-05, "loss": 1.7549, "step": 346 }, { "epoch": 1.3715415019762847, "grad_norm": 2.4354727268218994, "learning_rate": 4.331349206349207e-05, "loss": 1.5661, "step": 347 }, { "epoch": 1.3754940711462451, "grad_norm": 2.5790798664093018, "learning_rate": 4.3293650793650795e-05, "loss": 1.89, "step": 348 }, { "epoch": 1.3794466403162056, "grad_norm": 1.9985342025756836, "learning_rate": 4.327380952380953e-05, "loss": 1.5705, "step": 349 }, { "epoch": 1.383399209486166, "grad_norm": 1.87380850315094, "learning_rate": 4.3253968253968256e-05, "loss": 1.8884, "step": 350 }, { "epoch": 1.3873517786561265, "grad_norm": 2.4583888053894043, "learning_rate": 4.323412698412699e-05, "loss": 1.51, "step": 351 }, { "epoch": 1.391304347826087, "grad_norm": 2.0746865272521973, "learning_rate": 4.3214285714285716e-05, "loss": 1.688, "step": 352 }, { "epoch": 1.3952569169960474, "grad_norm": 2.3721249103546143, "learning_rate": 4.319444444444445e-05, "loss": 1.2734, "step": 353 }, { "epoch": 1.3992094861660078, "grad_norm": 2.44508695602417, "learning_rate": 4.317460317460318e-05, "loss": 1.8527, "step": 354 }, { "epoch": 1.4031620553359683, "grad_norm": 2.2311763763427734, "learning_rate": 4.315476190476191e-05, "loss": 1.6745, "step": 355 }, { "epoch": 1.4071146245059287, "grad_norm": 1.867505431175232, "learning_rate": 4.313492063492064e-05, "loss": 1.9571, "step": 356 }, { "epoch": 1.4110671936758894, "grad_norm": 2.2237112522125244, "learning_rate": 4.311507936507937e-05, "loss": 2.133, "step": 357 }, { "epoch": 1.4150197628458498, "grad_norm": 1.834166169166565, "learning_rate": 4.30952380952381e-05, "loss": 2.0071, "step": 358 }, { "epoch": 1.4189723320158103, "grad_norm": 1.9390510320663452, "learning_rate": 4.307539682539683e-05, "loss": 1.5187, "step": 359 }, { "epoch": 1.4229249011857708, "grad_norm": 2.7471730709075928, "learning_rate": 4.305555555555556e-05, "loss": 1.7342, "step": 360 }, { "epoch": 1.4268774703557312, "grad_norm": 1.8465523719787598, "learning_rate": 4.303571428571429e-05, "loss": 1.9512, "step": 361 }, { "epoch": 1.4308300395256917, "grad_norm": 2.100477933883667, "learning_rate": 4.301587301587302e-05, "loss": 1.6373, "step": 362 }, { "epoch": 1.434782608695652, "grad_norm": 2.298740863800049, "learning_rate": 4.2996031746031754e-05, "loss": 1.7411, "step": 363 }, { "epoch": 1.4387351778656128, "grad_norm": 1.8366012573242188, "learning_rate": 4.297619047619048e-05, "loss": 1.89, "step": 364 }, { "epoch": 1.4426877470355732, "grad_norm": 2.246143341064453, "learning_rate": 4.295634920634921e-05, "loss": 1.6918, "step": 365 }, { "epoch": 1.4466403162055337, "grad_norm": 1.8526705503463745, "learning_rate": 4.2936507936507935e-05, "loss": 1.6653, "step": 366 }, { "epoch": 1.4505928853754941, "grad_norm": 2.273653984069824, "learning_rate": 4.291666666666667e-05, "loss": 2.0948, "step": 367 }, { "epoch": 1.4545454545454546, "grad_norm": 2.1943068504333496, "learning_rate": 4.2896825396825396e-05, "loss": 1.6798, "step": 368 }, { "epoch": 1.458498023715415, "grad_norm": 1.9659279584884644, "learning_rate": 4.287698412698413e-05, "loss": 1.9008, "step": 369 }, { "epoch": 1.4624505928853755, "grad_norm": 1.792734980583191, "learning_rate": 4.2857142857142856e-05, "loss": 1.6039, "step": 370 }, { "epoch": 1.466403162055336, "grad_norm": 2.0952234268188477, "learning_rate": 4.283730158730159e-05, "loss": 1.5815, "step": 371 }, { "epoch": 1.4703557312252964, "grad_norm": 1.9501980543136597, "learning_rate": 4.281746031746032e-05, "loss": 1.5774, "step": 372 }, { "epoch": 1.4743083003952568, "grad_norm": 1.960719347000122, "learning_rate": 4.279761904761905e-05, "loss": 1.5171, "step": 373 }, { "epoch": 1.4782608695652173, "grad_norm": 2.022639751434326, "learning_rate": 4.277777777777778e-05, "loss": 1.8212, "step": 374 }, { "epoch": 1.4822134387351777, "grad_norm": 1.9371771812438965, "learning_rate": 4.2757936507936505e-05, "loss": 1.6054, "step": 375 }, { "epoch": 1.4861660079051384, "grad_norm": 2.072070598602295, "learning_rate": 4.273809523809524e-05, "loss": 1.6589, "step": 376 }, { "epoch": 1.4901185770750989, "grad_norm": 1.8878108263015747, "learning_rate": 4.2718253968253966e-05, "loss": 1.9921, "step": 377 }, { "epoch": 1.4940711462450593, "grad_norm": 1.9379626512527466, "learning_rate": 4.26984126984127e-05, "loss": 1.8345, "step": 378 }, { "epoch": 1.4980237154150198, "grad_norm": 2.775383234024048, "learning_rate": 4.2678571428571426e-05, "loss": 1.8299, "step": 379 }, { "epoch": 1.5019762845849802, "grad_norm": 2.041731119155884, "learning_rate": 4.265873015873016e-05, "loss": 1.4022, "step": 380 }, { "epoch": 1.5059288537549407, "grad_norm": 2.0958447456359863, "learning_rate": 4.263888888888889e-05, "loss": 1.8758, "step": 381 }, { "epoch": 1.5098814229249014, "grad_norm": 2.6933484077453613, "learning_rate": 4.261904761904762e-05, "loss": 2.0389, "step": 382 }, { "epoch": 1.5138339920948618, "grad_norm": 2.594853401184082, "learning_rate": 4.259920634920635e-05, "loss": 1.5752, "step": 383 }, { "epoch": 1.5177865612648223, "grad_norm": 2.6426239013671875, "learning_rate": 4.257936507936508e-05, "loss": 1.9273, "step": 384 }, { "epoch": 1.5217391304347827, "grad_norm": 1.837033748626709, "learning_rate": 4.255952380952381e-05, "loss": 1.774, "step": 385 }, { "epoch": 1.5256916996047432, "grad_norm": 2.4687540531158447, "learning_rate": 4.253968253968254e-05, "loss": 2.3856, "step": 386 }, { "epoch": 1.5296442687747036, "grad_norm": 1.7977819442749023, "learning_rate": 4.251984126984127e-05, "loss": 1.6884, "step": 387 }, { "epoch": 1.533596837944664, "grad_norm": 2.1651384830474854, "learning_rate": 4.25e-05, "loss": 2.0945, "step": 388 }, { "epoch": 1.5375494071146245, "grad_norm": 1.796292781829834, "learning_rate": 4.248015873015873e-05, "loss": 1.7548, "step": 389 }, { "epoch": 1.541501976284585, "grad_norm": 3.0309319496154785, "learning_rate": 4.2460317460317464e-05, "loss": 1.9628, "step": 390 }, { "epoch": 1.5454545454545454, "grad_norm": 1.8279919624328613, "learning_rate": 4.244047619047619e-05, "loss": 1.9005, "step": 391 }, { "epoch": 1.5494071146245059, "grad_norm": 1.9936280250549316, "learning_rate": 4.2420634920634925e-05, "loss": 1.7502, "step": 392 }, { "epoch": 1.5533596837944663, "grad_norm": 1.728090763092041, "learning_rate": 4.240079365079365e-05, "loss": 1.8107, "step": 393 }, { "epoch": 1.5573122529644268, "grad_norm": 1.614793300628662, "learning_rate": 4.2380952380952385e-05, "loss": 1.6821, "step": 394 }, { "epoch": 1.5612648221343872, "grad_norm": 2.016058921813965, "learning_rate": 4.236111111111111e-05, "loss": 1.6821, "step": 395 }, { "epoch": 1.5652173913043477, "grad_norm": 2.421929359436035, "learning_rate": 4.2341269841269846e-05, "loss": 2.0802, "step": 396 }, { "epoch": 1.5691699604743083, "grad_norm": 1.9785467386245728, "learning_rate": 4.232142857142857e-05, "loss": 1.7954, "step": 397 }, { "epoch": 1.5731225296442688, "grad_norm": 2.4265410900115967, "learning_rate": 4.23015873015873e-05, "loss": 1.6911, "step": 398 }, { "epoch": 1.5770750988142292, "grad_norm": 2.7018771171569824, "learning_rate": 4.2281746031746034e-05, "loss": 1.7161, "step": 399 }, { "epoch": 1.5810276679841897, "grad_norm": 1.9163415431976318, "learning_rate": 4.226190476190476e-05, "loss": 1.6967, "step": 400 }, { "epoch": 1.5849802371541502, "grad_norm": 1.7845193147659302, "learning_rate": 4.2242063492063495e-05, "loss": 2.0456, "step": 401 }, { "epoch": 1.5889328063241108, "grad_norm": 2.364121675491333, "learning_rate": 4.222222222222222e-05, "loss": 1.8798, "step": 402 }, { "epoch": 1.5928853754940713, "grad_norm": 1.998205542564392, "learning_rate": 4.2202380952380955e-05, "loss": 1.6054, "step": 403 }, { "epoch": 1.5968379446640317, "grad_norm": 2.847519874572754, "learning_rate": 4.218253968253968e-05, "loss": 1.723, "step": 404 }, { "epoch": 1.6007905138339922, "grad_norm": 2.2237510681152344, "learning_rate": 4.2162698412698416e-05, "loss": 1.5771, "step": 405 }, { "epoch": 1.6047430830039526, "grad_norm": 2.4652063846588135, "learning_rate": 4.214285714285714e-05, "loss": 1.9165, "step": 406 }, { "epoch": 1.608695652173913, "grad_norm": 1.9992256164550781, "learning_rate": 4.212301587301588e-05, "loss": 1.9398, "step": 407 }, { "epoch": 1.6126482213438735, "grad_norm": 1.7840253114700317, "learning_rate": 4.2103174603174604e-05, "loss": 1.623, "step": 408 }, { "epoch": 1.616600790513834, "grad_norm": 1.5577056407928467, "learning_rate": 4.208333333333334e-05, "loss": 1.6815, "step": 409 }, { "epoch": 1.6205533596837944, "grad_norm": 2.6008100509643555, "learning_rate": 4.2063492063492065e-05, "loss": 1.6165, "step": 410 }, { "epoch": 1.6245059288537549, "grad_norm": 1.9897792339324951, "learning_rate": 4.20436507936508e-05, "loss": 1.6959, "step": 411 }, { "epoch": 1.6284584980237153, "grad_norm": 1.9050272703170776, "learning_rate": 4.2023809523809525e-05, "loss": 1.9483, "step": 412 }, { "epoch": 1.6324110671936758, "grad_norm": 2.1082065105438232, "learning_rate": 4.200396825396826e-05, "loss": 1.6326, "step": 413 }, { "epoch": 1.6363636363636362, "grad_norm": 1.8587292432785034, "learning_rate": 4.1984126984126986e-05, "loss": 1.5015, "step": 414 }, { "epoch": 1.6403162055335967, "grad_norm": 2.440154552459717, "learning_rate": 4.196428571428572e-05, "loss": 1.8917, "step": 415 }, { "epoch": 1.6442687747035574, "grad_norm": 2.1441152095794678, "learning_rate": 4.194444444444445e-05, "loss": 1.8155, "step": 416 }, { "epoch": 1.6482213438735178, "grad_norm": 1.9019662141799927, "learning_rate": 4.192460317460318e-05, "loss": 1.9238, "step": 417 }, { "epoch": 1.6521739130434783, "grad_norm": 1.8844928741455078, "learning_rate": 4.190476190476191e-05, "loss": 1.6799, "step": 418 }, { "epoch": 1.6561264822134387, "grad_norm": 2.373283624649048, "learning_rate": 4.188492063492064e-05, "loss": 1.6916, "step": 419 }, { "epoch": 1.6600790513833992, "grad_norm": 2.1077182292938232, "learning_rate": 4.186507936507937e-05, "loss": 1.924, "step": 420 }, { "epoch": 1.6640316205533598, "grad_norm": 2.5388450622558594, "learning_rate": 4.1845238095238095e-05, "loss": 1.7846, "step": 421 }, { "epoch": 1.6679841897233203, "grad_norm": 2.8166000843048096, "learning_rate": 4.182539682539683e-05, "loss": 1.5143, "step": 422 }, { "epoch": 1.6719367588932808, "grad_norm": 1.919756531715393, "learning_rate": 4.1805555555555556e-05, "loss": 1.7846, "step": 423 }, { "epoch": 1.6758893280632412, "grad_norm": 2.265761375427246, "learning_rate": 4.178571428571429e-05, "loss": 1.2774, "step": 424 }, { "epoch": 1.6798418972332017, "grad_norm": 2.4432921409606934, "learning_rate": 4.176587301587302e-05, "loss": 1.637, "step": 425 }, { "epoch": 1.683794466403162, "grad_norm": 2.095233678817749, "learning_rate": 4.174603174603175e-05, "loss": 1.4832, "step": 426 }, { "epoch": 1.6877470355731226, "grad_norm": 2.602231740951538, "learning_rate": 4.172619047619048e-05, "loss": 1.5574, "step": 427 }, { "epoch": 1.691699604743083, "grad_norm": 1.9906206130981445, "learning_rate": 4.170634920634921e-05, "loss": 1.5909, "step": 428 }, { "epoch": 1.6956521739130435, "grad_norm": 2.3297181129455566, "learning_rate": 4.168650793650794e-05, "loss": 1.7061, "step": 429 }, { "epoch": 1.699604743083004, "grad_norm": 2.3630523681640625, "learning_rate": 4.166666666666667e-05, "loss": 1.6981, "step": 430 }, { "epoch": 1.7035573122529644, "grad_norm": 1.9802945852279663, "learning_rate": 4.16468253968254e-05, "loss": 1.7784, "step": 431 }, { "epoch": 1.7075098814229248, "grad_norm": 2.8566882610321045, "learning_rate": 4.162698412698413e-05, "loss": 1.5392, "step": 432 }, { "epoch": 1.7114624505928853, "grad_norm": 2.2854273319244385, "learning_rate": 4.160714285714286e-05, "loss": 1.7431, "step": 433 }, { "epoch": 1.7154150197628457, "grad_norm": 1.9989619255065918, "learning_rate": 4.1587301587301594e-05, "loss": 1.8095, "step": 434 }, { "epoch": 1.7193675889328062, "grad_norm": 2.6097142696380615, "learning_rate": 4.156746031746032e-05, "loss": 1.464, "step": 435 }, { "epoch": 1.7233201581027668, "grad_norm": 2.048234462738037, "learning_rate": 4.1547619047619054e-05, "loss": 2.0557, "step": 436 }, { "epoch": 1.7272727272727273, "grad_norm": 2.046173334121704, "learning_rate": 4.152777777777778e-05, "loss": 1.4132, "step": 437 }, { "epoch": 1.7312252964426877, "grad_norm": 2.1724562644958496, "learning_rate": 4.1507936507936515e-05, "loss": 1.6773, "step": 438 }, { "epoch": 1.7351778656126482, "grad_norm": 2.2624452114105225, "learning_rate": 4.148809523809524e-05, "loss": 1.6013, "step": 439 }, { "epoch": 1.7391304347826086, "grad_norm": 2.2074332237243652, "learning_rate": 4.1468253968253976e-05, "loss": 1.7686, "step": 440 }, { "epoch": 1.7430830039525693, "grad_norm": 2.1703858375549316, "learning_rate": 4.14484126984127e-05, "loss": 1.5108, "step": 441 }, { "epoch": 1.7470355731225298, "grad_norm": 1.9463564157485962, "learning_rate": 4.1428571428571437e-05, "loss": 1.9879, "step": 442 }, { "epoch": 1.7509881422924902, "grad_norm": 2.4043517112731934, "learning_rate": 4.1408730158730164e-05, "loss": 1.6002, "step": 443 }, { "epoch": 1.7549407114624507, "grad_norm": 1.906467080116272, "learning_rate": 4.138888888888889e-05, "loss": 1.8149, "step": 444 }, { "epoch": 1.7588932806324111, "grad_norm": 2.612128257751465, "learning_rate": 4.136904761904762e-05, "loss": 1.8805, "step": 445 }, { "epoch": 1.7628458498023716, "grad_norm": 2.056048631668091, "learning_rate": 4.134920634920635e-05, "loss": 1.6064, "step": 446 }, { "epoch": 1.766798418972332, "grad_norm": 1.9283335208892822, "learning_rate": 4.132936507936508e-05, "loss": 2.2151, "step": 447 }, { "epoch": 1.7707509881422925, "grad_norm": 1.8714007139205933, "learning_rate": 4.130952380952381e-05, "loss": 1.6642, "step": 448 }, { "epoch": 1.774703557312253, "grad_norm": 2.0939621925354004, "learning_rate": 4.128968253968254e-05, "loss": 1.558, "step": 449 }, { "epoch": 1.7786561264822134, "grad_norm": 2.0754241943359375, "learning_rate": 4.126984126984127e-05, "loss": 1.6465, "step": 450 }, { "epoch": 1.7826086956521738, "grad_norm": 2.7905728816986084, "learning_rate": 4.125e-05, "loss": 1.7298, "step": 451 }, { "epoch": 1.7865612648221343, "grad_norm": 2.2117226123809814, "learning_rate": 4.123015873015873e-05, "loss": 1.5478, "step": 452 }, { "epoch": 1.7905138339920947, "grad_norm": 2.22251033782959, "learning_rate": 4.121031746031746e-05, "loss": 1.4483, "step": 453 }, { "epoch": 1.7944664031620552, "grad_norm": 2.636388063430786, "learning_rate": 4.119047619047619e-05, "loss": 1.3413, "step": 454 }, { "epoch": 1.7984189723320159, "grad_norm": 2.417893409729004, "learning_rate": 4.117063492063492e-05, "loss": 1.5273, "step": 455 }, { "epoch": 1.8023715415019763, "grad_norm": 1.960670828819275, "learning_rate": 4.115079365079365e-05, "loss": 1.8887, "step": 456 }, { "epoch": 1.8063241106719368, "grad_norm": 2.0714080333709717, "learning_rate": 4.113095238095238e-05, "loss": 1.9395, "step": 457 }, { "epoch": 1.8102766798418972, "grad_norm": 2.6367220878601074, "learning_rate": 4.111111111111111e-05, "loss": 1.475, "step": 458 }, { "epoch": 1.8142292490118577, "grad_norm": 2.8209869861602783, "learning_rate": 4.109126984126984e-05, "loss": 1.4326, "step": 459 }, { "epoch": 1.8181818181818183, "grad_norm": 1.884318470954895, "learning_rate": 4.107142857142857e-05, "loss": 1.5297, "step": 460 }, { "epoch": 1.8221343873517788, "grad_norm": 1.7990609407424927, "learning_rate": 4.1051587301587304e-05, "loss": 1.9113, "step": 461 }, { "epoch": 1.8260869565217392, "grad_norm": 1.8307974338531494, "learning_rate": 4.103174603174603e-05, "loss": 1.5365, "step": 462 }, { "epoch": 1.8300395256916997, "grad_norm": 2.2995738983154297, "learning_rate": 4.1011904761904764e-05, "loss": 1.4421, "step": 463 }, { "epoch": 1.8339920948616601, "grad_norm": 2.2058722972869873, "learning_rate": 4.099206349206349e-05, "loss": 1.5384, "step": 464 }, { "epoch": 1.8379446640316206, "grad_norm": 2.011327028274536, "learning_rate": 4.0972222222222225e-05, "loss": 1.6523, "step": 465 }, { "epoch": 1.841897233201581, "grad_norm": 2.0535199642181396, "learning_rate": 4.095238095238095e-05, "loss": 1.6645, "step": 466 }, { "epoch": 1.8458498023715415, "grad_norm": 2.485356092453003, "learning_rate": 4.0932539682539686e-05, "loss": 1.5562, "step": 467 }, { "epoch": 1.849802371541502, "grad_norm": 2.368373155593872, "learning_rate": 4.091269841269841e-05, "loss": 1.806, "step": 468 }, { "epoch": 1.8537549407114624, "grad_norm": 2.3104066848754883, "learning_rate": 4.0892857142857147e-05, "loss": 1.9633, "step": 469 }, { "epoch": 1.8577075098814229, "grad_norm": 2.3505473136901855, "learning_rate": 4.0873015873015874e-05, "loss": 1.6175, "step": 470 }, { "epoch": 1.8616600790513833, "grad_norm": 1.9146708250045776, "learning_rate": 4.085317460317461e-05, "loss": 1.5535, "step": 471 }, { "epoch": 1.8656126482213438, "grad_norm": 1.624837040901184, "learning_rate": 4.0833333333333334e-05, "loss": 1.6677, "step": 472 }, { "epoch": 1.8695652173913042, "grad_norm": 1.7742847204208374, "learning_rate": 4.081349206349207e-05, "loss": 1.5509, "step": 473 }, { "epoch": 1.8735177865612647, "grad_norm": 1.9143329858779907, "learning_rate": 4.0793650793650795e-05, "loss": 1.8877, "step": 474 }, { "epoch": 1.8774703557312253, "grad_norm": 2.1041526794433594, "learning_rate": 4.077380952380952e-05, "loss": 1.6796, "step": 475 }, { "epoch": 1.8814229249011858, "grad_norm": 2.0280792713165283, "learning_rate": 4.0753968253968256e-05, "loss": 1.7892, "step": 476 }, { "epoch": 1.8853754940711462, "grad_norm": 2.8694331645965576, "learning_rate": 4.073412698412698e-05, "loss": 1.9448, "step": 477 }, { "epoch": 1.8893280632411067, "grad_norm": 2.471470832824707, "learning_rate": 4.0714285714285717e-05, "loss": 1.7856, "step": 478 }, { "epoch": 1.8932806324110671, "grad_norm": 2.2044591903686523, "learning_rate": 4.0694444444444444e-05, "loss": 1.7688, "step": 479 }, { "epoch": 1.8972332015810278, "grad_norm": 2.3773610591888428, "learning_rate": 4.067460317460318e-05, "loss": 1.908, "step": 480 }, { "epoch": 1.9011857707509883, "grad_norm": 2.1004207134246826, "learning_rate": 4.0654761904761904e-05, "loss": 1.4418, "step": 481 }, { "epoch": 1.9051383399209487, "grad_norm": 2.415409803390503, "learning_rate": 4.063492063492064e-05, "loss": 1.7695, "step": 482 }, { "epoch": 1.9090909090909092, "grad_norm": 1.89458167552948, "learning_rate": 4.0615079365079365e-05, "loss": 1.8081, "step": 483 }, { "epoch": 1.9130434782608696, "grad_norm": 2.560452699661255, "learning_rate": 4.05952380952381e-05, "loss": 2.0119, "step": 484 }, { "epoch": 1.91699604743083, "grad_norm": 2.115086793899536, "learning_rate": 4.0575396825396826e-05, "loss": 1.7012, "step": 485 }, { "epoch": 1.9209486166007905, "grad_norm": 2.1731040477752686, "learning_rate": 4.055555555555556e-05, "loss": 1.7267, "step": 486 }, { "epoch": 1.924901185770751, "grad_norm": 1.9672155380249023, "learning_rate": 4.0535714285714287e-05, "loss": 1.7396, "step": 487 }, { "epoch": 1.9288537549407114, "grad_norm": 2.380613327026367, "learning_rate": 4.051587301587302e-05, "loss": 1.4355, "step": 488 }, { "epoch": 1.9328063241106719, "grad_norm": 2.087024450302124, "learning_rate": 4.049603174603175e-05, "loss": 1.812, "step": 489 }, { "epoch": 1.9367588932806323, "grad_norm": 2.704306125640869, "learning_rate": 4.047619047619048e-05, "loss": 1.7029, "step": 490 }, { "epoch": 1.9407114624505928, "grad_norm": 1.8128267526626587, "learning_rate": 4.045634920634921e-05, "loss": 1.616, "step": 491 }, { "epoch": 1.9446640316205532, "grad_norm": 1.7133640050888062, "learning_rate": 4.043650793650794e-05, "loss": 1.4945, "step": 492 }, { "epoch": 1.9486166007905137, "grad_norm": 1.8677951097488403, "learning_rate": 4.041666666666667e-05, "loss": 1.6594, "step": 493 }, { "epoch": 1.9525691699604744, "grad_norm": 1.908327579498291, "learning_rate": 4.03968253968254e-05, "loss": 1.7492, "step": 494 }, { "epoch": 1.9565217391304348, "grad_norm": 2.1175668239593506, "learning_rate": 4.037698412698413e-05, "loss": 1.6509, "step": 495 }, { "epoch": 1.9604743083003953, "grad_norm": 2.284240961074829, "learning_rate": 4.035714285714286e-05, "loss": 1.4284, "step": 496 }, { "epoch": 1.9644268774703557, "grad_norm": 1.581749677658081, "learning_rate": 4.033730158730159e-05, "loss": 1.8258, "step": 497 }, { "epoch": 1.9683794466403162, "grad_norm": 2.200153112411499, "learning_rate": 4.031746031746032e-05, "loss": 1.573, "step": 498 }, { "epoch": 1.9723320158102768, "grad_norm": 2.2217061519622803, "learning_rate": 4.029761904761905e-05, "loss": 1.6952, "step": 499 }, { "epoch": 1.9762845849802373, "grad_norm": 2.2044413089752197, "learning_rate": 4.027777777777778e-05, "loss": 1.5415, "step": 500 }, { "epoch": 1.9802371541501977, "grad_norm": 2.001845359802246, "learning_rate": 4.025793650793651e-05, "loss": 1.8791, "step": 501 }, { "epoch": 1.9841897233201582, "grad_norm": 1.7531094551086426, "learning_rate": 4.023809523809524e-05, "loss": 1.7529, "step": 502 }, { "epoch": 1.9881422924901186, "grad_norm": 3.204590320587158, "learning_rate": 4.021825396825397e-05, "loss": 1.6866, "step": 503 }, { "epoch": 1.992094861660079, "grad_norm": 2.3752427101135254, "learning_rate": 4.01984126984127e-05, "loss": 1.4053, "step": 504 }, { "epoch": 1.9960474308300395, "grad_norm": 1.789313793182373, "learning_rate": 4.017857142857143e-05, "loss": 1.6106, "step": 505 }, { "epoch": 2.0, "grad_norm": 2.152024269104004, "learning_rate": 4.015873015873016e-05, "loss": 1.7857, "step": 506 }, { "epoch": 2.0039525691699605, "grad_norm": 1.7887808084487915, "learning_rate": 4.0138888888888894e-05, "loss": 1.2694, "step": 507 }, { "epoch": 2.007905138339921, "grad_norm": 2.0157063007354736, "learning_rate": 4.011904761904762e-05, "loss": 1.3274, "step": 508 }, { "epoch": 2.0118577075098814, "grad_norm": 2.462636709213257, "learning_rate": 4.0099206349206355e-05, "loss": 1.5154, "step": 509 }, { "epoch": 2.015810276679842, "grad_norm": 2.299966812133789, "learning_rate": 4.007936507936508e-05, "loss": 1.378, "step": 510 }, { "epoch": 2.0197628458498023, "grad_norm": 2.748854398727417, "learning_rate": 4.0059523809523816e-05, "loss": 1.5578, "step": 511 }, { "epoch": 2.0237154150197627, "grad_norm": 2.8967697620391846, "learning_rate": 4.003968253968254e-05, "loss": 1.2523, "step": 512 }, { "epoch": 2.027667984189723, "grad_norm": 3.2559096813201904, "learning_rate": 4.0019841269841276e-05, "loss": 1.3687, "step": 513 }, { "epoch": 2.0316205533596836, "grad_norm": 3.0115792751312256, "learning_rate": 4e-05, "loss": 1.6751, "step": 514 }, { "epoch": 2.035573122529644, "grad_norm": 2.7409708499908447, "learning_rate": 3.998015873015874e-05, "loss": 1.5692, "step": 515 }, { "epoch": 2.039525691699605, "grad_norm": 2.488542318344116, "learning_rate": 3.9960317460317464e-05, "loss": 1.6312, "step": 516 }, { "epoch": 2.0434782608695654, "grad_norm": 2.267803192138672, "learning_rate": 3.99404761904762e-05, "loss": 1.3411, "step": 517 }, { "epoch": 2.047430830039526, "grad_norm": 2.1117396354675293, "learning_rate": 3.9920634920634925e-05, "loss": 1.3751, "step": 518 }, { "epoch": 2.0513833992094863, "grad_norm": 2.5073764324188232, "learning_rate": 3.990079365079366e-05, "loss": 1.3868, "step": 519 }, { "epoch": 2.0553359683794468, "grad_norm": 2.462871789932251, "learning_rate": 3.9880952380952386e-05, "loss": 1.2461, "step": 520 }, { "epoch": 2.059288537549407, "grad_norm": 2.81300950050354, "learning_rate": 3.986111111111111e-05, "loss": 1.6863, "step": 521 }, { "epoch": 2.0632411067193677, "grad_norm": 2.345595359802246, "learning_rate": 3.984126984126984e-05, "loss": 1.6056, "step": 522 }, { "epoch": 2.067193675889328, "grad_norm": 3.0314559936523438, "learning_rate": 3.982142857142857e-05, "loss": 1.4429, "step": 523 }, { "epoch": 2.0711462450592886, "grad_norm": 2.563462257385254, "learning_rate": 3.98015873015873e-05, "loss": 1.4249, "step": 524 }, { "epoch": 2.075098814229249, "grad_norm": 3.088543653488159, "learning_rate": 3.9781746031746034e-05, "loss": 1.6226, "step": 525 }, { "epoch": 2.0790513833992095, "grad_norm": 2.6812891960144043, "learning_rate": 3.976190476190476e-05, "loss": 1.4069, "step": 526 }, { "epoch": 2.08300395256917, "grad_norm": 2.729356288909912, "learning_rate": 3.9742063492063495e-05, "loss": 1.4145, "step": 527 }, { "epoch": 2.0869565217391304, "grad_norm": 1.9687360525131226, "learning_rate": 3.972222222222222e-05, "loss": 1.2861, "step": 528 }, { "epoch": 2.090909090909091, "grad_norm": 2.8874728679656982, "learning_rate": 3.970238095238095e-05, "loss": 1.0838, "step": 529 }, { "epoch": 2.0948616600790513, "grad_norm": 2.1731650829315186, "learning_rate": 3.968253968253968e-05, "loss": 1.5371, "step": 530 }, { "epoch": 2.0988142292490117, "grad_norm": 2.3168811798095703, "learning_rate": 3.966269841269841e-05, "loss": 1.6941, "step": 531 }, { "epoch": 2.102766798418972, "grad_norm": 2.0623104572296143, "learning_rate": 3.964285714285714e-05, "loss": 1.2772, "step": 532 }, { "epoch": 2.1067193675889326, "grad_norm": 3.4326181411743164, "learning_rate": 3.962301587301587e-05, "loss": 1.3626, "step": 533 }, { "epoch": 2.110671936758893, "grad_norm": 2.1339476108551025, "learning_rate": 3.9603174603174604e-05, "loss": 1.4972, "step": 534 }, { "epoch": 2.1146245059288535, "grad_norm": 2.1961209774017334, "learning_rate": 3.958333333333333e-05, "loss": 1.2863, "step": 535 }, { "epoch": 2.1185770750988144, "grad_norm": 2.13753604888916, "learning_rate": 3.9563492063492065e-05, "loss": 1.3903, "step": 536 }, { "epoch": 2.122529644268775, "grad_norm": 2.521846055984497, "learning_rate": 3.954365079365079e-05, "loss": 1.2818, "step": 537 }, { "epoch": 2.1264822134387353, "grad_norm": 2.1714985370635986, "learning_rate": 3.9523809523809526e-05, "loss": 1.4502, "step": 538 }, { "epoch": 2.130434782608696, "grad_norm": 3.092069149017334, "learning_rate": 3.950396825396825e-05, "loss": 1.5637, "step": 539 }, { "epoch": 2.1343873517786562, "grad_norm": 2.6351075172424316, "learning_rate": 3.9484126984126986e-05, "loss": 1.4462, "step": 540 }, { "epoch": 2.1383399209486167, "grad_norm": 1.9999125003814697, "learning_rate": 3.946428571428571e-05, "loss": 1.4445, "step": 541 }, { "epoch": 2.142292490118577, "grad_norm": 3.0483391284942627, "learning_rate": 3.944444444444445e-05, "loss": 1.206, "step": 542 }, { "epoch": 2.1462450592885376, "grad_norm": 2.455862522125244, "learning_rate": 3.9424603174603174e-05, "loss": 1.3128, "step": 543 }, { "epoch": 2.150197628458498, "grad_norm": 2.6878578662872314, "learning_rate": 3.940476190476191e-05, "loss": 1.3592, "step": 544 }, { "epoch": 2.1541501976284585, "grad_norm": 2.25331449508667, "learning_rate": 3.9384920634920635e-05, "loss": 1.2749, "step": 545 }, { "epoch": 2.158102766798419, "grad_norm": 2.7239346504211426, "learning_rate": 3.936507936507937e-05, "loss": 1.0487, "step": 546 }, { "epoch": 2.1620553359683794, "grad_norm": 2.8506832122802734, "learning_rate": 3.9345238095238096e-05, "loss": 1.2713, "step": 547 }, { "epoch": 2.16600790513834, "grad_norm": 2.3467752933502197, "learning_rate": 3.932539682539683e-05, "loss": 1.1959, "step": 548 }, { "epoch": 2.1699604743083003, "grad_norm": 1.99912428855896, "learning_rate": 3.9305555555555556e-05, "loss": 1.2617, "step": 549 }, { "epoch": 2.1739130434782608, "grad_norm": 2.6946754455566406, "learning_rate": 3.928571428571429e-05, "loss": 1.5949, "step": 550 }, { "epoch": 2.177865612648221, "grad_norm": 2.771515369415283, "learning_rate": 3.926587301587302e-05, "loss": 1.4533, "step": 551 }, { "epoch": 2.1818181818181817, "grad_norm": 2.188755989074707, "learning_rate": 3.9246031746031744e-05, "loss": 1.3215, "step": 552 }, { "epoch": 2.185770750988142, "grad_norm": 2.7638049125671387, "learning_rate": 3.922619047619048e-05, "loss": 1.1885, "step": 553 }, { "epoch": 2.1897233201581026, "grad_norm": 2.2438464164733887, "learning_rate": 3.9206349206349205e-05, "loss": 1.2809, "step": 554 }, { "epoch": 2.1936758893280635, "grad_norm": 2.8813867568969727, "learning_rate": 3.918650793650794e-05, "loss": 1.3764, "step": 555 }, { "epoch": 2.197628458498024, "grad_norm": 2.219583034515381, "learning_rate": 3.9166666666666665e-05, "loss": 1.5119, "step": 556 }, { "epoch": 2.2015810276679844, "grad_norm": 2.888500452041626, "learning_rate": 3.91468253968254e-05, "loss": 1.5976, "step": 557 }, { "epoch": 2.205533596837945, "grad_norm": 2.0659193992614746, "learning_rate": 3.9126984126984126e-05, "loss": 1.3095, "step": 558 }, { "epoch": 2.2094861660079053, "grad_norm": 2.3203117847442627, "learning_rate": 3.910714285714286e-05, "loss": 1.2698, "step": 559 }, { "epoch": 2.2134387351778657, "grad_norm": 2.8916614055633545, "learning_rate": 3.908730158730159e-05, "loss": 1.4077, "step": 560 }, { "epoch": 2.217391304347826, "grad_norm": 2.6932151317596436, "learning_rate": 3.906746031746032e-05, "loss": 1.1033, "step": 561 }, { "epoch": 2.2213438735177866, "grad_norm": 2.4444422721862793, "learning_rate": 3.904761904761905e-05, "loss": 1.2782, "step": 562 }, { "epoch": 2.225296442687747, "grad_norm": 2.334657669067383, "learning_rate": 3.902777777777778e-05, "loss": 1.5071, "step": 563 }, { "epoch": 2.2292490118577075, "grad_norm": 2.4118669033050537, "learning_rate": 3.900793650793651e-05, "loss": 1.2665, "step": 564 }, { "epoch": 2.233201581027668, "grad_norm": 2.160811185836792, "learning_rate": 3.898809523809524e-05, "loss": 1.4489, "step": 565 }, { "epoch": 2.2371541501976284, "grad_norm": 2.49159574508667, "learning_rate": 3.896825396825397e-05, "loss": 1.3127, "step": 566 }, { "epoch": 2.241106719367589, "grad_norm": 2.5408575534820557, "learning_rate": 3.89484126984127e-05, "loss": 1.0498, "step": 567 }, { "epoch": 2.2450592885375493, "grad_norm": 2.121267557144165, "learning_rate": 3.892857142857143e-05, "loss": 1.3147, "step": 568 }, { "epoch": 2.2490118577075098, "grad_norm": 2.7093420028686523, "learning_rate": 3.8908730158730164e-05, "loss": 1.3408, "step": 569 }, { "epoch": 2.2529644268774702, "grad_norm": 3.397962808609009, "learning_rate": 3.888888888888889e-05, "loss": 1.4971, "step": 570 }, { "epoch": 2.2569169960474307, "grad_norm": 2.7930893898010254, "learning_rate": 3.8869047619047625e-05, "loss": 1.4294, "step": 571 }, { "epoch": 2.260869565217391, "grad_norm": 2.7343547344207764, "learning_rate": 3.884920634920635e-05, "loss": 0.9542, "step": 572 }, { "epoch": 2.2648221343873516, "grad_norm": 2.365590810775757, "learning_rate": 3.8829365079365085e-05, "loss": 1.2086, "step": 573 }, { "epoch": 2.2687747035573125, "grad_norm": 2.1030609607696533, "learning_rate": 3.880952380952381e-05, "loss": 1.3605, "step": 574 }, { "epoch": 2.2727272727272725, "grad_norm": 2.1257293224334717, "learning_rate": 3.878968253968254e-05, "loss": 1.4352, "step": 575 }, { "epoch": 2.2766798418972334, "grad_norm": 2.316331386566162, "learning_rate": 3.876984126984127e-05, "loss": 1.4796, "step": 576 }, { "epoch": 2.280632411067194, "grad_norm": 2.8514187335968018, "learning_rate": 3.875e-05, "loss": 1.3073, "step": 577 }, { "epoch": 2.2845849802371543, "grad_norm": 2.8949341773986816, "learning_rate": 3.8730158730158734e-05, "loss": 1.3915, "step": 578 }, { "epoch": 2.2885375494071147, "grad_norm": 2.150175094604492, "learning_rate": 3.871031746031746e-05, "loss": 1.2567, "step": 579 }, { "epoch": 2.292490118577075, "grad_norm": 2.0244216918945312, "learning_rate": 3.8690476190476195e-05, "loss": 1.3674, "step": 580 }, { "epoch": 2.2964426877470356, "grad_norm": 2.2279505729675293, "learning_rate": 3.867063492063492e-05, "loss": 1.4776, "step": 581 }, { "epoch": 2.300395256916996, "grad_norm": 2.4603147506713867, "learning_rate": 3.8650793650793655e-05, "loss": 1.6039, "step": 582 }, { "epoch": 2.3043478260869565, "grad_norm": 2.8002820014953613, "learning_rate": 3.863095238095238e-05, "loss": 1.2765, "step": 583 }, { "epoch": 2.308300395256917, "grad_norm": 2.505256414413452, "learning_rate": 3.8611111111111116e-05, "loss": 1.1921, "step": 584 }, { "epoch": 2.3122529644268774, "grad_norm": 2.458230972290039, "learning_rate": 3.859126984126984e-05, "loss": 1.7022, "step": 585 }, { "epoch": 2.316205533596838, "grad_norm": 2.3297207355499268, "learning_rate": 3.857142857142858e-05, "loss": 1.5739, "step": 586 }, { "epoch": 2.3201581027667983, "grad_norm": 2.038970708847046, "learning_rate": 3.8551587301587304e-05, "loss": 1.2446, "step": 587 }, { "epoch": 2.324110671936759, "grad_norm": 2.4906201362609863, "learning_rate": 3.853174603174604e-05, "loss": 1.3053, "step": 588 }, { "epoch": 2.3280632411067192, "grad_norm": 2.42885160446167, "learning_rate": 3.8511904761904765e-05, "loss": 1.1066, "step": 589 }, { "epoch": 2.3320158102766797, "grad_norm": 2.2669129371643066, "learning_rate": 3.84920634920635e-05, "loss": 1.2192, "step": 590 }, { "epoch": 2.33596837944664, "grad_norm": 4.123607635498047, "learning_rate": 3.8472222222222225e-05, "loss": 1.3616, "step": 591 }, { "epoch": 2.3399209486166006, "grad_norm": 2.522677183151245, "learning_rate": 3.845238095238096e-05, "loss": 1.3161, "step": 592 }, { "epoch": 2.3438735177865615, "grad_norm": 2.6520330905914307, "learning_rate": 3.8432539682539686e-05, "loss": 1.1572, "step": 593 }, { "epoch": 2.3478260869565215, "grad_norm": 2.5755152702331543, "learning_rate": 3.841269841269842e-05, "loss": 1.5446, "step": 594 }, { "epoch": 2.3517786561264824, "grad_norm": 2.2885093688964844, "learning_rate": 3.839285714285715e-05, "loss": 1.4885, "step": 595 }, { "epoch": 2.355731225296443, "grad_norm": 2.176738977432251, "learning_rate": 3.837301587301588e-05, "loss": 1.4043, "step": 596 }, { "epoch": 2.3596837944664033, "grad_norm": 2.7747554779052734, "learning_rate": 3.835317460317461e-05, "loss": 1.6517, "step": 597 }, { "epoch": 2.3636363636363638, "grad_norm": 1.9070560932159424, "learning_rate": 3.8333333333333334e-05, "loss": 1.0572, "step": 598 }, { "epoch": 2.367588932806324, "grad_norm": 2.161180019378662, "learning_rate": 3.831349206349207e-05, "loss": 1.3089, "step": 599 }, { "epoch": 2.3715415019762847, "grad_norm": 2.5658626556396484, "learning_rate": 3.8293650793650795e-05, "loss": 1.1435, "step": 600 }, { "epoch": 2.375494071146245, "grad_norm": 2.733940839767456, "learning_rate": 3.827380952380952e-05, "loss": 1.4425, "step": 601 }, { "epoch": 2.3794466403162056, "grad_norm": 2.494096279144287, "learning_rate": 3.8253968253968256e-05, "loss": 1.5981, "step": 602 }, { "epoch": 2.383399209486166, "grad_norm": 2.405909299850464, "learning_rate": 3.823412698412698e-05, "loss": 1.4946, "step": 603 }, { "epoch": 2.3873517786561265, "grad_norm": 2.067415952682495, "learning_rate": 3.821428571428572e-05, "loss": 1.3633, "step": 604 }, { "epoch": 2.391304347826087, "grad_norm": 2.1548092365264893, "learning_rate": 3.8194444444444444e-05, "loss": 1.2963, "step": 605 }, { "epoch": 2.3952569169960474, "grad_norm": 2.004725694656372, "learning_rate": 3.817460317460317e-05, "loss": 1.4675, "step": 606 }, { "epoch": 2.399209486166008, "grad_norm": 2.1438629627227783, "learning_rate": 3.8154761904761904e-05, "loss": 1.5303, "step": 607 }, { "epoch": 2.4031620553359683, "grad_norm": 2.6474621295928955, "learning_rate": 3.813492063492063e-05, "loss": 1.5247, "step": 608 }, { "epoch": 2.4071146245059287, "grad_norm": 2.7593584060668945, "learning_rate": 3.8115079365079365e-05, "loss": 1.2225, "step": 609 }, { "epoch": 2.411067193675889, "grad_norm": 2.53412127494812, "learning_rate": 3.809523809523809e-05, "loss": 1.3592, "step": 610 }, { "epoch": 2.4150197628458496, "grad_norm": 2.477109909057617, "learning_rate": 3.8075396825396826e-05, "loss": 1.3446, "step": 611 }, { "epoch": 2.4189723320158105, "grad_norm": 2.8138701915740967, "learning_rate": 3.805555555555555e-05, "loss": 1.3014, "step": 612 }, { "epoch": 2.4229249011857705, "grad_norm": 2.805239200592041, "learning_rate": 3.803571428571429e-05, "loss": 1.3786, "step": 613 }, { "epoch": 2.4268774703557314, "grad_norm": 2.7038285732269287, "learning_rate": 3.8015873015873014e-05, "loss": 1.1213, "step": 614 }, { "epoch": 2.430830039525692, "grad_norm": 2.978872299194336, "learning_rate": 3.799603174603175e-05, "loss": 1.2804, "step": 615 }, { "epoch": 2.4347826086956523, "grad_norm": 2.0930662155151367, "learning_rate": 3.7976190476190474e-05, "loss": 1.6642, "step": 616 }, { "epoch": 2.438735177865613, "grad_norm": 2.3781399726867676, "learning_rate": 3.795634920634921e-05, "loss": 1.2398, "step": 617 }, { "epoch": 2.4426877470355732, "grad_norm": 2.1842458248138428, "learning_rate": 3.7936507936507935e-05, "loss": 1.5295, "step": 618 }, { "epoch": 2.4466403162055337, "grad_norm": 3.6338818073272705, "learning_rate": 3.791666666666667e-05, "loss": 1.0698, "step": 619 }, { "epoch": 2.450592885375494, "grad_norm": 2.5560874938964844, "learning_rate": 3.7896825396825396e-05, "loss": 1.3759, "step": 620 }, { "epoch": 2.4545454545454546, "grad_norm": 2.4692065715789795, "learning_rate": 3.787698412698413e-05, "loss": 1.2892, "step": 621 }, { "epoch": 2.458498023715415, "grad_norm": 2.5481793880462646, "learning_rate": 3.785714285714286e-05, "loss": 1.5718, "step": 622 }, { "epoch": 2.4624505928853755, "grad_norm": 2.608428716659546, "learning_rate": 3.783730158730159e-05, "loss": 1.3108, "step": 623 }, { "epoch": 2.466403162055336, "grad_norm": 2.8005776405334473, "learning_rate": 3.781746031746032e-05, "loss": 1.5068, "step": 624 }, { "epoch": 2.4703557312252964, "grad_norm": 2.2495791912078857, "learning_rate": 3.779761904761905e-05, "loss": 1.3082, "step": 625 }, { "epoch": 2.474308300395257, "grad_norm": 2.567267656326294, "learning_rate": 3.777777777777778e-05, "loss": 1.4442, "step": 626 }, { "epoch": 2.4782608695652173, "grad_norm": 2.4826018810272217, "learning_rate": 3.775793650793651e-05, "loss": 1.2581, "step": 627 }, { "epoch": 2.4822134387351777, "grad_norm": 2.3756072521209717, "learning_rate": 3.773809523809524e-05, "loss": 1.2695, "step": 628 }, { "epoch": 2.486166007905138, "grad_norm": 2.6527416706085205, "learning_rate": 3.7718253968253966e-05, "loss": 1.2956, "step": 629 }, { "epoch": 2.4901185770750986, "grad_norm": 2.467822790145874, "learning_rate": 3.76984126984127e-05, "loss": 1.4364, "step": 630 }, { "epoch": 2.494071146245059, "grad_norm": 2.6203055381774902, "learning_rate": 3.767857142857143e-05, "loss": 1.4897, "step": 631 }, { "epoch": 2.4980237154150196, "grad_norm": 2.233952045440674, "learning_rate": 3.765873015873016e-05, "loss": 1.3318, "step": 632 }, { "epoch": 2.5019762845849804, "grad_norm": 1.96635901927948, "learning_rate": 3.763888888888889e-05, "loss": 1.2446, "step": 633 }, { "epoch": 2.5059288537549405, "grad_norm": 2.349234104156494, "learning_rate": 3.761904761904762e-05, "loss": 1.3324, "step": 634 }, { "epoch": 2.5098814229249014, "grad_norm": 2.961324691772461, "learning_rate": 3.759920634920635e-05, "loss": 1.1765, "step": 635 }, { "epoch": 2.513833992094862, "grad_norm": 2.8292768001556396, "learning_rate": 3.757936507936508e-05, "loss": 1.0077, "step": 636 }, { "epoch": 2.5177865612648223, "grad_norm": 2.9311156272888184, "learning_rate": 3.755952380952381e-05, "loss": 1.3392, "step": 637 }, { "epoch": 2.5217391304347827, "grad_norm": 2.6130383014678955, "learning_rate": 3.753968253968254e-05, "loss": 1.2328, "step": 638 }, { "epoch": 2.525691699604743, "grad_norm": 2.0875937938690186, "learning_rate": 3.751984126984127e-05, "loss": 1.3491, "step": 639 }, { "epoch": 2.5296442687747036, "grad_norm": 2.593751907348633, "learning_rate": 3.7500000000000003e-05, "loss": 1.1862, "step": 640 }, { "epoch": 2.533596837944664, "grad_norm": 2.0226974487304688, "learning_rate": 3.748015873015873e-05, "loss": 1.3762, "step": 641 }, { "epoch": 2.5375494071146245, "grad_norm": 2.4336142539978027, "learning_rate": 3.7460317460317464e-05, "loss": 1.3271, "step": 642 }, { "epoch": 2.541501976284585, "grad_norm": 1.8397362232208252, "learning_rate": 3.744047619047619e-05, "loss": 1.2235, "step": 643 }, { "epoch": 2.5454545454545454, "grad_norm": 2.0357463359832764, "learning_rate": 3.7420634920634925e-05, "loss": 1.3871, "step": 644 }, { "epoch": 2.549407114624506, "grad_norm": 3.3774683475494385, "learning_rate": 3.740079365079365e-05, "loss": 1.3579, "step": 645 }, { "epoch": 2.5533596837944663, "grad_norm": 3.041534662246704, "learning_rate": 3.7380952380952386e-05, "loss": 1.1318, "step": 646 }, { "epoch": 2.5573122529644268, "grad_norm": 2.484518527984619, "learning_rate": 3.736111111111111e-05, "loss": 1.3531, "step": 647 }, { "epoch": 2.561264822134387, "grad_norm": 2.612365245819092, "learning_rate": 3.7341269841269846e-05, "loss": 1.3075, "step": 648 }, { "epoch": 2.5652173913043477, "grad_norm": 2.3737142086029053, "learning_rate": 3.7321428571428573e-05, "loss": 1.272, "step": 649 }, { "epoch": 2.5691699604743086, "grad_norm": 2.600964069366455, "learning_rate": 3.730158730158731e-05, "loss": 1.0839, "step": 650 }, { "epoch": 2.5731225296442686, "grad_norm": 2.697070598602295, "learning_rate": 3.7281746031746034e-05, "loss": 1.1383, "step": 651 }, { "epoch": 2.5770750988142295, "grad_norm": 2.752293109893799, "learning_rate": 3.726190476190476e-05, "loss": 1.2823, "step": 652 }, { "epoch": 2.5810276679841895, "grad_norm": 2.47784161567688, "learning_rate": 3.7242063492063495e-05, "loss": 1.5547, "step": 653 }, { "epoch": 2.5849802371541504, "grad_norm": 2.5010085105895996, "learning_rate": 3.722222222222222e-05, "loss": 1.2051, "step": 654 }, { "epoch": 2.588932806324111, "grad_norm": 2.3102123737335205, "learning_rate": 3.7202380952380956e-05, "loss": 1.1448, "step": 655 }, { "epoch": 2.5928853754940713, "grad_norm": 2.760524034500122, "learning_rate": 3.718253968253968e-05, "loss": 1.3238, "step": 656 }, { "epoch": 2.5968379446640317, "grad_norm": 3.336056709289551, "learning_rate": 3.7162698412698416e-05, "loss": 1.2137, "step": 657 }, { "epoch": 2.600790513833992, "grad_norm": 3.0085883140563965, "learning_rate": 3.7142857142857143e-05, "loss": 1.0954, "step": 658 }, { "epoch": 2.6047430830039526, "grad_norm": 2.819323778152466, "learning_rate": 3.712301587301588e-05, "loss": 1.4706, "step": 659 }, { "epoch": 2.608695652173913, "grad_norm": 2.527916193008423, "learning_rate": 3.7103174603174604e-05, "loss": 1.2759, "step": 660 }, { "epoch": 2.6126482213438735, "grad_norm": 3.203950881958008, "learning_rate": 3.708333333333334e-05, "loss": 1.907, "step": 661 }, { "epoch": 2.616600790513834, "grad_norm": 2.3258347511291504, "learning_rate": 3.7063492063492065e-05, "loss": 1.0956, "step": 662 }, { "epoch": 2.6205533596837944, "grad_norm": 2.3885722160339355, "learning_rate": 3.70436507936508e-05, "loss": 1.3588, "step": 663 }, { "epoch": 2.624505928853755, "grad_norm": 2.46718168258667, "learning_rate": 3.7023809523809526e-05, "loss": 1.5113, "step": 664 }, { "epoch": 2.6284584980237153, "grad_norm": 2.780848503112793, "learning_rate": 3.700396825396826e-05, "loss": 1.3804, "step": 665 }, { "epoch": 2.632411067193676, "grad_norm": 2.919987440109253, "learning_rate": 3.6984126984126986e-05, "loss": 1.4946, "step": 666 }, { "epoch": 2.6363636363636362, "grad_norm": 2.093374252319336, "learning_rate": 3.696428571428572e-05, "loss": 1.3501, "step": 667 }, { "epoch": 2.6403162055335967, "grad_norm": 3.430272102355957, "learning_rate": 3.694444444444445e-05, "loss": 0.956, "step": 668 }, { "epoch": 2.6442687747035576, "grad_norm": 2.6946187019348145, "learning_rate": 3.692460317460318e-05, "loss": 1.3491, "step": 669 }, { "epoch": 2.6482213438735176, "grad_norm": 2.1861889362335205, "learning_rate": 3.690476190476191e-05, "loss": 1.3425, "step": 670 }, { "epoch": 2.6521739130434785, "grad_norm": 2.343151569366455, "learning_rate": 3.688492063492064e-05, "loss": 1.3148, "step": 671 }, { "epoch": 2.6561264822134385, "grad_norm": 2.8744852542877197, "learning_rate": 3.686507936507937e-05, "loss": 1.1524, "step": 672 }, { "epoch": 2.6600790513833994, "grad_norm": 2.830291271209717, "learning_rate": 3.68452380952381e-05, "loss": 1.5512, "step": 673 }, { "epoch": 2.66403162055336, "grad_norm": 2.655128002166748, "learning_rate": 3.682539682539683e-05, "loss": 1.2045, "step": 674 }, { "epoch": 2.6679841897233203, "grad_norm": 2.4060311317443848, "learning_rate": 3.6805555555555556e-05, "loss": 1.0609, "step": 675 }, { "epoch": 2.6719367588932808, "grad_norm": 2.0129261016845703, "learning_rate": 3.678571428571429e-05, "loss": 1.2873, "step": 676 }, { "epoch": 2.675889328063241, "grad_norm": 2.645951271057129, "learning_rate": 3.676587301587302e-05, "loss": 1.1951, "step": 677 }, { "epoch": 2.6798418972332017, "grad_norm": 2.4416396617889404, "learning_rate": 3.674603174603175e-05, "loss": 1.3103, "step": 678 }, { "epoch": 2.683794466403162, "grad_norm": 3.599039077758789, "learning_rate": 3.672619047619048e-05, "loss": 1.4431, "step": 679 }, { "epoch": 2.6877470355731226, "grad_norm": 2.941194534301758, "learning_rate": 3.6706349206349205e-05, "loss": 1.0918, "step": 680 }, { "epoch": 2.691699604743083, "grad_norm": 2.7332816123962402, "learning_rate": 3.668650793650794e-05, "loss": 1.3429, "step": 681 }, { "epoch": 2.6956521739130435, "grad_norm": 3.3440287113189697, "learning_rate": 3.6666666666666666e-05, "loss": 1.4627, "step": 682 }, { "epoch": 2.699604743083004, "grad_norm": 2.5131101608276367, "learning_rate": 3.664682539682539e-05, "loss": 1.5036, "step": 683 }, { "epoch": 2.7035573122529644, "grad_norm": 2.991786241531372, "learning_rate": 3.6626984126984126e-05, "loss": 1.8342, "step": 684 }, { "epoch": 2.707509881422925, "grad_norm": 2.5615289211273193, "learning_rate": 3.6607142857142853e-05, "loss": 1.3832, "step": 685 }, { "epoch": 2.7114624505928853, "grad_norm": 2.467275857925415, "learning_rate": 3.658730158730159e-05, "loss": 1.2683, "step": 686 }, { "epoch": 2.7154150197628457, "grad_norm": 2.156137228012085, "learning_rate": 3.6567460317460314e-05, "loss": 1.2575, "step": 687 }, { "epoch": 2.719367588932806, "grad_norm": 1.868895411491394, "learning_rate": 3.654761904761905e-05, "loss": 1.4412, "step": 688 }, { "epoch": 2.7233201581027666, "grad_norm": 2.4768173694610596, "learning_rate": 3.6527777777777775e-05, "loss": 1.2711, "step": 689 }, { "epoch": 2.7272727272727275, "grad_norm": 3.286557912826538, "learning_rate": 3.650793650793651e-05, "loss": 1.4438, "step": 690 }, { "epoch": 2.7312252964426875, "grad_norm": 2.219813823699951, "learning_rate": 3.6488095238095236e-05, "loss": 1.309, "step": 691 }, { "epoch": 2.7351778656126484, "grad_norm": 2.425921678543091, "learning_rate": 3.646825396825397e-05, "loss": 1.4898, "step": 692 }, { "epoch": 2.7391304347826084, "grad_norm": 2.3334920406341553, "learning_rate": 3.6448412698412696e-05, "loss": 1.3917, "step": 693 }, { "epoch": 2.7430830039525693, "grad_norm": 2.4505045413970947, "learning_rate": 3.642857142857143e-05, "loss": 1.2184, "step": 694 }, { "epoch": 2.7470355731225298, "grad_norm": 2.223360538482666, "learning_rate": 3.640873015873016e-05, "loss": 1.4157, "step": 695 }, { "epoch": 2.7509881422924902, "grad_norm": 2.872119665145874, "learning_rate": 3.638888888888889e-05, "loss": 1.1279, "step": 696 }, { "epoch": 2.7549407114624507, "grad_norm": 2.502791404724121, "learning_rate": 3.636904761904762e-05, "loss": 1.1267, "step": 697 }, { "epoch": 2.758893280632411, "grad_norm": 2.880258083343506, "learning_rate": 3.634920634920635e-05, "loss": 1.2936, "step": 698 }, { "epoch": 2.7628458498023716, "grad_norm": 2.5304715633392334, "learning_rate": 3.632936507936508e-05, "loss": 1.0587, "step": 699 }, { "epoch": 2.766798418972332, "grad_norm": 2.664189338684082, "learning_rate": 3.630952380952381e-05, "loss": 1.4785, "step": 700 }, { "epoch": 2.7707509881422925, "grad_norm": 2.274841070175171, "learning_rate": 3.628968253968254e-05, "loss": 1.3508, "step": 701 }, { "epoch": 2.774703557312253, "grad_norm": 2.5693907737731934, "learning_rate": 3.626984126984127e-05, "loss": 1.3676, "step": 702 }, { "epoch": 2.7786561264822134, "grad_norm": 2.505685806274414, "learning_rate": 3.625e-05, "loss": 1.3922, "step": 703 }, { "epoch": 2.782608695652174, "grad_norm": 2.220625400543213, "learning_rate": 3.6230158730158734e-05, "loss": 1.2811, "step": 704 }, { "epoch": 2.7865612648221343, "grad_norm": 2.694793939590454, "learning_rate": 3.621031746031746e-05, "loss": 1.4419, "step": 705 }, { "epoch": 2.7905138339920947, "grad_norm": 2.996187925338745, "learning_rate": 3.619047619047619e-05, "loss": 1.4447, "step": 706 }, { "epoch": 2.794466403162055, "grad_norm": 2.177954912185669, "learning_rate": 3.617063492063492e-05, "loss": 1.3629, "step": 707 }, { "epoch": 2.7984189723320156, "grad_norm": 2.743326187133789, "learning_rate": 3.615079365079365e-05, "loss": 1.4397, "step": 708 }, { "epoch": 2.8023715415019765, "grad_norm": 2.6118264198303223, "learning_rate": 3.613095238095238e-05, "loss": 1.1212, "step": 709 }, { "epoch": 2.8063241106719365, "grad_norm": 2.1388731002807617, "learning_rate": 3.611111111111111e-05, "loss": 1.5192, "step": 710 }, { "epoch": 2.8102766798418974, "grad_norm": 2.721480131149292, "learning_rate": 3.609126984126984e-05, "loss": 1.579, "step": 711 }, { "epoch": 2.8142292490118574, "grad_norm": 2.726956844329834, "learning_rate": 3.607142857142857e-05, "loss": 1.6523, "step": 712 }, { "epoch": 2.8181818181818183, "grad_norm": 2.369964599609375, "learning_rate": 3.6051587301587304e-05, "loss": 1.3389, "step": 713 }, { "epoch": 2.822134387351779, "grad_norm": 2.3911476135253906, "learning_rate": 3.603174603174603e-05, "loss": 1.4589, "step": 714 }, { "epoch": 2.8260869565217392, "grad_norm": 2.717634916305542, "learning_rate": 3.6011904761904765e-05, "loss": 1.4241, "step": 715 }, { "epoch": 2.8300395256916997, "grad_norm": 2.3168764114379883, "learning_rate": 3.599206349206349e-05, "loss": 1.3172, "step": 716 }, { "epoch": 2.83399209486166, "grad_norm": 2.34859037399292, "learning_rate": 3.5972222222222225e-05, "loss": 1.2871, "step": 717 }, { "epoch": 2.8379446640316206, "grad_norm": 2.4705185890197754, "learning_rate": 3.595238095238095e-05, "loss": 1.524, "step": 718 }, { "epoch": 2.841897233201581, "grad_norm": 2.9651718139648438, "learning_rate": 3.5932539682539686e-05, "loss": 1.4915, "step": 719 }, { "epoch": 2.8458498023715415, "grad_norm": 2.442030668258667, "learning_rate": 3.591269841269841e-05, "loss": 1.5001, "step": 720 }, { "epoch": 2.849802371541502, "grad_norm": 4.075084686279297, "learning_rate": 3.589285714285715e-05, "loss": 0.9039, "step": 721 }, { "epoch": 2.8537549407114624, "grad_norm": 3.0871315002441406, "learning_rate": 3.5873015873015874e-05, "loss": 1.6743, "step": 722 }, { "epoch": 2.857707509881423, "grad_norm": 2.6841776371002197, "learning_rate": 3.585317460317461e-05, "loss": 1.1567, "step": 723 }, { "epoch": 2.8616600790513833, "grad_norm": 2.2139689922332764, "learning_rate": 3.5833333333333335e-05, "loss": 1.4276, "step": 724 }, { "epoch": 2.8656126482213438, "grad_norm": 2.8722684383392334, "learning_rate": 3.581349206349207e-05, "loss": 1.2222, "step": 725 }, { "epoch": 2.869565217391304, "grad_norm": 3.5767440795898438, "learning_rate": 3.5793650793650795e-05, "loss": 1.421, "step": 726 }, { "epoch": 2.8735177865612647, "grad_norm": 3.3746676445007324, "learning_rate": 3.577380952380953e-05, "loss": 1.9383, "step": 727 }, { "epoch": 2.8774703557312256, "grad_norm": 3.810504913330078, "learning_rate": 3.5753968253968256e-05, "loss": 1.0087, "step": 728 }, { "epoch": 2.8814229249011856, "grad_norm": 2.2538998126983643, "learning_rate": 3.573412698412698e-05, "loss": 1.4981, "step": 729 }, { "epoch": 2.8853754940711465, "grad_norm": 2.282931327819824, "learning_rate": 3.571428571428572e-05, "loss": 1.2655, "step": 730 }, { "epoch": 2.8893280632411065, "grad_norm": 2.3477354049682617, "learning_rate": 3.5694444444444444e-05, "loss": 1.4641, "step": 731 }, { "epoch": 2.8932806324110674, "grad_norm": 2.141794204711914, "learning_rate": 3.567460317460318e-05, "loss": 1.5262, "step": 732 }, { "epoch": 2.897233201581028, "grad_norm": 2.0820016860961914, "learning_rate": 3.5654761904761905e-05, "loss": 1.3743, "step": 733 }, { "epoch": 2.9011857707509883, "grad_norm": 2.0662853717803955, "learning_rate": 3.563492063492064e-05, "loss": 1.3662, "step": 734 }, { "epoch": 2.9051383399209487, "grad_norm": 2.205324411392212, "learning_rate": 3.5615079365079365e-05, "loss": 1.4079, "step": 735 }, { "epoch": 2.909090909090909, "grad_norm": 2.201070785522461, "learning_rate": 3.55952380952381e-05, "loss": 1.1008, "step": 736 }, { "epoch": 2.9130434782608696, "grad_norm": 2.159719944000244, "learning_rate": 3.5575396825396826e-05, "loss": 1.2646, "step": 737 }, { "epoch": 2.91699604743083, "grad_norm": 2.3559587001800537, "learning_rate": 3.555555555555556e-05, "loss": 1.4704, "step": 738 }, { "epoch": 2.9209486166007905, "grad_norm": 3.3258392810821533, "learning_rate": 3.553571428571429e-05, "loss": 1.1226, "step": 739 }, { "epoch": 2.924901185770751, "grad_norm": 2.6636176109313965, "learning_rate": 3.551587301587302e-05, "loss": 1.3304, "step": 740 }, { "epoch": 2.9288537549407114, "grad_norm": 2.8491475582122803, "learning_rate": 3.549603174603175e-05, "loss": 1.4011, "step": 741 }, { "epoch": 2.932806324110672, "grad_norm": 3.1546342372894287, "learning_rate": 3.547619047619048e-05, "loss": 1.0591, "step": 742 }, { "epoch": 2.9367588932806323, "grad_norm": 3.2184884548187256, "learning_rate": 3.545634920634921e-05, "loss": 1.5376, "step": 743 }, { "epoch": 2.940711462450593, "grad_norm": 2.49813175201416, "learning_rate": 3.543650793650794e-05, "loss": 1.2826, "step": 744 }, { "epoch": 2.9446640316205532, "grad_norm": 2.511040210723877, "learning_rate": 3.541666666666667e-05, "loss": 1.4989, "step": 745 }, { "epoch": 2.9486166007905137, "grad_norm": 2.233522415161133, "learning_rate": 3.53968253968254e-05, "loss": 1.1618, "step": 746 }, { "epoch": 2.9525691699604746, "grad_norm": 2.288743495941162, "learning_rate": 3.537698412698413e-05, "loss": 0.9788, "step": 747 }, { "epoch": 2.9565217391304346, "grad_norm": 3.0124926567077637, "learning_rate": 3.5357142857142864e-05, "loss": 0.9291, "step": 748 }, { "epoch": 2.9604743083003955, "grad_norm": 3.1643810272216797, "learning_rate": 3.533730158730159e-05, "loss": 0.789, "step": 749 }, { "epoch": 2.9644268774703555, "grad_norm": 2.8564984798431396, "learning_rate": 3.5317460317460324e-05, "loss": 1.4296, "step": 750 }, { "epoch": 2.9683794466403164, "grad_norm": 2.2462079524993896, "learning_rate": 3.529761904761905e-05, "loss": 1.275, "step": 751 }, { "epoch": 2.972332015810277, "grad_norm": 2.5663106441497803, "learning_rate": 3.527777777777778e-05, "loss": 1.7537, "step": 752 }, { "epoch": 2.9762845849802373, "grad_norm": 2.8393218517303467, "learning_rate": 3.525793650793651e-05, "loss": 1.6171, "step": 753 }, { "epoch": 2.9802371541501977, "grad_norm": 2.4854910373687744, "learning_rate": 3.523809523809524e-05, "loss": 1.4017, "step": 754 }, { "epoch": 2.984189723320158, "grad_norm": 2.6834123134613037, "learning_rate": 3.521825396825397e-05, "loss": 1.4678, "step": 755 }, { "epoch": 2.9881422924901186, "grad_norm": 2.4412403106689453, "learning_rate": 3.51984126984127e-05, "loss": 1.2793, "step": 756 }, { "epoch": 2.992094861660079, "grad_norm": 2.7471697330474854, "learning_rate": 3.5178571428571434e-05, "loss": 1.4015, "step": 757 }, { "epoch": 2.9960474308300395, "grad_norm": 2.6294939517974854, "learning_rate": 3.515873015873016e-05, "loss": 1.4328, "step": 758 }, { "epoch": 3.0, "grad_norm": 2.844717502593994, "learning_rate": 3.513888888888889e-05, "loss": 1.2477, "step": 759 }, { "epoch": 3.0039525691699605, "grad_norm": 2.496687173843384, "learning_rate": 3.511904761904762e-05, "loss": 0.9741, "step": 760 }, { "epoch": 3.007905138339921, "grad_norm": 2.5280258655548096, "learning_rate": 3.509920634920635e-05, "loss": 0.8558, "step": 761 }, { "epoch": 3.0118577075098814, "grad_norm": 2.257899522781372, "learning_rate": 3.5079365079365075e-05, "loss": 1.1979, "step": 762 }, { "epoch": 3.015810276679842, "grad_norm": 2.1623339653015137, "learning_rate": 3.505952380952381e-05, "loss": 0.9799, "step": 763 }, { "epoch": 3.0197628458498023, "grad_norm": 2.9412381649017334, "learning_rate": 3.5039682539682536e-05, "loss": 0.8915, "step": 764 }, { "epoch": 3.0237154150197627, "grad_norm": 3.138514995574951, "learning_rate": 3.501984126984127e-05, "loss": 0.8636, "step": 765 }, { "epoch": 3.027667984189723, "grad_norm": 4.3793416023254395, "learning_rate": 3.5e-05, "loss": 0.7668, "step": 766 }, { "epoch": 3.0316205533596836, "grad_norm": 4.136168003082275, "learning_rate": 3.498015873015873e-05, "loss": 0.9277, "step": 767 }, { "epoch": 3.035573122529644, "grad_norm": 4.092403888702393, "learning_rate": 3.496031746031746e-05, "loss": 1.1269, "step": 768 }, { "epoch": 3.039525691699605, "grad_norm": 4.120564937591553, "learning_rate": 3.494047619047619e-05, "loss": 0.9644, "step": 769 }, { "epoch": 3.0434782608695654, "grad_norm": 2.608088254928589, "learning_rate": 3.492063492063492e-05, "loss": 0.8488, "step": 770 }, { "epoch": 3.047430830039526, "grad_norm": 3.257272481918335, "learning_rate": 3.490079365079365e-05, "loss": 0.8844, "step": 771 }, { "epoch": 3.0513833992094863, "grad_norm": 3.9354476928710938, "learning_rate": 3.488095238095238e-05, "loss": 0.9306, "step": 772 }, { "epoch": 3.0553359683794468, "grad_norm": 3.038783550262451, "learning_rate": 3.486111111111111e-05, "loss": 0.9639, "step": 773 }, { "epoch": 3.059288537549407, "grad_norm": 2.9691848754882812, "learning_rate": 3.484126984126984e-05, "loss": 1.0708, "step": 774 }, { "epoch": 3.0632411067193677, "grad_norm": 2.9549548625946045, "learning_rate": 3.4821428571428574e-05, "loss": 0.6093, "step": 775 }, { "epoch": 3.067193675889328, "grad_norm": 2.429551839828491, "learning_rate": 3.48015873015873e-05, "loss": 0.9417, "step": 776 }, { "epoch": 3.0711462450592886, "grad_norm": 3.1636385917663574, "learning_rate": 3.4781746031746034e-05, "loss": 0.7358, "step": 777 }, { "epoch": 3.075098814229249, "grad_norm": 2.7946548461914062, "learning_rate": 3.476190476190476e-05, "loss": 1.0406, "step": 778 }, { "epoch": 3.0790513833992095, "grad_norm": 2.363107442855835, "learning_rate": 3.4742063492063495e-05, "loss": 0.7761, "step": 779 }, { "epoch": 3.08300395256917, "grad_norm": 2.957887649536133, "learning_rate": 3.472222222222222e-05, "loss": 0.947, "step": 780 }, { "epoch": 3.0869565217391304, "grad_norm": 2.936105489730835, "learning_rate": 3.4702380952380956e-05, "loss": 0.9311, "step": 781 }, { "epoch": 3.090909090909091, "grad_norm": 3.723759174346924, "learning_rate": 3.468253968253968e-05, "loss": 0.7467, "step": 782 }, { "epoch": 3.0948616600790513, "grad_norm": 2.8762381076812744, "learning_rate": 3.466269841269842e-05, "loss": 1.078, "step": 783 }, { "epoch": 3.0988142292490117, "grad_norm": 3.5455434322357178, "learning_rate": 3.4642857142857144e-05, "loss": 0.9024, "step": 784 }, { "epoch": 3.102766798418972, "grad_norm": 3.4952311515808105, "learning_rate": 3.462301587301587e-05, "loss": 1.1114, "step": 785 }, { "epoch": 3.1067193675889326, "grad_norm": 3.0181546211242676, "learning_rate": 3.4603174603174604e-05, "loss": 1.0401, "step": 786 }, { "epoch": 3.110671936758893, "grad_norm": 2.7165963649749756, "learning_rate": 3.458333333333333e-05, "loss": 1.0207, "step": 787 }, { "epoch": 3.1146245059288535, "grad_norm": 3.077030897140503, "learning_rate": 3.4563492063492065e-05, "loss": 0.8701, "step": 788 }, { "epoch": 3.1185770750988144, "grad_norm": 2.9918665885925293, "learning_rate": 3.454365079365079e-05, "loss": 1.2665, "step": 789 }, { "epoch": 3.122529644268775, "grad_norm": 3.622313976287842, "learning_rate": 3.4523809523809526e-05, "loss": 1.1394, "step": 790 }, { "epoch": 3.1264822134387353, "grad_norm": 2.705162286758423, "learning_rate": 3.450396825396825e-05, "loss": 0.8924, "step": 791 }, { "epoch": 3.130434782608696, "grad_norm": 3.595820426940918, "learning_rate": 3.448412698412699e-05, "loss": 1.057, "step": 792 }, { "epoch": 3.1343873517786562, "grad_norm": 3.4480679035186768, "learning_rate": 3.4464285714285714e-05, "loss": 0.8167, "step": 793 }, { "epoch": 3.1383399209486167, "grad_norm": 2.8485183715820312, "learning_rate": 3.444444444444445e-05, "loss": 0.8327, "step": 794 }, { "epoch": 3.142292490118577, "grad_norm": 3.1256802082061768, "learning_rate": 3.4424603174603174e-05, "loss": 1.0551, "step": 795 }, { "epoch": 3.1462450592885376, "grad_norm": 3.3785347938537598, "learning_rate": 3.440476190476191e-05, "loss": 1.2833, "step": 796 }, { "epoch": 3.150197628458498, "grad_norm": 2.9685938358306885, "learning_rate": 3.4384920634920635e-05, "loss": 1.0435, "step": 797 }, { "epoch": 3.1541501976284585, "grad_norm": 2.9943830966949463, "learning_rate": 3.436507936507937e-05, "loss": 0.7879, "step": 798 }, { "epoch": 3.158102766798419, "grad_norm": 2.655322790145874, "learning_rate": 3.4345238095238096e-05, "loss": 0.8366, "step": 799 }, { "epoch": 3.1620553359683794, "grad_norm": 3.5305442810058594, "learning_rate": 3.432539682539683e-05, "loss": 0.8311, "step": 800 }, { "epoch": 3.16600790513834, "grad_norm": 2.946333169937134, "learning_rate": 3.430555555555556e-05, "loss": 1.037, "step": 801 }, { "epoch": 3.1699604743083003, "grad_norm": 3.5695722103118896, "learning_rate": 3.428571428571429e-05, "loss": 0.7785, "step": 802 }, { "epoch": 3.1739130434782608, "grad_norm": 3.318615436553955, "learning_rate": 3.426587301587302e-05, "loss": 0.8239, "step": 803 }, { "epoch": 3.177865612648221, "grad_norm": 3.4505763053894043, "learning_rate": 3.424603174603175e-05, "loss": 0.8528, "step": 804 }, { "epoch": 3.1818181818181817, "grad_norm": 3.004587411880493, "learning_rate": 3.422619047619048e-05, "loss": 1.123, "step": 805 }, { "epoch": 3.185770750988142, "grad_norm": 3.96537709236145, "learning_rate": 3.420634920634921e-05, "loss": 0.9566, "step": 806 }, { "epoch": 3.1897233201581026, "grad_norm": 3.715635061264038, "learning_rate": 3.418650793650794e-05, "loss": 0.8639, "step": 807 }, { "epoch": 3.1936758893280635, "grad_norm": 2.9800610542297363, "learning_rate": 3.4166666666666666e-05, "loss": 0.9108, "step": 808 }, { "epoch": 3.197628458498024, "grad_norm": 3.7264318466186523, "learning_rate": 3.41468253968254e-05, "loss": 1.0782, "step": 809 }, { "epoch": 3.2015810276679844, "grad_norm": 2.8650896549224854, "learning_rate": 3.412698412698413e-05, "loss": 0.7647, "step": 810 }, { "epoch": 3.205533596837945, "grad_norm": 4.19207763671875, "learning_rate": 3.410714285714286e-05, "loss": 1.0105, "step": 811 }, { "epoch": 3.2094861660079053, "grad_norm": 2.8715367317199707, "learning_rate": 3.408730158730159e-05, "loss": 1.0023, "step": 812 }, { "epoch": 3.2134387351778657, "grad_norm": 5.560529708862305, "learning_rate": 3.406746031746032e-05, "loss": 0.8081, "step": 813 }, { "epoch": 3.217391304347826, "grad_norm": 3.2364330291748047, "learning_rate": 3.404761904761905e-05, "loss": 0.8591, "step": 814 }, { "epoch": 3.2213438735177866, "grad_norm": 2.6285791397094727, "learning_rate": 3.402777777777778e-05, "loss": 1.0187, "step": 815 }, { "epoch": 3.225296442687747, "grad_norm": 3.7776012420654297, "learning_rate": 3.400793650793651e-05, "loss": 0.819, "step": 816 }, { "epoch": 3.2292490118577075, "grad_norm": 2.7896721363067627, "learning_rate": 3.398809523809524e-05, "loss": 0.975, "step": 817 }, { "epoch": 3.233201581027668, "grad_norm": 2.8350744247436523, "learning_rate": 3.396825396825397e-05, "loss": 1.3311, "step": 818 }, { "epoch": 3.2371541501976284, "grad_norm": 3.2776432037353516, "learning_rate": 3.3948412698412703e-05, "loss": 0.8914, "step": 819 }, { "epoch": 3.241106719367589, "grad_norm": 2.9362752437591553, "learning_rate": 3.392857142857143e-05, "loss": 1.0549, "step": 820 }, { "epoch": 3.2450592885375493, "grad_norm": 2.536288022994995, "learning_rate": 3.3908730158730164e-05, "loss": 0.8083, "step": 821 }, { "epoch": 3.2490118577075098, "grad_norm": 3.4325833320617676, "learning_rate": 3.388888888888889e-05, "loss": 0.9863, "step": 822 }, { "epoch": 3.2529644268774702, "grad_norm": 2.966909646987915, "learning_rate": 3.3869047619047625e-05, "loss": 0.9928, "step": 823 }, { "epoch": 3.2569169960474307, "grad_norm": 3.627739191055298, "learning_rate": 3.384920634920635e-05, "loss": 1.0572, "step": 824 }, { "epoch": 3.260869565217391, "grad_norm": 3.5908288955688477, "learning_rate": 3.3829365079365086e-05, "loss": 1.1389, "step": 825 }, { "epoch": 3.2648221343873516, "grad_norm": 2.7947778701782227, "learning_rate": 3.380952380952381e-05, "loss": 0.9508, "step": 826 }, { "epoch": 3.2687747035573125, "grad_norm": 2.9585814476013184, "learning_rate": 3.3789682539682546e-05, "loss": 0.8809, "step": 827 }, { "epoch": 3.2727272727272725, "grad_norm": 2.968035936355591, "learning_rate": 3.3769841269841273e-05, "loss": 1.0937, "step": 828 }, { "epoch": 3.2766798418972334, "grad_norm": 4.545178413391113, "learning_rate": 3.375000000000001e-05, "loss": 1.3998, "step": 829 }, { "epoch": 3.280632411067194, "grad_norm": 3.4058380126953125, "learning_rate": 3.3730158730158734e-05, "loss": 1.269, "step": 830 }, { "epoch": 3.2845849802371543, "grad_norm": 3.8424315452575684, "learning_rate": 3.371031746031746e-05, "loss": 0.7128, "step": 831 }, { "epoch": 3.2885375494071147, "grad_norm": 2.7813680171966553, "learning_rate": 3.3690476190476195e-05, "loss": 0.8108, "step": 832 }, { "epoch": 3.292490118577075, "grad_norm": 2.6684579849243164, "learning_rate": 3.367063492063492e-05, "loss": 0.8819, "step": 833 }, { "epoch": 3.2964426877470356, "grad_norm": 3.1254680156707764, "learning_rate": 3.3650793650793656e-05, "loss": 0.9083, "step": 834 }, { "epoch": 3.300395256916996, "grad_norm": 2.6201884746551514, "learning_rate": 3.363095238095238e-05, "loss": 0.751, "step": 835 }, { "epoch": 3.3043478260869565, "grad_norm": 2.976027250289917, "learning_rate": 3.3611111111111116e-05, "loss": 0.7687, "step": 836 }, { "epoch": 3.308300395256917, "grad_norm": 4.267848968505859, "learning_rate": 3.3591269841269843e-05, "loss": 0.8482, "step": 837 }, { "epoch": 3.3122529644268774, "grad_norm": 4.229562759399414, "learning_rate": 3.357142857142857e-05, "loss": 0.9975, "step": 838 }, { "epoch": 3.316205533596838, "grad_norm": 3.5600497722625732, "learning_rate": 3.35515873015873e-05, "loss": 1.197, "step": 839 }, { "epoch": 3.3201581027667983, "grad_norm": 3.5733120441436768, "learning_rate": 3.353174603174603e-05, "loss": 0.9723, "step": 840 }, { "epoch": 3.324110671936759, "grad_norm": 4.316638469696045, "learning_rate": 3.351190476190476e-05, "loss": 1.0709, "step": 841 }, { "epoch": 3.3280632411067192, "grad_norm": 2.8753249645233154, "learning_rate": 3.349206349206349e-05, "loss": 1.044, "step": 842 }, { "epoch": 3.3320158102766797, "grad_norm": 3.8951146602630615, "learning_rate": 3.347222222222222e-05, "loss": 1.2154, "step": 843 }, { "epoch": 3.33596837944664, "grad_norm": 2.5250723361968994, "learning_rate": 3.345238095238095e-05, "loss": 0.7862, "step": 844 }, { "epoch": 3.3399209486166006, "grad_norm": 2.821502447128296, "learning_rate": 3.343253968253968e-05, "loss": 1.0455, "step": 845 }, { "epoch": 3.3438735177865615, "grad_norm": 2.9029202461242676, "learning_rate": 3.3412698412698413e-05, "loss": 1.1058, "step": 846 }, { "epoch": 3.3478260869565215, "grad_norm": 2.247542381286621, "learning_rate": 3.339285714285714e-05, "loss": 0.9376, "step": 847 }, { "epoch": 3.3517786561264824, "grad_norm": 4.182517051696777, "learning_rate": 3.3373015873015874e-05, "loss": 0.7921, "step": 848 }, { "epoch": 3.355731225296443, "grad_norm": 2.482083797454834, "learning_rate": 3.33531746031746e-05, "loss": 0.9948, "step": 849 }, { "epoch": 3.3596837944664033, "grad_norm": 2.999898672103882, "learning_rate": 3.3333333333333335e-05, "loss": 0.8631, "step": 850 }, { "epoch": 3.3636363636363638, "grad_norm": 4.035496234893799, "learning_rate": 3.331349206349206e-05, "loss": 0.8496, "step": 851 }, { "epoch": 3.367588932806324, "grad_norm": 3.46382737159729, "learning_rate": 3.3293650793650796e-05, "loss": 0.7815, "step": 852 }, { "epoch": 3.3715415019762847, "grad_norm": 2.803903818130493, "learning_rate": 3.327380952380952e-05, "loss": 0.8237, "step": 853 }, { "epoch": 3.375494071146245, "grad_norm": 3.75998592376709, "learning_rate": 3.3253968253968256e-05, "loss": 1.183, "step": 854 }, { "epoch": 3.3794466403162056, "grad_norm": 3.8531124591827393, "learning_rate": 3.3234126984126983e-05, "loss": 1.105, "step": 855 }, { "epoch": 3.383399209486166, "grad_norm": 4.231212615966797, "learning_rate": 3.321428571428572e-05, "loss": 0.8909, "step": 856 }, { "epoch": 3.3873517786561265, "grad_norm": 3.3275704383850098, "learning_rate": 3.3194444444444444e-05, "loss": 0.8102, "step": 857 }, { "epoch": 3.391304347826087, "grad_norm": 3.103105306625366, "learning_rate": 3.317460317460318e-05, "loss": 0.6757, "step": 858 }, { "epoch": 3.3952569169960474, "grad_norm": 4.611384391784668, "learning_rate": 3.3154761904761905e-05, "loss": 1.113, "step": 859 }, { "epoch": 3.399209486166008, "grad_norm": 2.466308355331421, "learning_rate": 3.313492063492064e-05, "loss": 0.758, "step": 860 }, { "epoch": 3.4031620553359683, "grad_norm": 2.4484543800354004, "learning_rate": 3.3115079365079366e-05, "loss": 0.9824, "step": 861 }, { "epoch": 3.4071146245059287, "grad_norm": 2.8706154823303223, "learning_rate": 3.309523809523809e-05, "loss": 0.9201, "step": 862 }, { "epoch": 3.411067193675889, "grad_norm": 3.8132359981536865, "learning_rate": 3.3075396825396826e-05, "loss": 0.6403, "step": 863 }, { "epoch": 3.4150197628458496, "grad_norm": 3.109771251678467, "learning_rate": 3.3055555555555553e-05, "loss": 0.756, "step": 864 }, { "epoch": 3.4189723320158105, "grad_norm": 3.4301555156707764, "learning_rate": 3.303571428571429e-05, "loss": 1.0933, "step": 865 }, { "epoch": 3.4229249011857705, "grad_norm": 3.461763620376587, "learning_rate": 3.3015873015873014e-05, "loss": 1.2169, "step": 866 }, { "epoch": 3.4268774703557314, "grad_norm": 4.107053756713867, "learning_rate": 3.299603174603175e-05, "loss": 1.0824, "step": 867 }, { "epoch": 3.430830039525692, "grad_norm": 3.434462070465088, "learning_rate": 3.2976190476190475e-05, "loss": 1.0065, "step": 868 }, { "epoch": 3.4347826086956523, "grad_norm": 3.3090224266052246, "learning_rate": 3.295634920634921e-05, "loss": 0.9509, "step": 869 }, { "epoch": 3.438735177865613, "grad_norm": 3.5097386837005615, "learning_rate": 3.2936507936507936e-05, "loss": 0.8331, "step": 870 }, { "epoch": 3.4426877470355732, "grad_norm": 3.1316442489624023, "learning_rate": 3.291666666666667e-05, "loss": 0.8634, "step": 871 }, { "epoch": 3.4466403162055337, "grad_norm": 2.5135436058044434, "learning_rate": 3.2896825396825396e-05, "loss": 0.6734, "step": 872 }, { "epoch": 3.450592885375494, "grad_norm": 3.5739667415618896, "learning_rate": 3.287698412698413e-05, "loss": 0.8149, "step": 873 }, { "epoch": 3.4545454545454546, "grad_norm": 2.84611439704895, "learning_rate": 3.285714285714286e-05, "loss": 0.8658, "step": 874 }, { "epoch": 3.458498023715415, "grad_norm": 2.680215835571289, "learning_rate": 3.283730158730159e-05, "loss": 1.1295, "step": 875 }, { "epoch": 3.4624505928853755, "grad_norm": 3.028846263885498, "learning_rate": 3.281746031746032e-05, "loss": 1.0244, "step": 876 }, { "epoch": 3.466403162055336, "grad_norm": 2.5386555194854736, "learning_rate": 3.279761904761905e-05, "loss": 1.0598, "step": 877 }, { "epoch": 3.4703557312252964, "grad_norm": 2.7928504943847656, "learning_rate": 3.277777777777778e-05, "loss": 1.181, "step": 878 }, { "epoch": 3.474308300395257, "grad_norm": 3.864605665206909, "learning_rate": 3.275793650793651e-05, "loss": 0.9425, "step": 879 }, { "epoch": 3.4782608695652173, "grad_norm": 2.8199121952056885, "learning_rate": 3.273809523809524e-05, "loss": 1.0021, "step": 880 }, { "epoch": 3.4822134387351777, "grad_norm": 3.323481321334839, "learning_rate": 3.271825396825397e-05, "loss": 1.141, "step": 881 }, { "epoch": 3.486166007905138, "grad_norm": 3.6544790267944336, "learning_rate": 3.26984126984127e-05, "loss": 0.9603, "step": 882 }, { "epoch": 3.4901185770750986, "grad_norm": 3.930521011352539, "learning_rate": 3.2678571428571434e-05, "loss": 0.9759, "step": 883 }, { "epoch": 3.494071146245059, "grad_norm": 2.638461112976074, "learning_rate": 3.265873015873016e-05, "loss": 1.0179, "step": 884 }, { "epoch": 3.4980237154150196, "grad_norm": 3.1710309982299805, "learning_rate": 3.263888888888889e-05, "loss": 0.9966, "step": 885 }, { "epoch": 3.5019762845849804, "grad_norm": 4.251243591308594, "learning_rate": 3.261904761904762e-05, "loss": 0.9437, "step": 886 }, { "epoch": 3.5059288537549405, "grad_norm": 3.0481278896331787, "learning_rate": 3.259920634920635e-05, "loss": 0.99, "step": 887 }, { "epoch": 3.5098814229249014, "grad_norm": 2.843862295150757, "learning_rate": 3.257936507936508e-05, "loss": 1.057, "step": 888 }, { "epoch": 3.513833992094862, "grad_norm": 2.6303539276123047, "learning_rate": 3.255952380952381e-05, "loss": 1.0664, "step": 889 }, { "epoch": 3.5177865612648223, "grad_norm": 2.792405128479004, "learning_rate": 3.253968253968254e-05, "loss": 1.0506, "step": 890 }, { "epoch": 3.5217391304347827, "grad_norm": 5.069064140319824, "learning_rate": 3.251984126984127e-05, "loss": 0.815, "step": 891 }, { "epoch": 3.525691699604743, "grad_norm": 2.9551217555999756, "learning_rate": 3.2500000000000004e-05, "loss": 1.1707, "step": 892 }, { "epoch": 3.5296442687747036, "grad_norm": 3.7232065200805664, "learning_rate": 3.248015873015873e-05, "loss": 1.0063, "step": 893 }, { "epoch": 3.533596837944664, "grad_norm": 3.990532875061035, "learning_rate": 3.2460317460317465e-05, "loss": 0.8298, "step": 894 }, { "epoch": 3.5375494071146245, "grad_norm": 4.859503269195557, "learning_rate": 3.244047619047619e-05, "loss": 0.8827, "step": 895 }, { "epoch": 3.541501976284585, "grad_norm": 3.32344913482666, "learning_rate": 3.2420634920634925e-05, "loss": 0.8536, "step": 896 }, { "epoch": 3.5454545454545454, "grad_norm": 3.2624435424804688, "learning_rate": 3.240079365079365e-05, "loss": 1.2324, "step": 897 }, { "epoch": 3.549407114624506, "grad_norm": 4.429110050201416, "learning_rate": 3.2380952380952386e-05, "loss": 1.1154, "step": 898 }, { "epoch": 3.5533596837944663, "grad_norm": 3.8388755321502686, "learning_rate": 3.236111111111111e-05, "loss": 1.2359, "step": 899 }, { "epoch": 3.5573122529644268, "grad_norm": 3.2584800720214844, "learning_rate": 3.234126984126985e-05, "loss": 0.9469, "step": 900 }, { "epoch": 3.561264822134387, "grad_norm": 3.2762997150421143, "learning_rate": 3.2321428571428574e-05, "loss": 0.7991, "step": 901 }, { "epoch": 3.5652173913043477, "grad_norm": 3.214747905731201, "learning_rate": 3.230158730158731e-05, "loss": 1.2799, "step": 902 }, { "epoch": 3.5691699604743086, "grad_norm": 3.528118133544922, "learning_rate": 3.2281746031746035e-05, "loss": 0.9679, "step": 903 }, { "epoch": 3.5731225296442686, "grad_norm": 2.9673147201538086, "learning_rate": 3.226190476190477e-05, "loss": 1.0996, "step": 904 }, { "epoch": 3.5770750988142295, "grad_norm": 2.345867395401001, "learning_rate": 3.2242063492063495e-05, "loss": 0.8981, "step": 905 }, { "epoch": 3.5810276679841895, "grad_norm": 2.914339065551758, "learning_rate": 3.222222222222223e-05, "loss": 1.2681, "step": 906 }, { "epoch": 3.5849802371541504, "grad_norm": 2.6776278018951416, "learning_rate": 3.2202380952380956e-05, "loss": 0.9516, "step": 907 }, { "epoch": 3.588932806324111, "grad_norm": 3.282893180847168, "learning_rate": 3.218253968253968e-05, "loss": 1.1996, "step": 908 }, { "epoch": 3.5928853754940713, "grad_norm": 2.855419635772705, "learning_rate": 3.216269841269842e-05, "loss": 0.9228, "step": 909 }, { "epoch": 3.5968379446640317, "grad_norm": 3.372144937515259, "learning_rate": 3.2142857142857144e-05, "loss": 1.1176, "step": 910 }, { "epoch": 3.600790513833992, "grad_norm": 2.9246695041656494, "learning_rate": 3.212301587301588e-05, "loss": 0.8848, "step": 911 }, { "epoch": 3.6047430830039526, "grad_norm": 3.258700370788574, "learning_rate": 3.2103174603174605e-05, "loss": 1.1232, "step": 912 }, { "epoch": 3.608695652173913, "grad_norm": 3.726252794265747, "learning_rate": 3.208333333333334e-05, "loss": 0.9653, "step": 913 }, { "epoch": 3.6126482213438735, "grad_norm": 3.1601107120513916, "learning_rate": 3.2063492063492065e-05, "loss": 1.039, "step": 914 }, { "epoch": 3.616600790513834, "grad_norm": 3.5057191848754883, "learning_rate": 3.20436507936508e-05, "loss": 0.9399, "step": 915 }, { "epoch": 3.6205533596837944, "grad_norm": 3.562908411026001, "learning_rate": 3.202380952380952e-05, "loss": 1.0264, "step": 916 }, { "epoch": 3.624505928853755, "grad_norm": 3.443648099899292, "learning_rate": 3.200396825396825e-05, "loss": 0.8603, "step": 917 }, { "epoch": 3.6284584980237153, "grad_norm": 3.095889091491699, "learning_rate": 3.198412698412698e-05, "loss": 1.1861, "step": 918 }, { "epoch": 3.632411067193676, "grad_norm": 3.4708924293518066, "learning_rate": 3.1964285714285714e-05, "loss": 1.1105, "step": 919 }, { "epoch": 3.6363636363636362, "grad_norm": 2.894131660461426, "learning_rate": 3.194444444444444e-05, "loss": 1.07, "step": 920 }, { "epoch": 3.6403162055335967, "grad_norm": 4.143770694732666, "learning_rate": 3.1924603174603175e-05, "loss": 0.9666, "step": 921 }, { "epoch": 3.6442687747035576, "grad_norm": 2.9045121669769287, "learning_rate": 3.19047619047619e-05, "loss": 1.0069, "step": 922 }, { "epoch": 3.6482213438735176, "grad_norm": 2.5982000827789307, "learning_rate": 3.1884920634920635e-05, "loss": 1.2217, "step": 923 }, { "epoch": 3.6521739130434785, "grad_norm": 2.874178647994995, "learning_rate": 3.186507936507936e-05, "loss": 1.1494, "step": 924 }, { "epoch": 3.6561264822134385, "grad_norm": 3.3155198097229004, "learning_rate": 3.1845238095238096e-05, "loss": 1.1825, "step": 925 }, { "epoch": 3.6600790513833994, "grad_norm": 3.4435360431671143, "learning_rate": 3.182539682539682e-05, "loss": 0.7864, "step": 926 }, { "epoch": 3.66403162055336, "grad_norm": 3.355100393295288, "learning_rate": 3.180555555555556e-05, "loss": 0.8861, "step": 927 }, { "epoch": 3.6679841897233203, "grad_norm": 3.6266121864318848, "learning_rate": 3.1785714285714284e-05, "loss": 1.0996, "step": 928 }, { "epoch": 3.6719367588932808, "grad_norm": 3.77498459815979, "learning_rate": 3.176587301587302e-05, "loss": 0.7629, "step": 929 }, { "epoch": 3.675889328063241, "grad_norm": 3.8526268005371094, "learning_rate": 3.1746031746031745e-05, "loss": 1.1476, "step": 930 }, { "epoch": 3.6798418972332017, "grad_norm": 2.970158338546753, "learning_rate": 3.172619047619048e-05, "loss": 1.1171, "step": 931 }, { "epoch": 3.683794466403162, "grad_norm": 2.934819459915161, "learning_rate": 3.1706349206349205e-05, "loss": 0.964, "step": 932 }, { "epoch": 3.6877470355731226, "grad_norm": 3.0377979278564453, "learning_rate": 3.168650793650794e-05, "loss": 1.1309, "step": 933 }, { "epoch": 3.691699604743083, "grad_norm": 3.5154223442077637, "learning_rate": 3.1666666666666666e-05, "loss": 0.9907, "step": 934 }, { "epoch": 3.6956521739130435, "grad_norm": 3.415923833847046, "learning_rate": 3.16468253968254e-05, "loss": 0.9697, "step": 935 }, { "epoch": 3.699604743083004, "grad_norm": 3.0694198608398438, "learning_rate": 3.162698412698413e-05, "loss": 0.9632, "step": 936 }, { "epoch": 3.7035573122529644, "grad_norm": 2.25923490524292, "learning_rate": 3.160714285714286e-05, "loss": 0.9753, "step": 937 }, { "epoch": 3.707509881422925, "grad_norm": 4.5253190994262695, "learning_rate": 3.158730158730159e-05, "loss": 0.9479, "step": 938 }, { "epoch": 3.7114624505928853, "grad_norm": 2.794191598892212, "learning_rate": 3.1567460317460315e-05, "loss": 0.9286, "step": 939 }, { "epoch": 3.7154150197628457, "grad_norm": 3.3554208278656006, "learning_rate": 3.154761904761905e-05, "loss": 1.0205, "step": 940 }, { "epoch": 3.719367588932806, "grad_norm": 3.562750816345215, "learning_rate": 3.1527777777777775e-05, "loss": 0.7624, "step": 941 }, { "epoch": 3.7233201581027666, "grad_norm": 2.868648052215576, "learning_rate": 3.150793650793651e-05, "loss": 0.8968, "step": 942 }, { "epoch": 3.7272727272727275, "grad_norm": 3.128340721130371, "learning_rate": 3.1488095238095236e-05, "loss": 0.9745, "step": 943 }, { "epoch": 3.7312252964426875, "grad_norm": 3.5969157218933105, "learning_rate": 3.146825396825397e-05, "loss": 1.0403, "step": 944 }, { "epoch": 3.7351778656126484, "grad_norm": 3.2540881633758545, "learning_rate": 3.14484126984127e-05, "loss": 1.1106, "step": 945 }, { "epoch": 3.7391304347826084, "grad_norm": 2.8699069023132324, "learning_rate": 3.142857142857143e-05, "loss": 0.9645, "step": 946 }, { "epoch": 3.7430830039525693, "grad_norm": 3.2061824798583984, "learning_rate": 3.140873015873016e-05, "loss": 0.864, "step": 947 }, { "epoch": 3.7470355731225298, "grad_norm": 2.7899482250213623, "learning_rate": 3.138888888888889e-05, "loss": 1.1192, "step": 948 }, { "epoch": 3.7509881422924902, "grad_norm": 4.122161388397217, "learning_rate": 3.136904761904762e-05, "loss": 0.8513, "step": 949 }, { "epoch": 3.7549407114624507, "grad_norm": 2.6482725143432617, "learning_rate": 3.134920634920635e-05, "loss": 0.811, "step": 950 }, { "epoch": 3.758893280632411, "grad_norm": 2.7796828746795654, "learning_rate": 3.132936507936508e-05, "loss": 0.9356, "step": 951 }, { "epoch": 3.7628458498023716, "grad_norm": 3.4659996032714844, "learning_rate": 3.130952380952381e-05, "loss": 1.0248, "step": 952 }, { "epoch": 3.766798418972332, "grad_norm": 3.140477418899536, "learning_rate": 3.128968253968254e-05, "loss": 1.2593, "step": 953 }, { "epoch": 3.7707509881422925, "grad_norm": 3.8717284202575684, "learning_rate": 3.1269841269841274e-05, "loss": 1.0904, "step": 954 }, { "epoch": 3.774703557312253, "grad_norm": 2.737684488296509, "learning_rate": 3.125e-05, "loss": 0.8346, "step": 955 }, { "epoch": 3.7786561264822134, "grad_norm": 3.515249013900757, "learning_rate": 3.1230158730158734e-05, "loss": 1.1003, "step": 956 }, { "epoch": 3.782608695652174, "grad_norm": 3.8520002365112305, "learning_rate": 3.121031746031746e-05, "loss": 0.8998, "step": 957 }, { "epoch": 3.7865612648221343, "grad_norm": 3.644209384918213, "learning_rate": 3.1190476190476195e-05, "loss": 1.0063, "step": 958 }, { "epoch": 3.7905138339920947, "grad_norm": 3.2415544986724854, "learning_rate": 3.117063492063492e-05, "loss": 0.8328, "step": 959 }, { "epoch": 3.794466403162055, "grad_norm": 3.8536369800567627, "learning_rate": 3.1150793650793656e-05, "loss": 0.9605, "step": 960 }, { "epoch": 3.7984189723320156, "grad_norm": 2.8492162227630615, "learning_rate": 3.113095238095238e-05, "loss": 0.8059, "step": 961 }, { "epoch": 3.8023715415019765, "grad_norm": 3.2966291904449463, "learning_rate": 3.111111111111111e-05, "loss": 0.9345, "step": 962 }, { "epoch": 3.8063241106719365, "grad_norm": 3.0209009647369385, "learning_rate": 3.1091269841269844e-05, "loss": 0.9933, "step": 963 }, { "epoch": 3.8102766798418974, "grad_norm": 2.6817831993103027, "learning_rate": 3.107142857142857e-05, "loss": 0.9633, "step": 964 }, { "epoch": 3.8142292490118574, "grad_norm": 3.1547319889068604, "learning_rate": 3.1051587301587304e-05, "loss": 1.1855, "step": 965 }, { "epoch": 3.8181818181818183, "grad_norm": 3.485544443130493, "learning_rate": 3.103174603174603e-05, "loss": 1.1107, "step": 966 }, { "epoch": 3.822134387351779, "grad_norm": 3.0962092876434326, "learning_rate": 3.1011904761904765e-05, "loss": 1.2013, "step": 967 }, { "epoch": 3.8260869565217392, "grad_norm": 3.373776912689209, "learning_rate": 3.099206349206349e-05, "loss": 0.7553, "step": 968 }, { "epoch": 3.8300395256916997, "grad_norm": 3.174527883529663, "learning_rate": 3.0972222222222226e-05, "loss": 1.1787, "step": 969 }, { "epoch": 3.83399209486166, "grad_norm": 3.290992498397827, "learning_rate": 3.095238095238095e-05, "loss": 0.9103, "step": 970 }, { "epoch": 3.8379446640316206, "grad_norm": 3.8229081630706787, "learning_rate": 3.093253968253969e-05, "loss": 0.9309, "step": 971 }, { "epoch": 3.841897233201581, "grad_norm": 3.5504062175750732, "learning_rate": 3.0912698412698414e-05, "loss": 1.0507, "step": 972 }, { "epoch": 3.8458498023715415, "grad_norm": 3.4466750621795654, "learning_rate": 3.089285714285715e-05, "loss": 1.056, "step": 973 }, { "epoch": 3.849802371541502, "grad_norm": 4.083953857421875, "learning_rate": 3.0873015873015874e-05, "loss": 0.9524, "step": 974 }, { "epoch": 3.8537549407114624, "grad_norm": 4.008061408996582, "learning_rate": 3.085317460317461e-05, "loss": 1.0339, "step": 975 }, { "epoch": 3.857707509881423, "grad_norm": 2.528346300125122, "learning_rate": 3.0833333333333335e-05, "loss": 1.0405, "step": 976 }, { "epoch": 3.8616600790513833, "grad_norm": 2.7856969833374023, "learning_rate": 3.081349206349207e-05, "loss": 0.878, "step": 977 }, { "epoch": 3.8656126482213438, "grad_norm": 3.5250959396362305, "learning_rate": 3.0793650793650796e-05, "loss": 1.0533, "step": 978 }, { "epoch": 3.869565217391304, "grad_norm": 3.1857500076293945, "learning_rate": 3.077380952380953e-05, "loss": 1.1474, "step": 979 }, { "epoch": 3.8735177865612647, "grad_norm": 3.111074209213257, "learning_rate": 3.075396825396826e-05, "loss": 1.0187, "step": 980 }, { "epoch": 3.8774703557312256, "grad_norm": 3.149482250213623, "learning_rate": 3.073412698412699e-05, "loss": 0.8251, "step": 981 }, { "epoch": 3.8814229249011856, "grad_norm": 3.7290894985198975, "learning_rate": 3.071428571428572e-05, "loss": 0.8504, "step": 982 }, { "epoch": 3.8853754940711465, "grad_norm": 2.8733174800872803, "learning_rate": 3.069444444444445e-05, "loss": 0.7323, "step": 983 }, { "epoch": 3.8893280632411065, "grad_norm": 3.1158406734466553, "learning_rate": 3.067460317460318e-05, "loss": 0.8583, "step": 984 }, { "epoch": 3.8932806324110674, "grad_norm": 3.126250743865967, "learning_rate": 3.0654761904761905e-05, "loss": 1.0742, "step": 985 }, { "epoch": 3.897233201581028, "grad_norm": 4.984484672546387, "learning_rate": 3.063492063492064e-05, "loss": 1.0604, "step": 986 }, { "epoch": 3.9011857707509883, "grad_norm": 3.3782174587249756, "learning_rate": 3.0615079365079366e-05, "loss": 0.8303, "step": 987 }, { "epoch": 3.9051383399209487, "grad_norm": 3.0348994731903076, "learning_rate": 3.05952380952381e-05, "loss": 1.0685, "step": 988 }, { "epoch": 3.909090909090909, "grad_norm": 3.0283362865448, "learning_rate": 3.057539682539683e-05, "loss": 1.1548, "step": 989 }, { "epoch": 3.9130434782608696, "grad_norm": 3.938385009765625, "learning_rate": 3.055555555555556e-05, "loss": 0.7831, "step": 990 }, { "epoch": 3.91699604743083, "grad_norm": 3.3874075412750244, "learning_rate": 3.053571428571429e-05, "loss": 0.7838, "step": 991 }, { "epoch": 3.9209486166007905, "grad_norm": 3.09395432472229, "learning_rate": 3.051587301587302e-05, "loss": 0.7563, "step": 992 }, { "epoch": 3.924901185770751, "grad_norm": 3.3949618339538574, "learning_rate": 3.049603174603175e-05, "loss": 1.2432, "step": 993 }, { "epoch": 3.9288537549407114, "grad_norm": 3.91082501411438, "learning_rate": 3.0476190476190482e-05, "loss": 0.8746, "step": 994 }, { "epoch": 3.932806324110672, "grad_norm": 3.182863235473633, "learning_rate": 3.0456349206349206e-05, "loss": 0.9189, "step": 995 }, { "epoch": 3.9367588932806323, "grad_norm": 3.7233152389526367, "learning_rate": 3.0436507936507936e-05, "loss": 0.9538, "step": 996 }, { "epoch": 3.940711462450593, "grad_norm": 4.027876853942871, "learning_rate": 3.0416666666666666e-05, "loss": 0.8883, "step": 997 }, { "epoch": 3.9446640316205532, "grad_norm": 3.228454113006592, "learning_rate": 3.0396825396825397e-05, "loss": 0.9645, "step": 998 }, { "epoch": 3.9486166007905137, "grad_norm": 3.264758348464966, "learning_rate": 3.0376984126984127e-05, "loss": 0.9099, "step": 999 }, { "epoch": 3.9525691699604746, "grad_norm": 3.349977731704712, "learning_rate": 3.0357142857142857e-05, "loss": 0.731, "step": 1000 }, { "epoch": 3.9565217391304346, "grad_norm": 3.3020308017730713, "learning_rate": 3.0337301587301588e-05, "loss": 0.9018, "step": 1001 }, { "epoch": 3.9604743083003955, "grad_norm": 3.4217529296875, "learning_rate": 3.0317460317460318e-05, "loss": 1.1229, "step": 1002 }, { "epoch": 3.9644268774703555, "grad_norm": 3.2917253971099854, "learning_rate": 3.029761904761905e-05, "loss": 1.0188, "step": 1003 }, { "epoch": 3.9683794466403164, "grad_norm": 3.4840149879455566, "learning_rate": 3.0277777777777776e-05, "loss": 0.987, "step": 1004 }, { "epoch": 3.972332015810277, "grad_norm": 3.3869576454162598, "learning_rate": 3.0257936507936506e-05, "loss": 0.8482, "step": 1005 }, { "epoch": 3.9762845849802373, "grad_norm": 4.020000457763672, "learning_rate": 3.0238095238095236e-05, "loss": 0.9846, "step": 1006 }, { "epoch": 3.9802371541501977, "grad_norm": 3.028305768966675, "learning_rate": 3.0218253968253967e-05, "loss": 0.8622, "step": 1007 }, { "epoch": 3.984189723320158, "grad_norm": 2.8138604164123535, "learning_rate": 3.0198412698412697e-05, "loss": 1.2293, "step": 1008 }, { "epoch": 3.9881422924901186, "grad_norm": 3.196972370147705, "learning_rate": 3.0178571428571427e-05, "loss": 1.0762, "step": 1009 }, { "epoch": 3.992094861660079, "grad_norm": 4.044905185699463, "learning_rate": 3.0158730158730158e-05, "loss": 1.01, "step": 1010 }, { "epoch": 3.9960474308300395, "grad_norm": 2.731539011001587, "learning_rate": 3.0138888888888888e-05, "loss": 0.9882, "step": 1011 }, { "epoch": 4.0, "grad_norm": 2.787231683731079, "learning_rate": 3.011904761904762e-05, "loss": 0.7786, "step": 1012 }, { "epoch": 4.003952569169961, "grad_norm": 2.597419023513794, "learning_rate": 3.009920634920635e-05, "loss": 0.5683, "step": 1013 }, { "epoch": 4.007905138339921, "grad_norm": 3.2427003383636475, "learning_rate": 3.007936507936508e-05, "loss": 0.9022, "step": 1014 }, { "epoch": 4.011857707509882, "grad_norm": 2.5612714290618896, "learning_rate": 3.005952380952381e-05, "loss": 0.4494, "step": 1015 }, { "epoch": 4.015810276679842, "grad_norm": 3.5918335914611816, "learning_rate": 3.003968253968254e-05, "loss": 0.5691, "step": 1016 }, { "epoch": 4.019762845849803, "grad_norm": 4.1184892654418945, "learning_rate": 3.001984126984127e-05, "loss": 0.6702, "step": 1017 }, { "epoch": 4.023715415019763, "grad_norm": 4.948250770568848, "learning_rate": 3e-05, "loss": 0.7939, "step": 1018 }, { "epoch": 4.027667984189724, "grad_norm": 4.541839122772217, "learning_rate": 2.998015873015873e-05, "loss": 0.5846, "step": 1019 }, { "epoch": 4.031620553359684, "grad_norm": 6.365063190460205, "learning_rate": 2.996031746031746e-05, "loss": 1.0476, "step": 1020 }, { "epoch": 4.0355731225296445, "grad_norm": 5.4133992195129395, "learning_rate": 2.9940476190476192e-05, "loss": 0.5169, "step": 1021 }, { "epoch": 4.0395256916996045, "grad_norm": 5.79783821105957, "learning_rate": 2.9920634920634922e-05, "loss": 0.5218, "step": 1022 }, { "epoch": 4.043478260869565, "grad_norm": 3.8281731605529785, "learning_rate": 2.9900793650793653e-05, "loss": 0.5262, "step": 1023 }, { "epoch": 4.047430830039525, "grad_norm": 3.4406702518463135, "learning_rate": 2.9880952380952383e-05, "loss": 0.6195, "step": 1024 }, { "epoch": 4.051383399209486, "grad_norm": 2.6785409450531006, "learning_rate": 2.9861111111111113e-05, "loss": 0.6092, "step": 1025 }, { "epoch": 4.055335968379446, "grad_norm": 3.221266508102417, "learning_rate": 2.9841269841269844e-05, "loss": 0.5122, "step": 1026 }, { "epoch": 4.059288537549407, "grad_norm": 3.454709053039551, "learning_rate": 2.982142857142857e-05, "loss": 0.364, "step": 1027 }, { "epoch": 4.063241106719367, "grad_norm": 3.4353463649749756, "learning_rate": 2.98015873015873e-05, "loss": 0.4952, "step": 1028 }, { "epoch": 4.067193675889328, "grad_norm": 3.565638303756714, "learning_rate": 2.978174603174603e-05, "loss": 0.7452, "step": 1029 }, { "epoch": 4.071146245059288, "grad_norm": 4.197516918182373, "learning_rate": 2.9761904761904762e-05, "loss": 0.5709, "step": 1030 }, { "epoch": 4.075098814229249, "grad_norm": 3.723968744277954, "learning_rate": 2.9742063492063492e-05, "loss": 0.4972, "step": 1031 }, { "epoch": 4.07905138339921, "grad_norm": 3.2937045097351074, "learning_rate": 2.9722222222222223e-05, "loss": 0.5963, "step": 1032 }, { "epoch": 4.08300395256917, "grad_norm": 3.6638174057006836, "learning_rate": 2.9702380952380953e-05, "loss": 0.6339, "step": 1033 }, { "epoch": 4.086956521739131, "grad_norm": 3.8052680492401123, "learning_rate": 2.9682539682539683e-05, "loss": 0.4921, "step": 1034 }, { "epoch": 4.090909090909091, "grad_norm": 3.9429097175598145, "learning_rate": 2.9662698412698414e-05, "loss": 0.5685, "step": 1035 }, { "epoch": 4.094861660079052, "grad_norm": 4.403009414672852, "learning_rate": 2.9642857142857144e-05, "loss": 0.8721, "step": 1036 }, { "epoch": 4.098814229249012, "grad_norm": 3.999182939529419, "learning_rate": 2.9623015873015875e-05, "loss": 0.768, "step": 1037 }, { "epoch": 4.102766798418973, "grad_norm": 3.5793051719665527, "learning_rate": 2.9603174603174605e-05, "loss": 0.465, "step": 1038 }, { "epoch": 4.106719367588933, "grad_norm": 3.8434765338897705, "learning_rate": 2.9583333333333335e-05, "loss": 0.5206, "step": 1039 }, { "epoch": 4.1106719367588935, "grad_norm": 4.839815139770508, "learning_rate": 2.9563492063492066e-05, "loss": 0.4424, "step": 1040 }, { "epoch": 4.1146245059288535, "grad_norm": 4.106775283813477, "learning_rate": 2.9543650793650796e-05, "loss": 0.5181, "step": 1041 }, { "epoch": 4.118577075098814, "grad_norm": 3.0763916969299316, "learning_rate": 2.9523809523809526e-05, "loss": 0.5855, "step": 1042 }, { "epoch": 4.122529644268774, "grad_norm": 3.8926162719726562, "learning_rate": 2.9503968253968257e-05, "loss": 0.6347, "step": 1043 }, { "epoch": 4.126482213438735, "grad_norm": 4.27288293838501, "learning_rate": 2.9484126984126987e-05, "loss": 0.4295, "step": 1044 }, { "epoch": 4.130434782608695, "grad_norm": 2.7551262378692627, "learning_rate": 2.9464285714285718e-05, "loss": 0.733, "step": 1045 }, { "epoch": 4.134387351778656, "grad_norm": 3.992636203765869, "learning_rate": 2.9444444444444448e-05, "loss": 0.5781, "step": 1046 }, { "epoch": 4.138339920948616, "grad_norm": 5.2175469398498535, "learning_rate": 2.9424603174603178e-05, "loss": 0.5791, "step": 1047 }, { "epoch": 4.142292490118577, "grad_norm": 3.3808295726776123, "learning_rate": 2.940476190476191e-05, "loss": 0.5001, "step": 1048 }, { "epoch": 4.146245059288537, "grad_norm": 3.7828593254089355, "learning_rate": 2.938492063492064e-05, "loss": 0.5229, "step": 1049 }, { "epoch": 4.150197628458498, "grad_norm": 3.9474239349365234, "learning_rate": 2.9365079365079366e-05, "loss": 0.6843, "step": 1050 }, { "epoch": 4.154150197628459, "grad_norm": 3.5939300060272217, "learning_rate": 2.9345238095238096e-05, "loss": 0.5496, "step": 1051 }, { "epoch": 4.158102766798419, "grad_norm": 4.338398456573486, "learning_rate": 2.9325396825396827e-05, "loss": 0.5746, "step": 1052 }, { "epoch": 4.16205533596838, "grad_norm": 3.5342512130737305, "learning_rate": 2.9305555555555557e-05, "loss": 0.5925, "step": 1053 }, { "epoch": 4.16600790513834, "grad_norm": 3.970322608947754, "learning_rate": 2.9285714285714288e-05, "loss": 0.5174, "step": 1054 }, { "epoch": 4.169960474308301, "grad_norm": 3.5085713863372803, "learning_rate": 2.9265873015873018e-05, "loss": 0.5177, "step": 1055 }, { "epoch": 4.173913043478261, "grad_norm": 4.02853536605835, "learning_rate": 2.9246031746031748e-05, "loss": 0.58, "step": 1056 }, { "epoch": 4.177865612648222, "grad_norm": 3.9853994846343994, "learning_rate": 2.922619047619048e-05, "loss": 0.8849, "step": 1057 }, { "epoch": 4.181818181818182, "grad_norm": 3.8760087490081787, "learning_rate": 2.920634920634921e-05, "loss": 0.3916, "step": 1058 }, { "epoch": 4.1857707509881426, "grad_norm": 4.411335468292236, "learning_rate": 2.918650793650794e-05, "loss": 0.5398, "step": 1059 }, { "epoch": 4.189723320158103, "grad_norm": 2.8598296642303467, "learning_rate": 2.916666666666667e-05, "loss": 0.5319, "step": 1060 }, { "epoch": 4.1936758893280635, "grad_norm": 3.225334405899048, "learning_rate": 2.91468253968254e-05, "loss": 0.6801, "step": 1061 }, { "epoch": 4.1976284584980235, "grad_norm": 3.511744976043701, "learning_rate": 2.912698412698413e-05, "loss": 0.6467, "step": 1062 }, { "epoch": 4.201581027667984, "grad_norm": 5.1000165939331055, "learning_rate": 2.910714285714286e-05, "loss": 0.6979, "step": 1063 }, { "epoch": 4.205533596837944, "grad_norm": 4.431115627288818, "learning_rate": 2.908730158730159e-05, "loss": 0.7414, "step": 1064 }, { "epoch": 4.209486166007905, "grad_norm": 3.7748608589172363, "learning_rate": 2.906746031746032e-05, "loss": 0.5364, "step": 1065 }, { "epoch": 4.213438735177865, "grad_norm": 3.4133846759796143, "learning_rate": 2.9047619047619052e-05, "loss": 0.766, "step": 1066 }, { "epoch": 4.217391304347826, "grad_norm": 4.544011116027832, "learning_rate": 2.9027777777777782e-05, "loss": 0.6298, "step": 1067 }, { "epoch": 4.221343873517786, "grad_norm": 3.5001301765441895, "learning_rate": 2.9007936507936513e-05, "loss": 0.6486, "step": 1068 }, { "epoch": 4.225296442687747, "grad_norm": 4.15241003036499, "learning_rate": 2.8988095238095243e-05, "loss": 0.6689, "step": 1069 }, { "epoch": 4.229249011857707, "grad_norm": 4.282387733459473, "learning_rate": 2.8968253968253974e-05, "loss": 0.6515, "step": 1070 }, { "epoch": 4.233201581027668, "grad_norm": 3.4778153896331787, "learning_rate": 2.8948412698412704e-05, "loss": 0.6742, "step": 1071 }, { "epoch": 4.237154150197629, "grad_norm": 3.6013362407684326, "learning_rate": 2.8928571428571434e-05, "loss": 0.5932, "step": 1072 }, { "epoch": 4.241106719367589, "grad_norm": 3.0561881065368652, "learning_rate": 2.890873015873016e-05, "loss": 0.5181, "step": 1073 }, { "epoch": 4.24505928853755, "grad_norm": 3.436709403991699, "learning_rate": 2.8888888888888888e-05, "loss": 0.6392, "step": 1074 }, { "epoch": 4.24901185770751, "grad_norm": 3.798583984375, "learning_rate": 2.886904761904762e-05, "loss": 0.3719, "step": 1075 }, { "epoch": 4.252964426877471, "grad_norm": 4.340543746948242, "learning_rate": 2.884920634920635e-05, "loss": 0.7213, "step": 1076 }, { "epoch": 4.256916996047431, "grad_norm": 3.0846059322357178, "learning_rate": 2.882936507936508e-05, "loss": 0.6184, "step": 1077 }, { "epoch": 4.260869565217392, "grad_norm": 3.127023696899414, "learning_rate": 2.880952380952381e-05, "loss": 0.8165, "step": 1078 }, { "epoch": 4.264822134387352, "grad_norm": 4.435762405395508, "learning_rate": 2.878968253968254e-05, "loss": 0.8462, "step": 1079 }, { "epoch": 4.2687747035573125, "grad_norm": 3.7268333435058594, "learning_rate": 2.876984126984127e-05, "loss": 0.5501, "step": 1080 }, { "epoch": 4.2727272727272725, "grad_norm": 4.605154514312744, "learning_rate": 2.8749999999999997e-05, "loss": 0.4942, "step": 1081 }, { "epoch": 4.276679841897233, "grad_norm": 4.297336101531982, "learning_rate": 2.8730158730158728e-05, "loss": 0.7144, "step": 1082 }, { "epoch": 4.280632411067193, "grad_norm": 3.4597883224487305, "learning_rate": 2.8710317460317458e-05, "loss": 0.6571, "step": 1083 }, { "epoch": 4.284584980237154, "grad_norm": 3.579210042953491, "learning_rate": 2.869047619047619e-05, "loss": 0.8167, "step": 1084 }, { "epoch": 4.288537549407114, "grad_norm": 4.63372802734375, "learning_rate": 2.867063492063492e-05, "loss": 0.5739, "step": 1085 }, { "epoch": 4.292490118577075, "grad_norm": 4.892091751098633, "learning_rate": 2.865079365079365e-05, "loss": 0.7786, "step": 1086 }, { "epoch": 4.296442687747035, "grad_norm": 3.682457208633423, "learning_rate": 2.863095238095238e-05, "loss": 0.6753, "step": 1087 }, { "epoch": 4.300395256916996, "grad_norm": 4.4599432945251465, "learning_rate": 2.861111111111111e-05, "loss": 0.5335, "step": 1088 }, { "epoch": 4.304347826086957, "grad_norm": 4.139975070953369, "learning_rate": 2.859126984126984e-05, "loss": 0.631, "step": 1089 }, { "epoch": 4.308300395256917, "grad_norm": 2.9012610912323, "learning_rate": 2.857142857142857e-05, "loss": 0.6686, "step": 1090 }, { "epoch": 4.312252964426877, "grad_norm": 4.059675216674805, "learning_rate": 2.85515873015873e-05, "loss": 0.5449, "step": 1091 }, { "epoch": 4.316205533596838, "grad_norm": 3.3060367107391357, "learning_rate": 2.853174603174603e-05, "loss": 0.5399, "step": 1092 }, { "epoch": 4.320158102766799, "grad_norm": 4.22020149230957, "learning_rate": 2.8511904761904762e-05, "loss": 0.5664, "step": 1093 }, { "epoch": 4.324110671936759, "grad_norm": 3.2028045654296875, "learning_rate": 2.8492063492063492e-05, "loss": 0.5289, "step": 1094 }, { "epoch": 4.32806324110672, "grad_norm": 4.202157497406006, "learning_rate": 2.8472222222222223e-05, "loss": 0.6845, "step": 1095 }, { "epoch": 4.33201581027668, "grad_norm": 3.3916430473327637, "learning_rate": 2.8452380952380953e-05, "loss": 0.7733, "step": 1096 }, { "epoch": 4.335968379446641, "grad_norm": 3.6081862449645996, "learning_rate": 2.8432539682539683e-05, "loss": 0.7535, "step": 1097 }, { "epoch": 4.339920948616601, "grad_norm": 3.1682872772216797, "learning_rate": 2.8412698412698414e-05, "loss": 0.5639, "step": 1098 }, { "epoch": 4.3438735177865615, "grad_norm": 3.9137520790100098, "learning_rate": 2.8392857142857144e-05, "loss": 0.5085, "step": 1099 }, { "epoch": 4.3478260869565215, "grad_norm": 3.902578353881836, "learning_rate": 2.8373015873015875e-05, "loss": 0.7883, "step": 1100 }, { "epoch": 4.351778656126482, "grad_norm": 3.498415946960449, "learning_rate": 2.8353174603174605e-05, "loss": 0.7754, "step": 1101 }, { "epoch": 4.355731225296442, "grad_norm": 3.8628177642822266, "learning_rate": 2.8333333333333335e-05, "loss": 0.7349, "step": 1102 }, { "epoch": 4.359683794466403, "grad_norm": 3.9302592277526855, "learning_rate": 2.8313492063492066e-05, "loss": 0.5988, "step": 1103 }, { "epoch": 4.363636363636363, "grad_norm": 3.7080790996551514, "learning_rate": 2.8293650793650793e-05, "loss": 0.7116, "step": 1104 }, { "epoch": 4.367588932806324, "grad_norm": 3.222238779067993, "learning_rate": 2.8273809523809523e-05, "loss": 0.4634, "step": 1105 }, { "epoch": 4.371541501976284, "grad_norm": 3.111239194869995, "learning_rate": 2.8253968253968253e-05, "loss": 0.4384, "step": 1106 }, { "epoch": 4.375494071146245, "grad_norm": 3.0986177921295166, "learning_rate": 2.8234126984126984e-05, "loss": 0.7854, "step": 1107 }, { "epoch": 4.379446640316205, "grad_norm": 4.473976135253906, "learning_rate": 2.8214285714285714e-05, "loss": 0.6981, "step": 1108 }, { "epoch": 4.383399209486166, "grad_norm": 3.3967599868774414, "learning_rate": 2.8194444444444445e-05, "loss": 0.7922, "step": 1109 }, { "epoch": 4.387351778656127, "grad_norm": 5.092434406280518, "learning_rate": 2.8174603174603175e-05, "loss": 0.6217, "step": 1110 }, { "epoch": 4.391304347826087, "grad_norm": 6.223644256591797, "learning_rate": 2.8154761904761905e-05, "loss": 0.6414, "step": 1111 }, { "epoch": 4.395256916996048, "grad_norm": 4.24446439743042, "learning_rate": 2.8134920634920636e-05, "loss": 0.9485, "step": 1112 }, { "epoch": 4.399209486166008, "grad_norm": 4.5990986824035645, "learning_rate": 2.8115079365079366e-05, "loss": 0.7239, "step": 1113 }, { "epoch": 4.403162055335969, "grad_norm": 4.005640506744385, "learning_rate": 2.8095238095238096e-05, "loss": 0.9314, "step": 1114 }, { "epoch": 4.407114624505929, "grad_norm": 4.990167140960693, "learning_rate": 2.8075396825396827e-05, "loss": 0.6125, "step": 1115 }, { "epoch": 4.41106719367589, "grad_norm": 4.539425849914551, "learning_rate": 2.8055555555555557e-05, "loss": 0.4673, "step": 1116 }, { "epoch": 4.41501976284585, "grad_norm": 5.004159927368164, "learning_rate": 2.8035714285714288e-05, "loss": 0.6748, "step": 1117 }, { "epoch": 4.4189723320158105, "grad_norm": 5.46227502822876, "learning_rate": 2.8015873015873018e-05, "loss": 0.6671, "step": 1118 }, { "epoch": 4.4229249011857705, "grad_norm": 3.3494784832000732, "learning_rate": 2.799603174603175e-05, "loss": 0.5736, "step": 1119 }, { "epoch": 4.426877470355731, "grad_norm": 4.099065780639648, "learning_rate": 2.797619047619048e-05, "loss": 0.6317, "step": 1120 }, { "epoch": 4.430830039525691, "grad_norm": 3.662707805633545, "learning_rate": 2.795634920634921e-05, "loss": 0.6749, "step": 1121 }, { "epoch": 4.434782608695652, "grad_norm": 3.4078757762908936, "learning_rate": 2.793650793650794e-05, "loss": 0.723, "step": 1122 }, { "epoch": 4.438735177865612, "grad_norm": 3.4741852283477783, "learning_rate": 2.791666666666667e-05, "loss": 0.7619, "step": 1123 }, { "epoch": 4.442687747035573, "grad_norm": 4.361887454986572, "learning_rate": 2.78968253968254e-05, "loss": 0.7173, "step": 1124 }, { "epoch": 4.446640316205533, "grad_norm": 3.31022047996521, "learning_rate": 2.787698412698413e-05, "loss": 0.4076, "step": 1125 }, { "epoch": 4.450592885375494, "grad_norm": 3.635115623474121, "learning_rate": 2.785714285714286e-05, "loss": 0.5728, "step": 1126 }, { "epoch": 4.454545454545454, "grad_norm": 5.300922870635986, "learning_rate": 2.7837301587301588e-05, "loss": 0.8788, "step": 1127 }, { "epoch": 4.458498023715415, "grad_norm": 4.8898491859436035, "learning_rate": 2.781746031746032e-05, "loss": 0.6955, "step": 1128 }, { "epoch": 4.462450592885375, "grad_norm": 3.6756420135498047, "learning_rate": 2.779761904761905e-05, "loss": 0.6531, "step": 1129 }, { "epoch": 4.466403162055336, "grad_norm": 4.139333724975586, "learning_rate": 2.777777777777778e-05, "loss": 0.4227, "step": 1130 }, { "epoch": 4.470355731225297, "grad_norm": 4.150503158569336, "learning_rate": 2.775793650793651e-05, "loss": 0.5596, "step": 1131 }, { "epoch": 4.474308300395257, "grad_norm": 3.9440436363220215, "learning_rate": 2.773809523809524e-05, "loss": 0.5473, "step": 1132 }, { "epoch": 4.478260869565218, "grad_norm": 4.698122978210449, "learning_rate": 2.771825396825397e-05, "loss": 0.5331, "step": 1133 }, { "epoch": 4.482213438735178, "grad_norm": 4.4642133712768555, "learning_rate": 2.76984126984127e-05, "loss": 0.5267, "step": 1134 }, { "epoch": 4.486166007905139, "grad_norm": 3.5104897022247314, "learning_rate": 2.767857142857143e-05, "loss": 0.7138, "step": 1135 }, { "epoch": 4.490118577075099, "grad_norm": 4.170843601226807, "learning_rate": 2.765873015873016e-05, "loss": 0.8175, "step": 1136 }, { "epoch": 4.4940711462450595, "grad_norm": 3.3033299446105957, "learning_rate": 2.7638888888888892e-05, "loss": 0.5711, "step": 1137 }, { "epoch": 4.4980237154150196, "grad_norm": 4.042965412139893, "learning_rate": 2.7619047619047622e-05, "loss": 0.7944, "step": 1138 }, { "epoch": 4.5019762845849804, "grad_norm": 3.890293836593628, "learning_rate": 2.7599206349206352e-05, "loss": 0.7309, "step": 1139 }, { "epoch": 4.5059288537549405, "grad_norm": 3.5573506355285645, "learning_rate": 2.7579365079365083e-05, "loss": 0.2866, "step": 1140 }, { "epoch": 4.509881422924901, "grad_norm": 3.3874566555023193, "learning_rate": 2.7559523809523813e-05, "loss": 0.5363, "step": 1141 }, { "epoch": 4.513833992094861, "grad_norm": 3.777653217315674, "learning_rate": 2.7539682539682544e-05, "loss": 0.8195, "step": 1142 }, { "epoch": 4.517786561264822, "grad_norm": 3.0119121074676514, "learning_rate": 2.7519841269841274e-05, "loss": 0.4722, "step": 1143 }, { "epoch": 4.521739130434782, "grad_norm": 3.57424259185791, "learning_rate": 2.7500000000000004e-05, "loss": 0.6739, "step": 1144 }, { "epoch": 4.525691699604743, "grad_norm": 4.8777174949646, "learning_rate": 2.7480158730158735e-05, "loss": 0.5557, "step": 1145 }, { "epoch": 4.529644268774703, "grad_norm": 5.821610927581787, "learning_rate": 2.7460317460317465e-05, "loss": 0.7279, "step": 1146 }, { "epoch": 4.533596837944664, "grad_norm": 3.617403268814087, "learning_rate": 2.7440476190476195e-05, "loss": 0.7618, "step": 1147 }, { "epoch": 4.537549407114625, "grad_norm": 4.069850921630859, "learning_rate": 2.7420634920634926e-05, "loss": 0.6691, "step": 1148 }, { "epoch": 4.541501976284585, "grad_norm": 3.8116888999938965, "learning_rate": 2.7400793650793656e-05, "loss": 0.6847, "step": 1149 }, { "epoch": 4.545454545454545, "grad_norm": 4.080233573913574, "learning_rate": 2.7380952380952383e-05, "loss": 0.7181, "step": 1150 }, { "epoch": 4.549407114624506, "grad_norm": 3.930443048477173, "learning_rate": 2.7361111111111114e-05, "loss": 0.4972, "step": 1151 }, { "epoch": 4.553359683794467, "grad_norm": 3.8190906047821045, "learning_rate": 2.734126984126984e-05, "loss": 0.7407, "step": 1152 }, { "epoch": 4.557312252964427, "grad_norm": 4.792582988739014, "learning_rate": 2.732142857142857e-05, "loss": 0.5122, "step": 1153 }, { "epoch": 4.561264822134388, "grad_norm": 3.7988221645355225, "learning_rate": 2.73015873015873e-05, "loss": 0.7225, "step": 1154 }, { "epoch": 4.565217391304348, "grad_norm": 4.3564605712890625, "learning_rate": 2.7281746031746032e-05, "loss": 0.5053, "step": 1155 }, { "epoch": 4.569169960474309, "grad_norm": 3.3426055908203125, "learning_rate": 2.7261904761904762e-05, "loss": 0.5994, "step": 1156 }, { "epoch": 4.573122529644269, "grad_norm": 3.6366496086120605, "learning_rate": 2.7242063492063492e-05, "loss": 0.7183, "step": 1157 }, { "epoch": 4.5770750988142295, "grad_norm": 4.141812801361084, "learning_rate": 2.7222222222222223e-05, "loss": 0.8983, "step": 1158 }, { "epoch": 4.5810276679841895, "grad_norm": 3.6468398571014404, "learning_rate": 2.720238095238095e-05, "loss": 0.7188, "step": 1159 }, { "epoch": 4.58498023715415, "grad_norm": 3.9180054664611816, "learning_rate": 2.718253968253968e-05, "loss": 0.5211, "step": 1160 }, { "epoch": 4.58893280632411, "grad_norm": 3.7029590606689453, "learning_rate": 2.716269841269841e-05, "loss": 0.6044, "step": 1161 }, { "epoch": 4.592885375494071, "grad_norm": 4.28466272354126, "learning_rate": 2.714285714285714e-05, "loss": 0.6413, "step": 1162 }, { "epoch": 4.596837944664031, "grad_norm": 5.500331401824951, "learning_rate": 2.712301587301587e-05, "loss": 0.7009, "step": 1163 }, { "epoch": 4.600790513833992, "grad_norm": 4.083467960357666, "learning_rate": 2.7103174603174602e-05, "loss": 0.822, "step": 1164 }, { "epoch": 4.604743083003952, "grad_norm": 2.7674543857574463, "learning_rate": 2.7083333333333332e-05, "loss": 0.6164, "step": 1165 }, { "epoch": 4.608695652173913, "grad_norm": 3.0514588356018066, "learning_rate": 2.7063492063492062e-05, "loss": 0.4806, "step": 1166 }, { "epoch": 4.612648221343873, "grad_norm": 4.635437965393066, "learning_rate": 2.7043650793650793e-05, "loss": 0.5268, "step": 1167 }, { "epoch": 4.616600790513834, "grad_norm": 3.791935443878174, "learning_rate": 2.7023809523809523e-05, "loss": 0.6394, "step": 1168 }, { "epoch": 4.620553359683795, "grad_norm": 3.0412096977233887, "learning_rate": 2.7003968253968254e-05, "loss": 0.5388, "step": 1169 }, { "epoch": 4.624505928853755, "grad_norm": 3.3811123371124268, "learning_rate": 2.6984126984126984e-05, "loss": 0.5911, "step": 1170 }, { "epoch": 4.628458498023716, "grad_norm": 3.5328590869903564, "learning_rate": 2.6964285714285714e-05, "loss": 0.5826, "step": 1171 }, { "epoch": 4.632411067193676, "grad_norm": 3.483593702316284, "learning_rate": 2.6944444444444445e-05, "loss": 0.5478, "step": 1172 }, { "epoch": 4.636363636363637, "grad_norm": 4.339010238647461, "learning_rate": 2.6924603174603175e-05, "loss": 0.5807, "step": 1173 }, { "epoch": 4.640316205533597, "grad_norm": 3.775643825531006, "learning_rate": 2.6904761904761905e-05, "loss": 0.7021, "step": 1174 }, { "epoch": 4.644268774703558, "grad_norm": 4.458033561706543, "learning_rate": 2.6884920634920636e-05, "loss": 0.6987, "step": 1175 }, { "epoch": 4.648221343873518, "grad_norm": 4.082224369049072, "learning_rate": 2.6865079365079366e-05, "loss": 0.5305, "step": 1176 }, { "epoch": 4.6521739130434785, "grad_norm": 4.834577560424805, "learning_rate": 2.6845238095238097e-05, "loss": 0.5664, "step": 1177 }, { "epoch": 4.6561264822134385, "grad_norm": 4.269159317016602, "learning_rate": 2.6825396825396827e-05, "loss": 0.5704, "step": 1178 }, { "epoch": 4.660079051383399, "grad_norm": 3.2594568729400635, "learning_rate": 2.6805555555555557e-05, "loss": 0.5927, "step": 1179 }, { "epoch": 4.664031620553359, "grad_norm": 3.458214521408081, "learning_rate": 2.6785714285714288e-05, "loss": 0.5754, "step": 1180 }, { "epoch": 4.66798418972332, "grad_norm": 4.189966201782227, "learning_rate": 2.6765873015873018e-05, "loss": 0.8723, "step": 1181 }, { "epoch": 4.67193675889328, "grad_norm": 3.8360981941223145, "learning_rate": 2.6746031746031745e-05, "loss": 0.4775, "step": 1182 }, { "epoch": 4.675889328063241, "grad_norm": 4.381925582885742, "learning_rate": 2.6726190476190475e-05, "loss": 0.607, "step": 1183 }, { "epoch": 4.679841897233201, "grad_norm": 4.080252647399902, "learning_rate": 2.6706349206349206e-05, "loss": 0.8249, "step": 1184 }, { "epoch": 4.683794466403162, "grad_norm": 3.6359405517578125, "learning_rate": 2.6686507936507936e-05, "loss": 0.3814, "step": 1185 }, { "epoch": 4.687747035573123, "grad_norm": 3.182612895965576, "learning_rate": 2.6666666666666667e-05, "loss": 0.5989, "step": 1186 }, { "epoch": 4.691699604743083, "grad_norm": 4.81362247467041, "learning_rate": 2.6646825396825397e-05, "loss": 0.7298, "step": 1187 }, { "epoch": 4.695652173913043, "grad_norm": 3.2109711170196533, "learning_rate": 2.6626984126984127e-05, "loss": 0.3586, "step": 1188 }, { "epoch": 4.699604743083004, "grad_norm": 3.968430280685425, "learning_rate": 2.6607142857142858e-05, "loss": 0.5076, "step": 1189 }, { "epoch": 4.703557312252965, "grad_norm": 3.057274341583252, "learning_rate": 2.6587301587301588e-05, "loss": 0.4669, "step": 1190 }, { "epoch": 4.707509881422925, "grad_norm": 4.017573356628418, "learning_rate": 2.656746031746032e-05, "loss": 0.6747, "step": 1191 }, { "epoch": 4.711462450592886, "grad_norm": 3.6146085262298584, "learning_rate": 2.654761904761905e-05, "loss": 0.4618, "step": 1192 }, { "epoch": 4.715415019762846, "grad_norm": 3.433858871459961, "learning_rate": 2.652777777777778e-05, "loss": 0.4446, "step": 1193 }, { "epoch": 4.719367588932807, "grad_norm": 3.7666232585906982, "learning_rate": 2.650793650793651e-05, "loss": 0.5724, "step": 1194 }, { "epoch": 4.723320158102767, "grad_norm": 3.7725718021392822, "learning_rate": 2.648809523809524e-05, "loss": 0.7872, "step": 1195 }, { "epoch": 4.7272727272727275, "grad_norm": 4.2439446449279785, "learning_rate": 2.646825396825397e-05, "loss": 0.7158, "step": 1196 }, { "epoch": 4.7312252964426875, "grad_norm": 5.448159694671631, "learning_rate": 2.64484126984127e-05, "loss": 0.4721, "step": 1197 }, { "epoch": 4.735177865612648, "grad_norm": 4.358859062194824, "learning_rate": 2.642857142857143e-05, "loss": 0.6254, "step": 1198 }, { "epoch": 4.739130434782608, "grad_norm": 3.7496564388275146, "learning_rate": 2.640873015873016e-05, "loss": 0.7183, "step": 1199 }, { "epoch": 4.743083003952569, "grad_norm": 4.28209114074707, "learning_rate": 2.6388888888888892e-05, "loss": 0.5722, "step": 1200 }, { "epoch": 4.747035573122529, "grad_norm": 3.856718063354492, "learning_rate": 2.6369047619047622e-05, "loss": 0.5289, "step": 1201 }, { "epoch": 4.75098814229249, "grad_norm": 3.4435012340545654, "learning_rate": 2.6349206349206353e-05, "loss": 0.7542, "step": 1202 }, { "epoch": 4.75494071146245, "grad_norm": 4.4995436668396, "learning_rate": 2.6329365079365083e-05, "loss": 0.7093, "step": 1203 }, { "epoch": 4.758893280632411, "grad_norm": 3.929421901702881, "learning_rate": 2.6309523809523813e-05, "loss": 0.8889, "step": 1204 }, { "epoch": 4.762845849802371, "grad_norm": 3.778069496154785, "learning_rate": 2.628968253968254e-05, "loss": 0.766, "step": 1205 }, { "epoch": 4.766798418972332, "grad_norm": 3.344264030456543, "learning_rate": 2.626984126984127e-05, "loss": 0.5133, "step": 1206 }, { "epoch": 4.770750988142293, "grad_norm": 3.597881317138672, "learning_rate": 2.625e-05, "loss": 0.7008, "step": 1207 }, { "epoch": 4.774703557312253, "grad_norm": 3.753389358520508, "learning_rate": 2.623015873015873e-05, "loss": 0.5307, "step": 1208 }, { "epoch": 4.778656126482213, "grad_norm": 3.206299066543579, "learning_rate": 2.6210317460317462e-05, "loss": 0.5639, "step": 1209 }, { "epoch": 4.782608695652174, "grad_norm": 3.630187749862671, "learning_rate": 2.6190476190476192e-05, "loss": 0.4498, "step": 1210 }, { "epoch": 4.786561264822135, "grad_norm": 3.9658334255218506, "learning_rate": 2.6170634920634923e-05, "loss": 0.6379, "step": 1211 }, { "epoch": 4.790513833992095, "grad_norm": 4.101767539978027, "learning_rate": 2.6150793650793653e-05, "loss": 0.6574, "step": 1212 }, { "epoch": 4.794466403162056, "grad_norm": 3.9470412731170654, "learning_rate": 2.6130952380952383e-05, "loss": 0.5475, "step": 1213 }, { "epoch": 4.798418972332016, "grad_norm": 4.430981159210205, "learning_rate": 2.6111111111111114e-05, "loss": 0.4462, "step": 1214 }, { "epoch": 4.8023715415019765, "grad_norm": 5.369266033172607, "learning_rate": 2.6091269841269844e-05, "loss": 0.7793, "step": 1215 }, { "epoch": 4.8063241106719365, "grad_norm": 3.2877326011657715, "learning_rate": 2.6071428571428574e-05, "loss": 0.5941, "step": 1216 }, { "epoch": 4.810276679841897, "grad_norm": 4.576911449432373, "learning_rate": 2.6051587301587305e-05, "loss": 0.582, "step": 1217 }, { "epoch": 4.8142292490118574, "grad_norm": 3.5742104053497314, "learning_rate": 2.6031746031746035e-05, "loss": 0.6526, "step": 1218 }, { "epoch": 4.818181818181818, "grad_norm": 2.9173500537872314, "learning_rate": 2.6011904761904766e-05, "loss": 0.4235, "step": 1219 }, { "epoch": 4.822134387351778, "grad_norm": 3.32147479057312, "learning_rate": 2.5992063492063496e-05, "loss": 0.5129, "step": 1220 }, { "epoch": 4.826086956521739, "grad_norm": 3.451444387435913, "learning_rate": 2.5972222222222226e-05, "loss": 0.5899, "step": 1221 }, { "epoch": 4.830039525691699, "grad_norm": 4.481076717376709, "learning_rate": 2.5952380952380957e-05, "loss": 0.7092, "step": 1222 }, { "epoch": 4.83399209486166, "grad_norm": 5.769997596740723, "learning_rate": 2.5932539682539687e-05, "loss": 0.8478, "step": 1223 }, { "epoch": 4.837944664031621, "grad_norm": 4.319329738616943, "learning_rate": 2.5912698412698417e-05, "loss": 0.7591, "step": 1224 }, { "epoch": 4.841897233201581, "grad_norm": 3.661302328109741, "learning_rate": 2.5892857142857148e-05, "loss": 0.6261, "step": 1225 }, { "epoch": 4.845849802371541, "grad_norm": 3.7250123023986816, "learning_rate": 2.5873015873015878e-05, "loss": 0.8497, "step": 1226 }, { "epoch": 4.849802371541502, "grad_norm": 4.192583084106445, "learning_rate": 2.585317460317461e-05, "loss": 0.6283, "step": 1227 }, { "epoch": 4.853754940711463, "grad_norm": 3.978309154510498, "learning_rate": 2.5833333333333336e-05, "loss": 0.7002, "step": 1228 }, { "epoch": 4.857707509881423, "grad_norm": 3.473998546600342, "learning_rate": 2.5813492063492066e-05, "loss": 0.6287, "step": 1229 }, { "epoch": 4.861660079051384, "grad_norm": 3.9068286418914795, "learning_rate": 2.5793650793650796e-05, "loss": 0.4956, "step": 1230 }, { "epoch": 4.865612648221344, "grad_norm": 2.8710238933563232, "learning_rate": 2.5773809523809523e-05, "loss": 0.6042, "step": 1231 }, { "epoch": 4.869565217391305, "grad_norm": 3.4524970054626465, "learning_rate": 2.5753968253968254e-05, "loss": 0.4019, "step": 1232 }, { "epoch": 4.873517786561265, "grad_norm": 3.4277803897857666, "learning_rate": 2.5734126984126984e-05, "loss": 0.4765, "step": 1233 }, { "epoch": 4.877470355731226, "grad_norm": 3.466019868850708, "learning_rate": 2.5714285714285714e-05, "loss": 0.6861, "step": 1234 }, { "epoch": 4.881422924901186, "grad_norm": 4.309812068939209, "learning_rate": 2.5694444444444445e-05, "loss": 0.5055, "step": 1235 }, { "epoch": 4.8853754940711465, "grad_norm": 3.608254909515381, "learning_rate": 2.5674603174603172e-05, "loss": 0.5123, "step": 1236 }, { "epoch": 4.8893280632411065, "grad_norm": 4.4674787521362305, "learning_rate": 2.5654761904761902e-05, "loss": 0.8411, "step": 1237 }, { "epoch": 4.893280632411067, "grad_norm": 4.53634786605835, "learning_rate": 2.5634920634920633e-05, "loss": 0.5074, "step": 1238 }, { "epoch": 4.897233201581027, "grad_norm": 3.6150360107421875, "learning_rate": 2.5615079365079363e-05, "loss": 0.7875, "step": 1239 }, { "epoch": 4.901185770750988, "grad_norm": 4.096851348876953, "learning_rate": 2.5595238095238093e-05, "loss": 0.6804, "step": 1240 }, { "epoch": 4.905138339920948, "grad_norm": 3.8696417808532715, "learning_rate": 2.5575396825396824e-05, "loss": 0.6159, "step": 1241 }, { "epoch": 4.909090909090909, "grad_norm": 3.823349714279175, "learning_rate": 2.5555555555555554e-05, "loss": 0.6646, "step": 1242 }, { "epoch": 4.913043478260869, "grad_norm": 4.550106048583984, "learning_rate": 2.5535714285714284e-05, "loss": 0.6771, "step": 1243 }, { "epoch": 4.91699604743083, "grad_norm": 4.155416011810303, "learning_rate": 2.5515873015873015e-05, "loss": 0.6493, "step": 1244 }, { "epoch": 4.920948616600791, "grad_norm": 3.774624824523926, "learning_rate": 2.5496031746031745e-05, "loss": 0.6771, "step": 1245 }, { "epoch": 4.924901185770751, "grad_norm": 3.5552561283111572, "learning_rate": 2.5476190476190476e-05, "loss": 0.6626, "step": 1246 }, { "epoch": 4.928853754940711, "grad_norm": 3.7037765979766846, "learning_rate": 2.5456349206349206e-05, "loss": 0.5427, "step": 1247 }, { "epoch": 4.932806324110672, "grad_norm": 4.090572357177734, "learning_rate": 2.5436507936507936e-05, "loss": 0.7206, "step": 1248 }, { "epoch": 4.936758893280633, "grad_norm": 3.817936420440674, "learning_rate": 2.5416666666666667e-05, "loss": 0.4127, "step": 1249 }, { "epoch": 4.940711462450593, "grad_norm": 3.522655487060547, "learning_rate": 2.5396825396825397e-05, "loss": 0.6837, "step": 1250 }, { "epoch": 4.944664031620554, "grad_norm": 3.069960832595825, "learning_rate": 2.5376984126984127e-05, "loss": 0.5466, "step": 1251 }, { "epoch": 4.948616600790514, "grad_norm": 2.7633731365203857, "learning_rate": 2.5357142857142858e-05, "loss": 0.3194, "step": 1252 }, { "epoch": 4.952569169960475, "grad_norm": 4.602902889251709, "learning_rate": 2.5337301587301588e-05, "loss": 0.5539, "step": 1253 }, { "epoch": 4.956521739130435, "grad_norm": 4.454743385314941, "learning_rate": 2.531746031746032e-05, "loss": 0.5853, "step": 1254 }, { "epoch": 4.9604743083003955, "grad_norm": 3.8360097408294678, "learning_rate": 2.529761904761905e-05, "loss": 0.6157, "step": 1255 }, { "epoch": 4.9644268774703555, "grad_norm": 3.1883623600006104, "learning_rate": 2.527777777777778e-05, "loss": 0.6483, "step": 1256 }, { "epoch": 4.968379446640316, "grad_norm": 3.8525397777557373, "learning_rate": 2.525793650793651e-05, "loss": 0.4996, "step": 1257 }, { "epoch": 4.972332015810276, "grad_norm": 3.8935108184814453, "learning_rate": 2.523809523809524e-05, "loss": 0.5236, "step": 1258 }, { "epoch": 4.976284584980237, "grad_norm": 3.436164140701294, "learning_rate": 2.5218253968253967e-05, "loss": 0.6688, "step": 1259 }, { "epoch": 4.980237154150197, "grad_norm": 3.803886651992798, "learning_rate": 2.5198412698412697e-05, "loss": 0.4893, "step": 1260 }, { "epoch": 4.984189723320158, "grad_norm": 4.310186862945557, "learning_rate": 2.5178571428571428e-05, "loss": 0.3476, "step": 1261 }, { "epoch": 4.988142292490118, "grad_norm": 4.136441707611084, "learning_rate": 2.5158730158730158e-05, "loss": 0.6233, "step": 1262 }, { "epoch": 4.992094861660079, "grad_norm": 3.9039924144744873, "learning_rate": 2.513888888888889e-05, "loss": 0.6625, "step": 1263 }, { "epoch": 4.996047430830039, "grad_norm": 5.175858497619629, "learning_rate": 2.511904761904762e-05, "loss": 0.5538, "step": 1264 }, { "epoch": 5.0, "grad_norm": 4.218173503875732, "learning_rate": 2.509920634920635e-05, "loss": 0.7123, "step": 1265 }, { "epoch": 5.003952569169961, "grad_norm": 3.1178061962127686, "learning_rate": 2.507936507936508e-05, "loss": 0.4178, "step": 1266 }, { "epoch": 5.007905138339921, "grad_norm": 2.8245718479156494, "learning_rate": 2.505952380952381e-05, "loss": 0.2365, "step": 1267 }, { "epoch": 5.011857707509882, "grad_norm": 2.6951630115509033, "learning_rate": 2.503968253968254e-05, "loss": 0.4007, "step": 1268 }, { "epoch": 5.015810276679842, "grad_norm": 2.6765530109405518, "learning_rate": 2.501984126984127e-05, "loss": 0.2377, "step": 1269 }, { "epoch": 5.019762845849803, "grad_norm": 4.122332572937012, "learning_rate": 2.5e-05, "loss": 0.3555, "step": 1270 }, { "epoch": 5.023715415019763, "grad_norm": 4.3077712059021, "learning_rate": 2.498015873015873e-05, "loss": 0.2416, "step": 1271 }, { "epoch": 5.027667984189724, "grad_norm": 4.278382778167725, "learning_rate": 2.4960317460317462e-05, "loss": 0.2413, "step": 1272 }, { "epoch": 5.031620553359684, "grad_norm": 5.61036491394043, "learning_rate": 2.4940476190476192e-05, "loss": 0.4181, "step": 1273 }, { "epoch": 5.0355731225296445, "grad_norm": 5.716897487640381, "learning_rate": 2.4920634920634923e-05, "loss": 0.4539, "step": 1274 }, { "epoch": 5.0395256916996045, "grad_norm": 4.314607620239258, "learning_rate": 2.4900793650793653e-05, "loss": 0.3872, "step": 1275 }, { "epoch": 5.043478260869565, "grad_norm": 4.847557067871094, "learning_rate": 2.4880952380952383e-05, "loss": 0.4242, "step": 1276 }, { "epoch": 5.047430830039525, "grad_norm": 4.569025039672852, "learning_rate": 2.4861111111111114e-05, "loss": 0.3746, "step": 1277 }, { "epoch": 5.051383399209486, "grad_norm": 3.309478282928467, "learning_rate": 2.4841269841269844e-05, "loss": 0.263, "step": 1278 }, { "epoch": 5.055335968379446, "grad_norm": 3.7712574005126953, "learning_rate": 2.4821428571428575e-05, "loss": 0.3596, "step": 1279 }, { "epoch": 5.059288537549407, "grad_norm": 4.986356258392334, "learning_rate": 2.4801587301587305e-05, "loss": 0.324, "step": 1280 }, { "epoch": 5.063241106719367, "grad_norm": 3.729706048965454, "learning_rate": 2.4781746031746035e-05, "loss": 0.476, "step": 1281 }, { "epoch": 5.067193675889328, "grad_norm": 3.7136807441711426, "learning_rate": 2.4761904761904762e-05, "loss": 0.2756, "step": 1282 }, { "epoch": 5.071146245059288, "grad_norm": 2.8954904079437256, "learning_rate": 2.4742063492063493e-05, "loss": 0.3505, "step": 1283 }, { "epoch": 5.075098814229249, "grad_norm": 4.071838855743408, "learning_rate": 2.4722222222222223e-05, "loss": 0.447, "step": 1284 }, { "epoch": 5.07905138339921, "grad_norm": 3.134131908416748, "learning_rate": 2.4702380952380953e-05, "loss": 0.3407, "step": 1285 }, { "epoch": 5.08300395256917, "grad_norm": 3.690873861312866, "learning_rate": 2.4682539682539684e-05, "loss": 0.3665, "step": 1286 }, { "epoch": 5.086956521739131, "grad_norm": 4.1070051193237305, "learning_rate": 2.4662698412698414e-05, "loss": 0.3099, "step": 1287 }, { "epoch": 5.090909090909091, "grad_norm": 3.0999696254730225, "learning_rate": 2.4642857142857145e-05, "loss": 0.278, "step": 1288 }, { "epoch": 5.094861660079052, "grad_norm": 4.00131893157959, "learning_rate": 2.4623015873015875e-05, "loss": 0.4586, "step": 1289 }, { "epoch": 5.098814229249012, "grad_norm": 4.4364776611328125, "learning_rate": 2.4603174603174602e-05, "loss": 0.5131, "step": 1290 }, { "epoch": 5.102766798418973, "grad_norm": 4.154432773590088, "learning_rate": 2.4583333333333332e-05, "loss": 0.2289, "step": 1291 }, { "epoch": 5.106719367588933, "grad_norm": 4.669454574584961, "learning_rate": 2.4563492063492063e-05, "loss": 0.3081, "step": 1292 }, { "epoch": 5.1106719367588935, "grad_norm": 3.263782024383545, "learning_rate": 2.4543650793650793e-05, "loss": 0.2814, "step": 1293 }, { "epoch": 5.1146245059288535, "grad_norm": 4.402275085449219, "learning_rate": 2.4523809523809523e-05, "loss": 0.2797, "step": 1294 }, { "epoch": 5.118577075098814, "grad_norm": 3.0900776386260986, "learning_rate": 2.4503968253968254e-05, "loss": 0.2825, "step": 1295 }, { "epoch": 5.122529644268774, "grad_norm": 3.7776150703430176, "learning_rate": 2.4484126984126984e-05, "loss": 0.3906, "step": 1296 }, { "epoch": 5.126482213438735, "grad_norm": 3.909672975540161, "learning_rate": 2.4464285714285715e-05, "loss": 0.3245, "step": 1297 }, { "epoch": 5.130434782608695, "grad_norm": 3.4614851474761963, "learning_rate": 2.4444444444444445e-05, "loss": 0.364, "step": 1298 }, { "epoch": 5.134387351778656, "grad_norm": 3.2292017936706543, "learning_rate": 2.4424603174603175e-05, "loss": 0.3371, "step": 1299 }, { "epoch": 5.138339920948616, "grad_norm": 4.739649772644043, "learning_rate": 2.4404761904761906e-05, "loss": 0.3461, "step": 1300 }, { "epoch": 5.142292490118577, "grad_norm": 3.756049156188965, "learning_rate": 2.4384920634920636e-05, "loss": 0.3143, "step": 1301 }, { "epoch": 5.146245059288537, "grad_norm": 4.8067145347595215, "learning_rate": 2.4365079365079366e-05, "loss": 0.4448, "step": 1302 }, { "epoch": 5.150197628458498, "grad_norm": 3.8932623863220215, "learning_rate": 2.4345238095238097e-05, "loss": 0.2526, "step": 1303 }, { "epoch": 5.154150197628459, "grad_norm": 3.6755340099334717, "learning_rate": 2.4325396825396827e-05, "loss": 0.397, "step": 1304 }, { "epoch": 5.158102766798419, "grad_norm": 3.7219412326812744, "learning_rate": 2.4305555555555558e-05, "loss": 0.1992, "step": 1305 }, { "epoch": 5.16205533596838, "grad_norm": 3.744370698928833, "learning_rate": 2.4285714285714288e-05, "loss": 0.2522, "step": 1306 }, { "epoch": 5.16600790513834, "grad_norm": 3.2599740028381348, "learning_rate": 2.426587301587302e-05, "loss": 0.3581, "step": 1307 }, { "epoch": 5.169960474308301, "grad_norm": 3.48649525642395, "learning_rate": 2.424603174603175e-05, "loss": 0.3864, "step": 1308 }, { "epoch": 5.173913043478261, "grad_norm": 3.1932923793792725, "learning_rate": 2.4226190476190476e-05, "loss": 0.3054, "step": 1309 }, { "epoch": 5.177865612648222, "grad_norm": 4.015415191650391, "learning_rate": 2.4206349206349206e-05, "loss": 0.2925, "step": 1310 }, { "epoch": 5.181818181818182, "grad_norm": 3.9713363647460938, "learning_rate": 2.4186507936507936e-05, "loss": 0.2362, "step": 1311 }, { "epoch": 5.1857707509881426, "grad_norm": 4.697850704193115, "learning_rate": 2.4166666666666667e-05, "loss": 0.2583, "step": 1312 }, { "epoch": 5.189723320158103, "grad_norm": 3.067460298538208, "learning_rate": 2.4146825396825397e-05, "loss": 0.2453, "step": 1313 }, { "epoch": 5.1936758893280635, "grad_norm": 4.411377906799316, "learning_rate": 2.4126984126984128e-05, "loss": 0.1776, "step": 1314 }, { "epoch": 5.1976284584980235, "grad_norm": 3.420597791671753, "learning_rate": 2.4107142857142858e-05, "loss": 0.3165, "step": 1315 }, { "epoch": 5.201581027667984, "grad_norm": 4.028104305267334, "learning_rate": 2.408730158730159e-05, "loss": 0.4102, "step": 1316 }, { "epoch": 5.205533596837944, "grad_norm": 3.9082906246185303, "learning_rate": 2.406746031746032e-05, "loss": 0.3382, "step": 1317 }, { "epoch": 5.209486166007905, "grad_norm": 4.633881092071533, "learning_rate": 2.404761904761905e-05, "loss": 0.2822, "step": 1318 }, { "epoch": 5.213438735177865, "grad_norm": 3.6297354698181152, "learning_rate": 2.402777777777778e-05, "loss": 0.2072, "step": 1319 }, { "epoch": 5.217391304347826, "grad_norm": 3.1377158164978027, "learning_rate": 2.400793650793651e-05, "loss": 0.271, "step": 1320 }, { "epoch": 5.221343873517786, "grad_norm": 3.472032308578491, "learning_rate": 2.398809523809524e-05, "loss": 0.2801, "step": 1321 }, { "epoch": 5.225296442687747, "grad_norm": 3.7419073581695557, "learning_rate": 2.396825396825397e-05, "loss": 0.3191, "step": 1322 }, { "epoch": 5.229249011857707, "grad_norm": 3.5220224857330322, "learning_rate": 2.39484126984127e-05, "loss": 0.4082, "step": 1323 }, { "epoch": 5.233201581027668, "grad_norm": 3.874562978744507, "learning_rate": 2.392857142857143e-05, "loss": 0.3281, "step": 1324 }, { "epoch": 5.237154150197629, "grad_norm": 4.159897804260254, "learning_rate": 2.390873015873016e-05, "loss": 0.437, "step": 1325 }, { "epoch": 5.241106719367589, "grad_norm": 4.653357982635498, "learning_rate": 2.3888888888888892e-05, "loss": 0.4198, "step": 1326 }, { "epoch": 5.24505928853755, "grad_norm": 5.037525177001953, "learning_rate": 2.3869047619047622e-05, "loss": 0.4139, "step": 1327 }, { "epoch": 5.24901185770751, "grad_norm": 4.0110368728637695, "learning_rate": 2.3849206349206353e-05, "loss": 0.2786, "step": 1328 }, { "epoch": 5.252964426877471, "grad_norm": 3.9994056224823, "learning_rate": 2.3829365079365083e-05, "loss": 0.3709, "step": 1329 }, { "epoch": 5.256916996047431, "grad_norm": 3.936352014541626, "learning_rate": 2.380952380952381e-05, "loss": 0.3744, "step": 1330 }, { "epoch": 5.260869565217392, "grad_norm": 3.9300754070281982, "learning_rate": 2.378968253968254e-05, "loss": 0.5034, "step": 1331 }, { "epoch": 5.264822134387352, "grad_norm": 4.13817024230957, "learning_rate": 2.376984126984127e-05, "loss": 0.4842, "step": 1332 }, { "epoch": 5.2687747035573125, "grad_norm": 3.0476675033569336, "learning_rate": 2.375e-05, "loss": 0.2846, "step": 1333 }, { "epoch": 5.2727272727272725, "grad_norm": 4.827993869781494, "learning_rate": 2.373015873015873e-05, "loss": 0.4083, "step": 1334 }, { "epoch": 5.276679841897233, "grad_norm": 3.111452341079712, "learning_rate": 2.3710317460317462e-05, "loss": 0.3334, "step": 1335 }, { "epoch": 5.280632411067193, "grad_norm": 4.097339630126953, "learning_rate": 2.369047619047619e-05, "loss": 0.3375, "step": 1336 }, { "epoch": 5.284584980237154, "grad_norm": 4.1747822761535645, "learning_rate": 2.367063492063492e-05, "loss": 0.3817, "step": 1337 }, { "epoch": 5.288537549407114, "grad_norm": 3.4249610900878906, "learning_rate": 2.365079365079365e-05, "loss": 0.3121, "step": 1338 }, { "epoch": 5.292490118577075, "grad_norm": 3.0609562397003174, "learning_rate": 2.363095238095238e-05, "loss": 0.3, "step": 1339 }, { "epoch": 5.296442687747035, "grad_norm": 4.675295829772949, "learning_rate": 2.361111111111111e-05, "loss": 0.4734, "step": 1340 }, { "epoch": 5.300395256916996, "grad_norm": 4.56593656539917, "learning_rate": 2.359126984126984e-05, "loss": 0.386, "step": 1341 }, { "epoch": 5.304347826086957, "grad_norm": 5.29105806350708, "learning_rate": 2.357142857142857e-05, "loss": 0.3133, "step": 1342 }, { "epoch": 5.308300395256917, "grad_norm": 4.2684712409973145, "learning_rate": 2.35515873015873e-05, "loss": 0.4087, "step": 1343 }, { "epoch": 5.312252964426877, "grad_norm": 5.04403018951416, "learning_rate": 2.3531746031746032e-05, "loss": 0.4299, "step": 1344 }, { "epoch": 5.316205533596838, "grad_norm": 4.923758506774902, "learning_rate": 2.3511904761904762e-05, "loss": 0.3231, "step": 1345 }, { "epoch": 5.320158102766799, "grad_norm": 2.760050058364868, "learning_rate": 2.3492063492063493e-05, "loss": 0.2268, "step": 1346 }, { "epoch": 5.324110671936759, "grad_norm": 3.437049150466919, "learning_rate": 2.3472222222222223e-05, "loss": 0.391, "step": 1347 }, { "epoch": 5.32806324110672, "grad_norm": 4.473261833190918, "learning_rate": 2.3452380952380954e-05, "loss": 0.4342, "step": 1348 }, { "epoch": 5.33201581027668, "grad_norm": 4.289645195007324, "learning_rate": 2.3432539682539684e-05, "loss": 0.4789, "step": 1349 }, { "epoch": 5.335968379446641, "grad_norm": 3.814570426940918, "learning_rate": 2.3412698412698414e-05, "loss": 0.3432, "step": 1350 }, { "epoch": 5.339920948616601, "grad_norm": 3.5725395679473877, "learning_rate": 2.3392857142857145e-05, "loss": 0.2394, "step": 1351 }, { "epoch": 5.3438735177865615, "grad_norm": 3.9966726303100586, "learning_rate": 2.3373015873015875e-05, "loss": 0.3657, "step": 1352 }, { "epoch": 5.3478260869565215, "grad_norm": 3.7222578525543213, "learning_rate": 2.3353174603174605e-05, "loss": 0.2929, "step": 1353 }, { "epoch": 5.351778656126482, "grad_norm": 3.659536361694336, "learning_rate": 2.3333333333333336e-05, "loss": 0.5094, "step": 1354 }, { "epoch": 5.355731225296442, "grad_norm": 4.850214004516602, "learning_rate": 2.3313492063492066e-05, "loss": 0.4599, "step": 1355 }, { "epoch": 5.359683794466403, "grad_norm": 3.3052451610565186, "learning_rate": 2.3293650793650797e-05, "loss": 0.1597, "step": 1356 }, { "epoch": 5.363636363636363, "grad_norm": 3.6496050357818604, "learning_rate": 2.3273809523809527e-05, "loss": 0.3862, "step": 1357 }, { "epoch": 5.367588932806324, "grad_norm": 4.302274703979492, "learning_rate": 2.3253968253968257e-05, "loss": 0.2688, "step": 1358 }, { "epoch": 5.371541501976284, "grad_norm": 4.052597522735596, "learning_rate": 2.3234126984126984e-05, "loss": 0.328, "step": 1359 }, { "epoch": 5.375494071146245, "grad_norm": 4.061760425567627, "learning_rate": 2.3214285714285715e-05, "loss": 0.394, "step": 1360 }, { "epoch": 5.379446640316205, "grad_norm": 4.183040618896484, "learning_rate": 2.3194444444444445e-05, "loss": 0.4477, "step": 1361 }, { "epoch": 5.383399209486166, "grad_norm": 2.766695737838745, "learning_rate": 2.3174603174603175e-05, "loss": 0.1786, "step": 1362 }, { "epoch": 5.387351778656127, "grad_norm": 5.040890693664551, "learning_rate": 2.3154761904761906e-05, "loss": 0.4192, "step": 1363 }, { "epoch": 5.391304347826087, "grad_norm": 4.445863246917725, "learning_rate": 2.3134920634920636e-05, "loss": 0.5025, "step": 1364 }, { "epoch": 5.395256916996048, "grad_norm": 3.4918015003204346, "learning_rate": 2.3115079365079367e-05, "loss": 0.2392, "step": 1365 }, { "epoch": 5.399209486166008, "grad_norm": 3.366082191467285, "learning_rate": 2.3095238095238097e-05, "loss": 0.4595, "step": 1366 }, { "epoch": 5.403162055335969, "grad_norm": 3.521512508392334, "learning_rate": 2.3075396825396827e-05, "loss": 0.422, "step": 1367 }, { "epoch": 5.407114624505929, "grad_norm": 3.9632761478424072, "learning_rate": 2.3055555555555558e-05, "loss": 0.4579, "step": 1368 }, { "epoch": 5.41106719367589, "grad_norm": 3.9189956188201904, "learning_rate": 2.3035714285714285e-05, "loss": 0.3785, "step": 1369 }, { "epoch": 5.41501976284585, "grad_norm": 3.7866268157958984, "learning_rate": 2.3015873015873015e-05, "loss": 0.223, "step": 1370 }, { "epoch": 5.4189723320158105, "grad_norm": 3.4151864051818848, "learning_rate": 2.2996031746031745e-05, "loss": 0.1987, "step": 1371 }, { "epoch": 5.4229249011857705, "grad_norm": 3.8373048305511475, "learning_rate": 2.2976190476190476e-05, "loss": 0.3249, "step": 1372 }, { "epoch": 5.426877470355731, "grad_norm": 4.3949103355407715, "learning_rate": 2.2956349206349206e-05, "loss": 0.3704, "step": 1373 }, { "epoch": 5.430830039525691, "grad_norm": 3.6524100303649902, "learning_rate": 2.2936507936507937e-05, "loss": 0.4462, "step": 1374 }, { "epoch": 5.434782608695652, "grad_norm": 3.927030086517334, "learning_rate": 2.2916666666666667e-05, "loss": 0.3642, "step": 1375 }, { "epoch": 5.438735177865612, "grad_norm": 4.005973815917969, "learning_rate": 2.2896825396825397e-05, "loss": 0.4237, "step": 1376 }, { "epoch": 5.442687747035573, "grad_norm": 3.733210325241089, "learning_rate": 2.2876984126984128e-05, "loss": 0.3089, "step": 1377 }, { "epoch": 5.446640316205533, "grad_norm": 5.140311241149902, "learning_rate": 2.2857142857142858e-05, "loss": 0.4288, "step": 1378 }, { "epoch": 5.450592885375494, "grad_norm": 3.3404946327209473, "learning_rate": 2.283730158730159e-05, "loss": 0.2962, "step": 1379 }, { "epoch": 5.454545454545454, "grad_norm": 4.286531448364258, "learning_rate": 2.281746031746032e-05, "loss": 0.3931, "step": 1380 }, { "epoch": 5.458498023715415, "grad_norm": 3.68621826171875, "learning_rate": 2.279761904761905e-05, "loss": 0.2881, "step": 1381 }, { "epoch": 5.462450592885375, "grad_norm": 3.6594183444976807, "learning_rate": 2.277777777777778e-05, "loss": 0.293, "step": 1382 }, { "epoch": 5.466403162055336, "grad_norm": 3.7897143363952637, "learning_rate": 2.275793650793651e-05, "loss": 0.207, "step": 1383 }, { "epoch": 5.470355731225297, "grad_norm": 4.473965644836426, "learning_rate": 2.273809523809524e-05, "loss": 0.2935, "step": 1384 }, { "epoch": 5.474308300395257, "grad_norm": 3.8163721561431885, "learning_rate": 2.271825396825397e-05, "loss": 0.4574, "step": 1385 }, { "epoch": 5.478260869565218, "grad_norm": 3.263646125793457, "learning_rate": 2.2698412698412698e-05, "loss": 0.3372, "step": 1386 }, { "epoch": 5.482213438735178, "grad_norm": 4.062425136566162, "learning_rate": 2.2678571428571428e-05, "loss": 0.2997, "step": 1387 }, { "epoch": 5.486166007905139, "grad_norm": 3.691974401473999, "learning_rate": 2.265873015873016e-05, "loss": 0.3711, "step": 1388 }, { "epoch": 5.490118577075099, "grad_norm": 3.3711211681365967, "learning_rate": 2.263888888888889e-05, "loss": 0.293, "step": 1389 }, { "epoch": 5.4940711462450595, "grad_norm": 3.659691572189331, "learning_rate": 2.261904761904762e-05, "loss": 0.2519, "step": 1390 }, { "epoch": 5.4980237154150196, "grad_norm": 4.6761088371276855, "learning_rate": 2.259920634920635e-05, "loss": 0.3788, "step": 1391 }, { "epoch": 5.5019762845849804, "grad_norm": 4.013514995574951, "learning_rate": 2.257936507936508e-05, "loss": 0.3685, "step": 1392 }, { "epoch": 5.5059288537549405, "grad_norm": 3.272243022918701, "learning_rate": 2.255952380952381e-05, "loss": 0.2541, "step": 1393 }, { "epoch": 5.509881422924901, "grad_norm": 4.194965362548828, "learning_rate": 2.253968253968254e-05, "loss": 0.3032, "step": 1394 }, { "epoch": 5.513833992094861, "grad_norm": 5.20955228805542, "learning_rate": 2.251984126984127e-05, "loss": 0.4339, "step": 1395 }, { "epoch": 5.517786561264822, "grad_norm": 3.9413256645202637, "learning_rate": 2.25e-05, "loss": 0.2921, "step": 1396 }, { "epoch": 5.521739130434782, "grad_norm": 3.783141613006592, "learning_rate": 2.2480158730158732e-05, "loss": 0.2763, "step": 1397 }, { "epoch": 5.525691699604743, "grad_norm": 3.764054298400879, "learning_rate": 2.2460317460317462e-05, "loss": 0.4252, "step": 1398 }, { "epoch": 5.529644268774703, "grad_norm": 3.668379545211792, "learning_rate": 2.2440476190476193e-05, "loss": 0.3577, "step": 1399 }, { "epoch": 5.533596837944664, "grad_norm": 4.717470645904541, "learning_rate": 2.2420634920634923e-05, "loss": 0.2837, "step": 1400 }, { "epoch": 5.537549407114625, "grad_norm": 4.919825553894043, "learning_rate": 2.2400793650793653e-05, "loss": 0.4905, "step": 1401 }, { "epoch": 5.541501976284585, "grad_norm": 3.827908754348755, "learning_rate": 2.2380952380952384e-05, "loss": 0.219, "step": 1402 }, { "epoch": 5.545454545454545, "grad_norm": 3.300463914871216, "learning_rate": 2.2361111111111114e-05, "loss": 0.2114, "step": 1403 }, { "epoch": 5.549407114624506, "grad_norm": 4.180085182189941, "learning_rate": 2.2341269841269844e-05, "loss": 0.4172, "step": 1404 }, { "epoch": 5.553359683794467, "grad_norm": 3.7211532592773438, "learning_rate": 2.2321428571428575e-05, "loss": 0.3994, "step": 1405 }, { "epoch": 5.557312252964427, "grad_norm": 4.4185991287231445, "learning_rate": 2.2301587301587305e-05, "loss": 0.4922, "step": 1406 }, { "epoch": 5.561264822134388, "grad_norm": 5.1140456199646, "learning_rate": 2.2281746031746036e-05, "loss": 0.4038, "step": 1407 }, { "epoch": 5.565217391304348, "grad_norm": 3.7456719875335693, "learning_rate": 2.2261904761904763e-05, "loss": 0.2587, "step": 1408 }, { "epoch": 5.569169960474309, "grad_norm": 4.17236852645874, "learning_rate": 2.2242063492063493e-05, "loss": 0.3107, "step": 1409 }, { "epoch": 5.573122529644269, "grad_norm": 3.580836057662964, "learning_rate": 2.2222222222222223e-05, "loss": 0.3648, "step": 1410 }, { "epoch": 5.5770750988142295, "grad_norm": 4.266388893127441, "learning_rate": 2.2202380952380954e-05, "loss": 0.3403, "step": 1411 }, { "epoch": 5.5810276679841895, "grad_norm": 2.99090576171875, "learning_rate": 2.2182539682539684e-05, "loss": 0.3277, "step": 1412 }, { "epoch": 5.58498023715415, "grad_norm": 4.490349292755127, "learning_rate": 2.2162698412698414e-05, "loss": 0.4371, "step": 1413 }, { "epoch": 5.58893280632411, "grad_norm": 4.330888271331787, "learning_rate": 2.214285714285714e-05, "loss": 0.2821, "step": 1414 }, { "epoch": 5.592885375494071, "grad_norm": 3.7015597820281982, "learning_rate": 2.2123015873015872e-05, "loss": 0.2986, "step": 1415 }, { "epoch": 5.596837944664031, "grad_norm": 4.108189582824707, "learning_rate": 2.2103174603174602e-05, "loss": 0.4689, "step": 1416 }, { "epoch": 5.600790513833992, "grad_norm": 3.0757765769958496, "learning_rate": 2.2083333333333333e-05, "loss": 0.2335, "step": 1417 }, { "epoch": 5.604743083003952, "grad_norm": 3.065613031387329, "learning_rate": 2.2063492063492063e-05, "loss": 0.3694, "step": 1418 }, { "epoch": 5.608695652173913, "grad_norm": 4.539591312408447, "learning_rate": 2.2043650793650793e-05, "loss": 0.3448, "step": 1419 }, { "epoch": 5.612648221343873, "grad_norm": 3.5507845878601074, "learning_rate": 2.2023809523809524e-05, "loss": 0.4036, "step": 1420 }, { "epoch": 5.616600790513834, "grad_norm": 3.766512632369995, "learning_rate": 2.2003968253968254e-05, "loss": 0.3682, "step": 1421 }, { "epoch": 5.620553359683795, "grad_norm": 3.268449306488037, "learning_rate": 2.1984126984126984e-05, "loss": 0.2589, "step": 1422 }, { "epoch": 5.624505928853755, "grad_norm": 3.850033760070801, "learning_rate": 2.1964285714285715e-05, "loss": 0.2949, "step": 1423 }, { "epoch": 5.628458498023716, "grad_norm": 4.563751220703125, "learning_rate": 2.1944444444444445e-05, "loss": 0.387, "step": 1424 }, { "epoch": 5.632411067193676, "grad_norm": 3.109978199005127, "learning_rate": 2.1924603174603176e-05, "loss": 0.2175, "step": 1425 }, { "epoch": 5.636363636363637, "grad_norm": 4.669355869293213, "learning_rate": 2.1904761904761906e-05, "loss": 0.2422, "step": 1426 }, { "epoch": 5.640316205533597, "grad_norm": 3.95523738861084, "learning_rate": 2.1884920634920636e-05, "loss": 0.3867, "step": 1427 }, { "epoch": 5.644268774703558, "grad_norm": 3.9912211894989014, "learning_rate": 2.1865079365079367e-05, "loss": 0.2815, "step": 1428 }, { "epoch": 5.648221343873518, "grad_norm": 3.4305765628814697, "learning_rate": 2.1845238095238097e-05, "loss": 0.263, "step": 1429 }, { "epoch": 5.6521739130434785, "grad_norm": 4.1827216148376465, "learning_rate": 2.1825396825396827e-05, "loss": 0.292, "step": 1430 }, { "epoch": 5.6561264822134385, "grad_norm": 3.6767704486846924, "learning_rate": 2.1805555555555558e-05, "loss": 0.2732, "step": 1431 }, { "epoch": 5.660079051383399, "grad_norm": 4.459062099456787, "learning_rate": 2.1785714285714288e-05, "loss": 0.3492, "step": 1432 }, { "epoch": 5.664031620553359, "grad_norm": 3.5659234523773193, "learning_rate": 2.176587301587302e-05, "loss": 0.2712, "step": 1433 }, { "epoch": 5.66798418972332, "grad_norm": 4.487940788269043, "learning_rate": 2.174603174603175e-05, "loss": 0.2831, "step": 1434 }, { "epoch": 5.67193675889328, "grad_norm": 5.612968921661377, "learning_rate": 2.172619047619048e-05, "loss": 0.5013, "step": 1435 }, { "epoch": 5.675889328063241, "grad_norm": 5.958313465118408, "learning_rate": 2.170634920634921e-05, "loss": 0.4195, "step": 1436 }, { "epoch": 5.679841897233201, "grad_norm": 4.63142728805542, "learning_rate": 2.1686507936507937e-05, "loss": 0.421, "step": 1437 }, { "epoch": 5.683794466403162, "grad_norm": 3.549577236175537, "learning_rate": 2.1666666666666667e-05, "loss": 0.3966, "step": 1438 }, { "epoch": 5.687747035573123, "grad_norm": 3.38968825340271, "learning_rate": 2.1646825396825397e-05, "loss": 0.3352, "step": 1439 }, { "epoch": 5.691699604743083, "grad_norm": 3.199084520339966, "learning_rate": 2.1626984126984128e-05, "loss": 0.2996, "step": 1440 }, { "epoch": 5.695652173913043, "grad_norm": 3.6920907497406006, "learning_rate": 2.1607142857142858e-05, "loss": 0.329, "step": 1441 }, { "epoch": 5.699604743083004, "grad_norm": 3.676727533340454, "learning_rate": 2.158730158730159e-05, "loss": 0.5122, "step": 1442 }, { "epoch": 5.703557312252965, "grad_norm": 3.520972728729248, "learning_rate": 2.156746031746032e-05, "loss": 0.2829, "step": 1443 }, { "epoch": 5.707509881422925, "grad_norm": 4.055224895477295, "learning_rate": 2.154761904761905e-05, "loss": 0.3474, "step": 1444 }, { "epoch": 5.711462450592886, "grad_norm": 4.417123317718506, "learning_rate": 2.152777777777778e-05, "loss": 0.3876, "step": 1445 }, { "epoch": 5.715415019762846, "grad_norm": 3.9218266010284424, "learning_rate": 2.150793650793651e-05, "loss": 0.4128, "step": 1446 }, { "epoch": 5.719367588932807, "grad_norm": 4.506389141082764, "learning_rate": 2.148809523809524e-05, "loss": 0.4672, "step": 1447 }, { "epoch": 5.723320158102767, "grad_norm": 3.930652141571045, "learning_rate": 2.1468253968253967e-05, "loss": 0.2926, "step": 1448 }, { "epoch": 5.7272727272727275, "grad_norm": 4.012630939483643, "learning_rate": 2.1448412698412698e-05, "loss": 0.4217, "step": 1449 }, { "epoch": 5.7312252964426875, "grad_norm": 4.338024139404297, "learning_rate": 2.1428571428571428e-05, "loss": 0.2772, "step": 1450 }, { "epoch": 5.735177865612648, "grad_norm": 3.886202096939087, "learning_rate": 2.140873015873016e-05, "loss": 0.3482, "step": 1451 }, { "epoch": 5.739130434782608, "grad_norm": 4.083372116088867, "learning_rate": 2.138888888888889e-05, "loss": 0.2735, "step": 1452 }, { "epoch": 5.743083003952569, "grad_norm": 3.7557191848754883, "learning_rate": 2.136904761904762e-05, "loss": 0.4228, "step": 1453 }, { "epoch": 5.747035573122529, "grad_norm": 3.172152519226074, "learning_rate": 2.134920634920635e-05, "loss": 0.3562, "step": 1454 }, { "epoch": 5.75098814229249, "grad_norm": 3.7699217796325684, "learning_rate": 2.132936507936508e-05, "loss": 0.2838, "step": 1455 }, { "epoch": 5.75494071146245, "grad_norm": 3.8559303283691406, "learning_rate": 2.130952380952381e-05, "loss": 0.5692, "step": 1456 }, { "epoch": 5.758893280632411, "grad_norm": 5.278477191925049, "learning_rate": 2.128968253968254e-05, "loss": 0.5161, "step": 1457 }, { "epoch": 5.762845849802371, "grad_norm": 4.712038993835449, "learning_rate": 2.126984126984127e-05, "loss": 0.473, "step": 1458 }, { "epoch": 5.766798418972332, "grad_norm": 3.714198112487793, "learning_rate": 2.125e-05, "loss": 0.2426, "step": 1459 }, { "epoch": 5.770750988142293, "grad_norm": 3.246757745742798, "learning_rate": 2.1230158730158732e-05, "loss": 0.4447, "step": 1460 }, { "epoch": 5.774703557312253, "grad_norm": 4.785327434539795, "learning_rate": 2.1210317460317462e-05, "loss": 0.3517, "step": 1461 }, { "epoch": 5.778656126482213, "grad_norm": 4.272199630737305, "learning_rate": 2.1190476190476193e-05, "loss": 0.329, "step": 1462 }, { "epoch": 5.782608695652174, "grad_norm": 4.346473217010498, "learning_rate": 2.1170634920634923e-05, "loss": 0.4102, "step": 1463 }, { "epoch": 5.786561264822135, "grad_norm": 3.4643688201904297, "learning_rate": 2.115079365079365e-05, "loss": 0.2851, "step": 1464 }, { "epoch": 5.790513833992095, "grad_norm": 4.473227024078369, "learning_rate": 2.113095238095238e-05, "loss": 0.3341, "step": 1465 }, { "epoch": 5.794466403162056, "grad_norm": 3.813743829727173, "learning_rate": 2.111111111111111e-05, "loss": 0.4491, "step": 1466 }, { "epoch": 5.798418972332016, "grad_norm": 4.01721715927124, "learning_rate": 2.109126984126984e-05, "loss": 0.1942, "step": 1467 }, { "epoch": 5.8023715415019765, "grad_norm": 3.757263422012329, "learning_rate": 2.107142857142857e-05, "loss": 0.3666, "step": 1468 }, { "epoch": 5.8063241106719365, "grad_norm": 3.4394917488098145, "learning_rate": 2.1051587301587302e-05, "loss": 0.3211, "step": 1469 }, { "epoch": 5.810276679841897, "grad_norm": 5.106522083282471, "learning_rate": 2.1031746031746032e-05, "loss": 0.3757, "step": 1470 }, { "epoch": 5.8142292490118574, "grad_norm": 4.171218395233154, "learning_rate": 2.1011904761904763e-05, "loss": 0.4219, "step": 1471 }, { "epoch": 5.818181818181818, "grad_norm": 4.083078384399414, "learning_rate": 2.0992063492063493e-05, "loss": 0.3451, "step": 1472 }, { "epoch": 5.822134387351778, "grad_norm": 4.33717679977417, "learning_rate": 2.0972222222222223e-05, "loss": 0.5292, "step": 1473 }, { "epoch": 5.826086956521739, "grad_norm": 4.866550445556641, "learning_rate": 2.0952380952380954e-05, "loss": 0.3334, "step": 1474 }, { "epoch": 5.830039525691699, "grad_norm": 3.197094202041626, "learning_rate": 2.0932539682539684e-05, "loss": 0.2206, "step": 1475 }, { "epoch": 5.83399209486166, "grad_norm": 3.3910369873046875, "learning_rate": 2.0912698412698415e-05, "loss": 0.1893, "step": 1476 }, { "epoch": 5.837944664031621, "grad_norm": 4.107388496398926, "learning_rate": 2.0892857142857145e-05, "loss": 0.4357, "step": 1477 }, { "epoch": 5.841897233201581, "grad_norm": 2.6555662155151367, "learning_rate": 2.0873015873015875e-05, "loss": 0.2589, "step": 1478 }, { "epoch": 5.845849802371541, "grad_norm": 3.3115289211273193, "learning_rate": 2.0853174603174606e-05, "loss": 0.3587, "step": 1479 }, { "epoch": 5.849802371541502, "grad_norm": 3.604375123977661, "learning_rate": 2.0833333333333336e-05, "loss": 0.273, "step": 1480 }, { "epoch": 5.853754940711463, "grad_norm": 4.312830448150635, "learning_rate": 2.0813492063492066e-05, "loss": 0.2748, "step": 1481 }, { "epoch": 5.857707509881423, "grad_norm": 4.150437831878662, "learning_rate": 2.0793650793650797e-05, "loss": 0.3099, "step": 1482 }, { "epoch": 5.861660079051384, "grad_norm": 4.916041851043701, "learning_rate": 2.0773809523809527e-05, "loss": 0.3229, "step": 1483 }, { "epoch": 5.865612648221344, "grad_norm": 4.0450663566589355, "learning_rate": 2.0753968253968258e-05, "loss": 0.3658, "step": 1484 }, { "epoch": 5.869565217391305, "grad_norm": 3.5350306034088135, "learning_rate": 2.0734126984126988e-05, "loss": 0.3391, "step": 1485 }, { "epoch": 5.873517786561265, "grad_norm": 3.364154100418091, "learning_rate": 2.0714285714285718e-05, "loss": 0.1964, "step": 1486 }, { "epoch": 5.877470355731226, "grad_norm": 5.209838390350342, "learning_rate": 2.0694444444444445e-05, "loss": 0.3585, "step": 1487 }, { "epoch": 5.881422924901186, "grad_norm": 3.9065868854522705, "learning_rate": 2.0674603174603176e-05, "loss": 0.2676, "step": 1488 }, { "epoch": 5.8853754940711465, "grad_norm": 4.029334545135498, "learning_rate": 2.0654761904761906e-05, "loss": 0.2837, "step": 1489 }, { "epoch": 5.8893280632411065, "grad_norm": 4.0023112297058105, "learning_rate": 2.0634920634920636e-05, "loss": 0.4846, "step": 1490 }, { "epoch": 5.893280632411067, "grad_norm": 4.869349479675293, "learning_rate": 2.0615079365079363e-05, "loss": 0.3852, "step": 1491 }, { "epoch": 5.897233201581027, "grad_norm": 4.35573148727417, "learning_rate": 2.0595238095238094e-05, "loss": 0.2365, "step": 1492 }, { "epoch": 5.901185770750988, "grad_norm": 4.35988187789917, "learning_rate": 2.0575396825396824e-05, "loss": 0.6209, "step": 1493 }, { "epoch": 5.905138339920948, "grad_norm": 3.8130955696105957, "learning_rate": 2.0555555555555555e-05, "loss": 0.3111, "step": 1494 }, { "epoch": 5.909090909090909, "grad_norm": 3.3116374015808105, "learning_rate": 2.0535714285714285e-05, "loss": 0.261, "step": 1495 }, { "epoch": 5.913043478260869, "grad_norm": 4.634829521179199, "learning_rate": 2.0515873015873015e-05, "loss": 0.3943, "step": 1496 }, { "epoch": 5.91699604743083, "grad_norm": 3.972263813018799, "learning_rate": 2.0496031746031746e-05, "loss": 0.3012, "step": 1497 }, { "epoch": 5.920948616600791, "grad_norm": 4.382541656494141, "learning_rate": 2.0476190476190476e-05, "loss": 0.2957, "step": 1498 }, { "epoch": 5.924901185770751, "grad_norm": 4.666507720947266, "learning_rate": 2.0456349206349206e-05, "loss": 0.4242, "step": 1499 }, { "epoch": 5.928853754940711, "grad_norm": 3.7338948249816895, "learning_rate": 2.0436507936507937e-05, "loss": 0.2565, "step": 1500 }, { "epoch": 5.932806324110672, "grad_norm": 3.5870771408081055, "learning_rate": 2.0416666666666667e-05, "loss": 0.4244, "step": 1501 }, { "epoch": 5.936758893280633, "grad_norm": 4.410994052886963, "learning_rate": 2.0396825396825398e-05, "loss": 0.3985, "step": 1502 }, { "epoch": 5.940711462450593, "grad_norm": 5.01856803894043, "learning_rate": 2.0376984126984128e-05, "loss": 0.354, "step": 1503 }, { "epoch": 5.944664031620554, "grad_norm": 3.5459818840026855, "learning_rate": 2.0357142857142858e-05, "loss": 0.3448, "step": 1504 }, { "epoch": 5.948616600790514, "grad_norm": 4.243325233459473, "learning_rate": 2.033730158730159e-05, "loss": 0.4499, "step": 1505 }, { "epoch": 5.952569169960475, "grad_norm": 4.445835113525391, "learning_rate": 2.031746031746032e-05, "loss": 0.4184, "step": 1506 }, { "epoch": 5.956521739130435, "grad_norm": 4.013076305389404, "learning_rate": 2.029761904761905e-05, "loss": 0.3721, "step": 1507 }, { "epoch": 5.9604743083003955, "grad_norm": 4.0771284103393555, "learning_rate": 2.027777777777778e-05, "loss": 0.414, "step": 1508 }, { "epoch": 5.9644268774703555, "grad_norm": 4.353712558746338, "learning_rate": 2.025793650793651e-05, "loss": 0.4212, "step": 1509 }, { "epoch": 5.968379446640316, "grad_norm": 3.816298007965088, "learning_rate": 2.023809523809524e-05, "loss": 0.2885, "step": 1510 }, { "epoch": 5.972332015810276, "grad_norm": 5.043026447296143, "learning_rate": 2.021825396825397e-05, "loss": 0.4901, "step": 1511 }, { "epoch": 5.976284584980237, "grad_norm": 3.432596206665039, "learning_rate": 2.01984126984127e-05, "loss": 0.4023, "step": 1512 }, { "epoch": 5.980237154150197, "grad_norm": 5.160770893096924, "learning_rate": 2.017857142857143e-05, "loss": 0.435, "step": 1513 }, { "epoch": 5.984189723320158, "grad_norm": 3.863649845123291, "learning_rate": 2.015873015873016e-05, "loss": 0.2929, "step": 1514 }, { "epoch": 5.988142292490118, "grad_norm": 3.960110902786255, "learning_rate": 2.013888888888889e-05, "loss": 0.3358, "step": 1515 }, { "epoch": 5.992094861660079, "grad_norm": 3.597496747970581, "learning_rate": 2.011904761904762e-05, "loss": 0.3232, "step": 1516 }, { "epoch": 5.996047430830039, "grad_norm": 3.676575183868408, "learning_rate": 2.009920634920635e-05, "loss": 0.3356, "step": 1517 }, { "epoch": 6.0, "grad_norm": 4.146391868591309, "learning_rate": 2.007936507936508e-05, "loss": 0.3818, "step": 1518 }, { "epoch": 6.003952569169961, "grad_norm": 1.9677680730819702, "learning_rate": 2.005952380952381e-05, "loss": 0.1328, "step": 1519 }, { "epoch": 6.007905138339921, "grad_norm": 3.2990286350250244, "learning_rate": 2.003968253968254e-05, "loss": 0.1407, "step": 1520 }, { "epoch": 6.011857707509882, "grad_norm": 3.0897316932678223, "learning_rate": 2.001984126984127e-05, "loss": 0.23, "step": 1521 }, { "epoch": 6.015810276679842, "grad_norm": 3.0987026691436768, "learning_rate": 2e-05, "loss": 0.2357, "step": 1522 }, { "epoch": 6.019762845849803, "grad_norm": 2.3966803550720215, "learning_rate": 1.9980158730158732e-05, "loss": 0.1346, "step": 1523 }, { "epoch": 6.023715415019763, "grad_norm": 2.7668561935424805, "learning_rate": 1.9960317460317462e-05, "loss": 0.1102, "step": 1524 }, { "epoch": 6.027667984189724, "grad_norm": 3.623638868331909, "learning_rate": 1.9940476190476193e-05, "loss": 0.1318, "step": 1525 }, { "epoch": 6.031620553359684, "grad_norm": 3.124091148376465, "learning_rate": 1.992063492063492e-05, "loss": 0.2056, "step": 1526 }, { "epoch": 6.0355731225296445, "grad_norm": 3.1599087715148926, "learning_rate": 1.990079365079365e-05, "loss": 0.1375, "step": 1527 }, { "epoch": 6.0395256916996045, "grad_norm": 3.4084415435791016, "learning_rate": 1.988095238095238e-05, "loss": 0.1443, "step": 1528 }, { "epoch": 6.043478260869565, "grad_norm": 3.607503652572632, "learning_rate": 1.986111111111111e-05, "loss": 0.1846, "step": 1529 }, { "epoch": 6.047430830039525, "grad_norm": 3.6665408611297607, "learning_rate": 1.984126984126984e-05, "loss": 0.1801, "step": 1530 }, { "epoch": 6.051383399209486, "grad_norm": 3.5593857765197754, "learning_rate": 1.982142857142857e-05, "loss": 0.2119, "step": 1531 }, { "epoch": 6.055335968379446, "grad_norm": 2.55330228805542, "learning_rate": 1.9801587301587302e-05, "loss": 0.119, "step": 1532 }, { "epoch": 6.059288537549407, "grad_norm": 4.402235984802246, "learning_rate": 1.9781746031746032e-05, "loss": 0.2068, "step": 1533 }, { "epoch": 6.063241106719367, "grad_norm": 2.9708304405212402, "learning_rate": 1.9761904761904763e-05, "loss": 0.1736, "step": 1534 }, { "epoch": 6.067193675889328, "grad_norm": 4.265127182006836, "learning_rate": 1.9742063492063493e-05, "loss": 0.2059, "step": 1535 }, { "epoch": 6.071146245059288, "grad_norm": 3.346165895462036, "learning_rate": 1.9722222222222224e-05, "loss": 0.2927, "step": 1536 }, { "epoch": 6.075098814229249, "grad_norm": 3.0124001502990723, "learning_rate": 1.9702380952380954e-05, "loss": 0.1643, "step": 1537 }, { "epoch": 6.07905138339921, "grad_norm": 3.65307879447937, "learning_rate": 1.9682539682539684e-05, "loss": 0.1432, "step": 1538 }, { "epoch": 6.08300395256917, "grad_norm": 3.4866161346435547, "learning_rate": 1.9662698412698415e-05, "loss": 0.1515, "step": 1539 }, { "epoch": 6.086956521739131, "grad_norm": 3.2024996280670166, "learning_rate": 1.9642857142857145e-05, "loss": 0.1209, "step": 1540 }, { "epoch": 6.090909090909091, "grad_norm": 3.161350965499878, "learning_rate": 1.9623015873015872e-05, "loss": 0.1511, "step": 1541 }, { "epoch": 6.094861660079052, "grad_norm": 2.771225929260254, "learning_rate": 1.9603174603174602e-05, "loss": 0.1782, "step": 1542 }, { "epoch": 6.098814229249012, "grad_norm": 2.3539700508117676, "learning_rate": 1.9583333333333333e-05, "loss": 0.1533, "step": 1543 }, { "epoch": 6.102766798418973, "grad_norm": 3.1462249755859375, "learning_rate": 1.9563492063492063e-05, "loss": 0.1733, "step": 1544 }, { "epoch": 6.106719367588933, "grad_norm": 3.8993260860443115, "learning_rate": 1.9543650793650793e-05, "loss": 0.2576, "step": 1545 }, { "epoch": 6.1106719367588935, "grad_norm": 4.060632228851318, "learning_rate": 1.9523809523809524e-05, "loss": 0.2333, "step": 1546 }, { "epoch": 6.1146245059288535, "grad_norm": 3.7323641777038574, "learning_rate": 1.9503968253968254e-05, "loss": 0.256, "step": 1547 }, { "epoch": 6.118577075098814, "grad_norm": 2.8360302448272705, "learning_rate": 1.9484126984126985e-05, "loss": 0.1162, "step": 1548 }, { "epoch": 6.122529644268774, "grad_norm": 4.867652416229248, "learning_rate": 1.9464285714285715e-05, "loss": 0.2682, "step": 1549 }, { "epoch": 6.126482213438735, "grad_norm": 3.90091872215271, "learning_rate": 1.9444444444444445e-05, "loss": 0.2084, "step": 1550 }, { "epoch": 6.130434782608695, "grad_norm": 3.1003096103668213, "learning_rate": 1.9424603174603176e-05, "loss": 0.1783, "step": 1551 }, { "epoch": 6.134387351778656, "grad_norm": 4.112887859344482, "learning_rate": 1.9404761904761906e-05, "loss": 0.2177, "step": 1552 }, { "epoch": 6.138339920948616, "grad_norm": 2.7363293170928955, "learning_rate": 1.9384920634920637e-05, "loss": 0.1403, "step": 1553 }, { "epoch": 6.142292490118577, "grad_norm": 3.061474561691284, "learning_rate": 1.9365079365079367e-05, "loss": 0.1204, "step": 1554 }, { "epoch": 6.146245059288537, "grad_norm": 3.5960800647735596, "learning_rate": 1.9345238095238097e-05, "loss": 0.1865, "step": 1555 }, { "epoch": 6.150197628458498, "grad_norm": 3.1782121658325195, "learning_rate": 1.9325396825396828e-05, "loss": 0.1481, "step": 1556 }, { "epoch": 6.154150197628459, "grad_norm": 3.249864339828491, "learning_rate": 1.9305555555555558e-05, "loss": 0.2116, "step": 1557 }, { "epoch": 6.158102766798419, "grad_norm": 4.330471515655518, "learning_rate": 1.928571428571429e-05, "loss": 0.158, "step": 1558 }, { "epoch": 6.16205533596838, "grad_norm": 4.093515872955322, "learning_rate": 1.926587301587302e-05, "loss": 0.1871, "step": 1559 }, { "epoch": 6.16600790513834, "grad_norm": 3.096252918243408, "learning_rate": 1.924603174603175e-05, "loss": 0.1601, "step": 1560 }, { "epoch": 6.169960474308301, "grad_norm": 3.766211748123169, "learning_rate": 1.922619047619048e-05, "loss": 0.1666, "step": 1561 }, { "epoch": 6.173913043478261, "grad_norm": 3.036038637161255, "learning_rate": 1.920634920634921e-05, "loss": 0.1687, "step": 1562 }, { "epoch": 6.177865612648222, "grad_norm": 3.8249239921569824, "learning_rate": 1.918650793650794e-05, "loss": 0.1887, "step": 1563 }, { "epoch": 6.181818181818182, "grad_norm": 3.461127281188965, "learning_rate": 1.9166666666666667e-05, "loss": 0.1666, "step": 1564 }, { "epoch": 6.1857707509881426, "grad_norm": 3.262033462524414, "learning_rate": 1.9146825396825398e-05, "loss": 0.1546, "step": 1565 }, { "epoch": 6.189723320158103, "grad_norm": 3.1641769409179688, "learning_rate": 1.9126984126984128e-05, "loss": 0.1628, "step": 1566 }, { "epoch": 6.1936758893280635, "grad_norm": 2.8139045238494873, "learning_rate": 1.910714285714286e-05, "loss": 0.1658, "step": 1567 }, { "epoch": 6.1976284584980235, "grad_norm": 4.408447265625, "learning_rate": 1.9087301587301585e-05, "loss": 0.2102, "step": 1568 }, { "epoch": 6.201581027667984, "grad_norm": 2.9684667587280273, "learning_rate": 1.9067460317460316e-05, "loss": 0.2325, "step": 1569 }, { "epoch": 6.205533596837944, "grad_norm": 2.9627156257629395, "learning_rate": 1.9047619047619046e-05, "loss": 0.1779, "step": 1570 }, { "epoch": 6.209486166007905, "grad_norm": 3.4896349906921387, "learning_rate": 1.9027777777777776e-05, "loss": 0.1404, "step": 1571 }, { "epoch": 6.213438735177865, "grad_norm": 2.7135539054870605, "learning_rate": 1.9007936507936507e-05, "loss": 0.1057, "step": 1572 }, { "epoch": 6.217391304347826, "grad_norm": 3.230480909347534, "learning_rate": 1.8988095238095237e-05, "loss": 0.1485, "step": 1573 }, { "epoch": 6.221343873517786, "grad_norm": 2.9966936111450195, "learning_rate": 1.8968253968253968e-05, "loss": 0.1521, "step": 1574 }, { "epoch": 6.225296442687747, "grad_norm": 2.451078414916992, "learning_rate": 1.8948412698412698e-05, "loss": 0.1289, "step": 1575 }, { "epoch": 6.229249011857707, "grad_norm": 3.682922124862671, "learning_rate": 1.892857142857143e-05, "loss": 0.1696, "step": 1576 }, { "epoch": 6.233201581027668, "grad_norm": 4.325542449951172, "learning_rate": 1.890873015873016e-05, "loss": 0.2367, "step": 1577 }, { "epoch": 6.237154150197629, "grad_norm": 3.799766778945923, "learning_rate": 1.888888888888889e-05, "loss": 0.1636, "step": 1578 }, { "epoch": 6.241106719367589, "grad_norm": 2.6366379261016846, "learning_rate": 1.886904761904762e-05, "loss": 0.126, "step": 1579 }, { "epoch": 6.24505928853755, "grad_norm": 3.575343608856201, "learning_rate": 1.884920634920635e-05, "loss": 0.1621, "step": 1580 }, { "epoch": 6.24901185770751, "grad_norm": 4.663724422454834, "learning_rate": 1.882936507936508e-05, "loss": 0.2898, "step": 1581 }, { "epoch": 6.252964426877471, "grad_norm": 3.3895034790039062, "learning_rate": 1.880952380952381e-05, "loss": 0.2493, "step": 1582 }, { "epoch": 6.256916996047431, "grad_norm": 4.064142227172852, "learning_rate": 1.878968253968254e-05, "loss": 0.2136, "step": 1583 }, { "epoch": 6.260869565217392, "grad_norm": 4.431739330291748, "learning_rate": 1.876984126984127e-05, "loss": 0.168, "step": 1584 }, { "epoch": 6.264822134387352, "grad_norm": 3.109384536743164, "learning_rate": 1.8750000000000002e-05, "loss": 0.1606, "step": 1585 }, { "epoch": 6.2687747035573125, "grad_norm": 3.7452895641326904, "learning_rate": 1.8730158730158732e-05, "loss": 0.2287, "step": 1586 }, { "epoch": 6.2727272727272725, "grad_norm": 3.320481777191162, "learning_rate": 1.8710317460317462e-05, "loss": 0.1221, "step": 1587 }, { "epoch": 6.276679841897233, "grad_norm": 3.2335190773010254, "learning_rate": 1.8690476190476193e-05, "loss": 0.1522, "step": 1588 }, { "epoch": 6.280632411067193, "grad_norm": 2.431941509246826, "learning_rate": 1.8670634920634923e-05, "loss": 0.1787, "step": 1589 }, { "epoch": 6.284584980237154, "grad_norm": 2.7418622970581055, "learning_rate": 1.8650793650793654e-05, "loss": 0.1862, "step": 1590 }, { "epoch": 6.288537549407114, "grad_norm": 3.3152060508728027, "learning_rate": 1.863095238095238e-05, "loss": 0.1879, "step": 1591 }, { "epoch": 6.292490118577075, "grad_norm": 2.946201801300049, "learning_rate": 1.861111111111111e-05, "loss": 0.1429, "step": 1592 }, { "epoch": 6.296442687747035, "grad_norm": 4.4527587890625, "learning_rate": 1.859126984126984e-05, "loss": 0.1634, "step": 1593 }, { "epoch": 6.300395256916996, "grad_norm": 2.763129711151123, "learning_rate": 1.8571428571428572e-05, "loss": 0.1338, "step": 1594 }, { "epoch": 6.304347826086957, "grad_norm": 3.8917641639709473, "learning_rate": 1.8551587301587302e-05, "loss": 0.1526, "step": 1595 }, { "epoch": 6.308300395256917, "grad_norm": 3.10577130317688, "learning_rate": 1.8531746031746032e-05, "loss": 0.1993, "step": 1596 }, { "epoch": 6.312252964426877, "grad_norm": 4.823581695556641, "learning_rate": 1.8511904761904763e-05, "loss": 0.1888, "step": 1597 }, { "epoch": 6.316205533596838, "grad_norm": 3.8852086067199707, "learning_rate": 1.8492063492063493e-05, "loss": 0.1369, "step": 1598 }, { "epoch": 6.320158102766799, "grad_norm": 3.8083715438842773, "learning_rate": 1.8472222222222224e-05, "loss": 0.1808, "step": 1599 }, { "epoch": 6.324110671936759, "grad_norm": 3.3586747646331787, "learning_rate": 1.8452380952380954e-05, "loss": 0.1479, "step": 1600 }, { "epoch": 6.32806324110672, "grad_norm": 2.599363088607788, "learning_rate": 1.8432539682539684e-05, "loss": 0.1342, "step": 1601 }, { "epoch": 6.33201581027668, "grad_norm": 3.6683595180511475, "learning_rate": 1.8412698412698415e-05, "loss": 0.1694, "step": 1602 }, { "epoch": 6.335968379446641, "grad_norm": 2.9072253704071045, "learning_rate": 1.8392857142857145e-05, "loss": 0.1598, "step": 1603 }, { "epoch": 6.339920948616601, "grad_norm": 4.043822288513184, "learning_rate": 1.8373015873015875e-05, "loss": 0.2375, "step": 1604 }, { "epoch": 6.3438735177865615, "grad_norm": 4.241188049316406, "learning_rate": 1.8353174603174602e-05, "loss": 0.2391, "step": 1605 }, { "epoch": 6.3478260869565215, "grad_norm": 3.922586441040039, "learning_rate": 1.8333333333333333e-05, "loss": 0.2149, "step": 1606 }, { "epoch": 6.351778656126482, "grad_norm": 4.914572715759277, "learning_rate": 1.8313492063492063e-05, "loss": 0.2859, "step": 1607 }, { "epoch": 6.355731225296442, "grad_norm": 3.6631100177764893, "learning_rate": 1.8293650793650794e-05, "loss": 0.2134, "step": 1608 }, { "epoch": 6.359683794466403, "grad_norm": 2.8764913082122803, "learning_rate": 1.8273809523809524e-05, "loss": 0.1453, "step": 1609 }, { "epoch": 6.363636363636363, "grad_norm": 2.821441173553467, "learning_rate": 1.8253968253968254e-05, "loss": 0.1457, "step": 1610 }, { "epoch": 6.367588932806324, "grad_norm": 2.9946677684783936, "learning_rate": 1.8234126984126985e-05, "loss": 0.144, "step": 1611 }, { "epoch": 6.371541501976284, "grad_norm": 2.6403133869171143, "learning_rate": 1.8214285714285715e-05, "loss": 0.1484, "step": 1612 }, { "epoch": 6.375494071146245, "grad_norm": 2.8457889556884766, "learning_rate": 1.8194444444444445e-05, "loss": 0.1816, "step": 1613 }, { "epoch": 6.379446640316205, "grad_norm": 2.93542742729187, "learning_rate": 1.8174603174603176e-05, "loss": 0.1347, "step": 1614 }, { "epoch": 6.383399209486166, "grad_norm": 2.890878915786743, "learning_rate": 1.8154761904761906e-05, "loss": 0.1245, "step": 1615 }, { "epoch": 6.387351778656127, "grad_norm": 2.7771224975585938, "learning_rate": 1.8134920634920637e-05, "loss": 0.1573, "step": 1616 }, { "epoch": 6.391304347826087, "grad_norm": 3.317066192626953, "learning_rate": 1.8115079365079367e-05, "loss": 0.1826, "step": 1617 }, { "epoch": 6.395256916996048, "grad_norm": 3.2389166355133057, "learning_rate": 1.8095238095238094e-05, "loss": 0.1539, "step": 1618 }, { "epoch": 6.399209486166008, "grad_norm": 3.585541009902954, "learning_rate": 1.8075396825396824e-05, "loss": 0.1731, "step": 1619 }, { "epoch": 6.403162055335969, "grad_norm": 3.805875778198242, "learning_rate": 1.8055555555555555e-05, "loss": 0.2165, "step": 1620 }, { "epoch": 6.407114624505929, "grad_norm": 4.0997090339660645, "learning_rate": 1.8035714285714285e-05, "loss": 0.178, "step": 1621 }, { "epoch": 6.41106719367589, "grad_norm": 3.204512596130371, "learning_rate": 1.8015873015873015e-05, "loss": 0.1188, "step": 1622 }, { "epoch": 6.41501976284585, "grad_norm": 3.9086551666259766, "learning_rate": 1.7996031746031746e-05, "loss": 0.1424, "step": 1623 }, { "epoch": 6.4189723320158105, "grad_norm": 3.208350658416748, "learning_rate": 1.7976190476190476e-05, "loss": 0.1119, "step": 1624 }, { "epoch": 6.4229249011857705, "grad_norm": 4.048145771026611, "learning_rate": 1.7956349206349207e-05, "loss": 0.2034, "step": 1625 }, { "epoch": 6.426877470355731, "grad_norm": 3.807363986968994, "learning_rate": 1.7936507936507937e-05, "loss": 0.1894, "step": 1626 }, { "epoch": 6.430830039525691, "grad_norm": 3.468174457550049, "learning_rate": 1.7916666666666667e-05, "loss": 0.1755, "step": 1627 }, { "epoch": 6.434782608695652, "grad_norm": 3.572981595993042, "learning_rate": 1.7896825396825398e-05, "loss": 0.2043, "step": 1628 }, { "epoch": 6.438735177865612, "grad_norm": 4.217936992645264, "learning_rate": 1.7876984126984128e-05, "loss": 0.3312, "step": 1629 }, { "epoch": 6.442687747035573, "grad_norm": 3.2552435398101807, "learning_rate": 1.785714285714286e-05, "loss": 0.1874, "step": 1630 }, { "epoch": 6.446640316205533, "grad_norm": 3.2607510089874268, "learning_rate": 1.783730158730159e-05, "loss": 0.1648, "step": 1631 }, { "epoch": 6.450592885375494, "grad_norm": 4.080394268035889, "learning_rate": 1.781746031746032e-05, "loss": 0.1779, "step": 1632 }, { "epoch": 6.454545454545454, "grad_norm": 3.7420568466186523, "learning_rate": 1.779761904761905e-05, "loss": 0.161, "step": 1633 }, { "epoch": 6.458498023715415, "grad_norm": 3.199740171432495, "learning_rate": 1.777777777777778e-05, "loss": 0.1783, "step": 1634 }, { "epoch": 6.462450592885375, "grad_norm": 3.5963351726531982, "learning_rate": 1.775793650793651e-05, "loss": 0.1577, "step": 1635 }, { "epoch": 6.466403162055336, "grad_norm": 2.5002620220184326, "learning_rate": 1.773809523809524e-05, "loss": 0.1206, "step": 1636 }, { "epoch": 6.470355731225297, "grad_norm": 3.222224235534668, "learning_rate": 1.771825396825397e-05, "loss": 0.1532, "step": 1637 }, { "epoch": 6.474308300395257, "grad_norm": 2.948617935180664, "learning_rate": 1.76984126984127e-05, "loss": 0.1807, "step": 1638 }, { "epoch": 6.478260869565218, "grad_norm": 3.9954729080200195, "learning_rate": 1.7678571428571432e-05, "loss": 0.1606, "step": 1639 }, { "epoch": 6.482213438735178, "grad_norm": 3.8689181804656982, "learning_rate": 1.7658730158730162e-05, "loss": 0.188, "step": 1640 }, { "epoch": 6.486166007905139, "grad_norm": 2.888376235961914, "learning_rate": 1.763888888888889e-05, "loss": 0.1553, "step": 1641 }, { "epoch": 6.490118577075099, "grad_norm": 3.6249587535858154, "learning_rate": 1.761904761904762e-05, "loss": 0.178, "step": 1642 }, { "epoch": 6.4940711462450595, "grad_norm": 2.787454843521118, "learning_rate": 1.759920634920635e-05, "loss": 0.129, "step": 1643 }, { "epoch": 6.4980237154150196, "grad_norm": 4.260427474975586, "learning_rate": 1.757936507936508e-05, "loss": 0.2073, "step": 1644 }, { "epoch": 6.5019762845849804, "grad_norm": 4.095132827758789, "learning_rate": 1.755952380952381e-05, "loss": 0.1609, "step": 1645 }, { "epoch": 6.5059288537549405, "grad_norm": 3.89748477935791, "learning_rate": 1.7539682539682538e-05, "loss": 0.2223, "step": 1646 }, { "epoch": 6.509881422924901, "grad_norm": 3.197842597961426, "learning_rate": 1.7519841269841268e-05, "loss": 0.1804, "step": 1647 }, { "epoch": 6.513833992094861, "grad_norm": 3.7351438999176025, "learning_rate": 1.75e-05, "loss": 0.1659, "step": 1648 }, { "epoch": 6.517786561264822, "grad_norm": 3.417168378829956, "learning_rate": 1.748015873015873e-05, "loss": 0.1493, "step": 1649 }, { "epoch": 6.521739130434782, "grad_norm": 2.904148578643799, "learning_rate": 1.746031746031746e-05, "loss": 0.1689, "step": 1650 }, { "epoch": 6.525691699604743, "grad_norm": 3.141493558883667, "learning_rate": 1.744047619047619e-05, "loss": 0.1433, "step": 1651 }, { "epoch": 6.529644268774703, "grad_norm": 3.594679832458496, "learning_rate": 1.742063492063492e-05, "loss": 0.2365, "step": 1652 }, { "epoch": 6.533596837944664, "grad_norm": 3.7037243843078613, "learning_rate": 1.740079365079365e-05, "loss": 0.2317, "step": 1653 }, { "epoch": 6.537549407114625, "grad_norm": 3.6561882495880127, "learning_rate": 1.738095238095238e-05, "loss": 0.1495, "step": 1654 }, { "epoch": 6.541501976284585, "grad_norm": 3.278259754180908, "learning_rate": 1.736111111111111e-05, "loss": 0.1314, "step": 1655 }, { "epoch": 6.545454545454545, "grad_norm": 3.1691975593566895, "learning_rate": 1.734126984126984e-05, "loss": 0.1566, "step": 1656 }, { "epoch": 6.549407114624506, "grad_norm": 3.14813494682312, "learning_rate": 1.7321428571428572e-05, "loss": 0.1585, "step": 1657 }, { "epoch": 6.553359683794467, "grad_norm": 3.2934324741363525, "learning_rate": 1.7301587301587302e-05, "loss": 0.1568, "step": 1658 }, { "epoch": 6.557312252964427, "grad_norm": 4.18383264541626, "learning_rate": 1.7281746031746033e-05, "loss": 0.3083, "step": 1659 }, { "epoch": 6.561264822134388, "grad_norm": 4.086765766143799, "learning_rate": 1.7261904761904763e-05, "loss": 0.1687, "step": 1660 }, { "epoch": 6.565217391304348, "grad_norm": 3.70186185836792, "learning_rate": 1.7242063492063493e-05, "loss": 0.1741, "step": 1661 }, { "epoch": 6.569169960474309, "grad_norm": 3.012298583984375, "learning_rate": 1.7222222222222224e-05, "loss": 0.1249, "step": 1662 }, { "epoch": 6.573122529644269, "grad_norm": 5.418375492095947, "learning_rate": 1.7202380952380954e-05, "loss": 0.328, "step": 1663 }, { "epoch": 6.5770750988142295, "grad_norm": 2.400665521621704, "learning_rate": 1.7182539682539684e-05, "loss": 0.1125, "step": 1664 }, { "epoch": 6.5810276679841895, "grad_norm": 3.7247157096862793, "learning_rate": 1.7162698412698415e-05, "loss": 0.1577, "step": 1665 }, { "epoch": 6.58498023715415, "grad_norm": 2.6763904094696045, "learning_rate": 1.7142857142857145e-05, "loss": 0.1269, "step": 1666 }, { "epoch": 6.58893280632411, "grad_norm": 2.851741313934326, "learning_rate": 1.7123015873015876e-05, "loss": 0.141, "step": 1667 }, { "epoch": 6.592885375494071, "grad_norm": 3.190314769744873, "learning_rate": 1.7103174603174606e-05, "loss": 0.2027, "step": 1668 }, { "epoch": 6.596837944664031, "grad_norm": 3.6213324069976807, "learning_rate": 1.7083333333333333e-05, "loss": 0.2126, "step": 1669 }, { "epoch": 6.600790513833992, "grad_norm": 3.6748242378234863, "learning_rate": 1.7063492063492063e-05, "loss": 0.1946, "step": 1670 }, { "epoch": 6.604743083003952, "grad_norm": 2.7625160217285156, "learning_rate": 1.7043650793650794e-05, "loss": 0.1886, "step": 1671 }, { "epoch": 6.608695652173913, "grad_norm": 3.2063212394714355, "learning_rate": 1.7023809523809524e-05, "loss": 0.1705, "step": 1672 }, { "epoch": 6.612648221343873, "grad_norm": 3.1696741580963135, "learning_rate": 1.7003968253968254e-05, "loss": 0.1476, "step": 1673 }, { "epoch": 6.616600790513834, "grad_norm": 2.514961004257202, "learning_rate": 1.6984126984126985e-05, "loss": 0.1324, "step": 1674 }, { "epoch": 6.620553359683795, "grad_norm": 3.8006770610809326, "learning_rate": 1.6964285714285715e-05, "loss": 0.2031, "step": 1675 }, { "epoch": 6.624505928853755, "grad_norm": 2.094867706298828, "learning_rate": 1.6944444444444446e-05, "loss": 0.0909, "step": 1676 }, { "epoch": 6.628458498023716, "grad_norm": 5.729726314544678, "learning_rate": 1.6924603174603176e-05, "loss": 0.2006, "step": 1677 }, { "epoch": 6.632411067193676, "grad_norm": 4.788626194000244, "learning_rate": 1.6904761904761906e-05, "loss": 0.188, "step": 1678 }, { "epoch": 6.636363636363637, "grad_norm": 3.7394933700561523, "learning_rate": 1.6884920634920637e-05, "loss": 0.1474, "step": 1679 }, { "epoch": 6.640316205533597, "grad_norm": 3.15619158744812, "learning_rate": 1.6865079365079367e-05, "loss": 0.1226, "step": 1680 }, { "epoch": 6.644268774703558, "grad_norm": 3.1773312091827393, "learning_rate": 1.6845238095238097e-05, "loss": 0.1683, "step": 1681 }, { "epoch": 6.648221343873518, "grad_norm": 3.101545572280884, "learning_rate": 1.6825396825396828e-05, "loss": 0.1786, "step": 1682 }, { "epoch": 6.6521739130434785, "grad_norm": 2.9866223335266113, "learning_rate": 1.6805555555555558e-05, "loss": 0.1385, "step": 1683 }, { "epoch": 6.6561264822134385, "grad_norm": 2.350311517715454, "learning_rate": 1.6785714285714285e-05, "loss": 0.1332, "step": 1684 }, { "epoch": 6.660079051383399, "grad_norm": 3.1310527324676514, "learning_rate": 1.6765873015873016e-05, "loss": 0.1336, "step": 1685 }, { "epoch": 6.664031620553359, "grad_norm": 3.0900039672851562, "learning_rate": 1.6746031746031746e-05, "loss": 0.1304, "step": 1686 }, { "epoch": 6.66798418972332, "grad_norm": 3.2894325256347656, "learning_rate": 1.6726190476190476e-05, "loss": 0.1869, "step": 1687 }, { "epoch": 6.67193675889328, "grad_norm": 3.1792759895324707, "learning_rate": 1.6706349206349207e-05, "loss": 0.145, "step": 1688 }, { "epoch": 6.675889328063241, "grad_norm": 2.9144246578216553, "learning_rate": 1.6686507936507937e-05, "loss": 0.1617, "step": 1689 }, { "epoch": 6.679841897233201, "grad_norm": 3.1634509563446045, "learning_rate": 1.6666666666666667e-05, "loss": 0.159, "step": 1690 }, { "epoch": 6.683794466403162, "grad_norm": 3.328819990158081, "learning_rate": 1.6646825396825398e-05, "loss": 0.2096, "step": 1691 }, { "epoch": 6.687747035573123, "grad_norm": 3.085995674133301, "learning_rate": 1.6626984126984128e-05, "loss": 0.1649, "step": 1692 }, { "epoch": 6.691699604743083, "grad_norm": 2.8405370712280273, "learning_rate": 1.660714285714286e-05, "loss": 0.1666, "step": 1693 }, { "epoch": 6.695652173913043, "grad_norm": 3.161557912826538, "learning_rate": 1.658730158730159e-05, "loss": 0.1811, "step": 1694 }, { "epoch": 6.699604743083004, "grad_norm": 4.0587158203125, "learning_rate": 1.656746031746032e-05, "loss": 0.2273, "step": 1695 }, { "epoch": 6.703557312252965, "grad_norm": 3.1666932106018066, "learning_rate": 1.6547619047619046e-05, "loss": 0.2019, "step": 1696 }, { "epoch": 6.707509881422925, "grad_norm": 4.08314847946167, "learning_rate": 1.6527777777777777e-05, "loss": 0.2561, "step": 1697 }, { "epoch": 6.711462450592886, "grad_norm": 3.302401065826416, "learning_rate": 1.6507936507936507e-05, "loss": 0.1543, "step": 1698 }, { "epoch": 6.715415019762846, "grad_norm": 3.8679957389831543, "learning_rate": 1.6488095238095237e-05, "loss": 0.2805, "step": 1699 }, { "epoch": 6.719367588932807, "grad_norm": 2.699409008026123, "learning_rate": 1.6468253968253968e-05, "loss": 0.1732, "step": 1700 }, { "epoch": 6.723320158102767, "grad_norm": 4.578848838806152, "learning_rate": 1.6448412698412698e-05, "loss": 0.1573, "step": 1701 }, { "epoch": 6.7272727272727275, "grad_norm": 4.305120468139648, "learning_rate": 1.642857142857143e-05, "loss": 0.1459, "step": 1702 }, { "epoch": 6.7312252964426875, "grad_norm": 2.981783628463745, "learning_rate": 1.640873015873016e-05, "loss": 0.1605, "step": 1703 }, { "epoch": 6.735177865612648, "grad_norm": 3.4200551509857178, "learning_rate": 1.638888888888889e-05, "loss": 0.1136, "step": 1704 }, { "epoch": 6.739130434782608, "grad_norm": 3.0042500495910645, "learning_rate": 1.636904761904762e-05, "loss": 0.1645, "step": 1705 }, { "epoch": 6.743083003952569, "grad_norm": 5.0387043952941895, "learning_rate": 1.634920634920635e-05, "loss": 0.3057, "step": 1706 }, { "epoch": 6.747035573122529, "grad_norm": 3.7466354370117188, "learning_rate": 1.632936507936508e-05, "loss": 0.1289, "step": 1707 }, { "epoch": 6.75098814229249, "grad_norm": 2.57358980178833, "learning_rate": 1.630952380952381e-05, "loss": 0.1363, "step": 1708 }, { "epoch": 6.75494071146245, "grad_norm": 3.13360333442688, "learning_rate": 1.628968253968254e-05, "loss": 0.1979, "step": 1709 }, { "epoch": 6.758893280632411, "grad_norm": 3.5185632705688477, "learning_rate": 1.626984126984127e-05, "loss": 0.1732, "step": 1710 }, { "epoch": 6.762845849802371, "grad_norm": 4.430140018463135, "learning_rate": 1.6250000000000002e-05, "loss": 0.1609, "step": 1711 }, { "epoch": 6.766798418972332, "grad_norm": 3.1439976692199707, "learning_rate": 1.6230158730158732e-05, "loss": 0.1765, "step": 1712 }, { "epoch": 6.770750988142293, "grad_norm": 2.766545534133911, "learning_rate": 1.6210317460317463e-05, "loss": 0.173, "step": 1713 }, { "epoch": 6.774703557312253, "grad_norm": 3.4417989253997803, "learning_rate": 1.6190476190476193e-05, "loss": 0.1748, "step": 1714 }, { "epoch": 6.778656126482213, "grad_norm": 3.091768503189087, "learning_rate": 1.6170634920634923e-05, "loss": 0.1947, "step": 1715 }, { "epoch": 6.782608695652174, "grad_norm": 2.7375543117523193, "learning_rate": 1.6150793650793654e-05, "loss": 0.1786, "step": 1716 }, { "epoch": 6.786561264822135, "grad_norm": 3.463697671890259, "learning_rate": 1.6130952380952384e-05, "loss": 0.2595, "step": 1717 }, { "epoch": 6.790513833992095, "grad_norm": 3.0348737239837646, "learning_rate": 1.6111111111111115e-05, "loss": 0.123, "step": 1718 }, { "epoch": 6.794466403162056, "grad_norm": 3.646333694458008, "learning_rate": 1.609126984126984e-05, "loss": 0.1529, "step": 1719 }, { "epoch": 6.798418972332016, "grad_norm": 4.054477691650391, "learning_rate": 1.6071428571428572e-05, "loss": 0.1475, "step": 1720 }, { "epoch": 6.8023715415019765, "grad_norm": 3.619706869125366, "learning_rate": 1.6051587301587302e-05, "loss": 0.1682, "step": 1721 }, { "epoch": 6.8063241106719365, "grad_norm": 3.0783016681671143, "learning_rate": 1.6031746031746033e-05, "loss": 0.1772, "step": 1722 }, { "epoch": 6.810276679841897, "grad_norm": 2.9383182525634766, "learning_rate": 1.601190476190476e-05, "loss": 0.149, "step": 1723 }, { "epoch": 6.8142292490118574, "grad_norm": 3.6608943939208984, "learning_rate": 1.599206349206349e-05, "loss": 0.152, "step": 1724 }, { "epoch": 6.818181818181818, "grad_norm": 3.396385908126831, "learning_rate": 1.597222222222222e-05, "loss": 0.1656, "step": 1725 }, { "epoch": 6.822134387351778, "grad_norm": 2.923689126968384, "learning_rate": 1.595238095238095e-05, "loss": 0.1529, "step": 1726 }, { "epoch": 6.826086956521739, "grad_norm": 4.30761194229126, "learning_rate": 1.593253968253968e-05, "loss": 0.3032, "step": 1727 }, { "epoch": 6.830039525691699, "grad_norm": 3.8314170837402344, "learning_rate": 1.591269841269841e-05, "loss": 0.1874, "step": 1728 }, { "epoch": 6.83399209486166, "grad_norm": 2.5451226234436035, "learning_rate": 1.5892857142857142e-05, "loss": 0.1155, "step": 1729 }, { "epoch": 6.837944664031621, "grad_norm": 3.4286904335021973, "learning_rate": 1.5873015873015872e-05, "loss": 0.2253, "step": 1730 }, { "epoch": 6.841897233201581, "grad_norm": 3.8259549140930176, "learning_rate": 1.5853174603174603e-05, "loss": 0.2408, "step": 1731 }, { "epoch": 6.845849802371541, "grad_norm": 3.6046714782714844, "learning_rate": 1.5833333333333333e-05, "loss": 0.1758, "step": 1732 }, { "epoch": 6.849802371541502, "grad_norm": 3.0990259647369385, "learning_rate": 1.5813492063492063e-05, "loss": 0.1622, "step": 1733 }, { "epoch": 6.853754940711463, "grad_norm": 2.931340217590332, "learning_rate": 1.5793650793650794e-05, "loss": 0.1598, "step": 1734 }, { "epoch": 6.857707509881423, "grad_norm": 2.4774787425994873, "learning_rate": 1.5773809523809524e-05, "loss": 0.141, "step": 1735 }, { "epoch": 6.861660079051384, "grad_norm": 2.9939541816711426, "learning_rate": 1.5753968253968255e-05, "loss": 0.135, "step": 1736 }, { "epoch": 6.865612648221344, "grad_norm": 2.929865598678589, "learning_rate": 1.5734126984126985e-05, "loss": 0.1947, "step": 1737 }, { "epoch": 6.869565217391305, "grad_norm": 3.4848814010620117, "learning_rate": 1.5714285714285715e-05, "loss": 0.1808, "step": 1738 }, { "epoch": 6.873517786561265, "grad_norm": 2.8920042514801025, "learning_rate": 1.5694444444444446e-05, "loss": 0.102, "step": 1739 }, { "epoch": 6.877470355731226, "grad_norm": 4.020946979522705, "learning_rate": 1.5674603174603176e-05, "loss": 0.1483, "step": 1740 }, { "epoch": 6.881422924901186, "grad_norm": 3.165574073791504, "learning_rate": 1.5654761904761906e-05, "loss": 0.1437, "step": 1741 }, { "epoch": 6.8853754940711465, "grad_norm": 3.595200538635254, "learning_rate": 1.5634920634920637e-05, "loss": 0.1561, "step": 1742 }, { "epoch": 6.8893280632411065, "grad_norm": 2.952357530593872, "learning_rate": 1.5615079365079367e-05, "loss": 0.1786, "step": 1743 }, { "epoch": 6.893280632411067, "grad_norm": 3.0202043056488037, "learning_rate": 1.5595238095238098e-05, "loss": 0.1493, "step": 1744 }, { "epoch": 6.897233201581027, "grad_norm": 3.46177339553833, "learning_rate": 1.5575396825396828e-05, "loss": 0.1578, "step": 1745 }, { "epoch": 6.901185770750988, "grad_norm": 4.415173053741455, "learning_rate": 1.5555555555555555e-05, "loss": 0.205, "step": 1746 }, { "epoch": 6.905138339920948, "grad_norm": 3.0862183570861816, "learning_rate": 1.5535714285714285e-05, "loss": 0.1837, "step": 1747 }, { "epoch": 6.909090909090909, "grad_norm": 3.788036584854126, "learning_rate": 1.5515873015873016e-05, "loss": 0.1423, "step": 1748 }, { "epoch": 6.913043478260869, "grad_norm": 3.3133058547973633, "learning_rate": 1.5496031746031746e-05, "loss": 0.2186, "step": 1749 }, { "epoch": 6.91699604743083, "grad_norm": 4.402420520782471, "learning_rate": 1.5476190476190476e-05, "loss": 0.1938, "step": 1750 }, { "epoch": 6.920948616600791, "grad_norm": 3.363860607147217, "learning_rate": 1.5456349206349207e-05, "loss": 0.1936, "step": 1751 }, { "epoch": 6.924901185770751, "grad_norm": 3.4065957069396973, "learning_rate": 1.5436507936507937e-05, "loss": 0.174, "step": 1752 }, { "epoch": 6.928853754940711, "grad_norm": 2.393531322479248, "learning_rate": 1.5416666666666668e-05, "loss": 0.1511, "step": 1753 }, { "epoch": 6.932806324110672, "grad_norm": 3.180001974105835, "learning_rate": 1.5396825396825398e-05, "loss": 0.1551, "step": 1754 }, { "epoch": 6.936758893280633, "grad_norm": 2.9649102687835693, "learning_rate": 1.537698412698413e-05, "loss": 0.1588, "step": 1755 }, { "epoch": 6.940711462450593, "grad_norm": 3.6014418601989746, "learning_rate": 1.535714285714286e-05, "loss": 0.3112, "step": 1756 }, { "epoch": 6.944664031620554, "grad_norm": 3.6775028705596924, "learning_rate": 1.533730158730159e-05, "loss": 0.2638, "step": 1757 }, { "epoch": 6.948616600790514, "grad_norm": 2.682131052017212, "learning_rate": 1.531746031746032e-05, "loss": 0.1377, "step": 1758 }, { "epoch": 6.952569169960475, "grad_norm": 3.296804904937744, "learning_rate": 1.529761904761905e-05, "loss": 0.1325, "step": 1759 }, { "epoch": 6.956521739130435, "grad_norm": 3.019824743270874, "learning_rate": 1.527777777777778e-05, "loss": 0.1358, "step": 1760 }, { "epoch": 6.9604743083003955, "grad_norm": 3.4432945251464844, "learning_rate": 1.525793650793651e-05, "loss": 0.1674, "step": 1761 }, { "epoch": 6.9644268774703555, "grad_norm": 4.434787273406982, "learning_rate": 1.5238095238095241e-05, "loss": 0.2198, "step": 1762 }, { "epoch": 6.968379446640316, "grad_norm": 2.954920530319214, "learning_rate": 1.5218253968253968e-05, "loss": 0.1315, "step": 1763 }, { "epoch": 6.972332015810276, "grad_norm": 3.6730740070343018, "learning_rate": 1.5198412698412698e-05, "loss": 0.1586, "step": 1764 }, { "epoch": 6.976284584980237, "grad_norm": 4.036146640777588, "learning_rate": 1.5178571428571429e-05, "loss": 0.2249, "step": 1765 }, { "epoch": 6.980237154150197, "grad_norm": 3.2338225841522217, "learning_rate": 1.5158730158730159e-05, "loss": 0.138, "step": 1766 }, { "epoch": 6.984189723320158, "grad_norm": 3.692736864089966, "learning_rate": 1.5138888888888888e-05, "loss": 0.1937, "step": 1767 }, { "epoch": 6.988142292490118, "grad_norm": 2.946046829223633, "learning_rate": 1.5119047619047618e-05, "loss": 0.1506, "step": 1768 }, { "epoch": 6.992094861660079, "grad_norm": 4.145671367645264, "learning_rate": 1.5099206349206349e-05, "loss": 0.1591, "step": 1769 }, { "epoch": 6.996047430830039, "grad_norm": 3.8622443675994873, "learning_rate": 1.5079365079365079e-05, "loss": 0.2398, "step": 1770 }, { "epoch": 7.0, "grad_norm": 3.0714492797851562, "learning_rate": 1.505952380952381e-05, "loss": 0.1849, "step": 1771 }, { "epoch": 7.003952569169961, "grad_norm": 1.683645248413086, "learning_rate": 1.503968253968254e-05, "loss": 0.0714, "step": 1772 }, { "epoch": 7.007905138339921, "grad_norm": 2.0551791191101074, "learning_rate": 1.501984126984127e-05, "loss": 0.0825, "step": 1773 }, { "epoch": 7.011857707509882, "grad_norm": 2.248523712158203, "learning_rate": 1.5e-05, "loss": 0.0954, "step": 1774 }, { "epoch": 7.015810276679842, "grad_norm": 2.069105863571167, "learning_rate": 1.498015873015873e-05, "loss": 0.1108, "step": 1775 }, { "epoch": 7.019762845849803, "grad_norm": 2.3756532669067383, "learning_rate": 1.4960317460317461e-05, "loss": 0.1123, "step": 1776 }, { "epoch": 7.023715415019763, "grad_norm": 2.0308547019958496, "learning_rate": 1.4940476190476192e-05, "loss": 0.0715, "step": 1777 }, { "epoch": 7.027667984189724, "grad_norm": 2.602980613708496, "learning_rate": 1.4920634920634922e-05, "loss": 0.1107, "step": 1778 }, { "epoch": 7.031620553359684, "grad_norm": 1.8657187223434448, "learning_rate": 1.490079365079365e-05, "loss": 0.0681, "step": 1779 }, { "epoch": 7.0355731225296445, "grad_norm": 1.9880305528640747, "learning_rate": 1.4880952380952381e-05, "loss": 0.0644, "step": 1780 }, { "epoch": 7.0395256916996045, "grad_norm": 1.8279154300689697, "learning_rate": 1.4861111111111111e-05, "loss": 0.0662, "step": 1781 }, { "epoch": 7.043478260869565, "grad_norm": 2.1131718158721924, "learning_rate": 1.4841269841269842e-05, "loss": 0.0787, "step": 1782 }, { "epoch": 7.047430830039525, "grad_norm": 2.5316834449768066, "learning_rate": 1.4821428571428572e-05, "loss": 0.1016, "step": 1783 }, { "epoch": 7.051383399209486, "grad_norm": 2.494107961654663, "learning_rate": 1.4801587301587302e-05, "loss": 0.0835, "step": 1784 }, { "epoch": 7.055335968379446, "grad_norm": 2.633171796798706, "learning_rate": 1.4781746031746033e-05, "loss": 0.0724, "step": 1785 }, { "epoch": 7.059288537549407, "grad_norm": 1.709030270576477, "learning_rate": 1.4761904761904763e-05, "loss": 0.0618, "step": 1786 }, { "epoch": 7.063241106719367, "grad_norm": 2.1349966526031494, "learning_rate": 1.4742063492063494e-05, "loss": 0.0763, "step": 1787 }, { "epoch": 7.067193675889328, "grad_norm": 3.602543354034424, "learning_rate": 1.4722222222222224e-05, "loss": 0.1131, "step": 1788 }, { "epoch": 7.071146245059288, "grad_norm": 3.5014562606811523, "learning_rate": 1.4702380952380954e-05, "loss": 0.1475, "step": 1789 }, { "epoch": 7.075098814229249, "grad_norm": 1.7112377882003784, "learning_rate": 1.4682539682539683e-05, "loss": 0.0655, "step": 1790 }, { "epoch": 7.07905138339921, "grad_norm": 2.5485458374023438, "learning_rate": 1.4662698412698413e-05, "loss": 0.0878, "step": 1791 }, { "epoch": 7.08300395256917, "grad_norm": 2.0620596408843994, "learning_rate": 1.4642857142857144e-05, "loss": 0.0801, "step": 1792 }, { "epoch": 7.086956521739131, "grad_norm": 2.537457227706909, "learning_rate": 1.4623015873015874e-05, "loss": 0.0698, "step": 1793 }, { "epoch": 7.090909090909091, "grad_norm": 1.8265228271484375, "learning_rate": 1.4603174603174605e-05, "loss": 0.0559, "step": 1794 }, { "epoch": 7.094861660079052, "grad_norm": 1.819351315498352, "learning_rate": 1.4583333333333335e-05, "loss": 0.0665, "step": 1795 }, { "epoch": 7.098814229249012, "grad_norm": 2.4396214485168457, "learning_rate": 1.4563492063492065e-05, "loss": 0.0786, "step": 1796 }, { "epoch": 7.102766798418973, "grad_norm": 2.4278268814086914, "learning_rate": 1.4543650793650796e-05, "loss": 0.0957, "step": 1797 }, { "epoch": 7.106719367588933, "grad_norm": 2.5539965629577637, "learning_rate": 1.4523809523809526e-05, "loss": 0.0876, "step": 1798 }, { "epoch": 7.1106719367588935, "grad_norm": 2.3676304817199707, "learning_rate": 1.4503968253968256e-05, "loss": 0.0801, "step": 1799 }, { "epoch": 7.1146245059288535, "grad_norm": 2.803798198699951, "learning_rate": 1.4484126984126987e-05, "loss": 0.1065, "step": 1800 }, { "epoch": 7.118577075098814, "grad_norm": 2.0833263397216797, "learning_rate": 1.4464285714285717e-05, "loss": 0.0701, "step": 1801 }, { "epoch": 7.122529644268774, "grad_norm": 1.9213995933532715, "learning_rate": 1.4444444444444444e-05, "loss": 0.0689, "step": 1802 }, { "epoch": 7.126482213438735, "grad_norm": 2.999032974243164, "learning_rate": 1.4424603174603174e-05, "loss": 0.1407, "step": 1803 }, { "epoch": 7.130434782608695, "grad_norm": 2.492380142211914, "learning_rate": 1.4404761904761905e-05, "loss": 0.0919, "step": 1804 }, { "epoch": 7.134387351778656, "grad_norm": 2.170208692550659, "learning_rate": 1.4384920634920635e-05, "loss": 0.0862, "step": 1805 }, { "epoch": 7.138339920948616, "grad_norm": 2.5370352268218994, "learning_rate": 1.4365079365079364e-05, "loss": 0.0899, "step": 1806 }, { "epoch": 7.142292490118577, "grad_norm": 2.430377960205078, "learning_rate": 1.4345238095238094e-05, "loss": 0.1082, "step": 1807 }, { "epoch": 7.146245059288537, "grad_norm": 3.0296268463134766, "learning_rate": 1.4325396825396825e-05, "loss": 0.1017, "step": 1808 }, { "epoch": 7.150197628458498, "grad_norm": 2.192507743835449, "learning_rate": 1.4305555555555555e-05, "loss": 0.0969, "step": 1809 }, { "epoch": 7.154150197628459, "grad_norm": 2.191904306411743, "learning_rate": 1.4285714285714285e-05, "loss": 0.0905, "step": 1810 }, { "epoch": 7.158102766798419, "grad_norm": 1.9413102865219116, "learning_rate": 1.4265873015873016e-05, "loss": 0.0746, "step": 1811 }, { "epoch": 7.16205533596838, "grad_norm": 2.1803011894226074, "learning_rate": 1.4246031746031746e-05, "loss": 0.0679, "step": 1812 }, { "epoch": 7.16600790513834, "grad_norm": 1.9626377820968628, "learning_rate": 1.4226190476190477e-05, "loss": 0.0784, "step": 1813 }, { "epoch": 7.169960474308301, "grad_norm": 3.6826534271240234, "learning_rate": 1.4206349206349207e-05, "loss": 0.1193, "step": 1814 }, { "epoch": 7.173913043478261, "grad_norm": 2.018355131149292, "learning_rate": 1.4186507936507937e-05, "loss": 0.0614, "step": 1815 }, { "epoch": 7.177865612648222, "grad_norm": 2.3391740322113037, "learning_rate": 1.4166666666666668e-05, "loss": 0.1052, "step": 1816 }, { "epoch": 7.181818181818182, "grad_norm": 3.039984941482544, "learning_rate": 1.4146825396825396e-05, "loss": 0.1077, "step": 1817 }, { "epoch": 7.1857707509881426, "grad_norm": 2.2779464721679688, "learning_rate": 1.4126984126984127e-05, "loss": 0.0654, "step": 1818 }, { "epoch": 7.189723320158103, "grad_norm": 1.959446907043457, "learning_rate": 1.4107142857142857e-05, "loss": 0.0676, "step": 1819 }, { "epoch": 7.1936758893280635, "grad_norm": 3.0260629653930664, "learning_rate": 1.4087301587301587e-05, "loss": 0.0924, "step": 1820 }, { "epoch": 7.1976284584980235, "grad_norm": 1.552370548248291, "learning_rate": 1.4067460317460318e-05, "loss": 0.0557, "step": 1821 }, { "epoch": 7.201581027667984, "grad_norm": 1.851386308670044, "learning_rate": 1.4047619047619048e-05, "loss": 0.0842, "step": 1822 }, { "epoch": 7.205533596837944, "grad_norm": 1.901151180267334, "learning_rate": 1.4027777777777779e-05, "loss": 0.0602, "step": 1823 }, { "epoch": 7.209486166007905, "grad_norm": 2.256410598754883, "learning_rate": 1.4007936507936509e-05, "loss": 0.0779, "step": 1824 }, { "epoch": 7.213438735177865, "grad_norm": 2.2768027782440186, "learning_rate": 1.398809523809524e-05, "loss": 0.0701, "step": 1825 }, { "epoch": 7.217391304347826, "grad_norm": 1.8319358825683594, "learning_rate": 1.396825396825397e-05, "loss": 0.0711, "step": 1826 }, { "epoch": 7.221343873517786, "grad_norm": 1.6305453777313232, "learning_rate": 1.39484126984127e-05, "loss": 0.0601, "step": 1827 }, { "epoch": 7.225296442687747, "grad_norm": 2.377932071685791, "learning_rate": 1.392857142857143e-05, "loss": 0.084, "step": 1828 }, { "epoch": 7.229249011857707, "grad_norm": 2.3718206882476807, "learning_rate": 1.390873015873016e-05, "loss": 0.1073, "step": 1829 }, { "epoch": 7.233201581027668, "grad_norm": 1.8948851823806763, "learning_rate": 1.388888888888889e-05, "loss": 0.066, "step": 1830 }, { "epoch": 7.237154150197629, "grad_norm": 2.6554150581359863, "learning_rate": 1.386904761904762e-05, "loss": 0.1004, "step": 1831 }, { "epoch": 7.241106719367589, "grad_norm": 1.7534379959106445, "learning_rate": 1.384920634920635e-05, "loss": 0.0686, "step": 1832 }, { "epoch": 7.24505928853755, "grad_norm": 2.8044631481170654, "learning_rate": 1.382936507936508e-05, "loss": 0.0981, "step": 1833 }, { "epoch": 7.24901185770751, "grad_norm": 1.962010145187378, "learning_rate": 1.3809523809523811e-05, "loss": 0.0606, "step": 1834 }, { "epoch": 7.252964426877471, "grad_norm": 2.598278522491455, "learning_rate": 1.3789682539682541e-05, "loss": 0.0823, "step": 1835 }, { "epoch": 7.256916996047431, "grad_norm": 2.0581893920898438, "learning_rate": 1.3769841269841272e-05, "loss": 0.0724, "step": 1836 }, { "epoch": 7.260869565217392, "grad_norm": 2.1700425148010254, "learning_rate": 1.3750000000000002e-05, "loss": 0.0747, "step": 1837 }, { "epoch": 7.264822134387352, "grad_norm": 2.3965978622436523, "learning_rate": 1.3730158730158733e-05, "loss": 0.0756, "step": 1838 }, { "epoch": 7.2687747035573125, "grad_norm": 2.8531904220581055, "learning_rate": 1.3710317460317463e-05, "loss": 0.0915, "step": 1839 }, { "epoch": 7.2727272727272725, "grad_norm": 1.8420287370681763, "learning_rate": 1.3690476190476192e-05, "loss": 0.0674, "step": 1840 }, { "epoch": 7.276679841897233, "grad_norm": 2.040949583053589, "learning_rate": 1.367063492063492e-05, "loss": 0.0662, "step": 1841 }, { "epoch": 7.280632411067193, "grad_norm": 4.134637832641602, "learning_rate": 1.365079365079365e-05, "loss": 0.1148, "step": 1842 }, { "epoch": 7.284584980237154, "grad_norm": 2.1330273151397705, "learning_rate": 1.3630952380952381e-05, "loss": 0.0594, "step": 1843 }, { "epoch": 7.288537549407114, "grad_norm": 2.3376622200012207, "learning_rate": 1.3611111111111111e-05, "loss": 0.1026, "step": 1844 }, { "epoch": 7.292490118577075, "grad_norm": 2.1795151233673096, "learning_rate": 1.359126984126984e-05, "loss": 0.0685, "step": 1845 }, { "epoch": 7.296442687747035, "grad_norm": 2.4371635913848877, "learning_rate": 1.357142857142857e-05, "loss": 0.0856, "step": 1846 }, { "epoch": 7.300395256916996, "grad_norm": 1.8547295331954956, "learning_rate": 1.3551587301587301e-05, "loss": 0.0593, "step": 1847 }, { "epoch": 7.304347826086957, "grad_norm": 2.3141019344329834, "learning_rate": 1.3531746031746031e-05, "loss": 0.0935, "step": 1848 }, { "epoch": 7.308300395256917, "grad_norm": 1.9820470809936523, "learning_rate": 1.3511904761904762e-05, "loss": 0.069, "step": 1849 }, { "epoch": 7.312252964426877, "grad_norm": 2.5655643939971924, "learning_rate": 1.3492063492063492e-05, "loss": 0.0899, "step": 1850 }, { "epoch": 7.316205533596838, "grad_norm": 2.5492866039276123, "learning_rate": 1.3472222222222222e-05, "loss": 0.0874, "step": 1851 }, { "epoch": 7.320158102766799, "grad_norm": 2.2358458042144775, "learning_rate": 1.3452380952380953e-05, "loss": 0.0769, "step": 1852 }, { "epoch": 7.324110671936759, "grad_norm": 1.9372227191925049, "learning_rate": 1.3432539682539683e-05, "loss": 0.0729, "step": 1853 }, { "epoch": 7.32806324110672, "grad_norm": 2.8513784408569336, "learning_rate": 1.3412698412698413e-05, "loss": 0.0988, "step": 1854 }, { "epoch": 7.33201581027668, "grad_norm": 2.437175989151001, "learning_rate": 1.3392857142857144e-05, "loss": 0.0891, "step": 1855 }, { "epoch": 7.335968379446641, "grad_norm": 1.8044086694717407, "learning_rate": 1.3373015873015873e-05, "loss": 0.0611, "step": 1856 }, { "epoch": 7.339920948616601, "grad_norm": 1.8136906623840332, "learning_rate": 1.3353174603174603e-05, "loss": 0.0604, "step": 1857 }, { "epoch": 7.3438735177865615, "grad_norm": 2.4834377765655518, "learning_rate": 1.3333333333333333e-05, "loss": 0.126, "step": 1858 }, { "epoch": 7.3478260869565215, "grad_norm": 2.810823678970337, "learning_rate": 1.3313492063492064e-05, "loss": 0.1165, "step": 1859 }, { "epoch": 7.351778656126482, "grad_norm": 3.216146945953369, "learning_rate": 1.3293650793650794e-05, "loss": 0.0965, "step": 1860 }, { "epoch": 7.355731225296442, "grad_norm": 2.910884141921997, "learning_rate": 1.3273809523809524e-05, "loss": 0.103, "step": 1861 }, { "epoch": 7.359683794466403, "grad_norm": 1.8496427536010742, "learning_rate": 1.3253968253968255e-05, "loss": 0.0602, "step": 1862 }, { "epoch": 7.363636363636363, "grad_norm": 2.382444381713867, "learning_rate": 1.3234126984126985e-05, "loss": 0.1, "step": 1863 }, { "epoch": 7.367588932806324, "grad_norm": 2.162414789199829, "learning_rate": 1.3214285714285716e-05, "loss": 0.0704, "step": 1864 }, { "epoch": 7.371541501976284, "grad_norm": 2.582324981689453, "learning_rate": 1.3194444444444446e-05, "loss": 0.0735, "step": 1865 }, { "epoch": 7.375494071146245, "grad_norm": 2.0686068534851074, "learning_rate": 1.3174603174603176e-05, "loss": 0.0691, "step": 1866 }, { "epoch": 7.379446640316205, "grad_norm": 2.73799467086792, "learning_rate": 1.3154761904761907e-05, "loss": 0.0977, "step": 1867 }, { "epoch": 7.383399209486166, "grad_norm": 2.5663845539093018, "learning_rate": 1.3134920634920635e-05, "loss": 0.0964, "step": 1868 }, { "epoch": 7.387351778656127, "grad_norm": 2.602886199951172, "learning_rate": 1.3115079365079366e-05, "loss": 0.0834, "step": 1869 }, { "epoch": 7.391304347826087, "grad_norm": 2.9359493255615234, "learning_rate": 1.3095238095238096e-05, "loss": 0.1205, "step": 1870 }, { "epoch": 7.395256916996048, "grad_norm": 2.4510388374328613, "learning_rate": 1.3075396825396826e-05, "loss": 0.0765, "step": 1871 }, { "epoch": 7.399209486166008, "grad_norm": 1.9874929189682007, "learning_rate": 1.3055555555555557e-05, "loss": 0.0798, "step": 1872 }, { "epoch": 7.403162055335969, "grad_norm": 1.9240570068359375, "learning_rate": 1.3035714285714287e-05, "loss": 0.0649, "step": 1873 }, { "epoch": 7.407114624505929, "grad_norm": 1.8569307327270508, "learning_rate": 1.3015873015873018e-05, "loss": 0.061, "step": 1874 }, { "epoch": 7.41106719367589, "grad_norm": 2.232877016067505, "learning_rate": 1.2996031746031748e-05, "loss": 0.0778, "step": 1875 }, { "epoch": 7.41501976284585, "grad_norm": 2.330413818359375, "learning_rate": 1.2976190476190478e-05, "loss": 0.0897, "step": 1876 }, { "epoch": 7.4189723320158105, "grad_norm": 2.543062686920166, "learning_rate": 1.2956349206349209e-05, "loss": 0.1039, "step": 1877 }, { "epoch": 7.4229249011857705, "grad_norm": 2.075242757797241, "learning_rate": 1.2936507936507939e-05, "loss": 0.0918, "step": 1878 }, { "epoch": 7.426877470355731, "grad_norm": 2.339674234390259, "learning_rate": 1.2916666666666668e-05, "loss": 0.0905, "step": 1879 }, { "epoch": 7.430830039525691, "grad_norm": 2.0178771018981934, "learning_rate": 1.2896825396825398e-05, "loss": 0.074, "step": 1880 }, { "epoch": 7.434782608695652, "grad_norm": 2.169339179992676, "learning_rate": 1.2876984126984127e-05, "loss": 0.0627, "step": 1881 }, { "epoch": 7.438735177865612, "grad_norm": 3.798720121383667, "learning_rate": 1.2857142857142857e-05, "loss": 0.1162, "step": 1882 }, { "epoch": 7.442687747035573, "grad_norm": 3.5218403339385986, "learning_rate": 1.2837301587301586e-05, "loss": 0.1009, "step": 1883 }, { "epoch": 7.446640316205533, "grad_norm": 2.855560302734375, "learning_rate": 1.2817460317460316e-05, "loss": 0.1037, "step": 1884 }, { "epoch": 7.450592885375494, "grad_norm": 1.8869415521621704, "learning_rate": 1.2797619047619047e-05, "loss": 0.0598, "step": 1885 }, { "epoch": 7.454545454545454, "grad_norm": 1.6059690713882446, "learning_rate": 1.2777777777777777e-05, "loss": 0.0565, "step": 1886 }, { "epoch": 7.458498023715415, "grad_norm": 3.0044198036193848, "learning_rate": 1.2757936507936507e-05, "loss": 0.131, "step": 1887 }, { "epoch": 7.462450592885375, "grad_norm": 2.2843515872955322, "learning_rate": 1.2738095238095238e-05, "loss": 0.0856, "step": 1888 }, { "epoch": 7.466403162055336, "grad_norm": 1.9446831941604614, "learning_rate": 1.2718253968253968e-05, "loss": 0.0657, "step": 1889 }, { "epoch": 7.470355731225297, "grad_norm": 2.3453097343444824, "learning_rate": 1.2698412698412699e-05, "loss": 0.0929, "step": 1890 }, { "epoch": 7.474308300395257, "grad_norm": 2.4141080379486084, "learning_rate": 1.2678571428571429e-05, "loss": 0.0882, "step": 1891 }, { "epoch": 7.478260869565218, "grad_norm": 3.0322489738464355, "learning_rate": 1.265873015873016e-05, "loss": 0.0845, "step": 1892 }, { "epoch": 7.482213438735178, "grad_norm": 2.348433017730713, "learning_rate": 1.263888888888889e-05, "loss": 0.0828, "step": 1893 }, { "epoch": 7.486166007905139, "grad_norm": 2.1347391605377197, "learning_rate": 1.261904761904762e-05, "loss": 0.0667, "step": 1894 }, { "epoch": 7.490118577075099, "grad_norm": 2.272301435470581, "learning_rate": 1.2599206349206349e-05, "loss": 0.0898, "step": 1895 }, { "epoch": 7.4940711462450595, "grad_norm": 2.3849878311157227, "learning_rate": 1.2579365079365079e-05, "loss": 0.0747, "step": 1896 }, { "epoch": 7.4980237154150196, "grad_norm": 2.220501661300659, "learning_rate": 1.255952380952381e-05, "loss": 0.0794, "step": 1897 }, { "epoch": 7.5019762845849804, "grad_norm": 2.281405448913574, "learning_rate": 1.253968253968254e-05, "loss": 0.0915, "step": 1898 }, { "epoch": 7.5059288537549405, "grad_norm": 2.4831249713897705, "learning_rate": 1.251984126984127e-05, "loss": 0.0725, "step": 1899 }, { "epoch": 7.509881422924901, "grad_norm": 2.52744197845459, "learning_rate": 1.25e-05, "loss": 0.0719, "step": 1900 }, { "epoch": 7.513833992094861, "grad_norm": 2.3339502811431885, "learning_rate": 1.2480158730158731e-05, "loss": 0.0821, "step": 1901 }, { "epoch": 7.517786561264822, "grad_norm": 2.408015012741089, "learning_rate": 1.2460317460317461e-05, "loss": 0.0929, "step": 1902 }, { "epoch": 7.521739130434782, "grad_norm": 2.307608127593994, "learning_rate": 1.2440476190476192e-05, "loss": 0.0884, "step": 1903 }, { "epoch": 7.525691699604743, "grad_norm": 2.454751491546631, "learning_rate": 1.2420634920634922e-05, "loss": 0.0851, "step": 1904 }, { "epoch": 7.529644268774703, "grad_norm": 1.6989669799804688, "learning_rate": 1.2400793650793652e-05, "loss": 0.0765, "step": 1905 }, { "epoch": 7.533596837944664, "grad_norm": 3.3208930492401123, "learning_rate": 1.2380952380952381e-05, "loss": 0.1033, "step": 1906 }, { "epoch": 7.537549407114625, "grad_norm": 2.317495822906494, "learning_rate": 1.2361111111111112e-05, "loss": 0.0911, "step": 1907 }, { "epoch": 7.541501976284585, "grad_norm": 2.3214991092681885, "learning_rate": 1.2341269841269842e-05, "loss": 0.0985, "step": 1908 }, { "epoch": 7.545454545454545, "grad_norm": 2.3527603149414062, "learning_rate": 1.2321428571428572e-05, "loss": 0.077, "step": 1909 }, { "epoch": 7.549407114624506, "grad_norm": 3.5933728218078613, "learning_rate": 1.2301587301587301e-05, "loss": 0.1284, "step": 1910 }, { "epoch": 7.553359683794467, "grad_norm": 2.274735450744629, "learning_rate": 1.2281746031746031e-05, "loss": 0.1062, "step": 1911 }, { "epoch": 7.557312252964427, "grad_norm": 3.133971691131592, "learning_rate": 1.2261904761904762e-05, "loss": 0.1185, "step": 1912 }, { "epoch": 7.561264822134388, "grad_norm": 3.7312350273132324, "learning_rate": 1.2242063492063492e-05, "loss": 0.1343, "step": 1913 }, { "epoch": 7.565217391304348, "grad_norm": 1.8403165340423584, "learning_rate": 1.2222222222222222e-05, "loss": 0.0699, "step": 1914 }, { "epoch": 7.569169960474309, "grad_norm": 3.350177049636841, "learning_rate": 1.2202380952380953e-05, "loss": 0.1266, "step": 1915 }, { "epoch": 7.573122529644269, "grad_norm": 1.6349577903747559, "learning_rate": 1.2182539682539683e-05, "loss": 0.0628, "step": 1916 }, { "epoch": 7.5770750988142295, "grad_norm": 1.906072974205017, "learning_rate": 1.2162698412698414e-05, "loss": 0.0835, "step": 1917 }, { "epoch": 7.5810276679841895, "grad_norm": 2.1547391414642334, "learning_rate": 1.2142857142857144e-05, "loss": 0.0803, "step": 1918 }, { "epoch": 7.58498023715415, "grad_norm": 2.4813320636749268, "learning_rate": 1.2123015873015874e-05, "loss": 0.1215, "step": 1919 }, { "epoch": 7.58893280632411, "grad_norm": 2.235426664352417, "learning_rate": 1.2103174603174603e-05, "loss": 0.0785, "step": 1920 }, { "epoch": 7.592885375494071, "grad_norm": 2.80841326713562, "learning_rate": 1.2083333333333333e-05, "loss": 0.1276, "step": 1921 }, { "epoch": 7.596837944664031, "grad_norm": 2.305530548095703, "learning_rate": 1.2063492063492064e-05, "loss": 0.065, "step": 1922 }, { "epoch": 7.600790513833992, "grad_norm": 1.6414098739624023, "learning_rate": 1.2043650793650794e-05, "loss": 0.0607, "step": 1923 }, { "epoch": 7.604743083003952, "grad_norm": 2.6492977142333984, "learning_rate": 1.2023809523809525e-05, "loss": 0.0877, "step": 1924 }, { "epoch": 7.608695652173913, "grad_norm": 2.1286118030548096, "learning_rate": 1.2003968253968255e-05, "loss": 0.0843, "step": 1925 }, { "epoch": 7.612648221343873, "grad_norm": 2.137827157974243, "learning_rate": 1.1984126984126985e-05, "loss": 0.0691, "step": 1926 }, { "epoch": 7.616600790513834, "grad_norm": 2.486067056655884, "learning_rate": 1.1964285714285716e-05, "loss": 0.0795, "step": 1927 }, { "epoch": 7.620553359683795, "grad_norm": 3.5269148349761963, "learning_rate": 1.1944444444444446e-05, "loss": 0.1021, "step": 1928 }, { "epoch": 7.624505928853755, "grad_norm": 3.005458116531372, "learning_rate": 1.1924603174603176e-05, "loss": 0.0863, "step": 1929 }, { "epoch": 7.628458498023716, "grad_norm": 1.5922969579696655, "learning_rate": 1.1904761904761905e-05, "loss": 0.0562, "step": 1930 }, { "epoch": 7.632411067193676, "grad_norm": 2.0607211589813232, "learning_rate": 1.1884920634920635e-05, "loss": 0.0796, "step": 1931 }, { "epoch": 7.636363636363637, "grad_norm": 2.8598814010620117, "learning_rate": 1.1865079365079366e-05, "loss": 0.1132, "step": 1932 }, { "epoch": 7.640316205533597, "grad_norm": 2.145017147064209, "learning_rate": 1.1845238095238095e-05, "loss": 0.081, "step": 1933 }, { "epoch": 7.644268774703558, "grad_norm": 1.9191386699676514, "learning_rate": 1.1825396825396825e-05, "loss": 0.0867, "step": 1934 }, { "epoch": 7.648221343873518, "grad_norm": 2.149658441543579, "learning_rate": 1.1805555555555555e-05, "loss": 0.0706, "step": 1935 }, { "epoch": 7.6521739130434785, "grad_norm": 2.7651522159576416, "learning_rate": 1.1785714285714286e-05, "loss": 0.0835, "step": 1936 }, { "epoch": 7.6561264822134385, "grad_norm": 3.162454605102539, "learning_rate": 1.1765873015873016e-05, "loss": 0.1092, "step": 1937 }, { "epoch": 7.660079051383399, "grad_norm": 2.8850865364074707, "learning_rate": 1.1746031746031746e-05, "loss": 0.0828, "step": 1938 }, { "epoch": 7.664031620553359, "grad_norm": 1.7384766340255737, "learning_rate": 1.1726190476190477e-05, "loss": 0.057, "step": 1939 }, { "epoch": 7.66798418972332, "grad_norm": 1.8011753559112549, "learning_rate": 1.1706349206349207e-05, "loss": 0.0737, "step": 1940 }, { "epoch": 7.67193675889328, "grad_norm": 3.141075372695923, "learning_rate": 1.1686507936507938e-05, "loss": 0.0868, "step": 1941 }, { "epoch": 7.675889328063241, "grad_norm": 3.0187206268310547, "learning_rate": 1.1666666666666668e-05, "loss": 0.1188, "step": 1942 }, { "epoch": 7.679841897233201, "grad_norm": 2.71610689163208, "learning_rate": 1.1646825396825398e-05, "loss": 0.0873, "step": 1943 }, { "epoch": 7.683794466403162, "grad_norm": 3.3756630420684814, "learning_rate": 1.1626984126984129e-05, "loss": 0.1259, "step": 1944 }, { "epoch": 7.687747035573123, "grad_norm": 2.53981351852417, "learning_rate": 1.1607142857142857e-05, "loss": 0.0935, "step": 1945 }, { "epoch": 7.691699604743083, "grad_norm": 1.9116166830062866, "learning_rate": 1.1587301587301588e-05, "loss": 0.0657, "step": 1946 }, { "epoch": 7.695652173913043, "grad_norm": 2.145357370376587, "learning_rate": 1.1567460317460318e-05, "loss": 0.0928, "step": 1947 }, { "epoch": 7.699604743083004, "grad_norm": 2.8089849948883057, "learning_rate": 1.1547619047619048e-05, "loss": 0.0938, "step": 1948 }, { "epoch": 7.703557312252965, "grad_norm": 2.510547399520874, "learning_rate": 1.1527777777777779e-05, "loss": 0.0942, "step": 1949 }, { "epoch": 7.707509881422925, "grad_norm": 2.142611503601074, "learning_rate": 1.1507936507936508e-05, "loss": 0.0776, "step": 1950 }, { "epoch": 7.711462450592886, "grad_norm": 2.8611981868743896, "learning_rate": 1.1488095238095238e-05, "loss": 0.0874, "step": 1951 }, { "epoch": 7.715415019762846, "grad_norm": 2.1545233726501465, "learning_rate": 1.1468253968253968e-05, "loss": 0.0846, "step": 1952 }, { "epoch": 7.719367588932807, "grad_norm": 2.4546070098876953, "learning_rate": 1.1448412698412699e-05, "loss": 0.0745, "step": 1953 }, { "epoch": 7.723320158102767, "grad_norm": 3.605870008468628, "learning_rate": 1.1428571428571429e-05, "loss": 0.1321, "step": 1954 }, { "epoch": 7.7272727272727275, "grad_norm": 3.173536539077759, "learning_rate": 1.140873015873016e-05, "loss": 0.1024, "step": 1955 }, { "epoch": 7.7312252964426875, "grad_norm": 2.163079023361206, "learning_rate": 1.138888888888889e-05, "loss": 0.0626, "step": 1956 }, { "epoch": 7.735177865612648, "grad_norm": 2.247568130493164, "learning_rate": 1.136904761904762e-05, "loss": 0.0801, "step": 1957 }, { "epoch": 7.739130434782608, "grad_norm": 2.0294995307922363, "learning_rate": 1.1349206349206349e-05, "loss": 0.0987, "step": 1958 }, { "epoch": 7.743083003952569, "grad_norm": 2.5226917266845703, "learning_rate": 1.132936507936508e-05, "loss": 0.0815, "step": 1959 }, { "epoch": 7.747035573122529, "grad_norm": 2.68029522895813, "learning_rate": 1.130952380952381e-05, "loss": 0.1003, "step": 1960 }, { "epoch": 7.75098814229249, "grad_norm": 2.617349624633789, "learning_rate": 1.128968253968254e-05, "loss": 0.0843, "step": 1961 }, { "epoch": 7.75494071146245, "grad_norm": 2.5157854557037354, "learning_rate": 1.126984126984127e-05, "loss": 0.1024, "step": 1962 }, { "epoch": 7.758893280632411, "grad_norm": 2.2654972076416016, "learning_rate": 1.125e-05, "loss": 0.0801, "step": 1963 }, { "epoch": 7.762845849802371, "grad_norm": 2.273188352584839, "learning_rate": 1.1230158730158731e-05, "loss": 0.0774, "step": 1964 }, { "epoch": 7.766798418972332, "grad_norm": 2.1782922744750977, "learning_rate": 1.1210317460317461e-05, "loss": 0.069, "step": 1965 }, { "epoch": 7.770750988142293, "grad_norm": 2.23232102394104, "learning_rate": 1.1190476190476192e-05, "loss": 0.0868, "step": 1966 }, { "epoch": 7.774703557312253, "grad_norm": 2.0577633380889893, "learning_rate": 1.1170634920634922e-05, "loss": 0.0718, "step": 1967 }, { "epoch": 7.778656126482213, "grad_norm": 2.2454826831817627, "learning_rate": 1.1150793650793653e-05, "loss": 0.0789, "step": 1968 }, { "epoch": 7.782608695652174, "grad_norm": 3.088087558746338, "learning_rate": 1.1130952380952381e-05, "loss": 0.0944, "step": 1969 }, { "epoch": 7.786561264822135, "grad_norm": 2.3378899097442627, "learning_rate": 1.1111111111111112e-05, "loss": 0.0908, "step": 1970 }, { "epoch": 7.790513833992095, "grad_norm": 2.6260411739349365, "learning_rate": 1.1091269841269842e-05, "loss": 0.087, "step": 1971 }, { "epoch": 7.794466403162056, "grad_norm": 1.8749479055404663, "learning_rate": 1.107142857142857e-05, "loss": 0.0641, "step": 1972 }, { "epoch": 7.798418972332016, "grad_norm": 2.9281997680664062, "learning_rate": 1.1051587301587301e-05, "loss": 0.1033, "step": 1973 }, { "epoch": 7.8023715415019765, "grad_norm": 1.7113523483276367, "learning_rate": 1.1031746031746031e-05, "loss": 0.0573, "step": 1974 }, { "epoch": 7.8063241106719365, "grad_norm": 4.461965560913086, "learning_rate": 1.1011904761904762e-05, "loss": 0.1805, "step": 1975 }, { "epoch": 7.810276679841897, "grad_norm": 2.3460776805877686, "learning_rate": 1.0992063492063492e-05, "loss": 0.0981, "step": 1976 }, { "epoch": 7.8142292490118574, "grad_norm": 2.7546355724334717, "learning_rate": 1.0972222222222223e-05, "loss": 0.0963, "step": 1977 }, { "epoch": 7.818181818181818, "grad_norm": 3.3164448738098145, "learning_rate": 1.0952380952380953e-05, "loss": 0.096, "step": 1978 }, { "epoch": 7.822134387351778, "grad_norm": 1.8524004220962524, "learning_rate": 1.0932539682539683e-05, "loss": 0.0725, "step": 1979 }, { "epoch": 7.826086956521739, "grad_norm": 2.2797439098358154, "learning_rate": 1.0912698412698414e-05, "loss": 0.0792, "step": 1980 }, { "epoch": 7.830039525691699, "grad_norm": 2.863020420074463, "learning_rate": 1.0892857142857144e-05, "loss": 0.0749, "step": 1981 }, { "epoch": 7.83399209486166, "grad_norm": 2.2198400497436523, "learning_rate": 1.0873015873015874e-05, "loss": 0.0986, "step": 1982 }, { "epoch": 7.837944664031621, "grad_norm": 1.540998935699463, "learning_rate": 1.0853174603174605e-05, "loss": 0.0654, "step": 1983 }, { "epoch": 7.841897233201581, "grad_norm": 1.856723666191101, "learning_rate": 1.0833333333333334e-05, "loss": 0.0751, "step": 1984 }, { "epoch": 7.845849802371541, "grad_norm": 2.4738926887512207, "learning_rate": 1.0813492063492064e-05, "loss": 0.0848, "step": 1985 }, { "epoch": 7.849802371541502, "grad_norm": 1.9589910507202148, "learning_rate": 1.0793650793650794e-05, "loss": 0.0674, "step": 1986 }, { "epoch": 7.853754940711463, "grad_norm": 2.1947100162506104, "learning_rate": 1.0773809523809525e-05, "loss": 0.077, "step": 1987 }, { "epoch": 7.857707509881423, "grad_norm": 3.0187575817108154, "learning_rate": 1.0753968253968255e-05, "loss": 0.1128, "step": 1988 }, { "epoch": 7.861660079051384, "grad_norm": 1.8199398517608643, "learning_rate": 1.0734126984126984e-05, "loss": 0.077, "step": 1989 }, { "epoch": 7.865612648221344, "grad_norm": 2.459689140319824, "learning_rate": 1.0714285714285714e-05, "loss": 0.0834, "step": 1990 }, { "epoch": 7.869565217391305, "grad_norm": 2.451312303543091, "learning_rate": 1.0694444444444444e-05, "loss": 0.1036, "step": 1991 }, { "epoch": 7.873517786561265, "grad_norm": 2.400502920150757, "learning_rate": 1.0674603174603175e-05, "loss": 0.0821, "step": 1992 }, { "epoch": 7.877470355731226, "grad_norm": 1.9994014501571655, "learning_rate": 1.0654761904761905e-05, "loss": 0.072, "step": 1993 }, { "epoch": 7.881422924901186, "grad_norm": 1.9804929494857788, "learning_rate": 1.0634920634920636e-05, "loss": 0.0723, "step": 1994 }, { "epoch": 7.8853754940711465, "grad_norm": 1.7949855327606201, "learning_rate": 1.0615079365079366e-05, "loss": 0.0725, "step": 1995 }, { "epoch": 7.8893280632411065, "grad_norm": 3.4392995834350586, "learning_rate": 1.0595238095238096e-05, "loss": 0.1088, "step": 1996 }, { "epoch": 7.893280632411067, "grad_norm": 2.1142258644104004, "learning_rate": 1.0575396825396825e-05, "loss": 0.0653, "step": 1997 }, { "epoch": 7.897233201581027, "grad_norm": 1.7416729927062988, "learning_rate": 1.0555555555555555e-05, "loss": 0.0623, "step": 1998 }, { "epoch": 7.901185770750988, "grad_norm": 2.438103437423706, "learning_rate": 1.0535714285714286e-05, "loss": 0.0712, "step": 1999 }, { "epoch": 7.905138339920948, "grad_norm": 2.4873170852661133, "learning_rate": 1.0515873015873016e-05, "loss": 0.0704, "step": 2000 }, { "epoch": 7.909090909090909, "grad_norm": 2.9187819957733154, "learning_rate": 1.0496031746031747e-05, "loss": 0.1055, "step": 2001 }, { "epoch": 7.913043478260869, "grad_norm": 2.2539193630218506, "learning_rate": 1.0476190476190477e-05, "loss": 0.0777, "step": 2002 }, { "epoch": 7.91699604743083, "grad_norm": 1.9834545850753784, "learning_rate": 1.0456349206349207e-05, "loss": 0.0651, "step": 2003 }, { "epoch": 7.920948616600791, "grad_norm": 2.7357428073883057, "learning_rate": 1.0436507936507938e-05, "loss": 0.0943, "step": 2004 }, { "epoch": 7.924901185770751, "grad_norm": 2.9565341472625732, "learning_rate": 1.0416666666666668e-05, "loss": 0.1032, "step": 2005 }, { "epoch": 7.928853754940711, "grad_norm": 2.176581382751465, "learning_rate": 1.0396825396825398e-05, "loss": 0.0774, "step": 2006 }, { "epoch": 7.932806324110672, "grad_norm": 2.717416524887085, "learning_rate": 1.0376984126984129e-05, "loss": 0.1105, "step": 2007 }, { "epoch": 7.936758893280633, "grad_norm": 2.976388931274414, "learning_rate": 1.0357142857142859e-05, "loss": 0.1036, "step": 2008 }, { "epoch": 7.940711462450593, "grad_norm": 4.3491411209106445, "learning_rate": 1.0337301587301588e-05, "loss": 0.166, "step": 2009 }, { "epoch": 7.944664031620554, "grad_norm": 2.4038586616516113, "learning_rate": 1.0317460317460318e-05, "loss": 0.0833, "step": 2010 }, { "epoch": 7.948616600790514, "grad_norm": 2.2004966735839844, "learning_rate": 1.0297619047619047e-05, "loss": 0.079, "step": 2011 }, { "epoch": 7.952569169960475, "grad_norm": 2.817812919616699, "learning_rate": 1.0277777777777777e-05, "loss": 0.1185, "step": 2012 }, { "epoch": 7.956521739130435, "grad_norm": 2.230272054672241, "learning_rate": 1.0257936507936508e-05, "loss": 0.0932, "step": 2013 }, { "epoch": 7.9604743083003955, "grad_norm": 2.4217472076416016, "learning_rate": 1.0238095238095238e-05, "loss": 0.0912, "step": 2014 }, { "epoch": 7.9644268774703555, "grad_norm": 2.3574728965759277, "learning_rate": 1.0218253968253968e-05, "loss": 0.0844, "step": 2015 }, { "epoch": 7.968379446640316, "grad_norm": 1.9760262966156006, "learning_rate": 1.0198412698412699e-05, "loss": 0.0852, "step": 2016 }, { "epoch": 7.972332015810276, "grad_norm": 3.1597955226898193, "learning_rate": 1.0178571428571429e-05, "loss": 0.1101, "step": 2017 }, { "epoch": 7.976284584980237, "grad_norm": 2.150995969772339, "learning_rate": 1.015873015873016e-05, "loss": 0.0867, "step": 2018 }, { "epoch": 7.980237154150197, "grad_norm": 2.0937557220458984, "learning_rate": 1.013888888888889e-05, "loss": 0.0701, "step": 2019 }, { "epoch": 7.984189723320158, "grad_norm": 3.2109408378601074, "learning_rate": 1.011904761904762e-05, "loss": 0.0974, "step": 2020 }, { "epoch": 7.988142292490118, "grad_norm": 3.7526028156280518, "learning_rate": 1.009920634920635e-05, "loss": 0.1281, "step": 2021 }, { "epoch": 7.992094861660079, "grad_norm": 2.220501184463501, "learning_rate": 1.007936507936508e-05, "loss": 0.1216, "step": 2022 }, { "epoch": 7.996047430830039, "grad_norm": 2.583831548690796, "learning_rate": 1.005952380952381e-05, "loss": 0.0817, "step": 2023 }, { "epoch": 8.0, "grad_norm": 2.142831802368164, "learning_rate": 1.003968253968254e-05, "loss": 0.0659, "step": 2024 }, { "epoch": 8.003952569169961, "grad_norm": 0.9592176079750061, "learning_rate": 1.001984126984127e-05, "loss": 0.0366, "step": 2025 }, { "epoch": 8.007905138339922, "grad_norm": 1.110743522644043, "learning_rate": 1e-05, "loss": 0.0397, "step": 2026 }, { "epoch": 8.011857707509881, "grad_norm": 1.0800786018371582, "learning_rate": 9.980158730158731e-06, "loss": 0.04, "step": 2027 }, { "epoch": 8.015810276679842, "grad_norm": 1.1953082084655762, "learning_rate": 9.96031746031746e-06, "loss": 0.04, "step": 2028 }, { "epoch": 8.019762845849803, "grad_norm": 1.4953047037124634, "learning_rate": 9.94047619047619e-06, "loss": 0.0726, "step": 2029 }, { "epoch": 8.023715415019764, "grad_norm": 1.4194068908691406, "learning_rate": 9.92063492063492e-06, "loss": 0.0474, "step": 2030 }, { "epoch": 8.027667984189723, "grad_norm": 1.2994885444641113, "learning_rate": 9.900793650793651e-06, "loss": 0.0462, "step": 2031 }, { "epoch": 8.031620553359684, "grad_norm": 1.5428236722946167, "learning_rate": 9.880952380952381e-06, "loss": 0.0681, "step": 2032 }, { "epoch": 8.035573122529645, "grad_norm": 1.375458836555481, "learning_rate": 9.861111111111112e-06, "loss": 0.0437, "step": 2033 }, { "epoch": 8.039525691699605, "grad_norm": 1.303333044052124, "learning_rate": 9.841269841269842e-06, "loss": 0.0418, "step": 2034 }, { "epoch": 8.043478260869565, "grad_norm": 1.6367980241775513, "learning_rate": 9.821428571428573e-06, "loss": 0.0603, "step": 2035 }, { "epoch": 8.047430830039525, "grad_norm": 1.2272205352783203, "learning_rate": 9.801587301587301e-06, "loss": 0.0373, "step": 2036 }, { "epoch": 8.051383399209486, "grad_norm": 1.42149019241333, "learning_rate": 9.781746031746032e-06, "loss": 0.0376, "step": 2037 }, { "epoch": 8.055335968379447, "grad_norm": 1.757165789604187, "learning_rate": 9.761904761904762e-06, "loss": 0.051, "step": 2038 }, { "epoch": 8.059288537549406, "grad_norm": 2.7419040203094482, "learning_rate": 9.742063492063492e-06, "loss": 0.0709, "step": 2039 }, { "epoch": 8.063241106719367, "grad_norm": 1.15654718875885, "learning_rate": 9.722222222222223e-06, "loss": 0.0328, "step": 2040 }, { "epoch": 8.067193675889328, "grad_norm": 1.6456338167190552, "learning_rate": 9.702380952380953e-06, "loss": 0.0584, "step": 2041 }, { "epoch": 8.071146245059289, "grad_norm": 1.8168511390686035, "learning_rate": 9.682539682539683e-06, "loss": 0.0547, "step": 2042 }, { "epoch": 8.075098814229248, "grad_norm": 1.2571635246276855, "learning_rate": 9.662698412698414e-06, "loss": 0.0409, "step": 2043 }, { "epoch": 8.079051383399209, "grad_norm": 1.5656439065933228, "learning_rate": 9.642857142857144e-06, "loss": 0.0416, "step": 2044 }, { "epoch": 8.08300395256917, "grad_norm": 1.297942042350769, "learning_rate": 9.623015873015875e-06, "loss": 0.0449, "step": 2045 }, { "epoch": 8.08695652173913, "grad_norm": 1.577046275138855, "learning_rate": 9.603174603174605e-06, "loss": 0.0613, "step": 2046 }, { "epoch": 8.090909090909092, "grad_norm": 1.2949413061141968, "learning_rate": 9.583333333333334e-06, "loss": 0.0421, "step": 2047 }, { "epoch": 8.09486166007905, "grad_norm": 1.3732746839523315, "learning_rate": 9.563492063492064e-06, "loss": 0.0487, "step": 2048 }, { "epoch": 8.098814229249012, "grad_norm": 1.6629955768585205, "learning_rate": 9.543650793650793e-06, "loss": 0.0535, "step": 2049 }, { "epoch": 8.102766798418973, "grad_norm": 1.3761000633239746, "learning_rate": 9.523809523809523e-06, "loss": 0.0475, "step": 2050 }, { "epoch": 8.106719367588934, "grad_norm": 1.721295952796936, "learning_rate": 9.503968253968253e-06, "loss": 0.0532, "step": 2051 }, { "epoch": 8.110671936758893, "grad_norm": 2.0054855346679688, "learning_rate": 9.484126984126984e-06, "loss": 0.0652, "step": 2052 }, { "epoch": 8.114624505928854, "grad_norm": 1.1261333227157593, "learning_rate": 9.464285714285714e-06, "loss": 0.0385, "step": 2053 }, { "epoch": 8.118577075098814, "grad_norm": 1.6062073707580566, "learning_rate": 9.444444444444445e-06, "loss": 0.0692, "step": 2054 }, { "epoch": 8.122529644268775, "grad_norm": 1.7869125604629517, "learning_rate": 9.424603174603175e-06, "loss": 0.0545, "step": 2055 }, { "epoch": 8.126482213438734, "grad_norm": 1.4991074800491333, "learning_rate": 9.404761904761905e-06, "loss": 0.0564, "step": 2056 }, { "epoch": 8.130434782608695, "grad_norm": 1.527508020401001, "learning_rate": 9.384920634920636e-06, "loss": 0.0515, "step": 2057 }, { "epoch": 8.134387351778656, "grad_norm": 1.497931957244873, "learning_rate": 9.365079365079366e-06, "loss": 0.0542, "step": 2058 }, { "epoch": 8.138339920948617, "grad_norm": 1.5828138589859009, "learning_rate": 9.345238095238096e-06, "loss": 0.0502, "step": 2059 }, { "epoch": 8.142292490118576, "grad_norm": 2.0395078659057617, "learning_rate": 9.325396825396827e-06, "loss": 0.0642, "step": 2060 }, { "epoch": 8.146245059288537, "grad_norm": 1.7669570446014404, "learning_rate": 9.305555555555555e-06, "loss": 0.0685, "step": 2061 }, { "epoch": 8.150197628458498, "grad_norm": 1.2871284484863281, "learning_rate": 9.285714285714286e-06, "loss": 0.0494, "step": 2062 }, { "epoch": 8.154150197628459, "grad_norm": 1.5461068153381348, "learning_rate": 9.265873015873016e-06, "loss": 0.0486, "step": 2063 }, { "epoch": 8.15810276679842, "grad_norm": 3.5550456047058105, "learning_rate": 9.246031746031747e-06, "loss": 0.0718, "step": 2064 }, { "epoch": 8.162055335968379, "grad_norm": 1.4119137525558472, "learning_rate": 9.226190476190477e-06, "loss": 0.039, "step": 2065 }, { "epoch": 8.16600790513834, "grad_norm": 1.7155872583389282, "learning_rate": 9.206349206349207e-06, "loss": 0.0476, "step": 2066 }, { "epoch": 8.1699604743083, "grad_norm": 1.5434550046920776, "learning_rate": 9.186507936507938e-06, "loss": 0.0549, "step": 2067 }, { "epoch": 8.173913043478262, "grad_norm": 1.677038550376892, "learning_rate": 9.166666666666666e-06, "loss": 0.0592, "step": 2068 }, { "epoch": 8.17786561264822, "grad_norm": 1.7242430448532104, "learning_rate": 9.146825396825397e-06, "loss": 0.0461, "step": 2069 }, { "epoch": 8.181818181818182, "grad_norm": 1.5962724685668945, "learning_rate": 9.126984126984127e-06, "loss": 0.056, "step": 2070 }, { "epoch": 8.185770750988143, "grad_norm": 1.2169913053512573, "learning_rate": 9.107142857142858e-06, "loss": 0.0414, "step": 2071 }, { "epoch": 8.189723320158103, "grad_norm": 1.2942997217178345, "learning_rate": 9.087301587301588e-06, "loss": 0.0397, "step": 2072 }, { "epoch": 8.193675889328063, "grad_norm": 1.064394474029541, "learning_rate": 9.067460317460318e-06, "loss": 0.0412, "step": 2073 }, { "epoch": 8.197628458498023, "grad_norm": 1.550001859664917, "learning_rate": 9.047619047619047e-06, "loss": 0.0496, "step": 2074 }, { "epoch": 8.201581027667984, "grad_norm": 0.9107499718666077, "learning_rate": 9.027777777777777e-06, "loss": 0.036, "step": 2075 }, { "epoch": 8.205533596837945, "grad_norm": 1.4544087648391724, "learning_rate": 9.007936507936508e-06, "loss": 0.0486, "step": 2076 }, { "epoch": 8.209486166007904, "grad_norm": 1.8703144788742065, "learning_rate": 8.988095238095238e-06, "loss": 0.0581, "step": 2077 }, { "epoch": 8.213438735177865, "grad_norm": 1.3876852989196777, "learning_rate": 8.968253968253968e-06, "loss": 0.0463, "step": 2078 }, { "epoch": 8.217391304347826, "grad_norm": 1.303426742553711, "learning_rate": 8.948412698412699e-06, "loss": 0.0374, "step": 2079 }, { "epoch": 8.221343873517787, "grad_norm": 1.1986994743347168, "learning_rate": 8.92857142857143e-06, "loss": 0.0455, "step": 2080 }, { "epoch": 8.225296442687746, "grad_norm": 1.7828227281570435, "learning_rate": 8.90873015873016e-06, "loss": 0.0522, "step": 2081 }, { "epoch": 8.229249011857707, "grad_norm": 1.5747590065002441, "learning_rate": 8.88888888888889e-06, "loss": 0.0524, "step": 2082 }, { "epoch": 8.233201581027668, "grad_norm": 1.693088412284851, "learning_rate": 8.86904761904762e-06, "loss": 0.0467, "step": 2083 }, { "epoch": 8.237154150197629, "grad_norm": 1.7213762998580933, "learning_rate": 8.84920634920635e-06, "loss": 0.0479, "step": 2084 }, { "epoch": 8.24110671936759, "grad_norm": 1.4260462522506714, "learning_rate": 8.829365079365081e-06, "loss": 0.0515, "step": 2085 }, { "epoch": 8.245059288537549, "grad_norm": 1.4755898714065552, "learning_rate": 8.80952380952381e-06, "loss": 0.0434, "step": 2086 }, { "epoch": 8.24901185770751, "grad_norm": 1.2368816137313843, "learning_rate": 8.78968253968254e-06, "loss": 0.0404, "step": 2087 }, { "epoch": 8.25296442687747, "grad_norm": 1.3295128345489502, "learning_rate": 8.769841269841269e-06, "loss": 0.0443, "step": 2088 }, { "epoch": 8.256916996047432, "grad_norm": 1.3453065156936646, "learning_rate": 8.75e-06, "loss": 0.044, "step": 2089 }, { "epoch": 8.26086956521739, "grad_norm": 1.3558346033096313, "learning_rate": 8.73015873015873e-06, "loss": 0.0477, "step": 2090 }, { "epoch": 8.264822134387352, "grad_norm": 1.5338172912597656, "learning_rate": 8.71031746031746e-06, "loss": 0.0489, "step": 2091 }, { "epoch": 8.268774703557312, "grad_norm": 1.350160002708435, "learning_rate": 8.69047619047619e-06, "loss": 0.0508, "step": 2092 }, { "epoch": 8.272727272727273, "grad_norm": 1.5303537845611572, "learning_rate": 8.67063492063492e-06, "loss": 0.0465, "step": 2093 }, { "epoch": 8.276679841897232, "grad_norm": 1.5438976287841797, "learning_rate": 8.650793650793651e-06, "loss": 0.0692, "step": 2094 }, { "epoch": 8.280632411067193, "grad_norm": 1.1079212427139282, "learning_rate": 8.630952380952381e-06, "loss": 0.044, "step": 2095 }, { "epoch": 8.284584980237154, "grad_norm": 1.2162022590637207, "learning_rate": 8.611111111111112e-06, "loss": 0.0369, "step": 2096 }, { "epoch": 8.288537549407115, "grad_norm": 1.4534077644348145, "learning_rate": 8.591269841269842e-06, "loss": 0.0549, "step": 2097 }, { "epoch": 8.292490118577074, "grad_norm": 1.3834608793258667, "learning_rate": 8.571428571428573e-06, "loss": 0.0423, "step": 2098 }, { "epoch": 8.296442687747035, "grad_norm": 1.7133499383926392, "learning_rate": 8.551587301587303e-06, "loss": 0.0612, "step": 2099 }, { "epoch": 8.300395256916996, "grad_norm": 1.966752052307129, "learning_rate": 8.531746031746032e-06, "loss": 0.0524, "step": 2100 }, { "epoch": 8.304347826086957, "grad_norm": 2.143880844116211, "learning_rate": 8.511904761904762e-06, "loss": 0.0515, "step": 2101 }, { "epoch": 8.308300395256918, "grad_norm": 2.052530527114868, "learning_rate": 8.492063492063492e-06, "loss": 0.0721, "step": 2102 }, { "epoch": 8.312252964426877, "grad_norm": 2.0002057552337646, "learning_rate": 8.472222222222223e-06, "loss": 0.0559, "step": 2103 }, { "epoch": 8.316205533596838, "grad_norm": 1.0597388744354248, "learning_rate": 8.452380952380953e-06, "loss": 0.0444, "step": 2104 }, { "epoch": 8.320158102766799, "grad_norm": 1.3378074169158936, "learning_rate": 8.432539682539684e-06, "loss": 0.0408, "step": 2105 }, { "epoch": 8.32411067193676, "grad_norm": 1.926456093788147, "learning_rate": 8.412698412698414e-06, "loss": 0.0592, "step": 2106 }, { "epoch": 8.328063241106719, "grad_norm": 1.6579585075378418, "learning_rate": 8.392857142857143e-06, "loss": 0.0448, "step": 2107 }, { "epoch": 8.33201581027668, "grad_norm": 1.6691573858261108, "learning_rate": 8.373015873015873e-06, "loss": 0.0476, "step": 2108 }, { "epoch": 8.33596837944664, "grad_norm": 1.7489898204803467, "learning_rate": 8.353174603174603e-06, "loss": 0.067, "step": 2109 }, { "epoch": 8.339920948616601, "grad_norm": 1.1511108875274658, "learning_rate": 8.333333333333334e-06, "loss": 0.0477, "step": 2110 }, { "epoch": 8.34387351778656, "grad_norm": 1.527985692024231, "learning_rate": 8.313492063492064e-06, "loss": 0.041, "step": 2111 }, { "epoch": 8.347826086956522, "grad_norm": 1.5950675010681152, "learning_rate": 8.293650793650794e-06, "loss": 0.0588, "step": 2112 }, { "epoch": 8.351778656126482, "grad_norm": 1.2112717628479004, "learning_rate": 8.273809523809523e-06, "loss": 0.0397, "step": 2113 }, { "epoch": 8.355731225296443, "grad_norm": 1.6547131538391113, "learning_rate": 8.253968253968254e-06, "loss": 0.0609, "step": 2114 }, { "epoch": 8.359683794466402, "grad_norm": 1.6982663869857788, "learning_rate": 8.234126984126984e-06, "loss": 0.0554, "step": 2115 }, { "epoch": 8.363636363636363, "grad_norm": 1.5355052947998047, "learning_rate": 8.214285714285714e-06, "loss": 0.0663, "step": 2116 }, { "epoch": 8.367588932806324, "grad_norm": 0.8851954340934753, "learning_rate": 8.194444444444445e-06, "loss": 0.0346, "step": 2117 }, { "epoch": 8.371541501976285, "grad_norm": 1.6623241901397705, "learning_rate": 8.174603174603175e-06, "loss": 0.0488, "step": 2118 }, { "epoch": 8.375494071146244, "grad_norm": 0.9663978219032288, "learning_rate": 8.154761904761905e-06, "loss": 0.0372, "step": 2119 }, { "epoch": 8.379446640316205, "grad_norm": 1.2707703113555908, "learning_rate": 8.134920634920636e-06, "loss": 0.0439, "step": 2120 }, { "epoch": 8.383399209486166, "grad_norm": 2.081395149230957, "learning_rate": 8.115079365079366e-06, "loss": 0.0474, "step": 2121 }, { "epoch": 8.387351778656127, "grad_norm": 1.8119603395462036, "learning_rate": 8.095238095238097e-06, "loss": 0.0496, "step": 2122 }, { "epoch": 8.391304347826088, "grad_norm": 1.7686362266540527, "learning_rate": 8.075396825396827e-06, "loss": 0.0549, "step": 2123 }, { "epoch": 8.395256916996047, "grad_norm": 1.752198338508606, "learning_rate": 8.055555555555557e-06, "loss": 0.044, "step": 2124 }, { "epoch": 8.399209486166008, "grad_norm": 1.523292064666748, "learning_rate": 8.035714285714286e-06, "loss": 0.0491, "step": 2125 }, { "epoch": 8.403162055335969, "grad_norm": 1.8821699619293213, "learning_rate": 8.015873015873016e-06, "loss": 0.0473, "step": 2126 }, { "epoch": 8.40711462450593, "grad_norm": 1.762847661972046, "learning_rate": 7.996031746031745e-06, "loss": 0.0498, "step": 2127 }, { "epoch": 8.411067193675889, "grad_norm": 1.716994285583496, "learning_rate": 7.976190476190475e-06, "loss": 0.0536, "step": 2128 }, { "epoch": 8.41501976284585, "grad_norm": 1.1499348878860474, "learning_rate": 7.956349206349206e-06, "loss": 0.0351, "step": 2129 }, { "epoch": 8.41897233201581, "grad_norm": 0.8600573539733887, "learning_rate": 7.936507936507936e-06, "loss": 0.036, "step": 2130 }, { "epoch": 8.422924901185771, "grad_norm": 1.5126005411148071, "learning_rate": 7.916666666666667e-06, "loss": 0.0523, "step": 2131 }, { "epoch": 8.42687747035573, "grad_norm": 0.9862467050552368, "learning_rate": 7.896825396825397e-06, "loss": 0.0395, "step": 2132 }, { "epoch": 8.430830039525691, "grad_norm": 1.1598824262619019, "learning_rate": 7.876984126984127e-06, "loss": 0.0387, "step": 2133 }, { "epoch": 8.434782608695652, "grad_norm": 2.440631151199341, "learning_rate": 7.857142857142858e-06, "loss": 0.0462, "step": 2134 }, { "epoch": 8.438735177865613, "grad_norm": 1.5269445180892944, "learning_rate": 7.837301587301588e-06, "loss": 0.0576, "step": 2135 }, { "epoch": 8.442687747035572, "grad_norm": 1.6497447490692139, "learning_rate": 7.817460317460318e-06, "loss": 0.0695, "step": 2136 }, { "epoch": 8.446640316205533, "grad_norm": 2.240730047225952, "learning_rate": 7.797619047619049e-06, "loss": 0.0558, "step": 2137 }, { "epoch": 8.450592885375494, "grad_norm": 1.2452744245529175, "learning_rate": 7.777777777777777e-06, "loss": 0.0861, "step": 2138 }, { "epoch": 8.454545454545455, "grad_norm": 1.3848949670791626, "learning_rate": 7.757936507936508e-06, "loss": 0.0522, "step": 2139 }, { "epoch": 8.458498023715414, "grad_norm": 1.5077615976333618, "learning_rate": 7.738095238095238e-06, "loss": 0.051, "step": 2140 }, { "epoch": 8.462450592885375, "grad_norm": 1.707553505897522, "learning_rate": 7.718253968253969e-06, "loss": 0.0592, "step": 2141 }, { "epoch": 8.466403162055336, "grad_norm": 1.6871522665023804, "learning_rate": 7.698412698412699e-06, "loss": 0.0518, "step": 2142 }, { "epoch": 8.470355731225297, "grad_norm": 2.1698107719421387, "learning_rate": 7.67857142857143e-06, "loss": 0.0599, "step": 2143 }, { "epoch": 8.474308300395258, "grad_norm": 1.105559229850769, "learning_rate": 7.65873015873016e-06, "loss": 0.0368, "step": 2144 }, { "epoch": 8.478260869565217, "grad_norm": 1.6224021911621094, "learning_rate": 7.63888888888889e-06, "loss": 0.0513, "step": 2145 }, { "epoch": 8.482213438735178, "grad_norm": 1.7027612924575806, "learning_rate": 7.6190476190476205e-06, "loss": 0.0502, "step": 2146 }, { "epoch": 8.486166007905139, "grad_norm": 1.5517146587371826, "learning_rate": 7.599206349206349e-06, "loss": 0.0515, "step": 2147 }, { "epoch": 8.4901185770751, "grad_norm": 1.5171629190444946, "learning_rate": 7.5793650793650795e-06, "loss": 0.0509, "step": 2148 }, { "epoch": 8.494071146245059, "grad_norm": 1.476023554801941, "learning_rate": 7.559523809523809e-06, "loss": 0.0498, "step": 2149 }, { "epoch": 8.49802371541502, "grad_norm": 1.404647707939148, "learning_rate": 7.5396825396825394e-06, "loss": 0.0449, "step": 2150 }, { "epoch": 8.50197628458498, "grad_norm": 1.2494837045669556, "learning_rate": 7.51984126984127e-06, "loss": 0.0503, "step": 2151 }, { "epoch": 8.505928853754941, "grad_norm": 1.9429749250411987, "learning_rate": 7.5e-06, "loss": 0.0454, "step": 2152 }, { "epoch": 8.5098814229249, "grad_norm": 1.3501893281936646, "learning_rate": 7.4801587301587306e-06, "loss": 0.0435, "step": 2153 }, { "epoch": 8.513833992094861, "grad_norm": 1.3887948989868164, "learning_rate": 7.460317460317461e-06, "loss": 0.0479, "step": 2154 }, { "epoch": 8.517786561264822, "grad_norm": 1.7705752849578857, "learning_rate": 7.4404761904761905e-06, "loss": 0.0579, "step": 2155 }, { "epoch": 8.521739130434783, "grad_norm": 1.6018643379211426, "learning_rate": 7.420634920634921e-06, "loss": 0.0488, "step": 2156 }, { "epoch": 8.525691699604742, "grad_norm": 1.2142655849456787, "learning_rate": 7.400793650793651e-06, "loss": 0.0474, "step": 2157 }, { "epoch": 8.529644268774703, "grad_norm": 1.2533504962921143, "learning_rate": 7.380952380952382e-06, "loss": 0.0428, "step": 2158 }, { "epoch": 8.533596837944664, "grad_norm": 1.5056265592575073, "learning_rate": 7.361111111111112e-06, "loss": 0.0412, "step": 2159 }, { "epoch": 8.537549407114625, "grad_norm": 1.0978323221206665, "learning_rate": 7.3412698412698415e-06, "loss": 0.0429, "step": 2160 }, { "epoch": 8.541501976284586, "grad_norm": 1.600511908531189, "learning_rate": 7.321428571428572e-06, "loss": 0.0506, "step": 2161 }, { "epoch": 8.545454545454545, "grad_norm": 1.2966439723968506, "learning_rate": 7.301587301587302e-06, "loss": 0.0445, "step": 2162 }, { "epoch": 8.549407114624506, "grad_norm": 1.1138356924057007, "learning_rate": 7.281746031746033e-06, "loss": 0.0362, "step": 2163 }, { "epoch": 8.553359683794467, "grad_norm": 1.2294458150863647, "learning_rate": 7.261904761904763e-06, "loss": 0.0418, "step": 2164 }, { "epoch": 8.557312252964428, "grad_norm": 2.5378119945526123, "learning_rate": 7.242063492063493e-06, "loss": 0.0703, "step": 2165 }, { "epoch": 8.561264822134387, "grad_norm": 1.068594217300415, "learning_rate": 7.222222222222222e-06, "loss": 0.0407, "step": 2166 }, { "epoch": 8.565217391304348, "grad_norm": 1.1735312938690186, "learning_rate": 7.2023809523809524e-06, "loss": 0.0435, "step": 2167 }, { "epoch": 8.569169960474309, "grad_norm": 2.6890134811401367, "learning_rate": 7.182539682539682e-06, "loss": 0.0897, "step": 2168 }, { "epoch": 8.57312252964427, "grad_norm": 1.2781907320022583, "learning_rate": 7.162698412698412e-06, "loss": 0.0517, "step": 2169 }, { "epoch": 8.577075098814229, "grad_norm": 1.8715581893920898, "learning_rate": 7.142857142857143e-06, "loss": 0.0598, "step": 2170 }, { "epoch": 8.58102766798419, "grad_norm": 2.0492424964904785, "learning_rate": 7.123015873015873e-06, "loss": 0.0498, "step": 2171 }, { "epoch": 8.58498023715415, "grad_norm": 2.0802016258239746, "learning_rate": 7.1031746031746035e-06, "loss": 0.0543, "step": 2172 }, { "epoch": 8.588932806324111, "grad_norm": 1.2858587503433228, "learning_rate": 7.083333333333334e-06, "loss": 0.0445, "step": 2173 }, { "epoch": 8.59288537549407, "grad_norm": 1.2407563924789429, "learning_rate": 7.063492063492063e-06, "loss": 0.0393, "step": 2174 }, { "epoch": 8.596837944664031, "grad_norm": 1.4286069869995117, "learning_rate": 7.043650793650794e-06, "loss": 0.0472, "step": 2175 }, { "epoch": 8.600790513833992, "grad_norm": 2.2047770023345947, "learning_rate": 7.023809523809524e-06, "loss": 0.0527, "step": 2176 }, { "epoch": 8.604743083003953, "grad_norm": 1.2905137538909912, "learning_rate": 7.0039682539682545e-06, "loss": 0.0412, "step": 2177 }, { "epoch": 8.608695652173914, "grad_norm": 2.2183566093444824, "learning_rate": 6.984126984126985e-06, "loss": 0.0679, "step": 2178 }, { "epoch": 8.612648221343873, "grad_norm": 1.4202773571014404, "learning_rate": 6.964285714285715e-06, "loss": 0.0462, "step": 2179 }, { "epoch": 8.616600790513834, "grad_norm": 1.6715538501739502, "learning_rate": 6.944444444444445e-06, "loss": 0.0637, "step": 2180 }, { "epoch": 8.620553359683795, "grad_norm": 1.480151891708374, "learning_rate": 6.924603174603175e-06, "loss": 0.0496, "step": 2181 }, { "epoch": 8.624505928853754, "grad_norm": 1.6258138418197632, "learning_rate": 6.9047619047619055e-06, "loss": 0.0608, "step": 2182 }, { "epoch": 8.628458498023715, "grad_norm": 1.6677876710891724, "learning_rate": 6.884920634920636e-06, "loss": 0.0547, "step": 2183 }, { "epoch": 8.632411067193676, "grad_norm": 1.3751989603042603, "learning_rate": 6.865079365079366e-06, "loss": 0.0441, "step": 2184 }, { "epoch": 8.636363636363637, "grad_norm": 1.4697294235229492, "learning_rate": 6.845238095238096e-06, "loss": 0.0485, "step": 2185 }, { "epoch": 8.640316205533598, "grad_norm": 1.547059178352356, "learning_rate": 6.825396825396825e-06, "loss": 0.0515, "step": 2186 }, { "epoch": 8.644268774703557, "grad_norm": 1.3061259984970093, "learning_rate": 6.805555555555556e-06, "loss": 0.0428, "step": 2187 }, { "epoch": 8.648221343873518, "grad_norm": 1.2875500917434692, "learning_rate": 6.785714285714285e-06, "loss": 0.0422, "step": 2188 }, { "epoch": 8.652173913043478, "grad_norm": 1.4154988527297974, "learning_rate": 6.765873015873016e-06, "loss": 0.051, "step": 2189 }, { "epoch": 8.65612648221344, "grad_norm": 1.7555774450302124, "learning_rate": 6.746031746031746e-06, "loss": 0.0484, "step": 2190 }, { "epoch": 8.660079051383399, "grad_norm": 1.8239620923995972, "learning_rate": 6.726190476190476e-06, "loss": 0.0498, "step": 2191 }, { "epoch": 8.66403162055336, "grad_norm": 1.1520577669143677, "learning_rate": 6.706349206349207e-06, "loss": 0.042, "step": 2192 }, { "epoch": 8.66798418972332, "grad_norm": 1.5023807287216187, "learning_rate": 6.686507936507936e-06, "loss": 0.0578, "step": 2193 }, { "epoch": 8.671936758893281, "grad_norm": 2.02333927154541, "learning_rate": 6.666666666666667e-06, "loss": 0.05, "step": 2194 }, { "epoch": 8.67588932806324, "grad_norm": 1.4692609310150146, "learning_rate": 6.646825396825397e-06, "loss": 0.0454, "step": 2195 }, { "epoch": 8.679841897233201, "grad_norm": 1.0825679302215576, "learning_rate": 6.626984126984127e-06, "loss": 0.0362, "step": 2196 }, { "epoch": 8.683794466403162, "grad_norm": 1.203744649887085, "learning_rate": 6.607142857142858e-06, "loss": 0.0464, "step": 2197 }, { "epoch": 8.687747035573123, "grad_norm": 2.339820623397827, "learning_rate": 6.587301587301588e-06, "loss": 0.0745, "step": 2198 }, { "epoch": 8.691699604743082, "grad_norm": 1.2836312055587769, "learning_rate": 6.567460317460318e-06, "loss": 0.0443, "step": 2199 }, { "epoch": 8.695652173913043, "grad_norm": 2.841017723083496, "learning_rate": 6.547619047619048e-06, "loss": 0.0571, "step": 2200 }, { "epoch": 8.699604743083004, "grad_norm": 1.3378969430923462, "learning_rate": 6.5277777777777784e-06, "loss": 0.0398, "step": 2201 }, { "epoch": 8.703557312252965, "grad_norm": 3.261378526687622, "learning_rate": 6.507936507936509e-06, "loss": 0.044, "step": 2202 }, { "epoch": 8.707509881422926, "grad_norm": 1.5358128547668457, "learning_rate": 6.488095238095239e-06, "loss": 0.0509, "step": 2203 }, { "epoch": 8.711462450592885, "grad_norm": 1.134779930114746, "learning_rate": 6.4682539682539696e-06, "loss": 0.0338, "step": 2204 }, { "epoch": 8.715415019762846, "grad_norm": 1.4963998794555664, "learning_rate": 6.448412698412699e-06, "loss": 0.0444, "step": 2205 }, { "epoch": 8.719367588932807, "grad_norm": 1.054849624633789, "learning_rate": 6.428571428571429e-06, "loss": 0.0365, "step": 2206 }, { "epoch": 8.723320158102768, "grad_norm": 1.3599568605422974, "learning_rate": 6.408730158730158e-06, "loss": 0.0443, "step": 2207 }, { "epoch": 8.727272727272727, "grad_norm": 2.381058931350708, "learning_rate": 6.3888888888888885e-06, "loss": 0.0691, "step": 2208 }, { "epoch": 8.731225296442688, "grad_norm": 1.278258204460144, "learning_rate": 6.369047619047619e-06, "loss": 0.051, "step": 2209 }, { "epoch": 8.735177865612648, "grad_norm": 1.1571305990219116, "learning_rate": 6.349206349206349e-06, "loss": 0.0556, "step": 2210 }, { "epoch": 8.73913043478261, "grad_norm": 1.4655886888504028, "learning_rate": 6.32936507936508e-06, "loss": 0.0484, "step": 2211 }, { "epoch": 8.743083003952568, "grad_norm": 1.1832677125930786, "learning_rate": 6.30952380952381e-06, "loss": 0.0396, "step": 2212 }, { "epoch": 8.74703557312253, "grad_norm": 1.5218229293823242, "learning_rate": 6.2896825396825395e-06, "loss": 0.0535, "step": 2213 }, { "epoch": 8.75098814229249, "grad_norm": 1.2771645784378052, "learning_rate": 6.26984126984127e-06, "loss": 0.0454, "step": 2214 }, { "epoch": 8.754940711462451, "grad_norm": 1.9904732704162598, "learning_rate": 6.25e-06, "loss": 0.0582, "step": 2215 }, { "epoch": 8.75889328063241, "grad_norm": 1.5673877000808716, "learning_rate": 6.230158730158731e-06, "loss": 0.0481, "step": 2216 }, { "epoch": 8.762845849802371, "grad_norm": 1.2909679412841797, "learning_rate": 6.210317460317461e-06, "loss": 0.0427, "step": 2217 }, { "epoch": 8.766798418972332, "grad_norm": 1.0783694982528687, "learning_rate": 6.190476190476191e-06, "loss": 0.0379, "step": 2218 }, { "epoch": 8.770750988142293, "grad_norm": 1.3569505214691162, "learning_rate": 6.170634920634921e-06, "loss": 0.0438, "step": 2219 }, { "epoch": 8.774703557312254, "grad_norm": 1.4521833658218384, "learning_rate": 6.1507936507936505e-06, "loss": 0.0533, "step": 2220 }, { "epoch": 8.778656126482213, "grad_norm": 1.4218260049819946, "learning_rate": 6.130952380952381e-06, "loss": 0.0499, "step": 2221 }, { "epoch": 8.782608695652174, "grad_norm": 1.4923803806304932, "learning_rate": 6.111111111111111e-06, "loss": 0.066, "step": 2222 }, { "epoch": 8.786561264822135, "grad_norm": 1.5490056276321411, "learning_rate": 6.091269841269842e-06, "loss": 0.0537, "step": 2223 }, { "epoch": 8.790513833992096, "grad_norm": 2.071575164794922, "learning_rate": 6.071428571428572e-06, "loss": 0.0601, "step": 2224 }, { "epoch": 8.794466403162055, "grad_norm": 1.5150130987167358, "learning_rate": 6.0515873015873015e-06, "loss": 0.0486, "step": 2225 }, { "epoch": 8.798418972332016, "grad_norm": 1.5356022119522095, "learning_rate": 6.031746031746032e-06, "loss": 0.0446, "step": 2226 }, { "epoch": 8.802371541501977, "grad_norm": 1.9417153596878052, "learning_rate": 6.011904761904762e-06, "loss": 0.0542, "step": 2227 }, { "epoch": 8.806324110671937, "grad_norm": 1.9926491975784302, "learning_rate": 5.992063492063493e-06, "loss": 0.0664, "step": 2228 }, { "epoch": 8.810276679841897, "grad_norm": 1.1277952194213867, "learning_rate": 5.972222222222223e-06, "loss": 0.0478, "step": 2229 }, { "epoch": 8.814229249011857, "grad_norm": 1.2221788167953491, "learning_rate": 5.9523809523809525e-06, "loss": 0.0366, "step": 2230 }, { "epoch": 8.818181818181818, "grad_norm": 1.5764946937561035, "learning_rate": 5.932539682539683e-06, "loss": 0.0477, "step": 2231 }, { "epoch": 8.82213438735178, "grad_norm": 1.4645206928253174, "learning_rate": 5.9126984126984124e-06, "loss": 0.0418, "step": 2232 }, { "epoch": 8.826086956521738, "grad_norm": 1.9131215810775757, "learning_rate": 5.892857142857143e-06, "loss": 0.0689, "step": 2233 }, { "epoch": 8.8300395256917, "grad_norm": 1.1039915084838867, "learning_rate": 5.873015873015873e-06, "loss": 0.035, "step": 2234 }, { "epoch": 8.83399209486166, "grad_norm": 1.4914485216140747, "learning_rate": 5.8531746031746036e-06, "loss": 0.0451, "step": 2235 }, { "epoch": 8.837944664031621, "grad_norm": 2.195981502532959, "learning_rate": 5.833333333333334e-06, "loss": 0.0672, "step": 2236 }, { "epoch": 8.841897233201582, "grad_norm": 1.5078537464141846, "learning_rate": 5.813492063492064e-06, "loss": 0.0416, "step": 2237 }, { "epoch": 8.845849802371541, "grad_norm": 1.2279962301254272, "learning_rate": 5.793650793650794e-06, "loss": 0.0414, "step": 2238 }, { "epoch": 8.849802371541502, "grad_norm": 1.8902143239974976, "learning_rate": 5.773809523809524e-06, "loss": 0.0677, "step": 2239 }, { "epoch": 8.853754940711463, "grad_norm": 0.9590296745300293, "learning_rate": 5.753968253968254e-06, "loss": 0.0366, "step": 2240 }, { "epoch": 8.857707509881424, "grad_norm": 1.2233220338821411, "learning_rate": 5.734126984126984e-06, "loss": 0.0498, "step": 2241 }, { "epoch": 8.861660079051383, "grad_norm": 1.215208888053894, "learning_rate": 5.7142857142857145e-06, "loss": 0.049, "step": 2242 }, { "epoch": 8.865612648221344, "grad_norm": 1.1311190128326416, "learning_rate": 5.694444444444445e-06, "loss": 0.0442, "step": 2243 }, { "epoch": 8.869565217391305, "grad_norm": 1.8418809175491333, "learning_rate": 5.674603174603174e-06, "loss": 0.0535, "step": 2244 }, { "epoch": 8.873517786561266, "grad_norm": 1.3625643253326416, "learning_rate": 5.654761904761905e-06, "loss": 0.0461, "step": 2245 }, { "epoch": 8.877470355731225, "grad_norm": 1.0108672380447388, "learning_rate": 5.634920634920635e-06, "loss": 0.0362, "step": 2246 }, { "epoch": 8.881422924901186, "grad_norm": 1.096070647239685, "learning_rate": 5.6150793650793655e-06, "loss": 0.0467, "step": 2247 }, { "epoch": 8.885375494071146, "grad_norm": 1.4030896425247192, "learning_rate": 5.595238095238096e-06, "loss": 0.0459, "step": 2248 }, { "epoch": 8.889328063241107, "grad_norm": 1.9264237880706787, "learning_rate": 5.575396825396826e-06, "loss": 0.0593, "step": 2249 }, { "epoch": 8.893280632411066, "grad_norm": 1.8848391771316528, "learning_rate": 5.555555555555556e-06, "loss": 0.0654, "step": 2250 }, { "epoch": 8.897233201581027, "grad_norm": 1.1368132829666138, "learning_rate": 5.535714285714285e-06, "loss": 0.0356, "step": 2251 }, { "epoch": 8.901185770750988, "grad_norm": 1.4050102233886719, "learning_rate": 5.515873015873016e-06, "loss": 0.0568, "step": 2252 }, { "epoch": 8.90513833992095, "grad_norm": 1.6391522884368896, "learning_rate": 5.496031746031746e-06, "loss": 0.0512, "step": 2253 }, { "epoch": 8.909090909090908, "grad_norm": 1.2339924573898315, "learning_rate": 5.4761904761904765e-06, "loss": 0.0423, "step": 2254 }, { "epoch": 8.91304347826087, "grad_norm": 1.334693431854248, "learning_rate": 5.456349206349207e-06, "loss": 0.043, "step": 2255 }, { "epoch": 8.91699604743083, "grad_norm": 1.6208196878433228, "learning_rate": 5.436507936507937e-06, "loss": 0.0515, "step": 2256 }, { "epoch": 8.920948616600791, "grad_norm": 1.959010124206543, "learning_rate": 5.416666666666667e-06, "loss": 0.0591, "step": 2257 }, { "epoch": 8.92490118577075, "grad_norm": 1.2464101314544678, "learning_rate": 5.396825396825397e-06, "loss": 0.0444, "step": 2258 }, { "epoch": 8.928853754940711, "grad_norm": 1.113886833190918, "learning_rate": 5.3769841269841275e-06, "loss": 0.045, "step": 2259 }, { "epoch": 8.932806324110672, "grad_norm": 1.9017833471298218, "learning_rate": 5.357142857142857e-06, "loss": 0.0586, "step": 2260 }, { "epoch": 8.936758893280633, "grad_norm": 1.6921342611312866, "learning_rate": 5.337301587301587e-06, "loss": 0.0678, "step": 2261 }, { "epoch": 8.940711462450594, "grad_norm": 1.1940516233444214, "learning_rate": 5.317460317460318e-06, "loss": 0.0476, "step": 2262 }, { "epoch": 8.944664031620553, "grad_norm": 1.899581789970398, "learning_rate": 5.297619047619048e-06, "loss": 0.0688, "step": 2263 }, { "epoch": 8.948616600790514, "grad_norm": 1.1794816255569458, "learning_rate": 5.277777777777778e-06, "loss": 0.0459, "step": 2264 }, { "epoch": 8.952569169960475, "grad_norm": 1.233652949333191, "learning_rate": 5.257936507936508e-06, "loss": 0.0436, "step": 2265 }, { "epoch": 8.956521739130435, "grad_norm": 1.3263018131256104, "learning_rate": 5.2380952380952384e-06, "loss": 0.046, "step": 2266 }, { "epoch": 8.960474308300395, "grad_norm": 1.1982510089874268, "learning_rate": 5.218253968253969e-06, "loss": 0.0414, "step": 2267 }, { "epoch": 8.964426877470355, "grad_norm": 2.1782376766204834, "learning_rate": 5.198412698412699e-06, "loss": 0.075, "step": 2268 }, { "epoch": 8.968379446640316, "grad_norm": 1.3456032276153564, "learning_rate": 5.1785714285714296e-06, "loss": 0.0451, "step": 2269 }, { "epoch": 8.972332015810277, "grad_norm": 1.7961373329162598, "learning_rate": 5.158730158730159e-06, "loss": 0.0464, "step": 2270 }, { "epoch": 8.976284584980236, "grad_norm": 1.0362279415130615, "learning_rate": 5.138888888888889e-06, "loss": 0.0374, "step": 2271 }, { "epoch": 8.980237154150197, "grad_norm": 1.464831829071045, "learning_rate": 5.119047619047619e-06, "loss": 0.0419, "step": 2272 }, { "epoch": 8.984189723320158, "grad_norm": 1.7607347965240479, "learning_rate": 5.099206349206349e-06, "loss": 0.053, "step": 2273 }, { "epoch": 8.988142292490119, "grad_norm": 1.2373989820480347, "learning_rate": 5.07936507936508e-06, "loss": 0.05, "step": 2274 }, { "epoch": 8.992094861660078, "grad_norm": 1.8209704160690308, "learning_rate": 5.05952380952381e-06, "loss": 0.0494, "step": 2275 }, { "epoch": 8.996047430830039, "grad_norm": 1.3406612873077393, "learning_rate": 5.03968253968254e-06, "loss": 0.0461, "step": 2276 }, { "epoch": 9.0, "grad_norm": 1.2392350435256958, "learning_rate": 5.01984126984127e-06, "loss": 0.0467, "step": 2277 }, { "epoch": 9.003952569169961, "grad_norm": 0.9584967494010925, "learning_rate": 5e-06, "loss": 0.0403, "step": 2278 }, { "epoch": 9.007905138339922, "grad_norm": 0.5832052826881409, "learning_rate": 4.98015873015873e-06, "loss": 0.0321, "step": 2279 }, { "epoch": 9.011857707509881, "grad_norm": 1.2194710969924927, "learning_rate": 4.96031746031746e-06, "loss": 0.0532, "step": 2280 }, { "epoch": 9.015810276679842, "grad_norm": 0.5771675705909729, "learning_rate": 4.940476190476191e-06, "loss": 0.028, "step": 2281 }, { "epoch": 9.019762845849803, "grad_norm": 1.116558313369751, "learning_rate": 4.920634920634921e-06, "loss": 0.0355, "step": 2282 }, { "epoch": 9.023715415019764, "grad_norm": 0.4172620475292206, "learning_rate": 4.900793650793651e-06, "loss": 0.0239, "step": 2283 }, { "epoch": 9.027667984189723, "grad_norm": 0.6424721479415894, "learning_rate": 4.880952380952381e-06, "loss": 0.027, "step": 2284 }, { "epoch": 9.031620553359684, "grad_norm": 1.7028534412384033, "learning_rate": 4.861111111111111e-06, "loss": 0.0455, "step": 2285 }, { "epoch": 9.035573122529645, "grad_norm": 0.9933211803436279, "learning_rate": 4.841269841269842e-06, "loss": 0.033, "step": 2286 }, { "epoch": 9.039525691699605, "grad_norm": 0.9646832942962646, "learning_rate": 4.821428571428572e-06, "loss": 0.0386, "step": 2287 }, { "epoch": 9.043478260869565, "grad_norm": 0.6727697253227234, "learning_rate": 4.8015873015873025e-06, "loss": 0.0352, "step": 2288 }, { "epoch": 9.047430830039525, "grad_norm": 0.7543931603431702, "learning_rate": 4.781746031746032e-06, "loss": 0.0295, "step": 2289 }, { "epoch": 9.051383399209486, "grad_norm": 0.46685945987701416, "learning_rate": 4.7619047619047615e-06, "loss": 0.0243, "step": 2290 }, { "epoch": 9.055335968379447, "grad_norm": 0.4817000925540924, "learning_rate": 4.742063492063492e-06, "loss": 0.0292, "step": 2291 }, { "epoch": 9.059288537549406, "grad_norm": 0.8889188170433044, "learning_rate": 4.722222222222222e-06, "loss": 0.0369, "step": 2292 }, { "epoch": 9.063241106719367, "grad_norm": 0.5639324188232422, "learning_rate": 4.702380952380953e-06, "loss": 0.0307, "step": 2293 }, { "epoch": 9.067193675889328, "grad_norm": 0.6505725383758545, "learning_rate": 4.682539682539683e-06, "loss": 0.0313, "step": 2294 }, { "epoch": 9.071146245059289, "grad_norm": 0.9556856751441956, "learning_rate": 4.662698412698413e-06, "loss": 0.0463, "step": 2295 }, { "epoch": 9.075098814229248, "grad_norm": 0.49648115038871765, "learning_rate": 4.642857142857143e-06, "loss": 0.0246, "step": 2296 }, { "epoch": 9.079051383399209, "grad_norm": 0.877655029296875, "learning_rate": 4.623015873015873e-06, "loss": 0.0309, "step": 2297 }, { "epoch": 9.08300395256917, "grad_norm": 1.1071048974990845, "learning_rate": 4.603174603174604e-06, "loss": 0.0437, "step": 2298 }, { "epoch": 9.08695652173913, "grad_norm": 1.238307237625122, "learning_rate": 4.583333333333333e-06, "loss": 0.0346, "step": 2299 }, { "epoch": 9.090909090909092, "grad_norm": 0.9343061447143555, "learning_rate": 4.563492063492064e-06, "loss": 0.0306, "step": 2300 }, { "epoch": 9.09486166007905, "grad_norm": 1.3237608671188354, "learning_rate": 4.543650793650794e-06, "loss": 0.0405, "step": 2301 }, { "epoch": 9.098814229249012, "grad_norm": 1.2943129539489746, "learning_rate": 4.5238095238095235e-06, "loss": 0.0414, "step": 2302 }, { "epoch": 9.102766798418973, "grad_norm": 0.4701171815395355, "learning_rate": 4.503968253968254e-06, "loss": 0.0275, "step": 2303 }, { "epoch": 9.106719367588934, "grad_norm": 1.5329245328903198, "learning_rate": 4.484126984126984e-06, "loss": 0.0412, "step": 2304 }, { "epoch": 9.110671936758893, "grad_norm": 1.2468199729919434, "learning_rate": 4.464285714285715e-06, "loss": 0.0361, "step": 2305 }, { "epoch": 9.114624505928854, "grad_norm": 0.4569074511528015, "learning_rate": 4.444444444444445e-06, "loss": 0.0251, "step": 2306 }, { "epoch": 9.118577075098814, "grad_norm": 0.7745764255523682, "learning_rate": 4.424603174603175e-06, "loss": 0.033, "step": 2307 }, { "epoch": 9.122529644268775, "grad_norm": 0.8108665943145752, "learning_rate": 4.404761904761905e-06, "loss": 0.0239, "step": 2308 }, { "epoch": 9.126482213438734, "grad_norm": 0.8088359832763672, "learning_rate": 4.3849206349206344e-06, "loss": 0.0303, "step": 2309 }, { "epoch": 9.130434782608695, "grad_norm": 0.9375463724136353, "learning_rate": 4.365079365079365e-06, "loss": 0.0593, "step": 2310 }, { "epoch": 9.134387351778656, "grad_norm": 1.1133482456207275, "learning_rate": 4.345238095238095e-06, "loss": 0.0357, "step": 2311 }, { "epoch": 9.138339920948617, "grad_norm": 0.5319989919662476, "learning_rate": 4.3253968253968256e-06, "loss": 0.029, "step": 2312 }, { "epoch": 9.142292490118576, "grad_norm": 0.9559447169303894, "learning_rate": 4.305555555555556e-06, "loss": 0.0377, "step": 2313 }, { "epoch": 9.146245059288537, "grad_norm": 1.1195396184921265, "learning_rate": 4.285714285714286e-06, "loss": 0.0254, "step": 2314 }, { "epoch": 9.150197628458498, "grad_norm": 0.6906282901763916, "learning_rate": 4.265873015873016e-06, "loss": 0.0417, "step": 2315 }, { "epoch": 9.154150197628459, "grad_norm": 0.730162501335144, "learning_rate": 4.246031746031746e-06, "loss": 0.0304, "step": 2316 }, { "epoch": 9.15810276679842, "grad_norm": 0.8155921697616577, "learning_rate": 4.226190476190477e-06, "loss": 0.0334, "step": 2317 }, { "epoch": 9.162055335968379, "grad_norm": 1.145893931388855, "learning_rate": 4.206349206349207e-06, "loss": 0.0472, "step": 2318 }, { "epoch": 9.16600790513834, "grad_norm": 0.6284162998199463, "learning_rate": 4.1865079365079365e-06, "loss": 0.0302, "step": 2319 }, { "epoch": 9.1699604743083, "grad_norm": 3.132331371307373, "learning_rate": 4.166666666666667e-06, "loss": 0.0671, "step": 2320 }, { "epoch": 9.173913043478262, "grad_norm": 0.725439190864563, "learning_rate": 4.146825396825397e-06, "loss": 0.029, "step": 2321 }, { "epoch": 9.17786561264822, "grad_norm": 0.6807708144187927, "learning_rate": 4.126984126984127e-06, "loss": 0.0265, "step": 2322 }, { "epoch": 9.181818181818182, "grad_norm": 1.1239207983016968, "learning_rate": 4.107142857142857e-06, "loss": 0.0428, "step": 2323 }, { "epoch": 9.185770750988143, "grad_norm": 0.6186617612838745, "learning_rate": 4.0873015873015875e-06, "loss": 0.026, "step": 2324 }, { "epoch": 9.189723320158103, "grad_norm": 0.5834595561027527, "learning_rate": 4.067460317460318e-06, "loss": 0.0281, "step": 2325 }, { "epoch": 9.193675889328063, "grad_norm": 0.7775494456291199, "learning_rate": 4.047619047619048e-06, "loss": 0.027, "step": 2326 }, { "epoch": 9.197628458498023, "grad_norm": 0.5415392518043518, "learning_rate": 4.027777777777779e-06, "loss": 0.0291, "step": 2327 }, { "epoch": 9.201581027667984, "grad_norm": 1.0766407251358032, "learning_rate": 4.007936507936508e-06, "loss": 0.0347, "step": 2328 }, { "epoch": 9.205533596837945, "grad_norm": 0.705011785030365, "learning_rate": 3.988095238095238e-06, "loss": 0.0274, "step": 2329 }, { "epoch": 9.209486166007904, "grad_norm": 1.0642330646514893, "learning_rate": 3.968253968253968e-06, "loss": 0.0432, "step": 2330 }, { "epoch": 9.213438735177865, "grad_norm": 0.8075931072235107, "learning_rate": 3.9484126984126985e-06, "loss": 0.031, "step": 2331 }, { "epoch": 9.217391304347826, "grad_norm": 0.5113716721534729, "learning_rate": 3.928571428571429e-06, "loss": 0.0224, "step": 2332 }, { "epoch": 9.221343873517787, "grad_norm": 0.9608832597732544, "learning_rate": 3.908730158730159e-06, "loss": 0.0319, "step": 2333 }, { "epoch": 9.225296442687746, "grad_norm": 1.2569172382354736, "learning_rate": 3.888888888888889e-06, "loss": 0.0328, "step": 2334 }, { "epoch": 9.229249011857707, "grad_norm": 0.7299090027809143, "learning_rate": 3.869047619047619e-06, "loss": 0.0258, "step": 2335 }, { "epoch": 9.233201581027668, "grad_norm": 0.5903069972991943, "learning_rate": 3.8492063492063495e-06, "loss": 0.0313, "step": 2336 }, { "epoch": 9.237154150197629, "grad_norm": 0.715298056602478, "learning_rate": 3.82936507936508e-06, "loss": 0.042, "step": 2337 }, { "epoch": 9.24110671936759, "grad_norm": 0.6976611018180847, "learning_rate": 3.8095238095238102e-06, "loss": 0.0399, "step": 2338 }, { "epoch": 9.245059288537549, "grad_norm": 0.9303411245346069, "learning_rate": 3.7896825396825398e-06, "loss": 0.0319, "step": 2339 }, { "epoch": 9.24901185770751, "grad_norm": 0.706676721572876, "learning_rate": 3.7698412698412697e-06, "loss": 0.0332, "step": 2340 }, { "epoch": 9.25296442687747, "grad_norm": 0.553431510925293, "learning_rate": 3.75e-06, "loss": 0.0339, "step": 2341 }, { "epoch": 9.256916996047432, "grad_norm": 1.1647964715957642, "learning_rate": 3.7301587301587305e-06, "loss": 0.0277, "step": 2342 }, { "epoch": 9.26086956521739, "grad_norm": 0.5181522369384766, "learning_rate": 3.7103174603174604e-06, "loss": 0.0231, "step": 2343 }, { "epoch": 9.264822134387352, "grad_norm": 1.1019365787506104, "learning_rate": 3.690476190476191e-06, "loss": 0.0654, "step": 2344 }, { "epoch": 9.268774703557312, "grad_norm": 0.6852990984916687, "learning_rate": 3.6706349206349208e-06, "loss": 0.0254, "step": 2345 }, { "epoch": 9.272727272727273, "grad_norm": 0.6852630972862244, "learning_rate": 3.650793650793651e-06, "loss": 0.028, "step": 2346 }, { "epoch": 9.276679841897232, "grad_norm": 0.79278165102005, "learning_rate": 3.6309523809523815e-06, "loss": 0.0321, "step": 2347 }, { "epoch": 9.280632411067193, "grad_norm": 1.3321659564971924, "learning_rate": 3.611111111111111e-06, "loss": 0.0305, "step": 2348 }, { "epoch": 9.284584980237154, "grad_norm": 1.0156902074813843, "learning_rate": 3.591269841269841e-06, "loss": 0.0331, "step": 2349 }, { "epoch": 9.288537549407115, "grad_norm": 0.9449909329414368, "learning_rate": 3.5714285714285714e-06, "loss": 0.0291, "step": 2350 }, { "epoch": 9.292490118577074, "grad_norm": 0.6974058747291565, "learning_rate": 3.5515873015873017e-06, "loss": 0.0369, "step": 2351 }, { "epoch": 9.296442687747035, "grad_norm": 0.6614970564842224, "learning_rate": 3.5317460317460317e-06, "loss": 0.0245, "step": 2352 }, { "epoch": 9.300395256916996, "grad_norm": 1.0674023628234863, "learning_rate": 3.511904761904762e-06, "loss": 0.0361, "step": 2353 }, { "epoch": 9.304347826086957, "grad_norm": 0.8966097831726074, "learning_rate": 3.4920634920634924e-06, "loss": 0.0399, "step": 2354 }, { "epoch": 9.308300395256918, "grad_norm": 0.8403730988502502, "learning_rate": 3.4722222222222224e-06, "loss": 0.0403, "step": 2355 }, { "epoch": 9.312252964426877, "grad_norm": 0.9505050182342529, "learning_rate": 3.4523809523809528e-06, "loss": 0.0277, "step": 2356 }, { "epoch": 9.316205533596838, "grad_norm": 0.5641964673995972, "learning_rate": 3.432539682539683e-06, "loss": 0.0316, "step": 2357 }, { "epoch": 9.320158102766799, "grad_norm": 0.9463332891464233, "learning_rate": 3.4126984126984127e-06, "loss": 0.0395, "step": 2358 }, { "epoch": 9.32411067193676, "grad_norm": 1.2878776788711548, "learning_rate": 3.3928571428571426e-06, "loss": 0.0397, "step": 2359 }, { "epoch": 9.328063241106719, "grad_norm": 1.2284893989562988, "learning_rate": 3.373015873015873e-06, "loss": 0.0421, "step": 2360 }, { "epoch": 9.33201581027668, "grad_norm": 0.9104984402656555, "learning_rate": 3.3531746031746034e-06, "loss": 0.0384, "step": 2361 }, { "epoch": 9.33596837944664, "grad_norm": 0.7004300951957703, "learning_rate": 3.3333333333333333e-06, "loss": 0.0506, "step": 2362 }, { "epoch": 9.339920948616601, "grad_norm": 0.9767426252365112, "learning_rate": 3.3134920634920637e-06, "loss": 0.0415, "step": 2363 }, { "epoch": 9.34387351778656, "grad_norm": 0.7821481227874756, "learning_rate": 3.293650793650794e-06, "loss": 0.031, "step": 2364 }, { "epoch": 9.347826086956522, "grad_norm": 0.9590467810630798, "learning_rate": 3.273809523809524e-06, "loss": 0.0393, "step": 2365 }, { "epoch": 9.351778656126482, "grad_norm": 0.6908400654792786, "learning_rate": 3.2539682539682544e-06, "loss": 0.0405, "step": 2366 }, { "epoch": 9.355731225296443, "grad_norm": 0.8684214949607849, "learning_rate": 3.2341269841269848e-06, "loss": 0.0313, "step": 2367 }, { "epoch": 9.359683794466402, "grad_norm": 0.5750323534011841, "learning_rate": 3.2142857142857143e-06, "loss": 0.0323, "step": 2368 }, { "epoch": 9.363636363636363, "grad_norm": 0.7478132247924805, "learning_rate": 3.1944444444444443e-06, "loss": 0.0299, "step": 2369 }, { "epoch": 9.367588932806324, "grad_norm": 0.4561312794685364, "learning_rate": 3.1746031746031746e-06, "loss": 0.0263, "step": 2370 }, { "epoch": 9.371541501976285, "grad_norm": 0.856050968170166, "learning_rate": 3.154761904761905e-06, "loss": 0.032, "step": 2371 }, { "epoch": 9.375494071146244, "grad_norm": 0.4932451844215393, "learning_rate": 3.134920634920635e-06, "loss": 0.0264, "step": 2372 }, { "epoch": 9.379446640316205, "grad_norm": 0.6153606176376343, "learning_rate": 3.1150793650793653e-06, "loss": 0.0281, "step": 2373 }, { "epoch": 9.383399209486166, "grad_norm": 0.8731127381324768, "learning_rate": 3.0952380952380953e-06, "loss": 0.0324, "step": 2374 }, { "epoch": 9.387351778656127, "grad_norm": 0.7539214491844177, "learning_rate": 3.0753968253968252e-06, "loss": 0.0375, "step": 2375 }, { "epoch": 9.391304347826088, "grad_norm": 1.1730583906173706, "learning_rate": 3.0555555555555556e-06, "loss": 0.0359, "step": 2376 }, { "epoch": 9.395256916996047, "grad_norm": 0.7655700445175171, "learning_rate": 3.035714285714286e-06, "loss": 0.0297, "step": 2377 }, { "epoch": 9.399209486166008, "grad_norm": 0.5464181900024414, "learning_rate": 3.015873015873016e-06, "loss": 0.0272, "step": 2378 }, { "epoch": 9.403162055335969, "grad_norm": 1.2985944747924805, "learning_rate": 2.9960317460317463e-06, "loss": 0.0368, "step": 2379 }, { "epoch": 9.40711462450593, "grad_norm": 0.7202474474906921, "learning_rate": 2.9761904761904763e-06, "loss": 0.0431, "step": 2380 }, { "epoch": 9.411067193675889, "grad_norm": 0.859272301197052, "learning_rate": 2.9563492063492062e-06, "loss": 0.0339, "step": 2381 }, { "epoch": 9.41501976284585, "grad_norm": 0.9698725342750549, "learning_rate": 2.9365079365079366e-06, "loss": 0.0264, "step": 2382 }, { "epoch": 9.41897233201581, "grad_norm": 0.8615301251411438, "learning_rate": 2.916666666666667e-06, "loss": 0.0291, "step": 2383 }, { "epoch": 9.422924901185771, "grad_norm": 0.5525624752044678, "learning_rate": 2.896825396825397e-06, "loss": 0.0368, "step": 2384 }, { "epoch": 9.42687747035573, "grad_norm": 1.0065622329711914, "learning_rate": 2.876984126984127e-06, "loss": 0.0386, "step": 2385 }, { "epoch": 9.430830039525691, "grad_norm": 1.2710402011871338, "learning_rate": 2.8571428571428573e-06, "loss": 0.0432, "step": 2386 }, { "epoch": 9.434782608695652, "grad_norm": 0.5823608636856079, "learning_rate": 2.837301587301587e-06, "loss": 0.0302, "step": 2387 }, { "epoch": 9.438735177865613, "grad_norm": 0.8949801325798035, "learning_rate": 2.8174603174603176e-06, "loss": 0.0464, "step": 2388 }, { "epoch": 9.442687747035572, "grad_norm": 0.6641007661819458, "learning_rate": 2.797619047619048e-06, "loss": 0.0232, "step": 2389 }, { "epoch": 9.446640316205533, "grad_norm": 0.9579842686653137, "learning_rate": 2.777777777777778e-06, "loss": 0.0421, "step": 2390 }, { "epoch": 9.450592885375494, "grad_norm": 0.843721866607666, "learning_rate": 2.757936507936508e-06, "loss": 0.0329, "step": 2391 }, { "epoch": 9.454545454545455, "grad_norm": 0.8626483082771301, "learning_rate": 2.7380952380952382e-06, "loss": 0.0407, "step": 2392 }, { "epoch": 9.458498023715414, "grad_norm": 0.610456109046936, "learning_rate": 2.7182539682539686e-06, "loss": 0.0307, "step": 2393 }, { "epoch": 9.462450592885375, "grad_norm": 0.6262418031692505, "learning_rate": 2.6984126984126986e-06, "loss": 0.0261, "step": 2394 }, { "epoch": 9.466403162055336, "grad_norm": 0.5422096848487854, "learning_rate": 2.6785714285714285e-06, "loss": 0.0271, "step": 2395 }, { "epoch": 9.470355731225297, "grad_norm": 0.46671536564826965, "learning_rate": 2.658730158730159e-06, "loss": 0.0212, "step": 2396 }, { "epoch": 9.474308300395258, "grad_norm": 0.6522403359413147, "learning_rate": 2.638888888888889e-06, "loss": 0.0346, "step": 2397 }, { "epoch": 9.478260869565217, "grad_norm": 1.04923677444458, "learning_rate": 2.6190476190476192e-06, "loss": 0.0386, "step": 2398 }, { "epoch": 9.482213438735178, "grad_norm": 0.817672610282898, "learning_rate": 2.5992063492063496e-06, "loss": 0.0375, "step": 2399 }, { "epoch": 9.486166007905139, "grad_norm": 0.4436691105365753, "learning_rate": 2.5793650793650795e-06, "loss": 0.0337, "step": 2400 }, { "epoch": 9.4901185770751, "grad_norm": 0.5380107164382935, "learning_rate": 2.5595238095238095e-06, "loss": 0.0252, "step": 2401 }, { "epoch": 9.494071146245059, "grad_norm": 0.5638983845710754, "learning_rate": 2.53968253968254e-06, "loss": 0.0258, "step": 2402 }, { "epoch": 9.49802371541502, "grad_norm": 0.5976006388664246, "learning_rate": 2.51984126984127e-06, "loss": 0.026, "step": 2403 }, { "epoch": 9.50197628458498, "grad_norm": 0.8063969612121582, "learning_rate": 2.5e-06, "loss": 0.035, "step": 2404 }, { "epoch": 9.505928853754941, "grad_norm": 0.7016451954841614, "learning_rate": 2.48015873015873e-06, "loss": 0.031, "step": 2405 }, { "epoch": 9.5098814229249, "grad_norm": 0.6681998372077942, "learning_rate": 2.4603174603174605e-06, "loss": 0.0293, "step": 2406 }, { "epoch": 9.513833992094861, "grad_norm": 0.8126972913742065, "learning_rate": 2.4404761904761905e-06, "loss": 0.0296, "step": 2407 }, { "epoch": 9.517786561264822, "grad_norm": 0.6644697189331055, "learning_rate": 2.420634920634921e-06, "loss": 0.028, "step": 2408 }, { "epoch": 9.521739130434783, "grad_norm": 0.7953290939331055, "learning_rate": 2.4007936507936512e-06, "loss": 0.0244, "step": 2409 }, { "epoch": 9.525691699604742, "grad_norm": 0.8905209302902222, "learning_rate": 2.3809523809523808e-06, "loss": 0.0629, "step": 2410 }, { "epoch": 9.529644268774703, "grad_norm": 0.6507531404495239, "learning_rate": 2.361111111111111e-06, "loss": 0.026, "step": 2411 }, { "epoch": 9.533596837944664, "grad_norm": 0.6166356205940247, "learning_rate": 2.3412698412698415e-06, "loss": 0.0342, "step": 2412 }, { "epoch": 9.537549407114625, "grad_norm": 0.6299740076065063, "learning_rate": 2.3214285714285715e-06, "loss": 0.0307, "step": 2413 }, { "epoch": 9.541501976284586, "grad_norm": 0.6078210473060608, "learning_rate": 2.301587301587302e-06, "loss": 0.0341, "step": 2414 }, { "epoch": 9.545454545454545, "grad_norm": 1.263504981994629, "learning_rate": 2.281746031746032e-06, "loss": 0.0389, "step": 2415 }, { "epoch": 9.549407114624506, "grad_norm": 0.6518561244010925, "learning_rate": 2.2619047619047617e-06, "loss": 0.0344, "step": 2416 }, { "epoch": 9.553359683794467, "grad_norm": 0.9337310194969177, "learning_rate": 2.242063492063492e-06, "loss": 0.0299, "step": 2417 }, { "epoch": 9.557312252964428, "grad_norm": 0.6526855826377869, "learning_rate": 2.2222222222222225e-06, "loss": 0.0307, "step": 2418 }, { "epoch": 9.561264822134387, "grad_norm": 0.8223960399627686, "learning_rate": 2.2023809523809525e-06, "loss": 0.0279, "step": 2419 }, { "epoch": 9.565217391304348, "grad_norm": 0.6811292171478271, "learning_rate": 2.1825396825396824e-06, "loss": 0.0301, "step": 2420 }, { "epoch": 9.569169960474309, "grad_norm": 0.9670735597610474, "learning_rate": 2.1626984126984128e-06, "loss": 0.031, "step": 2421 }, { "epoch": 9.57312252964427, "grad_norm": 0.5966360569000244, "learning_rate": 2.142857142857143e-06, "loss": 0.0266, "step": 2422 }, { "epoch": 9.577075098814229, "grad_norm": 0.6706477999687195, "learning_rate": 2.123015873015873e-06, "loss": 0.0356, "step": 2423 }, { "epoch": 9.58102766798419, "grad_norm": 0.7263142466545105, "learning_rate": 2.1031746031746035e-06, "loss": 0.0336, "step": 2424 }, { "epoch": 9.58498023715415, "grad_norm": 0.509760856628418, "learning_rate": 2.0833333333333334e-06, "loss": 0.0276, "step": 2425 }, { "epoch": 9.588932806324111, "grad_norm": 0.6894228458404541, "learning_rate": 2.0634920634920634e-06, "loss": 0.0322, "step": 2426 }, { "epoch": 9.59288537549407, "grad_norm": 0.751865029335022, "learning_rate": 2.0436507936507938e-06, "loss": 0.0364, "step": 2427 }, { "epoch": 9.596837944664031, "grad_norm": 0.9960070848464966, "learning_rate": 2.023809523809524e-06, "loss": 0.0268, "step": 2428 }, { "epoch": 9.600790513833992, "grad_norm": 1.3808013200759888, "learning_rate": 2.003968253968254e-06, "loss": 0.0395, "step": 2429 }, { "epoch": 9.604743083003953, "grad_norm": 0.6000566482543945, "learning_rate": 1.984126984126984e-06, "loss": 0.0259, "step": 2430 }, { "epoch": 9.608695652173914, "grad_norm": 0.715231716632843, "learning_rate": 1.9642857142857144e-06, "loss": 0.0297, "step": 2431 }, { "epoch": 9.612648221343873, "grad_norm": 0.8936877250671387, "learning_rate": 1.9444444444444444e-06, "loss": 0.0346, "step": 2432 }, { "epoch": 9.616600790513834, "grad_norm": 0.7350375056266785, "learning_rate": 1.9246031746031747e-06, "loss": 0.0375, "step": 2433 }, { "epoch": 9.620553359683795, "grad_norm": 0.7897645235061646, "learning_rate": 1.9047619047619051e-06, "loss": 0.0312, "step": 2434 }, { "epoch": 9.624505928853754, "grad_norm": 0.4184766113758087, "learning_rate": 1.8849206349206349e-06, "loss": 0.0244, "step": 2435 }, { "epoch": 9.628458498023715, "grad_norm": 0.7881268262863159, "learning_rate": 1.8650793650793652e-06, "loss": 0.0385, "step": 2436 }, { "epoch": 9.632411067193676, "grad_norm": 0.83879554271698, "learning_rate": 1.8452380952380954e-06, "loss": 0.0278, "step": 2437 }, { "epoch": 9.636363636363637, "grad_norm": 0.9279481768608093, "learning_rate": 1.8253968253968256e-06, "loss": 0.0288, "step": 2438 }, { "epoch": 9.640316205533598, "grad_norm": 1.2790271043777466, "learning_rate": 1.8055555555555555e-06, "loss": 0.0289, "step": 2439 }, { "epoch": 9.644268774703557, "grad_norm": 0.5256231427192688, "learning_rate": 1.7857142857142857e-06, "loss": 0.0262, "step": 2440 }, { "epoch": 9.648221343873518, "grad_norm": 0.5460281372070312, "learning_rate": 1.7658730158730158e-06, "loss": 0.0312, "step": 2441 }, { "epoch": 9.652173913043478, "grad_norm": 0.8667499423027039, "learning_rate": 1.7460317460317462e-06, "loss": 0.0355, "step": 2442 }, { "epoch": 9.65612648221344, "grad_norm": 1.1661618947982788, "learning_rate": 1.7261904761904764e-06, "loss": 0.0397, "step": 2443 }, { "epoch": 9.660079051383399, "grad_norm": 0.5786067843437195, "learning_rate": 1.7063492063492063e-06, "loss": 0.0255, "step": 2444 }, { "epoch": 9.66403162055336, "grad_norm": 1.2098567485809326, "learning_rate": 1.6865079365079365e-06, "loss": 0.0829, "step": 2445 }, { "epoch": 9.66798418972332, "grad_norm": 1.0478078126907349, "learning_rate": 1.6666666666666667e-06, "loss": 0.0351, "step": 2446 }, { "epoch": 9.671936758893281, "grad_norm": 0.5507435202598572, "learning_rate": 1.646825396825397e-06, "loss": 0.0268, "step": 2447 }, { "epoch": 9.67588932806324, "grad_norm": 0.4946918785572052, "learning_rate": 1.6269841269841272e-06, "loss": 0.0291, "step": 2448 }, { "epoch": 9.679841897233201, "grad_norm": 1.1714826822280884, "learning_rate": 1.6071428571428572e-06, "loss": 0.0418, "step": 2449 }, { "epoch": 9.683794466403162, "grad_norm": 0.6348810791969299, "learning_rate": 1.5873015873015873e-06, "loss": 0.0329, "step": 2450 }, { "epoch": 9.687747035573123, "grad_norm": 1.8041942119598389, "learning_rate": 1.5674603174603175e-06, "loss": 0.0434, "step": 2451 }, { "epoch": 9.691699604743082, "grad_norm": 0.9931659698486328, "learning_rate": 1.5476190476190476e-06, "loss": 0.0405, "step": 2452 }, { "epoch": 9.695652173913043, "grad_norm": 0.7843307256698608, "learning_rate": 1.5277777777777778e-06, "loss": 0.0316, "step": 2453 }, { "epoch": 9.699604743083004, "grad_norm": 0.6582829356193542, "learning_rate": 1.507936507936508e-06, "loss": 0.0373, "step": 2454 }, { "epoch": 9.703557312252965, "grad_norm": 1.3511526584625244, "learning_rate": 1.4880952380952381e-06, "loss": 0.0397, "step": 2455 }, { "epoch": 9.707509881422926, "grad_norm": 0.6811285614967346, "learning_rate": 1.4682539682539683e-06, "loss": 0.0257, "step": 2456 }, { "epoch": 9.711462450592885, "grad_norm": 1.0743825435638428, "learning_rate": 1.4484126984126985e-06, "loss": 0.0378, "step": 2457 }, { "epoch": 9.715415019762846, "grad_norm": 0.5769475102424622, "learning_rate": 1.4285714285714286e-06, "loss": 0.0271, "step": 2458 }, { "epoch": 9.719367588932807, "grad_norm": 0.5640432834625244, "learning_rate": 1.4087301587301588e-06, "loss": 0.023, "step": 2459 }, { "epoch": 9.723320158102768, "grad_norm": 0.6095978021621704, "learning_rate": 1.388888888888889e-06, "loss": 0.0368, "step": 2460 }, { "epoch": 9.727272727272727, "grad_norm": 0.6505516767501831, "learning_rate": 1.3690476190476191e-06, "loss": 0.025, "step": 2461 }, { "epoch": 9.731225296442688, "grad_norm": 0.7247635722160339, "learning_rate": 1.3492063492063493e-06, "loss": 0.0271, "step": 2462 }, { "epoch": 9.735177865612648, "grad_norm": 0.7161766290664673, "learning_rate": 1.3293650793650794e-06, "loss": 0.0316, "step": 2463 }, { "epoch": 9.73913043478261, "grad_norm": 0.8741920590400696, "learning_rate": 1.3095238095238096e-06, "loss": 0.0356, "step": 2464 }, { "epoch": 9.743083003952568, "grad_norm": 0.7110796570777893, "learning_rate": 1.2896825396825398e-06, "loss": 0.0312, "step": 2465 }, { "epoch": 9.74703557312253, "grad_norm": 0.5976476669311523, "learning_rate": 1.26984126984127e-06, "loss": 0.0341, "step": 2466 }, { "epoch": 9.75098814229249, "grad_norm": 1.2879040241241455, "learning_rate": 1.25e-06, "loss": 0.0361, "step": 2467 }, { "epoch": 9.754940711462451, "grad_norm": 0.5123355388641357, "learning_rate": 1.2301587301587303e-06, "loss": 0.0235, "step": 2468 }, { "epoch": 9.75889328063241, "grad_norm": 1.0054389238357544, "learning_rate": 1.2103174603174604e-06, "loss": 0.0441, "step": 2469 }, { "epoch": 9.762845849802371, "grad_norm": 0.8575003147125244, "learning_rate": 1.1904761904761904e-06, "loss": 0.0337, "step": 2470 }, { "epoch": 9.766798418972332, "grad_norm": 0.6102113127708435, "learning_rate": 1.1706349206349208e-06, "loss": 0.0359, "step": 2471 }, { "epoch": 9.770750988142293, "grad_norm": 1.0554832220077515, "learning_rate": 1.150793650793651e-06, "loss": 0.0388, "step": 2472 }, { "epoch": 9.774703557312254, "grad_norm": 0.7735835909843445, "learning_rate": 1.1309523809523809e-06, "loss": 0.0271, "step": 2473 }, { "epoch": 9.778656126482213, "grad_norm": 1.1842035055160522, "learning_rate": 1.1111111111111112e-06, "loss": 0.0413, "step": 2474 }, { "epoch": 9.782608695652174, "grad_norm": 0.8943471312522888, "learning_rate": 1.0912698412698412e-06, "loss": 0.0396, "step": 2475 }, { "epoch": 9.786561264822135, "grad_norm": 0.7007603049278259, "learning_rate": 1.0714285714285716e-06, "loss": 0.0315, "step": 2476 }, { "epoch": 9.790513833992096, "grad_norm": 0.545301616191864, "learning_rate": 1.0515873015873017e-06, "loss": 0.0243, "step": 2477 }, { "epoch": 9.794466403162055, "grad_norm": 0.3348066210746765, "learning_rate": 1.0317460317460317e-06, "loss": 0.0259, "step": 2478 }, { "epoch": 9.798418972332016, "grad_norm": 0.856115996837616, "learning_rate": 1.011904761904762e-06, "loss": 0.0496, "step": 2479 }, { "epoch": 9.802371541501977, "grad_norm": 1.026168704032898, "learning_rate": 9.92063492063492e-07, "loss": 0.0343, "step": 2480 }, { "epoch": 9.806324110671937, "grad_norm": 0.5965343117713928, "learning_rate": 9.722222222222222e-07, "loss": 0.0248, "step": 2481 }, { "epoch": 9.810276679841897, "grad_norm": 0.6260294318199158, "learning_rate": 9.523809523809526e-07, "loss": 0.0229, "step": 2482 }, { "epoch": 9.814229249011857, "grad_norm": 0.7057980298995972, "learning_rate": 9.325396825396826e-07, "loss": 0.0344, "step": 2483 }, { "epoch": 9.818181818181818, "grad_norm": 1.1280661821365356, "learning_rate": 9.126984126984128e-07, "loss": 0.0297, "step": 2484 }, { "epoch": 9.82213438735178, "grad_norm": 0.6508181095123291, "learning_rate": 8.928571428571428e-07, "loss": 0.0312, "step": 2485 }, { "epoch": 9.826086956521738, "grad_norm": 2.5763347148895264, "learning_rate": 8.730158730158731e-07, "loss": 0.0786, "step": 2486 }, { "epoch": 9.8300395256917, "grad_norm": 0.6393975615501404, "learning_rate": 8.531746031746032e-07, "loss": 0.0292, "step": 2487 }, { "epoch": 9.83399209486166, "grad_norm": 0.559859037399292, "learning_rate": 8.333333333333333e-07, "loss": 0.0322, "step": 2488 }, { "epoch": 9.837944664031621, "grad_norm": 0.8873099088668823, "learning_rate": 8.134920634920636e-07, "loss": 0.0402, "step": 2489 }, { "epoch": 9.841897233201582, "grad_norm": 1.0293989181518555, "learning_rate": 7.936507936507937e-07, "loss": 0.0487, "step": 2490 }, { "epoch": 9.845849802371541, "grad_norm": 0.8084055185317993, "learning_rate": 7.738095238095238e-07, "loss": 0.0415, "step": 2491 }, { "epoch": 9.849802371541502, "grad_norm": 0.8011215329170227, "learning_rate": 7.53968253968254e-07, "loss": 0.0332, "step": 2492 }, { "epoch": 9.853754940711463, "grad_norm": 0.6973430514335632, "learning_rate": 7.341269841269842e-07, "loss": 0.0338, "step": 2493 }, { "epoch": 9.857707509881424, "grad_norm": 1.018647313117981, "learning_rate": 7.142857142857143e-07, "loss": 0.0305, "step": 2494 }, { "epoch": 9.861660079051383, "grad_norm": 0.7031568884849548, "learning_rate": 6.944444444444445e-07, "loss": 0.0297, "step": 2495 }, { "epoch": 9.865612648221344, "grad_norm": 0.6655853986740112, "learning_rate": 6.746031746031746e-07, "loss": 0.0298, "step": 2496 }, { "epoch": 9.869565217391305, "grad_norm": 1.6888632774353027, "learning_rate": 6.547619047619048e-07, "loss": 0.0578, "step": 2497 }, { "epoch": 9.873517786561266, "grad_norm": 1.1527448892593384, "learning_rate": 6.34920634920635e-07, "loss": 0.0444, "step": 2498 }, { "epoch": 9.877470355731225, "grad_norm": 0.6494206190109253, "learning_rate": 6.150793650793651e-07, "loss": 0.034, "step": 2499 }, { "epoch": 9.881422924901186, "grad_norm": 0.9560872912406921, "learning_rate": 5.952380952380952e-07, "loss": 0.0283, "step": 2500 } ], "logging_steps": 1, "max_steps": 2530, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.237898477266954e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }