diff --git "a/checkpoints/checkpoint-2500/trainer_state.json" "b/checkpoints/checkpoint-2500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/checkpoint-2500/trainer_state.json" @@ -0,0 +1,17533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.881422924901186, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003952569169960474, + "grad_norm": 10.89438533782959, + "learning_rate": 5e-06, + "loss": 4.1054, + "step": 1 + }, + { + "epoch": 0.007905138339920948, + "grad_norm": 10.555335998535156, + "learning_rate": 1e-05, + "loss": 4.1308, + "step": 2 + }, + { + "epoch": 0.011857707509881422, + "grad_norm": 10.703575134277344, + "learning_rate": 1.5e-05, + "loss": 4.3599, + "step": 3 + }, + { + "epoch": 0.015810276679841896, + "grad_norm": 5.956073760986328, + "learning_rate": 2e-05, + "loss": 3.8205, + "step": 4 + }, + { + "epoch": 0.019762845849802372, + "grad_norm": 6.856590270996094, + "learning_rate": 2.5e-05, + "loss": 3.5323, + "step": 5 + }, + { + "epoch": 0.023715415019762844, + "grad_norm": 7.316745281219482, + "learning_rate": 3e-05, + "loss": 3.7808, + "step": 6 + }, + { + "epoch": 0.02766798418972332, + "grad_norm": 4.9175705909729, + "learning_rate": 3.5e-05, + "loss": 3.4615, + "step": 7 + }, + { + "epoch": 0.03162055335968379, + "grad_norm": 4.404887676239014, + "learning_rate": 4e-05, + "loss": 3.3232, + "step": 8 + }, + { + "epoch": 0.03557312252964427, + "grad_norm": 4.211629390716553, + "learning_rate": 4.5e-05, + "loss": 3.2175, + "step": 9 + }, + { + "epoch": 0.039525691699604744, + "grad_norm": 4.854459762573242, + "learning_rate": 5e-05, + "loss": 3.3368, + "step": 10 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 3.71142840385437, + "learning_rate": 4.998015873015873e-05, + "loss": 3.2278, + "step": 11 + }, + { + "epoch": 0.04743083003952569, + "grad_norm": 2.8285276889801025, + "learning_rate": 4.996031746031746e-05, + "loss": 2.9991, + "step": 12 + }, + { + "epoch": 0.05138339920948617, + "grad_norm": 2.7281620502471924, + "learning_rate": 4.994047619047619e-05, + "loss": 3.2112, + "step": 13 + }, + { + "epoch": 0.05533596837944664, + "grad_norm": 2.6310040950775146, + "learning_rate": 4.9920634920634924e-05, + "loss": 3.3289, + "step": 14 + }, + { + "epoch": 0.05928853754940711, + "grad_norm": 2.928968667984009, + "learning_rate": 4.990079365079365e-05, + "loss": 3.3052, + "step": 15 + }, + { + "epoch": 0.06324110671936758, + "grad_norm": 2.7853801250457764, + "learning_rate": 4.9880952380952385e-05, + "loss": 3.0065, + "step": 16 + }, + { + "epoch": 0.06719367588932806, + "grad_norm": 2.0545976161956787, + "learning_rate": 4.986111111111111e-05, + "loss": 3.0645, + "step": 17 + }, + { + "epoch": 0.07114624505928854, + "grad_norm": 2.3543589115142822, + "learning_rate": 4.9841269841269845e-05, + "loss": 3.0591, + "step": 18 + }, + { + "epoch": 0.07509881422924901, + "grad_norm": 2.1679906845092773, + "learning_rate": 4.982142857142857e-05, + "loss": 2.9374, + "step": 19 + }, + { + "epoch": 0.07905138339920949, + "grad_norm": 2.5650746822357178, + "learning_rate": 4.9801587301587306e-05, + "loss": 2.9292, + "step": 20 + }, + { + "epoch": 0.08300395256916997, + "grad_norm": 2.4944090843200684, + "learning_rate": 4.978174603174603e-05, + "loss": 3.0967, + "step": 21 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 2.8087055683135986, + "learning_rate": 4.976190476190477e-05, + "loss": 3.076, + "step": 22 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.570450782775879, + "learning_rate": 4.9742063492063494e-05, + "loss": 2.8585, + "step": 23 + }, + { + "epoch": 0.09486166007905138, + "grad_norm": 2.3296656608581543, + "learning_rate": 4.972222222222223e-05, + "loss": 2.7182, + "step": 24 + }, + { + "epoch": 0.09881422924901186, + "grad_norm": 2.1972436904907227, + "learning_rate": 4.9702380952380955e-05, + "loss": 2.8991, + "step": 25 + }, + { + "epoch": 0.10276679841897234, + "grad_norm": 2.7606372833251953, + "learning_rate": 4.968253968253969e-05, + "loss": 2.912, + "step": 26 + }, + { + "epoch": 0.1067193675889328, + "grad_norm": 2.7917637825012207, + "learning_rate": 4.9662698412698415e-05, + "loss": 2.8944, + "step": 27 + }, + { + "epoch": 0.11067193675889328, + "grad_norm": 2.271846055984497, + "learning_rate": 4.964285714285715e-05, + "loss": 2.7234, + "step": 28 + }, + { + "epoch": 0.11462450592885376, + "grad_norm": 2.8581902980804443, + "learning_rate": 4.9623015873015876e-05, + "loss": 2.662, + "step": 29 + }, + { + "epoch": 0.11857707509881422, + "grad_norm": 2.4018585681915283, + "learning_rate": 4.960317460317461e-05, + "loss": 2.554, + "step": 30 + }, + { + "epoch": 0.1225296442687747, + "grad_norm": 3.0106191635131836, + "learning_rate": 4.958333333333334e-05, + "loss": 2.6187, + "step": 31 + }, + { + "epoch": 0.12648221343873517, + "grad_norm": 2.4871628284454346, + "learning_rate": 4.956349206349207e-05, + "loss": 2.5486, + "step": 32 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 3.1579999923706055, + "learning_rate": 4.95436507936508e-05, + "loss": 2.4142, + "step": 33 + }, + { + "epoch": 0.13438735177865613, + "grad_norm": 2.4816365242004395, + "learning_rate": 4.9523809523809525e-05, + "loss": 2.8092, + "step": 34 + }, + { + "epoch": 0.1383399209486166, + "grad_norm": 2.4598770141601562, + "learning_rate": 4.950396825396826e-05, + "loss": 2.5606, + "step": 35 + }, + { + "epoch": 0.1422924901185771, + "grad_norm": 2.9210352897644043, + "learning_rate": 4.9484126984126985e-05, + "loss": 2.6477, + "step": 36 + }, + { + "epoch": 0.14624505928853754, + "grad_norm": 2.082595109939575, + "learning_rate": 4.946428571428572e-05, + "loss": 2.6539, + "step": 37 + }, + { + "epoch": 0.15019762845849802, + "grad_norm": 2.171468734741211, + "learning_rate": 4.9444444444444446e-05, + "loss": 2.8466, + "step": 38 + }, + { + "epoch": 0.1541501976284585, + "grad_norm": 2.395892858505249, + "learning_rate": 4.942460317460318e-05, + "loss": 2.4224, + "step": 39 + }, + { + "epoch": 0.15810276679841898, + "grad_norm": 3.123197317123413, + "learning_rate": 4.940476190476191e-05, + "loss": 2.9204, + "step": 40 + }, + { + "epoch": 0.16205533596837945, + "grad_norm": 2.659156084060669, + "learning_rate": 4.938492063492064e-05, + "loss": 2.43, + "step": 41 + }, + { + "epoch": 0.16600790513833993, + "grad_norm": 2.520598888397217, + "learning_rate": 4.936507936507937e-05, + "loss": 2.4932, + "step": 42 + }, + { + "epoch": 0.16996047430830039, + "grad_norm": 2.3670473098754883, + "learning_rate": 4.93452380952381e-05, + "loss": 2.5526, + "step": 43 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 2.4440720081329346, + "learning_rate": 4.932539682539683e-05, + "loss": 2.7987, + "step": 44 + }, + { + "epoch": 0.17786561264822134, + "grad_norm": 2.4289233684539795, + "learning_rate": 4.930555555555556e-05, + "loss": 2.5825, + "step": 45 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.3470001220703125, + "learning_rate": 4.928571428571429e-05, + "loss": 2.6192, + "step": 46 + }, + { + "epoch": 0.1857707509881423, + "grad_norm": 2.0881083011627197, + "learning_rate": 4.926587301587302e-05, + "loss": 2.5765, + "step": 47 + }, + { + "epoch": 0.18972332015810275, + "grad_norm": 2.184652328491211, + "learning_rate": 4.924603174603175e-05, + "loss": 2.5345, + "step": 48 + }, + { + "epoch": 0.19367588932806323, + "grad_norm": 2.1539316177368164, + "learning_rate": 4.9226190476190484e-05, + "loss": 2.7078, + "step": 49 + }, + { + "epoch": 0.1976284584980237, + "grad_norm": 2.476987361907959, + "learning_rate": 4.9206349206349204e-05, + "loss": 2.5304, + "step": 50 + }, + { + "epoch": 0.2015810276679842, + "grad_norm": 2.591413974761963, + "learning_rate": 4.918650793650794e-05, + "loss": 2.2591, + "step": 51 + }, + { + "epoch": 0.20553359683794467, + "grad_norm": 6.341241836547852, + "learning_rate": 4.9166666666666665e-05, + "loss": 2.4837, + "step": 52 + }, + { + "epoch": 0.20948616600790515, + "grad_norm": 2.9513611793518066, + "learning_rate": 4.91468253968254e-05, + "loss": 2.6514, + "step": 53 + }, + { + "epoch": 0.2134387351778656, + "grad_norm": 2.2543134689331055, + "learning_rate": 4.9126984126984125e-05, + "loss": 2.657, + "step": 54 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 2.0405731201171875, + "learning_rate": 4.910714285714286e-05, + "loss": 2.5088, + "step": 55 + }, + { + "epoch": 0.22134387351778656, + "grad_norm": 2.6079225540161133, + "learning_rate": 4.9087301587301586e-05, + "loss": 2.2703, + "step": 56 + }, + { + "epoch": 0.22529644268774704, + "grad_norm": 2.587153673171997, + "learning_rate": 4.906746031746032e-05, + "loss": 2.5027, + "step": 57 + }, + { + "epoch": 0.22924901185770752, + "grad_norm": 2.083115577697754, + "learning_rate": 4.904761904761905e-05, + "loss": 2.7224, + "step": 58 + }, + { + "epoch": 0.233201581027668, + "grad_norm": 2.291238784790039, + "learning_rate": 4.902777777777778e-05, + "loss": 2.7383, + "step": 59 + }, + { + "epoch": 0.23715415019762845, + "grad_norm": 2.4876511096954346, + "learning_rate": 4.900793650793651e-05, + "loss": 2.3814, + "step": 60 + }, + { + "epoch": 0.24110671936758893, + "grad_norm": 2.6496829986572266, + "learning_rate": 4.898809523809524e-05, + "loss": 2.2271, + "step": 61 + }, + { + "epoch": 0.2450592885375494, + "grad_norm": 2.8697926998138428, + "learning_rate": 4.896825396825397e-05, + "loss": 2.5395, + "step": 62 + }, + { + "epoch": 0.2490118577075099, + "grad_norm": 2.5211477279663086, + "learning_rate": 4.89484126984127e-05, + "loss": 2.0096, + "step": 63 + }, + { + "epoch": 0.25296442687747034, + "grad_norm": 2.0613839626312256, + "learning_rate": 4.892857142857143e-05, + "loss": 2.1318, + "step": 64 + }, + { + "epoch": 0.25691699604743085, + "grad_norm": 2.4938395023345947, + "learning_rate": 4.8908730158730156e-05, + "loss": 2.4198, + "step": 65 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 2.5162112712860107, + "learning_rate": 4.888888888888889e-05, + "loss": 2.2697, + "step": 66 + }, + { + "epoch": 0.2648221343873518, + "grad_norm": 2.361053943634033, + "learning_rate": 4.886904761904762e-05, + "loss": 2.6438, + "step": 67 + }, + { + "epoch": 0.26877470355731226, + "grad_norm": 2.257228374481201, + "learning_rate": 4.884920634920635e-05, + "loss": 2.4411, + "step": 68 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 3.0840468406677246, + "learning_rate": 4.882936507936508e-05, + "loss": 2.1741, + "step": 69 + }, + { + "epoch": 0.2766798418972332, + "grad_norm": 2.4883201122283936, + "learning_rate": 4.880952380952381e-05, + "loss": 2.2851, + "step": 70 + }, + { + "epoch": 0.28063241106719367, + "grad_norm": 2.0108885765075684, + "learning_rate": 4.878968253968254e-05, + "loss": 2.3053, + "step": 71 + }, + { + "epoch": 0.2845849802371542, + "grad_norm": 2.4046590328216553, + "learning_rate": 4.876984126984127e-05, + "loss": 2.2728, + "step": 72 + }, + { + "epoch": 0.2885375494071146, + "grad_norm": 2.492821216583252, + "learning_rate": 4.875e-05, + "loss": 2.463, + "step": 73 + }, + { + "epoch": 0.2924901185770751, + "grad_norm": 1.9024349451065063, + "learning_rate": 4.873015873015873e-05, + "loss": 2.5196, + "step": 74 + }, + { + "epoch": 0.2964426877470356, + "grad_norm": 2.240678310394287, + "learning_rate": 4.871031746031746e-05, + "loss": 2.2548, + "step": 75 + }, + { + "epoch": 0.30039525691699603, + "grad_norm": 2.447615146636963, + "learning_rate": 4.8690476190476194e-05, + "loss": 2.9293, + "step": 76 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 1.805576205253601, + "learning_rate": 4.867063492063492e-05, + "loss": 2.3533, + "step": 77 + }, + { + "epoch": 0.308300395256917, + "grad_norm": 2.2282891273498535, + "learning_rate": 4.8650793650793654e-05, + "loss": 2.5451, + "step": 78 + }, + { + "epoch": 0.31225296442687744, + "grad_norm": 2.568352699279785, + "learning_rate": 4.863095238095238e-05, + "loss": 1.9444, + "step": 79 + }, + { + "epoch": 0.31620553359683795, + "grad_norm": 1.9792273044586182, + "learning_rate": 4.8611111111111115e-05, + "loss": 2.4367, + "step": 80 + }, + { + "epoch": 0.3201581027667984, + "grad_norm": 2.3930373191833496, + "learning_rate": 4.859126984126984e-05, + "loss": 2.3669, + "step": 81 + }, + { + "epoch": 0.3241106719367589, + "grad_norm": 2.2622337341308594, + "learning_rate": 4.8571428571428576e-05, + "loss": 2.4811, + "step": 82 + }, + { + "epoch": 0.32806324110671936, + "grad_norm": 2.4643726348876953, + "learning_rate": 4.85515873015873e-05, + "loss": 2.3285, + "step": 83 + }, + { + "epoch": 0.33201581027667987, + "grad_norm": 2.4243311882019043, + "learning_rate": 4.853174603174604e-05, + "loss": 2.579, + "step": 84 + }, + { + "epoch": 0.3359683794466403, + "grad_norm": 3.0440781116485596, + "learning_rate": 4.8511904761904764e-05, + "loss": 2.0484, + "step": 85 + }, + { + "epoch": 0.33992094861660077, + "grad_norm": 2.282578706741333, + "learning_rate": 4.84920634920635e-05, + "loss": 2.2485, + "step": 86 + }, + { + "epoch": 0.3438735177865613, + "grad_norm": 2.2174232006073, + "learning_rate": 4.8472222222222224e-05, + "loss": 2.6938, + "step": 87 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 2.4350028038024902, + "learning_rate": 4.845238095238095e-05, + "loss": 2.1194, + "step": 88 + }, + { + "epoch": 0.35177865612648224, + "grad_norm": 2.424428939819336, + "learning_rate": 4.8432539682539685e-05, + "loss": 2.1156, + "step": 89 + }, + { + "epoch": 0.3557312252964427, + "grad_norm": 2.444096326828003, + "learning_rate": 4.841269841269841e-05, + "loss": 2.2003, + "step": 90 + }, + { + "epoch": 0.35968379446640314, + "grad_norm": 2.2972753047943115, + "learning_rate": 4.8392857142857146e-05, + "loss": 2.273, + "step": 91 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 2.599595546722412, + "learning_rate": 4.837301587301587e-05, + "loss": 2.1506, + "step": 92 + }, + { + "epoch": 0.3675889328063241, + "grad_norm": 2.1127512454986572, + "learning_rate": 4.835317460317461e-05, + "loss": 2.2395, + "step": 93 + }, + { + "epoch": 0.3715415019762846, + "grad_norm": 1.8674228191375732, + "learning_rate": 4.8333333333333334e-05, + "loss": 2.4755, + "step": 94 + }, + { + "epoch": 0.37549407114624506, + "grad_norm": 2.062201499938965, + "learning_rate": 4.831349206349207e-05, + "loss": 2.8275, + "step": 95 + }, + { + "epoch": 0.3794466403162055, + "grad_norm": 2.196667432785034, + "learning_rate": 4.8293650793650794e-05, + "loss": 2.3944, + "step": 96 + }, + { + "epoch": 0.383399209486166, + "grad_norm": 2.0713300704956055, + "learning_rate": 4.827380952380953e-05, + "loss": 2.4696, + "step": 97 + }, + { + "epoch": 0.38735177865612647, + "grad_norm": 2.0104503631591797, + "learning_rate": 4.8253968253968255e-05, + "loss": 2.334, + "step": 98 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 2.1917121410369873, + "learning_rate": 4.823412698412699e-05, + "loss": 2.2834, + "step": 99 + }, + { + "epoch": 0.3952569169960474, + "grad_norm": 2.4141860008239746, + "learning_rate": 4.8214285714285716e-05, + "loss": 2.3086, + "step": 100 + }, + { + "epoch": 0.39920948616600793, + "grad_norm": 2.384432792663574, + "learning_rate": 4.819444444444445e-05, + "loss": 2.1402, + "step": 101 + }, + { + "epoch": 0.4031620553359684, + "grad_norm": 2.4520158767700195, + "learning_rate": 4.817460317460318e-05, + "loss": 2.033, + "step": 102 + }, + { + "epoch": 0.40711462450592883, + "grad_norm": 2.809053659439087, + "learning_rate": 4.815476190476191e-05, + "loss": 2.0992, + "step": 103 + }, + { + "epoch": 0.41106719367588934, + "grad_norm": 2.3782355785369873, + "learning_rate": 4.813492063492064e-05, + "loss": 2.3137, + "step": 104 + }, + { + "epoch": 0.4150197628458498, + "grad_norm": 1.9347542524337769, + "learning_rate": 4.811507936507937e-05, + "loss": 2.3339, + "step": 105 + }, + { + "epoch": 0.4189723320158103, + "grad_norm": 2.0949320793151855, + "learning_rate": 4.80952380952381e-05, + "loss": 2.4536, + "step": 106 + }, + { + "epoch": 0.42292490118577075, + "grad_norm": 1.8690857887268066, + "learning_rate": 4.807539682539683e-05, + "loss": 2.0155, + "step": 107 + }, + { + "epoch": 0.4268774703557312, + "grad_norm": 2.1170482635498047, + "learning_rate": 4.805555555555556e-05, + "loss": 2.2891, + "step": 108 + }, + { + "epoch": 0.4308300395256917, + "grad_norm": 2.1500346660614014, + "learning_rate": 4.803571428571429e-05, + "loss": 2.4996, + "step": 109 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 1.9133753776550293, + "learning_rate": 4.801587301587302e-05, + "loss": 2.57, + "step": 110 + }, + { + "epoch": 0.43873517786561267, + "grad_norm": 2.0104360580444336, + "learning_rate": 4.799603174603175e-05, + "loss": 2.1764, + "step": 111 + }, + { + "epoch": 0.4426877470355731, + "grad_norm": 2.2259364128112793, + "learning_rate": 4.797619047619048e-05, + "loss": 2.4447, + "step": 112 + }, + { + "epoch": 0.44664031620553357, + "grad_norm": 1.6618857383728027, + "learning_rate": 4.795634920634921e-05, + "loss": 2.3628, + "step": 113 + }, + { + "epoch": 0.4505928853754941, + "grad_norm": 2.0611374378204346, + "learning_rate": 4.793650793650794e-05, + "loss": 2.7009, + "step": 114 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 2.3287601470947266, + "learning_rate": 4.791666666666667e-05, + "loss": 2.5049, + "step": 115 + }, + { + "epoch": 0.45849802371541504, + "grad_norm": 2.238161563873291, + "learning_rate": 4.78968253968254e-05, + "loss": 2.3077, + "step": 116 + }, + { + "epoch": 0.4624505928853755, + "grad_norm": 2.3234920501708984, + "learning_rate": 4.787698412698413e-05, + "loss": 2.3173, + "step": 117 + }, + { + "epoch": 0.466403162055336, + "grad_norm": 2.1455657482147217, + "learning_rate": 4.785714285714286e-05, + "loss": 2.329, + "step": 118 + }, + { + "epoch": 0.47035573122529645, + "grad_norm": 2.4944536685943604, + "learning_rate": 4.783730158730159e-05, + "loss": 2.1652, + "step": 119 + }, + { + "epoch": 0.4743083003952569, + "grad_norm": 1.9373809099197388, + "learning_rate": 4.781746031746032e-05, + "loss": 2.3349, + "step": 120 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 2.0994389057159424, + "learning_rate": 4.779761904761905e-05, + "loss": 2.27, + "step": 121 + }, + { + "epoch": 0.48221343873517786, + "grad_norm": 1.697857141494751, + "learning_rate": 4.7777777777777784e-05, + "loss": 2.3405, + "step": 122 + }, + { + "epoch": 0.48616600790513836, + "grad_norm": 2.085141658782959, + "learning_rate": 4.775793650793651e-05, + "loss": 1.9531, + "step": 123 + }, + { + "epoch": 0.4901185770750988, + "grad_norm": 2.2885854244232178, + "learning_rate": 4.7738095238095245e-05, + "loss": 2.2837, + "step": 124 + }, + { + "epoch": 0.49407114624505927, + "grad_norm": 2.0135693550109863, + "learning_rate": 4.771825396825397e-05, + "loss": 2.3009, + "step": 125 + }, + { + "epoch": 0.4980237154150198, + "grad_norm": 2.485074520111084, + "learning_rate": 4.7698412698412706e-05, + "loss": 1.9311, + "step": 126 + }, + { + "epoch": 0.5019762845849802, + "grad_norm": 2.2360167503356934, + "learning_rate": 4.767857142857143e-05, + "loss": 2.426, + "step": 127 + }, + { + "epoch": 0.5059288537549407, + "grad_norm": 2.3205668926239014, + "learning_rate": 4.7658730158730166e-05, + "loss": 1.9649, + "step": 128 + }, + { + "epoch": 0.5098814229249012, + "grad_norm": 2.3518431186676025, + "learning_rate": 4.7638888888888887e-05, + "loss": 2.5223, + "step": 129 + }, + { + "epoch": 0.5138339920948617, + "grad_norm": 2.197111129760742, + "learning_rate": 4.761904761904762e-05, + "loss": 2.303, + "step": 130 + }, + { + "epoch": 0.5177865612648221, + "grad_norm": 2.4568827152252197, + "learning_rate": 4.759920634920635e-05, + "loss": 2.0753, + "step": 131 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 2.004725217819214, + "learning_rate": 4.757936507936508e-05, + "loss": 2.3092, + "step": 132 + }, + { + "epoch": 0.525691699604743, + "grad_norm": 2.0977554321289062, + "learning_rate": 4.755952380952381e-05, + "loss": 2.4393, + "step": 133 + }, + { + "epoch": 0.5296442687747036, + "grad_norm": 1.903745412826538, + "learning_rate": 4.753968253968254e-05, + "loss": 2.0935, + "step": 134 + }, + { + "epoch": 0.5335968379446641, + "grad_norm": 2.0144717693328857, + "learning_rate": 4.751984126984127e-05, + "loss": 2.2653, + "step": 135 + }, + { + "epoch": 0.5375494071146245, + "grad_norm": 2.0766761302948, + "learning_rate": 4.75e-05, + "loss": 2.2034, + "step": 136 + }, + { + "epoch": 0.541501976284585, + "grad_norm": 2.109747886657715, + "learning_rate": 4.748015873015873e-05, + "loss": 2.2778, + "step": 137 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 8.714595794677734, + "learning_rate": 4.746031746031746e-05, + "loss": 1.945, + "step": 138 + }, + { + "epoch": 0.549407114624506, + "grad_norm": 2.571970224380493, + "learning_rate": 4.744047619047619e-05, + "loss": 2.0047, + "step": 139 + }, + { + "epoch": 0.5533596837944664, + "grad_norm": 2.0782594680786133, + "learning_rate": 4.7420634920634924e-05, + "loss": 2.2821, + "step": 140 + }, + { + "epoch": 0.5573122529644269, + "grad_norm": 1.9552282094955444, + "learning_rate": 4.740079365079365e-05, + "loss": 1.9381, + "step": 141 + }, + { + "epoch": 0.5612648221343873, + "grad_norm": 2.5030784606933594, + "learning_rate": 4.738095238095238e-05, + "loss": 1.9552, + "step": 142 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 1.996016263961792, + "learning_rate": 4.736111111111111e-05, + "loss": 2.3815, + "step": 143 + }, + { + "epoch": 0.5691699604743083, + "grad_norm": 2.2575206756591797, + "learning_rate": 4.734126984126984e-05, + "loss": 2.3244, + "step": 144 + }, + { + "epoch": 0.5731225296442688, + "grad_norm": 2.067824363708496, + "learning_rate": 4.732142857142857e-05, + "loss": 2.338, + "step": 145 + }, + { + "epoch": 0.5770750988142292, + "grad_norm": 1.8195747137069702, + "learning_rate": 4.73015873015873e-05, + "loss": 2.4185, + "step": 146 + }, + { + "epoch": 0.5810276679841897, + "grad_norm": 2.230973958969116, + "learning_rate": 4.728174603174603e-05, + "loss": 2.2955, + "step": 147 + }, + { + "epoch": 0.5849802371541502, + "grad_norm": 2.303642749786377, + "learning_rate": 4.726190476190476e-05, + "loss": 2.6163, + "step": 148 + }, + { + "epoch": 0.5889328063241107, + "grad_norm": 2.1462321281433105, + "learning_rate": 4.7242063492063494e-05, + "loss": 2.1224, + "step": 149 + }, + { + "epoch": 0.5928853754940712, + "grad_norm": 1.8279284238815308, + "learning_rate": 4.722222222222222e-05, + "loss": 2.1556, + "step": 150 + }, + { + "epoch": 0.5968379446640316, + "grad_norm": 2.0945279598236084, + "learning_rate": 4.7202380952380955e-05, + "loss": 2.2004, + "step": 151 + }, + { + "epoch": 0.6007905138339921, + "grad_norm": 2.0842084884643555, + "learning_rate": 4.718253968253968e-05, + "loss": 1.9211, + "step": 152 + }, + { + "epoch": 0.6047430830039525, + "grad_norm": 1.9995882511138916, + "learning_rate": 4.7162698412698416e-05, + "loss": 2.116, + "step": 153 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 2.471076250076294, + "learning_rate": 4.714285714285714e-05, + "loss": 2.035, + "step": 154 + }, + { + "epoch": 0.6126482213438735, + "grad_norm": 2.1906332969665527, + "learning_rate": 4.7123015873015876e-05, + "loss": 1.8779, + "step": 155 + }, + { + "epoch": 0.616600790513834, + "grad_norm": 2.511838674545288, + "learning_rate": 4.71031746031746e-05, + "loss": 1.9335, + "step": 156 + }, + { + "epoch": 0.6205533596837944, + "grad_norm": 1.754230260848999, + "learning_rate": 4.708333333333334e-05, + "loss": 2.1955, + "step": 157 + }, + { + "epoch": 0.6245059288537549, + "grad_norm": 1.9571726322174072, + "learning_rate": 4.7063492063492064e-05, + "loss": 2.0802, + "step": 158 + }, + { + "epoch": 0.6284584980237155, + "grad_norm": 2.271517038345337, + "learning_rate": 4.70436507936508e-05, + "loss": 2.2352, + "step": 159 + }, + { + "epoch": 0.6324110671936759, + "grad_norm": 1.9034878015518188, + "learning_rate": 4.7023809523809525e-05, + "loss": 2.2499, + "step": 160 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.9912493228912354, + "learning_rate": 4.700396825396826e-05, + "loss": 2.6197, + "step": 161 + }, + { + "epoch": 0.6403162055335968, + "grad_norm": 2.255777597427368, + "learning_rate": 4.6984126984126986e-05, + "loss": 2.36, + "step": 162 + }, + { + "epoch": 0.6442687747035574, + "grad_norm": 2.2219536304473877, + "learning_rate": 4.696428571428572e-05, + "loss": 1.9835, + "step": 163 + }, + { + "epoch": 0.6482213438735178, + "grad_norm": 2.551605224609375, + "learning_rate": 4.6944444444444446e-05, + "loss": 2.2187, + "step": 164 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 2.3967275619506836, + "learning_rate": 4.692460317460317e-05, + "loss": 1.8407, + "step": 165 + }, + { + "epoch": 0.6561264822134387, + "grad_norm": 2.194493293762207, + "learning_rate": 4.690476190476191e-05, + "loss": 2.3458, + "step": 166 + }, + { + "epoch": 0.6600790513833992, + "grad_norm": 1.9432865381240845, + "learning_rate": 4.6884920634920634e-05, + "loss": 2.2172, + "step": 167 + }, + { + "epoch": 0.6640316205533597, + "grad_norm": 2.18040132522583, + "learning_rate": 4.686507936507937e-05, + "loss": 2.1371, + "step": 168 + }, + { + "epoch": 0.6679841897233202, + "grad_norm": 2.4075417518615723, + "learning_rate": 4.6845238095238095e-05, + "loss": 2.2766, + "step": 169 + }, + { + "epoch": 0.6719367588932806, + "grad_norm": 1.8353605270385742, + "learning_rate": 4.682539682539683e-05, + "loss": 1.809, + "step": 170 + }, + { + "epoch": 0.6758893280632411, + "grad_norm": 2.305044651031494, + "learning_rate": 4.6805555555555556e-05, + "loss": 1.9724, + "step": 171 + }, + { + "epoch": 0.6798418972332015, + "grad_norm": 2.7235798835754395, + "learning_rate": 4.678571428571429e-05, + "loss": 1.6704, + "step": 172 + }, + { + "epoch": 0.6837944664031621, + "grad_norm": 2.5702693462371826, + "learning_rate": 4.6765873015873016e-05, + "loss": 2.0057, + "step": 173 + }, + { + "epoch": 0.6877470355731226, + "grad_norm": 1.952614188194275, + "learning_rate": 4.674603174603175e-05, + "loss": 2.3162, + "step": 174 + }, + { + "epoch": 0.691699604743083, + "grad_norm": 2.334252119064331, + "learning_rate": 4.672619047619048e-05, + "loss": 2.2679, + "step": 175 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 2.064568519592285, + "learning_rate": 4.670634920634921e-05, + "loss": 2.2213, + "step": 176 + }, + { + "epoch": 0.6996047430830039, + "grad_norm": 1.8959503173828125, + "learning_rate": 4.668650793650794e-05, + "loss": 2.4685, + "step": 177 + }, + { + "epoch": 0.7035573122529645, + "grad_norm": 2.5481746196746826, + "learning_rate": 4.666666666666667e-05, + "loss": 2.2007, + "step": 178 + }, + { + "epoch": 0.7075098814229249, + "grad_norm": 1.9843651056289673, + "learning_rate": 4.66468253968254e-05, + "loss": 2.2955, + "step": 179 + }, + { + "epoch": 0.7114624505928854, + "grad_norm": 1.9685429334640503, + "learning_rate": 4.662698412698413e-05, + "loss": 2.0954, + "step": 180 + }, + { + "epoch": 0.7154150197628458, + "grad_norm": 2.216379165649414, + "learning_rate": 4.660714285714286e-05, + "loss": 2.3786, + "step": 181 + }, + { + "epoch": 0.7193675889328063, + "grad_norm": 2.144599437713623, + "learning_rate": 4.658730158730159e-05, + "loss": 2.1757, + "step": 182 + }, + { + "epoch": 0.7233201581027668, + "grad_norm": 1.829734206199646, + "learning_rate": 4.656746031746032e-05, + "loss": 2.2259, + "step": 183 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 1.8877259492874146, + "learning_rate": 4.6547619047619054e-05, + "loss": 2.1431, + "step": 184 + }, + { + "epoch": 0.7312252964426877, + "grad_norm": 1.9971317052841187, + "learning_rate": 4.652777777777778e-05, + "loss": 2.263, + "step": 185 + }, + { + "epoch": 0.7351778656126482, + "grad_norm": 2.129324436187744, + "learning_rate": 4.6507936507936515e-05, + "loss": 2.1523, + "step": 186 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 2.2705445289611816, + "learning_rate": 4.648809523809524e-05, + "loss": 2.0131, + "step": 187 + }, + { + "epoch": 0.7430830039525692, + "grad_norm": 1.9856712818145752, + "learning_rate": 4.646825396825397e-05, + "loss": 1.8903, + "step": 188 + }, + { + "epoch": 0.7470355731225297, + "grad_norm": 2.1891982555389404, + "learning_rate": 4.64484126984127e-05, + "loss": 2.2185, + "step": 189 + }, + { + "epoch": 0.7509881422924901, + "grad_norm": 2.514817476272583, + "learning_rate": 4.642857142857143e-05, + "loss": 2.3529, + "step": 190 + }, + { + "epoch": 0.7549407114624506, + "grad_norm": 1.9026365280151367, + "learning_rate": 4.640873015873016e-05, + "loss": 2.4147, + "step": 191 + }, + { + "epoch": 0.758893280632411, + "grad_norm": 2.04667329788208, + "learning_rate": 4.638888888888889e-05, + "loss": 1.9511, + "step": 192 + }, + { + "epoch": 0.7628458498023716, + "grad_norm": 1.8381803035736084, + "learning_rate": 4.6369047619047624e-05, + "loss": 2.1575, + "step": 193 + }, + { + "epoch": 0.766798418972332, + "grad_norm": 2.05672550201416, + "learning_rate": 4.634920634920635e-05, + "loss": 2.0799, + "step": 194 + }, + { + "epoch": 0.7707509881422925, + "grad_norm": 1.973151445388794, + "learning_rate": 4.6329365079365085e-05, + "loss": 2.2552, + "step": 195 + }, + { + "epoch": 0.7747035573122529, + "grad_norm": 1.8159013986587524, + "learning_rate": 4.630952380952381e-05, + "loss": 1.8409, + "step": 196 + }, + { + "epoch": 0.7786561264822134, + "grad_norm": 2.1178860664367676, + "learning_rate": 4.6289682539682545e-05, + "loss": 2.1587, + "step": 197 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 2.200260877609253, + "learning_rate": 4.626984126984127e-05, + "loss": 1.5963, + "step": 198 + }, + { + "epoch": 0.7865612648221344, + "grad_norm": 2.7356910705566406, + "learning_rate": 4.6250000000000006e-05, + "loss": 1.7795, + "step": 199 + }, + { + "epoch": 0.7905138339920948, + "grad_norm": 2.6005921363830566, + "learning_rate": 4.623015873015873e-05, + "loss": 2.07, + "step": 200 + }, + { + "epoch": 0.7944664031620553, + "grad_norm": 2.155571937561035, + "learning_rate": 4.621031746031747e-05, + "loss": 1.9797, + "step": 201 + }, + { + "epoch": 0.7984189723320159, + "grad_norm": 1.8139017820358276, + "learning_rate": 4.6190476190476194e-05, + "loss": 2.0915, + "step": 202 + }, + { + "epoch": 0.8023715415019763, + "grad_norm": 2.1978001594543457, + "learning_rate": 4.617063492063493e-05, + "loss": 2.0828, + "step": 203 + }, + { + "epoch": 0.8063241106719368, + "grad_norm": 2.4255006313323975, + "learning_rate": 4.6150793650793655e-05, + "loss": 1.8046, + "step": 204 + }, + { + "epoch": 0.8102766798418972, + "grad_norm": 2.3868203163146973, + "learning_rate": 4.613095238095239e-05, + "loss": 2.0577, + "step": 205 + }, + { + "epoch": 0.8142292490118577, + "grad_norm": 1.7391860485076904, + "learning_rate": 4.6111111111111115e-05, + "loss": 2.292, + "step": 206 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.9303078651428223, + "learning_rate": 4.609126984126984e-05, + "loss": 1.8133, + "step": 207 + }, + { + "epoch": 0.8221343873517787, + "grad_norm": 1.9964845180511475, + "learning_rate": 4.607142857142857e-05, + "loss": 2.1007, + "step": 208 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 2.102794885635376, + "learning_rate": 4.60515873015873e-05, + "loss": 2.2607, + "step": 209 + }, + { + "epoch": 0.8300395256916996, + "grad_norm": 2.2853715419769287, + "learning_rate": 4.603174603174603e-05, + "loss": 2.2519, + "step": 210 + }, + { + "epoch": 0.83399209486166, + "grad_norm": 1.8562860488891602, + "learning_rate": 4.6011904761904764e-05, + "loss": 2.1676, + "step": 211 + }, + { + "epoch": 0.8379446640316206, + "grad_norm": 2.448793649673462, + "learning_rate": 4.599206349206349e-05, + "loss": 2.1657, + "step": 212 + }, + { + "epoch": 0.841897233201581, + "grad_norm": 1.8922572135925293, + "learning_rate": 4.5972222222222225e-05, + "loss": 1.7849, + "step": 213 + }, + { + "epoch": 0.8458498023715415, + "grad_norm": 2.190263509750366, + "learning_rate": 4.595238095238095e-05, + "loss": 2.1807, + "step": 214 + }, + { + "epoch": 0.849802371541502, + "grad_norm": 2.0355405807495117, + "learning_rate": 4.5932539682539685e-05, + "loss": 2.163, + "step": 215 + }, + { + "epoch": 0.8537549407114624, + "grad_norm": 2.3606069087982178, + "learning_rate": 4.591269841269841e-05, + "loss": 2.3872, + "step": 216 + }, + { + "epoch": 0.857707509881423, + "grad_norm": 2.091801166534424, + "learning_rate": 4.5892857142857146e-05, + "loss": 2.0303, + "step": 217 + }, + { + "epoch": 0.8616600790513834, + "grad_norm": 2.5148980617523193, + "learning_rate": 4.587301587301587e-05, + "loss": 1.7546, + "step": 218 + }, + { + "epoch": 0.8656126482213439, + "grad_norm": 2.172477960586548, + "learning_rate": 4.58531746031746e-05, + "loss": 2.0569, + "step": 219 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 1.820509433746338, + "learning_rate": 4.5833333333333334e-05, + "loss": 2.2322, + "step": 220 + }, + { + "epoch": 0.8735177865612648, + "grad_norm": 1.5621439218521118, + "learning_rate": 4.581349206349206e-05, + "loss": 2.0088, + "step": 221 + }, + { + "epoch": 0.8774703557312253, + "grad_norm": 2.2147130966186523, + "learning_rate": 4.5793650793650795e-05, + "loss": 1.9092, + "step": 222 + }, + { + "epoch": 0.8814229249011858, + "grad_norm": 2.103334426879883, + "learning_rate": 4.577380952380952e-05, + "loss": 1.9992, + "step": 223 + }, + { + "epoch": 0.8853754940711462, + "grad_norm": 1.926761507987976, + "learning_rate": 4.5753968253968255e-05, + "loss": 1.8496, + "step": 224 + }, + { + "epoch": 0.8893280632411067, + "grad_norm": 2.040013313293457, + "learning_rate": 4.573412698412698e-05, + "loss": 1.9573, + "step": 225 + }, + { + "epoch": 0.8932806324110671, + "grad_norm": 1.969488263130188, + "learning_rate": 4.5714285714285716e-05, + "loss": 2.1783, + "step": 226 + }, + { + "epoch": 0.8972332015810277, + "grad_norm": 2.0519626140594482, + "learning_rate": 4.569444444444444e-05, + "loss": 1.9827, + "step": 227 + }, + { + "epoch": 0.9011857707509882, + "grad_norm": 2.2971911430358887, + "learning_rate": 4.567460317460318e-05, + "loss": 1.9498, + "step": 228 + }, + { + "epoch": 0.9051383399209486, + "grad_norm": 2.0618982315063477, + "learning_rate": 4.5654761904761904e-05, + "loss": 1.8099, + "step": 229 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.670811414718628, + "learning_rate": 4.563492063492064e-05, + "loss": 2.1808, + "step": 230 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 3.3800909519195557, + "learning_rate": 4.5615079365079365e-05, + "loss": 1.458, + "step": 231 + }, + { + "epoch": 0.9169960474308301, + "grad_norm": 2.106755495071411, + "learning_rate": 4.55952380952381e-05, + "loss": 2.1909, + "step": 232 + }, + { + "epoch": 0.9209486166007905, + "grad_norm": 2.3558948040008545, + "learning_rate": 4.5575396825396825e-05, + "loss": 2.1529, + "step": 233 + }, + { + "epoch": 0.924901185770751, + "grad_norm": 2.1475250720977783, + "learning_rate": 4.555555555555556e-05, + "loss": 2.0272, + "step": 234 + }, + { + "epoch": 0.9288537549407114, + "grad_norm": 2.1931400299072266, + "learning_rate": 4.5535714285714286e-05, + "loss": 2.0567, + "step": 235 + }, + { + "epoch": 0.932806324110672, + "grad_norm": 2.34619140625, + "learning_rate": 4.551587301587302e-05, + "loss": 2.36, + "step": 236 + }, + { + "epoch": 0.9367588932806324, + "grad_norm": 1.8363399505615234, + "learning_rate": 4.549603174603175e-05, + "loss": 2.2903, + "step": 237 + }, + { + "epoch": 0.9407114624505929, + "grad_norm": 2.0586445331573486, + "learning_rate": 4.547619047619048e-05, + "loss": 1.8766, + "step": 238 + }, + { + "epoch": 0.9446640316205533, + "grad_norm": 1.7671842575073242, + "learning_rate": 4.545634920634921e-05, + "loss": 1.7877, + "step": 239 + }, + { + "epoch": 0.9486166007905138, + "grad_norm": 2.8485286235809326, + "learning_rate": 4.543650793650794e-05, + "loss": 1.9066, + "step": 240 + }, + { + "epoch": 0.9525691699604744, + "grad_norm": 2.1801576614379883, + "learning_rate": 4.541666666666667e-05, + "loss": 2.2792, + "step": 241 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 1.9892218112945557, + "learning_rate": 4.5396825396825395e-05, + "loss": 2.1138, + "step": 242 + }, + { + "epoch": 0.9604743083003953, + "grad_norm": 2.0810766220092773, + "learning_rate": 4.537698412698413e-05, + "loss": 1.8441, + "step": 243 + }, + { + "epoch": 0.9644268774703557, + "grad_norm": 2.243373155593872, + "learning_rate": 4.5357142857142856e-05, + "loss": 2.0694, + "step": 244 + }, + { + "epoch": 0.9683794466403162, + "grad_norm": 2.5647904872894287, + "learning_rate": 4.533730158730159e-05, + "loss": 1.861, + "step": 245 + }, + { + "epoch": 0.9723320158102767, + "grad_norm": 1.625081181526184, + "learning_rate": 4.531746031746032e-05, + "loss": 2.1496, + "step": 246 + }, + { + "epoch": 0.9762845849802372, + "grad_norm": 1.7020546197891235, + "learning_rate": 4.529761904761905e-05, + "loss": 1.9332, + "step": 247 + }, + { + "epoch": 0.9802371541501976, + "grad_norm": 1.802681565284729, + "learning_rate": 4.527777777777778e-05, + "loss": 1.8204, + "step": 248 + }, + { + "epoch": 0.9841897233201581, + "grad_norm": 1.7810888290405273, + "learning_rate": 4.525793650793651e-05, + "loss": 1.8915, + "step": 249 + }, + { + "epoch": 0.9881422924901185, + "grad_norm": 1.6715744733810425, + "learning_rate": 4.523809523809524e-05, + "loss": 2.0593, + "step": 250 + }, + { + "epoch": 0.9920948616600791, + "grad_norm": 2.5212719440460205, + "learning_rate": 4.521825396825397e-05, + "loss": 2.0725, + "step": 251 + }, + { + "epoch": 0.9960474308300395, + "grad_norm": 2.189178943634033, + "learning_rate": 4.51984126984127e-05, + "loss": 2.0336, + "step": 252 + }, + { + "epoch": 1.0, + "grad_norm": 2.2733006477355957, + "learning_rate": 4.517857142857143e-05, + "loss": 1.9606, + "step": 253 + }, + { + "epoch": 1.0039525691699605, + "grad_norm": 1.8183317184448242, + "learning_rate": 4.515873015873016e-05, + "loss": 1.9106, + "step": 254 + }, + { + "epoch": 1.007905138339921, + "grad_norm": 2.471332550048828, + "learning_rate": 4.5138888888888894e-05, + "loss": 1.9963, + "step": 255 + }, + { + "epoch": 1.0118577075098814, + "grad_norm": 1.8422023057937622, + "learning_rate": 4.511904761904762e-05, + "loss": 2.0978, + "step": 256 + }, + { + "epoch": 1.0158102766798418, + "grad_norm": 1.79561185836792, + "learning_rate": 4.5099206349206354e-05, + "loss": 1.4994, + "step": 257 + }, + { + "epoch": 1.0197628458498025, + "grad_norm": 2.271358013153076, + "learning_rate": 4.507936507936508e-05, + "loss": 1.7649, + "step": 258 + }, + { + "epoch": 1.023715415019763, + "grad_norm": 2.6813673973083496, + "learning_rate": 4.5059523809523815e-05, + "loss": 1.8252, + "step": 259 + }, + { + "epoch": 1.0276679841897234, + "grad_norm": 1.8112016916275024, + "learning_rate": 4.503968253968254e-05, + "loss": 1.7883, + "step": 260 + }, + { + "epoch": 1.0316205533596838, + "grad_norm": 2.4050188064575195, + "learning_rate": 4.5019841269841276e-05, + "loss": 1.9855, + "step": 261 + }, + { + "epoch": 1.0355731225296443, + "grad_norm": 2.38291072845459, + "learning_rate": 4.5e-05, + "loss": 1.8171, + "step": 262 + }, + { + "epoch": 1.0395256916996047, + "grad_norm": 2.221317768096924, + "learning_rate": 4.4980158730158737e-05, + "loss": 1.5565, + "step": 263 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 2.3104920387268066, + "learning_rate": 4.4960317460317464e-05, + "loss": 1.926, + "step": 264 + }, + { + "epoch": 1.0474308300395256, + "grad_norm": 2.0159404277801514, + "learning_rate": 4.494047619047619e-05, + "loss": 1.6123, + "step": 265 + }, + { + "epoch": 1.051383399209486, + "grad_norm": 2.0315968990325928, + "learning_rate": 4.4920634920634924e-05, + "loss": 1.9577, + "step": 266 + }, + { + "epoch": 1.0553359683794465, + "grad_norm": 2.021657943725586, + "learning_rate": 4.490079365079365e-05, + "loss": 1.7845, + "step": 267 + }, + { + "epoch": 1.0592885375494072, + "grad_norm": 2.1020936965942383, + "learning_rate": 4.4880952380952385e-05, + "loss": 1.8233, + "step": 268 + }, + { + "epoch": 1.0632411067193677, + "grad_norm": 2.0355613231658936, + "learning_rate": 4.486111111111111e-05, + "loss": 1.9241, + "step": 269 + }, + { + "epoch": 1.0671936758893281, + "grad_norm": 1.797027349472046, + "learning_rate": 4.4841269841269846e-05, + "loss": 1.7056, + "step": 270 + }, + { + "epoch": 1.0711462450592886, + "grad_norm": 2.725644826889038, + "learning_rate": 4.482142857142857e-05, + "loss": 1.4221, + "step": 271 + }, + { + "epoch": 1.075098814229249, + "grad_norm": 2.5590972900390625, + "learning_rate": 4.4801587301587307e-05, + "loss": 1.7194, + "step": 272 + }, + { + "epoch": 1.0790513833992095, + "grad_norm": 2.4488916397094727, + "learning_rate": 4.4781746031746034e-05, + "loss": 1.3487, + "step": 273 + }, + { + "epoch": 1.08300395256917, + "grad_norm": 2.245779037475586, + "learning_rate": 4.476190476190477e-05, + "loss": 1.6895, + "step": 274 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 2.3138208389282227, + "learning_rate": 4.4742063492063494e-05, + "loss": 2.0797, + "step": 275 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 2.106358528137207, + "learning_rate": 4.472222222222223e-05, + "loss": 1.7057, + "step": 276 + }, + { + "epoch": 1.0948616600790513, + "grad_norm": 2.0810418128967285, + "learning_rate": 4.4702380952380955e-05, + "loss": 1.6139, + "step": 277 + }, + { + "epoch": 1.098814229249012, + "grad_norm": 2.3179874420166016, + "learning_rate": 4.468253968253969e-05, + "loss": 1.7748, + "step": 278 + }, + { + "epoch": 1.1027667984189724, + "grad_norm": 2.4948816299438477, + "learning_rate": 4.4662698412698416e-05, + "loss": 2.0072, + "step": 279 + }, + { + "epoch": 1.1067193675889329, + "grad_norm": 2.1853582859039307, + "learning_rate": 4.464285714285715e-05, + "loss": 2.017, + "step": 280 + }, + { + "epoch": 1.1106719367588933, + "grad_norm": 2.066575288772583, + "learning_rate": 4.4623015873015877e-05, + "loss": 1.7979, + "step": 281 + }, + { + "epoch": 1.1146245059288538, + "grad_norm": 2.454230785369873, + "learning_rate": 4.460317460317461e-05, + "loss": 1.6153, + "step": 282 + }, + { + "epoch": 1.1185770750988142, + "grad_norm": 2.088303804397583, + "learning_rate": 4.458333333333334e-05, + "loss": 1.7366, + "step": 283 + }, + { + "epoch": 1.1225296442687747, + "grad_norm": 2.0203330516815186, + "learning_rate": 4.456349206349207e-05, + "loss": 1.8878, + "step": 284 + }, + { + "epoch": 1.1264822134387351, + "grad_norm": 1.8328367471694946, + "learning_rate": 4.45436507936508e-05, + "loss": 1.4853, + "step": 285 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 2.1855709552764893, + "learning_rate": 4.4523809523809525e-05, + "loss": 2.1256, + "step": 286 + }, + { + "epoch": 1.1343873517786562, + "grad_norm": 2.0757203102111816, + "learning_rate": 4.450396825396825e-05, + "loss": 2.1395, + "step": 287 + }, + { + "epoch": 1.1383399209486167, + "grad_norm": 2.217658519744873, + "learning_rate": 4.4484126984126986e-05, + "loss": 1.6217, + "step": 288 + }, + { + "epoch": 1.1422924901185771, + "grad_norm": 2.346212387084961, + "learning_rate": 4.446428571428571e-05, + "loss": 1.6561, + "step": 289 + }, + { + "epoch": 1.1462450592885376, + "grad_norm": 2.340932607650757, + "learning_rate": 4.4444444444444447e-05, + "loss": 1.5699, + "step": 290 + }, + { + "epoch": 1.150197628458498, + "grad_norm": 2.2137064933776855, + "learning_rate": 4.4424603174603174e-05, + "loss": 1.9642, + "step": 291 + }, + { + "epoch": 1.1541501976284585, + "grad_norm": 1.6653351783752441, + "learning_rate": 4.440476190476191e-05, + "loss": 1.8973, + "step": 292 + }, + { + "epoch": 1.158102766798419, + "grad_norm": 2.238419771194458, + "learning_rate": 4.4384920634920634e-05, + "loss": 1.7091, + "step": 293 + }, + { + "epoch": 1.1620553359683794, + "grad_norm": 1.8844563961029053, + "learning_rate": 4.436507936507937e-05, + "loss": 1.8355, + "step": 294 + }, + { + "epoch": 1.1660079051383399, + "grad_norm": 1.5482860803604126, + "learning_rate": 4.4345238095238095e-05, + "loss": 1.8152, + "step": 295 + }, + { + "epoch": 1.1699604743083003, + "grad_norm": 2.046618700027466, + "learning_rate": 4.432539682539683e-05, + "loss": 1.5727, + "step": 296 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 1.9557174444198608, + "learning_rate": 4.4305555555555556e-05, + "loss": 1.2086, + "step": 297 + }, + { + "epoch": 1.1778656126482214, + "grad_norm": 2.5918216705322266, + "learning_rate": 4.428571428571428e-05, + "loss": 1.8132, + "step": 298 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 2.1008517742156982, + "learning_rate": 4.4265873015873017e-05, + "loss": 1.7421, + "step": 299 + }, + { + "epoch": 1.1857707509881423, + "grad_norm": 2.3811569213867188, + "learning_rate": 4.4246031746031744e-05, + "loss": 1.9358, + "step": 300 + }, + { + "epoch": 1.1897233201581028, + "grad_norm": 2.317112922668457, + "learning_rate": 4.422619047619048e-05, + "loss": 1.8341, + "step": 301 + }, + { + "epoch": 1.1936758893280632, + "grad_norm": 1.8912357091903687, + "learning_rate": 4.4206349206349204e-05, + "loss": 1.7062, + "step": 302 + }, + { + "epoch": 1.1976284584980237, + "grad_norm": 1.7493529319763184, + "learning_rate": 4.418650793650794e-05, + "loss": 1.9519, + "step": 303 + }, + { + "epoch": 1.2015810276679841, + "grad_norm": 2.0161383152008057, + "learning_rate": 4.4166666666666665e-05, + "loss": 1.6137, + "step": 304 + }, + { + "epoch": 1.2055335968379446, + "grad_norm": 1.7949028015136719, + "learning_rate": 4.41468253968254e-05, + "loss": 1.7514, + "step": 305 + }, + { + "epoch": 1.2094861660079053, + "grad_norm": 2.1296987533569336, + "learning_rate": 4.4126984126984126e-05, + "loss": 1.6876, + "step": 306 + }, + { + "epoch": 1.2134387351778657, + "grad_norm": 2.3091883659362793, + "learning_rate": 4.410714285714286e-05, + "loss": 1.9752, + "step": 307 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 2.804625988006592, + "learning_rate": 4.4087301587301587e-05, + "loss": 1.7697, + "step": 308 + }, + { + "epoch": 1.2213438735177866, + "grad_norm": 1.7799369096755981, + "learning_rate": 4.406746031746032e-05, + "loss": 1.8301, + "step": 309 + }, + { + "epoch": 1.225296442687747, + "grad_norm": 2.0240299701690674, + "learning_rate": 4.404761904761905e-05, + "loss": 1.6604, + "step": 310 + }, + { + "epoch": 1.2292490118577075, + "grad_norm": 2.3005123138427734, + "learning_rate": 4.402777777777778e-05, + "loss": 1.8875, + "step": 311 + }, + { + "epoch": 1.233201581027668, + "grad_norm": 2.1108436584472656, + "learning_rate": 4.400793650793651e-05, + "loss": 1.8508, + "step": 312 + }, + { + "epoch": 1.2371541501976284, + "grad_norm": 2.7670435905456543, + "learning_rate": 4.398809523809524e-05, + "loss": 1.8427, + "step": 313 + }, + { + "epoch": 1.2411067193675889, + "grad_norm": 2.1103644371032715, + "learning_rate": 4.396825396825397e-05, + "loss": 1.5623, + "step": 314 + }, + { + "epoch": 1.2450592885375493, + "grad_norm": 2.517120838165283, + "learning_rate": 4.39484126984127e-05, + "loss": 1.688, + "step": 315 + }, + { + "epoch": 1.2490118577075098, + "grad_norm": 1.799034595489502, + "learning_rate": 4.392857142857143e-05, + "loss": 1.7609, + "step": 316 + }, + { + "epoch": 1.2529644268774702, + "grad_norm": 2.450300455093384, + "learning_rate": 4.390873015873016e-05, + "loss": 1.6296, + "step": 317 + }, + { + "epoch": 1.256916996047431, + "grad_norm": 2.148935317993164, + "learning_rate": 4.388888888888889e-05, + "loss": 1.849, + "step": 318 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 2.3157055377960205, + "learning_rate": 4.3869047619047624e-05, + "loss": 1.62, + "step": 319 + }, + { + "epoch": 1.2648221343873518, + "grad_norm": 1.6752326488494873, + "learning_rate": 4.384920634920635e-05, + "loss": 1.594, + "step": 320 + }, + { + "epoch": 1.2687747035573123, + "grad_norm": 2.3844189643859863, + "learning_rate": 4.382936507936508e-05, + "loss": 2.033, + "step": 321 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 2.3986222743988037, + "learning_rate": 4.380952380952381e-05, + "loss": 1.4159, + "step": 322 + }, + { + "epoch": 1.2766798418972332, + "grad_norm": 1.919364094734192, + "learning_rate": 4.378968253968254e-05, + "loss": 1.9236, + "step": 323 + }, + { + "epoch": 1.2806324110671936, + "grad_norm": 2.3783788681030273, + "learning_rate": 4.376984126984127e-05, + "loss": 1.8965, + "step": 324 + }, + { + "epoch": 1.2845849802371543, + "grad_norm": 2.122201919555664, + "learning_rate": 4.375e-05, + "loss": 1.7124, + "step": 325 + }, + { + "epoch": 1.2885375494071147, + "grad_norm": 1.829590082168579, + "learning_rate": 4.373015873015873e-05, + "loss": 1.7605, + "step": 326 + }, + { + "epoch": 1.2924901185770752, + "grad_norm": 1.9862433671951294, + "learning_rate": 4.371031746031746e-05, + "loss": 1.6903, + "step": 327 + }, + { + "epoch": 1.2964426877470356, + "grad_norm": 2.1671435832977295, + "learning_rate": 4.3690476190476194e-05, + "loss": 1.9038, + "step": 328 + }, + { + "epoch": 1.300395256916996, + "grad_norm": 2.0836524963378906, + "learning_rate": 4.367063492063492e-05, + "loss": 1.8675, + "step": 329 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 2.2062249183654785, + "learning_rate": 4.3650793650793655e-05, + "loss": 1.7196, + "step": 330 + }, + { + "epoch": 1.308300395256917, + "grad_norm": 2.018446683883667, + "learning_rate": 4.363095238095238e-05, + "loss": 1.6995, + "step": 331 + }, + { + "epoch": 1.3122529644268774, + "grad_norm": 2.521754026412964, + "learning_rate": 4.3611111111111116e-05, + "loss": 1.649, + "step": 332 + }, + { + "epoch": 1.316205533596838, + "grad_norm": 2.223747968673706, + "learning_rate": 4.359126984126984e-05, + "loss": 1.5682, + "step": 333 + }, + { + "epoch": 1.3201581027667983, + "grad_norm": 2.356834650039673, + "learning_rate": 4.3571428571428576e-05, + "loss": 1.7477, + "step": 334 + }, + { + "epoch": 1.3241106719367588, + "grad_norm": 1.9272010326385498, + "learning_rate": 4.35515873015873e-05, + "loss": 1.8585, + "step": 335 + }, + { + "epoch": 1.3280632411067192, + "grad_norm": 2.0459604263305664, + "learning_rate": 4.353174603174604e-05, + "loss": 1.5119, + "step": 336 + }, + { + "epoch": 1.33201581027668, + "grad_norm": 1.9021974802017212, + "learning_rate": 4.3511904761904764e-05, + "loss": 1.9379, + "step": 337 + }, + { + "epoch": 1.3359683794466404, + "grad_norm": 2.5137360095977783, + "learning_rate": 4.34920634920635e-05, + "loss": 1.8224, + "step": 338 + }, + { + "epoch": 1.3399209486166008, + "grad_norm": 1.5489245653152466, + "learning_rate": 4.3472222222222225e-05, + "loss": 1.8488, + "step": 339 + }, + { + "epoch": 1.3438735177865613, + "grad_norm": 1.811347484588623, + "learning_rate": 4.345238095238096e-05, + "loss": 1.5281, + "step": 340 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 2.5799477100372314, + "learning_rate": 4.3432539682539686e-05, + "loss": 1.7175, + "step": 341 + }, + { + "epoch": 1.3517786561264822, + "grad_norm": 2.4223809242248535, + "learning_rate": 4.341269841269842e-05, + "loss": 1.6518, + "step": 342 + }, + { + "epoch": 1.3557312252964426, + "grad_norm": 2.713998556137085, + "learning_rate": 4.3392857142857146e-05, + "loss": 1.6456, + "step": 343 + }, + { + "epoch": 1.359683794466403, + "grad_norm": 2.6156938076019287, + "learning_rate": 4.337301587301587e-05, + "loss": 1.6932, + "step": 344 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 2.451413154602051, + "learning_rate": 4.335317460317461e-05, + "loss": 1.7987, + "step": 345 + }, + { + "epoch": 1.3675889328063242, + "grad_norm": 1.895689845085144, + "learning_rate": 4.3333333333333334e-05, + "loss": 1.7549, + "step": 346 + }, + { + "epoch": 1.3715415019762847, + "grad_norm": 2.4354727268218994, + "learning_rate": 4.331349206349207e-05, + "loss": 1.5661, + "step": 347 + }, + { + "epoch": 1.3754940711462451, + "grad_norm": 2.5790798664093018, + "learning_rate": 4.3293650793650795e-05, + "loss": 1.89, + "step": 348 + }, + { + "epoch": 1.3794466403162056, + "grad_norm": 1.9985342025756836, + "learning_rate": 4.327380952380953e-05, + "loss": 1.5705, + "step": 349 + }, + { + "epoch": 1.383399209486166, + "grad_norm": 1.87380850315094, + "learning_rate": 4.3253968253968256e-05, + "loss": 1.8884, + "step": 350 + }, + { + "epoch": 1.3873517786561265, + "grad_norm": 2.4583888053894043, + "learning_rate": 4.323412698412699e-05, + "loss": 1.51, + "step": 351 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 2.0746865272521973, + "learning_rate": 4.3214285714285716e-05, + "loss": 1.688, + "step": 352 + }, + { + "epoch": 1.3952569169960474, + "grad_norm": 2.3721249103546143, + "learning_rate": 4.319444444444445e-05, + "loss": 1.2734, + "step": 353 + }, + { + "epoch": 1.3992094861660078, + "grad_norm": 2.44508695602417, + "learning_rate": 4.317460317460318e-05, + "loss": 1.8527, + "step": 354 + }, + { + "epoch": 1.4031620553359683, + "grad_norm": 2.2311763763427734, + "learning_rate": 4.315476190476191e-05, + "loss": 1.6745, + "step": 355 + }, + { + "epoch": 1.4071146245059287, + "grad_norm": 1.867505431175232, + "learning_rate": 4.313492063492064e-05, + "loss": 1.9571, + "step": 356 + }, + { + "epoch": 1.4110671936758894, + "grad_norm": 2.2237112522125244, + "learning_rate": 4.311507936507937e-05, + "loss": 2.133, + "step": 357 + }, + { + "epoch": 1.4150197628458498, + "grad_norm": 1.834166169166565, + "learning_rate": 4.30952380952381e-05, + "loss": 2.0071, + "step": 358 + }, + { + "epoch": 1.4189723320158103, + "grad_norm": 1.9390510320663452, + "learning_rate": 4.307539682539683e-05, + "loss": 1.5187, + "step": 359 + }, + { + "epoch": 1.4229249011857708, + "grad_norm": 2.7471730709075928, + "learning_rate": 4.305555555555556e-05, + "loss": 1.7342, + "step": 360 + }, + { + "epoch": 1.4268774703557312, + "grad_norm": 1.8465523719787598, + "learning_rate": 4.303571428571429e-05, + "loss": 1.9512, + "step": 361 + }, + { + "epoch": 1.4308300395256917, + "grad_norm": 2.100477933883667, + "learning_rate": 4.301587301587302e-05, + "loss": 1.6373, + "step": 362 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 2.298740863800049, + "learning_rate": 4.2996031746031754e-05, + "loss": 1.7411, + "step": 363 + }, + { + "epoch": 1.4387351778656128, + "grad_norm": 1.8366012573242188, + "learning_rate": 4.297619047619048e-05, + "loss": 1.89, + "step": 364 + }, + { + "epoch": 1.4426877470355732, + "grad_norm": 2.246143341064453, + "learning_rate": 4.295634920634921e-05, + "loss": 1.6918, + "step": 365 + }, + { + "epoch": 1.4466403162055337, + "grad_norm": 1.8526705503463745, + "learning_rate": 4.2936507936507935e-05, + "loss": 1.6653, + "step": 366 + }, + { + "epoch": 1.4505928853754941, + "grad_norm": 2.273653984069824, + "learning_rate": 4.291666666666667e-05, + "loss": 2.0948, + "step": 367 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 2.1943068504333496, + "learning_rate": 4.2896825396825396e-05, + "loss": 1.6798, + "step": 368 + }, + { + "epoch": 1.458498023715415, + "grad_norm": 1.9659279584884644, + "learning_rate": 4.287698412698413e-05, + "loss": 1.9008, + "step": 369 + }, + { + "epoch": 1.4624505928853755, + "grad_norm": 1.792734980583191, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.6039, + "step": 370 + }, + { + "epoch": 1.466403162055336, + "grad_norm": 2.0952234268188477, + "learning_rate": 4.283730158730159e-05, + "loss": 1.5815, + "step": 371 + }, + { + "epoch": 1.4703557312252964, + "grad_norm": 1.9501980543136597, + "learning_rate": 4.281746031746032e-05, + "loss": 1.5774, + "step": 372 + }, + { + "epoch": 1.4743083003952568, + "grad_norm": 1.960719347000122, + "learning_rate": 4.279761904761905e-05, + "loss": 1.5171, + "step": 373 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 2.022639751434326, + "learning_rate": 4.277777777777778e-05, + "loss": 1.8212, + "step": 374 + }, + { + "epoch": 1.4822134387351777, + "grad_norm": 1.9371771812438965, + "learning_rate": 4.2757936507936505e-05, + "loss": 1.6054, + "step": 375 + }, + { + "epoch": 1.4861660079051384, + "grad_norm": 2.072070598602295, + "learning_rate": 4.273809523809524e-05, + "loss": 1.6589, + "step": 376 + }, + { + "epoch": 1.4901185770750989, + "grad_norm": 1.8878108263015747, + "learning_rate": 4.2718253968253966e-05, + "loss": 1.9921, + "step": 377 + }, + { + "epoch": 1.4940711462450593, + "grad_norm": 1.9379626512527466, + "learning_rate": 4.26984126984127e-05, + "loss": 1.8345, + "step": 378 + }, + { + "epoch": 1.4980237154150198, + "grad_norm": 2.775383234024048, + "learning_rate": 4.2678571428571426e-05, + "loss": 1.8299, + "step": 379 + }, + { + "epoch": 1.5019762845849802, + "grad_norm": 2.041731119155884, + "learning_rate": 4.265873015873016e-05, + "loss": 1.4022, + "step": 380 + }, + { + "epoch": 1.5059288537549407, + "grad_norm": 2.0958447456359863, + "learning_rate": 4.263888888888889e-05, + "loss": 1.8758, + "step": 381 + }, + { + "epoch": 1.5098814229249014, + "grad_norm": 2.6933484077453613, + "learning_rate": 4.261904761904762e-05, + "loss": 2.0389, + "step": 382 + }, + { + "epoch": 1.5138339920948618, + "grad_norm": 2.594853401184082, + "learning_rate": 4.259920634920635e-05, + "loss": 1.5752, + "step": 383 + }, + { + "epoch": 1.5177865612648223, + "grad_norm": 2.6426239013671875, + "learning_rate": 4.257936507936508e-05, + "loss": 1.9273, + "step": 384 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 1.837033748626709, + "learning_rate": 4.255952380952381e-05, + "loss": 1.774, + "step": 385 + }, + { + "epoch": 1.5256916996047432, + "grad_norm": 2.4687540531158447, + "learning_rate": 4.253968253968254e-05, + "loss": 2.3856, + "step": 386 + }, + { + "epoch": 1.5296442687747036, + "grad_norm": 1.7977819442749023, + "learning_rate": 4.251984126984127e-05, + "loss": 1.6884, + "step": 387 + }, + { + "epoch": 1.533596837944664, + "grad_norm": 2.1651384830474854, + "learning_rate": 4.25e-05, + "loss": 2.0945, + "step": 388 + }, + { + "epoch": 1.5375494071146245, + "grad_norm": 1.796292781829834, + "learning_rate": 4.248015873015873e-05, + "loss": 1.7548, + "step": 389 + }, + { + "epoch": 1.541501976284585, + "grad_norm": 3.0309319496154785, + "learning_rate": 4.2460317460317464e-05, + "loss": 1.9628, + "step": 390 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.8279919624328613, + "learning_rate": 4.244047619047619e-05, + "loss": 1.9005, + "step": 391 + }, + { + "epoch": 1.5494071146245059, + "grad_norm": 1.9936280250549316, + "learning_rate": 4.2420634920634925e-05, + "loss": 1.7502, + "step": 392 + }, + { + "epoch": 1.5533596837944663, + "grad_norm": 1.728090763092041, + "learning_rate": 4.240079365079365e-05, + "loss": 1.8107, + "step": 393 + }, + { + "epoch": 1.5573122529644268, + "grad_norm": 1.614793300628662, + "learning_rate": 4.2380952380952385e-05, + "loss": 1.6821, + "step": 394 + }, + { + "epoch": 1.5612648221343872, + "grad_norm": 2.016058921813965, + "learning_rate": 4.236111111111111e-05, + "loss": 1.6821, + "step": 395 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 2.421929359436035, + "learning_rate": 4.2341269841269846e-05, + "loss": 2.0802, + "step": 396 + }, + { + "epoch": 1.5691699604743083, + "grad_norm": 1.9785467386245728, + "learning_rate": 4.232142857142857e-05, + "loss": 1.7954, + "step": 397 + }, + { + "epoch": 1.5731225296442688, + "grad_norm": 2.4265410900115967, + "learning_rate": 4.23015873015873e-05, + "loss": 1.6911, + "step": 398 + }, + { + "epoch": 1.5770750988142292, + "grad_norm": 2.7018771171569824, + "learning_rate": 4.2281746031746034e-05, + "loss": 1.7161, + "step": 399 + }, + { + "epoch": 1.5810276679841897, + "grad_norm": 1.9163415431976318, + "learning_rate": 4.226190476190476e-05, + "loss": 1.6967, + "step": 400 + }, + { + "epoch": 1.5849802371541502, + "grad_norm": 1.7845193147659302, + "learning_rate": 4.2242063492063495e-05, + "loss": 2.0456, + "step": 401 + }, + { + "epoch": 1.5889328063241108, + "grad_norm": 2.364121675491333, + "learning_rate": 4.222222222222222e-05, + "loss": 1.8798, + "step": 402 + }, + { + "epoch": 1.5928853754940713, + "grad_norm": 1.998205542564392, + "learning_rate": 4.2202380952380955e-05, + "loss": 1.6054, + "step": 403 + }, + { + "epoch": 1.5968379446640317, + "grad_norm": 2.847519874572754, + "learning_rate": 4.218253968253968e-05, + "loss": 1.723, + "step": 404 + }, + { + "epoch": 1.6007905138339922, + "grad_norm": 2.2237510681152344, + "learning_rate": 4.2162698412698416e-05, + "loss": 1.5771, + "step": 405 + }, + { + "epoch": 1.6047430830039526, + "grad_norm": 2.4652063846588135, + "learning_rate": 4.214285714285714e-05, + "loss": 1.9165, + "step": 406 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 1.9992256164550781, + "learning_rate": 4.212301587301588e-05, + "loss": 1.9398, + "step": 407 + }, + { + "epoch": 1.6126482213438735, + "grad_norm": 1.7840253114700317, + "learning_rate": 4.2103174603174604e-05, + "loss": 1.623, + "step": 408 + }, + { + "epoch": 1.616600790513834, + "grad_norm": 1.5577056407928467, + "learning_rate": 4.208333333333334e-05, + "loss": 1.6815, + "step": 409 + }, + { + "epoch": 1.6205533596837944, + "grad_norm": 2.6008100509643555, + "learning_rate": 4.2063492063492065e-05, + "loss": 1.6165, + "step": 410 + }, + { + "epoch": 1.6245059288537549, + "grad_norm": 1.9897792339324951, + "learning_rate": 4.20436507936508e-05, + "loss": 1.6959, + "step": 411 + }, + { + "epoch": 1.6284584980237153, + "grad_norm": 1.9050272703170776, + "learning_rate": 4.2023809523809525e-05, + "loss": 1.9483, + "step": 412 + }, + { + "epoch": 1.6324110671936758, + "grad_norm": 2.1082065105438232, + "learning_rate": 4.200396825396826e-05, + "loss": 1.6326, + "step": 413 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.8587292432785034, + "learning_rate": 4.1984126984126986e-05, + "loss": 1.5015, + "step": 414 + }, + { + "epoch": 1.6403162055335967, + "grad_norm": 2.440154552459717, + "learning_rate": 4.196428571428572e-05, + "loss": 1.8917, + "step": 415 + }, + { + "epoch": 1.6442687747035574, + "grad_norm": 2.1441152095794678, + "learning_rate": 4.194444444444445e-05, + "loss": 1.8155, + "step": 416 + }, + { + "epoch": 1.6482213438735178, + "grad_norm": 1.9019662141799927, + "learning_rate": 4.192460317460318e-05, + "loss": 1.9238, + "step": 417 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 1.8844928741455078, + "learning_rate": 4.190476190476191e-05, + "loss": 1.6799, + "step": 418 + }, + { + "epoch": 1.6561264822134387, + "grad_norm": 2.373283624649048, + "learning_rate": 4.188492063492064e-05, + "loss": 1.6916, + "step": 419 + }, + { + "epoch": 1.6600790513833992, + "grad_norm": 2.1077182292938232, + "learning_rate": 4.186507936507937e-05, + "loss": 1.924, + "step": 420 + }, + { + "epoch": 1.6640316205533598, + "grad_norm": 2.5388450622558594, + "learning_rate": 4.1845238095238095e-05, + "loss": 1.7846, + "step": 421 + }, + { + "epoch": 1.6679841897233203, + "grad_norm": 2.8166000843048096, + "learning_rate": 4.182539682539683e-05, + "loss": 1.5143, + "step": 422 + }, + { + "epoch": 1.6719367588932808, + "grad_norm": 1.919756531715393, + "learning_rate": 4.1805555555555556e-05, + "loss": 1.7846, + "step": 423 + }, + { + "epoch": 1.6758893280632412, + "grad_norm": 2.265761375427246, + "learning_rate": 4.178571428571429e-05, + "loss": 1.2774, + "step": 424 + }, + { + "epoch": 1.6798418972332017, + "grad_norm": 2.4432921409606934, + "learning_rate": 4.176587301587302e-05, + "loss": 1.637, + "step": 425 + }, + { + "epoch": 1.683794466403162, + "grad_norm": 2.095233678817749, + "learning_rate": 4.174603174603175e-05, + "loss": 1.4832, + "step": 426 + }, + { + "epoch": 1.6877470355731226, + "grad_norm": 2.602231740951538, + "learning_rate": 4.172619047619048e-05, + "loss": 1.5574, + "step": 427 + }, + { + "epoch": 1.691699604743083, + "grad_norm": 1.9906206130981445, + "learning_rate": 4.170634920634921e-05, + "loss": 1.5909, + "step": 428 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 2.3297181129455566, + "learning_rate": 4.168650793650794e-05, + "loss": 1.7061, + "step": 429 + }, + { + "epoch": 1.699604743083004, + "grad_norm": 2.3630523681640625, + "learning_rate": 4.166666666666667e-05, + "loss": 1.6981, + "step": 430 + }, + { + "epoch": 1.7035573122529644, + "grad_norm": 1.9802945852279663, + "learning_rate": 4.16468253968254e-05, + "loss": 1.7784, + "step": 431 + }, + { + "epoch": 1.7075098814229248, + "grad_norm": 2.8566882610321045, + "learning_rate": 4.162698412698413e-05, + "loss": 1.5392, + "step": 432 + }, + { + "epoch": 1.7114624505928853, + "grad_norm": 2.2854273319244385, + "learning_rate": 4.160714285714286e-05, + "loss": 1.7431, + "step": 433 + }, + { + "epoch": 1.7154150197628457, + "grad_norm": 1.9989619255065918, + "learning_rate": 4.1587301587301594e-05, + "loss": 1.8095, + "step": 434 + }, + { + "epoch": 1.7193675889328062, + "grad_norm": 2.6097142696380615, + "learning_rate": 4.156746031746032e-05, + "loss": 1.464, + "step": 435 + }, + { + "epoch": 1.7233201581027668, + "grad_norm": 2.048234462738037, + "learning_rate": 4.1547619047619054e-05, + "loss": 2.0557, + "step": 436 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 2.046173334121704, + "learning_rate": 4.152777777777778e-05, + "loss": 1.4132, + "step": 437 + }, + { + "epoch": 1.7312252964426877, + "grad_norm": 2.1724562644958496, + "learning_rate": 4.1507936507936515e-05, + "loss": 1.6773, + "step": 438 + }, + { + "epoch": 1.7351778656126482, + "grad_norm": 2.2624452114105225, + "learning_rate": 4.148809523809524e-05, + "loss": 1.6013, + "step": 439 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 2.2074332237243652, + "learning_rate": 4.1468253968253976e-05, + "loss": 1.7686, + "step": 440 + }, + { + "epoch": 1.7430830039525693, + "grad_norm": 2.1703858375549316, + "learning_rate": 4.14484126984127e-05, + "loss": 1.5108, + "step": 441 + }, + { + "epoch": 1.7470355731225298, + "grad_norm": 1.9463564157485962, + "learning_rate": 4.1428571428571437e-05, + "loss": 1.9879, + "step": 442 + }, + { + "epoch": 1.7509881422924902, + "grad_norm": 2.4043517112731934, + "learning_rate": 4.1408730158730164e-05, + "loss": 1.6002, + "step": 443 + }, + { + "epoch": 1.7549407114624507, + "grad_norm": 1.906467080116272, + "learning_rate": 4.138888888888889e-05, + "loss": 1.8149, + "step": 444 + }, + { + "epoch": 1.7588932806324111, + "grad_norm": 2.612128257751465, + "learning_rate": 4.136904761904762e-05, + "loss": 1.8805, + "step": 445 + }, + { + "epoch": 1.7628458498023716, + "grad_norm": 2.056048631668091, + "learning_rate": 4.134920634920635e-05, + "loss": 1.6064, + "step": 446 + }, + { + "epoch": 1.766798418972332, + "grad_norm": 1.9283335208892822, + "learning_rate": 4.132936507936508e-05, + "loss": 2.2151, + "step": 447 + }, + { + "epoch": 1.7707509881422925, + "grad_norm": 1.8714007139205933, + "learning_rate": 4.130952380952381e-05, + "loss": 1.6642, + "step": 448 + }, + { + "epoch": 1.774703557312253, + "grad_norm": 2.0939621925354004, + "learning_rate": 4.128968253968254e-05, + "loss": 1.558, + "step": 449 + }, + { + "epoch": 1.7786561264822134, + "grad_norm": 2.0754241943359375, + "learning_rate": 4.126984126984127e-05, + "loss": 1.6465, + "step": 450 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 2.7905728816986084, + "learning_rate": 4.125e-05, + "loss": 1.7298, + "step": 451 + }, + { + "epoch": 1.7865612648221343, + "grad_norm": 2.2117226123809814, + "learning_rate": 4.123015873015873e-05, + "loss": 1.5478, + "step": 452 + }, + { + "epoch": 1.7905138339920947, + "grad_norm": 2.22251033782959, + "learning_rate": 4.121031746031746e-05, + "loss": 1.4483, + "step": 453 + }, + { + "epoch": 1.7944664031620552, + "grad_norm": 2.636388063430786, + "learning_rate": 4.119047619047619e-05, + "loss": 1.3413, + "step": 454 + }, + { + "epoch": 1.7984189723320159, + "grad_norm": 2.417893409729004, + "learning_rate": 4.117063492063492e-05, + "loss": 1.5273, + "step": 455 + }, + { + "epoch": 1.8023715415019763, + "grad_norm": 1.960670828819275, + "learning_rate": 4.115079365079365e-05, + "loss": 1.8887, + "step": 456 + }, + { + "epoch": 1.8063241106719368, + "grad_norm": 2.0714080333709717, + "learning_rate": 4.113095238095238e-05, + "loss": 1.9395, + "step": 457 + }, + { + "epoch": 1.8102766798418972, + "grad_norm": 2.6367220878601074, + "learning_rate": 4.111111111111111e-05, + "loss": 1.475, + "step": 458 + }, + { + "epoch": 1.8142292490118577, + "grad_norm": 2.8209869861602783, + "learning_rate": 4.109126984126984e-05, + "loss": 1.4326, + "step": 459 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.884318470954895, + "learning_rate": 4.107142857142857e-05, + "loss": 1.5297, + "step": 460 + }, + { + "epoch": 1.8221343873517788, + "grad_norm": 1.7990609407424927, + "learning_rate": 4.1051587301587304e-05, + "loss": 1.9113, + "step": 461 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 1.8307974338531494, + "learning_rate": 4.103174603174603e-05, + "loss": 1.5365, + "step": 462 + }, + { + "epoch": 1.8300395256916997, + "grad_norm": 2.2995738983154297, + "learning_rate": 4.1011904761904764e-05, + "loss": 1.4421, + "step": 463 + }, + { + "epoch": 1.8339920948616601, + "grad_norm": 2.2058722972869873, + "learning_rate": 4.099206349206349e-05, + "loss": 1.5384, + "step": 464 + }, + { + "epoch": 1.8379446640316206, + "grad_norm": 2.011327028274536, + "learning_rate": 4.0972222222222225e-05, + "loss": 1.6523, + "step": 465 + }, + { + "epoch": 1.841897233201581, + "grad_norm": 2.0535199642181396, + "learning_rate": 4.095238095238095e-05, + "loss": 1.6645, + "step": 466 + }, + { + "epoch": 1.8458498023715415, + "grad_norm": 2.485356092453003, + "learning_rate": 4.0932539682539686e-05, + "loss": 1.5562, + "step": 467 + }, + { + "epoch": 1.849802371541502, + "grad_norm": 2.368373155593872, + "learning_rate": 4.091269841269841e-05, + "loss": 1.806, + "step": 468 + }, + { + "epoch": 1.8537549407114624, + "grad_norm": 2.3104066848754883, + "learning_rate": 4.0892857142857147e-05, + "loss": 1.9633, + "step": 469 + }, + { + "epoch": 1.8577075098814229, + "grad_norm": 2.3505473136901855, + "learning_rate": 4.0873015873015874e-05, + "loss": 1.6175, + "step": 470 + }, + { + "epoch": 1.8616600790513833, + "grad_norm": 1.9146708250045776, + "learning_rate": 4.085317460317461e-05, + "loss": 1.5535, + "step": 471 + }, + { + "epoch": 1.8656126482213438, + "grad_norm": 1.624837040901184, + "learning_rate": 4.0833333333333334e-05, + "loss": 1.6677, + "step": 472 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 1.7742847204208374, + "learning_rate": 4.081349206349207e-05, + "loss": 1.5509, + "step": 473 + }, + { + "epoch": 1.8735177865612647, + "grad_norm": 1.9143329858779907, + "learning_rate": 4.0793650793650795e-05, + "loss": 1.8877, + "step": 474 + }, + { + "epoch": 1.8774703557312253, + "grad_norm": 2.1041526794433594, + "learning_rate": 4.077380952380952e-05, + "loss": 1.6796, + "step": 475 + }, + { + "epoch": 1.8814229249011858, + "grad_norm": 2.0280792713165283, + "learning_rate": 4.0753968253968256e-05, + "loss": 1.7892, + "step": 476 + }, + { + "epoch": 1.8853754940711462, + "grad_norm": 2.8694331645965576, + "learning_rate": 4.073412698412698e-05, + "loss": 1.9448, + "step": 477 + }, + { + "epoch": 1.8893280632411067, + "grad_norm": 2.471470832824707, + "learning_rate": 4.0714285714285717e-05, + "loss": 1.7856, + "step": 478 + }, + { + "epoch": 1.8932806324110671, + "grad_norm": 2.2044591903686523, + "learning_rate": 4.0694444444444444e-05, + "loss": 1.7688, + "step": 479 + }, + { + "epoch": 1.8972332015810278, + "grad_norm": 2.3773610591888428, + "learning_rate": 4.067460317460318e-05, + "loss": 1.908, + "step": 480 + }, + { + "epoch": 1.9011857707509883, + "grad_norm": 2.1004207134246826, + "learning_rate": 4.0654761904761904e-05, + "loss": 1.4418, + "step": 481 + }, + { + "epoch": 1.9051383399209487, + "grad_norm": 2.415409803390503, + "learning_rate": 4.063492063492064e-05, + "loss": 1.7695, + "step": 482 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.89458167552948, + "learning_rate": 4.0615079365079365e-05, + "loss": 1.8081, + "step": 483 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 2.560452699661255, + "learning_rate": 4.05952380952381e-05, + "loss": 2.0119, + "step": 484 + }, + { + "epoch": 1.91699604743083, + "grad_norm": 2.115086793899536, + "learning_rate": 4.0575396825396826e-05, + "loss": 1.7012, + "step": 485 + }, + { + "epoch": 1.9209486166007905, + "grad_norm": 2.1731040477752686, + "learning_rate": 4.055555555555556e-05, + "loss": 1.7267, + "step": 486 + }, + { + "epoch": 1.924901185770751, + "grad_norm": 1.9672155380249023, + "learning_rate": 4.0535714285714287e-05, + "loss": 1.7396, + "step": 487 + }, + { + "epoch": 1.9288537549407114, + "grad_norm": 2.380613327026367, + "learning_rate": 4.051587301587302e-05, + "loss": 1.4355, + "step": 488 + }, + { + "epoch": 1.9328063241106719, + "grad_norm": 2.087024450302124, + "learning_rate": 4.049603174603175e-05, + "loss": 1.812, + "step": 489 + }, + { + "epoch": 1.9367588932806323, + "grad_norm": 2.704306125640869, + "learning_rate": 4.047619047619048e-05, + "loss": 1.7029, + "step": 490 + }, + { + "epoch": 1.9407114624505928, + "grad_norm": 1.8128267526626587, + "learning_rate": 4.045634920634921e-05, + "loss": 1.616, + "step": 491 + }, + { + "epoch": 1.9446640316205532, + "grad_norm": 1.7133640050888062, + "learning_rate": 4.043650793650794e-05, + "loss": 1.4945, + "step": 492 + }, + { + "epoch": 1.9486166007905137, + "grad_norm": 1.8677951097488403, + "learning_rate": 4.041666666666667e-05, + "loss": 1.6594, + "step": 493 + }, + { + "epoch": 1.9525691699604744, + "grad_norm": 1.908327579498291, + "learning_rate": 4.03968253968254e-05, + "loss": 1.7492, + "step": 494 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 2.1175668239593506, + "learning_rate": 4.037698412698413e-05, + "loss": 1.6509, + "step": 495 + }, + { + "epoch": 1.9604743083003953, + "grad_norm": 2.284240961074829, + "learning_rate": 4.035714285714286e-05, + "loss": 1.4284, + "step": 496 + }, + { + "epoch": 1.9644268774703557, + "grad_norm": 1.581749677658081, + "learning_rate": 4.033730158730159e-05, + "loss": 1.8258, + "step": 497 + }, + { + "epoch": 1.9683794466403162, + "grad_norm": 2.200153112411499, + "learning_rate": 4.031746031746032e-05, + "loss": 1.573, + "step": 498 + }, + { + "epoch": 1.9723320158102768, + "grad_norm": 2.2217061519622803, + "learning_rate": 4.029761904761905e-05, + "loss": 1.6952, + "step": 499 + }, + { + "epoch": 1.9762845849802373, + "grad_norm": 2.2044413089752197, + "learning_rate": 4.027777777777778e-05, + "loss": 1.5415, + "step": 500 + }, + { + "epoch": 1.9802371541501977, + "grad_norm": 2.001845359802246, + "learning_rate": 4.025793650793651e-05, + "loss": 1.8791, + "step": 501 + }, + { + "epoch": 1.9841897233201582, + "grad_norm": 1.7531094551086426, + "learning_rate": 4.023809523809524e-05, + "loss": 1.7529, + "step": 502 + }, + { + "epoch": 1.9881422924901186, + "grad_norm": 3.204590320587158, + "learning_rate": 4.021825396825397e-05, + "loss": 1.6866, + "step": 503 + }, + { + "epoch": 1.992094861660079, + "grad_norm": 2.3752427101135254, + "learning_rate": 4.01984126984127e-05, + "loss": 1.4053, + "step": 504 + }, + { + "epoch": 1.9960474308300395, + "grad_norm": 1.789313793182373, + "learning_rate": 4.017857142857143e-05, + "loss": 1.6106, + "step": 505 + }, + { + "epoch": 2.0, + "grad_norm": 2.152024269104004, + "learning_rate": 4.015873015873016e-05, + "loss": 1.7857, + "step": 506 + }, + { + "epoch": 2.0039525691699605, + "grad_norm": 1.7887808084487915, + "learning_rate": 4.0138888888888894e-05, + "loss": 1.2694, + "step": 507 + }, + { + "epoch": 2.007905138339921, + "grad_norm": 2.0157063007354736, + "learning_rate": 4.011904761904762e-05, + "loss": 1.3274, + "step": 508 + }, + { + "epoch": 2.0118577075098814, + "grad_norm": 2.462636709213257, + "learning_rate": 4.0099206349206355e-05, + "loss": 1.5154, + "step": 509 + }, + { + "epoch": 2.015810276679842, + "grad_norm": 2.299966812133789, + "learning_rate": 4.007936507936508e-05, + "loss": 1.378, + "step": 510 + }, + { + "epoch": 2.0197628458498023, + "grad_norm": 2.748854398727417, + "learning_rate": 4.0059523809523816e-05, + "loss": 1.5578, + "step": 511 + }, + { + "epoch": 2.0237154150197627, + "grad_norm": 2.8967697620391846, + "learning_rate": 4.003968253968254e-05, + "loss": 1.2523, + "step": 512 + }, + { + "epoch": 2.027667984189723, + "grad_norm": 3.2559096813201904, + "learning_rate": 4.0019841269841276e-05, + "loss": 1.3687, + "step": 513 + }, + { + "epoch": 2.0316205533596836, + "grad_norm": 3.0115792751312256, + "learning_rate": 4e-05, + "loss": 1.6751, + "step": 514 + }, + { + "epoch": 2.035573122529644, + "grad_norm": 2.7409708499908447, + "learning_rate": 3.998015873015874e-05, + "loss": 1.5692, + "step": 515 + }, + { + "epoch": 2.039525691699605, + "grad_norm": 2.488542318344116, + "learning_rate": 3.9960317460317464e-05, + "loss": 1.6312, + "step": 516 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 2.267803192138672, + "learning_rate": 3.99404761904762e-05, + "loss": 1.3411, + "step": 517 + }, + { + "epoch": 2.047430830039526, + "grad_norm": 2.1117396354675293, + "learning_rate": 3.9920634920634925e-05, + "loss": 1.3751, + "step": 518 + }, + { + "epoch": 2.0513833992094863, + "grad_norm": 2.5073764324188232, + "learning_rate": 3.990079365079366e-05, + "loss": 1.3868, + "step": 519 + }, + { + "epoch": 2.0553359683794468, + "grad_norm": 2.462871789932251, + "learning_rate": 3.9880952380952386e-05, + "loss": 1.2461, + "step": 520 + }, + { + "epoch": 2.059288537549407, + "grad_norm": 2.81300950050354, + "learning_rate": 3.986111111111111e-05, + "loss": 1.6863, + "step": 521 + }, + { + "epoch": 2.0632411067193677, + "grad_norm": 2.345595359802246, + "learning_rate": 3.984126984126984e-05, + "loss": 1.6056, + "step": 522 + }, + { + "epoch": 2.067193675889328, + "grad_norm": 3.0314559936523438, + "learning_rate": 3.982142857142857e-05, + "loss": 1.4429, + "step": 523 + }, + { + "epoch": 2.0711462450592886, + "grad_norm": 2.563462257385254, + "learning_rate": 3.98015873015873e-05, + "loss": 1.4249, + "step": 524 + }, + { + "epoch": 2.075098814229249, + "grad_norm": 3.088543653488159, + "learning_rate": 3.9781746031746034e-05, + "loss": 1.6226, + "step": 525 + }, + { + "epoch": 2.0790513833992095, + "grad_norm": 2.6812891960144043, + "learning_rate": 3.976190476190476e-05, + "loss": 1.4069, + "step": 526 + }, + { + "epoch": 2.08300395256917, + "grad_norm": 2.729356288909912, + "learning_rate": 3.9742063492063495e-05, + "loss": 1.4145, + "step": 527 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 1.9687360525131226, + "learning_rate": 3.972222222222222e-05, + "loss": 1.2861, + "step": 528 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 2.8874728679656982, + "learning_rate": 3.970238095238095e-05, + "loss": 1.0838, + "step": 529 + }, + { + "epoch": 2.0948616600790513, + "grad_norm": 2.1731650829315186, + "learning_rate": 3.968253968253968e-05, + "loss": 1.5371, + "step": 530 + }, + { + "epoch": 2.0988142292490117, + "grad_norm": 2.3168811798095703, + "learning_rate": 3.966269841269841e-05, + "loss": 1.6941, + "step": 531 + }, + { + "epoch": 2.102766798418972, + "grad_norm": 2.0623104572296143, + "learning_rate": 3.964285714285714e-05, + "loss": 1.2772, + "step": 532 + }, + { + "epoch": 2.1067193675889326, + "grad_norm": 3.4326181411743164, + "learning_rate": 3.962301587301587e-05, + "loss": 1.3626, + "step": 533 + }, + { + "epoch": 2.110671936758893, + "grad_norm": 2.1339476108551025, + "learning_rate": 3.9603174603174604e-05, + "loss": 1.4972, + "step": 534 + }, + { + "epoch": 2.1146245059288535, + "grad_norm": 2.1961209774017334, + "learning_rate": 3.958333333333333e-05, + "loss": 1.2863, + "step": 535 + }, + { + "epoch": 2.1185770750988144, + "grad_norm": 2.13753604888916, + "learning_rate": 3.9563492063492065e-05, + "loss": 1.3903, + "step": 536 + }, + { + "epoch": 2.122529644268775, + "grad_norm": 2.521846055984497, + "learning_rate": 3.954365079365079e-05, + "loss": 1.2818, + "step": 537 + }, + { + "epoch": 2.1264822134387353, + "grad_norm": 2.1714985370635986, + "learning_rate": 3.9523809523809526e-05, + "loss": 1.4502, + "step": 538 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 3.092069149017334, + "learning_rate": 3.950396825396825e-05, + "loss": 1.5637, + "step": 539 + }, + { + "epoch": 2.1343873517786562, + "grad_norm": 2.6351075172424316, + "learning_rate": 3.9484126984126986e-05, + "loss": 1.4462, + "step": 540 + }, + { + "epoch": 2.1383399209486167, + "grad_norm": 1.9999125003814697, + "learning_rate": 3.946428571428571e-05, + "loss": 1.4445, + "step": 541 + }, + { + "epoch": 2.142292490118577, + "grad_norm": 3.0483391284942627, + "learning_rate": 3.944444444444445e-05, + "loss": 1.206, + "step": 542 + }, + { + "epoch": 2.1462450592885376, + "grad_norm": 2.455862522125244, + "learning_rate": 3.9424603174603174e-05, + "loss": 1.3128, + "step": 543 + }, + { + "epoch": 2.150197628458498, + "grad_norm": 2.6878578662872314, + "learning_rate": 3.940476190476191e-05, + "loss": 1.3592, + "step": 544 + }, + { + "epoch": 2.1541501976284585, + "grad_norm": 2.25331449508667, + "learning_rate": 3.9384920634920635e-05, + "loss": 1.2749, + "step": 545 + }, + { + "epoch": 2.158102766798419, + "grad_norm": 2.7239346504211426, + "learning_rate": 3.936507936507937e-05, + "loss": 1.0487, + "step": 546 + }, + { + "epoch": 2.1620553359683794, + "grad_norm": 2.8506832122802734, + "learning_rate": 3.9345238095238096e-05, + "loss": 1.2713, + "step": 547 + }, + { + "epoch": 2.16600790513834, + "grad_norm": 2.3467752933502197, + "learning_rate": 3.932539682539683e-05, + "loss": 1.1959, + "step": 548 + }, + { + "epoch": 2.1699604743083003, + "grad_norm": 1.99912428855896, + "learning_rate": 3.9305555555555556e-05, + "loss": 1.2617, + "step": 549 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 2.6946754455566406, + "learning_rate": 3.928571428571429e-05, + "loss": 1.5949, + "step": 550 + }, + { + "epoch": 2.177865612648221, + "grad_norm": 2.771515369415283, + "learning_rate": 3.926587301587302e-05, + "loss": 1.4533, + "step": 551 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 2.188755989074707, + "learning_rate": 3.9246031746031744e-05, + "loss": 1.3215, + "step": 552 + }, + { + "epoch": 2.185770750988142, + "grad_norm": 2.7638049125671387, + "learning_rate": 3.922619047619048e-05, + "loss": 1.1885, + "step": 553 + }, + { + "epoch": 2.1897233201581026, + "grad_norm": 2.2438464164733887, + "learning_rate": 3.9206349206349205e-05, + "loss": 1.2809, + "step": 554 + }, + { + "epoch": 2.1936758893280635, + "grad_norm": 2.8813867568969727, + "learning_rate": 3.918650793650794e-05, + "loss": 1.3764, + "step": 555 + }, + { + "epoch": 2.197628458498024, + "grad_norm": 2.219583034515381, + "learning_rate": 3.9166666666666665e-05, + "loss": 1.5119, + "step": 556 + }, + { + "epoch": 2.2015810276679844, + "grad_norm": 2.888500452041626, + "learning_rate": 3.91468253968254e-05, + "loss": 1.5976, + "step": 557 + }, + { + "epoch": 2.205533596837945, + "grad_norm": 2.0659193992614746, + "learning_rate": 3.9126984126984126e-05, + "loss": 1.3095, + "step": 558 + }, + { + "epoch": 2.2094861660079053, + "grad_norm": 2.3203117847442627, + "learning_rate": 3.910714285714286e-05, + "loss": 1.2698, + "step": 559 + }, + { + "epoch": 2.2134387351778657, + "grad_norm": 2.8916614055633545, + "learning_rate": 3.908730158730159e-05, + "loss": 1.4077, + "step": 560 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 2.6932151317596436, + "learning_rate": 3.906746031746032e-05, + "loss": 1.1033, + "step": 561 + }, + { + "epoch": 2.2213438735177866, + "grad_norm": 2.4444422721862793, + "learning_rate": 3.904761904761905e-05, + "loss": 1.2782, + "step": 562 + }, + { + "epoch": 2.225296442687747, + "grad_norm": 2.334657669067383, + "learning_rate": 3.902777777777778e-05, + "loss": 1.5071, + "step": 563 + }, + { + "epoch": 2.2292490118577075, + "grad_norm": 2.4118669033050537, + "learning_rate": 3.900793650793651e-05, + "loss": 1.2665, + "step": 564 + }, + { + "epoch": 2.233201581027668, + "grad_norm": 2.160811185836792, + "learning_rate": 3.898809523809524e-05, + "loss": 1.4489, + "step": 565 + }, + { + "epoch": 2.2371541501976284, + "grad_norm": 2.49159574508667, + "learning_rate": 3.896825396825397e-05, + "loss": 1.3127, + "step": 566 + }, + { + "epoch": 2.241106719367589, + "grad_norm": 2.5408575534820557, + "learning_rate": 3.89484126984127e-05, + "loss": 1.0498, + "step": 567 + }, + { + "epoch": 2.2450592885375493, + "grad_norm": 2.121267557144165, + "learning_rate": 3.892857142857143e-05, + "loss": 1.3147, + "step": 568 + }, + { + "epoch": 2.2490118577075098, + "grad_norm": 2.7093420028686523, + "learning_rate": 3.8908730158730164e-05, + "loss": 1.3408, + "step": 569 + }, + { + "epoch": 2.2529644268774702, + "grad_norm": 3.397962808609009, + "learning_rate": 3.888888888888889e-05, + "loss": 1.4971, + "step": 570 + }, + { + "epoch": 2.2569169960474307, + "grad_norm": 2.7930893898010254, + "learning_rate": 3.8869047619047625e-05, + "loss": 1.4294, + "step": 571 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 2.7343547344207764, + "learning_rate": 3.884920634920635e-05, + "loss": 0.9542, + "step": 572 + }, + { + "epoch": 2.2648221343873516, + "grad_norm": 2.365590810775757, + "learning_rate": 3.8829365079365085e-05, + "loss": 1.2086, + "step": 573 + }, + { + "epoch": 2.2687747035573125, + "grad_norm": 2.1030609607696533, + "learning_rate": 3.880952380952381e-05, + "loss": 1.3605, + "step": 574 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 2.1257293224334717, + "learning_rate": 3.878968253968254e-05, + "loss": 1.4352, + "step": 575 + }, + { + "epoch": 2.2766798418972334, + "grad_norm": 2.316331386566162, + "learning_rate": 3.876984126984127e-05, + "loss": 1.4796, + "step": 576 + }, + { + "epoch": 2.280632411067194, + "grad_norm": 2.8514187335968018, + "learning_rate": 3.875e-05, + "loss": 1.3073, + "step": 577 + }, + { + "epoch": 2.2845849802371543, + "grad_norm": 2.8949341773986816, + "learning_rate": 3.8730158730158734e-05, + "loss": 1.3915, + "step": 578 + }, + { + "epoch": 2.2885375494071147, + "grad_norm": 2.150175094604492, + "learning_rate": 3.871031746031746e-05, + "loss": 1.2567, + "step": 579 + }, + { + "epoch": 2.292490118577075, + "grad_norm": 2.0244216918945312, + "learning_rate": 3.8690476190476195e-05, + "loss": 1.3674, + "step": 580 + }, + { + "epoch": 2.2964426877470356, + "grad_norm": 2.2279505729675293, + "learning_rate": 3.867063492063492e-05, + "loss": 1.4776, + "step": 581 + }, + { + "epoch": 2.300395256916996, + "grad_norm": 2.4603147506713867, + "learning_rate": 3.8650793650793655e-05, + "loss": 1.6039, + "step": 582 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 2.8002820014953613, + "learning_rate": 3.863095238095238e-05, + "loss": 1.2765, + "step": 583 + }, + { + "epoch": 2.308300395256917, + "grad_norm": 2.505256414413452, + "learning_rate": 3.8611111111111116e-05, + "loss": 1.1921, + "step": 584 + }, + { + "epoch": 2.3122529644268774, + "grad_norm": 2.458230972290039, + "learning_rate": 3.859126984126984e-05, + "loss": 1.7022, + "step": 585 + }, + { + "epoch": 2.316205533596838, + "grad_norm": 2.3297207355499268, + "learning_rate": 3.857142857142858e-05, + "loss": 1.5739, + "step": 586 + }, + { + "epoch": 2.3201581027667983, + "grad_norm": 2.038970708847046, + "learning_rate": 3.8551587301587304e-05, + "loss": 1.2446, + "step": 587 + }, + { + "epoch": 2.324110671936759, + "grad_norm": 2.4906201362609863, + "learning_rate": 3.853174603174604e-05, + "loss": 1.3053, + "step": 588 + }, + { + "epoch": 2.3280632411067192, + "grad_norm": 2.42885160446167, + "learning_rate": 3.8511904761904765e-05, + "loss": 1.1066, + "step": 589 + }, + { + "epoch": 2.3320158102766797, + "grad_norm": 2.2669129371643066, + "learning_rate": 3.84920634920635e-05, + "loss": 1.2192, + "step": 590 + }, + { + "epoch": 2.33596837944664, + "grad_norm": 4.123607635498047, + "learning_rate": 3.8472222222222225e-05, + "loss": 1.3616, + "step": 591 + }, + { + "epoch": 2.3399209486166006, + "grad_norm": 2.522677183151245, + "learning_rate": 3.845238095238096e-05, + "loss": 1.3161, + "step": 592 + }, + { + "epoch": 2.3438735177865615, + "grad_norm": 2.6520330905914307, + "learning_rate": 3.8432539682539686e-05, + "loss": 1.1572, + "step": 593 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 2.5755152702331543, + "learning_rate": 3.841269841269842e-05, + "loss": 1.5446, + "step": 594 + }, + { + "epoch": 2.3517786561264824, + "grad_norm": 2.2885093688964844, + "learning_rate": 3.839285714285715e-05, + "loss": 1.4885, + "step": 595 + }, + { + "epoch": 2.355731225296443, + "grad_norm": 2.176738977432251, + "learning_rate": 3.837301587301588e-05, + "loss": 1.4043, + "step": 596 + }, + { + "epoch": 2.3596837944664033, + "grad_norm": 2.7747554779052734, + "learning_rate": 3.835317460317461e-05, + "loss": 1.6517, + "step": 597 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 1.9070560932159424, + "learning_rate": 3.8333333333333334e-05, + "loss": 1.0572, + "step": 598 + }, + { + "epoch": 2.367588932806324, + "grad_norm": 2.161180019378662, + "learning_rate": 3.831349206349207e-05, + "loss": 1.3089, + "step": 599 + }, + { + "epoch": 2.3715415019762847, + "grad_norm": 2.5658626556396484, + "learning_rate": 3.8293650793650795e-05, + "loss": 1.1435, + "step": 600 + }, + { + "epoch": 2.375494071146245, + "grad_norm": 2.733940839767456, + "learning_rate": 3.827380952380952e-05, + "loss": 1.4425, + "step": 601 + }, + { + "epoch": 2.3794466403162056, + "grad_norm": 2.494096279144287, + "learning_rate": 3.8253968253968256e-05, + "loss": 1.5981, + "step": 602 + }, + { + "epoch": 2.383399209486166, + "grad_norm": 2.405909299850464, + "learning_rate": 3.823412698412698e-05, + "loss": 1.4946, + "step": 603 + }, + { + "epoch": 2.3873517786561265, + "grad_norm": 2.067415952682495, + "learning_rate": 3.821428571428572e-05, + "loss": 1.3633, + "step": 604 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 2.1548092365264893, + "learning_rate": 3.8194444444444444e-05, + "loss": 1.2963, + "step": 605 + }, + { + "epoch": 2.3952569169960474, + "grad_norm": 2.004725694656372, + "learning_rate": 3.817460317460317e-05, + "loss": 1.4675, + "step": 606 + }, + { + "epoch": 2.399209486166008, + "grad_norm": 2.1438629627227783, + "learning_rate": 3.8154761904761904e-05, + "loss": 1.5303, + "step": 607 + }, + { + "epoch": 2.4031620553359683, + "grad_norm": 2.6474621295928955, + "learning_rate": 3.813492063492063e-05, + "loss": 1.5247, + "step": 608 + }, + { + "epoch": 2.4071146245059287, + "grad_norm": 2.7593584060668945, + "learning_rate": 3.8115079365079365e-05, + "loss": 1.2225, + "step": 609 + }, + { + "epoch": 2.411067193675889, + "grad_norm": 2.53412127494812, + "learning_rate": 3.809523809523809e-05, + "loss": 1.3592, + "step": 610 + }, + { + "epoch": 2.4150197628458496, + "grad_norm": 2.477109909057617, + "learning_rate": 3.8075396825396826e-05, + "loss": 1.3446, + "step": 611 + }, + { + "epoch": 2.4189723320158105, + "grad_norm": 2.8138701915740967, + "learning_rate": 3.805555555555555e-05, + "loss": 1.3014, + "step": 612 + }, + { + "epoch": 2.4229249011857705, + "grad_norm": 2.805239200592041, + "learning_rate": 3.803571428571429e-05, + "loss": 1.3786, + "step": 613 + }, + { + "epoch": 2.4268774703557314, + "grad_norm": 2.7038285732269287, + "learning_rate": 3.8015873015873014e-05, + "loss": 1.1213, + "step": 614 + }, + { + "epoch": 2.430830039525692, + "grad_norm": 2.978872299194336, + "learning_rate": 3.799603174603175e-05, + "loss": 1.2804, + "step": 615 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 2.0930662155151367, + "learning_rate": 3.7976190476190474e-05, + "loss": 1.6642, + "step": 616 + }, + { + "epoch": 2.438735177865613, + "grad_norm": 2.3781399726867676, + "learning_rate": 3.795634920634921e-05, + "loss": 1.2398, + "step": 617 + }, + { + "epoch": 2.4426877470355732, + "grad_norm": 2.1842458248138428, + "learning_rate": 3.7936507936507935e-05, + "loss": 1.5295, + "step": 618 + }, + { + "epoch": 2.4466403162055337, + "grad_norm": 3.6338818073272705, + "learning_rate": 3.791666666666667e-05, + "loss": 1.0698, + "step": 619 + }, + { + "epoch": 2.450592885375494, + "grad_norm": 2.5560874938964844, + "learning_rate": 3.7896825396825396e-05, + "loss": 1.3759, + "step": 620 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 2.4692065715789795, + "learning_rate": 3.787698412698413e-05, + "loss": 1.2892, + "step": 621 + }, + { + "epoch": 2.458498023715415, + "grad_norm": 2.5481793880462646, + "learning_rate": 3.785714285714286e-05, + "loss": 1.5718, + "step": 622 + }, + { + "epoch": 2.4624505928853755, + "grad_norm": 2.608428716659546, + "learning_rate": 3.783730158730159e-05, + "loss": 1.3108, + "step": 623 + }, + { + "epoch": 2.466403162055336, + "grad_norm": 2.8005776405334473, + "learning_rate": 3.781746031746032e-05, + "loss": 1.5068, + "step": 624 + }, + { + "epoch": 2.4703557312252964, + "grad_norm": 2.2495791912078857, + "learning_rate": 3.779761904761905e-05, + "loss": 1.3082, + "step": 625 + }, + { + "epoch": 2.474308300395257, + "grad_norm": 2.567267656326294, + "learning_rate": 3.777777777777778e-05, + "loss": 1.4442, + "step": 626 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 2.4826018810272217, + "learning_rate": 3.775793650793651e-05, + "loss": 1.2581, + "step": 627 + }, + { + "epoch": 2.4822134387351777, + "grad_norm": 2.3756072521209717, + "learning_rate": 3.773809523809524e-05, + "loss": 1.2695, + "step": 628 + }, + { + "epoch": 2.486166007905138, + "grad_norm": 2.6527416706085205, + "learning_rate": 3.7718253968253966e-05, + "loss": 1.2956, + "step": 629 + }, + { + "epoch": 2.4901185770750986, + "grad_norm": 2.467822790145874, + "learning_rate": 3.76984126984127e-05, + "loss": 1.4364, + "step": 630 + }, + { + "epoch": 2.494071146245059, + "grad_norm": 2.6203055381774902, + "learning_rate": 3.767857142857143e-05, + "loss": 1.4897, + "step": 631 + }, + { + "epoch": 2.4980237154150196, + "grad_norm": 2.233952045440674, + "learning_rate": 3.765873015873016e-05, + "loss": 1.3318, + "step": 632 + }, + { + "epoch": 2.5019762845849804, + "grad_norm": 1.96635901927948, + "learning_rate": 3.763888888888889e-05, + "loss": 1.2446, + "step": 633 + }, + { + "epoch": 2.5059288537549405, + "grad_norm": 2.349234104156494, + "learning_rate": 3.761904761904762e-05, + "loss": 1.3324, + "step": 634 + }, + { + "epoch": 2.5098814229249014, + "grad_norm": 2.961324691772461, + "learning_rate": 3.759920634920635e-05, + "loss": 1.1765, + "step": 635 + }, + { + "epoch": 2.513833992094862, + "grad_norm": 2.8292768001556396, + "learning_rate": 3.757936507936508e-05, + "loss": 1.0077, + "step": 636 + }, + { + "epoch": 2.5177865612648223, + "grad_norm": 2.9311156272888184, + "learning_rate": 3.755952380952381e-05, + "loss": 1.3392, + "step": 637 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 2.6130383014678955, + "learning_rate": 3.753968253968254e-05, + "loss": 1.2328, + "step": 638 + }, + { + "epoch": 2.525691699604743, + "grad_norm": 2.0875937938690186, + "learning_rate": 3.751984126984127e-05, + "loss": 1.3491, + "step": 639 + }, + { + "epoch": 2.5296442687747036, + "grad_norm": 2.593751907348633, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.1862, + "step": 640 + }, + { + "epoch": 2.533596837944664, + "grad_norm": 2.0226974487304688, + "learning_rate": 3.748015873015873e-05, + "loss": 1.3762, + "step": 641 + }, + { + "epoch": 2.5375494071146245, + "grad_norm": 2.4336142539978027, + "learning_rate": 3.7460317460317464e-05, + "loss": 1.3271, + "step": 642 + }, + { + "epoch": 2.541501976284585, + "grad_norm": 1.8397362232208252, + "learning_rate": 3.744047619047619e-05, + "loss": 1.2235, + "step": 643 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 2.0357463359832764, + "learning_rate": 3.7420634920634925e-05, + "loss": 1.3871, + "step": 644 + }, + { + "epoch": 2.549407114624506, + "grad_norm": 3.3774683475494385, + "learning_rate": 3.740079365079365e-05, + "loss": 1.3579, + "step": 645 + }, + { + "epoch": 2.5533596837944663, + "grad_norm": 3.041534662246704, + "learning_rate": 3.7380952380952386e-05, + "loss": 1.1318, + "step": 646 + }, + { + "epoch": 2.5573122529644268, + "grad_norm": 2.484518527984619, + "learning_rate": 3.736111111111111e-05, + "loss": 1.3531, + "step": 647 + }, + { + "epoch": 2.561264822134387, + "grad_norm": 2.612365245819092, + "learning_rate": 3.7341269841269846e-05, + "loss": 1.3075, + "step": 648 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 2.3737142086029053, + "learning_rate": 3.7321428571428573e-05, + "loss": 1.272, + "step": 649 + }, + { + "epoch": 2.5691699604743086, + "grad_norm": 2.600964069366455, + "learning_rate": 3.730158730158731e-05, + "loss": 1.0839, + "step": 650 + }, + { + "epoch": 2.5731225296442686, + "grad_norm": 2.697070598602295, + "learning_rate": 3.7281746031746034e-05, + "loss": 1.1383, + "step": 651 + }, + { + "epoch": 2.5770750988142295, + "grad_norm": 2.752293109893799, + "learning_rate": 3.726190476190476e-05, + "loss": 1.2823, + "step": 652 + }, + { + "epoch": 2.5810276679841895, + "grad_norm": 2.47784161567688, + "learning_rate": 3.7242063492063495e-05, + "loss": 1.5547, + "step": 653 + }, + { + "epoch": 2.5849802371541504, + "grad_norm": 2.5010085105895996, + "learning_rate": 3.722222222222222e-05, + "loss": 1.2051, + "step": 654 + }, + { + "epoch": 2.588932806324111, + "grad_norm": 2.3102123737335205, + "learning_rate": 3.7202380952380956e-05, + "loss": 1.1448, + "step": 655 + }, + { + "epoch": 2.5928853754940713, + "grad_norm": 2.760524034500122, + "learning_rate": 3.718253968253968e-05, + "loss": 1.3238, + "step": 656 + }, + { + "epoch": 2.5968379446640317, + "grad_norm": 3.336056709289551, + "learning_rate": 3.7162698412698416e-05, + "loss": 1.2137, + "step": 657 + }, + { + "epoch": 2.600790513833992, + "grad_norm": 3.0085883140563965, + "learning_rate": 3.7142857142857143e-05, + "loss": 1.0954, + "step": 658 + }, + { + "epoch": 2.6047430830039526, + "grad_norm": 2.819323778152466, + "learning_rate": 3.712301587301588e-05, + "loss": 1.4706, + "step": 659 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 2.527916193008423, + "learning_rate": 3.7103174603174604e-05, + "loss": 1.2759, + "step": 660 + }, + { + "epoch": 2.6126482213438735, + "grad_norm": 3.203950881958008, + "learning_rate": 3.708333333333334e-05, + "loss": 1.907, + "step": 661 + }, + { + "epoch": 2.616600790513834, + "grad_norm": 2.3258347511291504, + "learning_rate": 3.7063492063492065e-05, + "loss": 1.0956, + "step": 662 + }, + { + "epoch": 2.6205533596837944, + "grad_norm": 2.3885722160339355, + "learning_rate": 3.70436507936508e-05, + "loss": 1.3588, + "step": 663 + }, + { + "epoch": 2.624505928853755, + "grad_norm": 2.46718168258667, + "learning_rate": 3.7023809523809526e-05, + "loss": 1.5113, + "step": 664 + }, + { + "epoch": 2.6284584980237153, + "grad_norm": 2.780848503112793, + "learning_rate": 3.700396825396826e-05, + "loss": 1.3804, + "step": 665 + }, + { + "epoch": 2.632411067193676, + "grad_norm": 2.919987440109253, + "learning_rate": 3.6984126984126986e-05, + "loss": 1.4946, + "step": 666 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 2.093374252319336, + "learning_rate": 3.696428571428572e-05, + "loss": 1.3501, + "step": 667 + }, + { + "epoch": 2.6403162055335967, + "grad_norm": 3.430272102355957, + "learning_rate": 3.694444444444445e-05, + "loss": 0.956, + "step": 668 + }, + { + "epoch": 2.6442687747035576, + "grad_norm": 2.6946187019348145, + "learning_rate": 3.692460317460318e-05, + "loss": 1.3491, + "step": 669 + }, + { + "epoch": 2.6482213438735176, + "grad_norm": 2.1861889362335205, + "learning_rate": 3.690476190476191e-05, + "loss": 1.3425, + "step": 670 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 2.343151569366455, + "learning_rate": 3.688492063492064e-05, + "loss": 1.3148, + "step": 671 + }, + { + "epoch": 2.6561264822134385, + "grad_norm": 2.8744852542877197, + "learning_rate": 3.686507936507937e-05, + "loss": 1.1524, + "step": 672 + }, + { + "epoch": 2.6600790513833994, + "grad_norm": 2.830291271209717, + "learning_rate": 3.68452380952381e-05, + "loss": 1.5512, + "step": 673 + }, + { + "epoch": 2.66403162055336, + "grad_norm": 2.655128002166748, + "learning_rate": 3.682539682539683e-05, + "loss": 1.2045, + "step": 674 + }, + { + "epoch": 2.6679841897233203, + "grad_norm": 2.4060311317443848, + "learning_rate": 3.6805555555555556e-05, + "loss": 1.0609, + "step": 675 + }, + { + "epoch": 2.6719367588932808, + "grad_norm": 2.0129261016845703, + "learning_rate": 3.678571428571429e-05, + "loss": 1.2873, + "step": 676 + }, + { + "epoch": 2.675889328063241, + "grad_norm": 2.645951271057129, + "learning_rate": 3.676587301587302e-05, + "loss": 1.1951, + "step": 677 + }, + { + "epoch": 2.6798418972332017, + "grad_norm": 2.4416396617889404, + "learning_rate": 3.674603174603175e-05, + "loss": 1.3103, + "step": 678 + }, + { + "epoch": 2.683794466403162, + "grad_norm": 3.599039077758789, + "learning_rate": 3.672619047619048e-05, + "loss": 1.4431, + "step": 679 + }, + { + "epoch": 2.6877470355731226, + "grad_norm": 2.941194534301758, + "learning_rate": 3.6706349206349205e-05, + "loss": 1.0918, + "step": 680 + }, + { + "epoch": 2.691699604743083, + "grad_norm": 2.7332816123962402, + "learning_rate": 3.668650793650794e-05, + "loss": 1.3429, + "step": 681 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 3.3440287113189697, + "learning_rate": 3.6666666666666666e-05, + "loss": 1.4627, + "step": 682 + }, + { + "epoch": 2.699604743083004, + "grad_norm": 2.5131101608276367, + "learning_rate": 3.664682539682539e-05, + "loss": 1.5036, + "step": 683 + }, + { + "epoch": 2.7035573122529644, + "grad_norm": 2.991786241531372, + "learning_rate": 3.6626984126984126e-05, + "loss": 1.8342, + "step": 684 + }, + { + "epoch": 2.707509881422925, + "grad_norm": 2.5615289211273193, + "learning_rate": 3.6607142857142853e-05, + "loss": 1.3832, + "step": 685 + }, + { + "epoch": 2.7114624505928853, + "grad_norm": 2.467275857925415, + "learning_rate": 3.658730158730159e-05, + "loss": 1.2683, + "step": 686 + }, + { + "epoch": 2.7154150197628457, + "grad_norm": 2.156137228012085, + "learning_rate": 3.6567460317460314e-05, + "loss": 1.2575, + "step": 687 + }, + { + "epoch": 2.719367588932806, + "grad_norm": 1.868895411491394, + "learning_rate": 3.654761904761905e-05, + "loss": 1.4412, + "step": 688 + }, + { + "epoch": 2.7233201581027666, + "grad_norm": 2.4768173694610596, + "learning_rate": 3.6527777777777775e-05, + "loss": 1.2711, + "step": 689 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 3.286557912826538, + "learning_rate": 3.650793650793651e-05, + "loss": 1.4438, + "step": 690 + }, + { + "epoch": 2.7312252964426875, + "grad_norm": 2.219813823699951, + "learning_rate": 3.6488095238095236e-05, + "loss": 1.309, + "step": 691 + }, + { + "epoch": 2.7351778656126484, + "grad_norm": 2.425921678543091, + "learning_rate": 3.646825396825397e-05, + "loss": 1.4898, + "step": 692 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 2.3334920406341553, + "learning_rate": 3.6448412698412696e-05, + "loss": 1.3917, + "step": 693 + }, + { + "epoch": 2.7430830039525693, + "grad_norm": 2.4505045413970947, + "learning_rate": 3.642857142857143e-05, + "loss": 1.2184, + "step": 694 + }, + { + "epoch": 2.7470355731225298, + "grad_norm": 2.223360538482666, + "learning_rate": 3.640873015873016e-05, + "loss": 1.4157, + "step": 695 + }, + { + "epoch": 2.7509881422924902, + "grad_norm": 2.872119665145874, + "learning_rate": 3.638888888888889e-05, + "loss": 1.1279, + "step": 696 + }, + { + "epoch": 2.7549407114624507, + "grad_norm": 2.502791404724121, + "learning_rate": 3.636904761904762e-05, + "loss": 1.1267, + "step": 697 + }, + { + "epoch": 2.758893280632411, + "grad_norm": 2.880258083343506, + "learning_rate": 3.634920634920635e-05, + "loss": 1.2936, + "step": 698 + }, + { + "epoch": 2.7628458498023716, + "grad_norm": 2.5304715633392334, + "learning_rate": 3.632936507936508e-05, + "loss": 1.0587, + "step": 699 + }, + { + "epoch": 2.766798418972332, + "grad_norm": 2.664189338684082, + "learning_rate": 3.630952380952381e-05, + "loss": 1.4785, + "step": 700 + }, + { + "epoch": 2.7707509881422925, + "grad_norm": 2.274841070175171, + "learning_rate": 3.628968253968254e-05, + "loss": 1.3508, + "step": 701 + }, + { + "epoch": 2.774703557312253, + "grad_norm": 2.5693907737731934, + "learning_rate": 3.626984126984127e-05, + "loss": 1.3676, + "step": 702 + }, + { + "epoch": 2.7786561264822134, + "grad_norm": 2.505685806274414, + "learning_rate": 3.625e-05, + "loss": 1.3922, + "step": 703 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 2.220625400543213, + "learning_rate": 3.6230158730158734e-05, + "loss": 1.2811, + "step": 704 + }, + { + "epoch": 2.7865612648221343, + "grad_norm": 2.694793939590454, + "learning_rate": 3.621031746031746e-05, + "loss": 1.4419, + "step": 705 + }, + { + "epoch": 2.7905138339920947, + "grad_norm": 2.996187925338745, + "learning_rate": 3.619047619047619e-05, + "loss": 1.4447, + "step": 706 + }, + { + "epoch": 2.794466403162055, + "grad_norm": 2.177954912185669, + "learning_rate": 3.617063492063492e-05, + "loss": 1.3629, + "step": 707 + }, + { + "epoch": 2.7984189723320156, + "grad_norm": 2.743326187133789, + "learning_rate": 3.615079365079365e-05, + "loss": 1.4397, + "step": 708 + }, + { + "epoch": 2.8023715415019765, + "grad_norm": 2.6118264198303223, + "learning_rate": 3.613095238095238e-05, + "loss": 1.1212, + "step": 709 + }, + { + "epoch": 2.8063241106719365, + "grad_norm": 2.1388731002807617, + "learning_rate": 3.611111111111111e-05, + "loss": 1.5192, + "step": 710 + }, + { + "epoch": 2.8102766798418974, + "grad_norm": 2.721480131149292, + "learning_rate": 3.609126984126984e-05, + "loss": 1.579, + "step": 711 + }, + { + "epoch": 2.8142292490118574, + "grad_norm": 2.726956844329834, + "learning_rate": 3.607142857142857e-05, + "loss": 1.6523, + "step": 712 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 2.369964599609375, + "learning_rate": 3.6051587301587304e-05, + "loss": 1.3389, + "step": 713 + }, + { + "epoch": 2.822134387351779, + "grad_norm": 2.3911476135253906, + "learning_rate": 3.603174603174603e-05, + "loss": 1.4589, + "step": 714 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 2.717634916305542, + "learning_rate": 3.6011904761904765e-05, + "loss": 1.4241, + "step": 715 + }, + { + "epoch": 2.8300395256916997, + "grad_norm": 2.3168764114379883, + "learning_rate": 3.599206349206349e-05, + "loss": 1.3172, + "step": 716 + }, + { + "epoch": 2.83399209486166, + "grad_norm": 2.34859037399292, + "learning_rate": 3.5972222222222225e-05, + "loss": 1.2871, + "step": 717 + }, + { + "epoch": 2.8379446640316206, + "grad_norm": 2.4705185890197754, + "learning_rate": 3.595238095238095e-05, + "loss": 1.524, + "step": 718 + }, + { + "epoch": 2.841897233201581, + "grad_norm": 2.9651718139648438, + "learning_rate": 3.5932539682539686e-05, + "loss": 1.4915, + "step": 719 + }, + { + "epoch": 2.8458498023715415, + "grad_norm": 2.442030668258667, + "learning_rate": 3.591269841269841e-05, + "loss": 1.5001, + "step": 720 + }, + { + "epoch": 2.849802371541502, + "grad_norm": 4.075084686279297, + "learning_rate": 3.589285714285715e-05, + "loss": 0.9039, + "step": 721 + }, + { + "epoch": 2.8537549407114624, + "grad_norm": 3.0871315002441406, + "learning_rate": 3.5873015873015874e-05, + "loss": 1.6743, + "step": 722 + }, + { + "epoch": 2.857707509881423, + "grad_norm": 2.6841776371002197, + "learning_rate": 3.585317460317461e-05, + "loss": 1.1567, + "step": 723 + }, + { + "epoch": 2.8616600790513833, + "grad_norm": 2.2139689922332764, + "learning_rate": 3.5833333333333335e-05, + "loss": 1.4276, + "step": 724 + }, + { + "epoch": 2.8656126482213438, + "grad_norm": 2.8722684383392334, + "learning_rate": 3.581349206349207e-05, + "loss": 1.2222, + "step": 725 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 3.5767440795898438, + "learning_rate": 3.5793650793650795e-05, + "loss": 1.421, + "step": 726 + }, + { + "epoch": 2.8735177865612647, + "grad_norm": 3.3746676445007324, + "learning_rate": 3.577380952380953e-05, + "loss": 1.9383, + "step": 727 + }, + { + "epoch": 2.8774703557312256, + "grad_norm": 3.810504913330078, + "learning_rate": 3.5753968253968256e-05, + "loss": 1.0087, + "step": 728 + }, + { + "epoch": 2.8814229249011856, + "grad_norm": 2.2538998126983643, + "learning_rate": 3.573412698412698e-05, + "loss": 1.4981, + "step": 729 + }, + { + "epoch": 2.8853754940711465, + "grad_norm": 2.282931327819824, + "learning_rate": 3.571428571428572e-05, + "loss": 1.2655, + "step": 730 + }, + { + "epoch": 2.8893280632411065, + "grad_norm": 2.3477354049682617, + "learning_rate": 3.5694444444444444e-05, + "loss": 1.4641, + "step": 731 + }, + { + "epoch": 2.8932806324110674, + "grad_norm": 2.141794204711914, + "learning_rate": 3.567460317460318e-05, + "loss": 1.5262, + "step": 732 + }, + { + "epoch": 2.897233201581028, + "grad_norm": 2.0820016860961914, + "learning_rate": 3.5654761904761905e-05, + "loss": 1.3743, + "step": 733 + }, + { + "epoch": 2.9011857707509883, + "grad_norm": 2.0662853717803955, + "learning_rate": 3.563492063492064e-05, + "loss": 1.3662, + "step": 734 + }, + { + "epoch": 2.9051383399209487, + "grad_norm": 2.205324411392212, + "learning_rate": 3.5615079365079365e-05, + "loss": 1.4079, + "step": 735 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 2.201070785522461, + "learning_rate": 3.55952380952381e-05, + "loss": 1.1008, + "step": 736 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 2.159719944000244, + "learning_rate": 3.5575396825396826e-05, + "loss": 1.2646, + "step": 737 + }, + { + "epoch": 2.91699604743083, + "grad_norm": 2.3559587001800537, + "learning_rate": 3.555555555555556e-05, + "loss": 1.4704, + "step": 738 + }, + { + "epoch": 2.9209486166007905, + "grad_norm": 3.3258392810821533, + "learning_rate": 3.553571428571429e-05, + "loss": 1.1226, + "step": 739 + }, + { + "epoch": 2.924901185770751, + "grad_norm": 2.6636176109313965, + "learning_rate": 3.551587301587302e-05, + "loss": 1.3304, + "step": 740 + }, + { + "epoch": 2.9288537549407114, + "grad_norm": 2.8491475582122803, + "learning_rate": 3.549603174603175e-05, + "loss": 1.4011, + "step": 741 + }, + { + "epoch": 2.932806324110672, + "grad_norm": 3.1546342372894287, + "learning_rate": 3.547619047619048e-05, + "loss": 1.0591, + "step": 742 + }, + { + "epoch": 2.9367588932806323, + "grad_norm": 3.2184884548187256, + "learning_rate": 3.545634920634921e-05, + "loss": 1.5376, + "step": 743 + }, + { + "epoch": 2.940711462450593, + "grad_norm": 2.49813175201416, + "learning_rate": 3.543650793650794e-05, + "loss": 1.2826, + "step": 744 + }, + { + "epoch": 2.9446640316205532, + "grad_norm": 2.511040210723877, + "learning_rate": 3.541666666666667e-05, + "loss": 1.4989, + "step": 745 + }, + { + "epoch": 2.9486166007905137, + "grad_norm": 2.233522415161133, + "learning_rate": 3.53968253968254e-05, + "loss": 1.1618, + "step": 746 + }, + { + "epoch": 2.9525691699604746, + "grad_norm": 2.288743495941162, + "learning_rate": 3.537698412698413e-05, + "loss": 0.9788, + "step": 747 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 3.0124926567077637, + "learning_rate": 3.5357142857142864e-05, + "loss": 0.9291, + "step": 748 + }, + { + "epoch": 2.9604743083003955, + "grad_norm": 3.1643810272216797, + "learning_rate": 3.533730158730159e-05, + "loss": 0.789, + "step": 749 + }, + { + "epoch": 2.9644268774703555, + "grad_norm": 2.8564984798431396, + "learning_rate": 3.5317460317460324e-05, + "loss": 1.4296, + "step": 750 + }, + { + "epoch": 2.9683794466403164, + "grad_norm": 2.2462079524993896, + "learning_rate": 3.529761904761905e-05, + "loss": 1.275, + "step": 751 + }, + { + "epoch": 2.972332015810277, + "grad_norm": 2.5663106441497803, + "learning_rate": 3.527777777777778e-05, + "loss": 1.7537, + "step": 752 + }, + { + "epoch": 2.9762845849802373, + "grad_norm": 2.8393218517303467, + "learning_rate": 3.525793650793651e-05, + "loss": 1.6171, + "step": 753 + }, + { + "epoch": 2.9802371541501977, + "grad_norm": 2.4854910373687744, + "learning_rate": 3.523809523809524e-05, + "loss": 1.4017, + "step": 754 + }, + { + "epoch": 2.984189723320158, + "grad_norm": 2.6834123134613037, + "learning_rate": 3.521825396825397e-05, + "loss": 1.4678, + "step": 755 + }, + { + "epoch": 2.9881422924901186, + "grad_norm": 2.4412403106689453, + "learning_rate": 3.51984126984127e-05, + "loss": 1.2793, + "step": 756 + }, + { + "epoch": 2.992094861660079, + "grad_norm": 2.7471697330474854, + "learning_rate": 3.5178571428571434e-05, + "loss": 1.4015, + "step": 757 + }, + { + "epoch": 2.9960474308300395, + "grad_norm": 2.6294939517974854, + "learning_rate": 3.515873015873016e-05, + "loss": 1.4328, + "step": 758 + }, + { + "epoch": 3.0, + "grad_norm": 2.844717502593994, + "learning_rate": 3.513888888888889e-05, + "loss": 1.2477, + "step": 759 + }, + { + "epoch": 3.0039525691699605, + "grad_norm": 2.496687173843384, + "learning_rate": 3.511904761904762e-05, + "loss": 0.9741, + "step": 760 + }, + { + "epoch": 3.007905138339921, + "grad_norm": 2.5280258655548096, + "learning_rate": 3.509920634920635e-05, + "loss": 0.8558, + "step": 761 + }, + { + "epoch": 3.0118577075098814, + "grad_norm": 2.257899522781372, + "learning_rate": 3.5079365079365075e-05, + "loss": 1.1979, + "step": 762 + }, + { + "epoch": 3.015810276679842, + "grad_norm": 2.1623339653015137, + "learning_rate": 3.505952380952381e-05, + "loss": 0.9799, + "step": 763 + }, + { + "epoch": 3.0197628458498023, + "grad_norm": 2.9412381649017334, + "learning_rate": 3.5039682539682536e-05, + "loss": 0.8915, + "step": 764 + }, + { + "epoch": 3.0237154150197627, + "grad_norm": 3.138514995574951, + "learning_rate": 3.501984126984127e-05, + "loss": 0.8636, + "step": 765 + }, + { + "epoch": 3.027667984189723, + "grad_norm": 4.3793416023254395, + "learning_rate": 3.5e-05, + "loss": 0.7668, + "step": 766 + }, + { + "epoch": 3.0316205533596836, + "grad_norm": 4.136168003082275, + "learning_rate": 3.498015873015873e-05, + "loss": 0.9277, + "step": 767 + }, + { + "epoch": 3.035573122529644, + "grad_norm": 4.092403888702393, + "learning_rate": 3.496031746031746e-05, + "loss": 1.1269, + "step": 768 + }, + { + "epoch": 3.039525691699605, + "grad_norm": 4.120564937591553, + "learning_rate": 3.494047619047619e-05, + "loss": 0.9644, + "step": 769 + }, + { + "epoch": 3.0434782608695654, + "grad_norm": 2.608088254928589, + "learning_rate": 3.492063492063492e-05, + "loss": 0.8488, + "step": 770 + }, + { + "epoch": 3.047430830039526, + "grad_norm": 3.257272481918335, + "learning_rate": 3.490079365079365e-05, + "loss": 0.8844, + "step": 771 + }, + { + "epoch": 3.0513833992094863, + "grad_norm": 3.9354476928710938, + "learning_rate": 3.488095238095238e-05, + "loss": 0.9306, + "step": 772 + }, + { + "epoch": 3.0553359683794468, + "grad_norm": 3.038783550262451, + "learning_rate": 3.486111111111111e-05, + "loss": 0.9639, + "step": 773 + }, + { + "epoch": 3.059288537549407, + "grad_norm": 2.9691848754882812, + "learning_rate": 3.484126984126984e-05, + "loss": 1.0708, + "step": 774 + }, + { + "epoch": 3.0632411067193677, + "grad_norm": 2.9549548625946045, + "learning_rate": 3.4821428571428574e-05, + "loss": 0.6093, + "step": 775 + }, + { + "epoch": 3.067193675889328, + "grad_norm": 2.429551839828491, + "learning_rate": 3.48015873015873e-05, + "loss": 0.9417, + "step": 776 + }, + { + "epoch": 3.0711462450592886, + "grad_norm": 3.1636385917663574, + "learning_rate": 3.4781746031746034e-05, + "loss": 0.7358, + "step": 777 + }, + { + "epoch": 3.075098814229249, + "grad_norm": 2.7946548461914062, + "learning_rate": 3.476190476190476e-05, + "loss": 1.0406, + "step": 778 + }, + { + "epoch": 3.0790513833992095, + "grad_norm": 2.363107442855835, + "learning_rate": 3.4742063492063495e-05, + "loss": 0.7761, + "step": 779 + }, + { + "epoch": 3.08300395256917, + "grad_norm": 2.957887649536133, + "learning_rate": 3.472222222222222e-05, + "loss": 0.947, + "step": 780 + }, + { + "epoch": 3.0869565217391304, + "grad_norm": 2.936105489730835, + "learning_rate": 3.4702380952380956e-05, + "loss": 0.9311, + "step": 781 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 3.723759174346924, + "learning_rate": 3.468253968253968e-05, + "loss": 0.7467, + "step": 782 + }, + { + "epoch": 3.0948616600790513, + "grad_norm": 2.8762381076812744, + "learning_rate": 3.466269841269842e-05, + "loss": 1.078, + "step": 783 + }, + { + "epoch": 3.0988142292490117, + "grad_norm": 3.5455434322357178, + "learning_rate": 3.4642857142857144e-05, + "loss": 0.9024, + "step": 784 + }, + { + "epoch": 3.102766798418972, + "grad_norm": 3.4952311515808105, + "learning_rate": 3.462301587301587e-05, + "loss": 1.1114, + "step": 785 + }, + { + "epoch": 3.1067193675889326, + "grad_norm": 3.0181546211242676, + "learning_rate": 3.4603174603174604e-05, + "loss": 1.0401, + "step": 786 + }, + { + "epoch": 3.110671936758893, + "grad_norm": 2.7165963649749756, + "learning_rate": 3.458333333333333e-05, + "loss": 1.0207, + "step": 787 + }, + { + "epoch": 3.1146245059288535, + "grad_norm": 3.077030897140503, + "learning_rate": 3.4563492063492065e-05, + "loss": 0.8701, + "step": 788 + }, + { + "epoch": 3.1185770750988144, + "grad_norm": 2.9918665885925293, + "learning_rate": 3.454365079365079e-05, + "loss": 1.2665, + "step": 789 + }, + { + "epoch": 3.122529644268775, + "grad_norm": 3.622313976287842, + "learning_rate": 3.4523809523809526e-05, + "loss": 1.1394, + "step": 790 + }, + { + "epoch": 3.1264822134387353, + "grad_norm": 2.705162286758423, + "learning_rate": 3.450396825396825e-05, + "loss": 0.8924, + "step": 791 + }, + { + "epoch": 3.130434782608696, + "grad_norm": 3.595820426940918, + "learning_rate": 3.448412698412699e-05, + "loss": 1.057, + "step": 792 + }, + { + "epoch": 3.1343873517786562, + "grad_norm": 3.4480679035186768, + "learning_rate": 3.4464285714285714e-05, + "loss": 0.8167, + "step": 793 + }, + { + "epoch": 3.1383399209486167, + "grad_norm": 2.8485183715820312, + "learning_rate": 3.444444444444445e-05, + "loss": 0.8327, + "step": 794 + }, + { + "epoch": 3.142292490118577, + "grad_norm": 3.1256802082061768, + "learning_rate": 3.4424603174603174e-05, + "loss": 1.0551, + "step": 795 + }, + { + "epoch": 3.1462450592885376, + "grad_norm": 3.3785347938537598, + "learning_rate": 3.440476190476191e-05, + "loss": 1.2833, + "step": 796 + }, + { + "epoch": 3.150197628458498, + "grad_norm": 2.9685938358306885, + "learning_rate": 3.4384920634920635e-05, + "loss": 1.0435, + "step": 797 + }, + { + "epoch": 3.1541501976284585, + "grad_norm": 2.9943830966949463, + "learning_rate": 3.436507936507937e-05, + "loss": 0.7879, + "step": 798 + }, + { + "epoch": 3.158102766798419, + "grad_norm": 2.655322790145874, + "learning_rate": 3.4345238095238096e-05, + "loss": 0.8366, + "step": 799 + }, + { + "epoch": 3.1620553359683794, + "grad_norm": 3.5305442810058594, + "learning_rate": 3.432539682539683e-05, + "loss": 0.8311, + "step": 800 + }, + { + "epoch": 3.16600790513834, + "grad_norm": 2.946333169937134, + "learning_rate": 3.430555555555556e-05, + "loss": 1.037, + "step": 801 + }, + { + "epoch": 3.1699604743083003, + "grad_norm": 3.5695722103118896, + "learning_rate": 3.428571428571429e-05, + "loss": 0.7785, + "step": 802 + }, + { + "epoch": 3.1739130434782608, + "grad_norm": 3.318615436553955, + "learning_rate": 3.426587301587302e-05, + "loss": 0.8239, + "step": 803 + }, + { + "epoch": 3.177865612648221, + "grad_norm": 3.4505763053894043, + "learning_rate": 3.424603174603175e-05, + "loss": 0.8528, + "step": 804 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 3.004587411880493, + "learning_rate": 3.422619047619048e-05, + "loss": 1.123, + "step": 805 + }, + { + "epoch": 3.185770750988142, + "grad_norm": 3.96537709236145, + "learning_rate": 3.420634920634921e-05, + "loss": 0.9566, + "step": 806 + }, + { + "epoch": 3.1897233201581026, + "grad_norm": 3.715635061264038, + "learning_rate": 3.418650793650794e-05, + "loss": 0.8639, + "step": 807 + }, + { + "epoch": 3.1936758893280635, + "grad_norm": 2.9800610542297363, + "learning_rate": 3.4166666666666666e-05, + "loss": 0.9108, + "step": 808 + }, + { + "epoch": 3.197628458498024, + "grad_norm": 3.7264318466186523, + "learning_rate": 3.41468253968254e-05, + "loss": 1.0782, + "step": 809 + }, + { + "epoch": 3.2015810276679844, + "grad_norm": 2.8650896549224854, + "learning_rate": 3.412698412698413e-05, + "loss": 0.7647, + "step": 810 + }, + { + "epoch": 3.205533596837945, + "grad_norm": 4.19207763671875, + "learning_rate": 3.410714285714286e-05, + "loss": 1.0105, + "step": 811 + }, + { + "epoch": 3.2094861660079053, + "grad_norm": 2.8715367317199707, + "learning_rate": 3.408730158730159e-05, + "loss": 1.0023, + "step": 812 + }, + { + "epoch": 3.2134387351778657, + "grad_norm": 5.560529708862305, + "learning_rate": 3.406746031746032e-05, + "loss": 0.8081, + "step": 813 + }, + { + "epoch": 3.217391304347826, + "grad_norm": 3.2364330291748047, + "learning_rate": 3.404761904761905e-05, + "loss": 0.8591, + "step": 814 + }, + { + "epoch": 3.2213438735177866, + "grad_norm": 2.6285791397094727, + "learning_rate": 3.402777777777778e-05, + "loss": 1.0187, + "step": 815 + }, + { + "epoch": 3.225296442687747, + "grad_norm": 3.7776012420654297, + "learning_rate": 3.400793650793651e-05, + "loss": 0.819, + "step": 816 + }, + { + "epoch": 3.2292490118577075, + "grad_norm": 2.7896721363067627, + "learning_rate": 3.398809523809524e-05, + "loss": 0.975, + "step": 817 + }, + { + "epoch": 3.233201581027668, + "grad_norm": 2.8350744247436523, + "learning_rate": 3.396825396825397e-05, + "loss": 1.3311, + "step": 818 + }, + { + "epoch": 3.2371541501976284, + "grad_norm": 3.2776432037353516, + "learning_rate": 3.3948412698412703e-05, + "loss": 0.8914, + "step": 819 + }, + { + "epoch": 3.241106719367589, + "grad_norm": 2.9362752437591553, + "learning_rate": 3.392857142857143e-05, + "loss": 1.0549, + "step": 820 + }, + { + "epoch": 3.2450592885375493, + "grad_norm": 2.536288022994995, + "learning_rate": 3.3908730158730164e-05, + "loss": 0.8083, + "step": 821 + }, + { + "epoch": 3.2490118577075098, + "grad_norm": 3.4325833320617676, + "learning_rate": 3.388888888888889e-05, + "loss": 0.9863, + "step": 822 + }, + { + "epoch": 3.2529644268774702, + "grad_norm": 2.966909646987915, + "learning_rate": 3.3869047619047625e-05, + "loss": 0.9928, + "step": 823 + }, + { + "epoch": 3.2569169960474307, + "grad_norm": 3.627739191055298, + "learning_rate": 3.384920634920635e-05, + "loss": 1.0572, + "step": 824 + }, + { + "epoch": 3.260869565217391, + "grad_norm": 3.5908288955688477, + "learning_rate": 3.3829365079365086e-05, + "loss": 1.1389, + "step": 825 + }, + { + "epoch": 3.2648221343873516, + "grad_norm": 2.7947778701782227, + "learning_rate": 3.380952380952381e-05, + "loss": 0.9508, + "step": 826 + }, + { + "epoch": 3.2687747035573125, + "grad_norm": 2.9585814476013184, + "learning_rate": 3.3789682539682546e-05, + "loss": 0.8809, + "step": 827 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 2.968035936355591, + "learning_rate": 3.3769841269841273e-05, + "loss": 1.0937, + "step": 828 + }, + { + "epoch": 3.2766798418972334, + "grad_norm": 4.545178413391113, + "learning_rate": 3.375000000000001e-05, + "loss": 1.3998, + "step": 829 + }, + { + "epoch": 3.280632411067194, + "grad_norm": 3.4058380126953125, + "learning_rate": 3.3730158730158734e-05, + "loss": 1.269, + "step": 830 + }, + { + "epoch": 3.2845849802371543, + "grad_norm": 3.8424315452575684, + "learning_rate": 3.371031746031746e-05, + "loss": 0.7128, + "step": 831 + }, + { + "epoch": 3.2885375494071147, + "grad_norm": 2.7813680171966553, + "learning_rate": 3.3690476190476195e-05, + "loss": 0.8108, + "step": 832 + }, + { + "epoch": 3.292490118577075, + "grad_norm": 2.6684579849243164, + "learning_rate": 3.367063492063492e-05, + "loss": 0.8819, + "step": 833 + }, + { + "epoch": 3.2964426877470356, + "grad_norm": 3.1254680156707764, + "learning_rate": 3.3650793650793656e-05, + "loss": 0.9083, + "step": 834 + }, + { + "epoch": 3.300395256916996, + "grad_norm": 2.6201884746551514, + "learning_rate": 3.363095238095238e-05, + "loss": 0.751, + "step": 835 + }, + { + "epoch": 3.3043478260869565, + "grad_norm": 2.976027250289917, + "learning_rate": 3.3611111111111116e-05, + "loss": 0.7687, + "step": 836 + }, + { + "epoch": 3.308300395256917, + "grad_norm": 4.267848968505859, + "learning_rate": 3.3591269841269843e-05, + "loss": 0.8482, + "step": 837 + }, + { + "epoch": 3.3122529644268774, + "grad_norm": 4.229562759399414, + "learning_rate": 3.357142857142857e-05, + "loss": 0.9975, + "step": 838 + }, + { + "epoch": 3.316205533596838, + "grad_norm": 3.5600497722625732, + "learning_rate": 3.35515873015873e-05, + "loss": 1.197, + "step": 839 + }, + { + "epoch": 3.3201581027667983, + "grad_norm": 3.5733120441436768, + "learning_rate": 3.353174603174603e-05, + "loss": 0.9723, + "step": 840 + }, + { + "epoch": 3.324110671936759, + "grad_norm": 4.316638469696045, + "learning_rate": 3.351190476190476e-05, + "loss": 1.0709, + "step": 841 + }, + { + "epoch": 3.3280632411067192, + "grad_norm": 2.8753249645233154, + "learning_rate": 3.349206349206349e-05, + "loss": 1.044, + "step": 842 + }, + { + "epoch": 3.3320158102766797, + "grad_norm": 3.8951146602630615, + "learning_rate": 3.347222222222222e-05, + "loss": 1.2154, + "step": 843 + }, + { + "epoch": 3.33596837944664, + "grad_norm": 2.5250723361968994, + "learning_rate": 3.345238095238095e-05, + "loss": 0.7862, + "step": 844 + }, + { + "epoch": 3.3399209486166006, + "grad_norm": 2.821502447128296, + "learning_rate": 3.343253968253968e-05, + "loss": 1.0455, + "step": 845 + }, + { + "epoch": 3.3438735177865615, + "grad_norm": 2.9029202461242676, + "learning_rate": 3.3412698412698413e-05, + "loss": 1.1058, + "step": 846 + }, + { + "epoch": 3.3478260869565215, + "grad_norm": 2.247542381286621, + "learning_rate": 3.339285714285714e-05, + "loss": 0.9376, + "step": 847 + }, + { + "epoch": 3.3517786561264824, + "grad_norm": 4.182517051696777, + "learning_rate": 3.3373015873015874e-05, + "loss": 0.7921, + "step": 848 + }, + { + "epoch": 3.355731225296443, + "grad_norm": 2.482083797454834, + "learning_rate": 3.33531746031746e-05, + "loss": 0.9948, + "step": 849 + }, + { + "epoch": 3.3596837944664033, + "grad_norm": 2.999898672103882, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.8631, + "step": 850 + }, + { + "epoch": 3.3636363636363638, + "grad_norm": 4.035496234893799, + "learning_rate": 3.331349206349206e-05, + "loss": 0.8496, + "step": 851 + }, + { + "epoch": 3.367588932806324, + "grad_norm": 3.46382737159729, + "learning_rate": 3.3293650793650796e-05, + "loss": 0.7815, + "step": 852 + }, + { + "epoch": 3.3715415019762847, + "grad_norm": 2.803903818130493, + "learning_rate": 3.327380952380952e-05, + "loss": 0.8237, + "step": 853 + }, + { + "epoch": 3.375494071146245, + "grad_norm": 3.75998592376709, + "learning_rate": 3.3253968253968256e-05, + "loss": 1.183, + "step": 854 + }, + { + "epoch": 3.3794466403162056, + "grad_norm": 3.8531124591827393, + "learning_rate": 3.3234126984126983e-05, + "loss": 1.105, + "step": 855 + }, + { + "epoch": 3.383399209486166, + "grad_norm": 4.231212615966797, + "learning_rate": 3.321428571428572e-05, + "loss": 0.8909, + "step": 856 + }, + { + "epoch": 3.3873517786561265, + "grad_norm": 3.3275704383850098, + "learning_rate": 3.3194444444444444e-05, + "loss": 0.8102, + "step": 857 + }, + { + "epoch": 3.391304347826087, + "grad_norm": 3.103105306625366, + "learning_rate": 3.317460317460318e-05, + "loss": 0.6757, + "step": 858 + }, + { + "epoch": 3.3952569169960474, + "grad_norm": 4.611384391784668, + "learning_rate": 3.3154761904761905e-05, + "loss": 1.113, + "step": 859 + }, + { + "epoch": 3.399209486166008, + "grad_norm": 2.466308355331421, + "learning_rate": 3.313492063492064e-05, + "loss": 0.758, + "step": 860 + }, + { + "epoch": 3.4031620553359683, + "grad_norm": 2.4484543800354004, + "learning_rate": 3.3115079365079366e-05, + "loss": 0.9824, + "step": 861 + }, + { + "epoch": 3.4071146245059287, + "grad_norm": 2.8706154823303223, + "learning_rate": 3.309523809523809e-05, + "loss": 0.9201, + "step": 862 + }, + { + "epoch": 3.411067193675889, + "grad_norm": 3.8132359981536865, + "learning_rate": 3.3075396825396826e-05, + "loss": 0.6403, + "step": 863 + }, + { + "epoch": 3.4150197628458496, + "grad_norm": 3.109771251678467, + "learning_rate": 3.3055555555555553e-05, + "loss": 0.756, + "step": 864 + }, + { + "epoch": 3.4189723320158105, + "grad_norm": 3.4301555156707764, + "learning_rate": 3.303571428571429e-05, + "loss": 1.0933, + "step": 865 + }, + { + "epoch": 3.4229249011857705, + "grad_norm": 3.461763620376587, + "learning_rate": 3.3015873015873014e-05, + "loss": 1.2169, + "step": 866 + }, + { + "epoch": 3.4268774703557314, + "grad_norm": 4.107053756713867, + "learning_rate": 3.299603174603175e-05, + "loss": 1.0824, + "step": 867 + }, + { + "epoch": 3.430830039525692, + "grad_norm": 3.434462070465088, + "learning_rate": 3.2976190476190475e-05, + "loss": 1.0065, + "step": 868 + }, + { + "epoch": 3.4347826086956523, + "grad_norm": 3.3090224266052246, + "learning_rate": 3.295634920634921e-05, + "loss": 0.9509, + "step": 869 + }, + { + "epoch": 3.438735177865613, + "grad_norm": 3.5097386837005615, + "learning_rate": 3.2936507936507936e-05, + "loss": 0.8331, + "step": 870 + }, + { + "epoch": 3.4426877470355732, + "grad_norm": 3.1316442489624023, + "learning_rate": 3.291666666666667e-05, + "loss": 0.8634, + "step": 871 + }, + { + "epoch": 3.4466403162055337, + "grad_norm": 2.5135436058044434, + "learning_rate": 3.2896825396825396e-05, + "loss": 0.6734, + "step": 872 + }, + { + "epoch": 3.450592885375494, + "grad_norm": 3.5739667415618896, + "learning_rate": 3.287698412698413e-05, + "loss": 0.8149, + "step": 873 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 2.84611439704895, + "learning_rate": 3.285714285714286e-05, + "loss": 0.8658, + "step": 874 + }, + { + "epoch": 3.458498023715415, + "grad_norm": 2.680215835571289, + "learning_rate": 3.283730158730159e-05, + "loss": 1.1295, + "step": 875 + }, + { + "epoch": 3.4624505928853755, + "grad_norm": 3.028846263885498, + "learning_rate": 3.281746031746032e-05, + "loss": 1.0244, + "step": 876 + }, + { + "epoch": 3.466403162055336, + "grad_norm": 2.5386555194854736, + "learning_rate": 3.279761904761905e-05, + "loss": 1.0598, + "step": 877 + }, + { + "epoch": 3.4703557312252964, + "grad_norm": 2.7928504943847656, + "learning_rate": 3.277777777777778e-05, + "loss": 1.181, + "step": 878 + }, + { + "epoch": 3.474308300395257, + "grad_norm": 3.864605665206909, + "learning_rate": 3.275793650793651e-05, + "loss": 0.9425, + "step": 879 + }, + { + "epoch": 3.4782608695652173, + "grad_norm": 2.8199121952056885, + "learning_rate": 3.273809523809524e-05, + "loss": 1.0021, + "step": 880 + }, + { + "epoch": 3.4822134387351777, + "grad_norm": 3.323481321334839, + "learning_rate": 3.271825396825397e-05, + "loss": 1.141, + "step": 881 + }, + { + "epoch": 3.486166007905138, + "grad_norm": 3.6544790267944336, + "learning_rate": 3.26984126984127e-05, + "loss": 0.9603, + "step": 882 + }, + { + "epoch": 3.4901185770750986, + "grad_norm": 3.930521011352539, + "learning_rate": 3.2678571428571434e-05, + "loss": 0.9759, + "step": 883 + }, + { + "epoch": 3.494071146245059, + "grad_norm": 2.638461112976074, + "learning_rate": 3.265873015873016e-05, + "loss": 1.0179, + "step": 884 + }, + { + "epoch": 3.4980237154150196, + "grad_norm": 3.1710309982299805, + "learning_rate": 3.263888888888889e-05, + "loss": 0.9966, + "step": 885 + }, + { + "epoch": 3.5019762845849804, + "grad_norm": 4.251243591308594, + "learning_rate": 3.261904761904762e-05, + "loss": 0.9437, + "step": 886 + }, + { + "epoch": 3.5059288537549405, + "grad_norm": 3.0481278896331787, + "learning_rate": 3.259920634920635e-05, + "loss": 0.99, + "step": 887 + }, + { + "epoch": 3.5098814229249014, + "grad_norm": 2.843862295150757, + "learning_rate": 3.257936507936508e-05, + "loss": 1.057, + "step": 888 + }, + { + "epoch": 3.513833992094862, + "grad_norm": 2.6303539276123047, + "learning_rate": 3.255952380952381e-05, + "loss": 1.0664, + "step": 889 + }, + { + "epoch": 3.5177865612648223, + "grad_norm": 2.792405128479004, + "learning_rate": 3.253968253968254e-05, + "loss": 1.0506, + "step": 890 + }, + { + "epoch": 3.5217391304347827, + "grad_norm": 5.069064140319824, + "learning_rate": 3.251984126984127e-05, + "loss": 0.815, + "step": 891 + }, + { + "epoch": 3.525691699604743, + "grad_norm": 2.9551217555999756, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.1707, + "step": 892 + }, + { + "epoch": 3.5296442687747036, + "grad_norm": 3.7232065200805664, + "learning_rate": 3.248015873015873e-05, + "loss": 1.0063, + "step": 893 + }, + { + "epoch": 3.533596837944664, + "grad_norm": 3.990532875061035, + "learning_rate": 3.2460317460317465e-05, + "loss": 0.8298, + "step": 894 + }, + { + "epoch": 3.5375494071146245, + "grad_norm": 4.859503269195557, + "learning_rate": 3.244047619047619e-05, + "loss": 0.8827, + "step": 895 + }, + { + "epoch": 3.541501976284585, + "grad_norm": 3.32344913482666, + "learning_rate": 3.2420634920634925e-05, + "loss": 0.8536, + "step": 896 + }, + { + "epoch": 3.5454545454545454, + "grad_norm": 3.2624435424804688, + "learning_rate": 3.240079365079365e-05, + "loss": 1.2324, + "step": 897 + }, + { + "epoch": 3.549407114624506, + "grad_norm": 4.429110050201416, + "learning_rate": 3.2380952380952386e-05, + "loss": 1.1154, + "step": 898 + }, + { + "epoch": 3.5533596837944663, + "grad_norm": 3.8388755321502686, + "learning_rate": 3.236111111111111e-05, + "loss": 1.2359, + "step": 899 + }, + { + "epoch": 3.5573122529644268, + "grad_norm": 3.2584800720214844, + "learning_rate": 3.234126984126985e-05, + "loss": 0.9469, + "step": 900 + }, + { + "epoch": 3.561264822134387, + "grad_norm": 3.2762997150421143, + "learning_rate": 3.2321428571428574e-05, + "loss": 0.7991, + "step": 901 + }, + { + "epoch": 3.5652173913043477, + "grad_norm": 3.214747905731201, + "learning_rate": 3.230158730158731e-05, + "loss": 1.2799, + "step": 902 + }, + { + "epoch": 3.5691699604743086, + "grad_norm": 3.528118133544922, + "learning_rate": 3.2281746031746035e-05, + "loss": 0.9679, + "step": 903 + }, + { + "epoch": 3.5731225296442686, + "grad_norm": 2.9673147201538086, + "learning_rate": 3.226190476190477e-05, + "loss": 1.0996, + "step": 904 + }, + { + "epoch": 3.5770750988142295, + "grad_norm": 2.345867395401001, + "learning_rate": 3.2242063492063495e-05, + "loss": 0.8981, + "step": 905 + }, + { + "epoch": 3.5810276679841895, + "grad_norm": 2.914339065551758, + "learning_rate": 3.222222222222223e-05, + "loss": 1.2681, + "step": 906 + }, + { + "epoch": 3.5849802371541504, + "grad_norm": 2.6776278018951416, + "learning_rate": 3.2202380952380956e-05, + "loss": 0.9516, + "step": 907 + }, + { + "epoch": 3.588932806324111, + "grad_norm": 3.282893180847168, + "learning_rate": 3.218253968253968e-05, + "loss": 1.1996, + "step": 908 + }, + { + "epoch": 3.5928853754940713, + "grad_norm": 2.855419635772705, + "learning_rate": 3.216269841269842e-05, + "loss": 0.9228, + "step": 909 + }, + { + "epoch": 3.5968379446640317, + "grad_norm": 3.372144937515259, + "learning_rate": 3.2142857142857144e-05, + "loss": 1.1176, + "step": 910 + }, + { + "epoch": 3.600790513833992, + "grad_norm": 2.9246695041656494, + "learning_rate": 3.212301587301588e-05, + "loss": 0.8848, + "step": 911 + }, + { + "epoch": 3.6047430830039526, + "grad_norm": 3.258700370788574, + "learning_rate": 3.2103174603174605e-05, + "loss": 1.1232, + "step": 912 + }, + { + "epoch": 3.608695652173913, + "grad_norm": 3.726252794265747, + "learning_rate": 3.208333333333334e-05, + "loss": 0.9653, + "step": 913 + }, + { + "epoch": 3.6126482213438735, + "grad_norm": 3.1601107120513916, + "learning_rate": 3.2063492063492065e-05, + "loss": 1.039, + "step": 914 + }, + { + "epoch": 3.616600790513834, + "grad_norm": 3.5057191848754883, + "learning_rate": 3.20436507936508e-05, + "loss": 0.9399, + "step": 915 + }, + { + "epoch": 3.6205533596837944, + "grad_norm": 3.562908411026001, + "learning_rate": 3.202380952380952e-05, + "loss": 1.0264, + "step": 916 + }, + { + "epoch": 3.624505928853755, + "grad_norm": 3.443648099899292, + "learning_rate": 3.200396825396825e-05, + "loss": 0.8603, + "step": 917 + }, + { + "epoch": 3.6284584980237153, + "grad_norm": 3.095889091491699, + "learning_rate": 3.198412698412698e-05, + "loss": 1.1861, + "step": 918 + }, + { + "epoch": 3.632411067193676, + "grad_norm": 3.4708924293518066, + "learning_rate": 3.1964285714285714e-05, + "loss": 1.1105, + "step": 919 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 2.894131660461426, + "learning_rate": 3.194444444444444e-05, + "loss": 1.07, + "step": 920 + }, + { + "epoch": 3.6403162055335967, + "grad_norm": 4.143770694732666, + "learning_rate": 3.1924603174603175e-05, + "loss": 0.9666, + "step": 921 + }, + { + "epoch": 3.6442687747035576, + "grad_norm": 2.9045121669769287, + "learning_rate": 3.19047619047619e-05, + "loss": 1.0069, + "step": 922 + }, + { + "epoch": 3.6482213438735176, + "grad_norm": 2.5982000827789307, + "learning_rate": 3.1884920634920635e-05, + "loss": 1.2217, + "step": 923 + }, + { + "epoch": 3.6521739130434785, + "grad_norm": 2.874178647994995, + "learning_rate": 3.186507936507936e-05, + "loss": 1.1494, + "step": 924 + }, + { + "epoch": 3.6561264822134385, + "grad_norm": 3.3155198097229004, + "learning_rate": 3.1845238095238096e-05, + "loss": 1.1825, + "step": 925 + }, + { + "epoch": 3.6600790513833994, + "grad_norm": 3.4435360431671143, + "learning_rate": 3.182539682539682e-05, + "loss": 0.7864, + "step": 926 + }, + { + "epoch": 3.66403162055336, + "grad_norm": 3.355100393295288, + "learning_rate": 3.180555555555556e-05, + "loss": 0.8861, + "step": 927 + }, + { + "epoch": 3.6679841897233203, + "grad_norm": 3.6266121864318848, + "learning_rate": 3.1785714285714284e-05, + "loss": 1.0996, + "step": 928 + }, + { + "epoch": 3.6719367588932808, + "grad_norm": 3.77498459815979, + "learning_rate": 3.176587301587302e-05, + "loss": 0.7629, + "step": 929 + }, + { + "epoch": 3.675889328063241, + "grad_norm": 3.8526268005371094, + "learning_rate": 3.1746031746031745e-05, + "loss": 1.1476, + "step": 930 + }, + { + "epoch": 3.6798418972332017, + "grad_norm": 2.970158338546753, + "learning_rate": 3.172619047619048e-05, + "loss": 1.1171, + "step": 931 + }, + { + "epoch": 3.683794466403162, + "grad_norm": 2.934819459915161, + "learning_rate": 3.1706349206349205e-05, + "loss": 0.964, + "step": 932 + }, + { + "epoch": 3.6877470355731226, + "grad_norm": 3.0377979278564453, + "learning_rate": 3.168650793650794e-05, + "loss": 1.1309, + "step": 933 + }, + { + "epoch": 3.691699604743083, + "grad_norm": 3.5154223442077637, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.9907, + "step": 934 + }, + { + "epoch": 3.6956521739130435, + "grad_norm": 3.415923833847046, + "learning_rate": 3.16468253968254e-05, + "loss": 0.9697, + "step": 935 + }, + { + "epoch": 3.699604743083004, + "grad_norm": 3.0694198608398438, + "learning_rate": 3.162698412698413e-05, + "loss": 0.9632, + "step": 936 + }, + { + "epoch": 3.7035573122529644, + "grad_norm": 2.25923490524292, + "learning_rate": 3.160714285714286e-05, + "loss": 0.9753, + "step": 937 + }, + { + "epoch": 3.707509881422925, + "grad_norm": 4.5253190994262695, + "learning_rate": 3.158730158730159e-05, + "loss": 0.9479, + "step": 938 + }, + { + "epoch": 3.7114624505928853, + "grad_norm": 2.794191598892212, + "learning_rate": 3.1567460317460315e-05, + "loss": 0.9286, + "step": 939 + }, + { + "epoch": 3.7154150197628457, + "grad_norm": 3.3554208278656006, + "learning_rate": 3.154761904761905e-05, + "loss": 1.0205, + "step": 940 + }, + { + "epoch": 3.719367588932806, + "grad_norm": 3.562750816345215, + "learning_rate": 3.1527777777777775e-05, + "loss": 0.7624, + "step": 941 + }, + { + "epoch": 3.7233201581027666, + "grad_norm": 2.868648052215576, + "learning_rate": 3.150793650793651e-05, + "loss": 0.8968, + "step": 942 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 3.128340721130371, + "learning_rate": 3.1488095238095236e-05, + "loss": 0.9745, + "step": 943 + }, + { + "epoch": 3.7312252964426875, + "grad_norm": 3.5969157218933105, + "learning_rate": 3.146825396825397e-05, + "loss": 1.0403, + "step": 944 + }, + { + "epoch": 3.7351778656126484, + "grad_norm": 3.2540881633758545, + "learning_rate": 3.14484126984127e-05, + "loss": 1.1106, + "step": 945 + }, + { + "epoch": 3.7391304347826084, + "grad_norm": 2.8699069023132324, + "learning_rate": 3.142857142857143e-05, + "loss": 0.9645, + "step": 946 + }, + { + "epoch": 3.7430830039525693, + "grad_norm": 3.2061824798583984, + "learning_rate": 3.140873015873016e-05, + "loss": 0.864, + "step": 947 + }, + { + "epoch": 3.7470355731225298, + "grad_norm": 2.7899482250213623, + "learning_rate": 3.138888888888889e-05, + "loss": 1.1192, + "step": 948 + }, + { + "epoch": 3.7509881422924902, + "grad_norm": 4.122161388397217, + "learning_rate": 3.136904761904762e-05, + "loss": 0.8513, + "step": 949 + }, + { + "epoch": 3.7549407114624507, + "grad_norm": 2.6482725143432617, + "learning_rate": 3.134920634920635e-05, + "loss": 0.811, + "step": 950 + }, + { + "epoch": 3.758893280632411, + "grad_norm": 2.7796828746795654, + "learning_rate": 3.132936507936508e-05, + "loss": 0.9356, + "step": 951 + }, + { + "epoch": 3.7628458498023716, + "grad_norm": 3.4659996032714844, + "learning_rate": 3.130952380952381e-05, + "loss": 1.0248, + "step": 952 + }, + { + "epoch": 3.766798418972332, + "grad_norm": 3.140477418899536, + "learning_rate": 3.128968253968254e-05, + "loss": 1.2593, + "step": 953 + }, + { + "epoch": 3.7707509881422925, + "grad_norm": 3.8717284202575684, + "learning_rate": 3.1269841269841274e-05, + "loss": 1.0904, + "step": 954 + }, + { + "epoch": 3.774703557312253, + "grad_norm": 2.737684488296509, + "learning_rate": 3.125e-05, + "loss": 0.8346, + "step": 955 + }, + { + "epoch": 3.7786561264822134, + "grad_norm": 3.515249013900757, + "learning_rate": 3.1230158730158734e-05, + "loss": 1.1003, + "step": 956 + }, + { + "epoch": 3.782608695652174, + "grad_norm": 3.8520002365112305, + "learning_rate": 3.121031746031746e-05, + "loss": 0.8998, + "step": 957 + }, + { + "epoch": 3.7865612648221343, + "grad_norm": 3.644209384918213, + "learning_rate": 3.1190476190476195e-05, + "loss": 1.0063, + "step": 958 + }, + { + "epoch": 3.7905138339920947, + "grad_norm": 3.2415544986724854, + "learning_rate": 3.117063492063492e-05, + "loss": 0.8328, + "step": 959 + }, + { + "epoch": 3.794466403162055, + "grad_norm": 3.8536369800567627, + "learning_rate": 3.1150793650793656e-05, + "loss": 0.9605, + "step": 960 + }, + { + "epoch": 3.7984189723320156, + "grad_norm": 2.8492162227630615, + "learning_rate": 3.113095238095238e-05, + "loss": 0.8059, + "step": 961 + }, + { + "epoch": 3.8023715415019765, + "grad_norm": 3.2966291904449463, + "learning_rate": 3.111111111111111e-05, + "loss": 0.9345, + "step": 962 + }, + { + "epoch": 3.8063241106719365, + "grad_norm": 3.0209009647369385, + "learning_rate": 3.1091269841269844e-05, + "loss": 0.9933, + "step": 963 + }, + { + "epoch": 3.8102766798418974, + "grad_norm": 2.6817831993103027, + "learning_rate": 3.107142857142857e-05, + "loss": 0.9633, + "step": 964 + }, + { + "epoch": 3.8142292490118574, + "grad_norm": 3.1547319889068604, + "learning_rate": 3.1051587301587304e-05, + "loss": 1.1855, + "step": 965 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 3.485544443130493, + "learning_rate": 3.103174603174603e-05, + "loss": 1.1107, + "step": 966 + }, + { + "epoch": 3.822134387351779, + "grad_norm": 3.0962092876434326, + "learning_rate": 3.1011904761904765e-05, + "loss": 1.2013, + "step": 967 + }, + { + "epoch": 3.8260869565217392, + "grad_norm": 3.373776912689209, + "learning_rate": 3.099206349206349e-05, + "loss": 0.7553, + "step": 968 + }, + { + "epoch": 3.8300395256916997, + "grad_norm": 3.174527883529663, + "learning_rate": 3.0972222222222226e-05, + "loss": 1.1787, + "step": 969 + }, + { + "epoch": 3.83399209486166, + "grad_norm": 3.290992498397827, + "learning_rate": 3.095238095238095e-05, + "loss": 0.9103, + "step": 970 + }, + { + "epoch": 3.8379446640316206, + "grad_norm": 3.8229081630706787, + "learning_rate": 3.093253968253969e-05, + "loss": 0.9309, + "step": 971 + }, + { + "epoch": 3.841897233201581, + "grad_norm": 3.5504062175750732, + "learning_rate": 3.0912698412698414e-05, + "loss": 1.0507, + "step": 972 + }, + { + "epoch": 3.8458498023715415, + "grad_norm": 3.4466750621795654, + "learning_rate": 3.089285714285715e-05, + "loss": 1.056, + "step": 973 + }, + { + "epoch": 3.849802371541502, + "grad_norm": 4.083953857421875, + "learning_rate": 3.0873015873015874e-05, + "loss": 0.9524, + "step": 974 + }, + { + "epoch": 3.8537549407114624, + "grad_norm": 4.008061408996582, + "learning_rate": 3.085317460317461e-05, + "loss": 1.0339, + "step": 975 + }, + { + "epoch": 3.857707509881423, + "grad_norm": 2.528346300125122, + "learning_rate": 3.0833333333333335e-05, + "loss": 1.0405, + "step": 976 + }, + { + "epoch": 3.8616600790513833, + "grad_norm": 2.7856969833374023, + "learning_rate": 3.081349206349207e-05, + "loss": 0.878, + "step": 977 + }, + { + "epoch": 3.8656126482213438, + "grad_norm": 3.5250959396362305, + "learning_rate": 3.0793650793650796e-05, + "loss": 1.0533, + "step": 978 + }, + { + "epoch": 3.869565217391304, + "grad_norm": 3.1857500076293945, + "learning_rate": 3.077380952380953e-05, + "loss": 1.1474, + "step": 979 + }, + { + "epoch": 3.8735177865612647, + "grad_norm": 3.111074209213257, + "learning_rate": 3.075396825396826e-05, + "loss": 1.0187, + "step": 980 + }, + { + "epoch": 3.8774703557312256, + "grad_norm": 3.149482250213623, + "learning_rate": 3.073412698412699e-05, + "loss": 0.8251, + "step": 981 + }, + { + "epoch": 3.8814229249011856, + "grad_norm": 3.7290894985198975, + "learning_rate": 3.071428571428572e-05, + "loss": 0.8504, + "step": 982 + }, + { + "epoch": 3.8853754940711465, + "grad_norm": 2.8733174800872803, + "learning_rate": 3.069444444444445e-05, + "loss": 0.7323, + "step": 983 + }, + { + "epoch": 3.8893280632411065, + "grad_norm": 3.1158406734466553, + "learning_rate": 3.067460317460318e-05, + "loss": 0.8583, + "step": 984 + }, + { + "epoch": 3.8932806324110674, + "grad_norm": 3.126250743865967, + "learning_rate": 3.0654761904761905e-05, + "loss": 1.0742, + "step": 985 + }, + { + "epoch": 3.897233201581028, + "grad_norm": 4.984484672546387, + "learning_rate": 3.063492063492064e-05, + "loss": 1.0604, + "step": 986 + }, + { + "epoch": 3.9011857707509883, + "grad_norm": 3.3782174587249756, + "learning_rate": 3.0615079365079366e-05, + "loss": 0.8303, + "step": 987 + }, + { + "epoch": 3.9051383399209487, + "grad_norm": 3.0348994731903076, + "learning_rate": 3.05952380952381e-05, + "loss": 1.0685, + "step": 988 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 3.0283362865448, + "learning_rate": 3.057539682539683e-05, + "loss": 1.1548, + "step": 989 + }, + { + "epoch": 3.9130434782608696, + "grad_norm": 3.938385009765625, + "learning_rate": 3.055555555555556e-05, + "loss": 0.7831, + "step": 990 + }, + { + "epoch": 3.91699604743083, + "grad_norm": 3.3874075412750244, + "learning_rate": 3.053571428571429e-05, + "loss": 0.7838, + "step": 991 + }, + { + "epoch": 3.9209486166007905, + "grad_norm": 3.09395432472229, + "learning_rate": 3.051587301587302e-05, + "loss": 0.7563, + "step": 992 + }, + { + "epoch": 3.924901185770751, + "grad_norm": 3.3949618339538574, + "learning_rate": 3.049603174603175e-05, + "loss": 1.2432, + "step": 993 + }, + { + "epoch": 3.9288537549407114, + "grad_norm": 3.91082501411438, + "learning_rate": 3.0476190476190482e-05, + "loss": 0.8746, + "step": 994 + }, + { + "epoch": 3.932806324110672, + "grad_norm": 3.182863235473633, + "learning_rate": 3.0456349206349206e-05, + "loss": 0.9189, + "step": 995 + }, + { + "epoch": 3.9367588932806323, + "grad_norm": 3.7233152389526367, + "learning_rate": 3.0436507936507936e-05, + "loss": 0.9538, + "step": 996 + }, + { + "epoch": 3.940711462450593, + "grad_norm": 4.027876853942871, + "learning_rate": 3.0416666666666666e-05, + "loss": 0.8883, + "step": 997 + }, + { + "epoch": 3.9446640316205532, + "grad_norm": 3.228454113006592, + "learning_rate": 3.0396825396825397e-05, + "loss": 0.9645, + "step": 998 + }, + { + "epoch": 3.9486166007905137, + "grad_norm": 3.264758348464966, + "learning_rate": 3.0376984126984127e-05, + "loss": 0.9099, + "step": 999 + }, + { + "epoch": 3.9525691699604746, + "grad_norm": 3.349977731704712, + "learning_rate": 3.0357142857142857e-05, + "loss": 0.731, + "step": 1000 + }, + { + "epoch": 3.9565217391304346, + "grad_norm": 3.3020308017730713, + "learning_rate": 3.0337301587301588e-05, + "loss": 0.9018, + "step": 1001 + }, + { + "epoch": 3.9604743083003955, + "grad_norm": 3.4217529296875, + "learning_rate": 3.0317460317460318e-05, + "loss": 1.1229, + "step": 1002 + }, + { + "epoch": 3.9644268774703555, + "grad_norm": 3.2917253971099854, + "learning_rate": 3.029761904761905e-05, + "loss": 1.0188, + "step": 1003 + }, + { + "epoch": 3.9683794466403164, + "grad_norm": 3.4840149879455566, + "learning_rate": 3.0277777777777776e-05, + "loss": 0.987, + "step": 1004 + }, + { + "epoch": 3.972332015810277, + "grad_norm": 3.3869576454162598, + "learning_rate": 3.0257936507936506e-05, + "loss": 0.8482, + "step": 1005 + }, + { + "epoch": 3.9762845849802373, + "grad_norm": 4.020000457763672, + "learning_rate": 3.0238095238095236e-05, + "loss": 0.9846, + "step": 1006 + }, + { + "epoch": 3.9802371541501977, + "grad_norm": 3.028305768966675, + "learning_rate": 3.0218253968253967e-05, + "loss": 0.8622, + "step": 1007 + }, + { + "epoch": 3.984189723320158, + "grad_norm": 2.8138604164123535, + "learning_rate": 3.0198412698412697e-05, + "loss": 1.2293, + "step": 1008 + }, + { + "epoch": 3.9881422924901186, + "grad_norm": 3.196972370147705, + "learning_rate": 3.0178571428571427e-05, + "loss": 1.0762, + "step": 1009 + }, + { + "epoch": 3.992094861660079, + "grad_norm": 4.044905185699463, + "learning_rate": 3.0158730158730158e-05, + "loss": 1.01, + "step": 1010 + }, + { + "epoch": 3.9960474308300395, + "grad_norm": 2.731539011001587, + "learning_rate": 3.0138888888888888e-05, + "loss": 0.9882, + "step": 1011 + }, + { + "epoch": 4.0, + "grad_norm": 2.787231683731079, + "learning_rate": 3.011904761904762e-05, + "loss": 0.7786, + "step": 1012 + }, + { + "epoch": 4.003952569169961, + "grad_norm": 2.597419023513794, + "learning_rate": 3.009920634920635e-05, + "loss": 0.5683, + "step": 1013 + }, + { + "epoch": 4.007905138339921, + "grad_norm": 3.2427003383636475, + "learning_rate": 3.007936507936508e-05, + "loss": 0.9022, + "step": 1014 + }, + { + "epoch": 4.011857707509882, + "grad_norm": 2.5612714290618896, + "learning_rate": 3.005952380952381e-05, + "loss": 0.4494, + "step": 1015 + }, + { + "epoch": 4.015810276679842, + "grad_norm": 3.5918335914611816, + "learning_rate": 3.003968253968254e-05, + "loss": 0.5691, + "step": 1016 + }, + { + "epoch": 4.019762845849803, + "grad_norm": 4.1184892654418945, + "learning_rate": 3.001984126984127e-05, + "loss": 0.6702, + "step": 1017 + }, + { + "epoch": 4.023715415019763, + "grad_norm": 4.948250770568848, + "learning_rate": 3e-05, + "loss": 0.7939, + "step": 1018 + }, + { + "epoch": 4.027667984189724, + "grad_norm": 4.541839122772217, + "learning_rate": 2.998015873015873e-05, + "loss": 0.5846, + "step": 1019 + }, + { + "epoch": 4.031620553359684, + "grad_norm": 6.365063190460205, + "learning_rate": 2.996031746031746e-05, + "loss": 1.0476, + "step": 1020 + }, + { + "epoch": 4.0355731225296445, + "grad_norm": 5.4133992195129395, + "learning_rate": 2.9940476190476192e-05, + "loss": 0.5169, + "step": 1021 + }, + { + "epoch": 4.0395256916996045, + "grad_norm": 5.79783821105957, + "learning_rate": 2.9920634920634922e-05, + "loss": 0.5218, + "step": 1022 + }, + { + "epoch": 4.043478260869565, + "grad_norm": 3.8281731605529785, + "learning_rate": 2.9900793650793653e-05, + "loss": 0.5262, + "step": 1023 + }, + { + "epoch": 4.047430830039525, + "grad_norm": 3.4406702518463135, + "learning_rate": 2.9880952380952383e-05, + "loss": 0.6195, + "step": 1024 + }, + { + "epoch": 4.051383399209486, + "grad_norm": 2.6785409450531006, + "learning_rate": 2.9861111111111113e-05, + "loss": 0.6092, + "step": 1025 + }, + { + "epoch": 4.055335968379446, + "grad_norm": 3.221266508102417, + "learning_rate": 2.9841269841269844e-05, + "loss": 0.5122, + "step": 1026 + }, + { + "epoch": 4.059288537549407, + "grad_norm": 3.454709053039551, + "learning_rate": 2.982142857142857e-05, + "loss": 0.364, + "step": 1027 + }, + { + "epoch": 4.063241106719367, + "grad_norm": 3.4353463649749756, + "learning_rate": 2.98015873015873e-05, + "loss": 0.4952, + "step": 1028 + }, + { + "epoch": 4.067193675889328, + "grad_norm": 3.565638303756714, + "learning_rate": 2.978174603174603e-05, + "loss": 0.7452, + "step": 1029 + }, + { + "epoch": 4.071146245059288, + "grad_norm": 4.197516918182373, + "learning_rate": 2.9761904761904762e-05, + "loss": 0.5709, + "step": 1030 + }, + { + "epoch": 4.075098814229249, + "grad_norm": 3.723968744277954, + "learning_rate": 2.9742063492063492e-05, + "loss": 0.4972, + "step": 1031 + }, + { + "epoch": 4.07905138339921, + "grad_norm": 3.2937045097351074, + "learning_rate": 2.9722222222222223e-05, + "loss": 0.5963, + "step": 1032 + }, + { + "epoch": 4.08300395256917, + "grad_norm": 3.6638174057006836, + "learning_rate": 2.9702380952380953e-05, + "loss": 0.6339, + "step": 1033 + }, + { + "epoch": 4.086956521739131, + "grad_norm": 3.8052680492401123, + "learning_rate": 2.9682539682539683e-05, + "loss": 0.4921, + "step": 1034 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 3.9429097175598145, + "learning_rate": 2.9662698412698414e-05, + "loss": 0.5685, + "step": 1035 + }, + { + "epoch": 4.094861660079052, + "grad_norm": 4.403009414672852, + "learning_rate": 2.9642857142857144e-05, + "loss": 0.8721, + "step": 1036 + }, + { + "epoch": 4.098814229249012, + "grad_norm": 3.999182939529419, + "learning_rate": 2.9623015873015875e-05, + "loss": 0.768, + "step": 1037 + }, + { + "epoch": 4.102766798418973, + "grad_norm": 3.5793051719665527, + "learning_rate": 2.9603174603174605e-05, + "loss": 0.465, + "step": 1038 + }, + { + "epoch": 4.106719367588933, + "grad_norm": 3.8434765338897705, + "learning_rate": 2.9583333333333335e-05, + "loss": 0.5206, + "step": 1039 + }, + { + "epoch": 4.1106719367588935, + "grad_norm": 4.839815139770508, + "learning_rate": 2.9563492063492066e-05, + "loss": 0.4424, + "step": 1040 + }, + { + "epoch": 4.1146245059288535, + "grad_norm": 4.106775283813477, + "learning_rate": 2.9543650793650796e-05, + "loss": 0.5181, + "step": 1041 + }, + { + "epoch": 4.118577075098814, + "grad_norm": 3.0763916969299316, + "learning_rate": 2.9523809523809526e-05, + "loss": 0.5855, + "step": 1042 + }, + { + "epoch": 4.122529644268774, + "grad_norm": 3.8926162719726562, + "learning_rate": 2.9503968253968257e-05, + "loss": 0.6347, + "step": 1043 + }, + { + "epoch": 4.126482213438735, + "grad_norm": 4.27288293838501, + "learning_rate": 2.9484126984126987e-05, + "loss": 0.4295, + "step": 1044 + }, + { + "epoch": 4.130434782608695, + "grad_norm": 2.7551262378692627, + "learning_rate": 2.9464285714285718e-05, + "loss": 0.733, + "step": 1045 + }, + { + "epoch": 4.134387351778656, + "grad_norm": 3.992636203765869, + "learning_rate": 2.9444444444444448e-05, + "loss": 0.5781, + "step": 1046 + }, + { + "epoch": 4.138339920948616, + "grad_norm": 5.2175469398498535, + "learning_rate": 2.9424603174603178e-05, + "loss": 0.5791, + "step": 1047 + }, + { + "epoch": 4.142292490118577, + "grad_norm": 3.3808295726776123, + "learning_rate": 2.940476190476191e-05, + "loss": 0.5001, + "step": 1048 + }, + { + "epoch": 4.146245059288537, + "grad_norm": 3.7828593254089355, + "learning_rate": 2.938492063492064e-05, + "loss": 0.5229, + "step": 1049 + }, + { + "epoch": 4.150197628458498, + "grad_norm": 3.9474239349365234, + "learning_rate": 2.9365079365079366e-05, + "loss": 0.6843, + "step": 1050 + }, + { + "epoch": 4.154150197628459, + "grad_norm": 3.5939300060272217, + "learning_rate": 2.9345238095238096e-05, + "loss": 0.5496, + "step": 1051 + }, + { + "epoch": 4.158102766798419, + "grad_norm": 4.338398456573486, + "learning_rate": 2.9325396825396827e-05, + "loss": 0.5746, + "step": 1052 + }, + { + "epoch": 4.16205533596838, + "grad_norm": 3.5342512130737305, + "learning_rate": 2.9305555555555557e-05, + "loss": 0.5925, + "step": 1053 + }, + { + "epoch": 4.16600790513834, + "grad_norm": 3.970322608947754, + "learning_rate": 2.9285714285714288e-05, + "loss": 0.5174, + "step": 1054 + }, + { + "epoch": 4.169960474308301, + "grad_norm": 3.5085713863372803, + "learning_rate": 2.9265873015873018e-05, + "loss": 0.5177, + "step": 1055 + }, + { + "epoch": 4.173913043478261, + "grad_norm": 4.02853536605835, + "learning_rate": 2.9246031746031748e-05, + "loss": 0.58, + "step": 1056 + }, + { + "epoch": 4.177865612648222, + "grad_norm": 3.9853994846343994, + "learning_rate": 2.922619047619048e-05, + "loss": 0.8849, + "step": 1057 + }, + { + "epoch": 4.181818181818182, + "grad_norm": 3.8760087490081787, + "learning_rate": 2.920634920634921e-05, + "loss": 0.3916, + "step": 1058 + }, + { + "epoch": 4.1857707509881426, + "grad_norm": 4.411335468292236, + "learning_rate": 2.918650793650794e-05, + "loss": 0.5398, + "step": 1059 + }, + { + "epoch": 4.189723320158103, + "grad_norm": 2.8598296642303467, + "learning_rate": 2.916666666666667e-05, + "loss": 0.5319, + "step": 1060 + }, + { + "epoch": 4.1936758893280635, + "grad_norm": 3.225334405899048, + "learning_rate": 2.91468253968254e-05, + "loss": 0.6801, + "step": 1061 + }, + { + "epoch": 4.1976284584980235, + "grad_norm": 3.511744976043701, + "learning_rate": 2.912698412698413e-05, + "loss": 0.6467, + "step": 1062 + }, + { + "epoch": 4.201581027667984, + "grad_norm": 5.1000165939331055, + "learning_rate": 2.910714285714286e-05, + "loss": 0.6979, + "step": 1063 + }, + { + "epoch": 4.205533596837944, + "grad_norm": 4.431115627288818, + "learning_rate": 2.908730158730159e-05, + "loss": 0.7414, + "step": 1064 + }, + { + "epoch": 4.209486166007905, + "grad_norm": 3.7748608589172363, + "learning_rate": 2.906746031746032e-05, + "loss": 0.5364, + "step": 1065 + }, + { + "epoch": 4.213438735177865, + "grad_norm": 3.4133846759796143, + "learning_rate": 2.9047619047619052e-05, + "loss": 0.766, + "step": 1066 + }, + { + "epoch": 4.217391304347826, + "grad_norm": 4.544011116027832, + "learning_rate": 2.9027777777777782e-05, + "loss": 0.6298, + "step": 1067 + }, + { + "epoch": 4.221343873517786, + "grad_norm": 3.5001301765441895, + "learning_rate": 2.9007936507936513e-05, + "loss": 0.6486, + "step": 1068 + }, + { + "epoch": 4.225296442687747, + "grad_norm": 4.15241003036499, + "learning_rate": 2.8988095238095243e-05, + "loss": 0.6689, + "step": 1069 + }, + { + "epoch": 4.229249011857707, + "grad_norm": 4.282387733459473, + "learning_rate": 2.8968253968253974e-05, + "loss": 0.6515, + "step": 1070 + }, + { + "epoch": 4.233201581027668, + "grad_norm": 3.4778153896331787, + "learning_rate": 2.8948412698412704e-05, + "loss": 0.6742, + "step": 1071 + }, + { + "epoch": 4.237154150197629, + "grad_norm": 3.6013362407684326, + "learning_rate": 2.8928571428571434e-05, + "loss": 0.5932, + "step": 1072 + }, + { + "epoch": 4.241106719367589, + "grad_norm": 3.0561881065368652, + "learning_rate": 2.890873015873016e-05, + "loss": 0.5181, + "step": 1073 + }, + { + "epoch": 4.24505928853755, + "grad_norm": 3.436709403991699, + "learning_rate": 2.8888888888888888e-05, + "loss": 0.6392, + "step": 1074 + }, + { + "epoch": 4.24901185770751, + "grad_norm": 3.798583984375, + "learning_rate": 2.886904761904762e-05, + "loss": 0.3719, + "step": 1075 + }, + { + "epoch": 4.252964426877471, + "grad_norm": 4.340543746948242, + "learning_rate": 2.884920634920635e-05, + "loss": 0.7213, + "step": 1076 + }, + { + "epoch": 4.256916996047431, + "grad_norm": 3.0846059322357178, + "learning_rate": 2.882936507936508e-05, + "loss": 0.6184, + "step": 1077 + }, + { + "epoch": 4.260869565217392, + "grad_norm": 3.127023696899414, + "learning_rate": 2.880952380952381e-05, + "loss": 0.8165, + "step": 1078 + }, + { + "epoch": 4.264822134387352, + "grad_norm": 4.435762405395508, + "learning_rate": 2.878968253968254e-05, + "loss": 0.8462, + "step": 1079 + }, + { + "epoch": 4.2687747035573125, + "grad_norm": 3.7268333435058594, + "learning_rate": 2.876984126984127e-05, + "loss": 0.5501, + "step": 1080 + }, + { + "epoch": 4.2727272727272725, + "grad_norm": 4.605154514312744, + "learning_rate": 2.8749999999999997e-05, + "loss": 0.4942, + "step": 1081 + }, + { + "epoch": 4.276679841897233, + "grad_norm": 4.297336101531982, + "learning_rate": 2.8730158730158728e-05, + "loss": 0.7144, + "step": 1082 + }, + { + "epoch": 4.280632411067193, + "grad_norm": 3.4597883224487305, + "learning_rate": 2.8710317460317458e-05, + "loss": 0.6571, + "step": 1083 + }, + { + "epoch": 4.284584980237154, + "grad_norm": 3.579210042953491, + "learning_rate": 2.869047619047619e-05, + "loss": 0.8167, + "step": 1084 + }, + { + "epoch": 4.288537549407114, + "grad_norm": 4.63372802734375, + "learning_rate": 2.867063492063492e-05, + "loss": 0.5739, + "step": 1085 + }, + { + "epoch": 4.292490118577075, + "grad_norm": 4.892091751098633, + "learning_rate": 2.865079365079365e-05, + "loss": 0.7786, + "step": 1086 + }, + { + "epoch": 4.296442687747035, + "grad_norm": 3.682457208633423, + "learning_rate": 2.863095238095238e-05, + "loss": 0.6753, + "step": 1087 + }, + { + "epoch": 4.300395256916996, + "grad_norm": 4.4599432945251465, + "learning_rate": 2.861111111111111e-05, + "loss": 0.5335, + "step": 1088 + }, + { + "epoch": 4.304347826086957, + "grad_norm": 4.139975070953369, + "learning_rate": 2.859126984126984e-05, + "loss": 0.631, + "step": 1089 + }, + { + "epoch": 4.308300395256917, + "grad_norm": 2.9012610912323, + "learning_rate": 2.857142857142857e-05, + "loss": 0.6686, + "step": 1090 + }, + { + "epoch": 4.312252964426877, + "grad_norm": 4.059675216674805, + "learning_rate": 2.85515873015873e-05, + "loss": 0.5449, + "step": 1091 + }, + { + "epoch": 4.316205533596838, + "grad_norm": 3.3060367107391357, + "learning_rate": 2.853174603174603e-05, + "loss": 0.5399, + "step": 1092 + }, + { + "epoch": 4.320158102766799, + "grad_norm": 4.22020149230957, + "learning_rate": 2.8511904761904762e-05, + "loss": 0.5664, + "step": 1093 + }, + { + "epoch": 4.324110671936759, + "grad_norm": 3.2028045654296875, + "learning_rate": 2.8492063492063492e-05, + "loss": 0.5289, + "step": 1094 + }, + { + "epoch": 4.32806324110672, + "grad_norm": 4.202157497406006, + "learning_rate": 2.8472222222222223e-05, + "loss": 0.6845, + "step": 1095 + }, + { + "epoch": 4.33201581027668, + "grad_norm": 3.3916430473327637, + "learning_rate": 2.8452380952380953e-05, + "loss": 0.7733, + "step": 1096 + }, + { + "epoch": 4.335968379446641, + "grad_norm": 3.6081862449645996, + "learning_rate": 2.8432539682539683e-05, + "loss": 0.7535, + "step": 1097 + }, + { + "epoch": 4.339920948616601, + "grad_norm": 3.1682872772216797, + "learning_rate": 2.8412698412698414e-05, + "loss": 0.5639, + "step": 1098 + }, + { + "epoch": 4.3438735177865615, + "grad_norm": 3.9137520790100098, + "learning_rate": 2.8392857142857144e-05, + "loss": 0.5085, + "step": 1099 + }, + { + "epoch": 4.3478260869565215, + "grad_norm": 3.902578353881836, + "learning_rate": 2.8373015873015875e-05, + "loss": 0.7883, + "step": 1100 + }, + { + "epoch": 4.351778656126482, + "grad_norm": 3.498415946960449, + "learning_rate": 2.8353174603174605e-05, + "loss": 0.7754, + "step": 1101 + }, + { + "epoch": 4.355731225296442, + "grad_norm": 3.8628177642822266, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.7349, + "step": 1102 + }, + { + "epoch": 4.359683794466403, + "grad_norm": 3.9302592277526855, + "learning_rate": 2.8313492063492066e-05, + "loss": 0.5988, + "step": 1103 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 3.7080790996551514, + "learning_rate": 2.8293650793650793e-05, + "loss": 0.7116, + "step": 1104 + }, + { + "epoch": 4.367588932806324, + "grad_norm": 3.222238779067993, + "learning_rate": 2.8273809523809523e-05, + "loss": 0.4634, + "step": 1105 + }, + { + "epoch": 4.371541501976284, + "grad_norm": 3.111239194869995, + "learning_rate": 2.8253968253968253e-05, + "loss": 0.4384, + "step": 1106 + }, + { + "epoch": 4.375494071146245, + "grad_norm": 3.0986177921295166, + "learning_rate": 2.8234126984126984e-05, + "loss": 0.7854, + "step": 1107 + }, + { + "epoch": 4.379446640316205, + "grad_norm": 4.473976135253906, + "learning_rate": 2.8214285714285714e-05, + "loss": 0.6981, + "step": 1108 + }, + { + "epoch": 4.383399209486166, + "grad_norm": 3.3967599868774414, + "learning_rate": 2.8194444444444445e-05, + "loss": 0.7922, + "step": 1109 + }, + { + "epoch": 4.387351778656127, + "grad_norm": 5.092434406280518, + "learning_rate": 2.8174603174603175e-05, + "loss": 0.6217, + "step": 1110 + }, + { + "epoch": 4.391304347826087, + "grad_norm": 6.223644256591797, + "learning_rate": 2.8154761904761905e-05, + "loss": 0.6414, + "step": 1111 + }, + { + "epoch": 4.395256916996048, + "grad_norm": 4.24446439743042, + "learning_rate": 2.8134920634920636e-05, + "loss": 0.9485, + "step": 1112 + }, + { + "epoch": 4.399209486166008, + "grad_norm": 4.5990986824035645, + "learning_rate": 2.8115079365079366e-05, + "loss": 0.7239, + "step": 1113 + }, + { + "epoch": 4.403162055335969, + "grad_norm": 4.005640506744385, + "learning_rate": 2.8095238095238096e-05, + "loss": 0.9314, + "step": 1114 + }, + { + "epoch": 4.407114624505929, + "grad_norm": 4.990167140960693, + "learning_rate": 2.8075396825396827e-05, + "loss": 0.6125, + "step": 1115 + }, + { + "epoch": 4.41106719367589, + "grad_norm": 4.539425849914551, + "learning_rate": 2.8055555555555557e-05, + "loss": 0.4673, + "step": 1116 + }, + { + "epoch": 4.41501976284585, + "grad_norm": 5.004159927368164, + "learning_rate": 2.8035714285714288e-05, + "loss": 0.6748, + "step": 1117 + }, + { + "epoch": 4.4189723320158105, + "grad_norm": 5.46227502822876, + "learning_rate": 2.8015873015873018e-05, + "loss": 0.6671, + "step": 1118 + }, + { + "epoch": 4.4229249011857705, + "grad_norm": 3.3494784832000732, + "learning_rate": 2.799603174603175e-05, + "loss": 0.5736, + "step": 1119 + }, + { + "epoch": 4.426877470355731, + "grad_norm": 4.099065780639648, + "learning_rate": 2.797619047619048e-05, + "loss": 0.6317, + "step": 1120 + }, + { + "epoch": 4.430830039525691, + "grad_norm": 3.662707805633545, + "learning_rate": 2.795634920634921e-05, + "loss": 0.6749, + "step": 1121 + }, + { + "epoch": 4.434782608695652, + "grad_norm": 3.4078757762908936, + "learning_rate": 2.793650793650794e-05, + "loss": 0.723, + "step": 1122 + }, + { + "epoch": 4.438735177865612, + "grad_norm": 3.4741852283477783, + "learning_rate": 2.791666666666667e-05, + "loss": 0.7619, + "step": 1123 + }, + { + "epoch": 4.442687747035573, + "grad_norm": 4.361887454986572, + "learning_rate": 2.78968253968254e-05, + "loss": 0.7173, + "step": 1124 + }, + { + "epoch": 4.446640316205533, + "grad_norm": 3.31022047996521, + "learning_rate": 2.787698412698413e-05, + "loss": 0.4076, + "step": 1125 + }, + { + "epoch": 4.450592885375494, + "grad_norm": 3.635115623474121, + "learning_rate": 2.785714285714286e-05, + "loss": 0.5728, + "step": 1126 + }, + { + "epoch": 4.454545454545454, + "grad_norm": 5.300922870635986, + "learning_rate": 2.7837301587301588e-05, + "loss": 0.8788, + "step": 1127 + }, + { + "epoch": 4.458498023715415, + "grad_norm": 4.8898491859436035, + "learning_rate": 2.781746031746032e-05, + "loss": 0.6955, + "step": 1128 + }, + { + "epoch": 4.462450592885375, + "grad_norm": 3.6756420135498047, + "learning_rate": 2.779761904761905e-05, + "loss": 0.6531, + "step": 1129 + }, + { + "epoch": 4.466403162055336, + "grad_norm": 4.139333724975586, + "learning_rate": 2.777777777777778e-05, + "loss": 0.4227, + "step": 1130 + }, + { + "epoch": 4.470355731225297, + "grad_norm": 4.150503158569336, + "learning_rate": 2.775793650793651e-05, + "loss": 0.5596, + "step": 1131 + }, + { + "epoch": 4.474308300395257, + "grad_norm": 3.9440436363220215, + "learning_rate": 2.773809523809524e-05, + "loss": 0.5473, + "step": 1132 + }, + { + "epoch": 4.478260869565218, + "grad_norm": 4.698122978210449, + "learning_rate": 2.771825396825397e-05, + "loss": 0.5331, + "step": 1133 + }, + { + "epoch": 4.482213438735178, + "grad_norm": 4.4642133712768555, + "learning_rate": 2.76984126984127e-05, + "loss": 0.5267, + "step": 1134 + }, + { + "epoch": 4.486166007905139, + "grad_norm": 3.5104897022247314, + "learning_rate": 2.767857142857143e-05, + "loss": 0.7138, + "step": 1135 + }, + { + "epoch": 4.490118577075099, + "grad_norm": 4.170843601226807, + "learning_rate": 2.765873015873016e-05, + "loss": 0.8175, + "step": 1136 + }, + { + "epoch": 4.4940711462450595, + "grad_norm": 3.3033299446105957, + "learning_rate": 2.7638888888888892e-05, + "loss": 0.5711, + "step": 1137 + }, + { + "epoch": 4.4980237154150196, + "grad_norm": 4.042965412139893, + "learning_rate": 2.7619047619047622e-05, + "loss": 0.7944, + "step": 1138 + }, + { + "epoch": 4.5019762845849804, + "grad_norm": 3.890293836593628, + "learning_rate": 2.7599206349206352e-05, + "loss": 0.7309, + "step": 1139 + }, + { + "epoch": 4.5059288537549405, + "grad_norm": 3.5573506355285645, + "learning_rate": 2.7579365079365083e-05, + "loss": 0.2866, + "step": 1140 + }, + { + "epoch": 4.509881422924901, + "grad_norm": 3.3874566555023193, + "learning_rate": 2.7559523809523813e-05, + "loss": 0.5363, + "step": 1141 + }, + { + "epoch": 4.513833992094861, + "grad_norm": 3.777653217315674, + "learning_rate": 2.7539682539682544e-05, + "loss": 0.8195, + "step": 1142 + }, + { + "epoch": 4.517786561264822, + "grad_norm": 3.0119121074676514, + "learning_rate": 2.7519841269841274e-05, + "loss": 0.4722, + "step": 1143 + }, + { + "epoch": 4.521739130434782, + "grad_norm": 3.57424259185791, + "learning_rate": 2.7500000000000004e-05, + "loss": 0.6739, + "step": 1144 + }, + { + "epoch": 4.525691699604743, + "grad_norm": 4.8777174949646, + "learning_rate": 2.7480158730158735e-05, + "loss": 0.5557, + "step": 1145 + }, + { + "epoch": 4.529644268774703, + "grad_norm": 5.821610927581787, + "learning_rate": 2.7460317460317465e-05, + "loss": 0.7279, + "step": 1146 + }, + { + "epoch": 4.533596837944664, + "grad_norm": 3.617403268814087, + "learning_rate": 2.7440476190476195e-05, + "loss": 0.7618, + "step": 1147 + }, + { + "epoch": 4.537549407114625, + "grad_norm": 4.069850921630859, + "learning_rate": 2.7420634920634926e-05, + "loss": 0.6691, + "step": 1148 + }, + { + "epoch": 4.541501976284585, + "grad_norm": 3.8116888999938965, + "learning_rate": 2.7400793650793656e-05, + "loss": 0.6847, + "step": 1149 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 4.080233573913574, + "learning_rate": 2.7380952380952383e-05, + "loss": 0.7181, + "step": 1150 + }, + { + "epoch": 4.549407114624506, + "grad_norm": 3.930443048477173, + "learning_rate": 2.7361111111111114e-05, + "loss": 0.4972, + "step": 1151 + }, + { + "epoch": 4.553359683794467, + "grad_norm": 3.8190906047821045, + "learning_rate": 2.734126984126984e-05, + "loss": 0.7407, + "step": 1152 + }, + { + "epoch": 4.557312252964427, + "grad_norm": 4.792582988739014, + "learning_rate": 2.732142857142857e-05, + "loss": 0.5122, + "step": 1153 + }, + { + "epoch": 4.561264822134388, + "grad_norm": 3.7988221645355225, + "learning_rate": 2.73015873015873e-05, + "loss": 0.7225, + "step": 1154 + }, + { + "epoch": 4.565217391304348, + "grad_norm": 4.3564605712890625, + "learning_rate": 2.7281746031746032e-05, + "loss": 0.5053, + "step": 1155 + }, + { + "epoch": 4.569169960474309, + "grad_norm": 3.3426055908203125, + "learning_rate": 2.7261904761904762e-05, + "loss": 0.5994, + "step": 1156 + }, + { + "epoch": 4.573122529644269, + "grad_norm": 3.6366496086120605, + "learning_rate": 2.7242063492063492e-05, + "loss": 0.7183, + "step": 1157 + }, + { + "epoch": 4.5770750988142295, + "grad_norm": 4.141812801361084, + "learning_rate": 2.7222222222222223e-05, + "loss": 0.8983, + "step": 1158 + }, + { + "epoch": 4.5810276679841895, + "grad_norm": 3.6468398571014404, + "learning_rate": 2.720238095238095e-05, + "loss": 0.7188, + "step": 1159 + }, + { + "epoch": 4.58498023715415, + "grad_norm": 3.9180054664611816, + "learning_rate": 2.718253968253968e-05, + "loss": 0.5211, + "step": 1160 + }, + { + "epoch": 4.58893280632411, + "grad_norm": 3.7029590606689453, + "learning_rate": 2.716269841269841e-05, + "loss": 0.6044, + "step": 1161 + }, + { + "epoch": 4.592885375494071, + "grad_norm": 4.28466272354126, + "learning_rate": 2.714285714285714e-05, + "loss": 0.6413, + "step": 1162 + }, + { + "epoch": 4.596837944664031, + "grad_norm": 5.500331401824951, + "learning_rate": 2.712301587301587e-05, + "loss": 0.7009, + "step": 1163 + }, + { + "epoch": 4.600790513833992, + "grad_norm": 4.083467960357666, + "learning_rate": 2.7103174603174602e-05, + "loss": 0.822, + "step": 1164 + }, + { + "epoch": 4.604743083003952, + "grad_norm": 2.7674543857574463, + "learning_rate": 2.7083333333333332e-05, + "loss": 0.6164, + "step": 1165 + }, + { + "epoch": 4.608695652173913, + "grad_norm": 3.0514588356018066, + "learning_rate": 2.7063492063492062e-05, + "loss": 0.4806, + "step": 1166 + }, + { + "epoch": 4.612648221343873, + "grad_norm": 4.635437965393066, + "learning_rate": 2.7043650793650793e-05, + "loss": 0.5268, + "step": 1167 + }, + { + "epoch": 4.616600790513834, + "grad_norm": 3.791935443878174, + "learning_rate": 2.7023809523809523e-05, + "loss": 0.6394, + "step": 1168 + }, + { + "epoch": 4.620553359683795, + "grad_norm": 3.0412096977233887, + "learning_rate": 2.7003968253968254e-05, + "loss": 0.5388, + "step": 1169 + }, + { + "epoch": 4.624505928853755, + "grad_norm": 3.3811123371124268, + "learning_rate": 2.6984126984126984e-05, + "loss": 0.5911, + "step": 1170 + }, + { + "epoch": 4.628458498023716, + "grad_norm": 3.5328590869903564, + "learning_rate": 2.6964285714285714e-05, + "loss": 0.5826, + "step": 1171 + }, + { + "epoch": 4.632411067193676, + "grad_norm": 3.483593702316284, + "learning_rate": 2.6944444444444445e-05, + "loss": 0.5478, + "step": 1172 + }, + { + "epoch": 4.636363636363637, + "grad_norm": 4.339010238647461, + "learning_rate": 2.6924603174603175e-05, + "loss": 0.5807, + "step": 1173 + }, + { + "epoch": 4.640316205533597, + "grad_norm": 3.775643825531006, + "learning_rate": 2.6904761904761905e-05, + "loss": 0.7021, + "step": 1174 + }, + { + "epoch": 4.644268774703558, + "grad_norm": 4.458033561706543, + "learning_rate": 2.6884920634920636e-05, + "loss": 0.6987, + "step": 1175 + }, + { + "epoch": 4.648221343873518, + "grad_norm": 4.082224369049072, + "learning_rate": 2.6865079365079366e-05, + "loss": 0.5305, + "step": 1176 + }, + { + "epoch": 4.6521739130434785, + "grad_norm": 4.834577560424805, + "learning_rate": 2.6845238095238097e-05, + "loss": 0.5664, + "step": 1177 + }, + { + "epoch": 4.6561264822134385, + "grad_norm": 4.269159317016602, + "learning_rate": 2.6825396825396827e-05, + "loss": 0.5704, + "step": 1178 + }, + { + "epoch": 4.660079051383399, + "grad_norm": 3.2594568729400635, + "learning_rate": 2.6805555555555557e-05, + "loss": 0.5927, + "step": 1179 + }, + { + "epoch": 4.664031620553359, + "grad_norm": 3.458214521408081, + "learning_rate": 2.6785714285714288e-05, + "loss": 0.5754, + "step": 1180 + }, + { + "epoch": 4.66798418972332, + "grad_norm": 4.189966201782227, + "learning_rate": 2.6765873015873018e-05, + "loss": 0.8723, + "step": 1181 + }, + { + "epoch": 4.67193675889328, + "grad_norm": 3.8360981941223145, + "learning_rate": 2.6746031746031745e-05, + "loss": 0.4775, + "step": 1182 + }, + { + "epoch": 4.675889328063241, + "grad_norm": 4.381925582885742, + "learning_rate": 2.6726190476190475e-05, + "loss": 0.607, + "step": 1183 + }, + { + "epoch": 4.679841897233201, + "grad_norm": 4.080252647399902, + "learning_rate": 2.6706349206349206e-05, + "loss": 0.8249, + "step": 1184 + }, + { + "epoch": 4.683794466403162, + "grad_norm": 3.6359405517578125, + "learning_rate": 2.6686507936507936e-05, + "loss": 0.3814, + "step": 1185 + }, + { + "epoch": 4.687747035573123, + "grad_norm": 3.182612895965576, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.5989, + "step": 1186 + }, + { + "epoch": 4.691699604743083, + "grad_norm": 4.81362247467041, + "learning_rate": 2.6646825396825397e-05, + "loss": 0.7298, + "step": 1187 + }, + { + "epoch": 4.695652173913043, + "grad_norm": 3.2109711170196533, + "learning_rate": 2.6626984126984127e-05, + "loss": 0.3586, + "step": 1188 + }, + { + "epoch": 4.699604743083004, + "grad_norm": 3.968430280685425, + "learning_rate": 2.6607142857142858e-05, + "loss": 0.5076, + "step": 1189 + }, + { + "epoch": 4.703557312252965, + "grad_norm": 3.057274341583252, + "learning_rate": 2.6587301587301588e-05, + "loss": 0.4669, + "step": 1190 + }, + { + "epoch": 4.707509881422925, + "grad_norm": 4.017573356628418, + "learning_rate": 2.656746031746032e-05, + "loss": 0.6747, + "step": 1191 + }, + { + "epoch": 4.711462450592886, + "grad_norm": 3.6146085262298584, + "learning_rate": 2.654761904761905e-05, + "loss": 0.4618, + "step": 1192 + }, + { + "epoch": 4.715415019762846, + "grad_norm": 3.433858871459961, + "learning_rate": 2.652777777777778e-05, + "loss": 0.4446, + "step": 1193 + }, + { + "epoch": 4.719367588932807, + "grad_norm": 3.7666232585906982, + "learning_rate": 2.650793650793651e-05, + "loss": 0.5724, + "step": 1194 + }, + { + "epoch": 4.723320158102767, + "grad_norm": 3.7725718021392822, + "learning_rate": 2.648809523809524e-05, + "loss": 0.7872, + "step": 1195 + }, + { + "epoch": 4.7272727272727275, + "grad_norm": 4.2439446449279785, + "learning_rate": 2.646825396825397e-05, + "loss": 0.7158, + "step": 1196 + }, + { + "epoch": 4.7312252964426875, + "grad_norm": 5.448159694671631, + "learning_rate": 2.64484126984127e-05, + "loss": 0.4721, + "step": 1197 + }, + { + "epoch": 4.735177865612648, + "grad_norm": 4.358859062194824, + "learning_rate": 2.642857142857143e-05, + "loss": 0.6254, + "step": 1198 + }, + { + "epoch": 4.739130434782608, + "grad_norm": 3.7496564388275146, + "learning_rate": 2.640873015873016e-05, + "loss": 0.7183, + "step": 1199 + }, + { + "epoch": 4.743083003952569, + "grad_norm": 4.28209114074707, + "learning_rate": 2.6388888888888892e-05, + "loss": 0.5722, + "step": 1200 + }, + { + "epoch": 4.747035573122529, + "grad_norm": 3.856718063354492, + "learning_rate": 2.6369047619047622e-05, + "loss": 0.5289, + "step": 1201 + }, + { + "epoch": 4.75098814229249, + "grad_norm": 3.4435012340545654, + "learning_rate": 2.6349206349206353e-05, + "loss": 0.7542, + "step": 1202 + }, + { + "epoch": 4.75494071146245, + "grad_norm": 4.4995436668396, + "learning_rate": 2.6329365079365083e-05, + "loss": 0.7093, + "step": 1203 + }, + { + "epoch": 4.758893280632411, + "grad_norm": 3.929421901702881, + "learning_rate": 2.6309523809523813e-05, + "loss": 0.8889, + "step": 1204 + }, + { + "epoch": 4.762845849802371, + "grad_norm": 3.778069496154785, + "learning_rate": 2.628968253968254e-05, + "loss": 0.766, + "step": 1205 + }, + { + "epoch": 4.766798418972332, + "grad_norm": 3.344264030456543, + "learning_rate": 2.626984126984127e-05, + "loss": 0.5133, + "step": 1206 + }, + { + "epoch": 4.770750988142293, + "grad_norm": 3.597881317138672, + "learning_rate": 2.625e-05, + "loss": 0.7008, + "step": 1207 + }, + { + "epoch": 4.774703557312253, + "grad_norm": 3.753389358520508, + "learning_rate": 2.623015873015873e-05, + "loss": 0.5307, + "step": 1208 + }, + { + "epoch": 4.778656126482213, + "grad_norm": 3.206299066543579, + "learning_rate": 2.6210317460317462e-05, + "loss": 0.5639, + "step": 1209 + }, + { + "epoch": 4.782608695652174, + "grad_norm": 3.630187749862671, + "learning_rate": 2.6190476190476192e-05, + "loss": 0.4498, + "step": 1210 + }, + { + "epoch": 4.786561264822135, + "grad_norm": 3.9658334255218506, + "learning_rate": 2.6170634920634923e-05, + "loss": 0.6379, + "step": 1211 + }, + { + "epoch": 4.790513833992095, + "grad_norm": 4.101767539978027, + "learning_rate": 2.6150793650793653e-05, + "loss": 0.6574, + "step": 1212 + }, + { + "epoch": 4.794466403162056, + "grad_norm": 3.9470412731170654, + "learning_rate": 2.6130952380952383e-05, + "loss": 0.5475, + "step": 1213 + }, + { + "epoch": 4.798418972332016, + "grad_norm": 4.430981159210205, + "learning_rate": 2.6111111111111114e-05, + "loss": 0.4462, + "step": 1214 + }, + { + "epoch": 4.8023715415019765, + "grad_norm": 5.369266033172607, + "learning_rate": 2.6091269841269844e-05, + "loss": 0.7793, + "step": 1215 + }, + { + "epoch": 4.8063241106719365, + "grad_norm": 3.2877326011657715, + "learning_rate": 2.6071428571428574e-05, + "loss": 0.5941, + "step": 1216 + }, + { + "epoch": 4.810276679841897, + "grad_norm": 4.576911449432373, + "learning_rate": 2.6051587301587305e-05, + "loss": 0.582, + "step": 1217 + }, + { + "epoch": 4.8142292490118574, + "grad_norm": 3.5742104053497314, + "learning_rate": 2.6031746031746035e-05, + "loss": 0.6526, + "step": 1218 + }, + { + "epoch": 4.818181818181818, + "grad_norm": 2.9173500537872314, + "learning_rate": 2.6011904761904766e-05, + "loss": 0.4235, + "step": 1219 + }, + { + "epoch": 4.822134387351778, + "grad_norm": 3.32147479057312, + "learning_rate": 2.5992063492063496e-05, + "loss": 0.5129, + "step": 1220 + }, + { + "epoch": 4.826086956521739, + "grad_norm": 3.451444387435913, + "learning_rate": 2.5972222222222226e-05, + "loss": 0.5899, + "step": 1221 + }, + { + "epoch": 4.830039525691699, + "grad_norm": 4.481076717376709, + "learning_rate": 2.5952380952380957e-05, + "loss": 0.7092, + "step": 1222 + }, + { + "epoch": 4.83399209486166, + "grad_norm": 5.769997596740723, + "learning_rate": 2.5932539682539687e-05, + "loss": 0.8478, + "step": 1223 + }, + { + "epoch": 4.837944664031621, + "grad_norm": 4.319329738616943, + "learning_rate": 2.5912698412698417e-05, + "loss": 0.7591, + "step": 1224 + }, + { + "epoch": 4.841897233201581, + "grad_norm": 3.661302328109741, + "learning_rate": 2.5892857142857148e-05, + "loss": 0.6261, + "step": 1225 + }, + { + "epoch": 4.845849802371541, + "grad_norm": 3.7250123023986816, + "learning_rate": 2.5873015873015878e-05, + "loss": 0.8497, + "step": 1226 + }, + { + "epoch": 4.849802371541502, + "grad_norm": 4.192583084106445, + "learning_rate": 2.585317460317461e-05, + "loss": 0.6283, + "step": 1227 + }, + { + "epoch": 4.853754940711463, + "grad_norm": 3.978309154510498, + "learning_rate": 2.5833333333333336e-05, + "loss": 0.7002, + "step": 1228 + }, + { + "epoch": 4.857707509881423, + "grad_norm": 3.473998546600342, + "learning_rate": 2.5813492063492066e-05, + "loss": 0.6287, + "step": 1229 + }, + { + "epoch": 4.861660079051384, + "grad_norm": 3.9068286418914795, + "learning_rate": 2.5793650793650796e-05, + "loss": 0.4956, + "step": 1230 + }, + { + "epoch": 4.865612648221344, + "grad_norm": 2.8710238933563232, + "learning_rate": 2.5773809523809523e-05, + "loss": 0.6042, + "step": 1231 + }, + { + "epoch": 4.869565217391305, + "grad_norm": 3.4524970054626465, + "learning_rate": 2.5753968253968254e-05, + "loss": 0.4019, + "step": 1232 + }, + { + "epoch": 4.873517786561265, + "grad_norm": 3.4277803897857666, + "learning_rate": 2.5734126984126984e-05, + "loss": 0.4765, + "step": 1233 + }, + { + "epoch": 4.877470355731226, + "grad_norm": 3.466019868850708, + "learning_rate": 2.5714285714285714e-05, + "loss": 0.6861, + "step": 1234 + }, + { + "epoch": 4.881422924901186, + "grad_norm": 4.309812068939209, + "learning_rate": 2.5694444444444445e-05, + "loss": 0.5055, + "step": 1235 + }, + { + "epoch": 4.8853754940711465, + "grad_norm": 3.608254909515381, + "learning_rate": 2.5674603174603172e-05, + "loss": 0.5123, + "step": 1236 + }, + { + "epoch": 4.8893280632411065, + "grad_norm": 4.4674787521362305, + "learning_rate": 2.5654761904761902e-05, + "loss": 0.8411, + "step": 1237 + }, + { + "epoch": 4.893280632411067, + "grad_norm": 4.53634786605835, + "learning_rate": 2.5634920634920633e-05, + "loss": 0.5074, + "step": 1238 + }, + { + "epoch": 4.897233201581027, + "grad_norm": 3.6150360107421875, + "learning_rate": 2.5615079365079363e-05, + "loss": 0.7875, + "step": 1239 + }, + { + "epoch": 4.901185770750988, + "grad_norm": 4.096851348876953, + "learning_rate": 2.5595238095238093e-05, + "loss": 0.6804, + "step": 1240 + }, + { + "epoch": 4.905138339920948, + "grad_norm": 3.8696417808532715, + "learning_rate": 2.5575396825396824e-05, + "loss": 0.6159, + "step": 1241 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 3.823349714279175, + "learning_rate": 2.5555555555555554e-05, + "loss": 0.6646, + "step": 1242 + }, + { + "epoch": 4.913043478260869, + "grad_norm": 4.550106048583984, + "learning_rate": 2.5535714285714284e-05, + "loss": 0.6771, + "step": 1243 + }, + { + "epoch": 4.91699604743083, + "grad_norm": 4.155416011810303, + "learning_rate": 2.5515873015873015e-05, + "loss": 0.6493, + "step": 1244 + }, + { + "epoch": 4.920948616600791, + "grad_norm": 3.774624824523926, + "learning_rate": 2.5496031746031745e-05, + "loss": 0.6771, + "step": 1245 + }, + { + "epoch": 4.924901185770751, + "grad_norm": 3.5552561283111572, + "learning_rate": 2.5476190476190476e-05, + "loss": 0.6626, + "step": 1246 + }, + { + "epoch": 4.928853754940711, + "grad_norm": 3.7037765979766846, + "learning_rate": 2.5456349206349206e-05, + "loss": 0.5427, + "step": 1247 + }, + { + "epoch": 4.932806324110672, + "grad_norm": 4.090572357177734, + "learning_rate": 2.5436507936507936e-05, + "loss": 0.7206, + "step": 1248 + }, + { + "epoch": 4.936758893280633, + "grad_norm": 3.817936420440674, + "learning_rate": 2.5416666666666667e-05, + "loss": 0.4127, + "step": 1249 + }, + { + "epoch": 4.940711462450593, + "grad_norm": 3.522655487060547, + "learning_rate": 2.5396825396825397e-05, + "loss": 0.6837, + "step": 1250 + }, + { + "epoch": 4.944664031620554, + "grad_norm": 3.069960832595825, + "learning_rate": 2.5376984126984127e-05, + "loss": 0.5466, + "step": 1251 + }, + { + "epoch": 4.948616600790514, + "grad_norm": 2.7633731365203857, + "learning_rate": 2.5357142857142858e-05, + "loss": 0.3194, + "step": 1252 + }, + { + "epoch": 4.952569169960475, + "grad_norm": 4.602902889251709, + "learning_rate": 2.5337301587301588e-05, + "loss": 0.5539, + "step": 1253 + }, + { + "epoch": 4.956521739130435, + "grad_norm": 4.454743385314941, + "learning_rate": 2.531746031746032e-05, + "loss": 0.5853, + "step": 1254 + }, + { + "epoch": 4.9604743083003955, + "grad_norm": 3.8360097408294678, + "learning_rate": 2.529761904761905e-05, + "loss": 0.6157, + "step": 1255 + }, + { + "epoch": 4.9644268774703555, + "grad_norm": 3.1883623600006104, + "learning_rate": 2.527777777777778e-05, + "loss": 0.6483, + "step": 1256 + }, + { + "epoch": 4.968379446640316, + "grad_norm": 3.8525397777557373, + "learning_rate": 2.525793650793651e-05, + "loss": 0.4996, + "step": 1257 + }, + { + "epoch": 4.972332015810276, + "grad_norm": 3.8935108184814453, + "learning_rate": 2.523809523809524e-05, + "loss": 0.5236, + "step": 1258 + }, + { + "epoch": 4.976284584980237, + "grad_norm": 3.436164140701294, + "learning_rate": 2.5218253968253967e-05, + "loss": 0.6688, + "step": 1259 + }, + { + "epoch": 4.980237154150197, + "grad_norm": 3.803886651992798, + "learning_rate": 2.5198412698412697e-05, + "loss": 0.4893, + "step": 1260 + }, + { + "epoch": 4.984189723320158, + "grad_norm": 4.310186862945557, + "learning_rate": 2.5178571428571428e-05, + "loss": 0.3476, + "step": 1261 + }, + { + "epoch": 4.988142292490118, + "grad_norm": 4.136441707611084, + "learning_rate": 2.5158730158730158e-05, + "loss": 0.6233, + "step": 1262 + }, + { + "epoch": 4.992094861660079, + "grad_norm": 3.9039924144744873, + "learning_rate": 2.513888888888889e-05, + "loss": 0.6625, + "step": 1263 + }, + { + "epoch": 4.996047430830039, + "grad_norm": 5.175858497619629, + "learning_rate": 2.511904761904762e-05, + "loss": 0.5538, + "step": 1264 + }, + { + "epoch": 5.0, + "grad_norm": 4.218173503875732, + "learning_rate": 2.509920634920635e-05, + "loss": 0.7123, + "step": 1265 + }, + { + "epoch": 5.003952569169961, + "grad_norm": 3.1178061962127686, + "learning_rate": 2.507936507936508e-05, + "loss": 0.4178, + "step": 1266 + }, + { + "epoch": 5.007905138339921, + "grad_norm": 2.8245718479156494, + "learning_rate": 2.505952380952381e-05, + "loss": 0.2365, + "step": 1267 + }, + { + "epoch": 5.011857707509882, + "grad_norm": 2.6951630115509033, + "learning_rate": 2.503968253968254e-05, + "loss": 0.4007, + "step": 1268 + }, + { + "epoch": 5.015810276679842, + "grad_norm": 2.6765530109405518, + "learning_rate": 2.501984126984127e-05, + "loss": 0.2377, + "step": 1269 + }, + { + "epoch": 5.019762845849803, + "grad_norm": 4.122332572937012, + "learning_rate": 2.5e-05, + "loss": 0.3555, + "step": 1270 + }, + { + "epoch": 5.023715415019763, + "grad_norm": 4.3077712059021, + "learning_rate": 2.498015873015873e-05, + "loss": 0.2416, + "step": 1271 + }, + { + "epoch": 5.027667984189724, + "grad_norm": 4.278382778167725, + "learning_rate": 2.4960317460317462e-05, + "loss": 0.2413, + "step": 1272 + }, + { + "epoch": 5.031620553359684, + "grad_norm": 5.61036491394043, + "learning_rate": 2.4940476190476192e-05, + "loss": 0.4181, + "step": 1273 + }, + { + "epoch": 5.0355731225296445, + "grad_norm": 5.716897487640381, + "learning_rate": 2.4920634920634923e-05, + "loss": 0.4539, + "step": 1274 + }, + { + "epoch": 5.0395256916996045, + "grad_norm": 4.314607620239258, + "learning_rate": 2.4900793650793653e-05, + "loss": 0.3872, + "step": 1275 + }, + { + "epoch": 5.043478260869565, + "grad_norm": 4.847557067871094, + "learning_rate": 2.4880952380952383e-05, + "loss": 0.4242, + "step": 1276 + }, + { + "epoch": 5.047430830039525, + "grad_norm": 4.569025039672852, + "learning_rate": 2.4861111111111114e-05, + "loss": 0.3746, + "step": 1277 + }, + { + "epoch": 5.051383399209486, + "grad_norm": 3.309478282928467, + "learning_rate": 2.4841269841269844e-05, + "loss": 0.263, + "step": 1278 + }, + { + "epoch": 5.055335968379446, + "grad_norm": 3.7712574005126953, + "learning_rate": 2.4821428571428575e-05, + "loss": 0.3596, + "step": 1279 + }, + { + "epoch": 5.059288537549407, + "grad_norm": 4.986356258392334, + "learning_rate": 2.4801587301587305e-05, + "loss": 0.324, + "step": 1280 + }, + { + "epoch": 5.063241106719367, + "grad_norm": 3.729706048965454, + "learning_rate": 2.4781746031746035e-05, + "loss": 0.476, + "step": 1281 + }, + { + "epoch": 5.067193675889328, + "grad_norm": 3.7136807441711426, + "learning_rate": 2.4761904761904762e-05, + "loss": 0.2756, + "step": 1282 + }, + { + "epoch": 5.071146245059288, + "grad_norm": 2.8954904079437256, + "learning_rate": 2.4742063492063493e-05, + "loss": 0.3505, + "step": 1283 + }, + { + "epoch": 5.075098814229249, + "grad_norm": 4.071838855743408, + "learning_rate": 2.4722222222222223e-05, + "loss": 0.447, + "step": 1284 + }, + { + "epoch": 5.07905138339921, + "grad_norm": 3.134131908416748, + "learning_rate": 2.4702380952380953e-05, + "loss": 0.3407, + "step": 1285 + }, + { + "epoch": 5.08300395256917, + "grad_norm": 3.690873861312866, + "learning_rate": 2.4682539682539684e-05, + "loss": 0.3665, + "step": 1286 + }, + { + "epoch": 5.086956521739131, + "grad_norm": 4.1070051193237305, + "learning_rate": 2.4662698412698414e-05, + "loss": 0.3099, + "step": 1287 + }, + { + "epoch": 5.090909090909091, + "grad_norm": 3.0999696254730225, + "learning_rate": 2.4642857142857145e-05, + "loss": 0.278, + "step": 1288 + }, + { + "epoch": 5.094861660079052, + "grad_norm": 4.00131893157959, + "learning_rate": 2.4623015873015875e-05, + "loss": 0.4586, + "step": 1289 + }, + { + "epoch": 5.098814229249012, + "grad_norm": 4.4364776611328125, + "learning_rate": 2.4603174603174602e-05, + "loss": 0.5131, + "step": 1290 + }, + { + "epoch": 5.102766798418973, + "grad_norm": 4.154432773590088, + "learning_rate": 2.4583333333333332e-05, + "loss": 0.2289, + "step": 1291 + }, + { + "epoch": 5.106719367588933, + "grad_norm": 4.669454574584961, + "learning_rate": 2.4563492063492063e-05, + "loss": 0.3081, + "step": 1292 + }, + { + "epoch": 5.1106719367588935, + "grad_norm": 3.263782024383545, + "learning_rate": 2.4543650793650793e-05, + "loss": 0.2814, + "step": 1293 + }, + { + "epoch": 5.1146245059288535, + "grad_norm": 4.402275085449219, + "learning_rate": 2.4523809523809523e-05, + "loss": 0.2797, + "step": 1294 + }, + { + "epoch": 5.118577075098814, + "grad_norm": 3.0900776386260986, + "learning_rate": 2.4503968253968254e-05, + "loss": 0.2825, + "step": 1295 + }, + { + "epoch": 5.122529644268774, + "grad_norm": 3.7776150703430176, + "learning_rate": 2.4484126984126984e-05, + "loss": 0.3906, + "step": 1296 + }, + { + "epoch": 5.126482213438735, + "grad_norm": 3.909672975540161, + "learning_rate": 2.4464285714285715e-05, + "loss": 0.3245, + "step": 1297 + }, + { + "epoch": 5.130434782608695, + "grad_norm": 3.4614851474761963, + "learning_rate": 2.4444444444444445e-05, + "loss": 0.364, + "step": 1298 + }, + { + "epoch": 5.134387351778656, + "grad_norm": 3.2292017936706543, + "learning_rate": 2.4424603174603175e-05, + "loss": 0.3371, + "step": 1299 + }, + { + "epoch": 5.138339920948616, + "grad_norm": 4.739649772644043, + "learning_rate": 2.4404761904761906e-05, + "loss": 0.3461, + "step": 1300 + }, + { + "epoch": 5.142292490118577, + "grad_norm": 3.756049156188965, + "learning_rate": 2.4384920634920636e-05, + "loss": 0.3143, + "step": 1301 + }, + { + "epoch": 5.146245059288537, + "grad_norm": 4.8067145347595215, + "learning_rate": 2.4365079365079366e-05, + "loss": 0.4448, + "step": 1302 + }, + { + "epoch": 5.150197628458498, + "grad_norm": 3.8932623863220215, + "learning_rate": 2.4345238095238097e-05, + "loss": 0.2526, + "step": 1303 + }, + { + "epoch": 5.154150197628459, + "grad_norm": 3.6755340099334717, + "learning_rate": 2.4325396825396827e-05, + "loss": 0.397, + "step": 1304 + }, + { + "epoch": 5.158102766798419, + "grad_norm": 3.7219412326812744, + "learning_rate": 2.4305555555555558e-05, + "loss": 0.1992, + "step": 1305 + }, + { + "epoch": 5.16205533596838, + "grad_norm": 3.744370698928833, + "learning_rate": 2.4285714285714288e-05, + "loss": 0.2522, + "step": 1306 + }, + { + "epoch": 5.16600790513834, + "grad_norm": 3.2599740028381348, + "learning_rate": 2.426587301587302e-05, + "loss": 0.3581, + "step": 1307 + }, + { + "epoch": 5.169960474308301, + "grad_norm": 3.48649525642395, + "learning_rate": 2.424603174603175e-05, + "loss": 0.3864, + "step": 1308 + }, + { + "epoch": 5.173913043478261, + "grad_norm": 3.1932923793792725, + "learning_rate": 2.4226190476190476e-05, + "loss": 0.3054, + "step": 1309 + }, + { + "epoch": 5.177865612648222, + "grad_norm": 4.015415191650391, + "learning_rate": 2.4206349206349206e-05, + "loss": 0.2925, + "step": 1310 + }, + { + "epoch": 5.181818181818182, + "grad_norm": 3.9713363647460938, + "learning_rate": 2.4186507936507936e-05, + "loss": 0.2362, + "step": 1311 + }, + { + "epoch": 5.1857707509881426, + "grad_norm": 4.697850704193115, + "learning_rate": 2.4166666666666667e-05, + "loss": 0.2583, + "step": 1312 + }, + { + "epoch": 5.189723320158103, + "grad_norm": 3.067460298538208, + "learning_rate": 2.4146825396825397e-05, + "loss": 0.2453, + "step": 1313 + }, + { + "epoch": 5.1936758893280635, + "grad_norm": 4.411377906799316, + "learning_rate": 2.4126984126984128e-05, + "loss": 0.1776, + "step": 1314 + }, + { + "epoch": 5.1976284584980235, + "grad_norm": 3.420597791671753, + "learning_rate": 2.4107142857142858e-05, + "loss": 0.3165, + "step": 1315 + }, + { + "epoch": 5.201581027667984, + "grad_norm": 4.028104305267334, + "learning_rate": 2.408730158730159e-05, + "loss": 0.4102, + "step": 1316 + }, + { + "epoch": 5.205533596837944, + "grad_norm": 3.9082906246185303, + "learning_rate": 2.406746031746032e-05, + "loss": 0.3382, + "step": 1317 + }, + { + "epoch": 5.209486166007905, + "grad_norm": 4.633881092071533, + "learning_rate": 2.404761904761905e-05, + "loss": 0.2822, + "step": 1318 + }, + { + "epoch": 5.213438735177865, + "grad_norm": 3.6297354698181152, + "learning_rate": 2.402777777777778e-05, + "loss": 0.2072, + "step": 1319 + }, + { + "epoch": 5.217391304347826, + "grad_norm": 3.1377158164978027, + "learning_rate": 2.400793650793651e-05, + "loss": 0.271, + "step": 1320 + }, + { + "epoch": 5.221343873517786, + "grad_norm": 3.472032308578491, + "learning_rate": 2.398809523809524e-05, + "loss": 0.2801, + "step": 1321 + }, + { + "epoch": 5.225296442687747, + "grad_norm": 3.7419073581695557, + "learning_rate": 2.396825396825397e-05, + "loss": 0.3191, + "step": 1322 + }, + { + "epoch": 5.229249011857707, + "grad_norm": 3.5220224857330322, + "learning_rate": 2.39484126984127e-05, + "loss": 0.4082, + "step": 1323 + }, + { + "epoch": 5.233201581027668, + "grad_norm": 3.874562978744507, + "learning_rate": 2.392857142857143e-05, + "loss": 0.3281, + "step": 1324 + }, + { + "epoch": 5.237154150197629, + "grad_norm": 4.159897804260254, + "learning_rate": 2.390873015873016e-05, + "loss": 0.437, + "step": 1325 + }, + { + "epoch": 5.241106719367589, + "grad_norm": 4.653357982635498, + "learning_rate": 2.3888888888888892e-05, + "loss": 0.4198, + "step": 1326 + }, + { + "epoch": 5.24505928853755, + "grad_norm": 5.037525177001953, + "learning_rate": 2.3869047619047622e-05, + "loss": 0.4139, + "step": 1327 + }, + { + "epoch": 5.24901185770751, + "grad_norm": 4.0110368728637695, + "learning_rate": 2.3849206349206353e-05, + "loss": 0.2786, + "step": 1328 + }, + { + "epoch": 5.252964426877471, + "grad_norm": 3.9994056224823, + "learning_rate": 2.3829365079365083e-05, + "loss": 0.3709, + "step": 1329 + }, + { + "epoch": 5.256916996047431, + "grad_norm": 3.936352014541626, + "learning_rate": 2.380952380952381e-05, + "loss": 0.3744, + "step": 1330 + }, + { + "epoch": 5.260869565217392, + "grad_norm": 3.9300754070281982, + "learning_rate": 2.378968253968254e-05, + "loss": 0.5034, + "step": 1331 + }, + { + "epoch": 5.264822134387352, + "grad_norm": 4.13817024230957, + "learning_rate": 2.376984126984127e-05, + "loss": 0.4842, + "step": 1332 + }, + { + "epoch": 5.2687747035573125, + "grad_norm": 3.0476675033569336, + "learning_rate": 2.375e-05, + "loss": 0.2846, + "step": 1333 + }, + { + "epoch": 5.2727272727272725, + "grad_norm": 4.827993869781494, + "learning_rate": 2.373015873015873e-05, + "loss": 0.4083, + "step": 1334 + }, + { + "epoch": 5.276679841897233, + "grad_norm": 3.111452341079712, + "learning_rate": 2.3710317460317462e-05, + "loss": 0.3334, + "step": 1335 + }, + { + "epoch": 5.280632411067193, + "grad_norm": 4.097339630126953, + "learning_rate": 2.369047619047619e-05, + "loss": 0.3375, + "step": 1336 + }, + { + "epoch": 5.284584980237154, + "grad_norm": 4.1747822761535645, + "learning_rate": 2.367063492063492e-05, + "loss": 0.3817, + "step": 1337 + }, + { + "epoch": 5.288537549407114, + "grad_norm": 3.4249610900878906, + "learning_rate": 2.365079365079365e-05, + "loss": 0.3121, + "step": 1338 + }, + { + "epoch": 5.292490118577075, + "grad_norm": 3.0609562397003174, + "learning_rate": 2.363095238095238e-05, + "loss": 0.3, + "step": 1339 + }, + { + "epoch": 5.296442687747035, + "grad_norm": 4.675295829772949, + "learning_rate": 2.361111111111111e-05, + "loss": 0.4734, + "step": 1340 + }, + { + "epoch": 5.300395256916996, + "grad_norm": 4.56593656539917, + "learning_rate": 2.359126984126984e-05, + "loss": 0.386, + "step": 1341 + }, + { + "epoch": 5.304347826086957, + "grad_norm": 5.29105806350708, + "learning_rate": 2.357142857142857e-05, + "loss": 0.3133, + "step": 1342 + }, + { + "epoch": 5.308300395256917, + "grad_norm": 4.2684712409973145, + "learning_rate": 2.35515873015873e-05, + "loss": 0.4087, + "step": 1343 + }, + { + "epoch": 5.312252964426877, + "grad_norm": 5.04403018951416, + "learning_rate": 2.3531746031746032e-05, + "loss": 0.4299, + "step": 1344 + }, + { + "epoch": 5.316205533596838, + "grad_norm": 4.923758506774902, + "learning_rate": 2.3511904761904762e-05, + "loss": 0.3231, + "step": 1345 + }, + { + "epoch": 5.320158102766799, + "grad_norm": 2.760050058364868, + "learning_rate": 2.3492063492063493e-05, + "loss": 0.2268, + "step": 1346 + }, + { + "epoch": 5.324110671936759, + "grad_norm": 3.437049150466919, + "learning_rate": 2.3472222222222223e-05, + "loss": 0.391, + "step": 1347 + }, + { + "epoch": 5.32806324110672, + "grad_norm": 4.473261833190918, + "learning_rate": 2.3452380952380954e-05, + "loss": 0.4342, + "step": 1348 + }, + { + "epoch": 5.33201581027668, + "grad_norm": 4.289645195007324, + "learning_rate": 2.3432539682539684e-05, + "loss": 0.4789, + "step": 1349 + }, + { + "epoch": 5.335968379446641, + "grad_norm": 3.814570426940918, + "learning_rate": 2.3412698412698414e-05, + "loss": 0.3432, + "step": 1350 + }, + { + "epoch": 5.339920948616601, + "grad_norm": 3.5725395679473877, + "learning_rate": 2.3392857142857145e-05, + "loss": 0.2394, + "step": 1351 + }, + { + "epoch": 5.3438735177865615, + "grad_norm": 3.9966726303100586, + "learning_rate": 2.3373015873015875e-05, + "loss": 0.3657, + "step": 1352 + }, + { + "epoch": 5.3478260869565215, + "grad_norm": 3.7222578525543213, + "learning_rate": 2.3353174603174605e-05, + "loss": 0.2929, + "step": 1353 + }, + { + "epoch": 5.351778656126482, + "grad_norm": 3.659536361694336, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.5094, + "step": 1354 + }, + { + "epoch": 5.355731225296442, + "grad_norm": 4.850214004516602, + "learning_rate": 2.3313492063492066e-05, + "loss": 0.4599, + "step": 1355 + }, + { + "epoch": 5.359683794466403, + "grad_norm": 3.3052451610565186, + "learning_rate": 2.3293650793650797e-05, + "loss": 0.1597, + "step": 1356 + }, + { + "epoch": 5.363636363636363, + "grad_norm": 3.6496050357818604, + "learning_rate": 2.3273809523809527e-05, + "loss": 0.3862, + "step": 1357 + }, + { + "epoch": 5.367588932806324, + "grad_norm": 4.302274703979492, + "learning_rate": 2.3253968253968257e-05, + "loss": 0.2688, + "step": 1358 + }, + { + "epoch": 5.371541501976284, + "grad_norm": 4.052597522735596, + "learning_rate": 2.3234126984126984e-05, + "loss": 0.328, + "step": 1359 + }, + { + "epoch": 5.375494071146245, + "grad_norm": 4.061760425567627, + "learning_rate": 2.3214285714285715e-05, + "loss": 0.394, + "step": 1360 + }, + { + "epoch": 5.379446640316205, + "grad_norm": 4.183040618896484, + "learning_rate": 2.3194444444444445e-05, + "loss": 0.4477, + "step": 1361 + }, + { + "epoch": 5.383399209486166, + "grad_norm": 2.766695737838745, + "learning_rate": 2.3174603174603175e-05, + "loss": 0.1786, + "step": 1362 + }, + { + "epoch": 5.387351778656127, + "grad_norm": 5.040890693664551, + "learning_rate": 2.3154761904761906e-05, + "loss": 0.4192, + "step": 1363 + }, + { + "epoch": 5.391304347826087, + "grad_norm": 4.445863246917725, + "learning_rate": 2.3134920634920636e-05, + "loss": 0.5025, + "step": 1364 + }, + { + "epoch": 5.395256916996048, + "grad_norm": 3.4918015003204346, + "learning_rate": 2.3115079365079367e-05, + "loss": 0.2392, + "step": 1365 + }, + { + "epoch": 5.399209486166008, + "grad_norm": 3.366082191467285, + "learning_rate": 2.3095238095238097e-05, + "loss": 0.4595, + "step": 1366 + }, + { + "epoch": 5.403162055335969, + "grad_norm": 3.521512508392334, + "learning_rate": 2.3075396825396827e-05, + "loss": 0.422, + "step": 1367 + }, + { + "epoch": 5.407114624505929, + "grad_norm": 3.9632761478424072, + "learning_rate": 2.3055555555555558e-05, + "loss": 0.4579, + "step": 1368 + }, + { + "epoch": 5.41106719367589, + "grad_norm": 3.9189956188201904, + "learning_rate": 2.3035714285714285e-05, + "loss": 0.3785, + "step": 1369 + }, + { + "epoch": 5.41501976284585, + "grad_norm": 3.7866268157958984, + "learning_rate": 2.3015873015873015e-05, + "loss": 0.223, + "step": 1370 + }, + { + "epoch": 5.4189723320158105, + "grad_norm": 3.4151864051818848, + "learning_rate": 2.2996031746031745e-05, + "loss": 0.1987, + "step": 1371 + }, + { + "epoch": 5.4229249011857705, + "grad_norm": 3.8373048305511475, + "learning_rate": 2.2976190476190476e-05, + "loss": 0.3249, + "step": 1372 + }, + { + "epoch": 5.426877470355731, + "grad_norm": 4.3949103355407715, + "learning_rate": 2.2956349206349206e-05, + "loss": 0.3704, + "step": 1373 + }, + { + "epoch": 5.430830039525691, + "grad_norm": 3.6524100303649902, + "learning_rate": 2.2936507936507937e-05, + "loss": 0.4462, + "step": 1374 + }, + { + "epoch": 5.434782608695652, + "grad_norm": 3.927030086517334, + "learning_rate": 2.2916666666666667e-05, + "loss": 0.3642, + "step": 1375 + }, + { + "epoch": 5.438735177865612, + "grad_norm": 4.005973815917969, + "learning_rate": 2.2896825396825397e-05, + "loss": 0.4237, + "step": 1376 + }, + { + "epoch": 5.442687747035573, + "grad_norm": 3.733210325241089, + "learning_rate": 2.2876984126984128e-05, + "loss": 0.3089, + "step": 1377 + }, + { + "epoch": 5.446640316205533, + "grad_norm": 5.140311241149902, + "learning_rate": 2.2857142857142858e-05, + "loss": 0.4288, + "step": 1378 + }, + { + "epoch": 5.450592885375494, + "grad_norm": 3.3404946327209473, + "learning_rate": 2.283730158730159e-05, + "loss": 0.2962, + "step": 1379 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 4.286531448364258, + "learning_rate": 2.281746031746032e-05, + "loss": 0.3931, + "step": 1380 + }, + { + "epoch": 5.458498023715415, + "grad_norm": 3.68621826171875, + "learning_rate": 2.279761904761905e-05, + "loss": 0.2881, + "step": 1381 + }, + { + "epoch": 5.462450592885375, + "grad_norm": 3.6594183444976807, + "learning_rate": 2.277777777777778e-05, + "loss": 0.293, + "step": 1382 + }, + { + "epoch": 5.466403162055336, + "grad_norm": 3.7897143363952637, + "learning_rate": 2.275793650793651e-05, + "loss": 0.207, + "step": 1383 + }, + { + "epoch": 5.470355731225297, + "grad_norm": 4.473965644836426, + "learning_rate": 2.273809523809524e-05, + "loss": 0.2935, + "step": 1384 + }, + { + "epoch": 5.474308300395257, + "grad_norm": 3.8163721561431885, + "learning_rate": 2.271825396825397e-05, + "loss": 0.4574, + "step": 1385 + }, + { + "epoch": 5.478260869565218, + "grad_norm": 3.263646125793457, + "learning_rate": 2.2698412698412698e-05, + "loss": 0.3372, + "step": 1386 + }, + { + "epoch": 5.482213438735178, + "grad_norm": 4.062425136566162, + "learning_rate": 2.2678571428571428e-05, + "loss": 0.2997, + "step": 1387 + }, + { + "epoch": 5.486166007905139, + "grad_norm": 3.691974401473999, + "learning_rate": 2.265873015873016e-05, + "loss": 0.3711, + "step": 1388 + }, + { + "epoch": 5.490118577075099, + "grad_norm": 3.3711211681365967, + "learning_rate": 2.263888888888889e-05, + "loss": 0.293, + "step": 1389 + }, + { + "epoch": 5.4940711462450595, + "grad_norm": 3.659691572189331, + "learning_rate": 2.261904761904762e-05, + "loss": 0.2519, + "step": 1390 + }, + { + "epoch": 5.4980237154150196, + "grad_norm": 4.6761088371276855, + "learning_rate": 2.259920634920635e-05, + "loss": 0.3788, + "step": 1391 + }, + { + "epoch": 5.5019762845849804, + "grad_norm": 4.013514995574951, + "learning_rate": 2.257936507936508e-05, + "loss": 0.3685, + "step": 1392 + }, + { + "epoch": 5.5059288537549405, + "grad_norm": 3.272243022918701, + "learning_rate": 2.255952380952381e-05, + "loss": 0.2541, + "step": 1393 + }, + { + "epoch": 5.509881422924901, + "grad_norm": 4.194965362548828, + "learning_rate": 2.253968253968254e-05, + "loss": 0.3032, + "step": 1394 + }, + { + "epoch": 5.513833992094861, + "grad_norm": 5.20955228805542, + "learning_rate": 2.251984126984127e-05, + "loss": 0.4339, + "step": 1395 + }, + { + "epoch": 5.517786561264822, + "grad_norm": 3.9413256645202637, + "learning_rate": 2.25e-05, + "loss": 0.2921, + "step": 1396 + }, + { + "epoch": 5.521739130434782, + "grad_norm": 3.783141613006592, + "learning_rate": 2.2480158730158732e-05, + "loss": 0.2763, + "step": 1397 + }, + { + "epoch": 5.525691699604743, + "grad_norm": 3.764054298400879, + "learning_rate": 2.2460317460317462e-05, + "loss": 0.4252, + "step": 1398 + }, + { + "epoch": 5.529644268774703, + "grad_norm": 3.668379545211792, + "learning_rate": 2.2440476190476193e-05, + "loss": 0.3577, + "step": 1399 + }, + { + "epoch": 5.533596837944664, + "grad_norm": 4.717470645904541, + "learning_rate": 2.2420634920634923e-05, + "loss": 0.2837, + "step": 1400 + }, + { + "epoch": 5.537549407114625, + "grad_norm": 4.919825553894043, + "learning_rate": 2.2400793650793653e-05, + "loss": 0.4905, + "step": 1401 + }, + { + "epoch": 5.541501976284585, + "grad_norm": 3.827908754348755, + "learning_rate": 2.2380952380952384e-05, + "loss": 0.219, + "step": 1402 + }, + { + "epoch": 5.545454545454545, + "grad_norm": 3.300463914871216, + "learning_rate": 2.2361111111111114e-05, + "loss": 0.2114, + "step": 1403 + }, + { + "epoch": 5.549407114624506, + "grad_norm": 4.180085182189941, + "learning_rate": 2.2341269841269844e-05, + "loss": 0.4172, + "step": 1404 + }, + { + "epoch": 5.553359683794467, + "grad_norm": 3.7211532592773438, + "learning_rate": 2.2321428571428575e-05, + "loss": 0.3994, + "step": 1405 + }, + { + "epoch": 5.557312252964427, + "grad_norm": 4.4185991287231445, + "learning_rate": 2.2301587301587305e-05, + "loss": 0.4922, + "step": 1406 + }, + { + "epoch": 5.561264822134388, + "grad_norm": 5.1140456199646, + "learning_rate": 2.2281746031746036e-05, + "loss": 0.4038, + "step": 1407 + }, + { + "epoch": 5.565217391304348, + "grad_norm": 3.7456719875335693, + "learning_rate": 2.2261904761904763e-05, + "loss": 0.2587, + "step": 1408 + }, + { + "epoch": 5.569169960474309, + "grad_norm": 4.17236852645874, + "learning_rate": 2.2242063492063493e-05, + "loss": 0.3107, + "step": 1409 + }, + { + "epoch": 5.573122529644269, + "grad_norm": 3.580836057662964, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.3648, + "step": 1410 + }, + { + "epoch": 5.5770750988142295, + "grad_norm": 4.266388893127441, + "learning_rate": 2.2202380952380954e-05, + "loss": 0.3403, + "step": 1411 + }, + { + "epoch": 5.5810276679841895, + "grad_norm": 2.99090576171875, + "learning_rate": 2.2182539682539684e-05, + "loss": 0.3277, + "step": 1412 + }, + { + "epoch": 5.58498023715415, + "grad_norm": 4.490349292755127, + "learning_rate": 2.2162698412698414e-05, + "loss": 0.4371, + "step": 1413 + }, + { + "epoch": 5.58893280632411, + "grad_norm": 4.330888271331787, + "learning_rate": 2.214285714285714e-05, + "loss": 0.2821, + "step": 1414 + }, + { + "epoch": 5.592885375494071, + "grad_norm": 3.7015597820281982, + "learning_rate": 2.2123015873015872e-05, + "loss": 0.2986, + "step": 1415 + }, + { + "epoch": 5.596837944664031, + "grad_norm": 4.108189582824707, + "learning_rate": 2.2103174603174602e-05, + "loss": 0.4689, + "step": 1416 + }, + { + "epoch": 5.600790513833992, + "grad_norm": 3.0757765769958496, + "learning_rate": 2.2083333333333333e-05, + "loss": 0.2335, + "step": 1417 + }, + { + "epoch": 5.604743083003952, + "grad_norm": 3.065613031387329, + "learning_rate": 2.2063492063492063e-05, + "loss": 0.3694, + "step": 1418 + }, + { + "epoch": 5.608695652173913, + "grad_norm": 4.539591312408447, + "learning_rate": 2.2043650793650793e-05, + "loss": 0.3448, + "step": 1419 + }, + { + "epoch": 5.612648221343873, + "grad_norm": 3.5507845878601074, + "learning_rate": 2.2023809523809524e-05, + "loss": 0.4036, + "step": 1420 + }, + { + "epoch": 5.616600790513834, + "grad_norm": 3.766512632369995, + "learning_rate": 2.2003968253968254e-05, + "loss": 0.3682, + "step": 1421 + }, + { + "epoch": 5.620553359683795, + "grad_norm": 3.268449306488037, + "learning_rate": 2.1984126984126984e-05, + "loss": 0.2589, + "step": 1422 + }, + { + "epoch": 5.624505928853755, + "grad_norm": 3.850033760070801, + "learning_rate": 2.1964285714285715e-05, + "loss": 0.2949, + "step": 1423 + }, + { + "epoch": 5.628458498023716, + "grad_norm": 4.563751220703125, + "learning_rate": 2.1944444444444445e-05, + "loss": 0.387, + "step": 1424 + }, + { + "epoch": 5.632411067193676, + "grad_norm": 3.109978199005127, + "learning_rate": 2.1924603174603176e-05, + "loss": 0.2175, + "step": 1425 + }, + { + "epoch": 5.636363636363637, + "grad_norm": 4.669355869293213, + "learning_rate": 2.1904761904761906e-05, + "loss": 0.2422, + "step": 1426 + }, + { + "epoch": 5.640316205533597, + "grad_norm": 3.95523738861084, + "learning_rate": 2.1884920634920636e-05, + "loss": 0.3867, + "step": 1427 + }, + { + "epoch": 5.644268774703558, + "grad_norm": 3.9912211894989014, + "learning_rate": 2.1865079365079367e-05, + "loss": 0.2815, + "step": 1428 + }, + { + "epoch": 5.648221343873518, + "grad_norm": 3.4305765628814697, + "learning_rate": 2.1845238095238097e-05, + "loss": 0.263, + "step": 1429 + }, + { + "epoch": 5.6521739130434785, + "grad_norm": 4.1827216148376465, + "learning_rate": 2.1825396825396827e-05, + "loss": 0.292, + "step": 1430 + }, + { + "epoch": 5.6561264822134385, + "grad_norm": 3.6767704486846924, + "learning_rate": 2.1805555555555558e-05, + "loss": 0.2732, + "step": 1431 + }, + { + "epoch": 5.660079051383399, + "grad_norm": 4.459062099456787, + "learning_rate": 2.1785714285714288e-05, + "loss": 0.3492, + "step": 1432 + }, + { + "epoch": 5.664031620553359, + "grad_norm": 3.5659234523773193, + "learning_rate": 2.176587301587302e-05, + "loss": 0.2712, + "step": 1433 + }, + { + "epoch": 5.66798418972332, + "grad_norm": 4.487940788269043, + "learning_rate": 2.174603174603175e-05, + "loss": 0.2831, + "step": 1434 + }, + { + "epoch": 5.67193675889328, + "grad_norm": 5.612968921661377, + "learning_rate": 2.172619047619048e-05, + "loss": 0.5013, + "step": 1435 + }, + { + "epoch": 5.675889328063241, + "grad_norm": 5.958313465118408, + "learning_rate": 2.170634920634921e-05, + "loss": 0.4195, + "step": 1436 + }, + { + "epoch": 5.679841897233201, + "grad_norm": 4.63142728805542, + "learning_rate": 2.1686507936507937e-05, + "loss": 0.421, + "step": 1437 + }, + { + "epoch": 5.683794466403162, + "grad_norm": 3.549577236175537, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.3966, + "step": 1438 + }, + { + "epoch": 5.687747035573123, + "grad_norm": 3.38968825340271, + "learning_rate": 2.1646825396825397e-05, + "loss": 0.3352, + "step": 1439 + }, + { + "epoch": 5.691699604743083, + "grad_norm": 3.199084520339966, + "learning_rate": 2.1626984126984128e-05, + "loss": 0.2996, + "step": 1440 + }, + { + "epoch": 5.695652173913043, + "grad_norm": 3.6920907497406006, + "learning_rate": 2.1607142857142858e-05, + "loss": 0.329, + "step": 1441 + }, + { + "epoch": 5.699604743083004, + "grad_norm": 3.676727533340454, + "learning_rate": 2.158730158730159e-05, + "loss": 0.5122, + "step": 1442 + }, + { + "epoch": 5.703557312252965, + "grad_norm": 3.520972728729248, + "learning_rate": 2.156746031746032e-05, + "loss": 0.2829, + "step": 1443 + }, + { + "epoch": 5.707509881422925, + "grad_norm": 4.055224895477295, + "learning_rate": 2.154761904761905e-05, + "loss": 0.3474, + "step": 1444 + }, + { + "epoch": 5.711462450592886, + "grad_norm": 4.417123317718506, + "learning_rate": 2.152777777777778e-05, + "loss": 0.3876, + "step": 1445 + }, + { + "epoch": 5.715415019762846, + "grad_norm": 3.9218266010284424, + "learning_rate": 2.150793650793651e-05, + "loss": 0.4128, + "step": 1446 + }, + { + "epoch": 5.719367588932807, + "grad_norm": 4.506389141082764, + "learning_rate": 2.148809523809524e-05, + "loss": 0.4672, + "step": 1447 + }, + { + "epoch": 5.723320158102767, + "grad_norm": 3.930652141571045, + "learning_rate": 2.1468253968253967e-05, + "loss": 0.2926, + "step": 1448 + }, + { + "epoch": 5.7272727272727275, + "grad_norm": 4.012630939483643, + "learning_rate": 2.1448412698412698e-05, + "loss": 0.4217, + "step": 1449 + }, + { + "epoch": 5.7312252964426875, + "grad_norm": 4.338024139404297, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.2772, + "step": 1450 + }, + { + "epoch": 5.735177865612648, + "grad_norm": 3.886202096939087, + "learning_rate": 2.140873015873016e-05, + "loss": 0.3482, + "step": 1451 + }, + { + "epoch": 5.739130434782608, + "grad_norm": 4.083372116088867, + "learning_rate": 2.138888888888889e-05, + "loss": 0.2735, + "step": 1452 + }, + { + "epoch": 5.743083003952569, + "grad_norm": 3.7557191848754883, + "learning_rate": 2.136904761904762e-05, + "loss": 0.4228, + "step": 1453 + }, + { + "epoch": 5.747035573122529, + "grad_norm": 3.172152519226074, + "learning_rate": 2.134920634920635e-05, + "loss": 0.3562, + "step": 1454 + }, + { + "epoch": 5.75098814229249, + "grad_norm": 3.7699217796325684, + "learning_rate": 2.132936507936508e-05, + "loss": 0.2838, + "step": 1455 + }, + { + "epoch": 5.75494071146245, + "grad_norm": 3.8559303283691406, + "learning_rate": 2.130952380952381e-05, + "loss": 0.5692, + "step": 1456 + }, + { + "epoch": 5.758893280632411, + "grad_norm": 5.278477191925049, + "learning_rate": 2.128968253968254e-05, + "loss": 0.5161, + "step": 1457 + }, + { + "epoch": 5.762845849802371, + "grad_norm": 4.712038993835449, + "learning_rate": 2.126984126984127e-05, + "loss": 0.473, + "step": 1458 + }, + { + "epoch": 5.766798418972332, + "grad_norm": 3.714198112487793, + "learning_rate": 2.125e-05, + "loss": 0.2426, + "step": 1459 + }, + { + "epoch": 5.770750988142293, + "grad_norm": 3.246757745742798, + "learning_rate": 2.1230158730158732e-05, + "loss": 0.4447, + "step": 1460 + }, + { + "epoch": 5.774703557312253, + "grad_norm": 4.785327434539795, + "learning_rate": 2.1210317460317462e-05, + "loss": 0.3517, + "step": 1461 + }, + { + "epoch": 5.778656126482213, + "grad_norm": 4.272199630737305, + "learning_rate": 2.1190476190476193e-05, + "loss": 0.329, + "step": 1462 + }, + { + "epoch": 5.782608695652174, + "grad_norm": 4.346473217010498, + "learning_rate": 2.1170634920634923e-05, + "loss": 0.4102, + "step": 1463 + }, + { + "epoch": 5.786561264822135, + "grad_norm": 3.4643688201904297, + "learning_rate": 2.115079365079365e-05, + "loss": 0.2851, + "step": 1464 + }, + { + "epoch": 5.790513833992095, + "grad_norm": 4.473227024078369, + "learning_rate": 2.113095238095238e-05, + "loss": 0.3341, + "step": 1465 + }, + { + "epoch": 5.794466403162056, + "grad_norm": 3.813743829727173, + "learning_rate": 2.111111111111111e-05, + "loss": 0.4491, + "step": 1466 + }, + { + "epoch": 5.798418972332016, + "grad_norm": 4.01721715927124, + "learning_rate": 2.109126984126984e-05, + "loss": 0.1942, + "step": 1467 + }, + { + "epoch": 5.8023715415019765, + "grad_norm": 3.757263422012329, + "learning_rate": 2.107142857142857e-05, + "loss": 0.3666, + "step": 1468 + }, + { + "epoch": 5.8063241106719365, + "grad_norm": 3.4394917488098145, + "learning_rate": 2.1051587301587302e-05, + "loss": 0.3211, + "step": 1469 + }, + { + "epoch": 5.810276679841897, + "grad_norm": 5.106522083282471, + "learning_rate": 2.1031746031746032e-05, + "loss": 0.3757, + "step": 1470 + }, + { + "epoch": 5.8142292490118574, + "grad_norm": 4.171218395233154, + "learning_rate": 2.1011904761904763e-05, + "loss": 0.4219, + "step": 1471 + }, + { + "epoch": 5.818181818181818, + "grad_norm": 4.083078384399414, + "learning_rate": 2.0992063492063493e-05, + "loss": 0.3451, + "step": 1472 + }, + { + "epoch": 5.822134387351778, + "grad_norm": 4.33717679977417, + "learning_rate": 2.0972222222222223e-05, + "loss": 0.5292, + "step": 1473 + }, + { + "epoch": 5.826086956521739, + "grad_norm": 4.866550445556641, + "learning_rate": 2.0952380952380954e-05, + "loss": 0.3334, + "step": 1474 + }, + { + "epoch": 5.830039525691699, + "grad_norm": 3.197094202041626, + "learning_rate": 2.0932539682539684e-05, + "loss": 0.2206, + "step": 1475 + }, + { + "epoch": 5.83399209486166, + "grad_norm": 3.3910369873046875, + "learning_rate": 2.0912698412698415e-05, + "loss": 0.1893, + "step": 1476 + }, + { + "epoch": 5.837944664031621, + "grad_norm": 4.107388496398926, + "learning_rate": 2.0892857142857145e-05, + "loss": 0.4357, + "step": 1477 + }, + { + "epoch": 5.841897233201581, + "grad_norm": 2.6555662155151367, + "learning_rate": 2.0873015873015875e-05, + "loss": 0.2589, + "step": 1478 + }, + { + "epoch": 5.845849802371541, + "grad_norm": 3.3115289211273193, + "learning_rate": 2.0853174603174606e-05, + "loss": 0.3587, + "step": 1479 + }, + { + "epoch": 5.849802371541502, + "grad_norm": 3.604375123977661, + "learning_rate": 2.0833333333333336e-05, + "loss": 0.273, + "step": 1480 + }, + { + "epoch": 5.853754940711463, + "grad_norm": 4.312830448150635, + "learning_rate": 2.0813492063492066e-05, + "loss": 0.2748, + "step": 1481 + }, + { + "epoch": 5.857707509881423, + "grad_norm": 4.150437831878662, + "learning_rate": 2.0793650793650797e-05, + "loss": 0.3099, + "step": 1482 + }, + { + "epoch": 5.861660079051384, + "grad_norm": 4.916041851043701, + "learning_rate": 2.0773809523809527e-05, + "loss": 0.3229, + "step": 1483 + }, + { + "epoch": 5.865612648221344, + "grad_norm": 4.0450663566589355, + "learning_rate": 2.0753968253968258e-05, + "loss": 0.3658, + "step": 1484 + }, + { + "epoch": 5.869565217391305, + "grad_norm": 3.5350306034088135, + "learning_rate": 2.0734126984126988e-05, + "loss": 0.3391, + "step": 1485 + }, + { + "epoch": 5.873517786561265, + "grad_norm": 3.364154100418091, + "learning_rate": 2.0714285714285718e-05, + "loss": 0.1964, + "step": 1486 + }, + { + "epoch": 5.877470355731226, + "grad_norm": 5.209838390350342, + "learning_rate": 2.0694444444444445e-05, + "loss": 0.3585, + "step": 1487 + }, + { + "epoch": 5.881422924901186, + "grad_norm": 3.9065868854522705, + "learning_rate": 2.0674603174603176e-05, + "loss": 0.2676, + "step": 1488 + }, + { + "epoch": 5.8853754940711465, + "grad_norm": 4.029334545135498, + "learning_rate": 2.0654761904761906e-05, + "loss": 0.2837, + "step": 1489 + }, + { + "epoch": 5.8893280632411065, + "grad_norm": 4.0023112297058105, + "learning_rate": 2.0634920634920636e-05, + "loss": 0.4846, + "step": 1490 + }, + { + "epoch": 5.893280632411067, + "grad_norm": 4.869349479675293, + "learning_rate": 2.0615079365079363e-05, + "loss": 0.3852, + "step": 1491 + }, + { + "epoch": 5.897233201581027, + "grad_norm": 4.35573148727417, + "learning_rate": 2.0595238095238094e-05, + "loss": 0.2365, + "step": 1492 + }, + { + "epoch": 5.901185770750988, + "grad_norm": 4.35988187789917, + "learning_rate": 2.0575396825396824e-05, + "loss": 0.6209, + "step": 1493 + }, + { + "epoch": 5.905138339920948, + "grad_norm": 3.8130955696105957, + "learning_rate": 2.0555555555555555e-05, + "loss": 0.3111, + "step": 1494 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 3.3116374015808105, + "learning_rate": 2.0535714285714285e-05, + "loss": 0.261, + "step": 1495 + }, + { + "epoch": 5.913043478260869, + "grad_norm": 4.634829521179199, + "learning_rate": 2.0515873015873015e-05, + "loss": 0.3943, + "step": 1496 + }, + { + "epoch": 5.91699604743083, + "grad_norm": 3.972263813018799, + "learning_rate": 2.0496031746031746e-05, + "loss": 0.3012, + "step": 1497 + }, + { + "epoch": 5.920948616600791, + "grad_norm": 4.382541656494141, + "learning_rate": 2.0476190476190476e-05, + "loss": 0.2957, + "step": 1498 + }, + { + "epoch": 5.924901185770751, + "grad_norm": 4.666507720947266, + "learning_rate": 2.0456349206349206e-05, + "loss": 0.4242, + "step": 1499 + }, + { + "epoch": 5.928853754940711, + "grad_norm": 3.7338948249816895, + "learning_rate": 2.0436507936507937e-05, + "loss": 0.2565, + "step": 1500 + }, + { + "epoch": 5.932806324110672, + "grad_norm": 3.5870771408081055, + "learning_rate": 2.0416666666666667e-05, + "loss": 0.4244, + "step": 1501 + }, + { + "epoch": 5.936758893280633, + "grad_norm": 4.410994052886963, + "learning_rate": 2.0396825396825398e-05, + "loss": 0.3985, + "step": 1502 + }, + { + "epoch": 5.940711462450593, + "grad_norm": 5.01856803894043, + "learning_rate": 2.0376984126984128e-05, + "loss": 0.354, + "step": 1503 + }, + { + "epoch": 5.944664031620554, + "grad_norm": 3.5459818840026855, + "learning_rate": 2.0357142857142858e-05, + "loss": 0.3448, + "step": 1504 + }, + { + "epoch": 5.948616600790514, + "grad_norm": 4.243325233459473, + "learning_rate": 2.033730158730159e-05, + "loss": 0.4499, + "step": 1505 + }, + { + "epoch": 5.952569169960475, + "grad_norm": 4.445835113525391, + "learning_rate": 2.031746031746032e-05, + "loss": 0.4184, + "step": 1506 + }, + { + "epoch": 5.956521739130435, + "grad_norm": 4.013076305389404, + "learning_rate": 2.029761904761905e-05, + "loss": 0.3721, + "step": 1507 + }, + { + "epoch": 5.9604743083003955, + "grad_norm": 4.0771284103393555, + "learning_rate": 2.027777777777778e-05, + "loss": 0.414, + "step": 1508 + }, + { + "epoch": 5.9644268774703555, + "grad_norm": 4.353712558746338, + "learning_rate": 2.025793650793651e-05, + "loss": 0.4212, + "step": 1509 + }, + { + "epoch": 5.968379446640316, + "grad_norm": 3.816298007965088, + "learning_rate": 2.023809523809524e-05, + "loss": 0.2885, + "step": 1510 + }, + { + "epoch": 5.972332015810276, + "grad_norm": 5.043026447296143, + "learning_rate": 2.021825396825397e-05, + "loss": 0.4901, + "step": 1511 + }, + { + "epoch": 5.976284584980237, + "grad_norm": 3.432596206665039, + "learning_rate": 2.01984126984127e-05, + "loss": 0.4023, + "step": 1512 + }, + { + "epoch": 5.980237154150197, + "grad_norm": 5.160770893096924, + "learning_rate": 2.017857142857143e-05, + "loss": 0.435, + "step": 1513 + }, + { + "epoch": 5.984189723320158, + "grad_norm": 3.863649845123291, + "learning_rate": 2.015873015873016e-05, + "loss": 0.2929, + "step": 1514 + }, + { + "epoch": 5.988142292490118, + "grad_norm": 3.960110902786255, + "learning_rate": 2.013888888888889e-05, + "loss": 0.3358, + "step": 1515 + }, + { + "epoch": 5.992094861660079, + "grad_norm": 3.597496747970581, + "learning_rate": 2.011904761904762e-05, + "loss": 0.3232, + "step": 1516 + }, + { + "epoch": 5.996047430830039, + "grad_norm": 3.676575183868408, + "learning_rate": 2.009920634920635e-05, + "loss": 0.3356, + "step": 1517 + }, + { + "epoch": 6.0, + "grad_norm": 4.146391868591309, + "learning_rate": 2.007936507936508e-05, + "loss": 0.3818, + "step": 1518 + }, + { + "epoch": 6.003952569169961, + "grad_norm": 1.9677680730819702, + "learning_rate": 2.005952380952381e-05, + "loss": 0.1328, + "step": 1519 + }, + { + "epoch": 6.007905138339921, + "grad_norm": 3.2990286350250244, + "learning_rate": 2.003968253968254e-05, + "loss": 0.1407, + "step": 1520 + }, + { + "epoch": 6.011857707509882, + "grad_norm": 3.0897316932678223, + "learning_rate": 2.001984126984127e-05, + "loss": 0.23, + "step": 1521 + }, + { + "epoch": 6.015810276679842, + "grad_norm": 3.0987026691436768, + "learning_rate": 2e-05, + "loss": 0.2357, + "step": 1522 + }, + { + "epoch": 6.019762845849803, + "grad_norm": 2.3966803550720215, + "learning_rate": 1.9980158730158732e-05, + "loss": 0.1346, + "step": 1523 + }, + { + "epoch": 6.023715415019763, + "grad_norm": 2.7668561935424805, + "learning_rate": 1.9960317460317462e-05, + "loss": 0.1102, + "step": 1524 + }, + { + "epoch": 6.027667984189724, + "grad_norm": 3.623638868331909, + "learning_rate": 1.9940476190476193e-05, + "loss": 0.1318, + "step": 1525 + }, + { + "epoch": 6.031620553359684, + "grad_norm": 3.124091148376465, + "learning_rate": 1.992063492063492e-05, + "loss": 0.2056, + "step": 1526 + }, + { + "epoch": 6.0355731225296445, + "grad_norm": 3.1599087715148926, + "learning_rate": 1.990079365079365e-05, + "loss": 0.1375, + "step": 1527 + }, + { + "epoch": 6.0395256916996045, + "grad_norm": 3.4084415435791016, + "learning_rate": 1.988095238095238e-05, + "loss": 0.1443, + "step": 1528 + }, + { + "epoch": 6.043478260869565, + "grad_norm": 3.607503652572632, + "learning_rate": 1.986111111111111e-05, + "loss": 0.1846, + "step": 1529 + }, + { + "epoch": 6.047430830039525, + "grad_norm": 3.6665408611297607, + "learning_rate": 1.984126984126984e-05, + "loss": 0.1801, + "step": 1530 + }, + { + "epoch": 6.051383399209486, + "grad_norm": 3.5593857765197754, + "learning_rate": 1.982142857142857e-05, + "loss": 0.2119, + "step": 1531 + }, + { + "epoch": 6.055335968379446, + "grad_norm": 2.55330228805542, + "learning_rate": 1.9801587301587302e-05, + "loss": 0.119, + "step": 1532 + }, + { + "epoch": 6.059288537549407, + "grad_norm": 4.402235984802246, + "learning_rate": 1.9781746031746032e-05, + "loss": 0.2068, + "step": 1533 + }, + { + "epoch": 6.063241106719367, + "grad_norm": 2.9708304405212402, + "learning_rate": 1.9761904761904763e-05, + "loss": 0.1736, + "step": 1534 + }, + { + "epoch": 6.067193675889328, + "grad_norm": 4.265127182006836, + "learning_rate": 1.9742063492063493e-05, + "loss": 0.2059, + "step": 1535 + }, + { + "epoch": 6.071146245059288, + "grad_norm": 3.346165895462036, + "learning_rate": 1.9722222222222224e-05, + "loss": 0.2927, + "step": 1536 + }, + { + "epoch": 6.075098814229249, + "grad_norm": 3.0124001502990723, + "learning_rate": 1.9702380952380954e-05, + "loss": 0.1643, + "step": 1537 + }, + { + "epoch": 6.07905138339921, + "grad_norm": 3.65307879447937, + "learning_rate": 1.9682539682539684e-05, + "loss": 0.1432, + "step": 1538 + }, + { + "epoch": 6.08300395256917, + "grad_norm": 3.4866161346435547, + "learning_rate": 1.9662698412698415e-05, + "loss": 0.1515, + "step": 1539 + }, + { + "epoch": 6.086956521739131, + "grad_norm": 3.2024996280670166, + "learning_rate": 1.9642857142857145e-05, + "loss": 0.1209, + "step": 1540 + }, + { + "epoch": 6.090909090909091, + "grad_norm": 3.161350965499878, + "learning_rate": 1.9623015873015872e-05, + "loss": 0.1511, + "step": 1541 + }, + { + "epoch": 6.094861660079052, + "grad_norm": 2.771225929260254, + "learning_rate": 1.9603174603174602e-05, + "loss": 0.1782, + "step": 1542 + }, + { + "epoch": 6.098814229249012, + "grad_norm": 2.3539700508117676, + "learning_rate": 1.9583333333333333e-05, + "loss": 0.1533, + "step": 1543 + }, + { + "epoch": 6.102766798418973, + "grad_norm": 3.1462249755859375, + "learning_rate": 1.9563492063492063e-05, + "loss": 0.1733, + "step": 1544 + }, + { + "epoch": 6.106719367588933, + "grad_norm": 3.8993260860443115, + "learning_rate": 1.9543650793650793e-05, + "loss": 0.2576, + "step": 1545 + }, + { + "epoch": 6.1106719367588935, + "grad_norm": 4.060632228851318, + "learning_rate": 1.9523809523809524e-05, + "loss": 0.2333, + "step": 1546 + }, + { + "epoch": 6.1146245059288535, + "grad_norm": 3.7323641777038574, + "learning_rate": 1.9503968253968254e-05, + "loss": 0.256, + "step": 1547 + }, + { + "epoch": 6.118577075098814, + "grad_norm": 2.8360302448272705, + "learning_rate": 1.9484126984126985e-05, + "loss": 0.1162, + "step": 1548 + }, + { + "epoch": 6.122529644268774, + "grad_norm": 4.867652416229248, + "learning_rate": 1.9464285714285715e-05, + "loss": 0.2682, + "step": 1549 + }, + { + "epoch": 6.126482213438735, + "grad_norm": 3.90091872215271, + "learning_rate": 1.9444444444444445e-05, + "loss": 0.2084, + "step": 1550 + }, + { + "epoch": 6.130434782608695, + "grad_norm": 3.1003096103668213, + "learning_rate": 1.9424603174603176e-05, + "loss": 0.1783, + "step": 1551 + }, + { + "epoch": 6.134387351778656, + "grad_norm": 4.112887859344482, + "learning_rate": 1.9404761904761906e-05, + "loss": 0.2177, + "step": 1552 + }, + { + "epoch": 6.138339920948616, + "grad_norm": 2.7363293170928955, + "learning_rate": 1.9384920634920637e-05, + "loss": 0.1403, + "step": 1553 + }, + { + "epoch": 6.142292490118577, + "grad_norm": 3.061474561691284, + "learning_rate": 1.9365079365079367e-05, + "loss": 0.1204, + "step": 1554 + }, + { + "epoch": 6.146245059288537, + "grad_norm": 3.5960800647735596, + "learning_rate": 1.9345238095238097e-05, + "loss": 0.1865, + "step": 1555 + }, + { + "epoch": 6.150197628458498, + "grad_norm": 3.1782121658325195, + "learning_rate": 1.9325396825396828e-05, + "loss": 0.1481, + "step": 1556 + }, + { + "epoch": 6.154150197628459, + "grad_norm": 3.249864339828491, + "learning_rate": 1.9305555555555558e-05, + "loss": 0.2116, + "step": 1557 + }, + { + "epoch": 6.158102766798419, + "grad_norm": 4.330471515655518, + "learning_rate": 1.928571428571429e-05, + "loss": 0.158, + "step": 1558 + }, + { + "epoch": 6.16205533596838, + "grad_norm": 4.093515872955322, + "learning_rate": 1.926587301587302e-05, + "loss": 0.1871, + "step": 1559 + }, + { + "epoch": 6.16600790513834, + "grad_norm": 3.096252918243408, + "learning_rate": 1.924603174603175e-05, + "loss": 0.1601, + "step": 1560 + }, + { + "epoch": 6.169960474308301, + "grad_norm": 3.766211748123169, + "learning_rate": 1.922619047619048e-05, + "loss": 0.1666, + "step": 1561 + }, + { + "epoch": 6.173913043478261, + "grad_norm": 3.036038637161255, + "learning_rate": 1.920634920634921e-05, + "loss": 0.1687, + "step": 1562 + }, + { + "epoch": 6.177865612648222, + "grad_norm": 3.8249239921569824, + "learning_rate": 1.918650793650794e-05, + "loss": 0.1887, + "step": 1563 + }, + { + "epoch": 6.181818181818182, + "grad_norm": 3.461127281188965, + "learning_rate": 1.9166666666666667e-05, + "loss": 0.1666, + "step": 1564 + }, + { + "epoch": 6.1857707509881426, + "grad_norm": 3.262033462524414, + "learning_rate": 1.9146825396825398e-05, + "loss": 0.1546, + "step": 1565 + }, + { + "epoch": 6.189723320158103, + "grad_norm": 3.1641769409179688, + "learning_rate": 1.9126984126984128e-05, + "loss": 0.1628, + "step": 1566 + }, + { + "epoch": 6.1936758893280635, + "grad_norm": 2.8139045238494873, + "learning_rate": 1.910714285714286e-05, + "loss": 0.1658, + "step": 1567 + }, + { + "epoch": 6.1976284584980235, + "grad_norm": 4.408447265625, + "learning_rate": 1.9087301587301585e-05, + "loss": 0.2102, + "step": 1568 + }, + { + "epoch": 6.201581027667984, + "grad_norm": 2.9684667587280273, + "learning_rate": 1.9067460317460316e-05, + "loss": 0.2325, + "step": 1569 + }, + { + "epoch": 6.205533596837944, + "grad_norm": 2.9627156257629395, + "learning_rate": 1.9047619047619046e-05, + "loss": 0.1779, + "step": 1570 + }, + { + "epoch": 6.209486166007905, + "grad_norm": 3.4896349906921387, + "learning_rate": 1.9027777777777776e-05, + "loss": 0.1404, + "step": 1571 + }, + { + "epoch": 6.213438735177865, + "grad_norm": 2.7135539054870605, + "learning_rate": 1.9007936507936507e-05, + "loss": 0.1057, + "step": 1572 + }, + { + "epoch": 6.217391304347826, + "grad_norm": 3.230480909347534, + "learning_rate": 1.8988095238095237e-05, + "loss": 0.1485, + "step": 1573 + }, + { + "epoch": 6.221343873517786, + "grad_norm": 2.9966936111450195, + "learning_rate": 1.8968253968253968e-05, + "loss": 0.1521, + "step": 1574 + }, + { + "epoch": 6.225296442687747, + "grad_norm": 2.451078414916992, + "learning_rate": 1.8948412698412698e-05, + "loss": 0.1289, + "step": 1575 + }, + { + "epoch": 6.229249011857707, + "grad_norm": 3.682922124862671, + "learning_rate": 1.892857142857143e-05, + "loss": 0.1696, + "step": 1576 + }, + { + "epoch": 6.233201581027668, + "grad_norm": 4.325542449951172, + "learning_rate": 1.890873015873016e-05, + "loss": 0.2367, + "step": 1577 + }, + { + "epoch": 6.237154150197629, + "grad_norm": 3.799766778945923, + "learning_rate": 1.888888888888889e-05, + "loss": 0.1636, + "step": 1578 + }, + { + "epoch": 6.241106719367589, + "grad_norm": 2.6366379261016846, + "learning_rate": 1.886904761904762e-05, + "loss": 0.126, + "step": 1579 + }, + { + "epoch": 6.24505928853755, + "grad_norm": 3.575343608856201, + "learning_rate": 1.884920634920635e-05, + "loss": 0.1621, + "step": 1580 + }, + { + "epoch": 6.24901185770751, + "grad_norm": 4.663724422454834, + "learning_rate": 1.882936507936508e-05, + "loss": 0.2898, + "step": 1581 + }, + { + "epoch": 6.252964426877471, + "grad_norm": 3.3895034790039062, + "learning_rate": 1.880952380952381e-05, + "loss": 0.2493, + "step": 1582 + }, + { + "epoch": 6.256916996047431, + "grad_norm": 4.064142227172852, + "learning_rate": 1.878968253968254e-05, + "loss": 0.2136, + "step": 1583 + }, + { + "epoch": 6.260869565217392, + "grad_norm": 4.431739330291748, + "learning_rate": 1.876984126984127e-05, + "loss": 0.168, + "step": 1584 + }, + { + "epoch": 6.264822134387352, + "grad_norm": 3.109384536743164, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.1606, + "step": 1585 + }, + { + "epoch": 6.2687747035573125, + "grad_norm": 3.7452895641326904, + "learning_rate": 1.8730158730158732e-05, + "loss": 0.2287, + "step": 1586 + }, + { + "epoch": 6.2727272727272725, + "grad_norm": 3.320481777191162, + "learning_rate": 1.8710317460317462e-05, + "loss": 0.1221, + "step": 1587 + }, + { + "epoch": 6.276679841897233, + "grad_norm": 3.2335190773010254, + "learning_rate": 1.8690476190476193e-05, + "loss": 0.1522, + "step": 1588 + }, + { + "epoch": 6.280632411067193, + "grad_norm": 2.431941509246826, + "learning_rate": 1.8670634920634923e-05, + "loss": 0.1787, + "step": 1589 + }, + { + "epoch": 6.284584980237154, + "grad_norm": 2.7418622970581055, + "learning_rate": 1.8650793650793654e-05, + "loss": 0.1862, + "step": 1590 + }, + { + "epoch": 6.288537549407114, + "grad_norm": 3.3152060508728027, + "learning_rate": 1.863095238095238e-05, + "loss": 0.1879, + "step": 1591 + }, + { + "epoch": 6.292490118577075, + "grad_norm": 2.946201801300049, + "learning_rate": 1.861111111111111e-05, + "loss": 0.1429, + "step": 1592 + }, + { + "epoch": 6.296442687747035, + "grad_norm": 4.4527587890625, + "learning_rate": 1.859126984126984e-05, + "loss": 0.1634, + "step": 1593 + }, + { + "epoch": 6.300395256916996, + "grad_norm": 2.763129711151123, + "learning_rate": 1.8571428571428572e-05, + "loss": 0.1338, + "step": 1594 + }, + { + "epoch": 6.304347826086957, + "grad_norm": 3.8917641639709473, + "learning_rate": 1.8551587301587302e-05, + "loss": 0.1526, + "step": 1595 + }, + { + "epoch": 6.308300395256917, + "grad_norm": 3.10577130317688, + "learning_rate": 1.8531746031746032e-05, + "loss": 0.1993, + "step": 1596 + }, + { + "epoch": 6.312252964426877, + "grad_norm": 4.823581695556641, + "learning_rate": 1.8511904761904763e-05, + "loss": 0.1888, + "step": 1597 + }, + { + "epoch": 6.316205533596838, + "grad_norm": 3.8852086067199707, + "learning_rate": 1.8492063492063493e-05, + "loss": 0.1369, + "step": 1598 + }, + { + "epoch": 6.320158102766799, + "grad_norm": 3.8083715438842773, + "learning_rate": 1.8472222222222224e-05, + "loss": 0.1808, + "step": 1599 + }, + { + "epoch": 6.324110671936759, + "grad_norm": 3.3586747646331787, + "learning_rate": 1.8452380952380954e-05, + "loss": 0.1479, + "step": 1600 + }, + { + "epoch": 6.32806324110672, + "grad_norm": 2.599363088607788, + "learning_rate": 1.8432539682539684e-05, + "loss": 0.1342, + "step": 1601 + }, + { + "epoch": 6.33201581027668, + "grad_norm": 3.6683595180511475, + "learning_rate": 1.8412698412698415e-05, + "loss": 0.1694, + "step": 1602 + }, + { + "epoch": 6.335968379446641, + "grad_norm": 2.9072253704071045, + "learning_rate": 1.8392857142857145e-05, + "loss": 0.1598, + "step": 1603 + }, + { + "epoch": 6.339920948616601, + "grad_norm": 4.043822288513184, + "learning_rate": 1.8373015873015875e-05, + "loss": 0.2375, + "step": 1604 + }, + { + "epoch": 6.3438735177865615, + "grad_norm": 4.241188049316406, + "learning_rate": 1.8353174603174602e-05, + "loss": 0.2391, + "step": 1605 + }, + { + "epoch": 6.3478260869565215, + "grad_norm": 3.922586441040039, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.2149, + "step": 1606 + }, + { + "epoch": 6.351778656126482, + "grad_norm": 4.914572715759277, + "learning_rate": 1.8313492063492063e-05, + "loss": 0.2859, + "step": 1607 + }, + { + "epoch": 6.355731225296442, + "grad_norm": 3.6631100177764893, + "learning_rate": 1.8293650793650794e-05, + "loss": 0.2134, + "step": 1608 + }, + { + "epoch": 6.359683794466403, + "grad_norm": 2.8764913082122803, + "learning_rate": 1.8273809523809524e-05, + "loss": 0.1453, + "step": 1609 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 2.821441173553467, + "learning_rate": 1.8253968253968254e-05, + "loss": 0.1457, + "step": 1610 + }, + { + "epoch": 6.367588932806324, + "grad_norm": 2.9946677684783936, + "learning_rate": 1.8234126984126985e-05, + "loss": 0.144, + "step": 1611 + }, + { + "epoch": 6.371541501976284, + "grad_norm": 2.6403133869171143, + "learning_rate": 1.8214285714285715e-05, + "loss": 0.1484, + "step": 1612 + }, + { + "epoch": 6.375494071146245, + "grad_norm": 2.8457889556884766, + "learning_rate": 1.8194444444444445e-05, + "loss": 0.1816, + "step": 1613 + }, + { + "epoch": 6.379446640316205, + "grad_norm": 2.93542742729187, + "learning_rate": 1.8174603174603176e-05, + "loss": 0.1347, + "step": 1614 + }, + { + "epoch": 6.383399209486166, + "grad_norm": 2.890878915786743, + "learning_rate": 1.8154761904761906e-05, + "loss": 0.1245, + "step": 1615 + }, + { + "epoch": 6.387351778656127, + "grad_norm": 2.7771224975585938, + "learning_rate": 1.8134920634920637e-05, + "loss": 0.1573, + "step": 1616 + }, + { + "epoch": 6.391304347826087, + "grad_norm": 3.317066192626953, + "learning_rate": 1.8115079365079367e-05, + "loss": 0.1826, + "step": 1617 + }, + { + "epoch": 6.395256916996048, + "grad_norm": 3.2389166355133057, + "learning_rate": 1.8095238095238094e-05, + "loss": 0.1539, + "step": 1618 + }, + { + "epoch": 6.399209486166008, + "grad_norm": 3.585541009902954, + "learning_rate": 1.8075396825396824e-05, + "loss": 0.1731, + "step": 1619 + }, + { + "epoch": 6.403162055335969, + "grad_norm": 3.805875778198242, + "learning_rate": 1.8055555555555555e-05, + "loss": 0.2165, + "step": 1620 + }, + { + "epoch": 6.407114624505929, + "grad_norm": 4.0997090339660645, + "learning_rate": 1.8035714285714285e-05, + "loss": 0.178, + "step": 1621 + }, + { + "epoch": 6.41106719367589, + "grad_norm": 3.204512596130371, + "learning_rate": 1.8015873015873015e-05, + "loss": 0.1188, + "step": 1622 + }, + { + "epoch": 6.41501976284585, + "grad_norm": 3.9086551666259766, + "learning_rate": 1.7996031746031746e-05, + "loss": 0.1424, + "step": 1623 + }, + { + "epoch": 6.4189723320158105, + "grad_norm": 3.208350658416748, + "learning_rate": 1.7976190476190476e-05, + "loss": 0.1119, + "step": 1624 + }, + { + "epoch": 6.4229249011857705, + "grad_norm": 4.048145771026611, + "learning_rate": 1.7956349206349207e-05, + "loss": 0.2034, + "step": 1625 + }, + { + "epoch": 6.426877470355731, + "grad_norm": 3.807363986968994, + "learning_rate": 1.7936507936507937e-05, + "loss": 0.1894, + "step": 1626 + }, + { + "epoch": 6.430830039525691, + "grad_norm": 3.468174457550049, + "learning_rate": 1.7916666666666667e-05, + "loss": 0.1755, + "step": 1627 + }, + { + "epoch": 6.434782608695652, + "grad_norm": 3.572981595993042, + "learning_rate": 1.7896825396825398e-05, + "loss": 0.2043, + "step": 1628 + }, + { + "epoch": 6.438735177865612, + "grad_norm": 4.217936992645264, + "learning_rate": 1.7876984126984128e-05, + "loss": 0.3312, + "step": 1629 + }, + { + "epoch": 6.442687747035573, + "grad_norm": 3.2552435398101807, + "learning_rate": 1.785714285714286e-05, + "loss": 0.1874, + "step": 1630 + }, + { + "epoch": 6.446640316205533, + "grad_norm": 3.2607510089874268, + "learning_rate": 1.783730158730159e-05, + "loss": 0.1648, + "step": 1631 + }, + { + "epoch": 6.450592885375494, + "grad_norm": 4.080394268035889, + "learning_rate": 1.781746031746032e-05, + "loss": 0.1779, + "step": 1632 + }, + { + "epoch": 6.454545454545454, + "grad_norm": 3.7420568466186523, + "learning_rate": 1.779761904761905e-05, + "loss": 0.161, + "step": 1633 + }, + { + "epoch": 6.458498023715415, + "grad_norm": 3.199740171432495, + "learning_rate": 1.777777777777778e-05, + "loss": 0.1783, + "step": 1634 + }, + { + "epoch": 6.462450592885375, + "grad_norm": 3.5963351726531982, + "learning_rate": 1.775793650793651e-05, + "loss": 0.1577, + "step": 1635 + }, + { + "epoch": 6.466403162055336, + "grad_norm": 2.5002620220184326, + "learning_rate": 1.773809523809524e-05, + "loss": 0.1206, + "step": 1636 + }, + { + "epoch": 6.470355731225297, + "grad_norm": 3.222224235534668, + "learning_rate": 1.771825396825397e-05, + "loss": 0.1532, + "step": 1637 + }, + { + "epoch": 6.474308300395257, + "grad_norm": 2.948617935180664, + "learning_rate": 1.76984126984127e-05, + "loss": 0.1807, + "step": 1638 + }, + { + "epoch": 6.478260869565218, + "grad_norm": 3.9954729080200195, + "learning_rate": 1.7678571428571432e-05, + "loss": 0.1606, + "step": 1639 + }, + { + "epoch": 6.482213438735178, + "grad_norm": 3.8689181804656982, + "learning_rate": 1.7658730158730162e-05, + "loss": 0.188, + "step": 1640 + }, + { + "epoch": 6.486166007905139, + "grad_norm": 2.888376235961914, + "learning_rate": 1.763888888888889e-05, + "loss": 0.1553, + "step": 1641 + }, + { + "epoch": 6.490118577075099, + "grad_norm": 3.6249587535858154, + "learning_rate": 1.761904761904762e-05, + "loss": 0.178, + "step": 1642 + }, + { + "epoch": 6.4940711462450595, + "grad_norm": 2.787454843521118, + "learning_rate": 1.759920634920635e-05, + "loss": 0.129, + "step": 1643 + }, + { + "epoch": 6.4980237154150196, + "grad_norm": 4.260427474975586, + "learning_rate": 1.757936507936508e-05, + "loss": 0.2073, + "step": 1644 + }, + { + "epoch": 6.5019762845849804, + "grad_norm": 4.095132827758789, + "learning_rate": 1.755952380952381e-05, + "loss": 0.1609, + "step": 1645 + }, + { + "epoch": 6.5059288537549405, + "grad_norm": 3.89748477935791, + "learning_rate": 1.7539682539682538e-05, + "loss": 0.2223, + "step": 1646 + }, + { + "epoch": 6.509881422924901, + "grad_norm": 3.197842597961426, + "learning_rate": 1.7519841269841268e-05, + "loss": 0.1804, + "step": 1647 + }, + { + "epoch": 6.513833992094861, + "grad_norm": 3.7351438999176025, + "learning_rate": 1.75e-05, + "loss": 0.1659, + "step": 1648 + }, + { + "epoch": 6.517786561264822, + "grad_norm": 3.417168378829956, + "learning_rate": 1.748015873015873e-05, + "loss": 0.1493, + "step": 1649 + }, + { + "epoch": 6.521739130434782, + "grad_norm": 2.904148578643799, + "learning_rate": 1.746031746031746e-05, + "loss": 0.1689, + "step": 1650 + }, + { + "epoch": 6.525691699604743, + "grad_norm": 3.141493558883667, + "learning_rate": 1.744047619047619e-05, + "loss": 0.1433, + "step": 1651 + }, + { + "epoch": 6.529644268774703, + "grad_norm": 3.594679832458496, + "learning_rate": 1.742063492063492e-05, + "loss": 0.2365, + "step": 1652 + }, + { + "epoch": 6.533596837944664, + "grad_norm": 3.7037243843078613, + "learning_rate": 1.740079365079365e-05, + "loss": 0.2317, + "step": 1653 + }, + { + "epoch": 6.537549407114625, + "grad_norm": 3.6561882495880127, + "learning_rate": 1.738095238095238e-05, + "loss": 0.1495, + "step": 1654 + }, + { + "epoch": 6.541501976284585, + "grad_norm": 3.278259754180908, + "learning_rate": 1.736111111111111e-05, + "loss": 0.1314, + "step": 1655 + }, + { + "epoch": 6.545454545454545, + "grad_norm": 3.1691975593566895, + "learning_rate": 1.734126984126984e-05, + "loss": 0.1566, + "step": 1656 + }, + { + "epoch": 6.549407114624506, + "grad_norm": 3.14813494682312, + "learning_rate": 1.7321428571428572e-05, + "loss": 0.1585, + "step": 1657 + }, + { + "epoch": 6.553359683794467, + "grad_norm": 3.2934324741363525, + "learning_rate": 1.7301587301587302e-05, + "loss": 0.1568, + "step": 1658 + }, + { + "epoch": 6.557312252964427, + "grad_norm": 4.18383264541626, + "learning_rate": 1.7281746031746033e-05, + "loss": 0.3083, + "step": 1659 + }, + { + "epoch": 6.561264822134388, + "grad_norm": 4.086765766143799, + "learning_rate": 1.7261904761904763e-05, + "loss": 0.1687, + "step": 1660 + }, + { + "epoch": 6.565217391304348, + "grad_norm": 3.70186185836792, + "learning_rate": 1.7242063492063493e-05, + "loss": 0.1741, + "step": 1661 + }, + { + "epoch": 6.569169960474309, + "grad_norm": 3.012298583984375, + "learning_rate": 1.7222222222222224e-05, + "loss": 0.1249, + "step": 1662 + }, + { + "epoch": 6.573122529644269, + "grad_norm": 5.418375492095947, + "learning_rate": 1.7202380952380954e-05, + "loss": 0.328, + "step": 1663 + }, + { + "epoch": 6.5770750988142295, + "grad_norm": 2.400665521621704, + "learning_rate": 1.7182539682539684e-05, + "loss": 0.1125, + "step": 1664 + }, + { + "epoch": 6.5810276679841895, + "grad_norm": 3.7247157096862793, + "learning_rate": 1.7162698412698415e-05, + "loss": 0.1577, + "step": 1665 + }, + { + "epoch": 6.58498023715415, + "grad_norm": 2.6763904094696045, + "learning_rate": 1.7142857142857145e-05, + "loss": 0.1269, + "step": 1666 + }, + { + "epoch": 6.58893280632411, + "grad_norm": 2.851741313934326, + "learning_rate": 1.7123015873015876e-05, + "loss": 0.141, + "step": 1667 + }, + { + "epoch": 6.592885375494071, + "grad_norm": 3.190314769744873, + "learning_rate": 1.7103174603174606e-05, + "loss": 0.2027, + "step": 1668 + }, + { + "epoch": 6.596837944664031, + "grad_norm": 3.6213324069976807, + "learning_rate": 1.7083333333333333e-05, + "loss": 0.2126, + "step": 1669 + }, + { + "epoch": 6.600790513833992, + "grad_norm": 3.6748242378234863, + "learning_rate": 1.7063492063492063e-05, + "loss": 0.1946, + "step": 1670 + }, + { + "epoch": 6.604743083003952, + "grad_norm": 2.7625160217285156, + "learning_rate": 1.7043650793650794e-05, + "loss": 0.1886, + "step": 1671 + }, + { + "epoch": 6.608695652173913, + "grad_norm": 3.2063212394714355, + "learning_rate": 1.7023809523809524e-05, + "loss": 0.1705, + "step": 1672 + }, + { + "epoch": 6.612648221343873, + "grad_norm": 3.1696741580963135, + "learning_rate": 1.7003968253968254e-05, + "loss": 0.1476, + "step": 1673 + }, + { + "epoch": 6.616600790513834, + "grad_norm": 2.514961004257202, + "learning_rate": 1.6984126984126985e-05, + "loss": 0.1324, + "step": 1674 + }, + { + "epoch": 6.620553359683795, + "grad_norm": 3.8006770610809326, + "learning_rate": 1.6964285714285715e-05, + "loss": 0.2031, + "step": 1675 + }, + { + "epoch": 6.624505928853755, + "grad_norm": 2.094867706298828, + "learning_rate": 1.6944444444444446e-05, + "loss": 0.0909, + "step": 1676 + }, + { + "epoch": 6.628458498023716, + "grad_norm": 5.729726314544678, + "learning_rate": 1.6924603174603176e-05, + "loss": 0.2006, + "step": 1677 + }, + { + "epoch": 6.632411067193676, + "grad_norm": 4.788626194000244, + "learning_rate": 1.6904761904761906e-05, + "loss": 0.188, + "step": 1678 + }, + { + "epoch": 6.636363636363637, + "grad_norm": 3.7394933700561523, + "learning_rate": 1.6884920634920637e-05, + "loss": 0.1474, + "step": 1679 + }, + { + "epoch": 6.640316205533597, + "grad_norm": 3.15619158744812, + "learning_rate": 1.6865079365079367e-05, + "loss": 0.1226, + "step": 1680 + }, + { + "epoch": 6.644268774703558, + "grad_norm": 3.1773312091827393, + "learning_rate": 1.6845238095238097e-05, + "loss": 0.1683, + "step": 1681 + }, + { + "epoch": 6.648221343873518, + "grad_norm": 3.101545572280884, + "learning_rate": 1.6825396825396828e-05, + "loss": 0.1786, + "step": 1682 + }, + { + "epoch": 6.6521739130434785, + "grad_norm": 2.9866223335266113, + "learning_rate": 1.6805555555555558e-05, + "loss": 0.1385, + "step": 1683 + }, + { + "epoch": 6.6561264822134385, + "grad_norm": 2.350311517715454, + "learning_rate": 1.6785714285714285e-05, + "loss": 0.1332, + "step": 1684 + }, + { + "epoch": 6.660079051383399, + "grad_norm": 3.1310527324676514, + "learning_rate": 1.6765873015873016e-05, + "loss": 0.1336, + "step": 1685 + }, + { + "epoch": 6.664031620553359, + "grad_norm": 3.0900039672851562, + "learning_rate": 1.6746031746031746e-05, + "loss": 0.1304, + "step": 1686 + }, + { + "epoch": 6.66798418972332, + "grad_norm": 3.2894325256347656, + "learning_rate": 1.6726190476190476e-05, + "loss": 0.1869, + "step": 1687 + }, + { + "epoch": 6.67193675889328, + "grad_norm": 3.1792759895324707, + "learning_rate": 1.6706349206349207e-05, + "loss": 0.145, + "step": 1688 + }, + { + "epoch": 6.675889328063241, + "grad_norm": 2.9144246578216553, + "learning_rate": 1.6686507936507937e-05, + "loss": 0.1617, + "step": 1689 + }, + { + "epoch": 6.679841897233201, + "grad_norm": 3.1634509563446045, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.159, + "step": 1690 + }, + { + "epoch": 6.683794466403162, + "grad_norm": 3.328819990158081, + "learning_rate": 1.6646825396825398e-05, + "loss": 0.2096, + "step": 1691 + }, + { + "epoch": 6.687747035573123, + "grad_norm": 3.085995674133301, + "learning_rate": 1.6626984126984128e-05, + "loss": 0.1649, + "step": 1692 + }, + { + "epoch": 6.691699604743083, + "grad_norm": 2.8405370712280273, + "learning_rate": 1.660714285714286e-05, + "loss": 0.1666, + "step": 1693 + }, + { + "epoch": 6.695652173913043, + "grad_norm": 3.161557912826538, + "learning_rate": 1.658730158730159e-05, + "loss": 0.1811, + "step": 1694 + }, + { + "epoch": 6.699604743083004, + "grad_norm": 4.0587158203125, + "learning_rate": 1.656746031746032e-05, + "loss": 0.2273, + "step": 1695 + }, + { + "epoch": 6.703557312252965, + "grad_norm": 3.1666932106018066, + "learning_rate": 1.6547619047619046e-05, + "loss": 0.2019, + "step": 1696 + }, + { + "epoch": 6.707509881422925, + "grad_norm": 4.08314847946167, + "learning_rate": 1.6527777777777777e-05, + "loss": 0.2561, + "step": 1697 + }, + { + "epoch": 6.711462450592886, + "grad_norm": 3.302401065826416, + "learning_rate": 1.6507936507936507e-05, + "loss": 0.1543, + "step": 1698 + }, + { + "epoch": 6.715415019762846, + "grad_norm": 3.8679957389831543, + "learning_rate": 1.6488095238095237e-05, + "loss": 0.2805, + "step": 1699 + }, + { + "epoch": 6.719367588932807, + "grad_norm": 2.699409008026123, + "learning_rate": 1.6468253968253968e-05, + "loss": 0.1732, + "step": 1700 + }, + { + "epoch": 6.723320158102767, + "grad_norm": 4.578848838806152, + "learning_rate": 1.6448412698412698e-05, + "loss": 0.1573, + "step": 1701 + }, + { + "epoch": 6.7272727272727275, + "grad_norm": 4.305120468139648, + "learning_rate": 1.642857142857143e-05, + "loss": 0.1459, + "step": 1702 + }, + { + "epoch": 6.7312252964426875, + "grad_norm": 2.981783628463745, + "learning_rate": 1.640873015873016e-05, + "loss": 0.1605, + "step": 1703 + }, + { + "epoch": 6.735177865612648, + "grad_norm": 3.4200551509857178, + "learning_rate": 1.638888888888889e-05, + "loss": 0.1136, + "step": 1704 + }, + { + "epoch": 6.739130434782608, + "grad_norm": 3.0042500495910645, + "learning_rate": 1.636904761904762e-05, + "loss": 0.1645, + "step": 1705 + }, + { + "epoch": 6.743083003952569, + "grad_norm": 5.0387043952941895, + "learning_rate": 1.634920634920635e-05, + "loss": 0.3057, + "step": 1706 + }, + { + "epoch": 6.747035573122529, + "grad_norm": 3.7466354370117188, + "learning_rate": 1.632936507936508e-05, + "loss": 0.1289, + "step": 1707 + }, + { + "epoch": 6.75098814229249, + "grad_norm": 2.57358980178833, + "learning_rate": 1.630952380952381e-05, + "loss": 0.1363, + "step": 1708 + }, + { + "epoch": 6.75494071146245, + "grad_norm": 3.13360333442688, + "learning_rate": 1.628968253968254e-05, + "loss": 0.1979, + "step": 1709 + }, + { + "epoch": 6.758893280632411, + "grad_norm": 3.5185632705688477, + "learning_rate": 1.626984126984127e-05, + "loss": 0.1732, + "step": 1710 + }, + { + "epoch": 6.762845849802371, + "grad_norm": 4.430140018463135, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.1609, + "step": 1711 + }, + { + "epoch": 6.766798418972332, + "grad_norm": 3.1439976692199707, + "learning_rate": 1.6230158730158732e-05, + "loss": 0.1765, + "step": 1712 + }, + { + "epoch": 6.770750988142293, + "grad_norm": 2.766545534133911, + "learning_rate": 1.6210317460317463e-05, + "loss": 0.173, + "step": 1713 + }, + { + "epoch": 6.774703557312253, + "grad_norm": 3.4417989253997803, + "learning_rate": 1.6190476190476193e-05, + "loss": 0.1748, + "step": 1714 + }, + { + "epoch": 6.778656126482213, + "grad_norm": 3.091768503189087, + "learning_rate": 1.6170634920634923e-05, + "loss": 0.1947, + "step": 1715 + }, + { + "epoch": 6.782608695652174, + "grad_norm": 2.7375543117523193, + "learning_rate": 1.6150793650793654e-05, + "loss": 0.1786, + "step": 1716 + }, + { + "epoch": 6.786561264822135, + "grad_norm": 3.463697671890259, + "learning_rate": 1.6130952380952384e-05, + "loss": 0.2595, + "step": 1717 + }, + { + "epoch": 6.790513833992095, + "grad_norm": 3.0348737239837646, + "learning_rate": 1.6111111111111115e-05, + "loss": 0.123, + "step": 1718 + }, + { + "epoch": 6.794466403162056, + "grad_norm": 3.646333694458008, + "learning_rate": 1.609126984126984e-05, + "loss": 0.1529, + "step": 1719 + }, + { + "epoch": 6.798418972332016, + "grad_norm": 4.054477691650391, + "learning_rate": 1.6071428571428572e-05, + "loss": 0.1475, + "step": 1720 + }, + { + "epoch": 6.8023715415019765, + "grad_norm": 3.619706869125366, + "learning_rate": 1.6051587301587302e-05, + "loss": 0.1682, + "step": 1721 + }, + { + "epoch": 6.8063241106719365, + "grad_norm": 3.0783016681671143, + "learning_rate": 1.6031746031746033e-05, + "loss": 0.1772, + "step": 1722 + }, + { + "epoch": 6.810276679841897, + "grad_norm": 2.9383182525634766, + "learning_rate": 1.601190476190476e-05, + "loss": 0.149, + "step": 1723 + }, + { + "epoch": 6.8142292490118574, + "grad_norm": 3.6608943939208984, + "learning_rate": 1.599206349206349e-05, + "loss": 0.152, + "step": 1724 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 3.396385908126831, + "learning_rate": 1.597222222222222e-05, + "loss": 0.1656, + "step": 1725 + }, + { + "epoch": 6.822134387351778, + "grad_norm": 2.923689126968384, + "learning_rate": 1.595238095238095e-05, + "loss": 0.1529, + "step": 1726 + }, + { + "epoch": 6.826086956521739, + "grad_norm": 4.30761194229126, + "learning_rate": 1.593253968253968e-05, + "loss": 0.3032, + "step": 1727 + }, + { + "epoch": 6.830039525691699, + "grad_norm": 3.8314170837402344, + "learning_rate": 1.591269841269841e-05, + "loss": 0.1874, + "step": 1728 + }, + { + "epoch": 6.83399209486166, + "grad_norm": 2.5451226234436035, + "learning_rate": 1.5892857142857142e-05, + "loss": 0.1155, + "step": 1729 + }, + { + "epoch": 6.837944664031621, + "grad_norm": 3.4286904335021973, + "learning_rate": 1.5873015873015872e-05, + "loss": 0.2253, + "step": 1730 + }, + { + "epoch": 6.841897233201581, + "grad_norm": 3.8259549140930176, + "learning_rate": 1.5853174603174603e-05, + "loss": 0.2408, + "step": 1731 + }, + { + "epoch": 6.845849802371541, + "grad_norm": 3.6046714782714844, + "learning_rate": 1.5833333333333333e-05, + "loss": 0.1758, + "step": 1732 + }, + { + "epoch": 6.849802371541502, + "grad_norm": 3.0990259647369385, + "learning_rate": 1.5813492063492063e-05, + "loss": 0.1622, + "step": 1733 + }, + { + "epoch": 6.853754940711463, + "grad_norm": 2.931340217590332, + "learning_rate": 1.5793650793650794e-05, + "loss": 0.1598, + "step": 1734 + }, + { + "epoch": 6.857707509881423, + "grad_norm": 2.4774787425994873, + "learning_rate": 1.5773809523809524e-05, + "loss": 0.141, + "step": 1735 + }, + { + "epoch": 6.861660079051384, + "grad_norm": 2.9939541816711426, + "learning_rate": 1.5753968253968255e-05, + "loss": 0.135, + "step": 1736 + }, + { + "epoch": 6.865612648221344, + "grad_norm": 2.929865598678589, + "learning_rate": 1.5734126984126985e-05, + "loss": 0.1947, + "step": 1737 + }, + { + "epoch": 6.869565217391305, + "grad_norm": 3.4848814010620117, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.1808, + "step": 1738 + }, + { + "epoch": 6.873517786561265, + "grad_norm": 2.8920042514801025, + "learning_rate": 1.5694444444444446e-05, + "loss": 0.102, + "step": 1739 + }, + { + "epoch": 6.877470355731226, + "grad_norm": 4.020946979522705, + "learning_rate": 1.5674603174603176e-05, + "loss": 0.1483, + "step": 1740 + }, + { + "epoch": 6.881422924901186, + "grad_norm": 3.165574073791504, + "learning_rate": 1.5654761904761906e-05, + "loss": 0.1437, + "step": 1741 + }, + { + "epoch": 6.8853754940711465, + "grad_norm": 3.595200538635254, + "learning_rate": 1.5634920634920637e-05, + "loss": 0.1561, + "step": 1742 + }, + { + "epoch": 6.8893280632411065, + "grad_norm": 2.952357530593872, + "learning_rate": 1.5615079365079367e-05, + "loss": 0.1786, + "step": 1743 + }, + { + "epoch": 6.893280632411067, + "grad_norm": 3.0202043056488037, + "learning_rate": 1.5595238095238098e-05, + "loss": 0.1493, + "step": 1744 + }, + { + "epoch": 6.897233201581027, + "grad_norm": 3.46177339553833, + "learning_rate": 1.5575396825396828e-05, + "loss": 0.1578, + "step": 1745 + }, + { + "epoch": 6.901185770750988, + "grad_norm": 4.415173053741455, + "learning_rate": 1.5555555555555555e-05, + "loss": 0.205, + "step": 1746 + }, + { + "epoch": 6.905138339920948, + "grad_norm": 3.0862183570861816, + "learning_rate": 1.5535714285714285e-05, + "loss": 0.1837, + "step": 1747 + }, + { + "epoch": 6.909090909090909, + "grad_norm": 3.788036584854126, + "learning_rate": 1.5515873015873016e-05, + "loss": 0.1423, + "step": 1748 + }, + { + "epoch": 6.913043478260869, + "grad_norm": 3.3133058547973633, + "learning_rate": 1.5496031746031746e-05, + "loss": 0.2186, + "step": 1749 + }, + { + "epoch": 6.91699604743083, + "grad_norm": 4.402420520782471, + "learning_rate": 1.5476190476190476e-05, + "loss": 0.1938, + "step": 1750 + }, + { + "epoch": 6.920948616600791, + "grad_norm": 3.363860607147217, + "learning_rate": 1.5456349206349207e-05, + "loss": 0.1936, + "step": 1751 + }, + { + "epoch": 6.924901185770751, + "grad_norm": 3.4065957069396973, + "learning_rate": 1.5436507936507937e-05, + "loss": 0.174, + "step": 1752 + }, + { + "epoch": 6.928853754940711, + "grad_norm": 2.393531322479248, + "learning_rate": 1.5416666666666668e-05, + "loss": 0.1511, + "step": 1753 + }, + { + "epoch": 6.932806324110672, + "grad_norm": 3.180001974105835, + "learning_rate": 1.5396825396825398e-05, + "loss": 0.1551, + "step": 1754 + }, + { + "epoch": 6.936758893280633, + "grad_norm": 2.9649102687835693, + "learning_rate": 1.537698412698413e-05, + "loss": 0.1588, + "step": 1755 + }, + { + "epoch": 6.940711462450593, + "grad_norm": 3.6014418601989746, + "learning_rate": 1.535714285714286e-05, + "loss": 0.3112, + "step": 1756 + }, + { + "epoch": 6.944664031620554, + "grad_norm": 3.6775028705596924, + "learning_rate": 1.533730158730159e-05, + "loss": 0.2638, + "step": 1757 + }, + { + "epoch": 6.948616600790514, + "grad_norm": 2.682131052017212, + "learning_rate": 1.531746031746032e-05, + "loss": 0.1377, + "step": 1758 + }, + { + "epoch": 6.952569169960475, + "grad_norm": 3.296804904937744, + "learning_rate": 1.529761904761905e-05, + "loss": 0.1325, + "step": 1759 + }, + { + "epoch": 6.956521739130435, + "grad_norm": 3.019824743270874, + "learning_rate": 1.527777777777778e-05, + "loss": 0.1358, + "step": 1760 + }, + { + "epoch": 6.9604743083003955, + "grad_norm": 3.4432945251464844, + "learning_rate": 1.525793650793651e-05, + "loss": 0.1674, + "step": 1761 + }, + { + "epoch": 6.9644268774703555, + "grad_norm": 4.434787273406982, + "learning_rate": 1.5238095238095241e-05, + "loss": 0.2198, + "step": 1762 + }, + { + "epoch": 6.968379446640316, + "grad_norm": 2.954920530319214, + "learning_rate": 1.5218253968253968e-05, + "loss": 0.1315, + "step": 1763 + }, + { + "epoch": 6.972332015810276, + "grad_norm": 3.6730740070343018, + "learning_rate": 1.5198412698412698e-05, + "loss": 0.1586, + "step": 1764 + }, + { + "epoch": 6.976284584980237, + "grad_norm": 4.036146640777588, + "learning_rate": 1.5178571428571429e-05, + "loss": 0.2249, + "step": 1765 + }, + { + "epoch": 6.980237154150197, + "grad_norm": 3.2338225841522217, + "learning_rate": 1.5158730158730159e-05, + "loss": 0.138, + "step": 1766 + }, + { + "epoch": 6.984189723320158, + "grad_norm": 3.692736864089966, + "learning_rate": 1.5138888888888888e-05, + "loss": 0.1937, + "step": 1767 + }, + { + "epoch": 6.988142292490118, + "grad_norm": 2.946046829223633, + "learning_rate": 1.5119047619047618e-05, + "loss": 0.1506, + "step": 1768 + }, + { + "epoch": 6.992094861660079, + "grad_norm": 4.145671367645264, + "learning_rate": 1.5099206349206349e-05, + "loss": 0.1591, + "step": 1769 + }, + { + "epoch": 6.996047430830039, + "grad_norm": 3.8622443675994873, + "learning_rate": 1.5079365079365079e-05, + "loss": 0.2398, + "step": 1770 + }, + { + "epoch": 7.0, + "grad_norm": 3.0714492797851562, + "learning_rate": 1.505952380952381e-05, + "loss": 0.1849, + "step": 1771 + }, + { + "epoch": 7.003952569169961, + "grad_norm": 1.683645248413086, + "learning_rate": 1.503968253968254e-05, + "loss": 0.0714, + "step": 1772 + }, + { + "epoch": 7.007905138339921, + "grad_norm": 2.0551791191101074, + "learning_rate": 1.501984126984127e-05, + "loss": 0.0825, + "step": 1773 + }, + { + "epoch": 7.011857707509882, + "grad_norm": 2.248523712158203, + "learning_rate": 1.5e-05, + "loss": 0.0954, + "step": 1774 + }, + { + "epoch": 7.015810276679842, + "grad_norm": 2.069105863571167, + "learning_rate": 1.498015873015873e-05, + "loss": 0.1108, + "step": 1775 + }, + { + "epoch": 7.019762845849803, + "grad_norm": 2.3756532669067383, + "learning_rate": 1.4960317460317461e-05, + "loss": 0.1123, + "step": 1776 + }, + { + "epoch": 7.023715415019763, + "grad_norm": 2.0308547019958496, + "learning_rate": 1.4940476190476192e-05, + "loss": 0.0715, + "step": 1777 + }, + { + "epoch": 7.027667984189724, + "grad_norm": 2.602980613708496, + "learning_rate": 1.4920634920634922e-05, + "loss": 0.1107, + "step": 1778 + }, + { + "epoch": 7.031620553359684, + "grad_norm": 1.8657187223434448, + "learning_rate": 1.490079365079365e-05, + "loss": 0.0681, + "step": 1779 + }, + { + "epoch": 7.0355731225296445, + "grad_norm": 1.9880305528640747, + "learning_rate": 1.4880952380952381e-05, + "loss": 0.0644, + "step": 1780 + }, + { + "epoch": 7.0395256916996045, + "grad_norm": 1.8279154300689697, + "learning_rate": 1.4861111111111111e-05, + "loss": 0.0662, + "step": 1781 + }, + { + "epoch": 7.043478260869565, + "grad_norm": 2.1131718158721924, + "learning_rate": 1.4841269841269842e-05, + "loss": 0.0787, + "step": 1782 + }, + { + "epoch": 7.047430830039525, + "grad_norm": 2.5316834449768066, + "learning_rate": 1.4821428571428572e-05, + "loss": 0.1016, + "step": 1783 + }, + { + "epoch": 7.051383399209486, + "grad_norm": 2.494107961654663, + "learning_rate": 1.4801587301587302e-05, + "loss": 0.0835, + "step": 1784 + }, + { + "epoch": 7.055335968379446, + "grad_norm": 2.633171796798706, + "learning_rate": 1.4781746031746033e-05, + "loss": 0.0724, + "step": 1785 + }, + { + "epoch": 7.059288537549407, + "grad_norm": 1.709030270576477, + "learning_rate": 1.4761904761904763e-05, + "loss": 0.0618, + "step": 1786 + }, + { + "epoch": 7.063241106719367, + "grad_norm": 2.1349966526031494, + "learning_rate": 1.4742063492063494e-05, + "loss": 0.0763, + "step": 1787 + }, + { + "epoch": 7.067193675889328, + "grad_norm": 3.602543354034424, + "learning_rate": 1.4722222222222224e-05, + "loss": 0.1131, + "step": 1788 + }, + { + "epoch": 7.071146245059288, + "grad_norm": 3.5014562606811523, + "learning_rate": 1.4702380952380954e-05, + "loss": 0.1475, + "step": 1789 + }, + { + "epoch": 7.075098814229249, + "grad_norm": 1.7112377882003784, + "learning_rate": 1.4682539682539683e-05, + "loss": 0.0655, + "step": 1790 + }, + { + "epoch": 7.07905138339921, + "grad_norm": 2.5485458374023438, + "learning_rate": 1.4662698412698413e-05, + "loss": 0.0878, + "step": 1791 + }, + { + "epoch": 7.08300395256917, + "grad_norm": 2.0620596408843994, + "learning_rate": 1.4642857142857144e-05, + "loss": 0.0801, + "step": 1792 + }, + { + "epoch": 7.086956521739131, + "grad_norm": 2.537457227706909, + "learning_rate": 1.4623015873015874e-05, + "loss": 0.0698, + "step": 1793 + }, + { + "epoch": 7.090909090909091, + "grad_norm": 1.8265228271484375, + "learning_rate": 1.4603174603174605e-05, + "loss": 0.0559, + "step": 1794 + }, + { + "epoch": 7.094861660079052, + "grad_norm": 1.819351315498352, + "learning_rate": 1.4583333333333335e-05, + "loss": 0.0665, + "step": 1795 + }, + { + "epoch": 7.098814229249012, + "grad_norm": 2.4396214485168457, + "learning_rate": 1.4563492063492065e-05, + "loss": 0.0786, + "step": 1796 + }, + { + "epoch": 7.102766798418973, + "grad_norm": 2.4278268814086914, + "learning_rate": 1.4543650793650796e-05, + "loss": 0.0957, + "step": 1797 + }, + { + "epoch": 7.106719367588933, + "grad_norm": 2.5539965629577637, + "learning_rate": 1.4523809523809526e-05, + "loss": 0.0876, + "step": 1798 + }, + { + "epoch": 7.1106719367588935, + "grad_norm": 2.3676304817199707, + "learning_rate": 1.4503968253968256e-05, + "loss": 0.0801, + "step": 1799 + }, + { + "epoch": 7.1146245059288535, + "grad_norm": 2.803798198699951, + "learning_rate": 1.4484126984126987e-05, + "loss": 0.1065, + "step": 1800 + }, + { + "epoch": 7.118577075098814, + "grad_norm": 2.0833263397216797, + "learning_rate": 1.4464285714285717e-05, + "loss": 0.0701, + "step": 1801 + }, + { + "epoch": 7.122529644268774, + "grad_norm": 1.9213995933532715, + "learning_rate": 1.4444444444444444e-05, + "loss": 0.0689, + "step": 1802 + }, + { + "epoch": 7.126482213438735, + "grad_norm": 2.999032974243164, + "learning_rate": 1.4424603174603174e-05, + "loss": 0.1407, + "step": 1803 + }, + { + "epoch": 7.130434782608695, + "grad_norm": 2.492380142211914, + "learning_rate": 1.4404761904761905e-05, + "loss": 0.0919, + "step": 1804 + }, + { + "epoch": 7.134387351778656, + "grad_norm": 2.170208692550659, + "learning_rate": 1.4384920634920635e-05, + "loss": 0.0862, + "step": 1805 + }, + { + "epoch": 7.138339920948616, + "grad_norm": 2.5370352268218994, + "learning_rate": 1.4365079365079364e-05, + "loss": 0.0899, + "step": 1806 + }, + { + "epoch": 7.142292490118577, + "grad_norm": 2.430377960205078, + "learning_rate": 1.4345238095238094e-05, + "loss": 0.1082, + "step": 1807 + }, + { + "epoch": 7.146245059288537, + "grad_norm": 3.0296268463134766, + "learning_rate": 1.4325396825396825e-05, + "loss": 0.1017, + "step": 1808 + }, + { + "epoch": 7.150197628458498, + "grad_norm": 2.192507743835449, + "learning_rate": 1.4305555555555555e-05, + "loss": 0.0969, + "step": 1809 + }, + { + "epoch": 7.154150197628459, + "grad_norm": 2.191904306411743, + "learning_rate": 1.4285714285714285e-05, + "loss": 0.0905, + "step": 1810 + }, + { + "epoch": 7.158102766798419, + "grad_norm": 1.9413102865219116, + "learning_rate": 1.4265873015873016e-05, + "loss": 0.0746, + "step": 1811 + }, + { + "epoch": 7.16205533596838, + "grad_norm": 2.1803011894226074, + "learning_rate": 1.4246031746031746e-05, + "loss": 0.0679, + "step": 1812 + }, + { + "epoch": 7.16600790513834, + "grad_norm": 1.9626377820968628, + "learning_rate": 1.4226190476190477e-05, + "loss": 0.0784, + "step": 1813 + }, + { + "epoch": 7.169960474308301, + "grad_norm": 3.6826534271240234, + "learning_rate": 1.4206349206349207e-05, + "loss": 0.1193, + "step": 1814 + }, + { + "epoch": 7.173913043478261, + "grad_norm": 2.018355131149292, + "learning_rate": 1.4186507936507937e-05, + "loss": 0.0614, + "step": 1815 + }, + { + "epoch": 7.177865612648222, + "grad_norm": 2.3391740322113037, + "learning_rate": 1.4166666666666668e-05, + "loss": 0.1052, + "step": 1816 + }, + { + "epoch": 7.181818181818182, + "grad_norm": 3.039984941482544, + "learning_rate": 1.4146825396825396e-05, + "loss": 0.1077, + "step": 1817 + }, + { + "epoch": 7.1857707509881426, + "grad_norm": 2.2779464721679688, + "learning_rate": 1.4126984126984127e-05, + "loss": 0.0654, + "step": 1818 + }, + { + "epoch": 7.189723320158103, + "grad_norm": 1.959446907043457, + "learning_rate": 1.4107142857142857e-05, + "loss": 0.0676, + "step": 1819 + }, + { + "epoch": 7.1936758893280635, + "grad_norm": 3.0260629653930664, + "learning_rate": 1.4087301587301587e-05, + "loss": 0.0924, + "step": 1820 + }, + { + "epoch": 7.1976284584980235, + "grad_norm": 1.552370548248291, + "learning_rate": 1.4067460317460318e-05, + "loss": 0.0557, + "step": 1821 + }, + { + "epoch": 7.201581027667984, + "grad_norm": 1.851386308670044, + "learning_rate": 1.4047619047619048e-05, + "loss": 0.0842, + "step": 1822 + }, + { + "epoch": 7.205533596837944, + "grad_norm": 1.901151180267334, + "learning_rate": 1.4027777777777779e-05, + "loss": 0.0602, + "step": 1823 + }, + { + "epoch": 7.209486166007905, + "grad_norm": 2.256410598754883, + "learning_rate": 1.4007936507936509e-05, + "loss": 0.0779, + "step": 1824 + }, + { + "epoch": 7.213438735177865, + "grad_norm": 2.2768027782440186, + "learning_rate": 1.398809523809524e-05, + "loss": 0.0701, + "step": 1825 + }, + { + "epoch": 7.217391304347826, + "grad_norm": 1.8319358825683594, + "learning_rate": 1.396825396825397e-05, + "loss": 0.0711, + "step": 1826 + }, + { + "epoch": 7.221343873517786, + "grad_norm": 1.6305453777313232, + "learning_rate": 1.39484126984127e-05, + "loss": 0.0601, + "step": 1827 + }, + { + "epoch": 7.225296442687747, + "grad_norm": 2.377932071685791, + "learning_rate": 1.392857142857143e-05, + "loss": 0.084, + "step": 1828 + }, + { + "epoch": 7.229249011857707, + "grad_norm": 2.3718206882476807, + "learning_rate": 1.390873015873016e-05, + "loss": 0.1073, + "step": 1829 + }, + { + "epoch": 7.233201581027668, + "grad_norm": 1.8948851823806763, + "learning_rate": 1.388888888888889e-05, + "loss": 0.066, + "step": 1830 + }, + { + "epoch": 7.237154150197629, + "grad_norm": 2.6554150581359863, + "learning_rate": 1.386904761904762e-05, + "loss": 0.1004, + "step": 1831 + }, + { + "epoch": 7.241106719367589, + "grad_norm": 1.7534379959106445, + "learning_rate": 1.384920634920635e-05, + "loss": 0.0686, + "step": 1832 + }, + { + "epoch": 7.24505928853755, + "grad_norm": 2.8044631481170654, + "learning_rate": 1.382936507936508e-05, + "loss": 0.0981, + "step": 1833 + }, + { + "epoch": 7.24901185770751, + "grad_norm": 1.962010145187378, + "learning_rate": 1.3809523809523811e-05, + "loss": 0.0606, + "step": 1834 + }, + { + "epoch": 7.252964426877471, + "grad_norm": 2.598278522491455, + "learning_rate": 1.3789682539682541e-05, + "loss": 0.0823, + "step": 1835 + }, + { + "epoch": 7.256916996047431, + "grad_norm": 2.0581893920898438, + "learning_rate": 1.3769841269841272e-05, + "loss": 0.0724, + "step": 1836 + }, + { + "epoch": 7.260869565217392, + "grad_norm": 2.1700425148010254, + "learning_rate": 1.3750000000000002e-05, + "loss": 0.0747, + "step": 1837 + }, + { + "epoch": 7.264822134387352, + "grad_norm": 2.3965978622436523, + "learning_rate": 1.3730158730158733e-05, + "loss": 0.0756, + "step": 1838 + }, + { + "epoch": 7.2687747035573125, + "grad_norm": 2.8531904220581055, + "learning_rate": 1.3710317460317463e-05, + "loss": 0.0915, + "step": 1839 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 1.8420287370681763, + "learning_rate": 1.3690476190476192e-05, + "loss": 0.0674, + "step": 1840 + }, + { + "epoch": 7.276679841897233, + "grad_norm": 2.040949583053589, + "learning_rate": 1.367063492063492e-05, + "loss": 0.0662, + "step": 1841 + }, + { + "epoch": 7.280632411067193, + "grad_norm": 4.134637832641602, + "learning_rate": 1.365079365079365e-05, + "loss": 0.1148, + "step": 1842 + }, + { + "epoch": 7.284584980237154, + "grad_norm": 2.1330273151397705, + "learning_rate": 1.3630952380952381e-05, + "loss": 0.0594, + "step": 1843 + }, + { + "epoch": 7.288537549407114, + "grad_norm": 2.3376622200012207, + "learning_rate": 1.3611111111111111e-05, + "loss": 0.1026, + "step": 1844 + }, + { + "epoch": 7.292490118577075, + "grad_norm": 2.1795151233673096, + "learning_rate": 1.359126984126984e-05, + "loss": 0.0685, + "step": 1845 + }, + { + "epoch": 7.296442687747035, + "grad_norm": 2.4371635913848877, + "learning_rate": 1.357142857142857e-05, + "loss": 0.0856, + "step": 1846 + }, + { + "epoch": 7.300395256916996, + "grad_norm": 1.8547295331954956, + "learning_rate": 1.3551587301587301e-05, + "loss": 0.0593, + "step": 1847 + }, + { + "epoch": 7.304347826086957, + "grad_norm": 2.3141019344329834, + "learning_rate": 1.3531746031746031e-05, + "loss": 0.0935, + "step": 1848 + }, + { + "epoch": 7.308300395256917, + "grad_norm": 1.9820470809936523, + "learning_rate": 1.3511904761904762e-05, + "loss": 0.069, + "step": 1849 + }, + { + "epoch": 7.312252964426877, + "grad_norm": 2.5655643939971924, + "learning_rate": 1.3492063492063492e-05, + "loss": 0.0899, + "step": 1850 + }, + { + "epoch": 7.316205533596838, + "grad_norm": 2.5492866039276123, + "learning_rate": 1.3472222222222222e-05, + "loss": 0.0874, + "step": 1851 + }, + { + "epoch": 7.320158102766799, + "grad_norm": 2.2358458042144775, + "learning_rate": 1.3452380952380953e-05, + "loss": 0.0769, + "step": 1852 + }, + { + "epoch": 7.324110671936759, + "grad_norm": 1.9372227191925049, + "learning_rate": 1.3432539682539683e-05, + "loss": 0.0729, + "step": 1853 + }, + { + "epoch": 7.32806324110672, + "grad_norm": 2.8513784408569336, + "learning_rate": 1.3412698412698413e-05, + "loss": 0.0988, + "step": 1854 + }, + { + "epoch": 7.33201581027668, + "grad_norm": 2.437175989151001, + "learning_rate": 1.3392857142857144e-05, + "loss": 0.0891, + "step": 1855 + }, + { + "epoch": 7.335968379446641, + "grad_norm": 1.8044086694717407, + "learning_rate": 1.3373015873015873e-05, + "loss": 0.0611, + "step": 1856 + }, + { + "epoch": 7.339920948616601, + "grad_norm": 1.8136906623840332, + "learning_rate": 1.3353174603174603e-05, + "loss": 0.0604, + "step": 1857 + }, + { + "epoch": 7.3438735177865615, + "grad_norm": 2.4834377765655518, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.126, + "step": 1858 + }, + { + "epoch": 7.3478260869565215, + "grad_norm": 2.810823678970337, + "learning_rate": 1.3313492063492064e-05, + "loss": 0.1165, + "step": 1859 + }, + { + "epoch": 7.351778656126482, + "grad_norm": 3.216146945953369, + "learning_rate": 1.3293650793650794e-05, + "loss": 0.0965, + "step": 1860 + }, + { + "epoch": 7.355731225296442, + "grad_norm": 2.910884141921997, + "learning_rate": 1.3273809523809524e-05, + "loss": 0.103, + "step": 1861 + }, + { + "epoch": 7.359683794466403, + "grad_norm": 1.8496427536010742, + "learning_rate": 1.3253968253968255e-05, + "loss": 0.0602, + "step": 1862 + }, + { + "epoch": 7.363636363636363, + "grad_norm": 2.382444381713867, + "learning_rate": 1.3234126984126985e-05, + "loss": 0.1, + "step": 1863 + }, + { + "epoch": 7.367588932806324, + "grad_norm": 2.162414789199829, + "learning_rate": 1.3214285714285716e-05, + "loss": 0.0704, + "step": 1864 + }, + { + "epoch": 7.371541501976284, + "grad_norm": 2.582324981689453, + "learning_rate": 1.3194444444444446e-05, + "loss": 0.0735, + "step": 1865 + }, + { + "epoch": 7.375494071146245, + "grad_norm": 2.0686068534851074, + "learning_rate": 1.3174603174603176e-05, + "loss": 0.0691, + "step": 1866 + }, + { + "epoch": 7.379446640316205, + "grad_norm": 2.73799467086792, + "learning_rate": 1.3154761904761907e-05, + "loss": 0.0977, + "step": 1867 + }, + { + "epoch": 7.383399209486166, + "grad_norm": 2.5663845539093018, + "learning_rate": 1.3134920634920635e-05, + "loss": 0.0964, + "step": 1868 + }, + { + "epoch": 7.387351778656127, + "grad_norm": 2.602886199951172, + "learning_rate": 1.3115079365079366e-05, + "loss": 0.0834, + "step": 1869 + }, + { + "epoch": 7.391304347826087, + "grad_norm": 2.9359493255615234, + "learning_rate": 1.3095238095238096e-05, + "loss": 0.1205, + "step": 1870 + }, + { + "epoch": 7.395256916996048, + "grad_norm": 2.4510388374328613, + "learning_rate": 1.3075396825396826e-05, + "loss": 0.0765, + "step": 1871 + }, + { + "epoch": 7.399209486166008, + "grad_norm": 1.9874929189682007, + "learning_rate": 1.3055555555555557e-05, + "loss": 0.0798, + "step": 1872 + }, + { + "epoch": 7.403162055335969, + "grad_norm": 1.9240570068359375, + "learning_rate": 1.3035714285714287e-05, + "loss": 0.0649, + "step": 1873 + }, + { + "epoch": 7.407114624505929, + "grad_norm": 1.8569307327270508, + "learning_rate": 1.3015873015873018e-05, + "loss": 0.061, + "step": 1874 + }, + { + "epoch": 7.41106719367589, + "grad_norm": 2.232877016067505, + "learning_rate": 1.2996031746031748e-05, + "loss": 0.0778, + "step": 1875 + }, + { + "epoch": 7.41501976284585, + "grad_norm": 2.330413818359375, + "learning_rate": 1.2976190476190478e-05, + "loss": 0.0897, + "step": 1876 + }, + { + "epoch": 7.4189723320158105, + "grad_norm": 2.543062686920166, + "learning_rate": 1.2956349206349209e-05, + "loss": 0.1039, + "step": 1877 + }, + { + "epoch": 7.4229249011857705, + "grad_norm": 2.075242757797241, + "learning_rate": 1.2936507936507939e-05, + "loss": 0.0918, + "step": 1878 + }, + { + "epoch": 7.426877470355731, + "grad_norm": 2.339674234390259, + "learning_rate": 1.2916666666666668e-05, + "loss": 0.0905, + "step": 1879 + }, + { + "epoch": 7.430830039525691, + "grad_norm": 2.0178771018981934, + "learning_rate": 1.2896825396825398e-05, + "loss": 0.074, + "step": 1880 + }, + { + "epoch": 7.434782608695652, + "grad_norm": 2.169339179992676, + "learning_rate": 1.2876984126984127e-05, + "loss": 0.0627, + "step": 1881 + }, + { + "epoch": 7.438735177865612, + "grad_norm": 3.798720121383667, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.1162, + "step": 1882 + }, + { + "epoch": 7.442687747035573, + "grad_norm": 3.5218403339385986, + "learning_rate": 1.2837301587301586e-05, + "loss": 0.1009, + "step": 1883 + }, + { + "epoch": 7.446640316205533, + "grad_norm": 2.855560302734375, + "learning_rate": 1.2817460317460316e-05, + "loss": 0.1037, + "step": 1884 + }, + { + "epoch": 7.450592885375494, + "grad_norm": 1.8869415521621704, + "learning_rate": 1.2797619047619047e-05, + "loss": 0.0598, + "step": 1885 + }, + { + "epoch": 7.454545454545454, + "grad_norm": 1.6059690713882446, + "learning_rate": 1.2777777777777777e-05, + "loss": 0.0565, + "step": 1886 + }, + { + "epoch": 7.458498023715415, + "grad_norm": 3.0044198036193848, + "learning_rate": 1.2757936507936507e-05, + "loss": 0.131, + "step": 1887 + }, + { + "epoch": 7.462450592885375, + "grad_norm": 2.2843515872955322, + "learning_rate": 1.2738095238095238e-05, + "loss": 0.0856, + "step": 1888 + }, + { + "epoch": 7.466403162055336, + "grad_norm": 1.9446831941604614, + "learning_rate": 1.2718253968253968e-05, + "loss": 0.0657, + "step": 1889 + }, + { + "epoch": 7.470355731225297, + "grad_norm": 2.3453097343444824, + "learning_rate": 1.2698412698412699e-05, + "loss": 0.0929, + "step": 1890 + }, + { + "epoch": 7.474308300395257, + "grad_norm": 2.4141080379486084, + "learning_rate": 1.2678571428571429e-05, + "loss": 0.0882, + "step": 1891 + }, + { + "epoch": 7.478260869565218, + "grad_norm": 3.0322489738464355, + "learning_rate": 1.265873015873016e-05, + "loss": 0.0845, + "step": 1892 + }, + { + "epoch": 7.482213438735178, + "grad_norm": 2.348433017730713, + "learning_rate": 1.263888888888889e-05, + "loss": 0.0828, + "step": 1893 + }, + { + "epoch": 7.486166007905139, + "grad_norm": 2.1347391605377197, + "learning_rate": 1.261904761904762e-05, + "loss": 0.0667, + "step": 1894 + }, + { + "epoch": 7.490118577075099, + "grad_norm": 2.272301435470581, + "learning_rate": 1.2599206349206349e-05, + "loss": 0.0898, + "step": 1895 + }, + { + "epoch": 7.4940711462450595, + "grad_norm": 2.3849878311157227, + "learning_rate": 1.2579365079365079e-05, + "loss": 0.0747, + "step": 1896 + }, + { + "epoch": 7.4980237154150196, + "grad_norm": 2.220501661300659, + "learning_rate": 1.255952380952381e-05, + "loss": 0.0794, + "step": 1897 + }, + { + "epoch": 7.5019762845849804, + "grad_norm": 2.281405448913574, + "learning_rate": 1.253968253968254e-05, + "loss": 0.0915, + "step": 1898 + }, + { + "epoch": 7.5059288537549405, + "grad_norm": 2.4831249713897705, + "learning_rate": 1.251984126984127e-05, + "loss": 0.0725, + "step": 1899 + }, + { + "epoch": 7.509881422924901, + "grad_norm": 2.52744197845459, + "learning_rate": 1.25e-05, + "loss": 0.0719, + "step": 1900 + }, + { + "epoch": 7.513833992094861, + "grad_norm": 2.3339502811431885, + "learning_rate": 1.2480158730158731e-05, + "loss": 0.0821, + "step": 1901 + }, + { + "epoch": 7.517786561264822, + "grad_norm": 2.408015012741089, + "learning_rate": 1.2460317460317461e-05, + "loss": 0.0929, + "step": 1902 + }, + { + "epoch": 7.521739130434782, + "grad_norm": 2.307608127593994, + "learning_rate": 1.2440476190476192e-05, + "loss": 0.0884, + "step": 1903 + }, + { + "epoch": 7.525691699604743, + "grad_norm": 2.454751491546631, + "learning_rate": 1.2420634920634922e-05, + "loss": 0.0851, + "step": 1904 + }, + { + "epoch": 7.529644268774703, + "grad_norm": 1.6989669799804688, + "learning_rate": 1.2400793650793652e-05, + "loss": 0.0765, + "step": 1905 + }, + { + "epoch": 7.533596837944664, + "grad_norm": 3.3208930492401123, + "learning_rate": 1.2380952380952381e-05, + "loss": 0.1033, + "step": 1906 + }, + { + "epoch": 7.537549407114625, + "grad_norm": 2.317495822906494, + "learning_rate": 1.2361111111111112e-05, + "loss": 0.0911, + "step": 1907 + }, + { + "epoch": 7.541501976284585, + "grad_norm": 2.3214991092681885, + "learning_rate": 1.2341269841269842e-05, + "loss": 0.0985, + "step": 1908 + }, + { + "epoch": 7.545454545454545, + "grad_norm": 2.3527603149414062, + "learning_rate": 1.2321428571428572e-05, + "loss": 0.077, + "step": 1909 + }, + { + "epoch": 7.549407114624506, + "grad_norm": 3.5933728218078613, + "learning_rate": 1.2301587301587301e-05, + "loss": 0.1284, + "step": 1910 + }, + { + "epoch": 7.553359683794467, + "grad_norm": 2.274735450744629, + "learning_rate": 1.2281746031746031e-05, + "loss": 0.1062, + "step": 1911 + }, + { + "epoch": 7.557312252964427, + "grad_norm": 3.133971691131592, + "learning_rate": 1.2261904761904762e-05, + "loss": 0.1185, + "step": 1912 + }, + { + "epoch": 7.561264822134388, + "grad_norm": 3.7312350273132324, + "learning_rate": 1.2242063492063492e-05, + "loss": 0.1343, + "step": 1913 + }, + { + "epoch": 7.565217391304348, + "grad_norm": 1.8403165340423584, + "learning_rate": 1.2222222222222222e-05, + "loss": 0.0699, + "step": 1914 + }, + { + "epoch": 7.569169960474309, + "grad_norm": 3.350177049636841, + "learning_rate": 1.2202380952380953e-05, + "loss": 0.1266, + "step": 1915 + }, + { + "epoch": 7.573122529644269, + "grad_norm": 1.6349577903747559, + "learning_rate": 1.2182539682539683e-05, + "loss": 0.0628, + "step": 1916 + }, + { + "epoch": 7.5770750988142295, + "grad_norm": 1.906072974205017, + "learning_rate": 1.2162698412698414e-05, + "loss": 0.0835, + "step": 1917 + }, + { + "epoch": 7.5810276679841895, + "grad_norm": 2.1547391414642334, + "learning_rate": 1.2142857142857144e-05, + "loss": 0.0803, + "step": 1918 + }, + { + "epoch": 7.58498023715415, + "grad_norm": 2.4813320636749268, + "learning_rate": 1.2123015873015874e-05, + "loss": 0.1215, + "step": 1919 + }, + { + "epoch": 7.58893280632411, + "grad_norm": 2.235426664352417, + "learning_rate": 1.2103174603174603e-05, + "loss": 0.0785, + "step": 1920 + }, + { + "epoch": 7.592885375494071, + "grad_norm": 2.80841326713562, + "learning_rate": 1.2083333333333333e-05, + "loss": 0.1276, + "step": 1921 + }, + { + "epoch": 7.596837944664031, + "grad_norm": 2.305530548095703, + "learning_rate": 1.2063492063492064e-05, + "loss": 0.065, + "step": 1922 + }, + { + "epoch": 7.600790513833992, + "grad_norm": 1.6414098739624023, + "learning_rate": 1.2043650793650794e-05, + "loss": 0.0607, + "step": 1923 + }, + { + "epoch": 7.604743083003952, + "grad_norm": 2.6492977142333984, + "learning_rate": 1.2023809523809525e-05, + "loss": 0.0877, + "step": 1924 + }, + { + "epoch": 7.608695652173913, + "grad_norm": 2.1286118030548096, + "learning_rate": 1.2003968253968255e-05, + "loss": 0.0843, + "step": 1925 + }, + { + "epoch": 7.612648221343873, + "grad_norm": 2.137827157974243, + "learning_rate": 1.1984126984126985e-05, + "loss": 0.0691, + "step": 1926 + }, + { + "epoch": 7.616600790513834, + "grad_norm": 2.486067056655884, + "learning_rate": 1.1964285714285716e-05, + "loss": 0.0795, + "step": 1927 + }, + { + "epoch": 7.620553359683795, + "grad_norm": 3.5269148349761963, + "learning_rate": 1.1944444444444446e-05, + "loss": 0.1021, + "step": 1928 + }, + { + "epoch": 7.624505928853755, + "grad_norm": 3.005458116531372, + "learning_rate": 1.1924603174603176e-05, + "loss": 0.0863, + "step": 1929 + }, + { + "epoch": 7.628458498023716, + "grad_norm": 1.5922969579696655, + "learning_rate": 1.1904761904761905e-05, + "loss": 0.0562, + "step": 1930 + }, + { + "epoch": 7.632411067193676, + "grad_norm": 2.0607211589813232, + "learning_rate": 1.1884920634920635e-05, + "loss": 0.0796, + "step": 1931 + }, + { + "epoch": 7.636363636363637, + "grad_norm": 2.8598814010620117, + "learning_rate": 1.1865079365079366e-05, + "loss": 0.1132, + "step": 1932 + }, + { + "epoch": 7.640316205533597, + "grad_norm": 2.145017147064209, + "learning_rate": 1.1845238095238095e-05, + "loss": 0.081, + "step": 1933 + }, + { + "epoch": 7.644268774703558, + "grad_norm": 1.9191386699676514, + "learning_rate": 1.1825396825396825e-05, + "loss": 0.0867, + "step": 1934 + }, + { + "epoch": 7.648221343873518, + "grad_norm": 2.149658441543579, + "learning_rate": 1.1805555555555555e-05, + "loss": 0.0706, + "step": 1935 + }, + { + "epoch": 7.6521739130434785, + "grad_norm": 2.7651522159576416, + "learning_rate": 1.1785714285714286e-05, + "loss": 0.0835, + "step": 1936 + }, + { + "epoch": 7.6561264822134385, + "grad_norm": 3.162454605102539, + "learning_rate": 1.1765873015873016e-05, + "loss": 0.1092, + "step": 1937 + }, + { + "epoch": 7.660079051383399, + "grad_norm": 2.8850865364074707, + "learning_rate": 1.1746031746031746e-05, + "loss": 0.0828, + "step": 1938 + }, + { + "epoch": 7.664031620553359, + "grad_norm": 1.7384766340255737, + "learning_rate": 1.1726190476190477e-05, + "loss": 0.057, + "step": 1939 + }, + { + "epoch": 7.66798418972332, + "grad_norm": 1.8011753559112549, + "learning_rate": 1.1706349206349207e-05, + "loss": 0.0737, + "step": 1940 + }, + { + "epoch": 7.67193675889328, + "grad_norm": 3.141075372695923, + "learning_rate": 1.1686507936507938e-05, + "loss": 0.0868, + "step": 1941 + }, + { + "epoch": 7.675889328063241, + "grad_norm": 3.0187206268310547, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.1188, + "step": 1942 + }, + { + "epoch": 7.679841897233201, + "grad_norm": 2.71610689163208, + "learning_rate": 1.1646825396825398e-05, + "loss": 0.0873, + "step": 1943 + }, + { + "epoch": 7.683794466403162, + "grad_norm": 3.3756630420684814, + "learning_rate": 1.1626984126984129e-05, + "loss": 0.1259, + "step": 1944 + }, + { + "epoch": 7.687747035573123, + "grad_norm": 2.53981351852417, + "learning_rate": 1.1607142857142857e-05, + "loss": 0.0935, + "step": 1945 + }, + { + "epoch": 7.691699604743083, + "grad_norm": 1.9116166830062866, + "learning_rate": 1.1587301587301588e-05, + "loss": 0.0657, + "step": 1946 + }, + { + "epoch": 7.695652173913043, + "grad_norm": 2.145357370376587, + "learning_rate": 1.1567460317460318e-05, + "loss": 0.0928, + "step": 1947 + }, + { + "epoch": 7.699604743083004, + "grad_norm": 2.8089849948883057, + "learning_rate": 1.1547619047619048e-05, + "loss": 0.0938, + "step": 1948 + }, + { + "epoch": 7.703557312252965, + "grad_norm": 2.510547399520874, + "learning_rate": 1.1527777777777779e-05, + "loss": 0.0942, + "step": 1949 + }, + { + "epoch": 7.707509881422925, + "grad_norm": 2.142611503601074, + "learning_rate": 1.1507936507936508e-05, + "loss": 0.0776, + "step": 1950 + }, + { + "epoch": 7.711462450592886, + "grad_norm": 2.8611981868743896, + "learning_rate": 1.1488095238095238e-05, + "loss": 0.0874, + "step": 1951 + }, + { + "epoch": 7.715415019762846, + "grad_norm": 2.1545233726501465, + "learning_rate": 1.1468253968253968e-05, + "loss": 0.0846, + "step": 1952 + }, + { + "epoch": 7.719367588932807, + "grad_norm": 2.4546070098876953, + "learning_rate": 1.1448412698412699e-05, + "loss": 0.0745, + "step": 1953 + }, + { + "epoch": 7.723320158102767, + "grad_norm": 3.605870008468628, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.1321, + "step": 1954 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 3.173536539077759, + "learning_rate": 1.140873015873016e-05, + "loss": 0.1024, + "step": 1955 + }, + { + "epoch": 7.7312252964426875, + "grad_norm": 2.163079023361206, + "learning_rate": 1.138888888888889e-05, + "loss": 0.0626, + "step": 1956 + }, + { + "epoch": 7.735177865612648, + "grad_norm": 2.247568130493164, + "learning_rate": 1.136904761904762e-05, + "loss": 0.0801, + "step": 1957 + }, + { + "epoch": 7.739130434782608, + "grad_norm": 2.0294995307922363, + "learning_rate": 1.1349206349206349e-05, + "loss": 0.0987, + "step": 1958 + }, + { + "epoch": 7.743083003952569, + "grad_norm": 2.5226917266845703, + "learning_rate": 1.132936507936508e-05, + "loss": 0.0815, + "step": 1959 + }, + { + "epoch": 7.747035573122529, + "grad_norm": 2.68029522895813, + "learning_rate": 1.130952380952381e-05, + "loss": 0.1003, + "step": 1960 + }, + { + "epoch": 7.75098814229249, + "grad_norm": 2.617349624633789, + "learning_rate": 1.128968253968254e-05, + "loss": 0.0843, + "step": 1961 + }, + { + "epoch": 7.75494071146245, + "grad_norm": 2.5157854557037354, + "learning_rate": 1.126984126984127e-05, + "loss": 0.1024, + "step": 1962 + }, + { + "epoch": 7.758893280632411, + "grad_norm": 2.2654972076416016, + "learning_rate": 1.125e-05, + "loss": 0.0801, + "step": 1963 + }, + { + "epoch": 7.762845849802371, + "grad_norm": 2.273188352584839, + "learning_rate": 1.1230158730158731e-05, + "loss": 0.0774, + "step": 1964 + }, + { + "epoch": 7.766798418972332, + "grad_norm": 2.1782922744750977, + "learning_rate": 1.1210317460317461e-05, + "loss": 0.069, + "step": 1965 + }, + { + "epoch": 7.770750988142293, + "grad_norm": 2.23232102394104, + "learning_rate": 1.1190476190476192e-05, + "loss": 0.0868, + "step": 1966 + }, + { + "epoch": 7.774703557312253, + "grad_norm": 2.0577633380889893, + "learning_rate": 1.1170634920634922e-05, + "loss": 0.0718, + "step": 1967 + }, + { + "epoch": 7.778656126482213, + "grad_norm": 2.2454826831817627, + "learning_rate": 1.1150793650793653e-05, + "loss": 0.0789, + "step": 1968 + }, + { + "epoch": 7.782608695652174, + "grad_norm": 3.088087558746338, + "learning_rate": 1.1130952380952381e-05, + "loss": 0.0944, + "step": 1969 + }, + { + "epoch": 7.786561264822135, + "grad_norm": 2.3378899097442627, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0908, + "step": 1970 + }, + { + "epoch": 7.790513833992095, + "grad_norm": 2.6260411739349365, + "learning_rate": 1.1091269841269842e-05, + "loss": 0.087, + "step": 1971 + }, + { + "epoch": 7.794466403162056, + "grad_norm": 1.8749479055404663, + "learning_rate": 1.107142857142857e-05, + "loss": 0.0641, + "step": 1972 + }, + { + "epoch": 7.798418972332016, + "grad_norm": 2.9281997680664062, + "learning_rate": 1.1051587301587301e-05, + "loss": 0.1033, + "step": 1973 + }, + { + "epoch": 7.8023715415019765, + "grad_norm": 1.7113523483276367, + "learning_rate": 1.1031746031746031e-05, + "loss": 0.0573, + "step": 1974 + }, + { + "epoch": 7.8063241106719365, + "grad_norm": 4.461965560913086, + "learning_rate": 1.1011904761904762e-05, + "loss": 0.1805, + "step": 1975 + }, + { + "epoch": 7.810276679841897, + "grad_norm": 2.3460776805877686, + "learning_rate": 1.0992063492063492e-05, + "loss": 0.0981, + "step": 1976 + }, + { + "epoch": 7.8142292490118574, + "grad_norm": 2.7546355724334717, + "learning_rate": 1.0972222222222223e-05, + "loss": 0.0963, + "step": 1977 + }, + { + "epoch": 7.818181818181818, + "grad_norm": 3.3164448738098145, + "learning_rate": 1.0952380952380953e-05, + "loss": 0.096, + "step": 1978 + }, + { + "epoch": 7.822134387351778, + "grad_norm": 1.8524004220962524, + "learning_rate": 1.0932539682539683e-05, + "loss": 0.0725, + "step": 1979 + }, + { + "epoch": 7.826086956521739, + "grad_norm": 2.2797439098358154, + "learning_rate": 1.0912698412698414e-05, + "loss": 0.0792, + "step": 1980 + }, + { + "epoch": 7.830039525691699, + "grad_norm": 2.863020420074463, + "learning_rate": 1.0892857142857144e-05, + "loss": 0.0749, + "step": 1981 + }, + { + "epoch": 7.83399209486166, + "grad_norm": 2.2198400497436523, + "learning_rate": 1.0873015873015874e-05, + "loss": 0.0986, + "step": 1982 + }, + { + "epoch": 7.837944664031621, + "grad_norm": 1.540998935699463, + "learning_rate": 1.0853174603174605e-05, + "loss": 0.0654, + "step": 1983 + }, + { + "epoch": 7.841897233201581, + "grad_norm": 1.856723666191101, + "learning_rate": 1.0833333333333334e-05, + "loss": 0.0751, + "step": 1984 + }, + { + "epoch": 7.845849802371541, + "grad_norm": 2.4738926887512207, + "learning_rate": 1.0813492063492064e-05, + "loss": 0.0848, + "step": 1985 + }, + { + "epoch": 7.849802371541502, + "grad_norm": 1.9589910507202148, + "learning_rate": 1.0793650793650794e-05, + "loss": 0.0674, + "step": 1986 + }, + { + "epoch": 7.853754940711463, + "grad_norm": 2.1947100162506104, + "learning_rate": 1.0773809523809525e-05, + "loss": 0.077, + "step": 1987 + }, + { + "epoch": 7.857707509881423, + "grad_norm": 3.0187575817108154, + "learning_rate": 1.0753968253968255e-05, + "loss": 0.1128, + "step": 1988 + }, + { + "epoch": 7.861660079051384, + "grad_norm": 1.8199398517608643, + "learning_rate": 1.0734126984126984e-05, + "loss": 0.077, + "step": 1989 + }, + { + "epoch": 7.865612648221344, + "grad_norm": 2.459689140319824, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.0834, + "step": 1990 + }, + { + "epoch": 7.869565217391305, + "grad_norm": 2.451312303543091, + "learning_rate": 1.0694444444444444e-05, + "loss": 0.1036, + "step": 1991 + }, + { + "epoch": 7.873517786561265, + "grad_norm": 2.400502920150757, + "learning_rate": 1.0674603174603175e-05, + "loss": 0.0821, + "step": 1992 + }, + { + "epoch": 7.877470355731226, + "grad_norm": 1.9994014501571655, + "learning_rate": 1.0654761904761905e-05, + "loss": 0.072, + "step": 1993 + }, + { + "epoch": 7.881422924901186, + "grad_norm": 1.9804929494857788, + "learning_rate": 1.0634920634920636e-05, + "loss": 0.0723, + "step": 1994 + }, + { + "epoch": 7.8853754940711465, + "grad_norm": 1.7949855327606201, + "learning_rate": 1.0615079365079366e-05, + "loss": 0.0725, + "step": 1995 + }, + { + "epoch": 7.8893280632411065, + "grad_norm": 3.4392995834350586, + "learning_rate": 1.0595238095238096e-05, + "loss": 0.1088, + "step": 1996 + }, + { + "epoch": 7.893280632411067, + "grad_norm": 2.1142258644104004, + "learning_rate": 1.0575396825396825e-05, + "loss": 0.0653, + "step": 1997 + }, + { + "epoch": 7.897233201581027, + "grad_norm": 1.7416729927062988, + "learning_rate": 1.0555555555555555e-05, + "loss": 0.0623, + "step": 1998 + }, + { + "epoch": 7.901185770750988, + "grad_norm": 2.438103437423706, + "learning_rate": 1.0535714285714286e-05, + "loss": 0.0712, + "step": 1999 + }, + { + "epoch": 7.905138339920948, + "grad_norm": 2.4873170852661133, + "learning_rate": 1.0515873015873016e-05, + "loss": 0.0704, + "step": 2000 + }, + { + "epoch": 7.909090909090909, + "grad_norm": 2.9187819957733154, + "learning_rate": 1.0496031746031747e-05, + "loss": 0.1055, + "step": 2001 + }, + { + "epoch": 7.913043478260869, + "grad_norm": 2.2539193630218506, + "learning_rate": 1.0476190476190477e-05, + "loss": 0.0777, + "step": 2002 + }, + { + "epoch": 7.91699604743083, + "grad_norm": 1.9834545850753784, + "learning_rate": 1.0456349206349207e-05, + "loss": 0.0651, + "step": 2003 + }, + { + "epoch": 7.920948616600791, + "grad_norm": 2.7357428073883057, + "learning_rate": 1.0436507936507938e-05, + "loss": 0.0943, + "step": 2004 + }, + { + "epoch": 7.924901185770751, + "grad_norm": 2.9565341472625732, + "learning_rate": 1.0416666666666668e-05, + "loss": 0.1032, + "step": 2005 + }, + { + "epoch": 7.928853754940711, + "grad_norm": 2.176581382751465, + "learning_rate": 1.0396825396825398e-05, + "loss": 0.0774, + "step": 2006 + }, + { + "epoch": 7.932806324110672, + "grad_norm": 2.717416524887085, + "learning_rate": 1.0376984126984129e-05, + "loss": 0.1105, + "step": 2007 + }, + { + "epoch": 7.936758893280633, + "grad_norm": 2.976388931274414, + "learning_rate": 1.0357142857142859e-05, + "loss": 0.1036, + "step": 2008 + }, + { + "epoch": 7.940711462450593, + "grad_norm": 4.3491411209106445, + "learning_rate": 1.0337301587301588e-05, + "loss": 0.166, + "step": 2009 + }, + { + "epoch": 7.944664031620554, + "grad_norm": 2.4038586616516113, + "learning_rate": 1.0317460317460318e-05, + "loss": 0.0833, + "step": 2010 + }, + { + "epoch": 7.948616600790514, + "grad_norm": 2.2004966735839844, + "learning_rate": 1.0297619047619047e-05, + "loss": 0.079, + "step": 2011 + }, + { + "epoch": 7.952569169960475, + "grad_norm": 2.817812919616699, + "learning_rate": 1.0277777777777777e-05, + "loss": 0.1185, + "step": 2012 + }, + { + "epoch": 7.956521739130435, + "grad_norm": 2.230272054672241, + "learning_rate": 1.0257936507936508e-05, + "loss": 0.0932, + "step": 2013 + }, + { + "epoch": 7.9604743083003955, + "grad_norm": 2.4217472076416016, + "learning_rate": 1.0238095238095238e-05, + "loss": 0.0912, + "step": 2014 + }, + { + "epoch": 7.9644268774703555, + "grad_norm": 2.3574728965759277, + "learning_rate": 1.0218253968253968e-05, + "loss": 0.0844, + "step": 2015 + }, + { + "epoch": 7.968379446640316, + "grad_norm": 1.9760262966156006, + "learning_rate": 1.0198412698412699e-05, + "loss": 0.0852, + "step": 2016 + }, + { + "epoch": 7.972332015810276, + "grad_norm": 3.1597955226898193, + "learning_rate": 1.0178571428571429e-05, + "loss": 0.1101, + "step": 2017 + }, + { + "epoch": 7.976284584980237, + "grad_norm": 2.150995969772339, + "learning_rate": 1.015873015873016e-05, + "loss": 0.0867, + "step": 2018 + }, + { + "epoch": 7.980237154150197, + "grad_norm": 2.0937557220458984, + "learning_rate": 1.013888888888889e-05, + "loss": 0.0701, + "step": 2019 + }, + { + "epoch": 7.984189723320158, + "grad_norm": 3.2109408378601074, + "learning_rate": 1.011904761904762e-05, + "loss": 0.0974, + "step": 2020 + }, + { + "epoch": 7.988142292490118, + "grad_norm": 3.7526028156280518, + "learning_rate": 1.009920634920635e-05, + "loss": 0.1281, + "step": 2021 + }, + { + "epoch": 7.992094861660079, + "grad_norm": 2.220501184463501, + "learning_rate": 1.007936507936508e-05, + "loss": 0.1216, + "step": 2022 + }, + { + "epoch": 7.996047430830039, + "grad_norm": 2.583831548690796, + "learning_rate": 1.005952380952381e-05, + "loss": 0.0817, + "step": 2023 + }, + { + "epoch": 8.0, + "grad_norm": 2.142831802368164, + "learning_rate": 1.003968253968254e-05, + "loss": 0.0659, + "step": 2024 + }, + { + "epoch": 8.003952569169961, + "grad_norm": 0.9592176079750061, + "learning_rate": 1.001984126984127e-05, + "loss": 0.0366, + "step": 2025 + }, + { + "epoch": 8.007905138339922, + "grad_norm": 1.110743522644043, + "learning_rate": 1e-05, + "loss": 0.0397, + "step": 2026 + }, + { + "epoch": 8.011857707509881, + "grad_norm": 1.0800786018371582, + "learning_rate": 9.980158730158731e-06, + "loss": 0.04, + "step": 2027 + }, + { + "epoch": 8.015810276679842, + "grad_norm": 1.1953082084655762, + "learning_rate": 9.96031746031746e-06, + "loss": 0.04, + "step": 2028 + }, + { + "epoch": 8.019762845849803, + "grad_norm": 1.4953047037124634, + "learning_rate": 9.94047619047619e-06, + "loss": 0.0726, + "step": 2029 + }, + { + "epoch": 8.023715415019764, + "grad_norm": 1.4194068908691406, + "learning_rate": 9.92063492063492e-06, + "loss": 0.0474, + "step": 2030 + }, + { + "epoch": 8.027667984189723, + "grad_norm": 1.2994885444641113, + "learning_rate": 9.900793650793651e-06, + "loss": 0.0462, + "step": 2031 + }, + { + "epoch": 8.031620553359684, + "grad_norm": 1.5428236722946167, + "learning_rate": 9.880952380952381e-06, + "loss": 0.0681, + "step": 2032 + }, + { + "epoch": 8.035573122529645, + "grad_norm": 1.375458836555481, + "learning_rate": 9.861111111111112e-06, + "loss": 0.0437, + "step": 2033 + }, + { + "epoch": 8.039525691699605, + "grad_norm": 1.303333044052124, + "learning_rate": 9.841269841269842e-06, + "loss": 0.0418, + "step": 2034 + }, + { + "epoch": 8.043478260869565, + "grad_norm": 1.6367980241775513, + "learning_rate": 9.821428571428573e-06, + "loss": 0.0603, + "step": 2035 + }, + { + "epoch": 8.047430830039525, + "grad_norm": 1.2272205352783203, + "learning_rate": 9.801587301587301e-06, + "loss": 0.0373, + "step": 2036 + }, + { + "epoch": 8.051383399209486, + "grad_norm": 1.42149019241333, + "learning_rate": 9.781746031746032e-06, + "loss": 0.0376, + "step": 2037 + }, + { + "epoch": 8.055335968379447, + "grad_norm": 1.757165789604187, + "learning_rate": 9.761904761904762e-06, + "loss": 0.051, + "step": 2038 + }, + { + "epoch": 8.059288537549406, + "grad_norm": 2.7419040203094482, + "learning_rate": 9.742063492063492e-06, + "loss": 0.0709, + "step": 2039 + }, + { + "epoch": 8.063241106719367, + "grad_norm": 1.15654718875885, + "learning_rate": 9.722222222222223e-06, + "loss": 0.0328, + "step": 2040 + }, + { + "epoch": 8.067193675889328, + "grad_norm": 1.6456338167190552, + "learning_rate": 9.702380952380953e-06, + "loss": 0.0584, + "step": 2041 + }, + { + "epoch": 8.071146245059289, + "grad_norm": 1.8168511390686035, + "learning_rate": 9.682539682539683e-06, + "loss": 0.0547, + "step": 2042 + }, + { + "epoch": 8.075098814229248, + "grad_norm": 1.2571635246276855, + "learning_rate": 9.662698412698414e-06, + "loss": 0.0409, + "step": 2043 + }, + { + "epoch": 8.079051383399209, + "grad_norm": 1.5656439065933228, + "learning_rate": 9.642857142857144e-06, + "loss": 0.0416, + "step": 2044 + }, + { + "epoch": 8.08300395256917, + "grad_norm": 1.297942042350769, + "learning_rate": 9.623015873015875e-06, + "loss": 0.0449, + "step": 2045 + }, + { + "epoch": 8.08695652173913, + "grad_norm": 1.577046275138855, + "learning_rate": 9.603174603174605e-06, + "loss": 0.0613, + "step": 2046 + }, + { + "epoch": 8.090909090909092, + "grad_norm": 1.2949413061141968, + "learning_rate": 9.583333333333334e-06, + "loss": 0.0421, + "step": 2047 + }, + { + "epoch": 8.09486166007905, + "grad_norm": 1.3732746839523315, + "learning_rate": 9.563492063492064e-06, + "loss": 0.0487, + "step": 2048 + }, + { + "epoch": 8.098814229249012, + "grad_norm": 1.6629955768585205, + "learning_rate": 9.543650793650793e-06, + "loss": 0.0535, + "step": 2049 + }, + { + "epoch": 8.102766798418973, + "grad_norm": 1.3761000633239746, + "learning_rate": 9.523809523809523e-06, + "loss": 0.0475, + "step": 2050 + }, + { + "epoch": 8.106719367588934, + "grad_norm": 1.721295952796936, + "learning_rate": 9.503968253968253e-06, + "loss": 0.0532, + "step": 2051 + }, + { + "epoch": 8.110671936758893, + "grad_norm": 2.0054855346679688, + "learning_rate": 9.484126984126984e-06, + "loss": 0.0652, + "step": 2052 + }, + { + "epoch": 8.114624505928854, + "grad_norm": 1.1261333227157593, + "learning_rate": 9.464285714285714e-06, + "loss": 0.0385, + "step": 2053 + }, + { + "epoch": 8.118577075098814, + "grad_norm": 1.6062073707580566, + "learning_rate": 9.444444444444445e-06, + "loss": 0.0692, + "step": 2054 + }, + { + "epoch": 8.122529644268775, + "grad_norm": 1.7869125604629517, + "learning_rate": 9.424603174603175e-06, + "loss": 0.0545, + "step": 2055 + }, + { + "epoch": 8.126482213438734, + "grad_norm": 1.4991074800491333, + "learning_rate": 9.404761904761905e-06, + "loss": 0.0564, + "step": 2056 + }, + { + "epoch": 8.130434782608695, + "grad_norm": 1.527508020401001, + "learning_rate": 9.384920634920636e-06, + "loss": 0.0515, + "step": 2057 + }, + { + "epoch": 8.134387351778656, + "grad_norm": 1.497931957244873, + "learning_rate": 9.365079365079366e-06, + "loss": 0.0542, + "step": 2058 + }, + { + "epoch": 8.138339920948617, + "grad_norm": 1.5828138589859009, + "learning_rate": 9.345238095238096e-06, + "loss": 0.0502, + "step": 2059 + }, + { + "epoch": 8.142292490118576, + "grad_norm": 2.0395078659057617, + "learning_rate": 9.325396825396827e-06, + "loss": 0.0642, + "step": 2060 + }, + { + "epoch": 8.146245059288537, + "grad_norm": 1.7669570446014404, + "learning_rate": 9.305555555555555e-06, + "loss": 0.0685, + "step": 2061 + }, + { + "epoch": 8.150197628458498, + "grad_norm": 1.2871284484863281, + "learning_rate": 9.285714285714286e-06, + "loss": 0.0494, + "step": 2062 + }, + { + "epoch": 8.154150197628459, + "grad_norm": 1.5461068153381348, + "learning_rate": 9.265873015873016e-06, + "loss": 0.0486, + "step": 2063 + }, + { + "epoch": 8.15810276679842, + "grad_norm": 3.5550456047058105, + "learning_rate": 9.246031746031747e-06, + "loss": 0.0718, + "step": 2064 + }, + { + "epoch": 8.162055335968379, + "grad_norm": 1.4119137525558472, + "learning_rate": 9.226190476190477e-06, + "loss": 0.039, + "step": 2065 + }, + { + "epoch": 8.16600790513834, + "grad_norm": 1.7155872583389282, + "learning_rate": 9.206349206349207e-06, + "loss": 0.0476, + "step": 2066 + }, + { + "epoch": 8.1699604743083, + "grad_norm": 1.5434550046920776, + "learning_rate": 9.186507936507938e-06, + "loss": 0.0549, + "step": 2067 + }, + { + "epoch": 8.173913043478262, + "grad_norm": 1.677038550376892, + "learning_rate": 9.166666666666666e-06, + "loss": 0.0592, + "step": 2068 + }, + { + "epoch": 8.17786561264822, + "grad_norm": 1.7242430448532104, + "learning_rate": 9.146825396825397e-06, + "loss": 0.0461, + "step": 2069 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 1.5962724685668945, + "learning_rate": 9.126984126984127e-06, + "loss": 0.056, + "step": 2070 + }, + { + "epoch": 8.185770750988143, + "grad_norm": 1.2169913053512573, + "learning_rate": 9.107142857142858e-06, + "loss": 0.0414, + "step": 2071 + }, + { + "epoch": 8.189723320158103, + "grad_norm": 1.2942997217178345, + "learning_rate": 9.087301587301588e-06, + "loss": 0.0397, + "step": 2072 + }, + { + "epoch": 8.193675889328063, + "grad_norm": 1.064394474029541, + "learning_rate": 9.067460317460318e-06, + "loss": 0.0412, + "step": 2073 + }, + { + "epoch": 8.197628458498023, + "grad_norm": 1.550001859664917, + "learning_rate": 9.047619047619047e-06, + "loss": 0.0496, + "step": 2074 + }, + { + "epoch": 8.201581027667984, + "grad_norm": 0.9107499718666077, + "learning_rate": 9.027777777777777e-06, + "loss": 0.036, + "step": 2075 + }, + { + "epoch": 8.205533596837945, + "grad_norm": 1.4544087648391724, + "learning_rate": 9.007936507936508e-06, + "loss": 0.0486, + "step": 2076 + }, + { + "epoch": 8.209486166007904, + "grad_norm": 1.8703144788742065, + "learning_rate": 8.988095238095238e-06, + "loss": 0.0581, + "step": 2077 + }, + { + "epoch": 8.213438735177865, + "grad_norm": 1.3876852989196777, + "learning_rate": 8.968253968253968e-06, + "loss": 0.0463, + "step": 2078 + }, + { + "epoch": 8.217391304347826, + "grad_norm": 1.303426742553711, + "learning_rate": 8.948412698412699e-06, + "loss": 0.0374, + "step": 2079 + }, + { + "epoch": 8.221343873517787, + "grad_norm": 1.1986994743347168, + "learning_rate": 8.92857142857143e-06, + "loss": 0.0455, + "step": 2080 + }, + { + "epoch": 8.225296442687746, + "grad_norm": 1.7828227281570435, + "learning_rate": 8.90873015873016e-06, + "loss": 0.0522, + "step": 2081 + }, + { + "epoch": 8.229249011857707, + "grad_norm": 1.5747590065002441, + "learning_rate": 8.88888888888889e-06, + "loss": 0.0524, + "step": 2082 + }, + { + "epoch": 8.233201581027668, + "grad_norm": 1.693088412284851, + "learning_rate": 8.86904761904762e-06, + "loss": 0.0467, + "step": 2083 + }, + { + "epoch": 8.237154150197629, + "grad_norm": 1.7213762998580933, + "learning_rate": 8.84920634920635e-06, + "loss": 0.0479, + "step": 2084 + }, + { + "epoch": 8.24110671936759, + "grad_norm": 1.4260462522506714, + "learning_rate": 8.829365079365081e-06, + "loss": 0.0515, + "step": 2085 + }, + { + "epoch": 8.245059288537549, + "grad_norm": 1.4755898714065552, + "learning_rate": 8.80952380952381e-06, + "loss": 0.0434, + "step": 2086 + }, + { + "epoch": 8.24901185770751, + "grad_norm": 1.2368816137313843, + "learning_rate": 8.78968253968254e-06, + "loss": 0.0404, + "step": 2087 + }, + { + "epoch": 8.25296442687747, + "grad_norm": 1.3295128345489502, + "learning_rate": 8.769841269841269e-06, + "loss": 0.0443, + "step": 2088 + }, + { + "epoch": 8.256916996047432, + "grad_norm": 1.3453065156936646, + "learning_rate": 8.75e-06, + "loss": 0.044, + "step": 2089 + }, + { + "epoch": 8.26086956521739, + "grad_norm": 1.3558346033096313, + "learning_rate": 8.73015873015873e-06, + "loss": 0.0477, + "step": 2090 + }, + { + "epoch": 8.264822134387352, + "grad_norm": 1.5338172912597656, + "learning_rate": 8.71031746031746e-06, + "loss": 0.0489, + "step": 2091 + }, + { + "epoch": 8.268774703557312, + "grad_norm": 1.350160002708435, + "learning_rate": 8.69047619047619e-06, + "loss": 0.0508, + "step": 2092 + }, + { + "epoch": 8.272727272727273, + "grad_norm": 1.5303537845611572, + "learning_rate": 8.67063492063492e-06, + "loss": 0.0465, + "step": 2093 + }, + { + "epoch": 8.276679841897232, + "grad_norm": 1.5438976287841797, + "learning_rate": 8.650793650793651e-06, + "loss": 0.0692, + "step": 2094 + }, + { + "epoch": 8.280632411067193, + "grad_norm": 1.1079212427139282, + "learning_rate": 8.630952380952381e-06, + "loss": 0.044, + "step": 2095 + }, + { + "epoch": 8.284584980237154, + "grad_norm": 1.2162022590637207, + "learning_rate": 8.611111111111112e-06, + "loss": 0.0369, + "step": 2096 + }, + { + "epoch": 8.288537549407115, + "grad_norm": 1.4534077644348145, + "learning_rate": 8.591269841269842e-06, + "loss": 0.0549, + "step": 2097 + }, + { + "epoch": 8.292490118577074, + "grad_norm": 1.3834608793258667, + "learning_rate": 8.571428571428573e-06, + "loss": 0.0423, + "step": 2098 + }, + { + "epoch": 8.296442687747035, + "grad_norm": 1.7133499383926392, + "learning_rate": 8.551587301587303e-06, + "loss": 0.0612, + "step": 2099 + }, + { + "epoch": 8.300395256916996, + "grad_norm": 1.966752052307129, + "learning_rate": 8.531746031746032e-06, + "loss": 0.0524, + "step": 2100 + }, + { + "epoch": 8.304347826086957, + "grad_norm": 2.143880844116211, + "learning_rate": 8.511904761904762e-06, + "loss": 0.0515, + "step": 2101 + }, + { + "epoch": 8.308300395256918, + "grad_norm": 2.052530527114868, + "learning_rate": 8.492063492063492e-06, + "loss": 0.0721, + "step": 2102 + }, + { + "epoch": 8.312252964426877, + "grad_norm": 2.0002057552337646, + "learning_rate": 8.472222222222223e-06, + "loss": 0.0559, + "step": 2103 + }, + { + "epoch": 8.316205533596838, + "grad_norm": 1.0597388744354248, + "learning_rate": 8.452380952380953e-06, + "loss": 0.0444, + "step": 2104 + }, + { + "epoch": 8.320158102766799, + "grad_norm": 1.3378074169158936, + "learning_rate": 8.432539682539684e-06, + "loss": 0.0408, + "step": 2105 + }, + { + "epoch": 8.32411067193676, + "grad_norm": 1.926456093788147, + "learning_rate": 8.412698412698414e-06, + "loss": 0.0592, + "step": 2106 + }, + { + "epoch": 8.328063241106719, + "grad_norm": 1.6579585075378418, + "learning_rate": 8.392857142857143e-06, + "loss": 0.0448, + "step": 2107 + }, + { + "epoch": 8.33201581027668, + "grad_norm": 1.6691573858261108, + "learning_rate": 8.373015873015873e-06, + "loss": 0.0476, + "step": 2108 + }, + { + "epoch": 8.33596837944664, + "grad_norm": 1.7489898204803467, + "learning_rate": 8.353174603174603e-06, + "loss": 0.067, + "step": 2109 + }, + { + "epoch": 8.339920948616601, + "grad_norm": 1.1511108875274658, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0477, + "step": 2110 + }, + { + "epoch": 8.34387351778656, + "grad_norm": 1.527985692024231, + "learning_rate": 8.313492063492064e-06, + "loss": 0.041, + "step": 2111 + }, + { + "epoch": 8.347826086956522, + "grad_norm": 1.5950675010681152, + "learning_rate": 8.293650793650794e-06, + "loss": 0.0588, + "step": 2112 + }, + { + "epoch": 8.351778656126482, + "grad_norm": 1.2112717628479004, + "learning_rate": 8.273809523809523e-06, + "loss": 0.0397, + "step": 2113 + }, + { + "epoch": 8.355731225296443, + "grad_norm": 1.6547131538391113, + "learning_rate": 8.253968253968254e-06, + "loss": 0.0609, + "step": 2114 + }, + { + "epoch": 8.359683794466402, + "grad_norm": 1.6982663869857788, + "learning_rate": 8.234126984126984e-06, + "loss": 0.0554, + "step": 2115 + }, + { + "epoch": 8.363636363636363, + "grad_norm": 1.5355052947998047, + "learning_rate": 8.214285714285714e-06, + "loss": 0.0663, + "step": 2116 + }, + { + "epoch": 8.367588932806324, + "grad_norm": 0.8851954340934753, + "learning_rate": 8.194444444444445e-06, + "loss": 0.0346, + "step": 2117 + }, + { + "epoch": 8.371541501976285, + "grad_norm": 1.6623241901397705, + "learning_rate": 8.174603174603175e-06, + "loss": 0.0488, + "step": 2118 + }, + { + "epoch": 8.375494071146244, + "grad_norm": 0.9663978219032288, + "learning_rate": 8.154761904761905e-06, + "loss": 0.0372, + "step": 2119 + }, + { + "epoch": 8.379446640316205, + "grad_norm": 1.2707703113555908, + "learning_rate": 8.134920634920636e-06, + "loss": 0.0439, + "step": 2120 + }, + { + "epoch": 8.383399209486166, + "grad_norm": 2.081395149230957, + "learning_rate": 8.115079365079366e-06, + "loss": 0.0474, + "step": 2121 + }, + { + "epoch": 8.387351778656127, + "grad_norm": 1.8119603395462036, + "learning_rate": 8.095238095238097e-06, + "loss": 0.0496, + "step": 2122 + }, + { + "epoch": 8.391304347826088, + "grad_norm": 1.7686362266540527, + "learning_rate": 8.075396825396827e-06, + "loss": 0.0549, + "step": 2123 + }, + { + "epoch": 8.395256916996047, + "grad_norm": 1.752198338508606, + "learning_rate": 8.055555555555557e-06, + "loss": 0.044, + "step": 2124 + }, + { + "epoch": 8.399209486166008, + "grad_norm": 1.523292064666748, + "learning_rate": 8.035714285714286e-06, + "loss": 0.0491, + "step": 2125 + }, + { + "epoch": 8.403162055335969, + "grad_norm": 1.8821699619293213, + "learning_rate": 8.015873015873016e-06, + "loss": 0.0473, + "step": 2126 + }, + { + "epoch": 8.40711462450593, + "grad_norm": 1.762847661972046, + "learning_rate": 7.996031746031745e-06, + "loss": 0.0498, + "step": 2127 + }, + { + "epoch": 8.411067193675889, + "grad_norm": 1.716994285583496, + "learning_rate": 7.976190476190475e-06, + "loss": 0.0536, + "step": 2128 + }, + { + "epoch": 8.41501976284585, + "grad_norm": 1.1499348878860474, + "learning_rate": 7.956349206349206e-06, + "loss": 0.0351, + "step": 2129 + }, + { + "epoch": 8.41897233201581, + "grad_norm": 0.8600573539733887, + "learning_rate": 7.936507936507936e-06, + "loss": 0.036, + "step": 2130 + }, + { + "epoch": 8.422924901185771, + "grad_norm": 1.5126005411148071, + "learning_rate": 7.916666666666667e-06, + "loss": 0.0523, + "step": 2131 + }, + { + "epoch": 8.42687747035573, + "grad_norm": 0.9862467050552368, + "learning_rate": 7.896825396825397e-06, + "loss": 0.0395, + "step": 2132 + }, + { + "epoch": 8.430830039525691, + "grad_norm": 1.1598824262619019, + "learning_rate": 7.876984126984127e-06, + "loss": 0.0387, + "step": 2133 + }, + { + "epoch": 8.434782608695652, + "grad_norm": 2.440631151199341, + "learning_rate": 7.857142857142858e-06, + "loss": 0.0462, + "step": 2134 + }, + { + "epoch": 8.438735177865613, + "grad_norm": 1.5269445180892944, + "learning_rate": 7.837301587301588e-06, + "loss": 0.0576, + "step": 2135 + }, + { + "epoch": 8.442687747035572, + "grad_norm": 1.6497447490692139, + "learning_rate": 7.817460317460318e-06, + "loss": 0.0695, + "step": 2136 + }, + { + "epoch": 8.446640316205533, + "grad_norm": 2.240730047225952, + "learning_rate": 7.797619047619049e-06, + "loss": 0.0558, + "step": 2137 + }, + { + "epoch": 8.450592885375494, + "grad_norm": 1.2452744245529175, + "learning_rate": 7.777777777777777e-06, + "loss": 0.0861, + "step": 2138 + }, + { + "epoch": 8.454545454545455, + "grad_norm": 1.3848949670791626, + "learning_rate": 7.757936507936508e-06, + "loss": 0.0522, + "step": 2139 + }, + { + "epoch": 8.458498023715414, + "grad_norm": 1.5077615976333618, + "learning_rate": 7.738095238095238e-06, + "loss": 0.051, + "step": 2140 + }, + { + "epoch": 8.462450592885375, + "grad_norm": 1.707553505897522, + "learning_rate": 7.718253968253969e-06, + "loss": 0.0592, + "step": 2141 + }, + { + "epoch": 8.466403162055336, + "grad_norm": 1.6871522665023804, + "learning_rate": 7.698412698412699e-06, + "loss": 0.0518, + "step": 2142 + }, + { + "epoch": 8.470355731225297, + "grad_norm": 2.1698107719421387, + "learning_rate": 7.67857142857143e-06, + "loss": 0.0599, + "step": 2143 + }, + { + "epoch": 8.474308300395258, + "grad_norm": 1.105559229850769, + "learning_rate": 7.65873015873016e-06, + "loss": 0.0368, + "step": 2144 + }, + { + "epoch": 8.478260869565217, + "grad_norm": 1.6224021911621094, + "learning_rate": 7.63888888888889e-06, + "loss": 0.0513, + "step": 2145 + }, + { + "epoch": 8.482213438735178, + "grad_norm": 1.7027612924575806, + "learning_rate": 7.6190476190476205e-06, + "loss": 0.0502, + "step": 2146 + }, + { + "epoch": 8.486166007905139, + "grad_norm": 1.5517146587371826, + "learning_rate": 7.599206349206349e-06, + "loss": 0.0515, + "step": 2147 + }, + { + "epoch": 8.4901185770751, + "grad_norm": 1.5171629190444946, + "learning_rate": 7.5793650793650795e-06, + "loss": 0.0509, + "step": 2148 + }, + { + "epoch": 8.494071146245059, + "grad_norm": 1.476023554801941, + "learning_rate": 7.559523809523809e-06, + "loss": 0.0498, + "step": 2149 + }, + { + "epoch": 8.49802371541502, + "grad_norm": 1.404647707939148, + "learning_rate": 7.5396825396825394e-06, + "loss": 0.0449, + "step": 2150 + }, + { + "epoch": 8.50197628458498, + "grad_norm": 1.2494837045669556, + "learning_rate": 7.51984126984127e-06, + "loss": 0.0503, + "step": 2151 + }, + { + "epoch": 8.505928853754941, + "grad_norm": 1.9429749250411987, + "learning_rate": 7.5e-06, + "loss": 0.0454, + "step": 2152 + }, + { + "epoch": 8.5098814229249, + "grad_norm": 1.3501893281936646, + "learning_rate": 7.4801587301587306e-06, + "loss": 0.0435, + "step": 2153 + }, + { + "epoch": 8.513833992094861, + "grad_norm": 1.3887948989868164, + "learning_rate": 7.460317460317461e-06, + "loss": 0.0479, + "step": 2154 + }, + { + "epoch": 8.517786561264822, + "grad_norm": 1.7705752849578857, + "learning_rate": 7.4404761904761905e-06, + "loss": 0.0579, + "step": 2155 + }, + { + "epoch": 8.521739130434783, + "grad_norm": 1.6018643379211426, + "learning_rate": 7.420634920634921e-06, + "loss": 0.0488, + "step": 2156 + }, + { + "epoch": 8.525691699604742, + "grad_norm": 1.2142655849456787, + "learning_rate": 7.400793650793651e-06, + "loss": 0.0474, + "step": 2157 + }, + { + "epoch": 8.529644268774703, + "grad_norm": 1.2533504962921143, + "learning_rate": 7.380952380952382e-06, + "loss": 0.0428, + "step": 2158 + }, + { + "epoch": 8.533596837944664, + "grad_norm": 1.5056265592575073, + "learning_rate": 7.361111111111112e-06, + "loss": 0.0412, + "step": 2159 + }, + { + "epoch": 8.537549407114625, + "grad_norm": 1.0978323221206665, + "learning_rate": 7.3412698412698415e-06, + "loss": 0.0429, + "step": 2160 + }, + { + "epoch": 8.541501976284586, + "grad_norm": 1.600511908531189, + "learning_rate": 7.321428571428572e-06, + "loss": 0.0506, + "step": 2161 + }, + { + "epoch": 8.545454545454545, + "grad_norm": 1.2966439723968506, + "learning_rate": 7.301587301587302e-06, + "loss": 0.0445, + "step": 2162 + }, + { + "epoch": 8.549407114624506, + "grad_norm": 1.1138356924057007, + "learning_rate": 7.281746031746033e-06, + "loss": 0.0362, + "step": 2163 + }, + { + "epoch": 8.553359683794467, + "grad_norm": 1.2294458150863647, + "learning_rate": 7.261904761904763e-06, + "loss": 0.0418, + "step": 2164 + }, + { + "epoch": 8.557312252964428, + "grad_norm": 2.5378119945526123, + "learning_rate": 7.242063492063493e-06, + "loss": 0.0703, + "step": 2165 + }, + { + "epoch": 8.561264822134387, + "grad_norm": 1.068594217300415, + "learning_rate": 7.222222222222222e-06, + "loss": 0.0407, + "step": 2166 + }, + { + "epoch": 8.565217391304348, + "grad_norm": 1.1735312938690186, + "learning_rate": 7.2023809523809524e-06, + "loss": 0.0435, + "step": 2167 + }, + { + "epoch": 8.569169960474309, + "grad_norm": 2.6890134811401367, + "learning_rate": 7.182539682539682e-06, + "loss": 0.0897, + "step": 2168 + }, + { + "epoch": 8.57312252964427, + "grad_norm": 1.2781907320022583, + "learning_rate": 7.162698412698412e-06, + "loss": 0.0517, + "step": 2169 + }, + { + "epoch": 8.577075098814229, + "grad_norm": 1.8715581893920898, + "learning_rate": 7.142857142857143e-06, + "loss": 0.0598, + "step": 2170 + }, + { + "epoch": 8.58102766798419, + "grad_norm": 2.0492424964904785, + "learning_rate": 7.123015873015873e-06, + "loss": 0.0498, + "step": 2171 + }, + { + "epoch": 8.58498023715415, + "grad_norm": 2.0802016258239746, + "learning_rate": 7.1031746031746035e-06, + "loss": 0.0543, + "step": 2172 + }, + { + "epoch": 8.588932806324111, + "grad_norm": 1.2858587503433228, + "learning_rate": 7.083333333333334e-06, + "loss": 0.0445, + "step": 2173 + }, + { + "epoch": 8.59288537549407, + "grad_norm": 1.2407563924789429, + "learning_rate": 7.063492063492063e-06, + "loss": 0.0393, + "step": 2174 + }, + { + "epoch": 8.596837944664031, + "grad_norm": 1.4286069869995117, + "learning_rate": 7.043650793650794e-06, + "loss": 0.0472, + "step": 2175 + }, + { + "epoch": 8.600790513833992, + "grad_norm": 2.2047770023345947, + "learning_rate": 7.023809523809524e-06, + "loss": 0.0527, + "step": 2176 + }, + { + "epoch": 8.604743083003953, + "grad_norm": 1.2905137538909912, + "learning_rate": 7.0039682539682545e-06, + "loss": 0.0412, + "step": 2177 + }, + { + "epoch": 8.608695652173914, + "grad_norm": 2.2183566093444824, + "learning_rate": 6.984126984126985e-06, + "loss": 0.0679, + "step": 2178 + }, + { + "epoch": 8.612648221343873, + "grad_norm": 1.4202773571014404, + "learning_rate": 6.964285714285715e-06, + "loss": 0.0462, + "step": 2179 + }, + { + "epoch": 8.616600790513834, + "grad_norm": 1.6715538501739502, + "learning_rate": 6.944444444444445e-06, + "loss": 0.0637, + "step": 2180 + }, + { + "epoch": 8.620553359683795, + "grad_norm": 1.480151891708374, + "learning_rate": 6.924603174603175e-06, + "loss": 0.0496, + "step": 2181 + }, + { + "epoch": 8.624505928853754, + "grad_norm": 1.6258138418197632, + "learning_rate": 6.9047619047619055e-06, + "loss": 0.0608, + "step": 2182 + }, + { + "epoch": 8.628458498023715, + "grad_norm": 1.6677876710891724, + "learning_rate": 6.884920634920636e-06, + "loss": 0.0547, + "step": 2183 + }, + { + "epoch": 8.632411067193676, + "grad_norm": 1.3751989603042603, + "learning_rate": 6.865079365079366e-06, + "loss": 0.0441, + "step": 2184 + }, + { + "epoch": 8.636363636363637, + "grad_norm": 1.4697294235229492, + "learning_rate": 6.845238095238096e-06, + "loss": 0.0485, + "step": 2185 + }, + { + "epoch": 8.640316205533598, + "grad_norm": 1.547059178352356, + "learning_rate": 6.825396825396825e-06, + "loss": 0.0515, + "step": 2186 + }, + { + "epoch": 8.644268774703557, + "grad_norm": 1.3061259984970093, + "learning_rate": 6.805555555555556e-06, + "loss": 0.0428, + "step": 2187 + }, + { + "epoch": 8.648221343873518, + "grad_norm": 1.2875500917434692, + "learning_rate": 6.785714285714285e-06, + "loss": 0.0422, + "step": 2188 + }, + { + "epoch": 8.652173913043478, + "grad_norm": 1.4154988527297974, + "learning_rate": 6.765873015873016e-06, + "loss": 0.051, + "step": 2189 + }, + { + "epoch": 8.65612648221344, + "grad_norm": 1.7555774450302124, + "learning_rate": 6.746031746031746e-06, + "loss": 0.0484, + "step": 2190 + }, + { + "epoch": 8.660079051383399, + "grad_norm": 1.8239620923995972, + "learning_rate": 6.726190476190476e-06, + "loss": 0.0498, + "step": 2191 + }, + { + "epoch": 8.66403162055336, + "grad_norm": 1.1520577669143677, + "learning_rate": 6.706349206349207e-06, + "loss": 0.042, + "step": 2192 + }, + { + "epoch": 8.66798418972332, + "grad_norm": 1.5023807287216187, + "learning_rate": 6.686507936507936e-06, + "loss": 0.0578, + "step": 2193 + }, + { + "epoch": 8.671936758893281, + "grad_norm": 2.02333927154541, + "learning_rate": 6.666666666666667e-06, + "loss": 0.05, + "step": 2194 + }, + { + "epoch": 8.67588932806324, + "grad_norm": 1.4692609310150146, + "learning_rate": 6.646825396825397e-06, + "loss": 0.0454, + "step": 2195 + }, + { + "epoch": 8.679841897233201, + "grad_norm": 1.0825679302215576, + "learning_rate": 6.626984126984127e-06, + "loss": 0.0362, + "step": 2196 + }, + { + "epoch": 8.683794466403162, + "grad_norm": 1.203744649887085, + "learning_rate": 6.607142857142858e-06, + "loss": 0.0464, + "step": 2197 + }, + { + "epoch": 8.687747035573123, + "grad_norm": 2.339820623397827, + "learning_rate": 6.587301587301588e-06, + "loss": 0.0745, + "step": 2198 + }, + { + "epoch": 8.691699604743082, + "grad_norm": 1.2836312055587769, + "learning_rate": 6.567460317460318e-06, + "loss": 0.0443, + "step": 2199 + }, + { + "epoch": 8.695652173913043, + "grad_norm": 2.841017723083496, + "learning_rate": 6.547619047619048e-06, + "loss": 0.0571, + "step": 2200 + }, + { + "epoch": 8.699604743083004, + "grad_norm": 1.3378969430923462, + "learning_rate": 6.5277777777777784e-06, + "loss": 0.0398, + "step": 2201 + }, + { + "epoch": 8.703557312252965, + "grad_norm": 3.261378526687622, + "learning_rate": 6.507936507936509e-06, + "loss": 0.044, + "step": 2202 + }, + { + "epoch": 8.707509881422926, + "grad_norm": 1.5358128547668457, + "learning_rate": 6.488095238095239e-06, + "loss": 0.0509, + "step": 2203 + }, + { + "epoch": 8.711462450592885, + "grad_norm": 1.134779930114746, + "learning_rate": 6.4682539682539696e-06, + "loss": 0.0338, + "step": 2204 + }, + { + "epoch": 8.715415019762846, + "grad_norm": 1.4963998794555664, + "learning_rate": 6.448412698412699e-06, + "loss": 0.0444, + "step": 2205 + }, + { + "epoch": 8.719367588932807, + "grad_norm": 1.054849624633789, + "learning_rate": 6.428571428571429e-06, + "loss": 0.0365, + "step": 2206 + }, + { + "epoch": 8.723320158102768, + "grad_norm": 1.3599568605422974, + "learning_rate": 6.408730158730158e-06, + "loss": 0.0443, + "step": 2207 + }, + { + "epoch": 8.727272727272727, + "grad_norm": 2.381058931350708, + "learning_rate": 6.3888888888888885e-06, + "loss": 0.0691, + "step": 2208 + }, + { + "epoch": 8.731225296442688, + "grad_norm": 1.278258204460144, + "learning_rate": 6.369047619047619e-06, + "loss": 0.051, + "step": 2209 + }, + { + "epoch": 8.735177865612648, + "grad_norm": 1.1571305990219116, + "learning_rate": 6.349206349206349e-06, + "loss": 0.0556, + "step": 2210 + }, + { + "epoch": 8.73913043478261, + "grad_norm": 1.4655886888504028, + "learning_rate": 6.32936507936508e-06, + "loss": 0.0484, + "step": 2211 + }, + { + "epoch": 8.743083003952568, + "grad_norm": 1.1832677125930786, + "learning_rate": 6.30952380952381e-06, + "loss": 0.0396, + "step": 2212 + }, + { + "epoch": 8.74703557312253, + "grad_norm": 1.5218229293823242, + "learning_rate": 6.2896825396825395e-06, + "loss": 0.0535, + "step": 2213 + }, + { + "epoch": 8.75098814229249, + "grad_norm": 1.2771645784378052, + "learning_rate": 6.26984126984127e-06, + "loss": 0.0454, + "step": 2214 + }, + { + "epoch": 8.754940711462451, + "grad_norm": 1.9904732704162598, + "learning_rate": 6.25e-06, + "loss": 0.0582, + "step": 2215 + }, + { + "epoch": 8.75889328063241, + "grad_norm": 1.5673877000808716, + "learning_rate": 6.230158730158731e-06, + "loss": 0.0481, + "step": 2216 + }, + { + "epoch": 8.762845849802371, + "grad_norm": 1.2909679412841797, + "learning_rate": 6.210317460317461e-06, + "loss": 0.0427, + "step": 2217 + }, + { + "epoch": 8.766798418972332, + "grad_norm": 1.0783694982528687, + "learning_rate": 6.190476190476191e-06, + "loss": 0.0379, + "step": 2218 + }, + { + "epoch": 8.770750988142293, + "grad_norm": 1.3569505214691162, + "learning_rate": 6.170634920634921e-06, + "loss": 0.0438, + "step": 2219 + }, + { + "epoch": 8.774703557312254, + "grad_norm": 1.4521833658218384, + "learning_rate": 6.1507936507936505e-06, + "loss": 0.0533, + "step": 2220 + }, + { + "epoch": 8.778656126482213, + "grad_norm": 1.4218260049819946, + "learning_rate": 6.130952380952381e-06, + "loss": 0.0499, + "step": 2221 + }, + { + "epoch": 8.782608695652174, + "grad_norm": 1.4923803806304932, + "learning_rate": 6.111111111111111e-06, + "loss": 0.066, + "step": 2222 + }, + { + "epoch": 8.786561264822135, + "grad_norm": 1.5490056276321411, + "learning_rate": 6.091269841269842e-06, + "loss": 0.0537, + "step": 2223 + }, + { + "epoch": 8.790513833992096, + "grad_norm": 2.071575164794922, + "learning_rate": 6.071428571428572e-06, + "loss": 0.0601, + "step": 2224 + }, + { + "epoch": 8.794466403162055, + "grad_norm": 1.5150130987167358, + "learning_rate": 6.0515873015873015e-06, + "loss": 0.0486, + "step": 2225 + }, + { + "epoch": 8.798418972332016, + "grad_norm": 1.5356022119522095, + "learning_rate": 6.031746031746032e-06, + "loss": 0.0446, + "step": 2226 + }, + { + "epoch": 8.802371541501977, + "grad_norm": 1.9417153596878052, + "learning_rate": 6.011904761904762e-06, + "loss": 0.0542, + "step": 2227 + }, + { + "epoch": 8.806324110671937, + "grad_norm": 1.9926491975784302, + "learning_rate": 5.992063492063493e-06, + "loss": 0.0664, + "step": 2228 + }, + { + "epoch": 8.810276679841897, + "grad_norm": 1.1277952194213867, + "learning_rate": 5.972222222222223e-06, + "loss": 0.0478, + "step": 2229 + }, + { + "epoch": 8.814229249011857, + "grad_norm": 1.2221788167953491, + "learning_rate": 5.9523809523809525e-06, + "loss": 0.0366, + "step": 2230 + }, + { + "epoch": 8.818181818181818, + "grad_norm": 1.5764946937561035, + "learning_rate": 5.932539682539683e-06, + "loss": 0.0477, + "step": 2231 + }, + { + "epoch": 8.82213438735178, + "grad_norm": 1.4645206928253174, + "learning_rate": 5.9126984126984124e-06, + "loss": 0.0418, + "step": 2232 + }, + { + "epoch": 8.826086956521738, + "grad_norm": 1.9131215810775757, + "learning_rate": 5.892857142857143e-06, + "loss": 0.0689, + "step": 2233 + }, + { + "epoch": 8.8300395256917, + "grad_norm": 1.1039915084838867, + "learning_rate": 5.873015873015873e-06, + "loss": 0.035, + "step": 2234 + }, + { + "epoch": 8.83399209486166, + "grad_norm": 1.4914485216140747, + "learning_rate": 5.8531746031746036e-06, + "loss": 0.0451, + "step": 2235 + }, + { + "epoch": 8.837944664031621, + "grad_norm": 2.195981502532959, + "learning_rate": 5.833333333333334e-06, + "loss": 0.0672, + "step": 2236 + }, + { + "epoch": 8.841897233201582, + "grad_norm": 1.5078537464141846, + "learning_rate": 5.813492063492064e-06, + "loss": 0.0416, + "step": 2237 + }, + { + "epoch": 8.845849802371541, + "grad_norm": 1.2279962301254272, + "learning_rate": 5.793650793650794e-06, + "loss": 0.0414, + "step": 2238 + }, + { + "epoch": 8.849802371541502, + "grad_norm": 1.8902143239974976, + "learning_rate": 5.773809523809524e-06, + "loss": 0.0677, + "step": 2239 + }, + { + "epoch": 8.853754940711463, + "grad_norm": 0.9590296745300293, + "learning_rate": 5.753968253968254e-06, + "loss": 0.0366, + "step": 2240 + }, + { + "epoch": 8.857707509881424, + "grad_norm": 1.2233220338821411, + "learning_rate": 5.734126984126984e-06, + "loss": 0.0498, + "step": 2241 + }, + { + "epoch": 8.861660079051383, + "grad_norm": 1.215208888053894, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.049, + "step": 2242 + }, + { + "epoch": 8.865612648221344, + "grad_norm": 1.1311190128326416, + "learning_rate": 5.694444444444445e-06, + "loss": 0.0442, + "step": 2243 + }, + { + "epoch": 8.869565217391305, + "grad_norm": 1.8418809175491333, + "learning_rate": 5.674603174603174e-06, + "loss": 0.0535, + "step": 2244 + }, + { + "epoch": 8.873517786561266, + "grad_norm": 1.3625643253326416, + "learning_rate": 5.654761904761905e-06, + "loss": 0.0461, + "step": 2245 + }, + { + "epoch": 8.877470355731225, + "grad_norm": 1.0108672380447388, + "learning_rate": 5.634920634920635e-06, + "loss": 0.0362, + "step": 2246 + }, + { + "epoch": 8.881422924901186, + "grad_norm": 1.096070647239685, + "learning_rate": 5.6150793650793655e-06, + "loss": 0.0467, + "step": 2247 + }, + { + "epoch": 8.885375494071146, + "grad_norm": 1.4030896425247192, + "learning_rate": 5.595238095238096e-06, + "loss": 0.0459, + "step": 2248 + }, + { + "epoch": 8.889328063241107, + "grad_norm": 1.9264237880706787, + "learning_rate": 5.575396825396826e-06, + "loss": 0.0593, + "step": 2249 + }, + { + "epoch": 8.893280632411066, + "grad_norm": 1.8848391771316528, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0654, + "step": 2250 + }, + { + "epoch": 8.897233201581027, + "grad_norm": 1.1368132829666138, + "learning_rate": 5.535714285714285e-06, + "loss": 0.0356, + "step": 2251 + }, + { + "epoch": 8.901185770750988, + "grad_norm": 1.4050102233886719, + "learning_rate": 5.515873015873016e-06, + "loss": 0.0568, + "step": 2252 + }, + { + "epoch": 8.90513833992095, + "grad_norm": 1.6391522884368896, + "learning_rate": 5.496031746031746e-06, + "loss": 0.0512, + "step": 2253 + }, + { + "epoch": 8.909090909090908, + "grad_norm": 1.2339924573898315, + "learning_rate": 5.4761904761904765e-06, + "loss": 0.0423, + "step": 2254 + }, + { + "epoch": 8.91304347826087, + "grad_norm": 1.334693431854248, + "learning_rate": 5.456349206349207e-06, + "loss": 0.043, + "step": 2255 + }, + { + "epoch": 8.91699604743083, + "grad_norm": 1.6208196878433228, + "learning_rate": 5.436507936507937e-06, + "loss": 0.0515, + "step": 2256 + }, + { + "epoch": 8.920948616600791, + "grad_norm": 1.959010124206543, + "learning_rate": 5.416666666666667e-06, + "loss": 0.0591, + "step": 2257 + }, + { + "epoch": 8.92490118577075, + "grad_norm": 1.2464101314544678, + "learning_rate": 5.396825396825397e-06, + "loss": 0.0444, + "step": 2258 + }, + { + "epoch": 8.928853754940711, + "grad_norm": 1.113886833190918, + "learning_rate": 5.3769841269841275e-06, + "loss": 0.045, + "step": 2259 + }, + { + "epoch": 8.932806324110672, + "grad_norm": 1.9017833471298218, + "learning_rate": 5.357142857142857e-06, + "loss": 0.0586, + "step": 2260 + }, + { + "epoch": 8.936758893280633, + "grad_norm": 1.6921342611312866, + "learning_rate": 5.337301587301587e-06, + "loss": 0.0678, + "step": 2261 + }, + { + "epoch": 8.940711462450594, + "grad_norm": 1.1940516233444214, + "learning_rate": 5.317460317460318e-06, + "loss": 0.0476, + "step": 2262 + }, + { + "epoch": 8.944664031620553, + "grad_norm": 1.899581789970398, + "learning_rate": 5.297619047619048e-06, + "loss": 0.0688, + "step": 2263 + }, + { + "epoch": 8.948616600790514, + "grad_norm": 1.1794816255569458, + "learning_rate": 5.277777777777778e-06, + "loss": 0.0459, + "step": 2264 + }, + { + "epoch": 8.952569169960475, + "grad_norm": 1.233652949333191, + "learning_rate": 5.257936507936508e-06, + "loss": 0.0436, + "step": 2265 + }, + { + "epoch": 8.956521739130435, + "grad_norm": 1.3263018131256104, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.046, + "step": 2266 + }, + { + "epoch": 8.960474308300395, + "grad_norm": 1.1982510089874268, + "learning_rate": 5.218253968253969e-06, + "loss": 0.0414, + "step": 2267 + }, + { + "epoch": 8.964426877470355, + "grad_norm": 2.1782376766204834, + "learning_rate": 5.198412698412699e-06, + "loss": 0.075, + "step": 2268 + }, + { + "epoch": 8.968379446640316, + "grad_norm": 1.3456032276153564, + "learning_rate": 5.1785714285714296e-06, + "loss": 0.0451, + "step": 2269 + }, + { + "epoch": 8.972332015810277, + "grad_norm": 1.7961373329162598, + "learning_rate": 5.158730158730159e-06, + "loss": 0.0464, + "step": 2270 + }, + { + "epoch": 8.976284584980236, + "grad_norm": 1.0362279415130615, + "learning_rate": 5.138888888888889e-06, + "loss": 0.0374, + "step": 2271 + }, + { + "epoch": 8.980237154150197, + "grad_norm": 1.464831829071045, + "learning_rate": 5.119047619047619e-06, + "loss": 0.0419, + "step": 2272 + }, + { + "epoch": 8.984189723320158, + "grad_norm": 1.7607347965240479, + "learning_rate": 5.099206349206349e-06, + "loss": 0.053, + "step": 2273 + }, + { + "epoch": 8.988142292490119, + "grad_norm": 1.2373989820480347, + "learning_rate": 5.07936507936508e-06, + "loss": 0.05, + "step": 2274 + }, + { + "epoch": 8.992094861660078, + "grad_norm": 1.8209704160690308, + "learning_rate": 5.05952380952381e-06, + "loss": 0.0494, + "step": 2275 + }, + { + "epoch": 8.996047430830039, + "grad_norm": 1.3406612873077393, + "learning_rate": 5.03968253968254e-06, + "loss": 0.0461, + "step": 2276 + }, + { + "epoch": 9.0, + "grad_norm": 1.2392350435256958, + "learning_rate": 5.01984126984127e-06, + "loss": 0.0467, + "step": 2277 + }, + { + "epoch": 9.003952569169961, + "grad_norm": 0.9584967494010925, + "learning_rate": 5e-06, + "loss": 0.0403, + "step": 2278 + }, + { + "epoch": 9.007905138339922, + "grad_norm": 0.5832052826881409, + "learning_rate": 4.98015873015873e-06, + "loss": 0.0321, + "step": 2279 + }, + { + "epoch": 9.011857707509881, + "grad_norm": 1.2194710969924927, + "learning_rate": 4.96031746031746e-06, + "loss": 0.0532, + "step": 2280 + }, + { + "epoch": 9.015810276679842, + "grad_norm": 0.5771675705909729, + "learning_rate": 4.940476190476191e-06, + "loss": 0.028, + "step": 2281 + }, + { + "epoch": 9.019762845849803, + "grad_norm": 1.116558313369751, + "learning_rate": 4.920634920634921e-06, + "loss": 0.0355, + "step": 2282 + }, + { + "epoch": 9.023715415019764, + "grad_norm": 0.4172620475292206, + "learning_rate": 4.900793650793651e-06, + "loss": 0.0239, + "step": 2283 + }, + { + "epoch": 9.027667984189723, + "grad_norm": 0.6424721479415894, + "learning_rate": 4.880952380952381e-06, + "loss": 0.027, + "step": 2284 + }, + { + "epoch": 9.031620553359684, + "grad_norm": 1.7028534412384033, + "learning_rate": 4.861111111111111e-06, + "loss": 0.0455, + "step": 2285 + }, + { + "epoch": 9.035573122529645, + "grad_norm": 0.9933211803436279, + "learning_rate": 4.841269841269842e-06, + "loss": 0.033, + "step": 2286 + }, + { + "epoch": 9.039525691699605, + "grad_norm": 0.9646832942962646, + "learning_rate": 4.821428571428572e-06, + "loss": 0.0386, + "step": 2287 + }, + { + "epoch": 9.043478260869565, + "grad_norm": 0.6727697253227234, + "learning_rate": 4.8015873015873025e-06, + "loss": 0.0352, + "step": 2288 + }, + { + "epoch": 9.047430830039525, + "grad_norm": 0.7543931603431702, + "learning_rate": 4.781746031746032e-06, + "loss": 0.0295, + "step": 2289 + }, + { + "epoch": 9.051383399209486, + "grad_norm": 0.46685945987701416, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.0243, + "step": 2290 + }, + { + "epoch": 9.055335968379447, + "grad_norm": 0.4817000925540924, + "learning_rate": 4.742063492063492e-06, + "loss": 0.0292, + "step": 2291 + }, + { + "epoch": 9.059288537549406, + "grad_norm": 0.8889188170433044, + "learning_rate": 4.722222222222222e-06, + "loss": 0.0369, + "step": 2292 + }, + { + "epoch": 9.063241106719367, + "grad_norm": 0.5639324188232422, + "learning_rate": 4.702380952380953e-06, + "loss": 0.0307, + "step": 2293 + }, + { + "epoch": 9.067193675889328, + "grad_norm": 0.6505725383758545, + "learning_rate": 4.682539682539683e-06, + "loss": 0.0313, + "step": 2294 + }, + { + "epoch": 9.071146245059289, + "grad_norm": 0.9556856751441956, + "learning_rate": 4.662698412698413e-06, + "loss": 0.0463, + "step": 2295 + }, + { + "epoch": 9.075098814229248, + "grad_norm": 0.49648115038871765, + "learning_rate": 4.642857142857143e-06, + "loss": 0.0246, + "step": 2296 + }, + { + "epoch": 9.079051383399209, + "grad_norm": 0.877655029296875, + "learning_rate": 4.623015873015873e-06, + "loss": 0.0309, + "step": 2297 + }, + { + "epoch": 9.08300395256917, + "grad_norm": 1.1071048974990845, + "learning_rate": 4.603174603174604e-06, + "loss": 0.0437, + "step": 2298 + }, + { + "epoch": 9.08695652173913, + "grad_norm": 1.238307237625122, + "learning_rate": 4.583333333333333e-06, + "loss": 0.0346, + "step": 2299 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.9343061447143555, + "learning_rate": 4.563492063492064e-06, + "loss": 0.0306, + "step": 2300 + }, + { + "epoch": 9.09486166007905, + "grad_norm": 1.3237608671188354, + "learning_rate": 4.543650793650794e-06, + "loss": 0.0405, + "step": 2301 + }, + { + "epoch": 9.098814229249012, + "grad_norm": 1.2943129539489746, + "learning_rate": 4.5238095238095235e-06, + "loss": 0.0414, + "step": 2302 + }, + { + "epoch": 9.102766798418973, + "grad_norm": 0.4701171815395355, + "learning_rate": 4.503968253968254e-06, + "loss": 0.0275, + "step": 2303 + }, + { + "epoch": 9.106719367588934, + "grad_norm": 1.5329245328903198, + "learning_rate": 4.484126984126984e-06, + "loss": 0.0412, + "step": 2304 + }, + { + "epoch": 9.110671936758893, + "grad_norm": 1.2468199729919434, + "learning_rate": 4.464285714285715e-06, + "loss": 0.0361, + "step": 2305 + }, + { + "epoch": 9.114624505928854, + "grad_norm": 0.4569074511528015, + "learning_rate": 4.444444444444445e-06, + "loss": 0.0251, + "step": 2306 + }, + { + "epoch": 9.118577075098814, + "grad_norm": 0.7745764255523682, + "learning_rate": 4.424603174603175e-06, + "loss": 0.033, + "step": 2307 + }, + { + "epoch": 9.122529644268775, + "grad_norm": 0.8108665943145752, + "learning_rate": 4.404761904761905e-06, + "loss": 0.0239, + "step": 2308 + }, + { + "epoch": 9.126482213438734, + "grad_norm": 0.8088359832763672, + "learning_rate": 4.3849206349206344e-06, + "loss": 0.0303, + "step": 2309 + }, + { + "epoch": 9.130434782608695, + "grad_norm": 0.9375463724136353, + "learning_rate": 4.365079365079365e-06, + "loss": 0.0593, + "step": 2310 + }, + { + "epoch": 9.134387351778656, + "grad_norm": 1.1133482456207275, + "learning_rate": 4.345238095238095e-06, + "loss": 0.0357, + "step": 2311 + }, + { + "epoch": 9.138339920948617, + "grad_norm": 0.5319989919662476, + "learning_rate": 4.3253968253968256e-06, + "loss": 0.029, + "step": 2312 + }, + { + "epoch": 9.142292490118576, + "grad_norm": 0.9559447169303894, + "learning_rate": 4.305555555555556e-06, + "loss": 0.0377, + "step": 2313 + }, + { + "epoch": 9.146245059288537, + "grad_norm": 1.1195396184921265, + "learning_rate": 4.285714285714286e-06, + "loss": 0.0254, + "step": 2314 + }, + { + "epoch": 9.150197628458498, + "grad_norm": 0.6906282901763916, + "learning_rate": 4.265873015873016e-06, + "loss": 0.0417, + "step": 2315 + }, + { + "epoch": 9.154150197628459, + "grad_norm": 0.730162501335144, + "learning_rate": 4.246031746031746e-06, + "loss": 0.0304, + "step": 2316 + }, + { + "epoch": 9.15810276679842, + "grad_norm": 0.8155921697616577, + "learning_rate": 4.226190476190477e-06, + "loss": 0.0334, + "step": 2317 + }, + { + "epoch": 9.162055335968379, + "grad_norm": 1.145893931388855, + "learning_rate": 4.206349206349207e-06, + "loss": 0.0472, + "step": 2318 + }, + { + "epoch": 9.16600790513834, + "grad_norm": 0.6284162998199463, + "learning_rate": 4.1865079365079365e-06, + "loss": 0.0302, + "step": 2319 + }, + { + "epoch": 9.1699604743083, + "grad_norm": 3.132331371307373, + "learning_rate": 4.166666666666667e-06, + "loss": 0.0671, + "step": 2320 + }, + { + "epoch": 9.173913043478262, + "grad_norm": 0.725439190864563, + "learning_rate": 4.146825396825397e-06, + "loss": 0.029, + "step": 2321 + }, + { + "epoch": 9.17786561264822, + "grad_norm": 0.6807708144187927, + "learning_rate": 4.126984126984127e-06, + "loss": 0.0265, + "step": 2322 + }, + { + "epoch": 9.181818181818182, + "grad_norm": 1.1239207983016968, + "learning_rate": 4.107142857142857e-06, + "loss": 0.0428, + "step": 2323 + }, + { + "epoch": 9.185770750988143, + "grad_norm": 0.6186617612838745, + "learning_rate": 4.0873015873015875e-06, + "loss": 0.026, + "step": 2324 + }, + { + "epoch": 9.189723320158103, + "grad_norm": 0.5834595561027527, + "learning_rate": 4.067460317460318e-06, + "loss": 0.0281, + "step": 2325 + }, + { + "epoch": 9.193675889328063, + "grad_norm": 0.7775494456291199, + "learning_rate": 4.047619047619048e-06, + "loss": 0.027, + "step": 2326 + }, + { + "epoch": 9.197628458498023, + "grad_norm": 0.5415392518043518, + "learning_rate": 4.027777777777779e-06, + "loss": 0.0291, + "step": 2327 + }, + { + "epoch": 9.201581027667984, + "grad_norm": 1.0766407251358032, + "learning_rate": 4.007936507936508e-06, + "loss": 0.0347, + "step": 2328 + }, + { + "epoch": 9.205533596837945, + "grad_norm": 0.705011785030365, + "learning_rate": 3.988095238095238e-06, + "loss": 0.0274, + "step": 2329 + }, + { + "epoch": 9.209486166007904, + "grad_norm": 1.0642330646514893, + "learning_rate": 3.968253968253968e-06, + "loss": 0.0432, + "step": 2330 + }, + { + "epoch": 9.213438735177865, + "grad_norm": 0.8075931072235107, + "learning_rate": 3.9484126984126985e-06, + "loss": 0.031, + "step": 2331 + }, + { + "epoch": 9.217391304347826, + "grad_norm": 0.5113716721534729, + "learning_rate": 3.928571428571429e-06, + "loss": 0.0224, + "step": 2332 + }, + { + "epoch": 9.221343873517787, + "grad_norm": 0.9608832597732544, + "learning_rate": 3.908730158730159e-06, + "loss": 0.0319, + "step": 2333 + }, + { + "epoch": 9.225296442687746, + "grad_norm": 1.2569172382354736, + "learning_rate": 3.888888888888889e-06, + "loss": 0.0328, + "step": 2334 + }, + { + "epoch": 9.229249011857707, + "grad_norm": 0.7299090027809143, + "learning_rate": 3.869047619047619e-06, + "loss": 0.0258, + "step": 2335 + }, + { + "epoch": 9.233201581027668, + "grad_norm": 0.5903069972991943, + "learning_rate": 3.8492063492063495e-06, + "loss": 0.0313, + "step": 2336 + }, + { + "epoch": 9.237154150197629, + "grad_norm": 0.715298056602478, + "learning_rate": 3.82936507936508e-06, + "loss": 0.042, + "step": 2337 + }, + { + "epoch": 9.24110671936759, + "grad_norm": 0.6976611018180847, + "learning_rate": 3.8095238095238102e-06, + "loss": 0.0399, + "step": 2338 + }, + { + "epoch": 9.245059288537549, + "grad_norm": 0.9303411245346069, + "learning_rate": 3.7896825396825398e-06, + "loss": 0.0319, + "step": 2339 + }, + { + "epoch": 9.24901185770751, + "grad_norm": 0.706676721572876, + "learning_rate": 3.7698412698412697e-06, + "loss": 0.0332, + "step": 2340 + }, + { + "epoch": 9.25296442687747, + "grad_norm": 0.553431510925293, + "learning_rate": 3.75e-06, + "loss": 0.0339, + "step": 2341 + }, + { + "epoch": 9.256916996047432, + "grad_norm": 1.1647964715957642, + "learning_rate": 3.7301587301587305e-06, + "loss": 0.0277, + "step": 2342 + }, + { + "epoch": 9.26086956521739, + "grad_norm": 0.5181522369384766, + "learning_rate": 3.7103174603174604e-06, + "loss": 0.0231, + "step": 2343 + }, + { + "epoch": 9.264822134387352, + "grad_norm": 1.1019365787506104, + "learning_rate": 3.690476190476191e-06, + "loss": 0.0654, + "step": 2344 + }, + { + "epoch": 9.268774703557312, + "grad_norm": 0.6852990984916687, + "learning_rate": 3.6706349206349208e-06, + "loss": 0.0254, + "step": 2345 + }, + { + "epoch": 9.272727272727273, + "grad_norm": 0.6852630972862244, + "learning_rate": 3.650793650793651e-06, + "loss": 0.028, + "step": 2346 + }, + { + "epoch": 9.276679841897232, + "grad_norm": 0.79278165102005, + "learning_rate": 3.6309523809523815e-06, + "loss": 0.0321, + "step": 2347 + }, + { + "epoch": 9.280632411067193, + "grad_norm": 1.3321659564971924, + "learning_rate": 3.611111111111111e-06, + "loss": 0.0305, + "step": 2348 + }, + { + "epoch": 9.284584980237154, + "grad_norm": 1.0156902074813843, + "learning_rate": 3.591269841269841e-06, + "loss": 0.0331, + "step": 2349 + }, + { + "epoch": 9.288537549407115, + "grad_norm": 0.9449909329414368, + "learning_rate": 3.5714285714285714e-06, + "loss": 0.0291, + "step": 2350 + }, + { + "epoch": 9.292490118577074, + "grad_norm": 0.6974058747291565, + "learning_rate": 3.5515873015873017e-06, + "loss": 0.0369, + "step": 2351 + }, + { + "epoch": 9.296442687747035, + "grad_norm": 0.6614970564842224, + "learning_rate": 3.5317460317460317e-06, + "loss": 0.0245, + "step": 2352 + }, + { + "epoch": 9.300395256916996, + "grad_norm": 1.0674023628234863, + "learning_rate": 3.511904761904762e-06, + "loss": 0.0361, + "step": 2353 + }, + { + "epoch": 9.304347826086957, + "grad_norm": 0.8966097831726074, + "learning_rate": 3.4920634920634924e-06, + "loss": 0.0399, + "step": 2354 + }, + { + "epoch": 9.308300395256918, + "grad_norm": 0.8403730988502502, + "learning_rate": 3.4722222222222224e-06, + "loss": 0.0403, + "step": 2355 + }, + { + "epoch": 9.312252964426877, + "grad_norm": 0.9505050182342529, + "learning_rate": 3.4523809523809528e-06, + "loss": 0.0277, + "step": 2356 + }, + { + "epoch": 9.316205533596838, + "grad_norm": 0.5641964673995972, + "learning_rate": 3.432539682539683e-06, + "loss": 0.0316, + "step": 2357 + }, + { + "epoch": 9.320158102766799, + "grad_norm": 0.9463332891464233, + "learning_rate": 3.4126984126984127e-06, + "loss": 0.0395, + "step": 2358 + }, + { + "epoch": 9.32411067193676, + "grad_norm": 1.2878776788711548, + "learning_rate": 3.3928571428571426e-06, + "loss": 0.0397, + "step": 2359 + }, + { + "epoch": 9.328063241106719, + "grad_norm": 1.2284893989562988, + "learning_rate": 3.373015873015873e-06, + "loss": 0.0421, + "step": 2360 + }, + { + "epoch": 9.33201581027668, + "grad_norm": 0.9104984402656555, + "learning_rate": 3.3531746031746034e-06, + "loss": 0.0384, + "step": 2361 + }, + { + "epoch": 9.33596837944664, + "grad_norm": 0.7004300951957703, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0506, + "step": 2362 + }, + { + "epoch": 9.339920948616601, + "grad_norm": 0.9767426252365112, + "learning_rate": 3.3134920634920637e-06, + "loss": 0.0415, + "step": 2363 + }, + { + "epoch": 9.34387351778656, + "grad_norm": 0.7821481227874756, + "learning_rate": 3.293650793650794e-06, + "loss": 0.031, + "step": 2364 + }, + { + "epoch": 9.347826086956522, + "grad_norm": 0.9590467810630798, + "learning_rate": 3.273809523809524e-06, + "loss": 0.0393, + "step": 2365 + }, + { + "epoch": 9.351778656126482, + "grad_norm": 0.6908400654792786, + "learning_rate": 3.2539682539682544e-06, + "loss": 0.0405, + "step": 2366 + }, + { + "epoch": 9.355731225296443, + "grad_norm": 0.8684214949607849, + "learning_rate": 3.2341269841269848e-06, + "loss": 0.0313, + "step": 2367 + }, + { + "epoch": 9.359683794466402, + "grad_norm": 0.5750323534011841, + "learning_rate": 3.2142857142857143e-06, + "loss": 0.0323, + "step": 2368 + }, + { + "epoch": 9.363636363636363, + "grad_norm": 0.7478132247924805, + "learning_rate": 3.1944444444444443e-06, + "loss": 0.0299, + "step": 2369 + }, + { + "epoch": 9.367588932806324, + "grad_norm": 0.4561312794685364, + "learning_rate": 3.1746031746031746e-06, + "loss": 0.0263, + "step": 2370 + }, + { + "epoch": 9.371541501976285, + "grad_norm": 0.856050968170166, + "learning_rate": 3.154761904761905e-06, + "loss": 0.032, + "step": 2371 + }, + { + "epoch": 9.375494071146244, + "grad_norm": 0.4932451844215393, + "learning_rate": 3.134920634920635e-06, + "loss": 0.0264, + "step": 2372 + }, + { + "epoch": 9.379446640316205, + "grad_norm": 0.6153606176376343, + "learning_rate": 3.1150793650793653e-06, + "loss": 0.0281, + "step": 2373 + }, + { + "epoch": 9.383399209486166, + "grad_norm": 0.8731127381324768, + "learning_rate": 3.0952380952380953e-06, + "loss": 0.0324, + "step": 2374 + }, + { + "epoch": 9.387351778656127, + "grad_norm": 0.7539214491844177, + "learning_rate": 3.0753968253968252e-06, + "loss": 0.0375, + "step": 2375 + }, + { + "epoch": 9.391304347826088, + "grad_norm": 1.1730583906173706, + "learning_rate": 3.0555555555555556e-06, + "loss": 0.0359, + "step": 2376 + }, + { + "epoch": 9.395256916996047, + "grad_norm": 0.7655700445175171, + "learning_rate": 3.035714285714286e-06, + "loss": 0.0297, + "step": 2377 + }, + { + "epoch": 9.399209486166008, + "grad_norm": 0.5464181900024414, + "learning_rate": 3.015873015873016e-06, + "loss": 0.0272, + "step": 2378 + }, + { + "epoch": 9.403162055335969, + "grad_norm": 1.2985944747924805, + "learning_rate": 2.9960317460317463e-06, + "loss": 0.0368, + "step": 2379 + }, + { + "epoch": 9.40711462450593, + "grad_norm": 0.7202474474906921, + "learning_rate": 2.9761904761904763e-06, + "loss": 0.0431, + "step": 2380 + }, + { + "epoch": 9.411067193675889, + "grad_norm": 0.859272301197052, + "learning_rate": 2.9563492063492062e-06, + "loss": 0.0339, + "step": 2381 + }, + { + "epoch": 9.41501976284585, + "grad_norm": 0.9698725342750549, + "learning_rate": 2.9365079365079366e-06, + "loss": 0.0264, + "step": 2382 + }, + { + "epoch": 9.41897233201581, + "grad_norm": 0.8615301251411438, + "learning_rate": 2.916666666666667e-06, + "loss": 0.0291, + "step": 2383 + }, + { + "epoch": 9.422924901185771, + "grad_norm": 0.5525624752044678, + "learning_rate": 2.896825396825397e-06, + "loss": 0.0368, + "step": 2384 + }, + { + "epoch": 9.42687747035573, + "grad_norm": 1.0065622329711914, + "learning_rate": 2.876984126984127e-06, + "loss": 0.0386, + "step": 2385 + }, + { + "epoch": 9.430830039525691, + "grad_norm": 1.2710402011871338, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0432, + "step": 2386 + }, + { + "epoch": 9.434782608695652, + "grad_norm": 0.5823608636856079, + "learning_rate": 2.837301587301587e-06, + "loss": 0.0302, + "step": 2387 + }, + { + "epoch": 9.438735177865613, + "grad_norm": 0.8949801325798035, + "learning_rate": 2.8174603174603176e-06, + "loss": 0.0464, + "step": 2388 + }, + { + "epoch": 9.442687747035572, + "grad_norm": 0.6641007661819458, + "learning_rate": 2.797619047619048e-06, + "loss": 0.0232, + "step": 2389 + }, + { + "epoch": 9.446640316205533, + "grad_norm": 0.9579842686653137, + "learning_rate": 2.777777777777778e-06, + "loss": 0.0421, + "step": 2390 + }, + { + "epoch": 9.450592885375494, + "grad_norm": 0.843721866607666, + "learning_rate": 2.757936507936508e-06, + "loss": 0.0329, + "step": 2391 + }, + { + "epoch": 9.454545454545455, + "grad_norm": 0.8626483082771301, + "learning_rate": 2.7380952380952382e-06, + "loss": 0.0407, + "step": 2392 + }, + { + "epoch": 9.458498023715414, + "grad_norm": 0.610456109046936, + "learning_rate": 2.7182539682539686e-06, + "loss": 0.0307, + "step": 2393 + }, + { + "epoch": 9.462450592885375, + "grad_norm": 0.6262418031692505, + "learning_rate": 2.6984126984126986e-06, + "loss": 0.0261, + "step": 2394 + }, + { + "epoch": 9.466403162055336, + "grad_norm": 0.5422096848487854, + "learning_rate": 2.6785714285714285e-06, + "loss": 0.0271, + "step": 2395 + }, + { + "epoch": 9.470355731225297, + "grad_norm": 0.46671536564826965, + "learning_rate": 2.658730158730159e-06, + "loss": 0.0212, + "step": 2396 + }, + { + "epoch": 9.474308300395258, + "grad_norm": 0.6522403359413147, + "learning_rate": 2.638888888888889e-06, + "loss": 0.0346, + "step": 2397 + }, + { + "epoch": 9.478260869565217, + "grad_norm": 1.04923677444458, + "learning_rate": 2.6190476190476192e-06, + "loss": 0.0386, + "step": 2398 + }, + { + "epoch": 9.482213438735178, + "grad_norm": 0.817672610282898, + "learning_rate": 2.5992063492063496e-06, + "loss": 0.0375, + "step": 2399 + }, + { + "epoch": 9.486166007905139, + "grad_norm": 0.4436691105365753, + "learning_rate": 2.5793650793650795e-06, + "loss": 0.0337, + "step": 2400 + }, + { + "epoch": 9.4901185770751, + "grad_norm": 0.5380107164382935, + "learning_rate": 2.5595238095238095e-06, + "loss": 0.0252, + "step": 2401 + }, + { + "epoch": 9.494071146245059, + "grad_norm": 0.5638983845710754, + "learning_rate": 2.53968253968254e-06, + "loss": 0.0258, + "step": 2402 + }, + { + "epoch": 9.49802371541502, + "grad_norm": 0.5976006388664246, + "learning_rate": 2.51984126984127e-06, + "loss": 0.026, + "step": 2403 + }, + { + "epoch": 9.50197628458498, + "grad_norm": 0.8063969612121582, + "learning_rate": 2.5e-06, + "loss": 0.035, + "step": 2404 + }, + { + "epoch": 9.505928853754941, + "grad_norm": 0.7016451954841614, + "learning_rate": 2.48015873015873e-06, + "loss": 0.031, + "step": 2405 + }, + { + "epoch": 9.5098814229249, + "grad_norm": 0.6681998372077942, + "learning_rate": 2.4603174603174605e-06, + "loss": 0.0293, + "step": 2406 + }, + { + "epoch": 9.513833992094861, + "grad_norm": 0.8126972913742065, + "learning_rate": 2.4404761904761905e-06, + "loss": 0.0296, + "step": 2407 + }, + { + "epoch": 9.517786561264822, + "grad_norm": 0.6644697189331055, + "learning_rate": 2.420634920634921e-06, + "loss": 0.028, + "step": 2408 + }, + { + "epoch": 9.521739130434783, + "grad_norm": 0.7953290939331055, + "learning_rate": 2.4007936507936512e-06, + "loss": 0.0244, + "step": 2409 + }, + { + "epoch": 9.525691699604742, + "grad_norm": 0.8905209302902222, + "learning_rate": 2.3809523809523808e-06, + "loss": 0.0629, + "step": 2410 + }, + { + "epoch": 9.529644268774703, + "grad_norm": 0.6507531404495239, + "learning_rate": 2.361111111111111e-06, + "loss": 0.026, + "step": 2411 + }, + { + "epoch": 9.533596837944664, + "grad_norm": 0.6166356205940247, + "learning_rate": 2.3412698412698415e-06, + "loss": 0.0342, + "step": 2412 + }, + { + "epoch": 9.537549407114625, + "grad_norm": 0.6299740076065063, + "learning_rate": 2.3214285714285715e-06, + "loss": 0.0307, + "step": 2413 + }, + { + "epoch": 9.541501976284586, + "grad_norm": 0.6078210473060608, + "learning_rate": 2.301587301587302e-06, + "loss": 0.0341, + "step": 2414 + }, + { + "epoch": 9.545454545454545, + "grad_norm": 1.263504981994629, + "learning_rate": 2.281746031746032e-06, + "loss": 0.0389, + "step": 2415 + }, + { + "epoch": 9.549407114624506, + "grad_norm": 0.6518561244010925, + "learning_rate": 2.2619047619047617e-06, + "loss": 0.0344, + "step": 2416 + }, + { + "epoch": 9.553359683794467, + "grad_norm": 0.9337310194969177, + "learning_rate": 2.242063492063492e-06, + "loss": 0.0299, + "step": 2417 + }, + { + "epoch": 9.557312252964428, + "grad_norm": 0.6526855826377869, + "learning_rate": 2.2222222222222225e-06, + "loss": 0.0307, + "step": 2418 + }, + { + "epoch": 9.561264822134387, + "grad_norm": 0.8223960399627686, + "learning_rate": 2.2023809523809525e-06, + "loss": 0.0279, + "step": 2419 + }, + { + "epoch": 9.565217391304348, + "grad_norm": 0.6811292171478271, + "learning_rate": 2.1825396825396824e-06, + "loss": 0.0301, + "step": 2420 + }, + { + "epoch": 9.569169960474309, + "grad_norm": 0.9670735597610474, + "learning_rate": 2.1626984126984128e-06, + "loss": 0.031, + "step": 2421 + }, + { + "epoch": 9.57312252964427, + "grad_norm": 0.5966360569000244, + "learning_rate": 2.142857142857143e-06, + "loss": 0.0266, + "step": 2422 + }, + { + "epoch": 9.577075098814229, + "grad_norm": 0.6706477999687195, + "learning_rate": 2.123015873015873e-06, + "loss": 0.0356, + "step": 2423 + }, + { + "epoch": 9.58102766798419, + "grad_norm": 0.7263142466545105, + "learning_rate": 2.1031746031746035e-06, + "loss": 0.0336, + "step": 2424 + }, + { + "epoch": 9.58498023715415, + "grad_norm": 0.509760856628418, + "learning_rate": 2.0833333333333334e-06, + "loss": 0.0276, + "step": 2425 + }, + { + "epoch": 9.588932806324111, + "grad_norm": 0.6894228458404541, + "learning_rate": 2.0634920634920634e-06, + "loss": 0.0322, + "step": 2426 + }, + { + "epoch": 9.59288537549407, + "grad_norm": 0.751865029335022, + "learning_rate": 2.0436507936507938e-06, + "loss": 0.0364, + "step": 2427 + }, + { + "epoch": 9.596837944664031, + "grad_norm": 0.9960070848464966, + "learning_rate": 2.023809523809524e-06, + "loss": 0.0268, + "step": 2428 + }, + { + "epoch": 9.600790513833992, + "grad_norm": 1.3808013200759888, + "learning_rate": 2.003968253968254e-06, + "loss": 0.0395, + "step": 2429 + }, + { + "epoch": 9.604743083003953, + "grad_norm": 0.6000566482543945, + "learning_rate": 1.984126984126984e-06, + "loss": 0.0259, + "step": 2430 + }, + { + "epoch": 9.608695652173914, + "grad_norm": 0.715231716632843, + "learning_rate": 1.9642857142857144e-06, + "loss": 0.0297, + "step": 2431 + }, + { + "epoch": 9.612648221343873, + "grad_norm": 0.8936877250671387, + "learning_rate": 1.9444444444444444e-06, + "loss": 0.0346, + "step": 2432 + }, + { + "epoch": 9.616600790513834, + "grad_norm": 0.7350375056266785, + "learning_rate": 1.9246031746031747e-06, + "loss": 0.0375, + "step": 2433 + }, + { + "epoch": 9.620553359683795, + "grad_norm": 0.7897645235061646, + "learning_rate": 1.9047619047619051e-06, + "loss": 0.0312, + "step": 2434 + }, + { + "epoch": 9.624505928853754, + "grad_norm": 0.4184766113758087, + "learning_rate": 1.8849206349206349e-06, + "loss": 0.0244, + "step": 2435 + }, + { + "epoch": 9.628458498023715, + "grad_norm": 0.7881268262863159, + "learning_rate": 1.8650793650793652e-06, + "loss": 0.0385, + "step": 2436 + }, + { + "epoch": 9.632411067193676, + "grad_norm": 0.83879554271698, + "learning_rate": 1.8452380952380954e-06, + "loss": 0.0278, + "step": 2437 + }, + { + "epoch": 9.636363636363637, + "grad_norm": 0.9279481768608093, + "learning_rate": 1.8253968253968256e-06, + "loss": 0.0288, + "step": 2438 + }, + { + "epoch": 9.640316205533598, + "grad_norm": 1.2790271043777466, + "learning_rate": 1.8055555555555555e-06, + "loss": 0.0289, + "step": 2439 + }, + { + "epoch": 9.644268774703557, + "grad_norm": 0.5256231427192688, + "learning_rate": 1.7857142857142857e-06, + "loss": 0.0262, + "step": 2440 + }, + { + "epoch": 9.648221343873518, + "grad_norm": 0.5460281372070312, + "learning_rate": 1.7658730158730158e-06, + "loss": 0.0312, + "step": 2441 + }, + { + "epoch": 9.652173913043478, + "grad_norm": 0.8667499423027039, + "learning_rate": 1.7460317460317462e-06, + "loss": 0.0355, + "step": 2442 + }, + { + "epoch": 9.65612648221344, + "grad_norm": 1.1661618947982788, + "learning_rate": 1.7261904761904764e-06, + "loss": 0.0397, + "step": 2443 + }, + { + "epoch": 9.660079051383399, + "grad_norm": 0.5786067843437195, + "learning_rate": 1.7063492063492063e-06, + "loss": 0.0255, + "step": 2444 + }, + { + "epoch": 9.66403162055336, + "grad_norm": 1.2098567485809326, + "learning_rate": 1.6865079365079365e-06, + "loss": 0.0829, + "step": 2445 + }, + { + "epoch": 9.66798418972332, + "grad_norm": 1.0478078126907349, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0351, + "step": 2446 + }, + { + "epoch": 9.671936758893281, + "grad_norm": 0.5507435202598572, + "learning_rate": 1.646825396825397e-06, + "loss": 0.0268, + "step": 2447 + }, + { + "epoch": 9.67588932806324, + "grad_norm": 0.4946918785572052, + "learning_rate": 1.6269841269841272e-06, + "loss": 0.0291, + "step": 2448 + }, + { + "epoch": 9.679841897233201, + "grad_norm": 1.1714826822280884, + "learning_rate": 1.6071428571428572e-06, + "loss": 0.0418, + "step": 2449 + }, + { + "epoch": 9.683794466403162, + "grad_norm": 0.6348810791969299, + "learning_rate": 1.5873015873015873e-06, + "loss": 0.0329, + "step": 2450 + }, + { + "epoch": 9.687747035573123, + "grad_norm": 1.8041942119598389, + "learning_rate": 1.5674603174603175e-06, + "loss": 0.0434, + "step": 2451 + }, + { + "epoch": 9.691699604743082, + "grad_norm": 0.9931659698486328, + "learning_rate": 1.5476190476190476e-06, + "loss": 0.0405, + "step": 2452 + }, + { + "epoch": 9.695652173913043, + "grad_norm": 0.7843307256698608, + "learning_rate": 1.5277777777777778e-06, + "loss": 0.0316, + "step": 2453 + }, + { + "epoch": 9.699604743083004, + "grad_norm": 0.6582829356193542, + "learning_rate": 1.507936507936508e-06, + "loss": 0.0373, + "step": 2454 + }, + { + "epoch": 9.703557312252965, + "grad_norm": 1.3511526584625244, + "learning_rate": 1.4880952380952381e-06, + "loss": 0.0397, + "step": 2455 + }, + { + "epoch": 9.707509881422926, + "grad_norm": 0.6811285614967346, + "learning_rate": 1.4682539682539683e-06, + "loss": 0.0257, + "step": 2456 + }, + { + "epoch": 9.711462450592885, + "grad_norm": 1.0743825435638428, + "learning_rate": 1.4484126984126985e-06, + "loss": 0.0378, + "step": 2457 + }, + { + "epoch": 9.715415019762846, + "grad_norm": 0.5769475102424622, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.0271, + "step": 2458 + }, + { + "epoch": 9.719367588932807, + "grad_norm": 0.5640432834625244, + "learning_rate": 1.4087301587301588e-06, + "loss": 0.023, + "step": 2459 + }, + { + "epoch": 9.723320158102768, + "grad_norm": 0.6095978021621704, + "learning_rate": 1.388888888888889e-06, + "loss": 0.0368, + "step": 2460 + }, + { + "epoch": 9.727272727272727, + "grad_norm": 0.6505516767501831, + "learning_rate": 1.3690476190476191e-06, + "loss": 0.025, + "step": 2461 + }, + { + "epoch": 9.731225296442688, + "grad_norm": 0.7247635722160339, + "learning_rate": 1.3492063492063493e-06, + "loss": 0.0271, + "step": 2462 + }, + { + "epoch": 9.735177865612648, + "grad_norm": 0.7161766290664673, + "learning_rate": 1.3293650793650794e-06, + "loss": 0.0316, + "step": 2463 + }, + { + "epoch": 9.73913043478261, + "grad_norm": 0.8741920590400696, + "learning_rate": 1.3095238095238096e-06, + "loss": 0.0356, + "step": 2464 + }, + { + "epoch": 9.743083003952568, + "grad_norm": 0.7110796570777893, + "learning_rate": 1.2896825396825398e-06, + "loss": 0.0312, + "step": 2465 + }, + { + "epoch": 9.74703557312253, + "grad_norm": 0.5976476669311523, + "learning_rate": 1.26984126984127e-06, + "loss": 0.0341, + "step": 2466 + }, + { + "epoch": 9.75098814229249, + "grad_norm": 1.2879040241241455, + "learning_rate": 1.25e-06, + "loss": 0.0361, + "step": 2467 + }, + { + "epoch": 9.754940711462451, + "grad_norm": 0.5123355388641357, + "learning_rate": 1.2301587301587303e-06, + "loss": 0.0235, + "step": 2468 + }, + { + "epoch": 9.75889328063241, + "grad_norm": 1.0054389238357544, + "learning_rate": 1.2103174603174604e-06, + "loss": 0.0441, + "step": 2469 + }, + { + "epoch": 9.762845849802371, + "grad_norm": 0.8575003147125244, + "learning_rate": 1.1904761904761904e-06, + "loss": 0.0337, + "step": 2470 + }, + { + "epoch": 9.766798418972332, + "grad_norm": 0.6102113127708435, + "learning_rate": 1.1706349206349208e-06, + "loss": 0.0359, + "step": 2471 + }, + { + "epoch": 9.770750988142293, + "grad_norm": 1.0554832220077515, + "learning_rate": 1.150793650793651e-06, + "loss": 0.0388, + "step": 2472 + }, + { + "epoch": 9.774703557312254, + "grad_norm": 0.7735835909843445, + "learning_rate": 1.1309523809523809e-06, + "loss": 0.0271, + "step": 2473 + }, + { + "epoch": 9.778656126482213, + "grad_norm": 1.1842035055160522, + "learning_rate": 1.1111111111111112e-06, + "loss": 0.0413, + "step": 2474 + }, + { + "epoch": 9.782608695652174, + "grad_norm": 0.8943471312522888, + "learning_rate": 1.0912698412698412e-06, + "loss": 0.0396, + "step": 2475 + }, + { + "epoch": 9.786561264822135, + "grad_norm": 0.7007603049278259, + "learning_rate": 1.0714285714285716e-06, + "loss": 0.0315, + "step": 2476 + }, + { + "epoch": 9.790513833992096, + "grad_norm": 0.545301616191864, + "learning_rate": 1.0515873015873017e-06, + "loss": 0.0243, + "step": 2477 + }, + { + "epoch": 9.794466403162055, + "grad_norm": 0.3348066210746765, + "learning_rate": 1.0317460317460317e-06, + "loss": 0.0259, + "step": 2478 + }, + { + "epoch": 9.798418972332016, + "grad_norm": 0.856115996837616, + "learning_rate": 1.011904761904762e-06, + "loss": 0.0496, + "step": 2479 + }, + { + "epoch": 9.802371541501977, + "grad_norm": 1.026168704032898, + "learning_rate": 9.92063492063492e-07, + "loss": 0.0343, + "step": 2480 + }, + { + "epoch": 9.806324110671937, + "grad_norm": 0.5965343117713928, + "learning_rate": 9.722222222222222e-07, + "loss": 0.0248, + "step": 2481 + }, + { + "epoch": 9.810276679841897, + "grad_norm": 0.6260294318199158, + "learning_rate": 9.523809523809526e-07, + "loss": 0.0229, + "step": 2482 + }, + { + "epoch": 9.814229249011857, + "grad_norm": 0.7057980298995972, + "learning_rate": 9.325396825396826e-07, + "loss": 0.0344, + "step": 2483 + }, + { + "epoch": 9.818181818181818, + "grad_norm": 1.1280661821365356, + "learning_rate": 9.126984126984128e-07, + "loss": 0.0297, + "step": 2484 + }, + { + "epoch": 9.82213438735178, + "grad_norm": 0.6508181095123291, + "learning_rate": 8.928571428571428e-07, + "loss": 0.0312, + "step": 2485 + }, + { + "epoch": 9.826086956521738, + "grad_norm": 2.5763347148895264, + "learning_rate": 8.730158730158731e-07, + "loss": 0.0786, + "step": 2486 + }, + { + "epoch": 9.8300395256917, + "grad_norm": 0.6393975615501404, + "learning_rate": 8.531746031746032e-07, + "loss": 0.0292, + "step": 2487 + }, + { + "epoch": 9.83399209486166, + "grad_norm": 0.559859037399292, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0322, + "step": 2488 + }, + { + "epoch": 9.837944664031621, + "grad_norm": 0.8873099088668823, + "learning_rate": 8.134920634920636e-07, + "loss": 0.0402, + "step": 2489 + }, + { + "epoch": 9.841897233201582, + "grad_norm": 1.0293989181518555, + "learning_rate": 7.936507936507937e-07, + "loss": 0.0487, + "step": 2490 + }, + { + "epoch": 9.845849802371541, + "grad_norm": 0.8084055185317993, + "learning_rate": 7.738095238095238e-07, + "loss": 0.0415, + "step": 2491 + }, + { + "epoch": 9.849802371541502, + "grad_norm": 0.8011215329170227, + "learning_rate": 7.53968253968254e-07, + "loss": 0.0332, + "step": 2492 + }, + { + "epoch": 9.853754940711463, + "grad_norm": 0.6973430514335632, + "learning_rate": 7.341269841269842e-07, + "loss": 0.0338, + "step": 2493 + }, + { + "epoch": 9.857707509881424, + "grad_norm": 1.018647313117981, + "learning_rate": 7.142857142857143e-07, + "loss": 0.0305, + "step": 2494 + }, + { + "epoch": 9.861660079051383, + "grad_norm": 0.7031568884849548, + "learning_rate": 6.944444444444445e-07, + "loss": 0.0297, + "step": 2495 + }, + { + "epoch": 9.865612648221344, + "grad_norm": 0.6655853986740112, + "learning_rate": 6.746031746031746e-07, + "loss": 0.0298, + "step": 2496 + }, + { + "epoch": 9.869565217391305, + "grad_norm": 1.6888632774353027, + "learning_rate": 6.547619047619048e-07, + "loss": 0.0578, + "step": 2497 + }, + { + "epoch": 9.873517786561266, + "grad_norm": 1.1527448892593384, + "learning_rate": 6.34920634920635e-07, + "loss": 0.0444, + "step": 2498 + }, + { + "epoch": 9.877470355731225, + "grad_norm": 0.6494206190109253, + "learning_rate": 6.150793650793651e-07, + "loss": 0.034, + "step": 2499 + }, + { + "epoch": 9.881422924901186, + "grad_norm": 0.9560872912406921, + "learning_rate": 5.952380952380952e-07, + "loss": 0.0283, + "step": 2500 + } + ], + "logging_steps": 1, + "max_steps": 2530, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.237898477266954e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}