{ "best_metric": 1.8160488605499268, "best_model_checkpoint": "pneumonia3/checkpoint-2608", "epoch": 1.0, "eval_steps": 500, "global_step": 2608, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003834355828220859, "grad_norm": 0.8956754803657532, "learning_rate": 1.9157088122605365e-07, "loss": 0.0063, "step": 1 }, { "epoch": 0.0007668711656441718, "grad_norm": null, "learning_rate": 1.9157088122605365e-07, "loss": 5.584, "step": 2 }, { "epoch": 0.0011503067484662577, "grad_norm": null, "learning_rate": 1.9157088122605365e-07, "loss": 2.2579, "step": 3 }, { "epoch": 0.0015337423312883436, "grad_norm": null, "learning_rate": 1.9157088122605365e-07, "loss": 1.0205, "step": 4 }, { "epoch": 0.0019171779141104294, "grad_norm": 8.993351936340332, "learning_rate": 3.831417624521073e-07, "loss": 0.0311, "step": 5 }, { "epoch": 0.0023006134969325155, "grad_norm": null, "learning_rate": 3.831417624521073e-07, "loss": 1.7237, "step": 6 }, { "epoch": 0.0026840490797546013, "grad_norm": 14.72826862335205, "learning_rate": 5.747126436781609e-07, "loss": 0.1122, "step": 7 }, { "epoch": 0.003067484662576687, "grad_norm": 3.6637356281280518, "learning_rate": 7.662835249042146e-07, "loss": 0.0223, "step": 8 }, { "epoch": 0.003450920245398773, "grad_norm": null, "learning_rate": 7.662835249042146e-07, "loss": 0.7598, "step": 9 }, { "epoch": 0.003834355828220859, "grad_norm": 24.12232780456543, "learning_rate": 9.578544061302681e-07, "loss": 0.6069, "step": 10 }, { "epoch": 0.004217791411042945, "grad_norm": 7.987460613250732, "learning_rate": 1.1494252873563219e-06, "loss": 0.0296, "step": 11 }, { "epoch": 0.004601226993865031, "grad_norm": 1.6977758407592773, "learning_rate": 1.3409961685823756e-06, "loss": 0.008, "step": 12 }, { "epoch": 0.004984662576687116, "grad_norm": 0.19353151321411133, "learning_rate": 1.5325670498084292e-06, "loss": 0.0017, "step": 13 }, { "epoch": 0.005368098159509203, 
"grad_norm": 3.7171666622161865, "learning_rate": 1.724137931034483e-06, "loss": 0.0126, "step": 14 }, { "epoch": 0.005751533742331288, "grad_norm": null, "learning_rate": 1.724137931034483e-06, "loss": 1.3653, "step": 15 }, { "epoch": 0.006134969325153374, "grad_norm": 6.29285192489624, "learning_rate": 1.9157088122605362e-06, "loss": 0.0114, "step": 16 }, { "epoch": 0.00651840490797546, "grad_norm": 213.4350128173828, "learning_rate": 2.1072796934865904e-06, "loss": 5.9951, "step": 17 }, { "epoch": 0.006901840490797546, "grad_norm": 0.3442775011062622, "learning_rate": 2.2988505747126437e-06, "loss": 0.0021, "step": 18 }, { "epoch": 0.0072852760736196315, "grad_norm": 2.453491449356079, "learning_rate": 2.4904214559386975e-06, "loss": 0.0215, "step": 19 }, { "epoch": 0.007668711656441718, "grad_norm": 212.2965850830078, "learning_rate": 2.6819923371647512e-06, "loss": 5.3477, "step": 20 }, { "epoch": 0.008052147239263804, "grad_norm": 2.1834559440612793, "learning_rate": 2.8735632183908046e-06, "loss": 0.0067, "step": 21 }, { "epoch": 0.00843558282208589, "grad_norm": 0.46670106053352356, "learning_rate": 3.0651340996168583e-06, "loss": 0.0031, "step": 22 }, { "epoch": 0.008819018404907975, "grad_norm": null, "learning_rate": 3.0651340996168583e-06, "loss": 4.2522, "step": 23 }, { "epoch": 0.009202453987730062, "grad_norm": 4.1455769538879395, "learning_rate": 3.2567049808429117e-06, "loss": 0.0382, "step": 24 }, { "epoch": 0.009585889570552147, "grad_norm": 4.376644134521484, "learning_rate": 3.448275862068966e-06, "loss": 0.0156, "step": 25 }, { "epoch": 0.009969325153374233, "grad_norm": 6.021722793579102, "learning_rate": 3.6398467432950196e-06, "loss": 0.0658, "step": 26 }, { "epoch": 0.010352760736196318, "grad_norm": 2.8319826126098633, "learning_rate": 3.8314176245210725e-06, "loss": 0.0228, "step": 27 }, { "epoch": 0.010736196319018405, "grad_norm": 7.251446723937988, "learning_rate": 4.022988505747127e-06, "loss": 0.0726, "step": 28 }, { "epoch": 
0.01111963190184049, "grad_norm": 96.88240051269531, "learning_rate": 4.214559386973181e-06, "loss": 6.416, "step": 29 }, { "epoch": 0.011503067484662576, "grad_norm": 1.113573431968689, "learning_rate": 4.406130268199233e-06, "loss": 0.0034, "step": 30 }, { "epoch": 0.011886503067484663, "grad_norm": 76.65680694580078, "learning_rate": 4.5977011494252875e-06, "loss": 0.9038, "step": 31 }, { "epoch": 0.012269938650306749, "grad_norm": 86.12126159667969, "learning_rate": 4.789272030651342e-06, "loss": 1.2334, "step": 32 }, { "epoch": 0.012653374233128834, "grad_norm": 5.0126142501831055, "learning_rate": 4.980842911877395e-06, "loss": 0.025, "step": 33 }, { "epoch": 0.01303680981595092, "grad_norm": 74.3577651977539, "learning_rate": 5.172413793103448e-06, "loss": 1.2891, "step": 34 }, { "epoch": 0.013420245398773007, "grad_norm": 40.25752258300781, "learning_rate": 5.3639846743295025e-06, "loss": 0.9468, "step": 35 }, { "epoch": 0.013803680981595092, "grad_norm": 0.10465328395366669, "learning_rate": 5.555555555555556e-06, "loss": 0.0015, "step": 36 }, { "epoch": 0.014187116564417178, "grad_norm": 2.5959291458129883, "learning_rate": 5.747126436781609e-06, "loss": 0.0092, "step": 37 }, { "epoch": 0.014570552147239263, "grad_norm": 0.26874759793281555, "learning_rate": 5.938697318007663e-06, "loss": 0.0018, "step": 38 }, { "epoch": 0.01495398773006135, "grad_norm": 0.9604064226150513, "learning_rate": 6.130268199233717e-06, "loss": 0.0031, "step": 39 }, { "epoch": 0.015337423312883436, "grad_norm": 60.562522888183594, "learning_rate": 6.321839080459771e-06, "loss": 6.1016, "step": 40 }, { "epoch": 0.01572085889570552, "grad_norm": 54.024776458740234, "learning_rate": 6.513409961685823e-06, "loss": 1.0664, "step": 41 }, { "epoch": 0.016104294478527608, "grad_norm": 5.381654262542725, "learning_rate": 6.7049808429118775e-06, "loss": 0.0345, "step": 42 }, { "epoch": 0.016487730061349692, "grad_norm": 123.08094787597656, "learning_rate": 6.896551724137932e-06, "loss": 
1.4581, "step": 43 }, { "epoch": 0.01687116564417178, "grad_norm": 8.839736938476562, "learning_rate": 7.088122605363985e-06, "loss": 0.0295, "step": 44 }, { "epoch": 0.017254601226993866, "grad_norm": 2.2964253425598145, "learning_rate": 7.279693486590039e-06, "loss": 0.0139, "step": 45 }, { "epoch": 0.01763803680981595, "grad_norm": 1.5887353420257568, "learning_rate": 7.4712643678160925e-06, "loss": 0.0085, "step": 46 }, { "epoch": 0.018021472392638037, "grad_norm": 65.72738647460938, "learning_rate": 7.662835249042145e-06, "loss": 1.3535, "step": 47 }, { "epoch": 0.018404907975460124, "grad_norm": 266.64569091796875, "learning_rate": 7.854406130268199e-06, "loss": 1.6163, "step": 48 }, { "epoch": 0.018788343558282208, "grad_norm": 10.696005821228027, "learning_rate": 8.045977011494253e-06, "loss": 0.1241, "step": 49 }, { "epoch": 0.019171779141104295, "grad_norm": 3.5583226680755615, "learning_rate": 8.237547892720307e-06, "loss": 0.0094, "step": 50 }, { "epoch": 0.019555214723926382, "grad_norm": 2.1616084575653076, "learning_rate": 8.429118773946362e-06, "loss": 0.0086, "step": 51 }, { "epoch": 0.019938650306748466, "grad_norm": 0.8845393061637878, "learning_rate": 8.620689655172414e-06, "loss": 0.0052, "step": 52 }, { "epoch": 0.020322085889570553, "grad_norm": 62.269100189208984, "learning_rate": 8.812260536398467e-06, "loss": 1.0137, "step": 53 }, { "epoch": 0.020705521472392636, "grad_norm": 1.511942744255066, "learning_rate": 9.00383141762452e-06, "loss": 0.0042, "step": 54 }, { "epoch": 0.021088957055214724, "grad_norm": 35.93381881713867, "learning_rate": 9.195402298850575e-06, "loss": 0.7896, "step": 55 }, { "epoch": 0.02147239263803681, "grad_norm": 61.91488265991211, "learning_rate": 9.386973180076629e-06, "loss": 0.8931, "step": 56 }, { "epoch": 0.021855828220858894, "grad_norm": 182.94107055664062, "learning_rate": 9.578544061302683e-06, "loss": 2.389, "step": 57 }, { "epoch": 0.02223926380368098, "grad_norm": 0.3494167923927307, "learning_rate": 
9.770114942528738e-06, "loss": 0.0024, "step": 58 }, { "epoch": 0.02262269938650307, "grad_norm": 114.19107818603516, "learning_rate": 9.96168582375479e-06, "loss": 5.874, "step": 59 }, { "epoch": 0.023006134969325152, "grad_norm": 106.06359100341797, "learning_rate": 1.0153256704980842e-05, "loss": 1.3262, "step": 60 }, { "epoch": 0.02338957055214724, "grad_norm": 2.7763619422912598, "learning_rate": 1.0344827586206897e-05, "loss": 0.0113, "step": 61 }, { "epoch": 0.023773006134969327, "grad_norm": 64.10795593261719, "learning_rate": 1.053639846743295e-05, "loss": 0.7485, "step": 62 }, { "epoch": 0.02415644171779141, "grad_norm": 89.75699615478516, "learning_rate": 1.0727969348659005e-05, "loss": 5.8164, "step": 63 }, { "epoch": 0.024539877300613498, "grad_norm": 173.2172393798828, "learning_rate": 1.091954022988506e-05, "loss": 6.1543, "step": 64 }, { "epoch": 0.02492331288343558, "grad_norm": 55.35068893432617, "learning_rate": 1.1111111111111112e-05, "loss": 0.9883, "step": 65 }, { "epoch": 0.02530674846625767, "grad_norm": 6.057254791259766, "learning_rate": 1.1302681992337164e-05, "loss": 0.0216, "step": 66 }, { "epoch": 0.025690184049079755, "grad_norm": 2.4507741928100586, "learning_rate": 1.1494252873563218e-05, "loss": 0.0169, "step": 67 }, { "epoch": 0.02607361963190184, "grad_norm": 80.08985137939453, "learning_rate": 1.1685823754789272e-05, "loss": 1.002, "step": 68 }, { "epoch": 0.026457055214723926, "grad_norm": 188.5212860107422, "learning_rate": 1.1877394636015327e-05, "loss": 1.4981, "step": 69 }, { "epoch": 0.026840490797546013, "grad_norm": 61.822757720947266, "learning_rate": 1.206896551724138e-05, "loss": 1.2676, "step": 70 }, { "epoch": 0.027223926380368097, "grad_norm": 64.24207305908203, "learning_rate": 1.2260536398467433e-05, "loss": 0.8086, "step": 71 }, { "epoch": 0.027607361963190184, "grad_norm": 323.5947265625, "learning_rate": 1.2452107279693487e-05, "loss": 2.217, "step": 72 }, { "epoch": 0.02799079754601227, "grad_norm": 
129.00328063964844, "learning_rate": 1.2643678160919542e-05, "loss": 5.5635, "step": 73 }, { "epoch": 0.028374233128834355, "grad_norm": 11.802986145019531, "learning_rate": 1.2835249042145594e-05, "loss": 0.0487, "step": 74 }, { "epoch": 0.028757668711656442, "grad_norm": 0.5383877754211426, "learning_rate": 1.3026819923371647e-05, "loss": 0.0032, "step": 75 }, { "epoch": 0.029141104294478526, "grad_norm": 159.20738220214844, "learning_rate": 1.3218390804597702e-05, "loss": 5.7246, "step": 76 }, { "epoch": 0.029524539877300613, "grad_norm": 60.51280212402344, "learning_rate": 1.3409961685823755e-05, "loss": 1.0391, "step": 77 }, { "epoch": 0.0299079754601227, "grad_norm": 73.97840118408203, "learning_rate": 1.360153256704981e-05, "loss": 0.9321, "step": 78 }, { "epoch": 0.030291411042944784, "grad_norm": 1.2738165855407715, "learning_rate": 1.3793103448275863e-05, "loss": 0.0052, "step": 79 }, { "epoch": 0.03067484662576687, "grad_norm": 0.34937584400177, "learning_rate": 1.3984674329501916e-05, "loss": 0.0024, "step": 80 }, { "epoch": 0.031058282208588958, "grad_norm": 67.0838851928711, "learning_rate": 1.417624521072797e-05, "loss": 1.3536, "step": 81 }, { "epoch": 0.03144171779141104, "grad_norm": 0.8529216051101685, "learning_rate": 1.4367816091954022e-05, "loss": 0.0033, "step": 82 }, { "epoch": 0.031825153374233126, "grad_norm": 37.8706169128418, "learning_rate": 1.4559386973180078e-05, "loss": 1.1436, "step": 83 }, { "epoch": 0.032208588957055216, "grad_norm": 76.21135711669922, "learning_rate": 1.475095785440613e-05, "loss": 1.5489, "step": 84 }, { "epoch": 0.0325920245398773, "grad_norm": 1.0968785285949707, "learning_rate": 1.4942528735632185e-05, "loss": 0.003, "step": 85 }, { "epoch": 0.032975460122699383, "grad_norm": 0.46162062883377075, "learning_rate": 1.5134099616858237e-05, "loss": 0.0024, "step": 86 }, { "epoch": 0.033358895705521474, "grad_norm": 90.52841186523438, "learning_rate": 1.532567049808429e-05, "loss": 1.2198, "step": 87 }, { "epoch": 
0.03374233128834356, "grad_norm": 0.3833475410938263, "learning_rate": 1.5517241379310346e-05, "loss": 0.0024, "step": 88 }, { "epoch": 0.03412576687116564, "grad_norm": 314.6944580078125, "learning_rate": 1.5708812260536398e-05, "loss": 5.8516, "step": 89 }, { "epoch": 0.03450920245398773, "grad_norm": 2.8447794914245605, "learning_rate": 1.5900383141762454e-05, "loss": 0.0128, "step": 90 }, { "epoch": 0.034892638036809816, "grad_norm": 149.35581970214844, "learning_rate": 1.6091954022988507e-05, "loss": 2.0236, "step": 91 }, { "epoch": 0.0352760736196319, "grad_norm": 55.89840316772461, "learning_rate": 1.628352490421456e-05, "loss": 1.1123, "step": 92 }, { "epoch": 0.03565950920245399, "grad_norm": 20.16428565979004, "learning_rate": 1.6475095785440615e-05, "loss": 0.0356, "step": 93 }, { "epoch": 0.036042944785276074, "grad_norm": 0.32057636976242065, "learning_rate": 1.6666666666666667e-05, "loss": 0.0018, "step": 94 }, { "epoch": 0.03642638036809816, "grad_norm": 12.989239692687988, "learning_rate": 1.6858237547892723e-05, "loss": 0.0375, "step": 95 }, { "epoch": 0.03680981595092025, "grad_norm": 3.8411009311676025, "learning_rate": 1.7049808429118776e-05, "loss": 0.0294, "step": 96 }, { "epoch": 0.03719325153374233, "grad_norm": 76.6435775756836, "learning_rate": 1.7241379310344828e-05, "loss": 1.3907, "step": 97 }, { "epoch": 0.037576687116564415, "grad_norm": 111.41454315185547, "learning_rate": 1.743295019157088e-05, "loss": 5.5, "step": 98 }, { "epoch": 0.037960122699386506, "grad_norm": 2.332939624786377, "learning_rate": 1.7624521072796933e-05, "loss": 0.008, "step": 99 }, { "epoch": 0.03834355828220859, "grad_norm": 0.13917133212089539, "learning_rate": 1.781609195402299e-05, "loss": 0.0011, "step": 100 }, { "epoch": 0.03872699386503067, "grad_norm": 1.1241354942321777, "learning_rate": 1.800766283524904e-05, "loss": 0.0037, "step": 101 }, { "epoch": 0.039110429447852764, "grad_norm": 0.4089910686016083, "learning_rate": 1.8199233716475097e-05, 
"loss": 0.0024, "step": 102 }, { "epoch": 0.03949386503067485, "grad_norm": 147.89845275878906, "learning_rate": 1.839080459770115e-05, "loss": 1.4602, "step": 103 }, { "epoch": 0.03987730061349693, "grad_norm": 2.5901684761047363, "learning_rate": 1.8582375478927206e-05, "loss": 0.0127, "step": 104 }, { "epoch": 0.040260736196319015, "grad_norm": 2.627976655960083, "learning_rate": 1.8773946360153258e-05, "loss": 0.0043, "step": 105 }, { "epoch": 0.040644171779141106, "grad_norm": 0.39192333817481995, "learning_rate": 1.896551724137931e-05, "loss": 0.0052, "step": 106 }, { "epoch": 0.04102760736196319, "grad_norm": 12.433937072753906, "learning_rate": 1.9157088122605367e-05, "loss": 0.0861, "step": 107 }, { "epoch": 0.04141104294478527, "grad_norm": 3.929945230484009, "learning_rate": 1.934865900383142e-05, "loss": 0.0098, "step": 108 }, { "epoch": 0.041794478527607364, "grad_norm": 100.13874816894531, "learning_rate": 1.9540229885057475e-05, "loss": 5.9512, "step": 109 }, { "epoch": 0.04217791411042945, "grad_norm": 0.7809451222419739, "learning_rate": 1.9731800766283527e-05, "loss": 0.0057, "step": 110 }, { "epoch": 0.04256134969325153, "grad_norm": 38.45713424682617, "learning_rate": 1.992337164750958e-05, "loss": 0.7549, "step": 111 }, { "epoch": 0.04294478527607362, "grad_norm": 12.913437843322754, "learning_rate": 2.0114942528735632e-05, "loss": 0.0696, "step": 112 }, { "epoch": 0.043328220858895705, "grad_norm": 161.37461853027344, "learning_rate": 2.0306513409961685e-05, "loss": 5.5381, "step": 113 }, { "epoch": 0.04371165644171779, "grad_norm": 13.583711624145508, "learning_rate": 2.049808429118774e-05, "loss": 0.0394, "step": 114 }, { "epoch": 0.04409509202453988, "grad_norm": 109.6414566040039, "learning_rate": 2.0689655172413793e-05, "loss": 1.2266, "step": 115 }, { "epoch": 0.04447852760736196, "grad_norm": 18.10565948486328, "learning_rate": 2.088122605363985e-05, "loss": 0.0208, "step": 116 }, { "epoch": 0.04486196319018405, "grad_norm": 
81.25139617919922, "learning_rate": 2.10727969348659e-05, "loss": 1.2227, "step": 117 }, { "epoch": 0.04524539877300614, "grad_norm": 30.773693084716797, "learning_rate": 2.1264367816091954e-05, "loss": 0.8042, "step": 118 }, { "epoch": 0.04562883435582822, "grad_norm": 62.7272834777832, "learning_rate": 2.145593869731801e-05, "loss": 0.9893, "step": 119 }, { "epoch": 0.046012269938650305, "grad_norm": 1.4851429462432861, "learning_rate": 2.1647509578544062e-05, "loss": 0.0034, "step": 120 }, { "epoch": 0.046395705521472395, "grad_norm": 120.005126953125, "learning_rate": 2.183908045977012e-05, "loss": 1.3565, "step": 121 }, { "epoch": 0.04677914110429448, "grad_norm": 296.0460510253906, "learning_rate": 2.203065134099617e-05, "loss": 7.1914, "step": 122 }, { "epoch": 0.04716257668711656, "grad_norm": 128.2917022705078, "learning_rate": 2.2222222222222223e-05, "loss": 2.8809, "step": 123 }, { "epoch": 0.04754601226993865, "grad_norm": 86.54306030273438, "learning_rate": 2.2413793103448276e-05, "loss": 1.6328, "step": 124 }, { "epoch": 0.04792944785276074, "grad_norm": 0.41690361499786377, "learning_rate": 2.2605363984674328e-05, "loss": 0.0036, "step": 125 }, { "epoch": 0.04831288343558282, "grad_norm": 113.13068389892578, "learning_rate": 2.2796934865900384e-05, "loss": 0.8526, "step": 126 }, { "epoch": 0.04869631901840491, "grad_norm": 157.20811462402344, "learning_rate": 2.2988505747126437e-05, "loss": 4.1294, "step": 127 }, { "epoch": 0.049079754601226995, "grad_norm": 76.66633605957031, "learning_rate": 2.3180076628352492e-05, "loss": 0.6802, "step": 128 }, { "epoch": 0.04946319018404908, "grad_norm": 1.2853686809539795, "learning_rate": 2.3371647509578545e-05, "loss": 0.0071, "step": 129 }, { "epoch": 0.04984662576687116, "grad_norm": 110.15524291992188, "learning_rate": 2.3563218390804597e-05, "loss": 5.2832, "step": 130 }, { "epoch": 0.05023006134969325, "grad_norm": 89.71929931640625, "learning_rate": 2.3754789272030653e-05, "loss": 1.0645, "step": 131 }, 
{ "epoch": 0.05061349693251534, "grad_norm": 126.18505096435547, "learning_rate": 2.3946360153256706e-05, "loss": 1.3311, "step": 132 }, { "epoch": 0.05099693251533742, "grad_norm": 6.359533309936523, "learning_rate": 2.413793103448276e-05, "loss": 0.0912, "step": 133 }, { "epoch": 0.05138036809815951, "grad_norm": 2.9896469116210938, "learning_rate": 2.4329501915708814e-05, "loss": 0.012, "step": 134 }, { "epoch": 0.051763803680981595, "grad_norm": 6.923796653747559, "learning_rate": 2.4521072796934867e-05, "loss": 0.008, "step": 135 }, { "epoch": 0.05214723926380368, "grad_norm": 167.52798461914062, "learning_rate": 2.4712643678160922e-05, "loss": 3.4893, "step": 136 }, { "epoch": 0.05253067484662577, "grad_norm": 7.542374610900879, "learning_rate": 2.4904214559386975e-05, "loss": 0.1313, "step": 137 }, { "epoch": 0.05291411042944785, "grad_norm": 100.85726928710938, "learning_rate": 2.5095785440613027e-05, "loss": 1.0762, "step": 138 }, { "epoch": 0.053297546012269936, "grad_norm": 1.1239936351776123, "learning_rate": 2.5287356321839083e-05, "loss": 0.0035, "step": 139 }, { "epoch": 0.05368098159509203, "grad_norm": 0.7821186780929565, "learning_rate": 2.5478927203065132e-05, "loss": 0.0046, "step": 140 }, { "epoch": 0.05406441717791411, "grad_norm": 70.37541961669922, "learning_rate": 2.5670498084291188e-05, "loss": 1.0537, "step": 141 }, { "epoch": 0.054447852760736194, "grad_norm": 0.31846246123313904, "learning_rate": 2.5862068965517244e-05, "loss": 0.0029, "step": 142 }, { "epoch": 0.054831288343558285, "grad_norm": 0.34014156460762024, "learning_rate": 2.6053639846743293e-05, "loss": 0.0019, "step": 143 }, { "epoch": 0.05521472392638037, "grad_norm": 104.0475082397461, "learning_rate": 2.624521072796935e-05, "loss": 6.5469, "step": 144 }, { "epoch": 0.05559815950920245, "grad_norm": 9.99521541595459, "learning_rate": 2.6436781609195405e-05, "loss": 0.0142, "step": 145 }, { "epoch": 0.05598159509202454, "grad_norm": 82.37095642089844, "learning_rate": 
2.662835249042146e-05, "loss": 1.2188, "step": 146 }, { "epoch": 0.056365030674846626, "grad_norm": 99.57766723632812, "learning_rate": 2.681992337164751e-05, "loss": 1.4502, "step": 147 }, { "epoch": 0.05674846625766871, "grad_norm": 0.3052326440811157, "learning_rate": 2.7011494252873566e-05, "loss": 0.0032, "step": 148 }, { "epoch": 0.0571319018404908, "grad_norm": 1.5312012434005737, "learning_rate": 2.720306513409962e-05, "loss": 0.0033, "step": 149 }, { "epoch": 0.057515337423312884, "grad_norm": 4.41005802154541, "learning_rate": 2.739463601532567e-05, "loss": 0.0119, "step": 150 }, { "epoch": 0.05789877300613497, "grad_norm": 7.4667277336120605, "learning_rate": 2.7586206896551727e-05, "loss": 0.0504, "step": 151 }, { "epoch": 0.05828220858895705, "grad_norm": 1.4329824447631836, "learning_rate": 2.777777777777778e-05, "loss": 0.0035, "step": 152 }, { "epoch": 0.05866564417177914, "grad_norm": 2.555189609527588, "learning_rate": 2.796934865900383e-05, "loss": 0.0063, "step": 153 }, { "epoch": 0.059049079754601226, "grad_norm": 83.7321548461914, "learning_rate": 2.8160919540229884e-05, "loss": 1.5372, "step": 154 }, { "epoch": 0.05943251533742331, "grad_norm": 0.33591243624687195, "learning_rate": 2.835249042145594e-05, "loss": 0.0023, "step": 155 }, { "epoch": 0.0598159509202454, "grad_norm": 2.3094089031219482, "learning_rate": 2.8544061302681996e-05, "loss": 0.0108, "step": 156 }, { "epoch": 0.060199386503067484, "grad_norm": 1.295427918434143, "learning_rate": 2.8735632183908045e-05, "loss": 0.0058, "step": 157 }, { "epoch": 0.06058282208588957, "grad_norm": 2.300220251083374, "learning_rate": 2.89272030651341e-05, "loss": 0.0097, "step": 158 }, { "epoch": 0.06096625766871166, "grad_norm": 0.4389096200466156, "learning_rate": 2.9118773946360157e-05, "loss": 0.0021, "step": 159 }, { "epoch": 0.06134969325153374, "grad_norm": 1.887438416481018, "learning_rate": 2.9310344827586206e-05, "loss": 0.0112, "step": 160 }, { "epoch": 0.061733128834355826, 
"grad_norm": 103.8377456665039, "learning_rate": 2.950191570881226e-05, "loss": 1.5381, "step": 161 }, { "epoch": 0.062116564417177916, "grad_norm": 68.52552795410156, "learning_rate": 2.9693486590038317e-05, "loss": 1.0391, "step": 162 }, { "epoch": 0.0625, "grad_norm": 84.98541259765625, "learning_rate": 2.988505747126437e-05, "loss": 1.629, "step": 163 }, { "epoch": 0.06288343558282208, "grad_norm": 120.75799560546875, "learning_rate": 3.0076628352490422e-05, "loss": 1.2385, "step": 164 }, { "epoch": 0.06326687116564417, "grad_norm": 50.883365631103516, "learning_rate": 3.0268199233716475e-05, "loss": 1.0186, "step": 165 }, { "epoch": 0.06365030674846625, "grad_norm": 40.614540100097656, "learning_rate": 3.045977011494253e-05, "loss": 0.9351, "step": 166 }, { "epoch": 0.06403374233128835, "grad_norm": 1.2160786390304565, "learning_rate": 3.065134099616858e-05, "loss": 0.0038, "step": 167 }, { "epoch": 0.06441717791411043, "grad_norm": 1.191338062286377, "learning_rate": 3.084291187739464e-05, "loss": 0.0026, "step": 168 }, { "epoch": 0.06480061349693252, "grad_norm": 40.91733932495117, "learning_rate": 3.103448275862069e-05, "loss": 0.1254, "step": 169 }, { "epoch": 0.0651840490797546, "grad_norm": 20.858539581298828, "learning_rate": 3.1226053639846744e-05, "loss": 0.6699, "step": 170 }, { "epoch": 0.06556748466257668, "grad_norm": 4.083669185638428, "learning_rate": 3.1417624521072797e-05, "loss": 0.0045, "step": 171 }, { "epoch": 0.06595092024539877, "grad_norm": 25.001136779785156, "learning_rate": 3.160919540229885e-05, "loss": 0.9185, "step": 172 }, { "epoch": 0.06633435582822086, "grad_norm": 68.41410827636719, "learning_rate": 3.180076628352491e-05, "loss": 0.8237, "step": 173 }, { "epoch": 0.06671779141104295, "grad_norm": 6.041451454162598, "learning_rate": 3.1992337164750954e-05, "loss": 0.0305, "step": 174 }, { "epoch": 0.06710122699386503, "grad_norm": 0.3134997487068176, "learning_rate": 3.218390804597701e-05, "loss": 0.0014, "step": 175 }, { 
"epoch": 0.06748466257668712, "grad_norm": 74.52098083496094, "learning_rate": 3.2375478927203066e-05, "loss": 1.5216, "step": 176 }, { "epoch": 0.0678680981595092, "grad_norm": 7.908259868621826, "learning_rate": 3.256704980842912e-05, "loss": 0.0355, "step": 177 }, { "epoch": 0.06825153374233128, "grad_norm": 69.83658599853516, "learning_rate": 3.275862068965517e-05, "loss": 1.6026, "step": 178 }, { "epoch": 0.06863496932515338, "grad_norm": 20.584131240844727, "learning_rate": 3.295019157088123e-05, "loss": 0.8193, "step": 179 }, { "epoch": 0.06901840490797546, "grad_norm": 0.837039589881897, "learning_rate": 3.314176245210728e-05, "loss": 0.0022, "step": 180 }, { "epoch": 0.06940184049079755, "grad_norm": 138.28211975097656, "learning_rate": 3.3333333333333335e-05, "loss": 3.8125, "step": 181 }, { "epoch": 0.06978527607361963, "grad_norm": 121.50843048095703, "learning_rate": 3.352490421455939e-05, "loss": 0.9912, "step": 182 }, { "epoch": 0.07016871165644172, "grad_norm": 1.1271746158599854, "learning_rate": 3.371647509578545e-05, "loss": 0.0033, "step": 183 }, { "epoch": 0.0705521472392638, "grad_norm": 36.171268463134766, "learning_rate": 3.390804597701149e-05, "loss": 0.6187, "step": 184 }, { "epoch": 0.0709355828220859, "grad_norm": 177.51988220214844, "learning_rate": 3.409961685823755e-05, "loss": 6.2168, "step": 185 }, { "epoch": 0.07131901840490798, "grad_norm": 186.8992462158203, "learning_rate": 3.4291187739463604e-05, "loss": 5.8516, "step": 186 }, { "epoch": 0.07170245398773006, "grad_norm": 12.703483581542969, "learning_rate": 3.4482758620689657e-05, "loss": 0.079, "step": 187 }, { "epoch": 0.07208588957055215, "grad_norm": 93.80378723144531, "learning_rate": 3.467432950191571e-05, "loss": 2.0041, "step": 188 }, { "epoch": 0.07246932515337423, "grad_norm": 95.0303955078125, "learning_rate": 3.486590038314176e-05, "loss": 0.9229, "step": 189 }, { "epoch": 0.07285276073619631, "grad_norm": 122.20203399658203, "learning_rate": 3.505747126436782e-05, 
"loss": 2.1648, "step": 190 }, { "epoch": 0.0732361963190184, "grad_norm": 0.7854564189910889, "learning_rate": 3.5249042145593867e-05, "loss": 0.0028, "step": 191 }, { "epoch": 0.0736196319018405, "grad_norm": 1.2210944890975952, "learning_rate": 3.5440613026819926e-05, "loss": 0.0054, "step": 192 }, { "epoch": 0.07400306748466258, "grad_norm": 20.203170776367188, "learning_rate": 3.563218390804598e-05, "loss": 0.6558, "step": 193 }, { "epoch": 0.07438650306748466, "grad_norm": 149.05807495117188, "learning_rate": 3.582375478927204e-05, "loss": 2.5322, "step": 194 }, { "epoch": 0.07476993865030675, "grad_norm": 186.68226623535156, "learning_rate": 3.601532567049808e-05, "loss": 2.225, "step": 195 }, { "epoch": 0.07515337423312883, "grad_norm": 155.98397827148438, "learning_rate": 3.620689655172414e-05, "loss": 1.7022, "step": 196 }, { "epoch": 0.07553680981595091, "grad_norm": 1.276442050933838, "learning_rate": 3.6398467432950195e-05, "loss": 0.0058, "step": 197 }, { "epoch": 0.07592024539877301, "grad_norm": 10.517167091369629, "learning_rate": 3.659003831417625e-05, "loss": 0.0524, "step": 198 }, { "epoch": 0.0763036809815951, "grad_norm": 9.035552978515625, "learning_rate": 3.67816091954023e-05, "loss": 0.0112, "step": 199 }, { "epoch": 0.07668711656441718, "grad_norm": 2.937300205230713, "learning_rate": 3.697318007662835e-05, "loss": 0.0202, "step": 200 }, { "epoch": 0.07707055214723926, "grad_norm": 0.48456454277038574, "learning_rate": 3.716475095785441e-05, "loss": 0.0031, "step": 201 }, { "epoch": 0.07745398773006135, "grad_norm": 1.5013498067855835, "learning_rate": 3.735632183908046e-05, "loss": 0.0046, "step": 202 }, { "epoch": 0.07783742331288343, "grad_norm": 5.882223606109619, "learning_rate": 3.7547892720306517e-05, "loss": 0.0604, "step": 203 }, { "epoch": 0.07822085889570553, "grad_norm": 35.46553421020508, "learning_rate": 3.773946360153257e-05, "loss": 0.9717, "step": 204 }, { "epoch": 0.07860429447852761, "grad_norm": 5.151394844055176, 
"learning_rate": 3.793103448275862e-05, "loss": 0.0493, "step": 205 }, { "epoch": 0.0789877300613497, "grad_norm": 154.71730041503906, "learning_rate": 3.8122605363984674e-05, "loss": 1.7569, "step": 206 }, { "epoch": 0.07937116564417178, "grad_norm": 31.566425323486328, "learning_rate": 3.831417624521073e-05, "loss": 0.1257, "step": 207 }, { "epoch": 0.07975460122699386, "grad_norm": 542.98974609375, "learning_rate": 3.850574712643678e-05, "loss": 5.7324, "step": 208 }, { "epoch": 0.08013803680981595, "grad_norm": 150.66156005859375, "learning_rate": 3.869731800766284e-05, "loss": 5.9746, "step": 209 }, { "epoch": 0.08052147239263803, "grad_norm": 0.5510711669921875, "learning_rate": 3.888888888888889e-05, "loss": 0.0027, "step": 210 }, { "epoch": 0.08090490797546013, "grad_norm": 138.30918884277344, "learning_rate": 3.908045977011495e-05, "loss": 5.8896, "step": 211 }, { "epoch": 0.08128834355828221, "grad_norm": 114.1432876586914, "learning_rate": 3.9272030651340996e-05, "loss": 5.4746, "step": 212 }, { "epoch": 0.0816717791411043, "grad_norm": 92.57093048095703, "learning_rate": 3.9463601532567055e-05, "loss": 1.5499, "step": 213 }, { "epoch": 0.08205521472392638, "grad_norm": 1.4484443664550781, "learning_rate": 3.965517241379311e-05, "loss": 0.0134, "step": 214 }, { "epoch": 0.08243865030674846, "grad_norm": 123.93447875976562, "learning_rate": 3.984674329501916e-05, "loss": 2.5042, "step": 215 }, { "epoch": 0.08282208588957055, "grad_norm": 1.8793611526489258, "learning_rate": 4.003831417624521e-05, "loss": 0.0092, "step": 216 }, { "epoch": 0.08320552147239264, "grad_norm": 0.6625899076461792, "learning_rate": 4.0229885057471265e-05, "loss": 0.0024, "step": 217 }, { "epoch": 0.08358895705521473, "grad_norm": 12.705831527709961, "learning_rate": 4.0421455938697324e-05, "loss": 0.0755, "step": 218 }, { "epoch": 0.08397239263803681, "grad_norm": 13.084242820739746, "learning_rate": 4.061302681992337e-05, "loss": 0.0665, "step": 219 }, { "epoch": 
0.0843558282208589, "grad_norm": 103.4574203491211, "learning_rate": 4.080459770114943e-05, "loss": 5.9307, "step": 220 }, { "epoch": 0.08473926380368098, "grad_norm": 1.5646629333496094, "learning_rate": 4.099616858237548e-05, "loss": 0.0032, "step": 221 }, { "epoch": 0.08512269938650306, "grad_norm": 13.707032203674316, "learning_rate": 4.1187739463601534e-05, "loss": 0.1379, "step": 222 }, { "epoch": 0.08550613496932516, "grad_norm": 4.0889668464660645, "learning_rate": 4.1379310344827587e-05, "loss": 0.0058, "step": 223 }, { "epoch": 0.08588957055214724, "grad_norm": 1.3660260438919067, "learning_rate": 4.1570881226053646e-05, "loss": 0.0049, "step": 224 }, { "epoch": 0.08627300613496933, "grad_norm": 9.488546371459961, "learning_rate": 4.17624521072797e-05, "loss": 0.0082, "step": 225 }, { "epoch": 0.08665644171779141, "grad_norm": 0.5113505125045776, "learning_rate": 4.195402298850575e-05, "loss": 0.0033, "step": 226 }, { "epoch": 0.0870398773006135, "grad_norm": 1.202599048614502, "learning_rate": 4.21455938697318e-05, "loss": 0.0024, "step": 227 }, { "epoch": 0.08742331288343558, "grad_norm": 191.9002685546875, "learning_rate": 4.2337164750957856e-05, "loss": 1.5363, "step": 228 }, { "epoch": 0.08780674846625768, "grad_norm": 2.334892511367798, "learning_rate": 4.252873563218391e-05, "loss": 0.0096, "step": 229 }, { "epoch": 0.08819018404907976, "grad_norm": 1.950028896331787, "learning_rate": 4.272030651340996e-05, "loss": 0.0141, "step": 230 }, { "epoch": 0.08857361963190184, "grad_norm": 7.90887451171875, "learning_rate": 4.291187739463602e-05, "loss": 0.0993, "step": 231 }, { "epoch": 0.08895705521472393, "grad_norm": 5.312363147735596, "learning_rate": 4.3103448275862066e-05, "loss": 0.0222, "step": 232 }, { "epoch": 0.08934049079754601, "grad_norm": 26.44396209716797, "learning_rate": 4.3295019157088125e-05, "loss": 0.9038, "step": 233 }, { "epoch": 0.0897239263803681, "grad_norm": 1.0253314971923828, "learning_rate": 4.348659003831418e-05, "loss": 
0.0079, "step": 234 }, { "epoch": 0.09010736196319018, "grad_norm": 1.3432118892669678, "learning_rate": 4.367816091954024e-05, "loss": 0.0079, "step": 235 }, { "epoch": 0.09049079754601227, "grad_norm": 223.07232666015625, "learning_rate": 4.386973180076628e-05, "loss": 6.1641, "step": 236 }, { "epoch": 0.09087423312883436, "grad_norm": 5.819482326507568, "learning_rate": 4.406130268199234e-05, "loss": 0.0063, "step": 237 }, { "epoch": 0.09125766871165644, "grad_norm": 63.8910026550293, "learning_rate": 4.4252873563218394e-05, "loss": 1.0508, "step": 238 }, { "epoch": 0.09164110429447853, "grad_norm": 23.658857345581055, "learning_rate": 4.4444444444444447e-05, "loss": 0.1659, "step": 239 }, { "epoch": 0.09202453987730061, "grad_norm": 37.76922607421875, "learning_rate": 4.46360153256705e-05, "loss": 0.6724, "step": 240 }, { "epoch": 0.0924079754601227, "grad_norm": 4.410397529602051, "learning_rate": 4.482758620689655e-05, "loss": 0.0246, "step": 241 }, { "epoch": 0.09279141104294479, "grad_norm": 0.8323369026184082, "learning_rate": 4.501915708812261e-05, "loss": 0.0048, "step": 242 }, { "epoch": 0.09317484662576687, "grad_norm": 10.32751178741455, "learning_rate": 4.5210727969348656e-05, "loss": 0.0696, "step": 243 }, { "epoch": 0.09355828220858896, "grad_norm": 128.6432647705078, "learning_rate": 4.5402298850574716e-05, "loss": 1.2657, "step": 244 }, { "epoch": 0.09394171779141104, "grad_norm": 35.12845993041992, "learning_rate": 4.559386973180077e-05, "loss": 0.6641, "step": 245 }, { "epoch": 0.09432515337423313, "grad_norm": 5.09098482131958, "learning_rate": 4.578544061302682e-05, "loss": 0.0231, "step": 246 }, { "epoch": 0.09470858895705521, "grad_norm": 10.12911605834961, "learning_rate": 4.597701149425287e-05, "loss": 0.1411, "step": 247 }, { "epoch": 0.0950920245398773, "grad_norm": 0.48184308409690857, "learning_rate": 4.616858237547893e-05, "loss": 0.0031, "step": 248 }, { "epoch": 0.09547546012269939, "grad_norm": 1.3390072584152222, "learning_rate": 
4.6360153256704985e-05, "loss": 0.0055, "step": 249 }, { "epoch": 0.09585889570552147, "grad_norm": 1.018207311630249, "learning_rate": 4.655172413793104e-05, "loss": 0.0025, "step": 250 }, { "epoch": 0.09624233128834356, "grad_norm": 0.7538869380950928, "learning_rate": 4.674329501915709e-05, "loss": 0.0035, "step": 251 }, { "epoch": 0.09662576687116564, "grad_norm": 2.483527421951294, "learning_rate": 4.693486590038315e-05, "loss": 0.0044, "step": 252 }, { "epoch": 0.09700920245398773, "grad_norm": 138.2169952392578, "learning_rate": 4.7126436781609195e-05, "loss": 2.0765, "step": 253 }, { "epoch": 0.09739263803680982, "grad_norm": 20.763120651245117, "learning_rate": 4.7318007662835254e-05, "loss": 0.142, "step": 254 }, { "epoch": 0.0977760736196319, "grad_norm": 190.92677307128906, "learning_rate": 4.7509578544061307e-05, "loss": 2.5516, "step": 255 }, { "epoch": 0.09815950920245399, "grad_norm": 3.3017444610595703, "learning_rate": 4.770114942528736e-05, "loss": 0.0114, "step": 256 }, { "epoch": 0.09854294478527607, "grad_norm": 122.03047180175781, "learning_rate": 4.789272030651341e-05, "loss": 5.3203, "step": 257 }, { "epoch": 0.09892638036809816, "grad_norm": 0.15875087678432465, "learning_rate": 4.8084291187739464e-05, "loss": 0.0017, "step": 258 }, { "epoch": 0.09930981595092024, "grad_norm": 1.3615827560424805, "learning_rate": 4.827586206896552e-05, "loss": 0.004, "step": 259 }, { "epoch": 0.09969325153374232, "grad_norm": 1.167940378189087, "learning_rate": 4.846743295019157e-05, "loss": 0.0071, "step": 260 }, { "epoch": 0.10007668711656442, "grad_norm": 0.7109537124633789, "learning_rate": 4.865900383141763e-05, "loss": 0.0034, "step": 261 }, { "epoch": 0.1004601226993865, "grad_norm": 0.3329644203186035, "learning_rate": 4.885057471264368e-05, "loss": 0.0038, "step": 262 }, { "epoch": 0.10084355828220859, "grad_norm": 102.13310241699219, "learning_rate": 4.904214559386973e-05, "loss": 2.0372, "step": 263 }, { "epoch": 0.10122699386503067, 
"grad_norm": 103.58604431152344, "learning_rate": 4.9233716475095786e-05, "loss": 1.6924, "step": 264 }, { "epoch": 0.10161042944785276, "grad_norm": 3.0650875568389893, "learning_rate": 4.9425287356321845e-05, "loss": 0.0152, "step": 265 }, { "epoch": 0.10199386503067484, "grad_norm": 0.4443052113056183, "learning_rate": 4.96168582375479e-05, "loss": 0.0024, "step": 266 }, { "epoch": 0.10237730061349694, "grad_norm": 0.8741576075553894, "learning_rate": 4.980842911877395e-05, "loss": 0.0033, "step": 267 }, { "epoch": 0.10276073619631902, "grad_norm": 0.7627093195915222, "learning_rate": 5e-05, "loss": 0.0032, "step": 268 }, { "epoch": 0.1031441717791411, "grad_norm": 57.72730255126953, "learning_rate": 4.997869620792501e-05, "loss": 1.0674, "step": 269 }, { "epoch": 0.10352760736196319, "grad_norm": 76.46302795410156, "learning_rate": 4.995739241585002e-05, "loss": 1.4453, "step": 270 }, { "epoch": 0.10391104294478527, "grad_norm": 274.11651611328125, "learning_rate": 4.993608862377504e-05, "loss": 2.428, "step": 271 }, { "epoch": 0.10429447852760736, "grad_norm": 162.5637664794922, "learning_rate": 4.9914784831700046e-05, "loss": 1.8976, "step": 272 }, { "epoch": 0.10467791411042945, "grad_norm": 60.692230224609375, "learning_rate": 4.9893481039625056e-05, "loss": 1.1085, "step": 273 }, { "epoch": 0.10506134969325154, "grad_norm": 130.2882537841797, "learning_rate": 4.9872177247550065e-05, "loss": 1.9282, "step": 274 }, { "epoch": 0.10544478527607362, "grad_norm": 24.1922664642334, "learning_rate": 4.985087345547508e-05, "loss": 0.0541, "step": 275 }, { "epoch": 0.1058282208588957, "grad_norm": 105.14942169189453, "learning_rate": 4.982956966340009e-05, "loss": 1.2168, "step": 276 }, { "epoch": 0.10621165644171779, "grad_norm": 182.097412109375, "learning_rate": 4.98082658713251e-05, "loss": 1.5978, "step": 277 }, { "epoch": 0.10659509202453987, "grad_norm": 22.384475708007812, "learning_rate": 4.978696207925011e-05, "loss": 0.647, "step": 278 }, { "epoch": 
0.10697852760736197, "grad_norm": 15.088644981384277, "learning_rate": 4.976565828717512e-05, "loss": 0.1837, "step": 279 }, { "epoch": 0.10736196319018405, "grad_norm": 0.3369879424571991, "learning_rate": 4.9744354495100134e-05, "loss": 0.0031, "step": 280 }, { "epoch": 0.10774539877300614, "grad_norm": 199.0261688232422, "learning_rate": 4.9723050703025144e-05, "loss": 0.8648, "step": 281 }, { "epoch": 0.10812883435582822, "grad_norm": 3.4141273498535156, "learning_rate": 4.970174691095015e-05, "loss": 0.0096, "step": 282 }, { "epoch": 0.1085122699386503, "grad_norm": 1.4798744916915894, "learning_rate": 4.968044311887516e-05, "loss": 0.0094, "step": 283 }, { "epoch": 0.10889570552147239, "grad_norm": 0.30312761664390564, "learning_rate": 4.965913932680017e-05, "loss": 0.0032, "step": 284 }, { "epoch": 0.10927914110429447, "grad_norm": 0.24992112815380096, "learning_rate": 4.963783553472519e-05, "loss": 0.0017, "step": 285 }, { "epoch": 0.10966257668711657, "grad_norm": 248.25474548339844, "learning_rate": 4.96165317426502e-05, "loss": 1.9727, "step": 286 }, { "epoch": 0.11004601226993865, "grad_norm": 67.61776733398438, "learning_rate": 4.9595227950575206e-05, "loss": 0.8365, "step": 287 }, { "epoch": 0.11042944785276074, "grad_norm": 75.35614013671875, "learning_rate": 4.9573924158500216e-05, "loss": 0.0323, "step": 288 }, { "epoch": 0.11081288343558282, "grad_norm": 0.5646623373031616, "learning_rate": 4.955262036642523e-05, "loss": 0.0036, "step": 289 }, { "epoch": 0.1111963190184049, "grad_norm": 1.4315290451049805, "learning_rate": 4.953131657435024e-05, "loss": 0.0041, "step": 290 }, { "epoch": 0.11157975460122699, "grad_norm": 4.68117618560791, "learning_rate": 4.9510012782275244e-05, "loss": 0.0249, "step": 291 }, { "epoch": 0.11196319018404909, "grad_norm": 0.2679153084754944, "learning_rate": 4.948870899020025e-05, "loss": 0.0015, "step": 292 }, { "epoch": 0.11234662576687117, "grad_norm": 141.1145782470703, "learning_rate": 4.946740519812527e-05, 
"loss": 5.8057, "step": 293 }, { "epoch": 0.11273006134969325, "grad_norm": 54.23237228393555, "learning_rate": 4.944610140605028e-05, "loss": 1.1983, "step": 294 }, { "epoch": 0.11311349693251534, "grad_norm": 0.5340811610221863, "learning_rate": 4.942479761397529e-05, "loss": 0.0022, "step": 295 }, { "epoch": 0.11349693251533742, "grad_norm": 2.3286099433898926, "learning_rate": 4.94034938219003e-05, "loss": 0.0026, "step": 296 }, { "epoch": 0.1138803680981595, "grad_norm": 68.02838897705078, "learning_rate": 4.9382190029825306e-05, "loss": 0.7764, "step": 297 }, { "epoch": 0.1142638036809816, "grad_norm": 1.8361034393310547, "learning_rate": 4.936088623775032e-05, "loss": 0.0073, "step": 298 }, { "epoch": 0.11464723926380369, "grad_norm": 101.44828796386719, "learning_rate": 4.933958244567533e-05, "loss": 6.2578, "step": 299 }, { "epoch": 0.11503067484662577, "grad_norm": 1.1895840167999268, "learning_rate": 4.931827865360034e-05, "loss": 0.0022, "step": 300 }, { "epoch": 0.11541411042944785, "grad_norm": 157.90872192382812, "learning_rate": 4.929697486152535e-05, "loss": 1.6612, "step": 301 }, { "epoch": 0.11579754601226994, "grad_norm": 51.457881927490234, "learning_rate": 4.9275671069450366e-05, "loss": 1.0479, "step": 302 }, { "epoch": 0.11618098159509202, "grad_norm": 8.253484725952148, "learning_rate": 4.9254367277375376e-05, "loss": 0.0332, "step": 303 }, { "epoch": 0.1165644171779141, "grad_norm": 2.1906232833862305, "learning_rate": 4.9233063485300385e-05, "loss": 0.0039, "step": 304 }, { "epoch": 0.1169478527607362, "grad_norm": 5.369943141937256, "learning_rate": 4.9211759693225394e-05, "loss": 0.034, "step": 305 }, { "epoch": 0.11733128834355828, "grad_norm": 90.70282745361328, "learning_rate": 4.9190455901150404e-05, "loss": 2.1387, "step": 306 }, { "epoch": 0.11771472392638037, "grad_norm": 2.9550065994262695, "learning_rate": 4.916915210907542e-05, "loss": 0.0074, "step": 307 }, { "epoch": 0.11809815950920245, "grad_norm": 2.424849033355713, 
"learning_rate": 4.914784831700043e-05, "loss": 0.0033, "step": 308 }, { "epoch": 0.11848159509202454, "grad_norm": 2.2970473766326904, "learning_rate": 4.912654452492544e-05, "loss": 0.0086, "step": 309 }, { "epoch": 0.11886503067484662, "grad_norm": 3.276686429977417, "learning_rate": 4.910524073285045e-05, "loss": 0.0124, "step": 310 }, { "epoch": 0.11924846625766872, "grad_norm": 42.38562774658203, "learning_rate": 4.908393694077546e-05, "loss": 0.6455, "step": 311 }, { "epoch": 0.1196319018404908, "grad_norm": 5.350261211395264, "learning_rate": 4.906263314870047e-05, "loss": 0.0195, "step": 312 }, { "epoch": 0.12001533742331288, "grad_norm": 0.3664326071739197, "learning_rate": 4.904132935662548e-05, "loss": 0.0021, "step": 313 }, { "epoch": 0.12039877300613497, "grad_norm": 0.39626434445381165, "learning_rate": 4.902002556455049e-05, "loss": 0.0037, "step": 314 }, { "epoch": 0.12078220858895705, "grad_norm": 2.8121376037597656, "learning_rate": 4.89987217724755e-05, "loss": 0.0184, "step": 315 }, { "epoch": 0.12116564417177914, "grad_norm": 3.150080919265747, "learning_rate": 4.897741798040052e-05, "loss": 0.006, "step": 316 }, { "epoch": 0.12154907975460123, "grad_norm": 0.7951256632804871, "learning_rate": 4.8956114188325526e-05, "loss": 0.0028, "step": 317 }, { "epoch": 0.12193251533742332, "grad_norm": 3.72554349899292, "learning_rate": 4.8934810396250536e-05, "loss": 0.0047, "step": 318 }, { "epoch": 0.1223159509202454, "grad_norm": 1.347676157951355, "learning_rate": 4.8913506604175545e-05, "loss": 0.0042, "step": 319 }, { "epoch": 0.12269938650306748, "grad_norm": 16.904159545898438, "learning_rate": 4.8892202812100554e-05, "loss": 0.6064, "step": 320 }, { "epoch": 0.12308282208588957, "grad_norm": 7.208027362823486, "learning_rate": 4.887089902002557e-05, "loss": 0.0188, "step": 321 }, { "epoch": 0.12346625766871165, "grad_norm": 1.5120185613632202, "learning_rate": 4.884959522795058e-05, "loss": 0.0061, "step": 322 }, { "epoch": 0.12384969325153375, 
"grad_norm": 1.062395453453064, "learning_rate": 4.882829143587559e-05, "loss": 0.004, "step": 323 }, { "epoch": 0.12423312883435583, "grad_norm": 1.7264481782913208, "learning_rate": 4.88069876438006e-05, "loss": 0.0023, "step": 324 }, { "epoch": 0.12461656441717792, "grad_norm": 1.1984316110610962, "learning_rate": 4.878568385172561e-05, "loss": 0.0044, "step": 325 }, { "epoch": 0.125, "grad_norm": 0.9304490089416504, "learning_rate": 4.8764380059650624e-05, "loss": 0.0041, "step": 326 }, { "epoch": 0.12538343558282208, "grad_norm": 0.3947988748550415, "learning_rate": 4.874307626757563e-05, "loss": 0.0022, "step": 327 }, { "epoch": 0.12576687116564417, "grad_norm": 1.1371535062789917, "learning_rate": 4.872177247550064e-05, "loss": 0.0065, "step": 328 }, { "epoch": 0.12615030674846625, "grad_norm": 0.2662462592124939, "learning_rate": 4.870046868342565e-05, "loss": 0.0013, "step": 329 }, { "epoch": 0.12653374233128833, "grad_norm": 82.56263732910156, "learning_rate": 4.867916489135067e-05, "loss": 1.4531, "step": 330 }, { "epoch": 0.12691717791411042, "grad_norm": 269.9754638671875, "learning_rate": 4.865786109927568e-05, "loss": 2.0977, "step": 331 }, { "epoch": 0.1273006134969325, "grad_norm": 2.6148176193237305, "learning_rate": 4.8636557307200686e-05, "loss": 0.0096, "step": 332 }, { "epoch": 0.1276840490797546, "grad_norm": 220.37306213378906, "learning_rate": 4.8615253515125696e-05, "loss": 2.211, "step": 333 }, { "epoch": 0.1280674846625767, "grad_norm": 0.2989043891429901, "learning_rate": 4.8593949723050705e-05, "loss": 0.0025, "step": 334 }, { "epoch": 0.12845092024539878, "grad_norm": 113.23271942138672, "learning_rate": 4.857264593097572e-05, "loss": 1.9356, "step": 335 }, { "epoch": 0.12883435582822086, "grad_norm": 69.79551696777344, "learning_rate": 4.8551342138900724e-05, "loss": 1.0248, "step": 336 }, { "epoch": 0.12921779141104295, "grad_norm": 5.828656196594238, "learning_rate": 4.853003834682573e-05, "loss": 0.0179, "step": 337 }, { "epoch": 
0.12960122699386503, "grad_norm": 8.144012451171875, "learning_rate": 4.850873455475074e-05, "loss": 0.0617, "step": 338 }, { "epoch": 0.12998466257668712, "grad_norm": 22.994976043701172, "learning_rate": 4.848743076267576e-05, "loss": 0.0673, "step": 339 }, { "epoch": 0.1303680981595092, "grad_norm": 40.77123260498047, "learning_rate": 4.846612697060077e-05, "loss": 0.8135, "step": 340 }, { "epoch": 0.13075153374233128, "grad_norm": 0.17868733406066895, "learning_rate": 4.844482317852578e-05, "loss": 0.0012, "step": 341 }, { "epoch": 0.13113496932515337, "grad_norm": 89.55314636230469, "learning_rate": 4.8423519386450786e-05, "loss": 1.2852, "step": 342 }, { "epoch": 0.13151840490797545, "grad_norm": 85.80487823486328, "learning_rate": 4.84022155943758e-05, "loss": 1.4805, "step": 343 }, { "epoch": 0.13190184049079753, "grad_norm": 39.61482238769531, "learning_rate": 4.838091180230081e-05, "loss": 1.0889, "step": 344 }, { "epoch": 0.13228527607361965, "grad_norm": 22.226238250732422, "learning_rate": 4.835960801022582e-05, "loss": 0.6861, "step": 345 }, { "epoch": 0.13266871165644173, "grad_norm": 100.61962127685547, "learning_rate": 4.833830421815083e-05, "loss": 1.5889, "step": 346 }, { "epoch": 0.1330521472392638, "grad_norm": 0.11054955422878265, "learning_rate": 4.831700042607584e-05, "loss": 0.0011, "step": 347 }, { "epoch": 0.1334355828220859, "grad_norm": 10.697381019592285, "learning_rate": 4.8295696634000856e-05, "loss": 0.0584, "step": 348 }, { "epoch": 0.13381901840490798, "grad_norm": 3.288479804992676, "learning_rate": 4.8274392841925865e-05, "loss": 0.0072, "step": 349 }, { "epoch": 0.13420245398773006, "grad_norm": 130.9109649658203, "learning_rate": 4.8253089049850874e-05, "loss": 1.1807, "step": 350 }, { "epoch": 0.13458588957055215, "grad_norm": 0.38042566180229187, "learning_rate": 4.8231785257775884e-05, "loss": 0.0021, "step": 351 }, { "epoch": 0.13496932515337423, "grad_norm": 128.2454376220703, "learning_rate": 4.821048146570089e-05, 
"loss": 1.8008, "step": 352 }, { "epoch": 0.13535276073619631, "grad_norm": 24.663909912109375, "learning_rate": 4.818917767362591e-05, "loss": 0.8667, "step": 353 }, { "epoch": 0.1357361963190184, "grad_norm": 80.28264617919922, "learning_rate": 4.816787388155092e-05, "loss": 0.9551, "step": 354 }, { "epoch": 0.13611963190184048, "grad_norm": Infinity, "learning_rate": 4.816787388155092e-05, "loss": 2.5081, "step": 355 }, { "epoch": 0.13650306748466257, "grad_norm": 0.5318364500999451, "learning_rate": 4.814657008947593e-05, "loss": 0.0032, "step": 356 }, { "epoch": 0.13688650306748465, "grad_norm": 0.21274921298027039, "learning_rate": 4.812526629740094e-05, "loss": 0.0022, "step": 357 }, { "epoch": 0.13726993865030676, "grad_norm": 2.563629627227783, "learning_rate": 4.810396250532595e-05, "loss": 0.0094, "step": 358 }, { "epoch": 0.13765337423312884, "grad_norm": 0.35965976119041443, "learning_rate": 4.808265871325096e-05, "loss": 0.0016, "step": 359 }, { "epoch": 0.13803680981595093, "grad_norm": 4.237149238586426, "learning_rate": 4.806135492117597e-05, "loss": 0.017, "step": 360 }, { "epoch": 0.138420245398773, "grad_norm": 3.40621280670166, "learning_rate": 4.804005112910098e-05, "loss": 0.0052, "step": 361 }, { "epoch": 0.1388036809815951, "grad_norm": 38.05250930786133, "learning_rate": 4.801874733702599e-05, "loss": 0.6519, "step": 362 }, { "epoch": 0.13918711656441718, "grad_norm": 35.53778076171875, "learning_rate": 4.7997443544951006e-05, "loss": 0.8091, "step": 363 }, { "epoch": 0.13957055214723926, "grad_norm": 93.23106384277344, "learning_rate": 4.7976139752876016e-05, "loss": 1.4649, "step": 364 }, { "epoch": 0.13995398773006135, "grad_norm": 10.391888618469238, "learning_rate": 4.7954835960801025e-05, "loss": 0.0904, "step": 365 }, { "epoch": 0.14033742331288343, "grad_norm": 87.87434387207031, "learning_rate": 4.7933532168726034e-05, "loss": 0.8394, "step": 366 }, { "epoch": 0.14072085889570551, "grad_norm": 186.20010375976562, "learning_rate": 
4.791222837665105e-05, "loss": 3.8704, "step": 367 }, { "epoch": 0.1411042944785276, "grad_norm": 9.14842700958252, "learning_rate": 4.789092458457606e-05, "loss": 0.0408, "step": 368 }, { "epoch": 0.14148773006134968, "grad_norm": 17.004417419433594, "learning_rate": 4.786962079250107e-05, "loss": 0.1443, "step": 369 }, { "epoch": 0.1418711656441718, "grad_norm": 65.06951141357422, "learning_rate": 4.784831700042608e-05, "loss": 1.1856, "step": 370 }, { "epoch": 0.14225460122699388, "grad_norm": 139.8549346923828, "learning_rate": 4.782701320835109e-05, "loss": 6.0264, "step": 371 }, { "epoch": 0.14263803680981596, "grad_norm": 0.6574199795722961, "learning_rate": 4.7805709416276104e-05, "loss": 0.0021, "step": 372 }, { "epoch": 0.14302147239263804, "grad_norm": 3.178603410720825, "learning_rate": 4.778440562420111e-05, "loss": 0.011, "step": 373 }, { "epoch": 0.14340490797546013, "grad_norm": 47.10575866699219, "learning_rate": 4.776310183212612e-05, "loss": 0.6182, "step": 374 }, { "epoch": 0.1437883435582822, "grad_norm": 0.21282723546028137, "learning_rate": 4.774179804005113e-05, "loss": 0.0016, "step": 375 }, { "epoch": 0.1441717791411043, "grad_norm": 1.779011845588684, "learning_rate": 4.772049424797614e-05, "loss": 0.0061, "step": 376 }, { "epoch": 0.14455521472392638, "grad_norm": 0.5587969422340393, "learning_rate": 4.769919045590116e-05, "loss": 0.0016, "step": 377 }, { "epoch": 0.14493865030674846, "grad_norm": 171.7791290283203, "learning_rate": 4.7677886663826166e-05, "loss": 1.4456, "step": 378 }, { "epoch": 0.14532208588957055, "grad_norm": 20.514801025390625, "learning_rate": 4.7656582871751176e-05, "loss": 0.0603, "step": 379 }, { "epoch": 0.14570552147239263, "grad_norm": 23.678197860717773, "learning_rate": 4.7635279079676185e-05, "loss": 0.5738, "step": 380 }, { "epoch": 0.1460889570552147, "grad_norm": 4.139899253845215, "learning_rate": 4.76139752876012e-05, "loss": 0.0305, "step": 381 }, { "epoch": 0.1464723926380368, "grad_norm": 
1.6574698686599731, "learning_rate": 4.7592671495526204e-05, "loss": 0.0023, "step": 382 }, { "epoch": 0.1468558282208589, "grad_norm": 0.47582539916038513, "learning_rate": 4.757136770345121e-05, "loss": 0.0028, "step": 383 }, { "epoch": 0.147239263803681, "grad_norm": 8.048569679260254, "learning_rate": 4.755006391137622e-05, "loss": 0.1513, "step": 384 }, { "epoch": 0.14762269938650308, "grad_norm": 262.814208984375, "learning_rate": 4.752876011930124e-05, "loss": 5.9023, "step": 385 }, { "epoch": 0.14800613496932516, "grad_norm": 0.6565395593643188, "learning_rate": 4.750745632722625e-05, "loss": 0.0025, "step": 386 }, { "epoch": 0.14838957055214724, "grad_norm": 6.76060152053833, "learning_rate": 4.748615253515126e-05, "loss": 0.2013, "step": 387 }, { "epoch": 0.14877300613496933, "grad_norm": 254.4228515625, "learning_rate": 4.7464848743076266e-05, "loss": 1.6953, "step": 388 }, { "epoch": 0.1491564417177914, "grad_norm": 36.017982482910156, "learning_rate": 4.7443544951001276e-05, "loss": 0.6831, "step": 389 }, { "epoch": 0.1495398773006135, "grad_norm": 1.1480233669281006, "learning_rate": 4.742224115892629e-05, "loss": 0.0034, "step": 390 }, { "epoch": 0.14992331288343558, "grad_norm": 0.12915252149105072, "learning_rate": 4.74009373668513e-05, "loss": 0.0012, "step": 391 }, { "epoch": 0.15030674846625766, "grad_norm": 0.6576260924339294, "learning_rate": 4.737963357477631e-05, "loss": 0.0039, "step": 392 }, { "epoch": 0.15069018404907975, "grad_norm": 0.5759190917015076, "learning_rate": 4.735832978270132e-05, "loss": 0.0029, "step": 393 }, { "epoch": 0.15107361963190183, "grad_norm": 9.682991981506348, "learning_rate": 4.7337025990626336e-05, "loss": 0.0185, "step": 394 }, { "epoch": 0.15145705521472394, "grad_norm": 146.7510986328125, "learning_rate": 4.7315722198551345e-05, "loss": 6.0566, "step": 395 }, { "epoch": 0.15184049079754602, "grad_norm": 105.06553649902344, "learning_rate": 4.7294418406476354e-05, "loss": 1.7637, "step": 396 }, { "epoch": 
0.1522239263803681, "grad_norm": 0.605169415473938, "learning_rate": 4.7273114614401364e-05, "loss": 0.0023, "step": 397 }, { "epoch": 0.1526073619631902, "grad_norm": 7.71505880355835, "learning_rate": 4.725181082232637e-05, "loss": 0.0183, "step": 398 }, { "epoch": 0.15299079754601227, "grad_norm": 9.905267715454102, "learning_rate": 4.723050703025139e-05, "loss": 0.015, "step": 399 }, { "epoch": 0.15337423312883436, "grad_norm": 142.18589782714844, "learning_rate": 4.72092032381764e-05, "loss": 1.415, "step": 400 }, { "epoch": 0.15375766871165644, "grad_norm": 1.3648536205291748, "learning_rate": 4.718789944610141e-05, "loss": 0.0038, "step": 401 }, { "epoch": 0.15414110429447853, "grad_norm": 126.9765396118164, "learning_rate": 4.716659565402642e-05, "loss": 1.7979, "step": 402 }, { "epoch": 0.1545245398773006, "grad_norm": 45.10152053833008, "learning_rate": 4.7145291861951426e-05, "loss": 0.7524, "step": 403 }, { "epoch": 0.1549079754601227, "grad_norm": 0.40764355659484863, "learning_rate": 4.712398806987644e-05, "loss": 0.0022, "step": 404 }, { "epoch": 0.15529141104294478, "grad_norm": 2.1609463691711426, "learning_rate": 4.710268427780145e-05, "loss": 0.0041, "step": 405 }, { "epoch": 0.15567484662576686, "grad_norm": 4.3262434005737305, "learning_rate": 4.708138048572646e-05, "loss": 0.0314, "step": 406 }, { "epoch": 0.15605828220858894, "grad_norm": 0.5598259568214417, "learning_rate": 4.706007669365147e-05, "loss": 0.0028, "step": 407 }, { "epoch": 0.15644171779141106, "grad_norm": 1.6508883237838745, "learning_rate": 4.7038772901576486e-05, "loss": 0.0066, "step": 408 }, { "epoch": 0.15682515337423314, "grad_norm": 11.46448040008545, "learning_rate": 4.7017469109501496e-05, "loss": 0.0401, "step": 409 }, { "epoch": 0.15720858895705522, "grad_norm": 165.40843200683594, "learning_rate": 4.6996165317426505e-05, "loss": 3.3642, "step": 410 }, { "epoch": 0.1575920245398773, "grad_norm": 78.23701477050781, "learning_rate": 4.6974861525351514e-05, "loss": 
1.3516, "step": 411 }, { "epoch": 0.1579754601226994, "grad_norm": 0.5470488667488098, "learning_rate": 4.6953557733276524e-05, "loss": 0.0019, "step": 412 }, { "epoch": 0.15835889570552147, "grad_norm": 298.1001281738281, "learning_rate": 4.693225394120154e-05, "loss": 1.5518, "step": 413 }, { "epoch": 0.15874233128834356, "grad_norm": 0.11850805580615997, "learning_rate": 4.691095014912655e-05, "loss": 0.0018, "step": 414 }, { "epoch": 0.15912576687116564, "grad_norm": 133.16050720214844, "learning_rate": 4.688964635705156e-05, "loss": 1.5381, "step": 415 }, { "epoch": 0.15950920245398773, "grad_norm": 0.5266661643981934, "learning_rate": 4.686834256497657e-05, "loss": 0.0033, "step": 416 }, { "epoch": 0.1598926380368098, "grad_norm": 28.62251091003418, "learning_rate": 4.684703877290158e-05, "loss": 0.583, "step": 417 }, { "epoch": 0.1602760736196319, "grad_norm": 3.70107364654541, "learning_rate": 4.682573498082659e-05, "loss": 0.0058, "step": 418 }, { "epoch": 0.16065950920245398, "grad_norm": 2.993849754333496, "learning_rate": 4.68044311887516e-05, "loss": 0.017, "step": 419 }, { "epoch": 0.16104294478527606, "grad_norm": 95.91368103027344, "learning_rate": 4.678312739667661e-05, "loss": 1.3858, "step": 420 }, { "epoch": 0.16142638036809817, "grad_norm": 379.5052185058594, "learning_rate": 4.676182360460162e-05, "loss": 6.4629, "step": 421 }, { "epoch": 0.16180981595092025, "grad_norm": 12.950861930847168, "learning_rate": 4.674051981252664e-05, "loss": 0.0474, "step": 422 }, { "epoch": 0.16219325153374234, "grad_norm": 161.7283935546875, "learning_rate": 4.6719216020451646e-05, "loss": 2.4474, "step": 423 }, { "epoch": 0.16257668711656442, "grad_norm": 0.9702209830284119, "learning_rate": 4.6697912228376656e-05, "loss": 0.0026, "step": 424 }, { "epoch": 0.1629601226993865, "grad_norm": 0.5118975043296814, "learning_rate": 4.6676608436301665e-05, "loss": 0.002, "step": 425 }, { "epoch": 0.1633435582822086, "grad_norm": 24.19846534729004, "learning_rate": 
4.6655304644226674e-05, "loss": 0.6104, "step": 426 }, { "epoch": 0.16372699386503067, "grad_norm": 74.22824096679688, "learning_rate": 4.6634000852151684e-05, "loss": 0.7046, "step": 427 }, { "epoch": 0.16411042944785276, "grad_norm": 0.7992326617240906, "learning_rate": 4.661269706007669e-05, "loss": 0.0033, "step": 428 }, { "epoch": 0.16449386503067484, "grad_norm": 0.5454573631286621, "learning_rate": 4.65913932680017e-05, "loss": 0.0028, "step": 429 }, { "epoch": 0.16487730061349692, "grad_norm": 0.4567127525806427, "learning_rate": 4.657008947592671e-05, "loss": 0.0021, "step": 430 }, { "epoch": 0.165260736196319, "grad_norm": 149.2803955078125, "learning_rate": 4.654878568385173e-05, "loss": 1.0586, "step": 431 }, { "epoch": 0.1656441717791411, "grad_norm": 134.65061950683594, "learning_rate": 4.652748189177674e-05, "loss": 1.1211, "step": 432 }, { "epoch": 0.1660276073619632, "grad_norm": 138.16680908203125, "learning_rate": 4.6506178099701746e-05, "loss": 5.3633, "step": 433 }, { "epoch": 0.1664110429447853, "grad_norm": 1.2007067203521729, "learning_rate": 4.6484874307626756e-05, "loss": 0.0057, "step": 434 }, { "epoch": 0.16679447852760737, "grad_norm": 0.8257370591163635, "learning_rate": 4.646357051555177e-05, "loss": 0.0028, "step": 435 }, { "epoch": 0.16717791411042945, "grad_norm": 375.18988037109375, "learning_rate": 4.644226672347678e-05, "loss": 5.9131, "step": 436 }, { "epoch": 0.16756134969325154, "grad_norm": 17.690185546875, "learning_rate": 4.642096293140179e-05, "loss": 0.0275, "step": 437 }, { "epoch": 0.16794478527607362, "grad_norm": 134.84230041503906, "learning_rate": 4.63996591393268e-05, "loss": 1.8117, "step": 438 }, { "epoch": 0.1683282208588957, "grad_norm": 178.44369506835938, "learning_rate": 4.637835534725181e-05, "loss": 5.9438, "step": 439 }, { "epoch": 0.1687116564417178, "grad_norm": 9.95478630065918, "learning_rate": 4.6357051555176825e-05, "loss": 0.0455, "step": 440 }, { "epoch": 0.16909509202453987, "grad_norm": 
65.96708679199219, "learning_rate": 4.6335747763101834e-05, "loss": 1.1426, "step": 441 }, { "epoch": 0.16947852760736196, "grad_norm": 36.904945373535156, "learning_rate": 4.6314443971026844e-05, "loss": 0.6465, "step": 442 }, { "epoch": 0.16986196319018404, "grad_norm": 3.5175623893737793, "learning_rate": 4.629314017895185e-05, "loss": 0.0048, "step": 443 }, { "epoch": 0.17024539877300612, "grad_norm": 1.2675668001174927, "learning_rate": 4.627183638687686e-05, "loss": 0.0065, "step": 444 }, { "epoch": 0.1706288343558282, "grad_norm": 95.76133728027344, "learning_rate": 4.625053259480188e-05, "loss": 5.6777, "step": 445 }, { "epoch": 0.17101226993865032, "grad_norm": 1.1783291101455688, "learning_rate": 4.622922880272689e-05, "loss": 0.0027, "step": 446 }, { "epoch": 0.1713957055214724, "grad_norm": 47.477622985839844, "learning_rate": 4.62079250106519e-05, "loss": 0.9526, "step": 447 }, { "epoch": 0.17177914110429449, "grad_norm": 20.091808319091797, "learning_rate": 4.6186621218576906e-05, "loss": 0.5361, "step": 448 }, { "epoch": 0.17216257668711657, "grad_norm": 0.798278272151947, "learning_rate": 4.616531742650192e-05, "loss": 0.0032, "step": 449 }, { "epoch": 0.17254601226993865, "grad_norm": 7.0091776847839355, "learning_rate": 4.614401363442693e-05, "loss": 0.0322, "step": 450 }, { "epoch": 0.17292944785276074, "grad_norm": 11.215709686279297, "learning_rate": 4.612270984235194e-05, "loss": 0.5532, "step": 451 }, { "epoch": 0.17331288343558282, "grad_norm": 11.881379127502441, "learning_rate": 4.610140605027695e-05, "loss": 0.1141, "step": 452 }, { "epoch": 0.1736963190184049, "grad_norm": 3.7086267471313477, "learning_rate": 4.608010225820196e-05, "loss": 0.0232, "step": 453 }, { "epoch": 0.174079754601227, "grad_norm": 116.72730255126953, "learning_rate": 4.6058798466126976e-05, "loss": 5.5176, "step": 454 }, { "epoch": 0.17446319018404907, "grad_norm": 150.83160400390625, "learning_rate": 4.6037494674051985e-05, "loss": 6.5215, "step": 455 }, { 
"epoch": 0.17484662576687116, "grad_norm": 0.8595237135887146, "learning_rate": 4.6016190881976994e-05, "loss": 0.0028, "step": 456 }, { "epoch": 0.17523006134969324, "grad_norm": 0.31871017813682556, "learning_rate": 4.5994887089902004e-05, "loss": 0.0015, "step": 457 }, { "epoch": 0.17561349693251535, "grad_norm": 0.3064028322696686, "learning_rate": 4.597358329782702e-05, "loss": 0.0022, "step": 458 }, { "epoch": 0.17599693251533743, "grad_norm": 7.268013954162598, "learning_rate": 4.595227950575203e-05, "loss": 0.0172, "step": 459 }, { "epoch": 0.17638036809815952, "grad_norm": 141.68385314941406, "learning_rate": 4.593097571367704e-05, "loss": 1.1113, "step": 460 }, { "epoch": 0.1767638036809816, "grad_norm": 179.11373901367188, "learning_rate": 4.590967192160205e-05, "loss": 1.4815, "step": 461 }, { "epoch": 0.17714723926380369, "grad_norm": 88.66524505615234, "learning_rate": 4.588836812952706e-05, "loss": 0.9683, "step": 462 }, { "epoch": 0.17753067484662577, "grad_norm": 61.23609161376953, "learning_rate": 4.586706433745207e-05, "loss": 0.6357, "step": 463 }, { "epoch": 0.17791411042944785, "grad_norm": 4.37770414352417, "learning_rate": 4.584576054537708e-05, "loss": 0.031, "step": 464 }, { "epoch": 0.17829754601226994, "grad_norm": 3.8708748817443848, "learning_rate": 4.582445675330209e-05, "loss": 0.0144, "step": 465 }, { "epoch": 0.17868098159509202, "grad_norm": 0.21280257403850555, "learning_rate": 4.58031529612271e-05, "loss": 0.0017, "step": 466 }, { "epoch": 0.1790644171779141, "grad_norm": 35.33317947387695, "learning_rate": 4.578184916915211e-05, "loss": 0.6538, "step": 467 }, { "epoch": 0.1794478527607362, "grad_norm": 47.658287048339844, "learning_rate": 4.5760545377077126e-05, "loss": 0.9697, "step": 468 }, { "epoch": 0.17983128834355827, "grad_norm": 24.110111236572266, "learning_rate": 4.5739241585002136e-05, "loss": 0.6646, "step": 469 }, { "epoch": 0.18021472392638035, "grad_norm": 0.593888521194458, "learning_rate": 
4.5717937792927145e-05, "loss": 0.002, "step": 470 }, { "epoch": 0.18059815950920247, "grad_norm": 1.9042633771896362, "learning_rate": 4.5696634000852154e-05, "loss": 0.0037, "step": 471 }, { "epoch": 0.18098159509202455, "grad_norm": 3.080111026763916, "learning_rate": 4.567533020877717e-05, "loss": 0.0084, "step": 472 }, { "epoch": 0.18136503067484663, "grad_norm": 1.5383565425872803, "learning_rate": 4.565402641670217e-05, "loss": 0.0064, "step": 473 }, { "epoch": 0.18174846625766872, "grad_norm": 0.2783668637275696, "learning_rate": 4.563272262462718e-05, "loss": 0.0025, "step": 474 }, { "epoch": 0.1821319018404908, "grad_norm": 97.4244613647461, "learning_rate": 4.561141883255219e-05, "loss": 6.6191, "step": 475 }, { "epoch": 0.18251533742331288, "grad_norm": 55.277164459228516, "learning_rate": 4.559011504047721e-05, "loss": 1.1631, "step": 476 }, { "epoch": 0.18289877300613497, "grad_norm": 6.295434951782227, "learning_rate": 4.556881124840222e-05, "loss": 0.0353, "step": 477 }, { "epoch": 0.18328220858895705, "grad_norm": 19.35044288635254, "learning_rate": 4.5547507456327226e-05, "loss": 0.7227, "step": 478 }, { "epoch": 0.18366564417177914, "grad_norm": 1.969385027885437, "learning_rate": 4.5526203664252236e-05, "loss": 0.0086, "step": 479 }, { "epoch": 0.18404907975460122, "grad_norm": 6.183859348297119, "learning_rate": 4.5504899872177245e-05, "loss": 0.0301, "step": 480 }, { "epoch": 0.1844325153374233, "grad_norm": 49.332706451416016, "learning_rate": 4.548359608010226e-05, "loss": 1.1514, "step": 481 }, { "epoch": 0.1848159509202454, "grad_norm": 0.09144336730241776, "learning_rate": 4.546229228802727e-05, "loss": 0.001, "step": 482 }, { "epoch": 0.1851993865030675, "grad_norm": 227.00379943847656, "learning_rate": 4.544098849595228e-05, "loss": 6.0859, "step": 483 }, { "epoch": 0.18558282208588958, "grad_norm": 13.271467208862305, "learning_rate": 4.541968470387729e-05, "loss": 0.0679, "step": 484 }, { "epoch": 0.18596625766871167, "grad_norm": 
6.430564880371094, "learning_rate": 4.5398380911802305e-05, "loss": 0.0404, "step": 485 }, { "epoch": 0.18634969325153375, "grad_norm": 97.63346862792969, "learning_rate": 4.5377077119727314e-05, "loss": 0.9849, "step": 486 }, { "epoch": 0.18673312883435583, "grad_norm": 12.30589485168457, "learning_rate": 4.5355773327652324e-05, "loss": 0.0462, "step": 487 }, { "epoch": 0.18711656441717792, "grad_norm": 44.606597900390625, "learning_rate": 4.533446953557733e-05, "loss": 0.6133, "step": 488 }, { "epoch": 0.1875, "grad_norm": 0.7023065686225891, "learning_rate": 4.531316574350234e-05, "loss": 0.003, "step": 489 }, { "epoch": 0.18788343558282208, "grad_norm": 407.8355407714844, "learning_rate": 4.529186195142736e-05, "loss": 2.3166, "step": 490 }, { "epoch": 0.18826687116564417, "grad_norm": 100.24420166015625, "learning_rate": 4.527055815935237e-05, "loss": 1.46, "step": 491 }, { "epoch": 0.18865030674846625, "grad_norm": 1.02791166305542, "learning_rate": 4.524925436727738e-05, "loss": 0.0033, "step": 492 }, { "epoch": 0.18903374233128833, "grad_norm": 151.98976135253906, "learning_rate": 4.5227950575202386e-05, "loss": 2.4806, "step": 493 }, { "epoch": 0.18941717791411042, "grad_norm": 0.4681469202041626, "learning_rate": 4.5206646783127396e-05, "loss": 0.0029, "step": 494 }, { "epoch": 0.1898006134969325, "grad_norm": 92.45951080322266, "learning_rate": 4.518534299105241e-05, "loss": 1.5801, "step": 495 }, { "epoch": 0.1901840490797546, "grad_norm": 44.523433685302734, "learning_rate": 4.516403919897742e-05, "loss": 1.1572, "step": 496 }, { "epoch": 0.1905674846625767, "grad_norm": 11.493820190429688, "learning_rate": 4.514273540690243e-05, "loss": 0.0422, "step": 497 }, { "epoch": 0.19095092024539878, "grad_norm": 202.0272674560547, "learning_rate": 4.512143161482744e-05, "loss": 2.0394, "step": 498 }, { "epoch": 0.19133435582822086, "grad_norm": 13.105687141418457, "learning_rate": 4.5100127822752456e-05, "loss": 0.0464, "step": 499 }, { "epoch": 
0.19171779141104295, "grad_norm": 4.247876167297363, "learning_rate": 4.5078824030677465e-05, "loss": 0.0123, "step": 500 }, { "epoch": 0.19210122699386503, "grad_norm": 228.3590850830078, "learning_rate": 4.5057520238602474e-05, "loss": 1.4082, "step": 501 }, { "epoch": 0.19248466257668712, "grad_norm": 3.032365322113037, "learning_rate": 4.5036216446527484e-05, "loss": 0.0085, "step": 502 }, { "epoch": 0.1928680981595092, "grad_norm": 0.6577479243278503, "learning_rate": 4.501491265445249e-05, "loss": 0.0024, "step": 503 }, { "epoch": 0.19325153374233128, "grad_norm": 89.23050689697266, "learning_rate": 4.499360886237751e-05, "loss": 5.9355, "step": 504 }, { "epoch": 0.19363496932515337, "grad_norm": 274.9822692871094, "learning_rate": 4.497230507030252e-05, "loss": 2.0099, "step": 505 }, { "epoch": 0.19401840490797545, "grad_norm": 2.5567502975463867, "learning_rate": 4.495100127822753e-05, "loss": 0.0053, "step": 506 }, { "epoch": 0.19440184049079753, "grad_norm": 45.34825134277344, "learning_rate": 4.492969748615254e-05, "loss": 0.9722, "step": 507 }, { "epoch": 0.19478527607361965, "grad_norm": 43.673038482666016, "learning_rate": 4.4908393694077546e-05, "loss": 0.6895, "step": 508 }, { "epoch": 0.19516871165644173, "grad_norm": 179.43748474121094, "learning_rate": 4.488708990200256e-05, "loss": 0.6533, "step": 509 }, { "epoch": 0.1955521472392638, "grad_norm": 2.264289140701294, "learning_rate": 4.486578610992757e-05, "loss": 0.0089, "step": 510 }, { "epoch": 0.1959355828220859, "grad_norm": 2.1404731273651123, "learning_rate": 4.484448231785258e-05, "loss": 0.0134, "step": 511 }, { "epoch": 0.19631901840490798, "grad_norm": 217.19174194335938, "learning_rate": 4.482317852577759e-05, "loss": 3.2687, "step": 512 }, { "epoch": 0.19670245398773006, "grad_norm": 63.18962097167969, "learning_rate": 4.4801874733702606e-05, "loss": 1.0537, "step": 513 }, { "epoch": 0.19708588957055215, "grad_norm": 53.48243713378906, "learning_rate": 4.4780570941627616e-05, "loss": 
0.7662, "step": 514 }, { "epoch": 0.19746932515337423, "grad_norm": 3.4618992805480957, "learning_rate": 4.4759267149552625e-05, "loss": 0.0125, "step": 515 }, { "epoch": 0.19785276073619631, "grad_norm": 4.8398756980896, "learning_rate": 4.4737963357477634e-05, "loss": 0.0358, "step": 516 }, { "epoch": 0.1982361963190184, "grad_norm": 107.578857421875, "learning_rate": 4.4716659565402644e-05, "loss": 0.959, "step": 517 }, { "epoch": 0.19861963190184048, "grad_norm": 1.9601085186004639, "learning_rate": 4.469535577332765e-05, "loss": 0.0099, "step": 518 }, { "epoch": 0.19900306748466257, "grad_norm": 17.420923233032227, "learning_rate": 4.467405198125266e-05, "loss": 0.1004, "step": 519 }, { "epoch": 0.19938650306748465, "grad_norm": 42.05288314819336, "learning_rate": 4.465274818917767e-05, "loss": 0.7842, "step": 520 }, { "epoch": 0.19976993865030676, "grad_norm": 4.815923690795898, "learning_rate": 4.463144439710268e-05, "loss": 0.0127, "step": 521 }, { "epoch": 0.20015337423312884, "grad_norm": 130.26187133789062, "learning_rate": 4.46101406050277e-05, "loss": 2.5907, "step": 522 }, { "epoch": 0.20053680981595093, "grad_norm": 133.83578491210938, "learning_rate": 4.4588836812952706e-05, "loss": 6.6577, "step": 523 }, { "epoch": 0.200920245398773, "grad_norm": 1.783935308456421, "learning_rate": 4.4567533020877716e-05, "loss": 0.0084, "step": 524 }, { "epoch": 0.2013036809815951, "grad_norm": 150.36233520507812, "learning_rate": 4.4546229228802725e-05, "loss": 1.8362, "step": 525 }, { "epoch": 0.20168711656441718, "grad_norm": 6.6197075843811035, "learning_rate": 4.452492543672774e-05, "loss": 0.0135, "step": 526 }, { "epoch": 0.20207055214723926, "grad_norm": 184.958740234375, "learning_rate": 4.450362164465275e-05, "loss": 1.9093, "step": 527 }, { "epoch": 0.20245398773006135, "grad_norm": 76.64862060546875, "learning_rate": 4.448231785257776e-05, "loss": 1.2227, "step": 528 }, { "epoch": 0.20283742331288343, "grad_norm": 0.4043022394180298, "learning_rate": 
4.446101406050277e-05, "loss": 0.0019, "step": 529 }, { "epoch": 0.20322085889570551, "grad_norm": 26.354055404663086, "learning_rate": 4.443971026842778e-05, "loss": 1.2315, "step": 530 }, { "epoch": 0.2036042944785276, "grad_norm": 85.78749084472656, "learning_rate": 4.4418406476352794e-05, "loss": 0.9126, "step": 531 }, { "epoch": 0.20398773006134968, "grad_norm": 0.17065221071243286, "learning_rate": 4.4397102684277804e-05, "loss": 0.0014, "step": 532 }, { "epoch": 0.2043711656441718, "grad_norm": 59.73079299926758, "learning_rate": 4.437579889220281e-05, "loss": 1.2969, "step": 533 }, { "epoch": 0.20475460122699388, "grad_norm": 3.527113437652588, "learning_rate": 4.435449510012782e-05, "loss": 0.0127, "step": 534 }, { "epoch": 0.20513803680981596, "grad_norm": 2.873439073562622, "learning_rate": 4.433319130805284e-05, "loss": 0.012, "step": 535 }, { "epoch": 0.20552147239263804, "grad_norm": 11.59239673614502, "learning_rate": 4.431188751597785e-05, "loss": 0.1285, "step": 536 }, { "epoch": 0.20590490797546013, "grad_norm": 0.4274378716945648, "learning_rate": 4.429058372390286e-05, "loss": 0.0013, "step": 537 }, { "epoch": 0.2062883435582822, "grad_norm": 19.355199813842773, "learning_rate": 4.4269279931827866e-05, "loss": 0.0459, "step": 538 }, { "epoch": 0.2066717791411043, "grad_norm": 2.7791860103607178, "learning_rate": 4.4247976139752876e-05, "loss": 0.0166, "step": 539 }, { "epoch": 0.20705521472392638, "grad_norm": 11.108470916748047, "learning_rate": 4.422667234767789e-05, "loss": 0.1113, "step": 540 }, { "epoch": 0.20743865030674846, "grad_norm": 29.45231819152832, "learning_rate": 4.42053685556029e-05, "loss": 0.7363, "step": 541 }, { "epoch": 0.20782208588957055, "grad_norm": 136.5975799560547, "learning_rate": 4.418406476352791e-05, "loss": 6.1973, "step": 542 }, { "epoch": 0.20820552147239263, "grad_norm": 3.0369179248809814, "learning_rate": 4.416276097145292e-05, "loss": 0.0062, "step": 543 }, { "epoch": 0.2085889570552147, "grad_norm": 
29.516555786132812, "learning_rate": 4.414145717937793e-05, "loss": 0.8848, "step": 544 }, { "epoch": 0.2089723926380368, "grad_norm": 5.653842926025391, "learning_rate": 4.4120153387302945e-05, "loss": 0.0231, "step": 545 }, { "epoch": 0.2093558282208589, "grad_norm": 111.30487823486328, "learning_rate": 4.4098849595227954e-05, "loss": 2.0919, "step": 546 }, { "epoch": 0.209739263803681, "grad_norm": 137.13394165039062, "learning_rate": 4.4077545803152964e-05, "loss": 1.6612, "step": 547 }, { "epoch": 0.21012269938650308, "grad_norm": 0.26130568981170654, "learning_rate": 4.405624201107797e-05, "loss": 0.0022, "step": 548 }, { "epoch": 0.21050613496932516, "grad_norm": 117.121826171875, "learning_rate": 4.403493821900299e-05, "loss": 1.127, "step": 549 }, { "epoch": 0.21088957055214724, "grad_norm": 0.17888779938220978, "learning_rate": 4.4013634426928e-05, "loss": 0.0019, "step": 550 }, { "epoch": 0.21127300613496933, "grad_norm": 2.819636821746826, "learning_rate": 4.399233063485301e-05, "loss": 0.0068, "step": 551 }, { "epoch": 0.2116564417177914, "grad_norm": 11.347158432006836, "learning_rate": 4.397102684277802e-05, "loss": 0.0568, "step": 552 }, { "epoch": 0.2120398773006135, "grad_norm": 1.9594674110412598, "learning_rate": 4.3949723050703026e-05, "loss": 0.0094, "step": 553 }, { "epoch": 0.21242331288343558, "grad_norm": 0.2811185419559479, "learning_rate": 4.392841925862804e-05, "loss": 0.0017, "step": 554 }, { "epoch": 0.21280674846625766, "grad_norm": 1.277053713798523, "learning_rate": 4.390711546655305e-05, "loss": 0.0055, "step": 555 }, { "epoch": 0.21319018404907975, "grad_norm": 197.99415588378906, "learning_rate": 4.388581167447806e-05, "loss": 1.8321, "step": 556 }, { "epoch": 0.21357361963190183, "grad_norm": 128.40423583984375, "learning_rate": 4.386450788240307e-05, "loss": 1.7901, "step": 557 }, { "epoch": 0.21395705521472394, "grad_norm": 1.1741713285446167, "learning_rate": 4.384320409032808e-05, "loss": 0.0036, "step": 558 }, { "epoch": 
0.21434049079754602, "grad_norm": 0.503452718257904, "learning_rate": 4.3821900298253096e-05, "loss": 0.0032, "step": 559 }, { "epoch": 0.2147239263803681, "grad_norm": 48.687339782714844, "learning_rate": 4.3800596506178105e-05, "loss": 0.6504, "step": 560 }, { "epoch": 0.2151073619631902, "grad_norm": 9.530661582946777, "learning_rate": 4.3779292714103114e-05, "loss": 0.082, "step": 561 }, { "epoch": 0.21549079754601227, "grad_norm": 0.6268191933631897, "learning_rate": 4.3757988922028124e-05, "loss": 0.0038, "step": 562 }, { "epoch": 0.21587423312883436, "grad_norm": 11.719134330749512, "learning_rate": 4.373668512995313e-05, "loss": 0.1386, "step": 563 }, { "epoch": 0.21625766871165644, "grad_norm": 1.8017444610595703, "learning_rate": 4.371538133787814e-05, "loss": 0.0058, "step": 564 }, { "epoch": 0.21664110429447853, "grad_norm": 59.35349655151367, "learning_rate": 4.369407754580315e-05, "loss": 1.1719, "step": 565 }, { "epoch": 0.2170245398773006, "grad_norm": 13.164896965026855, "learning_rate": 4.367277375372816e-05, "loss": 0.5293, "step": 566 }, { "epoch": 0.2174079754601227, "grad_norm": 97.70745086669922, "learning_rate": 4.365146996165318e-05, "loss": 2.3205, "step": 567 }, { "epoch": 0.21779141104294478, "grad_norm": 47.05236053466797, "learning_rate": 4.3630166169578186e-05, "loss": 0.6914, "step": 568 }, { "epoch": 0.21817484662576686, "grad_norm": 69.89529418945312, "learning_rate": 4.3608862377503196e-05, "loss": 1.9659, "step": 569 }, { "epoch": 0.21855828220858894, "grad_norm": 139.23280334472656, "learning_rate": 4.3587558585428205e-05, "loss": 1.8878, "step": 570 }, { "epoch": 0.21894171779141106, "grad_norm": 1.1655089855194092, "learning_rate": 4.3566254793353214e-05, "loss": 0.0067, "step": 571 }, { "epoch": 0.21932515337423314, "grad_norm": 100.34245300292969, "learning_rate": 4.354495100127823e-05, "loss": 0.8511, "step": 572 }, { "epoch": 0.21970858895705522, "grad_norm": 87.71229553222656, "learning_rate": 4.352364720920324e-05, 
"loss": 1.1719, "step": 573 }, { "epoch": 0.2200920245398773, "grad_norm": 54.652565002441406, "learning_rate": 4.350234341712825e-05, "loss": 0.8374, "step": 574 }, { "epoch": 0.2204754601226994, "grad_norm": 0.6231522560119629, "learning_rate": 4.348103962505326e-05, "loss": 0.0037, "step": 575 }, { "epoch": 0.22085889570552147, "grad_norm": 0.3720647692680359, "learning_rate": 4.3459735832978274e-05, "loss": 0.0018, "step": 576 }, { "epoch": 0.22124233128834356, "grad_norm": 142.35353088378906, "learning_rate": 4.3438432040903284e-05, "loss": 1.7725, "step": 577 }, { "epoch": 0.22162576687116564, "grad_norm": 55.13176727294922, "learning_rate": 4.341712824882829e-05, "loss": 0.7578, "step": 578 }, { "epoch": 0.22200920245398773, "grad_norm": 1.6380528211593628, "learning_rate": 4.33958244567533e-05, "loss": 0.0051, "step": 579 }, { "epoch": 0.2223926380368098, "grad_norm": 4.648425102233887, "learning_rate": 4.337452066467831e-05, "loss": 0.0107, "step": 580 }, { "epoch": 0.2227760736196319, "grad_norm": 153.6146697998047, "learning_rate": 4.335321687260333e-05, "loss": 2.2267, "step": 581 }, { "epoch": 0.22315950920245398, "grad_norm": 132.43736267089844, "learning_rate": 4.333191308052834e-05, "loss": 1.9737, "step": 582 }, { "epoch": 0.22354294478527606, "grad_norm": 138.14703369140625, "learning_rate": 4.3310609288453346e-05, "loss": 1.2608, "step": 583 }, { "epoch": 0.22392638036809817, "grad_norm": 108.3702392578125, "learning_rate": 4.3289305496378356e-05, "loss": 5.8828, "step": 584 }, { "epoch": 0.22430981595092025, "grad_norm": 0.3305304944515228, "learning_rate": 4.3268001704303365e-05, "loss": 0.0018, "step": 585 }, { "epoch": 0.22469325153374234, "grad_norm": 26.56437110900879, "learning_rate": 4.324669791222838e-05, "loss": 0.6597, "step": 586 }, { "epoch": 0.22507668711656442, "grad_norm": 15.87643814086914, "learning_rate": 4.322539412015339e-05, "loss": 0.0642, "step": 587 }, { "epoch": 0.2254601226993865, "grad_norm": 66.76805114746094, 
"learning_rate": 4.32040903280784e-05, "loss": 1.0527, "step": 588 }, { "epoch": 0.2258435582822086, "grad_norm": 57.99732208251953, "learning_rate": 4.318278653600341e-05, "loss": 0.7607, "step": 589 }, { "epoch": 0.22622699386503067, "grad_norm": 3.2562761306762695, "learning_rate": 4.3161482743928425e-05, "loss": 0.0178, "step": 590 }, { "epoch": 0.22661042944785276, "grad_norm": 226.23036193847656, "learning_rate": 4.3140178951853434e-05, "loss": 1.9583, "step": 591 }, { "epoch": 0.22699386503067484, "grad_norm": 45.63861846923828, "learning_rate": 4.3118875159778444e-05, "loss": 0.6699, "step": 592 }, { "epoch": 0.22737730061349692, "grad_norm": 1.0839177370071411, "learning_rate": 4.309757136770345e-05, "loss": 0.0045, "step": 593 }, { "epoch": 0.227760736196319, "grad_norm": 10.099230766296387, "learning_rate": 4.307626757562846e-05, "loss": 0.0467, "step": 594 }, { "epoch": 0.2281441717791411, "grad_norm": 66.15339660644531, "learning_rate": 4.305496378355348e-05, "loss": 0.7017, "step": 595 }, { "epoch": 0.2285276073619632, "grad_norm": 0.2999923527240753, "learning_rate": 4.303365999147849e-05, "loss": 0.0016, "step": 596 }, { "epoch": 0.2289110429447853, "grad_norm": 13.346380233764648, "learning_rate": 4.30123561994035e-05, "loss": 0.0342, "step": 597 }, { "epoch": 0.22929447852760737, "grad_norm": 2.257432699203491, "learning_rate": 4.2991052407328506e-05, "loss": 0.0107, "step": 598 }, { "epoch": 0.22967791411042945, "grad_norm": 1.0266526937484741, "learning_rate": 4.2969748615253516e-05, "loss": 0.0033, "step": 599 }, { "epoch": 0.23006134969325154, "grad_norm": 0.8544795513153076, "learning_rate": 4.294844482317853e-05, "loss": 0.0029, "step": 600 }, { "epoch": 0.23044478527607362, "grad_norm": 23.01372718811035, "learning_rate": 4.292714103110354e-05, "loss": 0.7617, "step": 601 }, { "epoch": 0.2308282208588957, "grad_norm": 1.7772164344787598, "learning_rate": 4.290583723902855e-05, "loss": 0.0027, "step": 602 }, { "epoch": 0.2312116564417178, 
"grad_norm": 72.03024291992188, "learning_rate": 4.288453344695356e-05, "loss": 0.8277, "step": 603 }, { "epoch": 0.23159509202453987, "grad_norm": 16.46124839782715, "learning_rate": 4.2863229654878576e-05, "loss": 0.5488, "step": 604 }, { "epoch": 0.23197852760736196, "grad_norm": 0.7167817950248718, "learning_rate": 4.2841925862803585e-05, "loss": 0.0034, "step": 605 }, { "epoch": 0.23236196319018404, "grad_norm": 4.347182750701904, "learning_rate": 4.2820622070728594e-05, "loss": 0.0047, "step": 606 }, { "epoch": 0.23274539877300612, "grad_norm": 2.2599356174468994, "learning_rate": 4.2799318278653604e-05, "loss": 0.005, "step": 607 }, { "epoch": 0.2331288343558282, "grad_norm": 7.927770614624023, "learning_rate": 4.277801448657861e-05, "loss": 0.0365, "step": 608 }, { "epoch": 0.23351226993865032, "grad_norm": 111.94647216796875, "learning_rate": 4.275671069450362e-05, "loss": 5.792, "step": 609 }, { "epoch": 0.2338957055214724, "grad_norm": 0.46775519847869873, "learning_rate": 4.273540690242863e-05, "loss": 0.0026, "step": 610 }, { "epoch": 0.23427914110429449, "grad_norm": 72.51065063476562, "learning_rate": 4.271410311035364e-05, "loss": 0.5313, "step": 611 }, { "epoch": 0.23466257668711657, "grad_norm": 127.6706771850586, "learning_rate": 4.269279931827865e-05, "loss": 5.7559, "step": 612 }, { "epoch": 0.23504601226993865, "grad_norm": 117.30060577392578, "learning_rate": 4.2671495526203666e-05, "loss": 1.3448, "step": 613 }, { "epoch": 0.23542944785276074, "grad_norm": 51.63465881347656, "learning_rate": 4.2650191734128676e-05, "loss": 0.6577, "step": 614 }, { "epoch": 0.23581288343558282, "grad_norm": 628.3287353515625, "learning_rate": 4.2628887942053685e-05, "loss": 2.7054, "step": 615 }, { "epoch": 0.2361963190184049, "grad_norm": 8.268697738647461, "learning_rate": 4.2607584149978694e-05, "loss": 0.0254, "step": 616 }, { "epoch": 0.236579754601227, "grad_norm": 36.803314208984375, "learning_rate": 4.258628035790371e-05, "loss": 0.6128, "step": 617 
}, { "epoch": 0.23696319018404907, "grad_norm": 0.6197580099105835, "learning_rate": 4.256497656582872e-05, "loss": 0.0018, "step": 618 }, { "epoch": 0.23734662576687116, "grad_norm": 0.23906821012496948, "learning_rate": 4.254367277375373e-05, "loss": 0.0013, "step": 619 }, { "epoch": 0.23773006134969324, "grad_norm": 136.54547119140625, "learning_rate": 4.252236898167874e-05, "loss": 3.1651, "step": 620 }, { "epoch": 0.23811349693251535, "grad_norm": 81.10930633544922, "learning_rate": 4.250106518960375e-05, "loss": 1.3369, "step": 621 }, { "epoch": 0.23849693251533743, "grad_norm": 0.25049564242362976, "learning_rate": 4.2479761397528764e-05, "loss": 0.0022, "step": 622 }, { "epoch": 0.23888036809815952, "grad_norm": 22.74750328063965, "learning_rate": 4.245845760545377e-05, "loss": 0.0658, "step": 623 }, { "epoch": 0.2392638036809816, "grad_norm": 2.002502918243408, "learning_rate": 4.243715381337878e-05, "loss": 0.0106, "step": 624 }, { "epoch": 0.23964723926380369, "grad_norm": 2.6754136085510254, "learning_rate": 4.241585002130379e-05, "loss": 0.0089, "step": 625 }, { "epoch": 0.24003067484662577, "grad_norm": 0.7828622460365295, "learning_rate": 4.239454622922881e-05, "loss": 0.0035, "step": 626 }, { "epoch": 0.24041411042944785, "grad_norm": 94.25086212158203, "learning_rate": 4.237324243715382e-05, "loss": 5.7812, "step": 627 }, { "epoch": 0.24079754601226994, "grad_norm": 7.338162899017334, "learning_rate": 4.2351938645078826e-05, "loss": 0.1679, "step": 628 }, { "epoch": 0.24118098159509202, "grad_norm": 5.649900913238525, "learning_rate": 4.2330634853003836e-05, "loss": 0.0223, "step": 629 }, { "epoch": 0.2415644171779141, "grad_norm": 128.02952575683594, "learning_rate": 4.2309331060928845e-05, "loss": 0.9888, "step": 630 }, { "epoch": 0.2419478527607362, "grad_norm": 181.65542602539062, "learning_rate": 4.228802726885386e-05, "loss": 5.6309, "step": 631 }, { "epoch": 0.24233128834355827, "grad_norm": 45.65901565551758, "learning_rate": 
4.226672347677887e-05, "loss": 0.77, "step": 632 }, { "epoch": 0.24271472392638035, "grad_norm": 193.27027893066406, "learning_rate": 4.224541968470388e-05, "loss": 4.4053, "step": 633 }, { "epoch": 0.24309815950920247, "grad_norm": 11.706387519836426, "learning_rate": 4.222411589262889e-05, "loss": 0.07, "step": 634 }, { "epoch": 0.24348159509202455, "grad_norm": 121.58812713623047, "learning_rate": 4.22028121005539e-05, "loss": 1.4532, "step": 635 }, { "epoch": 0.24386503067484663, "grad_norm": 159.98052978515625, "learning_rate": 4.2181508308478914e-05, "loss": 6.2891, "step": 636 }, { "epoch": 0.24424846625766872, "grad_norm": 122.95700073242188, "learning_rate": 4.2160204516403924e-05, "loss": 6.1455, "step": 637 }, { "epoch": 0.2446319018404908, "grad_norm": 3.863123655319214, "learning_rate": 4.213890072432893e-05, "loss": 0.0217, "step": 638 }, { "epoch": 0.24501533742331288, "grad_norm": 143.37567138671875, "learning_rate": 4.211759693225394e-05, "loss": 5.5938, "step": 639 }, { "epoch": 0.24539877300613497, "grad_norm": 4.452521800994873, "learning_rate": 4.209629314017896e-05, "loss": 0.02, "step": 640 }, { "epoch": 0.24578220858895705, "grad_norm": 110.67408752441406, "learning_rate": 4.207498934810397e-05, "loss": 1.2315, "step": 641 }, { "epoch": 0.24616564417177914, "grad_norm": 35.09891128540039, "learning_rate": 4.205368555602898e-05, "loss": 0.6338, "step": 642 }, { "epoch": 0.24654907975460122, "grad_norm": 68.7496109008789, "learning_rate": 4.2032381763953986e-05, "loss": 5.666, "step": 643 }, { "epoch": 0.2469325153374233, "grad_norm": 0.4419560730457306, "learning_rate": 4.2011077971878996e-05, "loss": 0.0021, "step": 644 }, { "epoch": 0.2473159509202454, "grad_norm": 1.9711453914642334, "learning_rate": 4.198977417980401e-05, "loss": 0.007, "step": 645 }, { "epoch": 0.2476993865030675, "grad_norm": 170.4474639892578, "learning_rate": 4.196847038772902e-05, "loss": 4.8525, "step": 646 }, { "epoch": 0.24808282208588958, "grad_norm": 
10.959206581115723, "learning_rate": 4.194716659565403e-05, "loss": 0.0409, "step": 647 }, { "epoch": 0.24846625766871167, "grad_norm": 164.01382446289062, "learning_rate": 4.192586280357904e-05, "loss": 5.5068, "step": 648 }, { "epoch": 0.24884969325153375, "grad_norm": 2.1526594161987305, "learning_rate": 4.190455901150405e-05, "loss": 0.0089, "step": 649 }, { "epoch": 0.24923312883435583, "grad_norm": 1.0481888055801392, "learning_rate": 4.1883255219429065e-05, "loss": 0.005, "step": 650 }, { "epoch": 0.24961656441717792, "grad_norm": 1.0030834674835205, "learning_rate": 4.1861951427354074e-05, "loss": 0.0046, "step": 651 }, { "epoch": 0.25, "grad_norm": 23.366331100463867, "learning_rate": 4.1840647635279084e-05, "loss": 0.5884, "step": 652 }, { "epoch": 0.2503834355828221, "grad_norm": 1.5120415687561035, "learning_rate": 4.181934384320409e-05, "loss": 0.0074, "step": 653 }, { "epoch": 0.25076687116564417, "grad_norm": 0.35710829496383667, "learning_rate": 4.17980400511291e-05, "loss": 0.0028, "step": 654 }, { "epoch": 0.2511503067484663, "grad_norm": 33.088714599609375, "learning_rate": 4.177673625905411e-05, "loss": 0.5952, "step": 655 }, { "epoch": 0.25153374233128833, "grad_norm": 144.0160675048828, "learning_rate": 4.175543246697912e-05, "loss": 5.8027, "step": 656 }, { "epoch": 0.25191717791411045, "grad_norm": 112.53301239013672, "learning_rate": 4.173412867490413e-05, "loss": 1.4445, "step": 657 }, { "epoch": 0.2523006134969325, "grad_norm": 180.619873046875, "learning_rate": 4.1712824882829146e-05, "loss": 5.5791, "step": 658 }, { "epoch": 0.2526840490797546, "grad_norm": 4.506056308746338, "learning_rate": 4.1691521090754156e-05, "loss": 0.0169, "step": 659 }, { "epoch": 0.25306748466257667, "grad_norm": 4.511041641235352, "learning_rate": 4.1670217298679165e-05, "loss": 0.0065, "step": 660 }, { "epoch": 0.2534509202453988, "grad_norm": 20.754192352294922, "learning_rate": 4.1648913506604174e-05, "loss": 0.6382, "step": 661 }, { "epoch": 
0.25383435582822084, "grad_norm": 2.1220593452453613, "learning_rate": 4.1627609714529183e-05, "loss": 0.007, "step": 662 }, { "epoch": 0.25421779141104295, "grad_norm": 120.6075210571289, "learning_rate": 4.16063059224542e-05, "loss": 2.1088, "step": 663 }, { "epoch": 0.254601226993865, "grad_norm": 1.466044306755066, "learning_rate": 4.158500213037921e-05, "loss": 0.0044, "step": 664 }, { "epoch": 0.2549846625766871, "grad_norm": 41.46731185913086, "learning_rate": 4.156369833830422e-05, "loss": 0.5107, "step": 665 }, { "epoch": 0.2553680981595092, "grad_norm": 1.1529388427734375, "learning_rate": 4.154239454622923e-05, "loss": 0.002, "step": 666 }, { "epoch": 0.2557515337423313, "grad_norm": 8.65416431427002, "learning_rate": 4.1521090754154244e-05, "loss": 0.0345, "step": 667 }, { "epoch": 0.2561349693251534, "grad_norm": 22.93845558166504, "learning_rate": 4.149978696207925e-05, "loss": 0.5298, "step": 668 }, { "epoch": 0.25651840490797545, "grad_norm": 3.1691551208496094, "learning_rate": 4.147848317000426e-05, "loss": 0.0062, "step": 669 }, { "epoch": 0.25690184049079756, "grad_norm": 15.257349967956543, "learning_rate": 4.145717937792927e-05, "loss": 0.4937, "step": 670 }, { "epoch": 0.2572852760736196, "grad_norm": 3.3930716514587402, "learning_rate": 4.143587558585428e-05, "loss": 0.0194, "step": 671 }, { "epoch": 0.25766871165644173, "grad_norm": 1.1906558275222778, "learning_rate": 4.14145717937793e-05, "loss": 0.0059, "step": 672 }, { "epoch": 0.2580521472392638, "grad_norm": 14.620223045349121, "learning_rate": 4.1393268001704306e-05, "loss": 0.5498, "step": 673 }, { "epoch": 0.2584355828220859, "grad_norm": 0.23422184586524963, "learning_rate": 4.1371964209629316e-05, "loss": 0.0018, "step": 674 }, { "epoch": 0.25881901840490795, "grad_norm": 120.83528900146484, "learning_rate": 4.1350660417554325e-05, "loss": 1.7823, "step": 675 }, { "epoch": 0.25920245398773006, "grad_norm": 1.0207319259643555, "learning_rate": 4.1329356625479334e-05, "loss": 
0.0049, "step": 676 }, { "epoch": 0.2595858895705521, "grad_norm": 1.2075201272964478, "learning_rate": 4.130805283340435e-05, "loss": 0.0067, "step": 677 }, { "epoch": 0.25996932515337423, "grad_norm": 0.39283424615859985, "learning_rate": 4.128674904132936e-05, "loss": 0.0022, "step": 678 }, { "epoch": 0.26035276073619634, "grad_norm": 0.2834872603416443, "learning_rate": 4.126544524925437e-05, "loss": 0.0021, "step": 679 }, { "epoch": 0.2607361963190184, "grad_norm": 202.35397338867188, "learning_rate": 4.124414145717938e-05, "loss": 1.4132, "step": 680 }, { "epoch": 0.2611196319018405, "grad_norm": 0.7038347721099854, "learning_rate": 4.1222837665104394e-05, "loss": 0.0037, "step": 681 }, { "epoch": 0.26150306748466257, "grad_norm": 189.6617889404297, "learning_rate": 4.1201533873029404e-05, "loss": 1.214, "step": 682 }, { "epoch": 0.2618865030674847, "grad_norm": 31.273454666137695, "learning_rate": 4.118023008095441e-05, "loss": 0.5957, "step": 683 }, { "epoch": 0.26226993865030673, "grad_norm": 0.646104097366333, "learning_rate": 4.115892628887942e-05, "loss": 0.0044, "step": 684 }, { "epoch": 0.26265337423312884, "grad_norm": 157.92442321777344, "learning_rate": 4.113762249680443e-05, "loss": 2.3674, "step": 685 }, { "epoch": 0.2630368098159509, "grad_norm": 38.39439392089844, "learning_rate": 4.111631870472945e-05, "loss": 0.6558, "step": 686 }, { "epoch": 0.263420245398773, "grad_norm": 2.025766611099243, "learning_rate": 4.109501491265446e-05, "loss": 0.0108, "step": 687 }, { "epoch": 0.26380368098159507, "grad_norm": 239.11248779296875, "learning_rate": 4.1073711120579466e-05, "loss": 1.8917, "step": 688 }, { "epoch": 0.2641871165644172, "grad_norm": 1.424906611442566, "learning_rate": 4.1052407328504476e-05, "loss": 0.0087, "step": 689 }, { "epoch": 0.2645705521472393, "grad_norm": 215.03074645996094, "learning_rate": 4.1031103536429485e-05, "loss": 5.6675, "step": 690 }, { "epoch": 0.26495398773006135, "grad_norm": 11.536243438720703, "learning_rate": 
4.10097997443545e-05, "loss": 0.1388, "step": 691 }, { "epoch": 0.26533742331288346, "grad_norm": 102.15013122558594, "learning_rate": 4.098849595227951e-05, "loss": 5.1914, "step": 692 }, { "epoch": 0.2657208588957055, "grad_norm": 0.3701198101043701, "learning_rate": 4.096719216020452e-05, "loss": 0.0029, "step": 693 }, { "epoch": 0.2661042944785276, "grad_norm": 65.08767700195312, "learning_rate": 4.094588836812953e-05, "loss": 0.7599, "step": 694 }, { "epoch": 0.2664877300613497, "grad_norm": 9.87397289276123, "learning_rate": 4.0924584576054545e-05, "loss": 0.0347, "step": 695 }, { "epoch": 0.2668711656441718, "grad_norm": 8.291452407836914, "learning_rate": 4.0903280783979554e-05, "loss": 0.0771, "step": 696 }, { "epoch": 0.26725460122699385, "grad_norm": 209.46115112304688, "learning_rate": 4.0881976991904564e-05, "loss": 3.0864, "step": 697 }, { "epoch": 0.26763803680981596, "grad_norm": 216.21163940429688, "learning_rate": 4.0860673199829566e-05, "loss": 2.5413, "step": 698 }, { "epoch": 0.268021472392638, "grad_norm": 2.646641969680786, "learning_rate": 4.083936940775458e-05, "loss": 0.0041, "step": 699 }, { "epoch": 0.2684049079754601, "grad_norm": 156.0490264892578, "learning_rate": 4.081806561567959e-05, "loss": 1.5695, "step": 700 }, { "epoch": 0.2687883435582822, "grad_norm": 0.74263596534729, "learning_rate": 4.07967618236046e-05, "loss": 0.0024, "step": 701 }, { "epoch": 0.2691717791411043, "grad_norm": 120.81138610839844, "learning_rate": 4.077545803152961e-05, "loss": 1.086, "step": 702 }, { "epoch": 0.2695552147239264, "grad_norm": 14.250238418579102, "learning_rate": 4.075415423945462e-05, "loss": 0.055, "step": 703 }, { "epoch": 0.26993865030674846, "grad_norm": 146.56906127929688, "learning_rate": 4.0732850447379636e-05, "loss": 1.5176, "step": 704 }, { "epoch": 0.2703220858895706, "grad_norm": 3.4120378494262695, "learning_rate": 4.0711546655304645e-05, "loss": 0.0138, "step": 705 }, { "epoch": 0.27070552147239263, "grad_norm": 
1.6495180130004883, "learning_rate": 4.0690242863229654e-05, "loss": 0.0087, "step": 706 }, { "epoch": 0.27108895705521474, "grad_norm": 1.0825520753860474, "learning_rate": 4.0668939071154663e-05, "loss": 0.0056, "step": 707 }, { "epoch": 0.2714723926380368, "grad_norm": 129.17660522460938, "learning_rate": 4.064763527907968e-05, "loss": 1.7218, "step": 708 }, { "epoch": 0.2718558282208589, "grad_norm": 1.3704122304916382, "learning_rate": 4.062633148700469e-05, "loss": 0.0055, "step": 709 }, { "epoch": 0.27223926380368096, "grad_norm": 19.120973587036133, "learning_rate": 4.06050276949297e-05, "loss": 0.0598, "step": 710 }, { "epoch": 0.2726226993865031, "grad_norm": 9.105579376220703, "learning_rate": 4.058372390285471e-05, "loss": 0.0208, "step": 711 }, { "epoch": 0.27300613496932513, "grad_norm": 9.893427848815918, "learning_rate": 4.056242011077972e-05, "loss": 0.034, "step": 712 }, { "epoch": 0.27338957055214724, "grad_norm": 10.373291969299316, "learning_rate": 4.054111631870473e-05, "loss": 0.0451, "step": 713 }, { "epoch": 0.2737730061349693, "grad_norm": 48.78724670410156, "learning_rate": 4.051981252662974e-05, "loss": 0.6045, "step": 714 }, { "epoch": 0.2741564417177914, "grad_norm": 45.57914733886719, "learning_rate": 4.049850873455475e-05, "loss": 0.7661, "step": 715 }, { "epoch": 0.2745398773006135, "grad_norm": 71.10076904296875, "learning_rate": 4.047720494247976e-05, "loss": 1.6184, "step": 716 }, { "epoch": 0.2749233128834356, "grad_norm": 8.53931999206543, "learning_rate": 4.045590115040478e-05, "loss": 0.0327, "step": 717 }, { "epoch": 0.2753067484662577, "grad_norm": 57.249568939208984, "learning_rate": 4.0434597358329786e-05, "loss": 0.8604, "step": 718 }, { "epoch": 0.27569018404907975, "grad_norm": 108.18406677246094, "learning_rate": 4.0413293566254796e-05, "loss": 1.3262, "step": 719 }, { "epoch": 0.27607361963190186, "grad_norm": 1.0649335384368896, "learning_rate": 4.0391989774179805e-05, "loss": 0.0041, "step": 720 }, { "epoch": 
0.2764570552147239, "grad_norm": 17.896425247192383, "learning_rate": 4.0370685982104814e-05, "loss": 0.1149, "step": 721 }, { "epoch": 0.276840490797546, "grad_norm": 75.33570098876953, "learning_rate": 4.034938219002983e-05, "loss": 1.0372, "step": 722 }, { "epoch": 0.2772239263803681, "grad_norm": 293.58526611328125, "learning_rate": 4.032807839795484e-05, "loss": 5.5889, "step": 723 }, { "epoch": 0.2776073619631902, "grad_norm": 45.82387924194336, "learning_rate": 4.030677460587985e-05, "loss": 0.751, "step": 724 }, { "epoch": 0.27799079754601225, "grad_norm": 4.106308937072754, "learning_rate": 4.028547081380486e-05, "loss": 0.0154, "step": 725 }, { "epoch": 0.27837423312883436, "grad_norm": 27.708314895629883, "learning_rate": 4.026416702172987e-05, "loss": 0.6475, "step": 726 }, { "epoch": 0.2787576687116564, "grad_norm": 2.0057215690612793, "learning_rate": 4.0242863229654884e-05, "loss": 0.005, "step": 727 }, { "epoch": 0.2791411042944785, "grad_norm": 12.491474151611328, "learning_rate": 4.022155943757989e-05, "loss": 0.0745, "step": 728 }, { "epoch": 0.27952453987730064, "grad_norm": 0.5827816128730774, "learning_rate": 4.02002556455049e-05, "loss": 0.0029, "step": 729 }, { "epoch": 0.2799079754601227, "grad_norm": 123.37541198730469, "learning_rate": 4.017895185342991e-05, "loss": 1.7745, "step": 730 }, { "epoch": 0.2802914110429448, "grad_norm": 44.76090621948242, "learning_rate": 4.015764806135493e-05, "loss": 0.6943, "step": 731 }, { "epoch": 0.28067484662576686, "grad_norm": 4.828588008880615, "learning_rate": 4.013634426927994e-05, "loss": 0.0133, "step": 732 }, { "epoch": 0.28105828220858897, "grad_norm": 5.215601921081543, "learning_rate": 4.0115040477204946e-05, "loss": 0.0117, "step": 733 }, { "epoch": 0.28144171779141103, "grad_norm": 0.5461913347244263, "learning_rate": 4.0093736685129956e-05, "loss": 0.0029, "step": 734 }, { "epoch": 0.28182515337423314, "grad_norm": 125.95172882080078, "learning_rate": 4.0072432893054965e-05, "loss": 
1.6723, "step": 735 }, { "epoch": 0.2822085889570552, "grad_norm": 2.944730043411255, "learning_rate": 4.005112910097998e-05, "loss": 0.0209, "step": 736 }, { "epoch": 0.2825920245398773, "grad_norm": 17.602924346923828, "learning_rate": 4.002982530890499e-05, "loss": 0.1221, "step": 737 }, { "epoch": 0.28297546012269936, "grad_norm": 3.474398612976074, "learning_rate": 4.000852151683e-05, "loss": 0.0083, "step": 738 }, { "epoch": 0.2833588957055215, "grad_norm": 191.66481018066406, "learning_rate": 3.998721772475501e-05, "loss": 1.3117, "step": 739 }, { "epoch": 0.2837423312883436, "grad_norm": 5.7077412605285645, "learning_rate": 3.996591393268002e-05, "loss": 0.0304, "step": 740 }, { "epoch": 0.28412576687116564, "grad_norm": 1.7796605825424194, "learning_rate": 3.9944610140605034e-05, "loss": 0.0048, "step": 741 }, { "epoch": 0.28450920245398775, "grad_norm": 20.046506881713867, "learning_rate": 3.9923306348530044e-05, "loss": 0.6294, "step": 742 }, { "epoch": 0.2848926380368098, "grad_norm": 8.310009956359863, "learning_rate": 3.9902002556455046e-05, "loss": 0.4993, "step": 743 }, { "epoch": 0.2852760736196319, "grad_norm": 115.0446548461914, "learning_rate": 3.988069876438006e-05, "loss": 5.3862, "step": 744 }, { "epoch": 0.285659509202454, "grad_norm": 20.69170570373535, "learning_rate": 3.985939497230507e-05, "loss": 0.5698, "step": 745 }, { "epoch": 0.2860429447852761, "grad_norm": 1.4113951921463013, "learning_rate": 3.983809118023008e-05, "loss": 0.0058, "step": 746 }, { "epoch": 0.28642638036809814, "grad_norm": 2.7105507850646973, "learning_rate": 3.981678738815509e-05, "loss": 0.0033, "step": 747 }, { "epoch": 0.28680981595092025, "grad_norm": 21.30638885498047, "learning_rate": 3.97954835960801e-05, "loss": 0.5322, "step": 748 }, { "epoch": 0.2871932515337423, "grad_norm": 16.875904083251953, "learning_rate": 3.9774179804005116e-05, "loss": 0.0605, "step": 749 }, { "epoch": 0.2875766871165644, "grad_norm": 32.32379913330078, "learning_rate": 
3.9752876011930125e-05, "loss": 0.7578, "step": 750 }, { "epoch": 0.2879601226993865, "grad_norm": 124.02713012695312, "learning_rate": 3.9731572219855134e-05, "loss": 0.981, "step": 751 }, { "epoch": 0.2883435582822086, "grad_norm": 100.00495910644531, "learning_rate": 3.9710268427780143e-05, "loss": 1.5207, "step": 752 }, { "epoch": 0.2887269938650307, "grad_norm": 142.09523010253906, "learning_rate": 3.968896463570515e-05, "loss": 6.4824, "step": 753 }, { "epoch": 0.28911042944785276, "grad_norm": 0.56594318151474, "learning_rate": 3.966766084363017e-05, "loss": 0.0031, "step": 754 }, { "epoch": 0.28949386503067487, "grad_norm": 7.328930854797363, "learning_rate": 3.964635705155518e-05, "loss": 0.0452, "step": 755 }, { "epoch": 0.2898773006134969, "grad_norm": 0.901790201663971, "learning_rate": 3.962505325948019e-05, "loss": 0.0028, "step": 756 }, { "epoch": 0.29026073619631904, "grad_norm": 3.4390993118286133, "learning_rate": 3.96037494674052e-05, "loss": 0.0098, "step": 757 }, { "epoch": 0.2906441717791411, "grad_norm": 22.084123611450195, "learning_rate": 3.958244567533021e-05, "loss": 0.5352, "step": 758 }, { "epoch": 0.2910276073619632, "grad_norm": 1.4480448961257935, "learning_rate": 3.956114188325522e-05, "loss": 0.0038, "step": 759 }, { "epoch": 0.29141104294478526, "grad_norm": 0.6343742609024048, "learning_rate": 3.953983809118023e-05, "loss": 0.0027, "step": 760 }, { "epoch": 0.29179447852760737, "grad_norm": 6.620326995849609, "learning_rate": 3.951853429910524e-05, "loss": 0.2262, "step": 761 }, { "epoch": 0.2921779141104294, "grad_norm": 0.6557702422142029, "learning_rate": 3.949723050703025e-05, "loss": 0.003, "step": 762 }, { "epoch": 0.29256134969325154, "grad_norm": 87.16983795166016, "learning_rate": 3.9475926714955266e-05, "loss": 1.0498, "step": 763 }, { "epoch": 0.2929447852760736, "grad_norm": 496.9752197265625, "learning_rate": 3.9454622922880276e-05, "loss": 3.306, "step": 764 }, { "epoch": 0.2933282208588957, "grad_norm": 
1.9638266563415527, "learning_rate": 3.9433319130805285e-05, "loss": 0.0062, "step": 765 }, { "epoch": 0.2937116564417178, "grad_norm": 5.150081157684326, "learning_rate": 3.9412015338730294e-05, "loss": 0.0124, "step": 766 }, { "epoch": 0.2940950920245399, "grad_norm": 89.9288558959961, "learning_rate": 3.9390711546655303e-05, "loss": 0.8052, "step": 767 }, { "epoch": 0.294478527607362, "grad_norm": 0.3339363634586334, "learning_rate": 3.936940775458032e-05, "loss": 0.0019, "step": 768 }, { "epoch": 0.29486196319018404, "grad_norm": 11.336874008178711, "learning_rate": 3.934810396250533e-05, "loss": 0.0814, "step": 769 }, { "epoch": 0.29524539877300615, "grad_norm": 7.84946870803833, "learning_rate": 3.932680017043034e-05, "loss": 0.0116, "step": 770 }, { "epoch": 0.2956288343558282, "grad_norm": 0.8124586343765259, "learning_rate": 3.930549637835535e-05, "loss": 0.0035, "step": 771 }, { "epoch": 0.2960122699386503, "grad_norm": 1.8907150030136108, "learning_rate": 3.9284192586280364e-05, "loss": 0.0031, "step": 772 }, { "epoch": 0.2963957055214724, "grad_norm": 11.590265274047852, "learning_rate": 3.926288879420537e-05, "loss": 0.0675, "step": 773 }, { "epoch": 0.2967791411042945, "grad_norm": 182.07699584960938, "learning_rate": 3.924158500213038e-05, "loss": 5.2842, "step": 774 }, { "epoch": 0.29716257668711654, "grad_norm": 367.9045104980469, "learning_rate": 3.922028121005539e-05, "loss": 5.6216, "step": 775 }, { "epoch": 0.29754601226993865, "grad_norm": 1.2980585098266602, "learning_rate": 3.91989774179804e-05, "loss": 0.0065, "step": 776 }, { "epoch": 0.2979294478527607, "grad_norm": 1.5405141115188599, "learning_rate": 3.917767362590542e-05, "loss": 0.0059, "step": 777 }, { "epoch": 0.2983128834355828, "grad_norm": 0.18221314251422882, "learning_rate": 3.9156369833830426e-05, "loss": 0.0029, "step": 778 }, { "epoch": 0.29869631901840493, "grad_norm": 42.80677032470703, "learning_rate": 3.9135066041755436e-05, "loss": 0.9043, "step": 779 }, { "epoch": 
0.299079754601227, "grad_norm": 9.322417259216309, "learning_rate": 3.9113762249680445e-05, "loss": 0.0489, "step": 780 }, { "epoch": 0.2994631901840491, "grad_norm": 172.9141387939453, "learning_rate": 3.909245845760546e-05, "loss": 1.3604, "step": 781 }, { "epoch": 0.29984662576687116, "grad_norm": 2.292292833328247, "learning_rate": 3.907115466553047e-05, "loss": 0.0119, "step": 782 }, { "epoch": 0.30023006134969327, "grad_norm": 0.20979280769824982, "learning_rate": 3.904985087345548e-05, "loss": 0.0016, "step": 783 }, { "epoch": 0.3006134969325153, "grad_norm": 81.50609588623047, "learning_rate": 3.902854708138049e-05, "loss": 2.1564, "step": 784 }, { "epoch": 0.30099693251533743, "grad_norm": 0.5129420757293701, "learning_rate": 3.90072432893055e-05, "loss": 0.0024, "step": 785 }, { "epoch": 0.3013803680981595, "grad_norm": 2.1608102321624756, "learning_rate": 3.8985939497230514e-05, "loss": 0.0096, "step": 786 }, { "epoch": 0.3017638036809816, "grad_norm": 104.12256622314453, "learning_rate": 3.8964635705155524e-05, "loss": 5.7793, "step": 787 }, { "epoch": 0.30214723926380366, "grad_norm": 5.102387428283691, "learning_rate": 3.894333191308053e-05, "loss": 0.0276, "step": 788 }, { "epoch": 0.30253067484662577, "grad_norm": 143.1125030517578, "learning_rate": 3.8922028121005535e-05, "loss": 1.8458, "step": 789 }, { "epoch": 0.3029141104294479, "grad_norm": 5.7853684425354, "learning_rate": 3.890072432893055e-05, "loss": 0.0213, "step": 790 }, { "epoch": 0.30329754601226994, "grad_norm": 7.543581962585449, "learning_rate": 3.887942053685556e-05, "loss": 0.1757, "step": 791 }, { "epoch": 0.30368098159509205, "grad_norm": 2.6905198097229004, "learning_rate": 3.885811674478057e-05, "loss": 0.0089, "step": 792 }, { "epoch": 0.3040644171779141, "grad_norm": 69.44195556640625, "learning_rate": 3.883681295270558e-05, "loss": 0.7481, "step": 793 }, { "epoch": 0.3044478527607362, "grad_norm": 11.465123176574707, "learning_rate": 3.881550916063059e-05, "loss": 0.0668, 
"step": 794 }, { "epoch": 0.30483128834355827, "grad_norm": 210.43499755859375, "learning_rate": 3.8794205368555605e-05, "loss": 1.9943, "step": 795 }, { "epoch": 0.3052147239263804, "grad_norm": 4.897526741027832, "learning_rate": 3.8772901576480614e-05, "loss": 0.0399, "step": 796 }, { "epoch": 0.30559815950920244, "grad_norm": 0.5718501210212708, "learning_rate": 3.8751597784405623e-05, "loss": 0.0039, "step": 797 }, { "epoch": 0.30598159509202455, "grad_norm": 2.810070037841797, "learning_rate": 3.873029399233063e-05, "loss": 0.0138, "step": 798 }, { "epoch": 0.3063650306748466, "grad_norm": 142.47463989257812, "learning_rate": 3.870899020025565e-05, "loss": 2.2659, "step": 799 }, { "epoch": 0.3067484662576687, "grad_norm": 6.814948558807373, "learning_rate": 3.868768640818066e-05, "loss": 0.0326, "step": 800 }, { "epoch": 0.3071319018404908, "grad_norm": 2.7013139724731445, "learning_rate": 3.866638261610567e-05, "loss": 0.0122, "step": 801 }, { "epoch": 0.3075153374233129, "grad_norm": 0.8929374814033508, "learning_rate": 3.864507882403068e-05, "loss": 0.0043, "step": 802 }, { "epoch": 0.307898773006135, "grad_norm": 6.759843349456787, "learning_rate": 3.8623775031955686e-05, "loss": 0.0239, "step": 803 }, { "epoch": 0.30828220858895705, "grad_norm": 142.0617218017578, "learning_rate": 3.86024712398807e-05, "loss": 2.0098, "step": 804 }, { "epoch": 0.30866564417177916, "grad_norm": 99.49002838134766, "learning_rate": 3.858116744780571e-05, "loss": 0.8111, "step": 805 }, { "epoch": 0.3090490797546012, "grad_norm": 213.89450073242188, "learning_rate": 3.855986365573072e-05, "loss": 6.645, "step": 806 }, { "epoch": 0.30943251533742333, "grad_norm": 0.9271179437637329, "learning_rate": 3.853855986365573e-05, "loss": 0.0058, "step": 807 }, { "epoch": 0.3098159509202454, "grad_norm": 0.571173906326294, "learning_rate": 3.8517256071580746e-05, "loss": 0.0037, "step": 808 }, { "epoch": 0.3101993865030675, "grad_norm": 86.75624084472656, "learning_rate": 
3.8495952279505756e-05, "loss": 1.6153, "step": 809 }, { "epoch": 0.31058282208588955, "grad_norm": 17.55738067626953, "learning_rate": 3.8474648487430765e-05, "loss": 0.5586, "step": 810 }, { "epoch": 0.31096625766871167, "grad_norm": 0.6597051620483398, "learning_rate": 3.8453344695355774e-05, "loss": 0.0042, "step": 811 }, { "epoch": 0.3113496932515337, "grad_norm": 112.5511703491211, "learning_rate": 3.8432040903280783e-05, "loss": 1.919, "step": 812 }, { "epoch": 0.31173312883435583, "grad_norm": 1.3902746438980103, "learning_rate": 3.84107371112058e-05, "loss": 0.0039, "step": 813 }, { "epoch": 0.3121165644171779, "grad_norm": 0.32743898034095764, "learning_rate": 3.838943331913081e-05, "loss": 0.0024, "step": 814 }, { "epoch": 0.3125, "grad_norm": 26.69757652282715, "learning_rate": 3.836812952705582e-05, "loss": 0.7124, "step": 815 }, { "epoch": 0.3128834355828221, "grad_norm": 147.48939514160156, "learning_rate": 3.834682573498083e-05, "loss": 3.6739, "step": 816 }, { "epoch": 0.31326687116564417, "grad_norm": 0.9667326807975769, "learning_rate": 3.832552194290584e-05, "loss": 0.0038, "step": 817 }, { "epoch": 0.3136503067484663, "grad_norm": 3.3136909008026123, "learning_rate": 3.830421815083085e-05, "loss": 0.0168, "step": 818 }, { "epoch": 0.31403374233128833, "grad_norm": 3.586291790008545, "learning_rate": 3.828291435875586e-05, "loss": 0.0177, "step": 819 }, { "epoch": 0.31441717791411045, "grad_norm": 0.5330338478088379, "learning_rate": 3.826161056668087e-05, "loss": 0.0022, "step": 820 }, { "epoch": 0.3148006134969325, "grad_norm": 3.030703544616699, "learning_rate": 3.824030677460588e-05, "loss": 0.0088, "step": 821 }, { "epoch": 0.3151840490797546, "grad_norm": 137.04339599609375, "learning_rate": 3.82190029825309e-05, "loss": 2.5419, "step": 822 }, { "epoch": 0.31556748466257667, "grad_norm": 6.643217086791992, "learning_rate": 3.8197699190455906e-05, "loss": 0.0334, "step": 823 }, { "epoch": 0.3159509202453988, "grad_norm": 75.90634155273438, 
"learning_rate": 3.8176395398380916e-05, "loss": 1.7666, "step": 824 }, { "epoch": 0.31633435582822084, "grad_norm": 14.753342628479004, "learning_rate": 3.8155091606305925e-05, "loss": 0.4866, "step": 825 }, { "epoch": 0.31671779141104295, "grad_norm": 218.08372497558594, "learning_rate": 3.8133787814230934e-05, "loss": 3.4464, "step": 826 }, { "epoch": 0.317101226993865, "grad_norm": 1.1979540586471558, "learning_rate": 3.811248402215595e-05, "loss": 0.0052, "step": 827 }, { "epoch": 0.3174846625766871, "grad_norm": 8.071738243103027, "learning_rate": 3.809118023008096e-05, "loss": 0.0327, "step": 828 }, { "epoch": 0.3178680981595092, "grad_norm": 0.4262201189994812, "learning_rate": 3.806987643800597e-05, "loss": 0.0028, "step": 829 }, { "epoch": 0.3182515337423313, "grad_norm": 239.7389678955078, "learning_rate": 3.804857264593098e-05, "loss": 2.2638, "step": 830 }, { "epoch": 0.3186349693251534, "grad_norm": 54.11473846435547, "learning_rate": 3.802726885385599e-05, "loss": 0.8097, "step": 831 }, { "epoch": 0.31901840490797545, "grad_norm": 2.1526424884796143, "learning_rate": 3.8005965061781004e-05, "loss": 0.005, "step": 832 }, { "epoch": 0.31940184049079756, "grad_norm": 0.307246595621109, "learning_rate": 3.798466126970601e-05, "loss": 0.0019, "step": 833 }, { "epoch": 0.3197852760736196, "grad_norm": 3.812610387802124, "learning_rate": 3.7963357477631015e-05, "loss": 0.0167, "step": 834 }, { "epoch": 0.32016871165644173, "grad_norm": 7.786213397979736, "learning_rate": 3.794205368555603e-05, "loss": 0.0101, "step": 835 }, { "epoch": 0.3205521472392638, "grad_norm": 1.0512534379959106, "learning_rate": 3.792074989348104e-05, "loss": 0.0072, "step": 836 }, { "epoch": 0.3209355828220859, "grad_norm": 190.12026977539062, "learning_rate": 3.789944610140605e-05, "loss": 1.3262, "step": 837 }, { "epoch": 0.32131901840490795, "grad_norm": 0.8700166344642639, "learning_rate": 3.787814230933106e-05, "loss": 0.0046, "step": 838 }, { "epoch": 0.32170245398773006, 
"grad_norm": 3.2446205615997314, "learning_rate": 3.785683851725607e-05, "loss": 0.0075, "step": 839 }, { "epoch": 0.3220858895705521, "grad_norm": 130.6759033203125, "learning_rate": 3.7835534725181085e-05, "loss": 1.763, "step": 840 }, { "epoch": 0.32246932515337423, "grad_norm": 212.87217712402344, "learning_rate": 3.7814230933106094e-05, "loss": 2.9789, "step": 841 }, { "epoch": 0.32285276073619634, "grad_norm": 37.23453140258789, "learning_rate": 3.7792927141031103e-05, "loss": 0.5869, "step": 842 }, { "epoch": 0.3232361963190184, "grad_norm": 5.587158203125, "learning_rate": 3.777162334895611e-05, "loss": 0.4417, "step": 843 }, { "epoch": 0.3236196319018405, "grad_norm": 115.92218017578125, "learning_rate": 3.775031955688112e-05, "loss": 6.5654, "step": 844 }, { "epoch": 0.32400306748466257, "grad_norm": 41.286556243896484, "learning_rate": 3.772901576480614e-05, "loss": 0.7354, "step": 845 }, { "epoch": 0.3243865030674847, "grad_norm": 0.4213690459728241, "learning_rate": 3.770771197273115e-05, "loss": 0.0031, "step": 846 }, { "epoch": 0.32476993865030673, "grad_norm": 6.38857364654541, "learning_rate": 3.768640818065616e-05, "loss": 0.0108, "step": 847 }, { "epoch": 0.32515337423312884, "grad_norm": 3.0541932582855225, "learning_rate": 3.7665104388581166e-05, "loss": 0.009, "step": 848 }, { "epoch": 0.3255368098159509, "grad_norm": 111.27130889892578, "learning_rate": 3.764380059650618e-05, "loss": 0.629, "step": 849 }, { "epoch": 0.325920245398773, "grad_norm": 11.071561813354492, "learning_rate": 3.762249680443119e-05, "loss": 0.157, "step": 850 }, { "epoch": 0.32630368098159507, "grad_norm": 0.9017576575279236, "learning_rate": 3.76011930123562e-05, "loss": 0.0061, "step": 851 }, { "epoch": 0.3266871165644172, "grad_norm": 0.2364967167377472, "learning_rate": 3.757988922028121e-05, "loss": 0.0018, "step": 852 }, { "epoch": 0.3270705521472393, "grad_norm": 29.81633758544922, "learning_rate": 3.755858542820622e-05, "loss": 0.8071, "step": 853 }, { "epoch": 
0.32745398773006135, "grad_norm": 99.31094360351562, "learning_rate": 3.7537281636131236e-05, "loss": 1.1748, "step": 854 }, { "epoch": 0.32783742331288346, "grad_norm": 117.63552856445312, "learning_rate": 3.7515977844056245e-05, "loss": 2.9048, "step": 855 }, { "epoch": 0.3282208588957055, "grad_norm": 162.0067138671875, "learning_rate": 3.7494674051981254e-05, "loss": 1.9249, "step": 856 }, { "epoch": 0.3286042944785276, "grad_norm": 0.38351160287857056, "learning_rate": 3.7473370259906263e-05, "loss": 0.0024, "step": 857 }, { "epoch": 0.3289877300613497, "grad_norm": 2.895625114440918, "learning_rate": 3.745206646783127e-05, "loss": 0.012, "step": 858 }, { "epoch": 0.3293711656441718, "grad_norm": 1.1712878942489624, "learning_rate": 3.743076267575629e-05, "loss": 0.0037, "step": 859 }, { "epoch": 0.32975460122699385, "grad_norm": 7.501552581787109, "learning_rate": 3.74094588836813e-05, "loss": 0.046, "step": 860 }, { "epoch": 0.33013803680981596, "grad_norm": 0.43802642822265625, "learning_rate": 3.738815509160631e-05, "loss": 0.0023, "step": 861 }, { "epoch": 0.330521472392638, "grad_norm": 127.40283966064453, "learning_rate": 3.736685129953132e-05, "loss": 1.3009, "step": 862 }, { "epoch": 0.3309049079754601, "grad_norm": 17.892202377319336, "learning_rate": 3.734554750745633e-05, "loss": 0.4453, "step": 863 }, { "epoch": 0.3312883435582822, "grad_norm": 0.5644845962524414, "learning_rate": 3.732424371538134e-05, "loss": 0.0034, "step": 864 }, { "epoch": 0.3316717791411043, "grad_norm": 2.1184301376342773, "learning_rate": 3.730293992330635e-05, "loss": 0.0061, "step": 865 }, { "epoch": 0.3320552147239264, "grad_norm": 2.9578826427459717, "learning_rate": 3.728163613123136e-05, "loss": 0.0029, "step": 866 }, { "epoch": 0.33243865030674846, "grad_norm": 15.780271530151367, "learning_rate": 3.726033233915637e-05, "loss": 0.6094, "step": 867 }, { "epoch": 0.3328220858895706, "grad_norm": 108.54817962646484, "learning_rate": 3.7239028547081386e-05, "loss": 
1.042, "step": 868 }, { "epoch": 0.33320552147239263, "grad_norm": 4.40326452255249, "learning_rate": 3.7217724755006395e-05, "loss": 0.0163, "step": 869 }, { "epoch": 0.33358895705521474, "grad_norm": 12.562338829040527, "learning_rate": 3.7196420962931405e-05, "loss": 0.1777, "step": 870 }, { "epoch": 0.3339723926380368, "grad_norm": 30.592666625976562, "learning_rate": 3.7175117170856414e-05, "loss": 0.5625, "step": 871 }, { "epoch": 0.3343558282208589, "grad_norm": 141.96115112304688, "learning_rate": 3.715381337878143e-05, "loss": 2.6057, "step": 872 }, { "epoch": 0.33473926380368096, "grad_norm": 95.01751708984375, "learning_rate": 3.713250958670644e-05, "loss": 1.1113, "step": 873 }, { "epoch": 0.3351226993865031, "grad_norm": 66.68183135986328, "learning_rate": 3.711120579463145e-05, "loss": 1.7306, "step": 874 }, { "epoch": 0.33550613496932513, "grad_norm": 248.12063598632812, "learning_rate": 3.708990200255646e-05, "loss": 1.7404, "step": 875 }, { "epoch": 0.33588957055214724, "grad_norm": 5.3473029136657715, "learning_rate": 3.706859821048147e-05, "loss": 0.0198, "step": 876 }, { "epoch": 0.3362730061349693, "grad_norm": 2.2328739166259766, "learning_rate": 3.7047294418406484e-05, "loss": 0.0116, "step": 877 }, { "epoch": 0.3366564417177914, "grad_norm": 22.21512222290039, "learning_rate": 3.702599062633149e-05, "loss": 0.4836, "step": 878 }, { "epoch": 0.3370398773006135, "grad_norm": 8.086519241333008, "learning_rate": 3.7004686834256495e-05, "loss": 0.0579, "step": 879 }, { "epoch": 0.3374233128834356, "grad_norm": 97.34036254882812, "learning_rate": 3.6983383042181505e-05, "loss": 1.1231, "step": 880 }, { "epoch": 0.3378067484662577, "grad_norm": 13.689335823059082, "learning_rate": 3.696207925010652e-05, "loss": 0.1815, "step": 881 }, { "epoch": 0.33819018404907975, "grad_norm": 0.08570973575115204, "learning_rate": 3.694077545803153e-05, "loss": 0.0014, "step": 882 }, { "epoch": 0.33857361963190186, "grad_norm": 14.040021896362305, "learning_rate": 
3.691947166595654e-05, "loss": 0.0878, "step": 883 }, { "epoch": 0.3389570552147239, "grad_norm": 146.0187530517578, "learning_rate": 3.689816787388155e-05, "loss": 1.1436, "step": 884 }, { "epoch": 0.339340490797546, "grad_norm": 102.55371856689453, "learning_rate": 3.687686408180656e-05, "loss": 5.8774, "step": 885 }, { "epoch": 0.3397239263803681, "grad_norm": 8.883628845214844, "learning_rate": 3.6855560289731574e-05, "loss": 0.0355, "step": 886 }, { "epoch": 0.3401073619631902, "grad_norm": 199.91314697265625, "learning_rate": 3.6834256497656583e-05, "loss": 2.3519, "step": 887 }, { "epoch": 0.34049079754601225, "grad_norm": 74.7225112915039, "learning_rate": 3.681295270558159e-05, "loss": 0.9121, "step": 888 }, { "epoch": 0.34087423312883436, "grad_norm": 8.965007781982422, "learning_rate": 3.67916489135066e-05, "loss": 0.0455, "step": 889 }, { "epoch": 0.3412576687116564, "grad_norm": 21.721363067626953, "learning_rate": 3.677034512143162e-05, "loss": 0.6646, "step": 890 }, { "epoch": 0.3416411042944785, "grad_norm": 2.8286852836608887, "learning_rate": 3.674904132935663e-05, "loss": 0.0111, "step": 891 }, { "epoch": 0.34202453987730064, "grad_norm": 1.648725152015686, "learning_rate": 3.672773753728164e-05, "loss": 0.006, "step": 892 }, { "epoch": 0.3424079754601227, "grad_norm": 53.5578498840332, "learning_rate": 3.6706433745206646e-05, "loss": 0.8672, "step": 893 }, { "epoch": 0.3427914110429448, "grad_norm": 10.51111888885498, "learning_rate": 3.6685129953131655e-05, "loss": 0.0892, "step": 894 }, { "epoch": 0.34317484662576686, "grad_norm": 56.53624725341797, "learning_rate": 3.666382616105667e-05, "loss": 1.2852, "step": 895 }, { "epoch": 0.34355828220858897, "grad_norm": 20.492630004882812, "learning_rate": 3.664252236898168e-05, "loss": 0.5601, "step": 896 }, { "epoch": 0.34394171779141103, "grad_norm": 209.2409210205078, "learning_rate": 3.662121857690669e-05, "loss": 4.2934, "step": 897 }, { "epoch": 0.34432515337423314, "grad_norm": 
48.036991119384766, "learning_rate": 3.65999147848317e-05, "loss": 1.4971, "step": 898 }, { "epoch": 0.3447085889570552, "grad_norm": 28.335689544677734, "learning_rate": 3.6578610992756715e-05, "loss": 0.54, "step": 899 }, { "epoch": 0.3450920245398773, "grad_norm": 16.36398696899414, "learning_rate": 3.6557307200681725e-05, "loss": 0.5205, "step": 900 }, { "epoch": 0.34547546012269936, "grad_norm": 13.94140625, "learning_rate": 3.6536003408606734e-05, "loss": 0.1626, "step": 901 }, { "epoch": 0.3458588957055215, "grad_norm": 5.451650142669678, "learning_rate": 3.6514699616531743e-05, "loss": 0.02, "step": 902 }, { "epoch": 0.3462423312883436, "grad_norm": 1.251380205154419, "learning_rate": 3.649339582445675e-05, "loss": 0.004, "step": 903 }, { "epoch": 0.34662576687116564, "grad_norm": 145.5535888671875, "learning_rate": 3.647209203238177e-05, "loss": 1.4522, "step": 904 }, { "epoch": 0.34700920245398775, "grad_norm": 59.523311614990234, "learning_rate": 3.645078824030678e-05, "loss": 1.4131, "step": 905 }, { "epoch": 0.3473926380368098, "grad_norm": 132.08468627929688, "learning_rate": 3.642948444823179e-05, "loss": 3.3542, "step": 906 }, { "epoch": 0.3477760736196319, "grad_norm": 165.15797424316406, "learning_rate": 3.64081806561568e-05, "loss": 2.2759, "step": 907 }, { "epoch": 0.348159509202454, "grad_norm": 1.4160902500152588, "learning_rate": 3.6386876864081806e-05, "loss": 0.0076, "step": 908 }, { "epoch": 0.3485429447852761, "grad_norm": 0.31354132294654846, "learning_rate": 3.636557307200682e-05, "loss": 0.0015, "step": 909 }, { "epoch": 0.34892638036809814, "grad_norm": 13.58237075805664, "learning_rate": 3.634426927993183e-05, "loss": 0.2168, "step": 910 }, { "epoch": 0.34930981595092025, "grad_norm": 45.58867263793945, "learning_rate": 3.632296548785684e-05, "loss": 0.5972, "step": 911 }, { "epoch": 0.3496932515337423, "grad_norm": 13.427237510681152, "learning_rate": 3.630166169578185e-05, "loss": 0.52, "step": 912 }, { "epoch": 0.3500766871165644, 
"grad_norm": 17.51856803894043, "learning_rate": 3.6280357903706866e-05, "loss": 0.5234, "step": 913 }, { "epoch": 0.3504601226993865, "grad_norm": 4.446659088134766, "learning_rate": 3.6259054111631875e-05, "loss": 0.0372, "step": 914 }, { "epoch": 0.3508435582822086, "grad_norm": 1.4216930866241455, "learning_rate": 3.6237750319556885e-05, "loss": 0.0073, "step": 915 }, { "epoch": 0.3512269938650307, "grad_norm": 77.42320251464844, "learning_rate": 3.6216446527481894e-05, "loss": 1.1709, "step": 916 }, { "epoch": 0.35161042944785276, "grad_norm": 97.61112213134766, "learning_rate": 3.6195142735406903e-05, "loss": 0.8571, "step": 917 }, { "epoch": 0.35199386503067487, "grad_norm": 135.04241943359375, "learning_rate": 3.617383894333192e-05, "loss": 2.0392, "step": 918 }, { "epoch": 0.3523773006134969, "grad_norm": 239.87155151367188, "learning_rate": 3.615253515125693e-05, "loss": 6.3501, "step": 919 }, { "epoch": 0.35276073619631904, "grad_norm": 4.7225165367126465, "learning_rate": 3.613123135918194e-05, "loss": 0.0113, "step": 920 }, { "epoch": 0.3531441717791411, "grad_norm": 288.6584167480469, "learning_rate": 3.610992756710695e-05, "loss": 1.7044, "step": 921 }, { "epoch": 0.3535276073619632, "grad_norm": 16.934629440307617, "learning_rate": 3.608862377503196e-05, "loss": 0.0699, "step": 922 }, { "epoch": 0.35391104294478526, "grad_norm": 19.55699920654297, "learning_rate": 3.606731998295697e-05, "loss": 0.5528, "step": 923 }, { "epoch": 0.35429447852760737, "grad_norm": 170.64454650878906, "learning_rate": 3.6046016190881975e-05, "loss": 5.6133, "step": 924 }, { "epoch": 0.3546779141104294, "grad_norm": 65.25788879394531, "learning_rate": 3.6024712398806985e-05, "loss": 1.1045, "step": 925 }, { "epoch": 0.35506134969325154, "grad_norm": 56.15605545043945, "learning_rate": 3.6003408606732e-05, "loss": 1.2637, "step": 926 }, { "epoch": 0.3554447852760736, "grad_norm": 18.860065460205078, "learning_rate": 3.598210481465701e-05, "loss": 0.0143, "step": 927 }, { 
"epoch": 0.3558282208588957, "grad_norm": 7.427710056304932, "learning_rate": 3.596080102258202e-05, "loss": 0.0345, "step": 928 }, { "epoch": 0.3562116564417178, "grad_norm": 34.25039291381836, "learning_rate": 3.593949723050703e-05, "loss": 0.8027, "step": 929 }, { "epoch": 0.3565950920245399, "grad_norm": 6.015954971313477, "learning_rate": 3.591819343843204e-05, "loss": 0.4458, "step": 930 }, { "epoch": 0.356978527607362, "grad_norm": 1.2891515493392944, "learning_rate": 3.5896889646357054e-05, "loss": 0.0023, "step": 931 }, { "epoch": 0.35736196319018404, "grad_norm": 253.78134155273438, "learning_rate": 3.5875585854282063e-05, "loss": 6.0605, "step": 932 }, { "epoch": 0.35774539877300615, "grad_norm": 2.394024610519409, "learning_rate": 3.585428206220707e-05, "loss": 0.0065, "step": 933 }, { "epoch": 0.3581288343558282, "grad_norm": 105.65200805664062, "learning_rate": 3.583297827013208e-05, "loss": 1.461, "step": 934 }, { "epoch": 0.3585122699386503, "grad_norm": 3.791978597640991, "learning_rate": 3.581167447805709e-05, "loss": 0.0257, "step": 935 }, { "epoch": 0.3588957055214724, "grad_norm": 175.56011962890625, "learning_rate": 3.579037068598211e-05, "loss": 2.4556, "step": 936 }, { "epoch": 0.3592791411042945, "grad_norm": 2.273514747619629, "learning_rate": 3.576906689390712e-05, "loss": 0.0097, "step": 937 }, { "epoch": 0.35966257668711654, "grad_norm": 30.347814559936523, "learning_rate": 3.5747763101832126e-05, "loss": 0.5523, "step": 938 }, { "epoch": 0.36004601226993865, "grad_norm": 48.91766357421875, "learning_rate": 3.5726459309757135e-05, "loss": 0.6304, "step": 939 }, { "epoch": 0.3604294478527607, "grad_norm": 12.900555610656738, "learning_rate": 3.570515551768215e-05, "loss": 0.5147, "step": 940 }, { "epoch": 0.3608128834355828, "grad_norm": 109.76214599609375, "learning_rate": 3.568385172560716e-05, "loss": 1.792, "step": 941 }, { "epoch": 0.36119631901840493, "grad_norm": 10.001334190368652, "learning_rate": 3.566254793353217e-05, "loss": 
0.2402, "step": 942 }, { "epoch": 0.361579754601227, "grad_norm": 15.179523468017578, "learning_rate": 3.564124414145718e-05, "loss": 0.0724, "step": 943 }, { "epoch": 0.3619631901840491, "grad_norm": 3.511728048324585, "learning_rate": 3.561994034938219e-05, "loss": 0.0056, "step": 944 }, { "epoch": 0.36234662576687116, "grad_norm": 96.32710266113281, "learning_rate": 3.5598636557307205e-05, "loss": 6.5293, "step": 945 }, { "epoch": 0.36273006134969327, "grad_norm": 6.759649753570557, "learning_rate": 3.5577332765232214e-05, "loss": 0.0846, "step": 946 }, { "epoch": 0.3631134969325153, "grad_norm": 14.899292945861816, "learning_rate": 3.5556028973157223e-05, "loss": 0.104, "step": 947 }, { "epoch": 0.36349693251533743, "grad_norm": 8.704339027404785, "learning_rate": 3.553472518108223e-05, "loss": 0.0911, "step": 948 }, { "epoch": 0.3638803680981595, "grad_norm": 5.973624229431152, "learning_rate": 3.551342138900724e-05, "loss": 0.0193, "step": 949 }, { "epoch": 0.3642638036809816, "grad_norm": 178.04185485839844, "learning_rate": 3.549211759693226e-05, "loss": 1.0596, "step": 950 }, { "epoch": 0.36464723926380366, "grad_norm": 126.13075256347656, "learning_rate": 3.547081380485727e-05, "loss": 1.6575, "step": 951 }, { "epoch": 0.36503067484662577, "grad_norm": 101.19743347167969, "learning_rate": 3.544951001278228e-05, "loss": 5.752, "step": 952 }, { "epoch": 0.3654141104294479, "grad_norm": 11.74567985534668, "learning_rate": 3.5428206220707286e-05, "loss": 0.2307, "step": 953 }, { "epoch": 0.36579754601226994, "grad_norm": 24.42420196533203, "learning_rate": 3.54069024286323e-05, "loss": 0.1229, "step": 954 }, { "epoch": 0.36618098159509205, "grad_norm": 113.34022521972656, "learning_rate": 3.538559863655731e-05, "loss": 1.3563, "step": 955 }, { "epoch": 0.3665644171779141, "grad_norm": 16.56443214416504, "learning_rate": 3.536429484448232e-05, "loss": 0.4824, "step": 956 }, { "epoch": 0.3669478527607362, "grad_norm": 4.898117542266846, "learning_rate": 
3.534299105240733e-05, "loss": 0.0257, "step": 957 }, { "epoch": 0.36733128834355827, "grad_norm": 11.129241943359375, "learning_rate": 3.532168726033234e-05, "loss": 0.0542, "step": 958 }, { "epoch": 0.3677147239263804, "grad_norm": 31.687559127807617, "learning_rate": 3.5300383468257355e-05, "loss": 0.6754, "step": 959 }, { "epoch": 0.36809815950920244, "grad_norm": 1224.97607421875, "learning_rate": 3.5279079676182365e-05, "loss": 5.4961, "step": 960 }, { "epoch": 0.36848159509202455, "grad_norm": 19.937158584594727, "learning_rate": 3.5257775884107374e-05, "loss": 0.5596, "step": 961 }, { "epoch": 0.3688650306748466, "grad_norm": 15.832524299621582, "learning_rate": 3.5236472092032383e-05, "loss": 0.09, "step": 962 }, { "epoch": 0.3692484662576687, "grad_norm": 139.12794494628906, "learning_rate": 3.52151682999574e-05, "loss": 0.9937, "step": 963 }, { "epoch": 0.3696319018404908, "grad_norm": 218.58534240722656, "learning_rate": 3.519386450788241e-05, "loss": 2.086, "step": 964 }, { "epoch": 0.3700153374233129, "grad_norm": 8.590350151062012, "learning_rate": 3.517256071580742e-05, "loss": 0.1904, "step": 965 }, { "epoch": 0.370398773006135, "grad_norm": 6.688065052032471, "learning_rate": 3.515125692373243e-05, "loss": 0.0427, "step": 966 }, { "epoch": 0.37078220858895705, "grad_norm": 24.327075958251953, "learning_rate": 3.512995313165744e-05, "loss": 0.645, "step": 967 }, { "epoch": 0.37116564417177916, "grad_norm": 2.1375648975372314, "learning_rate": 3.510864933958245e-05, "loss": 0.0069, "step": 968 }, { "epoch": 0.3715490797546012, "grad_norm": 1.738763689994812, "learning_rate": 3.5087345547507455e-05, "loss": 0.007, "step": 969 }, { "epoch": 0.37193251533742333, "grad_norm": 4.914362907409668, "learning_rate": 3.5066041755432465e-05, "loss": 0.4139, "step": 970 }, { "epoch": 0.3723159509202454, "grad_norm": 13.827409744262695, "learning_rate": 3.5044737963357474e-05, "loss": 0.5098, "step": 971 }, { "epoch": 0.3726993865030675, "grad_norm": 
1.2973281145095825, "learning_rate": 3.502343417128249e-05, "loss": 0.0035, "step": 972 }, { "epoch": 0.37308282208588955, "grad_norm": 138.20770263671875, "learning_rate": 3.50021303792075e-05, "loss": 0.9175, "step": 973 }, { "epoch": 0.37346625766871167, "grad_norm": 0.4478858709335327, "learning_rate": 3.498082658713251e-05, "loss": 0.0031, "step": 974 }, { "epoch": 0.3738496932515337, "grad_norm": 0.9873825311660767, "learning_rate": 3.495952279505752e-05, "loss": 0.0056, "step": 975 }, { "epoch": 0.37423312883435583, "grad_norm": 43.3076171875, "learning_rate": 3.493821900298253e-05, "loss": 0.7267, "step": 976 }, { "epoch": 0.3746165644171779, "grad_norm": 1.7374697923660278, "learning_rate": 3.4916915210907543e-05, "loss": 0.0115, "step": 977 }, { "epoch": 0.375, "grad_norm": 28.69477081298828, "learning_rate": 3.489561141883255e-05, "loss": 0.5532, "step": 978 }, { "epoch": 0.3753834355828221, "grad_norm": 10.242043495178223, "learning_rate": 3.487430762675756e-05, "loss": 0.4641, "step": 979 }, { "epoch": 0.37576687116564417, "grad_norm": 15.66881275177002, "learning_rate": 3.485300383468257e-05, "loss": 0.1858, "step": 980 }, { "epoch": 0.3761503067484663, "grad_norm": 9.314672470092773, "learning_rate": 3.483170004260759e-05, "loss": 0.051, "step": 981 }, { "epoch": 0.37653374233128833, "grad_norm": 9.923254013061523, "learning_rate": 3.48103962505326e-05, "loss": 0.0828, "step": 982 }, { "epoch": 0.37691717791411045, "grad_norm": 1.2475645542144775, "learning_rate": 3.4789092458457606e-05, "loss": 0.0059, "step": 983 }, { "epoch": 0.3773006134969325, "grad_norm": 60.9388313293457, "learning_rate": 3.4767788666382615e-05, "loss": 1.0782, "step": 984 }, { "epoch": 0.3776840490797546, "grad_norm": 43.85203552246094, "learning_rate": 3.4746484874307625e-05, "loss": 0.9326, "step": 985 }, { "epoch": 0.37806748466257667, "grad_norm": 7.825187683105469, "learning_rate": 3.472518108223264e-05, "loss": 0.4473, "step": 986 }, { "epoch": 0.3784509202453988, 
"grad_norm": 3.5926787853240967, "learning_rate": 3.470387729015765e-05, "loss": 0.0061, "step": 987 }, { "epoch": 0.37883435582822084, "grad_norm": 2.565843105316162, "learning_rate": 3.468257349808266e-05, "loss": 0.016, "step": 988 }, { "epoch": 0.37921779141104295, "grad_norm": 11.560431480407715, "learning_rate": 3.466126970600767e-05, "loss": 0.0822, "step": 989 }, { "epoch": 0.379601226993865, "grad_norm": 12.063653945922852, "learning_rate": 3.4639965913932685e-05, "loss": 0.0968, "step": 990 }, { "epoch": 0.3799846625766871, "grad_norm": 5.371000289916992, "learning_rate": 3.4618662121857694e-05, "loss": 0.4231, "step": 991 }, { "epoch": 0.3803680981595092, "grad_norm": 0.4216524660587311, "learning_rate": 3.4597358329782703e-05, "loss": 0.0027, "step": 992 }, { "epoch": 0.3807515337423313, "grad_norm": 0.5369511246681213, "learning_rate": 3.457605453770771e-05, "loss": 0.0026, "step": 993 }, { "epoch": 0.3811349693251534, "grad_norm": 66.50501251220703, "learning_rate": 3.455475074563272e-05, "loss": 0.9512, "step": 994 }, { "epoch": 0.38151840490797545, "grad_norm": 31.421464920043945, "learning_rate": 3.453344695355774e-05, "loss": 0.645, "step": 995 }, { "epoch": 0.38190184049079756, "grad_norm": 57.370025634765625, "learning_rate": 3.451214316148275e-05, "loss": 0.6948, "step": 996 }, { "epoch": 0.3822852760736196, "grad_norm": 116.8099594116211, "learning_rate": 3.449083936940776e-05, "loss": 5.9358, "step": 997 }, { "epoch": 0.38266871165644173, "grad_norm": 58.960086822509766, "learning_rate": 3.4469535577332766e-05, "loss": 0.7402, "step": 998 }, { "epoch": 0.3830521472392638, "grad_norm": 2.194159984588623, "learning_rate": 3.4448231785257775e-05, "loss": 0.0069, "step": 999 }, { "epoch": 0.3834355828220859, "grad_norm": 1.4175691604614258, "learning_rate": 3.442692799318279e-05, "loss": 0.005, "step": 1000 }, { "epoch": 0.38381901840490795, "grad_norm": 1.2209025621414185, "learning_rate": 3.44056242011078e-05, "loss": 0.0048, "step": 1001 }, { 
"epoch": 0.38420245398773006, "grad_norm": 8.037437438964844, "learning_rate": 3.438432040903281e-05, "loss": 0.0942, "step": 1002 }, { "epoch": 0.3845858895705521, "grad_norm": 0.16226428747177124, "learning_rate": 3.436301661695782e-05, "loss": 0.0023, "step": 1003 }, { "epoch": 0.38496932515337423, "grad_norm": 10.410154342651367, "learning_rate": 3.4341712824882835e-05, "loss": 0.037, "step": 1004 }, { "epoch": 0.38535276073619634, "grad_norm": 9.644920349121094, "learning_rate": 3.4320409032807845e-05, "loss": 0.0425, "step": 1005 }, { "epoch": 0.3857361963190184, "grad_norm": 0.09576055407524109, "learning_rate": 3.4299105240732854e-05, "loss": 0.0012, "step": 1006 }, { "epoch": 0.3861196319018405, "grad_norm": 1.5106265544891357, "learning_rate": 3.4277801448657863e-05, "loss": 0.0071, "step": 1007 }, { "epoch": 0.38650306748466257, "grad_norm": 5.6399126052856445, "learning_rate": 3.425649765658287e-05, "loss": 0.0083, "step": 1008 }, { "epoch": 0.3868865030674847, "grad_norm": 0.7495969533920288, "learning_rate": 3.423519386450789e-05, "loss": 0.0038, "step": 1009 }, { "epoch": 0.38726993865030673, "grad_norm": 107.00861358642578, "learning_rate": 3.42138900724329e-05, "loss": 2.4652, "step": 1010 }, { "epoch": 0.38765337423312884, "grad_norm": 4.924862861633301, "learning_rate": 3.419258628035791e-05, "loss": 0.0218, "step": 1011 }, { "epoch": 0.3880368098159509, "grad_norm": 10.599689483642578, "learning_rate": 3.417128248828292e-05, "loss": 0.4526, "step": 1012 }, { "epoch": 0.388420245398773, "grad_norm": 54.0268669128418, "learning_rate": 3.4149978696207926e-05, "loss": 0.8702, "step": 1013 }, { "epoch": 0.38880368098159507, "grad_norm": 78.01480865478516, "learning_rate": 3.4128674904132935e-05, "loss": 1.5245, "step": 1014 }, { "epoch": 0.3891871165644172, "grad_norm": 10.43925952911377, "learning_rate": 3.4107371112057945e-05, "loss": 0.0676, "step": 1015 }, { "epoch": 0.3895705521472393, "grad_norm": 2.993187427520752, "learning_rate": 
3.4086067319982954e-05, "loss": 0.0068, "step": 1016 }, { "epoch": 0.38995398773006135, "grad_norm": 0.821253776550293, "learning_rate": 3.406476352790797e-05, "loss": 0.0037, "step": 1017 }, { "epoch": 0.39033742331288346, "grad_norm": 87.67337799072266, "learning_rate": 3.404345973583298e-05, "loss": 6.043, "step": 1018 }, { "epoch": 0.3907208588957055, "grad_norm": 142.62059020996094, "learning_rate": 3.402215594375799e-05, "loss": 1.3897, "step": 1019 }, { "epoch": 0.3911042944785276, "grad_norm": 209.81443786621094, "learning_rate": 3.4000852151683e-05, "loss": 3.2858, "step": 1020 }, { "epoch": 0.3914877300613497, "grad_norm": 30.382448196411133, "learning_rate": 3.397954835960801e-05, "loss": 0.7295, "step": 1021 }, { "epoch": 0.3918711656441718, "grad_norm": 2.5746898651123047, "learning_rate": 3.3958244567533023e-05, "loss": 0.0051, "step": 1022 }, { "epoch": 0.39225460122699385, "grad_norm": 28.87703514099121, "learning_rate": 3.393694077545803e-05, "loss": 0.731, "step": 1023 }, { "epoch": 0.39263803680981596, "grad_norm": 165.62081909179688, "learning_rate": 3.391563698338304e-05, "loss": 2.7097, "step": 1024 }, { "epoch": 0.393021472392638, "grad_norm": 0.3038354814052582, "learning_rate": 3.389433319130805e-05, "loss": 0.0019, "step": 1025 }, { "epoch": 0.3934049079754601, "grad_norm": 1.9357569217681885, "learning_rate": 3.387302939923306e-05, "loss": 0.0062, "step": 1026 }, { "epoch": 0.3937883435582822, "grad_norm": 5.303816795349121, "learning_rate": 3.385172560715808e-05, "loss": 0.0374, "step": 1027 }, { "epoch": 0.3941717791411043, "grad_norm": 34.76011276245117, "learning_rate": 3.3830421815083086e-05, "loss": 0.6807, "step": 1028 }, { "epoch": 0.3945552147239264, "grad_norm": 7.539120197296143, "learning_rate": 3.3809118023008095e-05, "loss": 0.0786, "step": 1029 }, { "epoch": 0.39493865030674846, "grad_norm": 6.309747219085693, "learning_rate": 3.3787814230933105e-05, "loss": 0.0261, "step": 1030 }, { "epoch": 0.3953220858895706, 
"grad_norm": 106.8431396484375, "learning_rate": 3.376651043885812e-05, "loss": 1.046, "step": 1031 }, { "epoch": 0.39570552147239263, "grad_norm": 8.58929443359375, "learning_rate": 3.374520664678313e-05, "loss": 0.467, "step": 1032 }, { "epoch": 0.39608895705521474, "grad_norm": 46.64529800415039, "learning_rate": 3.372390285470814e-05, "loss": 0.645, "step": 1033 }, { "epoch": 0.3964723926380368, "grad_norm": 138.46295166015625, "learning_rate": 3.370259906263315e-05, "loss": 5.7383, "step": 1034 }, { "epoch": 0.3968558282208589, "grad_norm": 1.559178113937378, "learning_rate": 3.368129527055816e-05, "loss": 0.0107, "step": 1035 }, { "epoch": 0.39723926380368096, "grad_norm": 8.139238357543945, "learning_rate": 3.3659991478483174e-05, "loss": 0.0212, "step": 1036 }, { "epoch": 0.3976226993865031, "grad_norm": 69.52721405029297, "learning_rate": 3.3638687686408183e-05, "loss": 0.6763, "step": 1037 }, { "epoch": 0.39800613496932513, "grad_norm": 80.78510284423828, "learning_rate": 3.361738389433319e-05, "loss": 0.6592, "step": 1038 }, { "epoch": 0.39838957055214724, "grad_norm": 1.6643552780151367, "learning_rate": 3.35960801022582e-05, "loss": 0.0042, "step": 1039 }, { "epoch": 0.3987730061349693, "grad_norm": 35.950103759765625, "learning_rate": 3.357477631018321e-05, "loss": 0.6318, "step": 1040 }, { "epoch": 0.3991564417177914, "grad_norm": 5.243088722229004, "learning_rate": 3.355347251810823e-05, "loss": 0.0222, "step": 1041 }, { "epoch": 0.3995398773006135, "grad_norm": 1.2611973285675049, "learning_rate": 3.353216872603324e-05, "loss": 0.004, "step": 1042 }, { "epoch": 0.3999233128834356, "grad_norm": 7.315226078033447, "learning_rate": 3.3510864933958246e-05, "loss": 0.173, "step": 1043 }, { "epoch": 0.4003067484662577, "grad_norm": 0.6392849683761597, "learning_rate": 3.3489561141883255e-05, "loss": 0.0028, "step": 1044 }, { "epoch": 0.40069018404907975, "grad_norm": 142.23976135253906, "learning_rate": 3.346825734980827e-05, "loss": 5.9639, "step": 1045 
}, { "epoch": 0.40107361963190186, "grad_norm": 27.186695098876953, "learning_rate": 3.344695355773328e-05, "loss": 0.2051, "step": 1046 }, { "epoch": 0.4014570552147239, "grad_norm": 108.6761474609375, "learning_rate": 3.342564976565829e-05, "loss": 2.6102, "step": 1047 }, { "epoch": 0.401840490797546, "grad_norm": 40.983646392822266, "learning_rate": 3.34043459735833e-05, "loss": 0.6411, "step": 1048 }, { "epoch": 0.4022239263803681, "grad_norm": 0.22776922583580017, "learning_rate": 3.338304218150831e-05, "loss": 0.0019, "step": 1049 }, { "epoch": 0.4026073619631902, "grad_norm": 2.4507944583892822, "learning_rate": 3.3361738389433325e-05, "loss": 0.0061, "step": 1050 }, { "epoch": 0.40299079754601225, "grad_norm": 13.551019668579102, "learning_rate": 3.3340434597358334e-05, "loss": 0.0541, "step": 1051 }, { "epoch": 0.40337423312883436, "grad_norm": 1.0843762159347534, "learning_rate": 3.3319130805283343e-05, "loss": 0.0046, "step": 1052 }, { "epoch": 0.4037576687116564, "grad_norm": 79.6090316772461, "learning_rate": 3.329782701320835e-05, "loss": 1.67, "step": 1053 }, { "epoch": 0.4041411042944785, "grad_norm": 221.82077026367188, "learning_rate": 3.327652322113337e-05, "loss": 2.742, "step": 1054 }, { "epoch": 0.40452453987730064, "grad_norm": 15.52837085723877, "learning_rate": 3.325521942905838e-05, "loss": 0.2072, "step": 1055 }, { "epoch": 0.4049079754601227, "grad_norm": 21.680545806884766, "learning_rate": 3.323391563698339e-05, "loss": 0.5474, "step": 1056 }, { "epoch": 0.4052914110429448, "grad_norm": 91.9194107055664, "learning_rate": 3.32126118449084e-05, "loss": 2.147, "step": 1057 }, { "epoch": 0.40567484662576686, "grad_norm": 6.127416610717773, "learning_rate": 3.3191308052833406e-05, "loss": 0.0274, "step": 1058 }, { "epoch": 0.40605828220858897, "grad_norm": 96.55035400390625, "learning_rate": 3.3170004260758415e-05, "loss": 5.8691, "step": 1059 }, { "epoch": 0.40644171779141103, "grad_norm": 129.17575073242188, "learning_rate": 
3.3148700468683425e-05, "loss": 1.4494, "step": 1060 }, { "epoch": 0.40682515337423314, "grad_norm": 14.602612495422363, "learning_rate": 3.3127396676608434e-05, "loss": 0.1394, "step": 1061 }, { "epoch": 0.4072085889570552, "grad_norm": 165.3533172607422, "learning_rate": 3.310609288453344e-05, "loss": 5.4185, "step": 1062 }, { "epoch": 0.4075920245398773, "grad_norm": 114.32776641845703, "learning_rate": 3.308478909245846e-05, "loss": 0.9317, "step": 1063 }, { "epoch": 0.40797546012269936, "grad_norm": 10.328788757324219, "learning_rate": 3.306348530038347e-05, "loss": 0.0949, "step": 1064 }, { "epoch": 0.4083588957055215, "grad_norm": 73.43679809570312, "learning_rate": 3.304218150830848e-05, "loss": 1.0195, "step": 1065 }, { "epoch": 0.4087423312883436, "grad_norm": 94.94047546386719, "learning_rate": 3.302087771623349e-05, "loss": 5.0684, "step": 1066 }, { "epoch": 0.40912576687116564, "grad_norm": 34.3326301574707, "learning_rate": 3.2999573924158503e-05, "loss": 0.0419, "step": 1067 }, { "epoch": 0.40950920245398775, "grad_norm": 2.6018779277801514, "learning_rate": 3.297827013208351e-05, "loss": 0.0124, "step": 1068 }, { "epoch": 0.4098926380368098, "grad_norm": 1.7786577939987183, "learning_rate": 3.295696634000852e-05, "loss": 0.0086, "step": 1069 }, { "epoch": 0.4102760736196319, "grad_norm": 5.96002197265625, "learning_rate": 3.293566254793353e-05, "loss": 0.0331, "step": 1070 }, { "epoch": 0.410659509202454, "grad_norm": 9.824308395385742, "learning_rate": 3.291435875585854e-05, "loss": 0.4092, "step": 1071 }, { "epoch": 0.4110429447852761, "grad_norm": 1.0867952108383179, "learning_rate": 3.289305496378356e-05, "loss": 0.0044, "step": 1072 }, { "epoch": 0.41142638036809814, "grad_norm": 141.3063507080078, "learning_rate": 3.2871751171708566e-05, "loss": 2.0315, "step": 1073 }, { "epoch": 0.41180981595092025, "grad_norm": 59.421531677246094, "learning_rate": 3.2850447379633575e-05, "loss": 1.2481, "step": 1074 }, { "epoch": 0.4121932515337423, 
"grad_norm": 414.6046142578125, "learning_rate": 3.2829143587558585e-05, "loss": 3.2283, "step": 1075 }, { "epoch": 0.4125766871165644, "grad_norm": 8.04227352142334, "learning_rate": 3.2807839795483594e-05, "loss": 0.0501, "step": 1076 }, { "epoch": 0.4129601226993865, "grad_norm": 2.519871711730957, "learning_rate": 3.278653600340861e-05, "loss": 0.0129, "step": 1077 }, { "epoch": 0.4133435582822086, "grad_norm": 9.231657981872559, "learning_rate": 3.276523221133362e-05, "loss": 0.1472, "step": 1078 }, { "epoch": 0.4137269938650307, "grad_norm": 0.6013676524162292, "learning_rate": 3.274392841925863e-05, "loss": 0.0028, "step": 1079 }, { "epoch": 0.41411042944785276, "grad_norm": 9.617133140563965, "learning_rate": 3.272262462718364e-05, "loss": 0.033, "step": 1080 }, { "epoch": 0.41449386503067487, "grad_norm": 0.8977956771850586, "learning_rate": 3.2701320835108654e-05, "loss": 0.0046, "step": 1081 }, { "epoch": 0.4148773006134969, "grad_norm": 90.50689697265625, "learning_rate": 3.2680017043033663e-05, "loss": 5.4648, "step": 1082 }, { "epoch": 0.41526073619631904, "grad_norm": 0.3941224217414856, "learning_rate": 3.265871325095867e-05, "loss": 0.0039, "step": 1083 }, { "epoch": 0.4156441717791411, "grad_norm": 73.57256317138672, "learning_rate": 3.263740945888368e-05, "loss": 6.416, "step": 1084 }, { "epoch": 0.4160276073619632, "grad_norm": 59.00543975830078, "learning_rate": 3.261610566680869e-05, "loss": 0.7217, "step": 1085 }, { "epoch": 0.41641104294478526, "grad_norm": 3.38399600982666, "learning_rate": 3.259480187473371e-05, "loss": 0.0067, "step": 1086 }, { "epoch": 0.41679447852760737, "grad_norm": 46.32831573486328, "learning_rate": 3.257349808265872e-05, "loss": 0.6934, "step": 1087 }, { "epoch": 0.4171779141104294, "grad_norm": 146.98757934570312, "learning_rate": 3.2552194290583726e-05, "loss": 5.5771, "step": 1088 }, { "epoch": 0.41756134969325154, "grad_norm": 0.4296836256980896, "learning_rate": 3.2530890498508735e-05, "loss": 0.0025, "step": 
1089 }, { "epoch": 0.4179447852760736, "grad_norm": 111.8563003540039, "learning_rate": 3.2509586706433745e-05, "loss": 5.4316, "step": 1090 }, { "epoch": 0.4183282208588957, "grad_norm": 6.3267083168029785, "learning_rate": 3.248828291435876e-05, "loss": 0.4231, "step": 1091 }, { "epoch": 0.4187116564417178, "grad_norm": 7.0891194343566895, "learning_rate": 3.246697912228377e-05, "loss": 0.0222, "step": 1092 }, { "epoch": 0.4190950920245399, "grad_norm": 1.2289345264434814, "learning_rate": 3.244567533020878e-05, "loss": 0.0039, "step": 1093 }, { "epoch": 0.419478527607362, "grad_norm": 7.595825672149658, "learning_rate": 3.242437153813379e-05, "loss": 0.0247, "step": 1094 }, { "epoch": 0.41986196319018404, "grad_norm": 3.3048288822174072, "learning_rate": 3.2403067746058805e-05, "loss": 0.0148, "step": 1095 }, { "epoch": 0.42024539877300615, "grad_norm": 79.68567657470703, "learning_rate": 3.2381763953983814e-05, "loss": 6.0049, "step": 1096 }, { "epoch": 0.4206288343558282, "grad_norm": 15.561921119689941, "learning_rate": 3.236046016190882e-05, "loss": 0.1219, "step": 1097 }, { "epoch": 0.4210122699386503, "grad_norm": 76.29059600830078, "learning_rate": 3.233915636983383e-05, "loss": 1.2828, "step": 1098 }, { "epoch": 0.4213957055214724, "grad_norm": 48.52256774902344, "learning_rate": 3.231785257775884e-05, "loss": 0.5698, "step": 1099 }, { "epoch": 0.4217791411042945, "grad_norm": 1.1661450862884521, "learning_rate": 3.229654878568386e-05, "loss": 0.0048, "step": 1100 }, { "epoch": 0.42216257668711654, "grad_norm": 0.9635507464408875, "learning_rate": 3.227524499360887e-05, "loss": 0.0042, "step": 1101 }, { "epoch": 0.42254601226993865, "grad_norm": 6.463715553283691, "learning_rate": 3.225394120153388e-05, "loss": 0.0227, "step": 1102 }, { "epoch": 0.4229294478527607, "grad_norm": 60.45621871948242, "learning_rate": 3.2232637409458886e-05, "loss": 0.8892, "step": 1103 }, { "epoch": 0.4233128834355828, "grad_norm": 2.652829647064209, "learning_rate": 
3.2211333617383895e-05, "loss": 0.0126, "step": 1104 }, { "epoch": 0.42369631901840493, "grad_norm": 16.4697322845459, "learning_rate": 3.2190029825308905e-05, "loss": 0.1293, "step": 1105 }, { "epoch": 0.424079754601227, "grad_norm": 0.6932334303855896, "learning_rate": 3.2168726033233914e-05, "loss": 0.0022, "step": 1106 }, { "epoch": 0.4244631901840491, "grad_norm": 3.634219169616699, "learning_rate": 3.214742224115892e-05, "loss": 0.0057, "step": 1107 }, { "epoch": 0.42484662576687116, "grad_norm": 6.787144184112549, "learning_rate": 3.212611844908394e-05, "loss": 0.4788, "step": 1108 }, { "epoch": 0.42523006134969327, "grad_norm": 2.133728504180908, "learning_rate": 3.210481465700895e-05, "loss": 0.0099, "step": 1109 }, { "epoch": 0.4256134969325153, "grad_norm": 0.46014270186424255, "learning_rate": 3.208351086493396e-05, "loss": 0.0027, "step": 1110 }, { "epoch": 0.42599693251533743, "grad_norm": 7.713406085968018, "learning_rate": 3.206220707285897e-05, "loss": 0.032, "step": 1111 }, { "epoch": 0.4263803680981595, "grad_norm": 123.03546142578125, "learning_rate": 3.2040903280783977e-05, "loss": 0.9968, "step": 1112 }, { "epoch": 0.4267638036809816, "grad_norm": 64.26725006103516, "learning_rate": 3.201959948870899e-05, "loss": 1.3106, "step": 1113 }, { "epoch": 0.42714723926380366, "grad_norm": 0.5695100426673889, "learning_rate": 3.1998295696634e-05, "loss": 0.0039, "step": 1114 }, { "epoch": 0.42753067484662577, "grad_norm": 12.705592155456543, "learning_rate": 3.197699190455901e-05, "loss": 0.5059, "step": 1115 }, { "epoch": 0.4279141104294479, "grad_norm": 154.61781311035156, "learning_rate": 3.195568811248402e-05, "loss": 5.0117, "step": 1116 }, { "epoch": 0.42829754601226994, "grad_norm": 0.260300874710083, "learning_rate": 3.193438432040903e-05, "loss": 0.0019, "step": 1117 }, { "epoch": 0.42868098159509205, "grad_norm": 1.714156985282898, "learning_rate": 3.1913080528334046e-05, "loss": 0.0072, "step": 1118 }, { "epoch": 0.4290644171779141, 
"grad_norm": 7.342320442199707, "learning_rate": 3.1891776736259055e-05, "loss": 0.0997, "step": 1119 }, { "epoch": 0.4294478527607362, "grad_norm": 31.19200897216797, "learning_rate": 3.1870472944184065e-05, "loss": 0.6641, "step": 1120 }, { "epoch": 0.42983128834355827, "grad_norm": 2.5018272399902344, "learning_rate": 3.1849169152109074e-05, "loss": 0.0097, "step": 1121 }, { "epoch": 0.4302147239263804, "grad_norm": 79.78598022460938, "learning_rate": 3.182786536003409e-05, "loss": 1.3058, "step": 1122 }, { "epoch": 0.43059815950920244, "grad_norm": 0.393129825592041, "learning_rate": 3.18065615679591e-05, "loss": 0.0028, "step": 1123 }, { "epoch": 0.43098159509202455, "grad_norm": 4.799597263336182, "learning_rate": 3.178525777588411e-05, "loss": 0.0454, "step": 1124 }, { "epoch": 0.4313650306748466, "grad_norm": 3.526318311691284, "learning_rate": 3.176395398380912e-05, "loss": 0.0144, "step": 1125 }, { "epoch": 0.4317484662576687, "grad_norm": 0.4182857573032379, "learning_rate": 3.174265019173413e-05, "loss": 0.0027, "step": 1126 }, { "epoch": 0.4321319018404908, "grad_norm": 3.599029779434204, "learning_rate": 3.172134639965914e-05, "loss": 0.0185, "step": 1127 }, { "epoch": 0.4325153374233129, "grad_norm": 13.80219841003418, "learning_rate": 3.170004260758415e-05, "loss": 0.0846, "step": 1128 }, { "epoch": 0.432898773006135, "grad_norm": 30.802892684936523, "learning_rate": 3.167873881550916e-05, "loss": 0.6416, "step": 1129 }, { "epoch": 0.43328220858895705, "grad_norm": 1.0535043478012085, "learning_rate": 3.165743502343417e-05, "loss": 0.0039, "step": 1130 }, { "epoch": 0.43366564417177916, "grad_norm": 25.42630958557129, "learning_rate": 3.163613123135918e-05, "loss": 0.2013, "step": 1131 }, { "epoch": 0.4340490797546012, "grad_norm": 8.263312339782715, "learning_rate": 3.16148274392842e-05, "loss": 0.0891, "step": 1132 }, { "epoch": 0.43443251533742333, "grad_norm": 0.37161657214164734, "learning_rate": 3.1593523647209206e-05, "loss": 0.0031, "step": 
1133 }, { "epoch": 0.4348159509202454, "grad_norm": 10.676396369934082, "learning_rate": 3.1572219855134215e-05, "loss": 0.0371, "step": 1134 }, { "epoch": 0.4351993865030675, "grad_norm": 7.621251583099365, "learning_rate": 3.1550916063059225e-05, "loss": 0.4434, "step": 1135 }, { "epoch": 0.43558282208588955, "grad_norm": 0.4153030514717102, "learning_rate": 3.152961227098424e-05, "loss": 0.0028, "step": 1136 }, { "epoch": 0.43596625766871167, "grad_norm": 7.068398952484131, "learning_rate": 3.150830847890925e-05, "loss": 0.0553, "step": 1137 }, { "epoch": 0.4363496932515337, "grad_norm": 78.07585144042969, "learning_rate": 3.148700468683426e-05, "loss": 1.5176, "step": 1138 }, { "epoch": 0.43673312883435583, "grad_norm": 55.385684967041016, "learning_rate": 3.146570089475927e-05, "loss": 0.9117, "step": 1139 }, { "epoch": 0.4371165644171779, "grad_norm": 5.380411624908447, "learning_rate": 3.144439710268428e-05, "loss": 0.2649, "step": 1140 }, { "epoch": 0.4375, "grad_norm": 1.1014796495437622, "learning_rate": 3.1423093310609294e-05, "loss": 0.0047, "step": 1141 }, { "epoch": 0.4378834355828221, "grad_norm": 60.870994567871094, "learning_rate": 3.14017895185343e-05, "loss": 0.8858, "step": 1142 }, { "epoch": 0.43826687116564417, "grad_norm": 7.248571395874023, "learning_rate": 3.138048572645931e-05, "loss": 0.0342, "step": 1143 }, { "epoch": 0.4386503067484663, "grad_norm": 5.0063371658325195, "learning_rate": 3.135918193438432e-05, "loss": 0.0108, "step": 1144 }, { "epoch": 0.43903374233128833, "grad_norm": 193.22222900390625, "learning_rate": 3.133787814230934e-05, "loss": 3.3618, "step": 1145 }, { "epoch": 0.43941717791411045, "grad_norm": 128.14036560058594, "learning_rate": 3.131657435023435e-05, "loss": 1.5245, "step": 1146 }, { "epoch": 0.4398006134969325, "grad_norm": 12.670878410339355, "learning_rate": 3.129527055815936e-05, "loss": 0.0353, "step": 1147 }, { "epoch": 0.4401840490797546, "grad_norm": 0.30025821924209595, "learning_rate": 
3.1273966766084366e-05, "loss": 0.0028, "step": 1148 }, { "epoch": 0.44056748466257667, "grad_norm": 25.819347381591797, "learning_rate": 3.1252662974009375e-05, "loss": 0.0876, "step": 1149 }, { "epoch": 0.4409509202453988, "grad_norm": 15.05252456665039, "learning_rate": 3.1231359181934385e-05, "loss": 0.1499, "step": 1150 }, { "epoch": 0.44133435582822084, "grad_norm": 12.940930366516113, "learning_rate": 3.1210055389859394e-05, "loss": 0.0503, "step": 1151 }, { "epoch": 0.44171779141104295, "grad_norm": 27.6295166015625, "learning_rate": 3.11887515977844e-05, "loss": 0.5821, "step": 1152 }, { "epoch": 0.442101226993865, "grad_norm": 0.9625844955444336, "learning_rate": 3.116744780570941e-05, "loss": 0.0054, "step": 1153 }, { "epoch": 0.4424846625766871, "grad_norm": 19.52550506591797, "learning_rate": 3.114614401363443e-05, "loss": 0.034, "step": 1154 }, { "epoch": 0.4428680981595092, "grad_norm": 2.0176961421966553, "learning_rate": 3.112484022155944e-05, "loss": 0.0054, "step": 1155 }, { "epoch": 0.4432515337423313, "grad_norm": 209.79808044433594, "learning_rate": 3.110353642948445e-05, "loss": 2.6355, "step": 1156 }, { "epoch": 0.4436349693251534, "grad_norm": 14.19172191619873, "learning_rate": 3.1082232637409457e-05, "loss": 0.5855, "step": 1157 }, { "epoch": 0.44401840490797545, "grad_norm": 62.55099105834961, "learning_rate": 3.106092884533447e-05, "loss": 0.8145, "step": 1158 }, { "epoch": 0.44440184049079756, "grad_norm": 9.882044792175293, "learning_rate": 3.103962505325948e-05, "loss": 0.0322, "step": 1159 }, { "epoch": 0.4447852760736196, "grad_norm": 2.612950325012207, "learning_rate": 3.101832126118449e-05, "loss": 0.0114, "step": 1160 }, { "epoch": 0.44516871165644173, "grad_norm": 19.931814193725586, "learning_rate": 3.09970174691095e-05, "loss": 0.4781, "step": 1161 }, { "epoch": 0.4455521472392638, "grad_norm": 151.32928466796875, "learning_rate": 3.097571367703451e-05, "loss": 4.6748, "step": 1162 }, { "epoch": 0.4459355828220859, 
"grad_norm": 17.60948944091797, "learning_rate": 3.0954409884959526e-05, "loss": 0.1786, "step": 1163 }, { "epoch": 0.44631901840490795, "grad_norm": 8.400739669799805, "learning_rate": 3.0933106092884535e-05, "loss": 0.02, "step": 1164 }, { "epoch": 0.44670245398773006, "grad_norm": 7.452956199645996, "learning_rate": 3.0911802300809545e-05, "loss": 0.0238, "step": 1165 }, { "epoch": 0.4470858895705521, "grad_norm": 174.81600952148438, "learning_rate": 3.0890498508734554e-05, "loss": 1.3851, "step": 1166 }, { "epoch": 0.44746932515337423, "grad_norm": 2.653099536895752, "learning_rate": 3.086919471665956e-05, "loss": 0.0158, "step": 1167 }, { "epoch": 0.44785276073619634, "grad_norm": 0.17586538195610046, "learning_rate": 3.084789092458458e-05, "loss": 0.0015, "step": 1168 }, { "epoch": 0.4482361963190184, "grad_norm": 17.50071144104004, "learning_rate": 3.082658713250959e-05, "loss": 0.0608, "step": 1169 }, { "epoch": 0.4486196319018405, "grad_norm": 10.259678840637207, "learning_rate": 3.08052833404346e-05, "loss": 0.0739, "step": 1170 }, { "epoch": 0.44900306748466257, "grad_norm": 0.9370091557502747, "learning_rate": 3.078397954835961e-05, "loss": 0.0054, "step": 1171 }, { "epoch": 0.4493865030674847, "grad_norm": 1.6083060503005981, "learning_rate": 3.076267575628462e-05, "loss": 0.0076, "step": 1172 }, { "epoch": 0.44976993865030673, "grad_norm": 50.8786506652832, "learning_rate": 3.074137196420963e-05, "loss": 1.0371, "step": 1173 }, { "epoch": 0.45015337423312884, "grad_norm": 210.81854248046875, "learning_rate": 3.072006817213464e-05, "loss": 3.9482, "step": 1174 }, { "epoch": 0.4505368098159509, "grad_norm": 1.0925509929656982, "learning_rate": 3.069876438005965e-05, "loss": 0.0053, "step": 1175 }, { "epoch": 0.450920245398773, "grad_norm": 0.6283935308456421, "learning_rate": 3.067746058798466e-05, "loss": 0.0036, "step": 1176 }, { "epoch": 0.45130368098159507, "grad_norm": 155.3126678466797, "learning_rate": 3.065615679590968e-05, "loss": 5.6431, 
"step": 1177 }, { "epoch": 0.4516871165644172, "grad_norm": 4.078763484954834, "learning_rate": 3.0634853003834686e-05, "loss": 0.0166, "step": 1178 }, { "epoch": 0.4520705521472393, "grad_norm": 332.8621826171875, "learning_rate": 3.0613549211759695e-05, "loss": 3.4974, "step": 1179 }, { "epoch": 0.45245398773006135, "grad_norm": 103.15957641601562, "learning_rate": 3.0592245419684705e-05, "loss": 2.0648, "step": 1180 }, { "epoch": 0.45283742331288346, "grad_norm": 0.7433050274848938, "learning_rate": 3.0570941627609714e-05, "loss": 0.0039, "step": 1181 }, { "epoch": 0.4532208588957055, "grad_norm": 1.3463419675827026, "learning_rate": 3.054963783553473e-05, "loss": 0.0045, "step": 1182 }, { "epoch": 0.4536042944785276, "grad_norm": 62.824432373046875, "learning_rate": 3.052833404345974e-05, "loss": 1.5899, "step": 1183 }, { "epoch": 0.4539877300613497, "grad_norm": 186.881591796875, "learning_rate": 3.050703025138475e-05, "loss": 3.0674, "step": 1184 }, { "epoch": 0.4543711656441718, "grad_norm": 139.8800811767578, "learning_rate": 3.048572645930976e-05, "loss": 2.6504, "step": 1185 }, { "epoch": 0.45475460122699385, "grad_norm": 0.8735361695289612, "learning_rate": 3.046442266723477e-05, "loss": 0.0045, "step": 1186 }, { "epoch": 0.45513803680981596, "grad_norm": 4.634177207946777, "learning_rate": 3.0443118875159783e-05, "loss": 0.0315, "step": 1187 }, { "epoch": 0.455521472392638, "grad_norm": 3.494042158126831, "learning_rate": 3.0421815083084793e-05, "loss": 0.0229, "step": 1188 }, { "epoch": 0.4559049079754601, "grad_norm": 100.4810562133789, "learning_rate": 3.0400511291009802e-05, "loss": 6.0488, "step": 1189 }, { "epoch": 0.4562883435582822, "grad_norm": 0.2752269208431244, "learning_rate": 3.0379207498934815e-05, "loss": 0.0022, "step": 1190 }, { "epoch": 0.4566717791411043, "grad_norm": 8.22811508178711, "learning_rate": 3.0357903706859824e-05, "loss": 0.223, "step": 1191 }, { "epoch": 0.4570552147239264, "grad_norm": 6.318234920501709, 
"learning_rate": 3.0336599914784837e-05, "loss": 0.4302, "step": 1192 }, { "epoch": 0.45743865030674846, "grad_norm": 13.462141036987305, "learning_rate": 3.0315296122709846e-05, "loss": 0.0474, "step": 1193 }, { "epoch": 0.4578220858895706, "grad_norm": 23.509485244750977, "learning_rate": 3.029399233063486e-05, "loss": 0.5611, "step": 1194 }, { "epoch": 0.45820552147239263, "grad_norm": 4.083889007568359, "learning_rate": 3.027268853855986e-05, "loss": 0.0215, "step": 1195 }, { "epoch": 0.45858895705521474, "grad_norm": 232.07672119140625, "learning_rate": 3.0251384746484874e-05, "loss": 6.0332, "step": 1196 }, { "epoch": 0.4589723926380368, "grad_norm": 0.418995201587677, "learning_rate": 3.0230080954409883e-05, "loss": 0.0025, "step": 1197 }, { "epoch": 0.4593558282208589, "grad_norm": 0.27796462178230286, "learning_rate": 3.0208777162334896e-05, "loss": 0.0016, "step": 1198 }, { "epoch": 0.45973926380368096, "grad_norm": 2.5159637928009033, "learning_rate": 3.0187473370259905e-05, "loss": 0.0093, "step": 1199 }, { "epoch": 0.4601226993865031, "grad_norm": 0.2359163910150528, "learning_rate": 3.0166169578184918e-05, "loss": 0.0017, "step": 1200 }, { "epoch": 0.46050613496932513, "grad_norm": 1.0194178819656372, "learning_rate": 3.0144865786109927e-05, "loss": 0.0034, "step": 1201 }, { "epoch": 0.46088957055214724, "grad_norm": 4.287911415100098, "learning_rate": 3.0123561994034937e-05, "loss": 0.0126, "step": 1202 }, { "epoch": 0.4612730061349693, "grad_norm": 0.42968660593032837, "learning_rate": 3.010225820195995e-05, "loss": 0.0031, "step": 1203 }, { "epoch": 0.4616564417177914, "grad_norm": 22.549182891845703, "learning_rate": 3.008095440988496e-05, "loss": 0.5249, "step": 1204 }, { "epoch": 0.4620398773006135, "grad_norm": 1.3578665256500244, "learning_rate": 3.005965061780997e-05, "loss": 0.005, "step": 1205 }, { "epoch": 0.4624233128834356, "grad_norm": 0.51641446352005, "learning_rate": 3.003834682573498e-05, "loss": 0.0029, "step": 1206 }, { "epoch": 
0.4628067484662577, "grad_norm": 0.573008120059967, "learning_rate": 3.0017043033659993e-05, "loss": 0.0036, "step": 1207 }, { "epoch": 0.46319018404907975, "grad_norm": 129.40185546875, "learning_rate": 2.9995739241585003e-05, "loss": 2.0801, "step": 1208 }, { "epoch": 0.46357361963190186, "grad_norm": 31.359634399414062, "learning_rate": 2.9974435449510012e-05, "loss": 0.5342, "step": 1209 }, { "epoch": 0.4639570552147239, "grad_norm": 1.4557340145111084, "learning_rate": 2.9953131657435025e-05, "loss": 0.0035, "step": 1210 }, { "epoch": 0.464340490797546, "grad_norm": 7.835709571838379, "learning_rate": 2.9931827865360034e-05, "loss": 0.1309, "step": 1211 }, { "epoch": 0.4647239263803681, "grad_norm": 1.2238229513168335, "learning_rate": 2.9910524073285047e-05, "loss": 0.0049, "step": 1212 }, { "epoch": 0.4651073619631902, "grad_norm": 2.1265645027160645, "learning_rate": 2.9889220281210056e-05, "loss": 0.0101, "step": 1213 }, { "epoch": 0.46549079754601225, "grad_norm": 2.475086212158203, "learning_rate": 2.986791648913507e-05, "loss": 0.0075, "step": 1214 }, { "epoch": 0.46587423312883436, "grad_norm": 1.0121549367904663, "learning_rate": 2.9846612697060078e-05, "loss": 0.0053, "step": 1215 }, { "epoch": 0.4662576687116564, "grad_norm": 51.59149169921875, "learning_rate": 2.9825308904985087e-05, "loss": 1.2022, "step": 1216 }, { "epoch": 0.4666411042944785, "grad_norm": 50.36254119873047, "learning_rate": 2.98040051129101e-05, "loss": 0.7691, "step": 1217 }, { "epoch": 0.46702453987730064, "grad_norm": 1.9499720335006714, "learning_rate": 2.978270132083511e-05, "loss": 0.0092, "step": 1218 }, { "epoch": 0.4674079754601227, "grad_norm": 105.05870056152344, "learning_rate": 2.9761397528760122e-05, "loss": 1.1055, "step": 1219 }, { "epoch": 0.4677914110429448, "grad_norm": 0.6851003766059875, "learning_rate": 2.974009373668513e-05, "loss": 0.0033, "step": 1220 }, { "epoch": 0.46817484662576686, "grad_norm": 9.881448745727539, "learning_rate": 
2.9718789944610144e-05, "loss": 0.0343, "step": 1221 }, { "epoch": 0.46855828220858897, "grad_norm": 0.9985806941986084, "learning_rate": 2.9697486152535153e-05, "loss": 0.006, "step": 1222 }, { "epoch": 0.46894171779141103, "grad_norm": 10.051969528198242, "learning_rate": 2.9676182360460163e-05, "loss": 0.0237, "step": 1223 }, { "epoch": 0.46932515337423314, "grad_norm": 1.2723650932312012, "learning_rate": 2.9654878568385175e-05, "loss": 0.0057, "step": 1224 }, { "epoch": 0.4697085889570552, "grad_norm": 1.2021504640579224, "learning_rate": 2.9633574776310185e-05, "loss": 0.0051, "step": 1225 }, { "epoch": 0.4700920245398773, "grad_norm": 6.088982105255127, "learning_rate": 2.9612270984235197e-05, "loss": 0.0288, "step": 1226 }, { "epoch": 0.47047546012269936, "grad_norm": 1.1847741603851318, "learning_rate": 2.9590967192160207e-05, "loss": 0.0046, "step": 1227 }, { "epoch": 0.4708588957055215, "grad_norm": 1.1589510440826416, "learning_rate": 2.956966340008522e-05, "loss": 0.0042, "step": 1228 }, { "epoch": 0.4712423312883436, "grad_norm": 4.910191059112549, "learning_rate": 2.954835960801023e-05, "loss": 0.0179, "step": 1229 }, { "epoch": 0.47162576687116564, "grad_norm": 131.7100372314453, "learning_rate": 2.9527055815935238e-05, "loss": 5.6689, "step": 1230 }, { "epoch": 0.47200920245398775, "grad_norm": 48.22613525390625, "learning_rate": 2.950575202386025e-05, "loss": 0.585, "step": 1231 }, { "epoch": 0.4723926380368098, "grad_norm": 3.0125133991241455, "learning_rate": 2.948444823178526e-05, "loss": 0.0114, "step": 1232 }, { "epoch": 0.4727760736196319, "grad_norm": 109.76148986816406, "learning_rate": 2.9463144439710273e-05, "loss": 1.4414, "step": 1233 }, { "epoch": 0.473159509202454, "grad_norm": 4.895443439483643, "learning_rate": 2.9441840647635282e-05, "loss": 0.0181, "step": 1234 }, { "epoch": 0.4735429447852761, "grad_norm": 3.6181979179382324, "learning_rate": 2.9420536855560295e-05, "loss": 0.0222, "step": 1235 }, { "epoch": 0.47392638036809814, 
"grad_norm": 55.43559646606445, "learning_rate": 2.9399233063485304e-05, "loss": 1.0049, "step": 1236 }, { "epoch": 0.47430981595092025, "grad_norm": 12.764467239379883, "learning_rate": 2.9377929271410313e-05, "loss": 0.069, "step": 1237 }, { "epoch": 0.4746932515337423, "grad_norm": 5.7123942375183105, "learning_rate": 2.9356625479335326e-05, "loss": 0.0184, "step": 1238 }, { "epoch": 0.4750766871165644, "grad_norm": 11.40140438079834, "learning_rate": 2.9335321687260335e-05, "loss": 0.0416, "step": 1239 }, { "epoch": 0.4754601226993865, "grad_norm": 0.8403089642524719, "learning_rate": 2.931401789518534e-05, "loss": 0.0034, "step": 1240 }, { "epoch": 0.4758435582822086, "grad_norm": 3.561800479888916, "learning_rate": 2.9292714103110354e-05, "loss": 0.0037, "step": 1241 }, { "epoch": 0.4762269938650307, "grad_norm": 136.95355224609375, "learning_rate": 2.9271410311035363e-05, "loss": 2.2112, "step": 1242 }, { "epoch": 0.47661042944785276, "grad_norm": 0.960547685623169, "learning_rate": 2.9250106518960373e-05, "loss": 0.0028, "step": 1243 }, { "epoch": 0.47699386503067487, "grad_norm": 6.117381572723389, "learning_rate": 2.9228802726885385e-05, "loss": 0.0469, "step": 1244 }, { "epoch": 0.4773773006134969, "grad_norm": 19.996009826660156, "learning_rate": 2.9207498934810395e-05, "loss": 0.6602, "step": 1245 }, { "epoch": 0.47776073619631904, "grad_norm": 9.0525541305542, "learning_rate": 2.9186195142735407e-05, "loss": 0.0484, "step": 1246 }, { "epoch": 0.4781441717791411, "grad_norm": 9.098682403564453, "learning_rate": 2.9164891350660417e-05, "loss": 0.0613, "step": 1247 }, { "epoch": 0.4785276073619632, "grad_norm": 192.00765991210938, "learning_rate": 2.914358755858543e-05, "loss": 1.1309, "step": 1248 }, { "epoch": 0.47891104294478526, "grad_norm": 1.7598403692245483, "learning_rate": 2.912228376651044e-05, "loss": 0.0053, "step": 1249 }, { "epoch": 0.47929447852760737, "grad_norm": 84.8008041381836, "learning_rate": 2.9100979974435448e-05, "loss": 1.0332, 
"step": 1250 }, { "epoch": 0.4796779141104294, "grad_norm": 123.36064910888672, "learning_rate": 2.907967618236046e-05, "loss": 2.0742, "step": 1251 }, { "epoch": 0.48006134969325154, "grad_norm": 0.9011976718902588, "learning_rate": 2.905837239028547e-05, "loss": 0.0026, "step": 1252 }, { "epoch": 0.4804447852760736, "grad_norm": 95.64689636230469, "learning_rate": 2.9037068598210483e-05, "loss": 1.8077, "step": 1253 }, { "epoch": 0.4808282208588957, "grad_norm": 2.4204602241516113, "learning_rate": 2.9015764806135492e-05, "loss": 0.0072, "step": 1254 }, { "epoch": 0.4812116564417178, "grad_norm": 4.134298324584961, "learning_rate": 2.8994461014060505e-05, "loss": 0.022, "step": 1255 }, { "epoch": 0.4815950920245399, "grad_norm": 139.32273864746094, "learning_rate": 2.8973157221985514e-05, "loss": 6.0684, "step": 1256 }, { "epoch": 0.481978527607362, "grad_norm": 597.2622680664062, "learning_rate": 2.8951853429910523e-05, "loss": 2.1761, "step": 1257 }, { "epoch": 0.48236196319018404, "grad_norm": 5.3948163986206055, "learning_rate": 2.8930549637835536e-05, "loss": 0.022, "step": 1258 }, { "epoch": 0.48274539877300615, "grad_norm": 0.4378185272216797, "learning_rate": 2.8909245845760545e-05, "loss": 0.0024, "step": 1259 }, { "epoch": 0.4831288343558282, "grad_norm": 2.58599591255188, "learning_rate": 2.8887942053685558e-05, "loss": 0.0067, "step": 1260 }, { "epoch": 0.4835122699386503, "grad_norm": 7.638411998748779, "learning_rate": 2.8866638261610567e-05, "loss": 0.0158, "step": 1261 }, { "epoch": 0.4838957055214724, "grad_norm": 52.83462142944336, "learning_rate": 2.884533446953558e-05, "loss": 0.7105, "step": 1262 }, { "epoch": 0.4842791411042945, "grad_norm": 68.8130874633789, "learning_rate": 2.882403067746059e-05, "loss": 1.1182, "step": 1263 }, { "epoch": 0.48466257668711654, "grad_norm": 1.7231501340866089, "learning_rate": 2.88027268853856e-05, "loss": 0.0049, "step": 1264 }, { "epoch": 0.48504601226993865, "grad_norm": 1.7606278657913208, 
"learning_rate": 2.878142309331061e-05, "loss": 0.0068, "step": 1265 }, { "epoch": 0.4854294478527607, "grad_norm": 0.5795959830284119, "learning_rate": 2.876011930123562e-05, "loss": 0.0031, "step": 1266 }, { "epoch": 0.4858128834355828, "grad_norm": 0.928696870803833, "learning_rate": 2.8738815509160633e-05, "loss": 0.0055, "step": 1267 }, { "epoch": 0.48619631901840493, "grad_norm": 2.9062998294830322, "learning_rate": 2.8717511717085643e-05, "loss": 0.0081, "step": 1268 }, { "epoch": 0.486579754601227, "grad_norm": 0.6792395114898682, "learning_rate": 2.8696207925010655e-05, "loss": 0.0034, "step": 1269 }, { "epoch": 0.4869631901840491, "grad_norm": 89.9220962524414, "learning_rate": 2.8674904132935665e-05, "loss": 1.0928, "step": 1270 }, { "epoch": 0.48734662576687116, "grad_norm": 47.2714958190918, "learning_rate": 2.8653600340860677e-05, "loss": 0.939, "step": 1271 }, { "epoch": 0.48773006134969327, "grad_norm": 175.17723083496094, "learning_rate": 2.8632296548785687e-05, "loss": 5.8057, "step": 1272 }, { "epoch": 0.4881134969325153, "grad_norm": 47.968807220458984, "learning_rate": 2.8610992756710696e-05, "loss": 0.7549, "step": 1273 }, { "epoch": 0.48849693251533743, "grad_norm": 0.32888033986091614, "learning_rate": 2.858968896463571e-05, "loss": 0.0023, "step": 1274 }, { "epoch": 0.4888803680981595, "grad_norm": 3.0864601135253906, "learning_rate": 2.8568385172560718e-05, "loss": 0.0145, "step": 1275 }, { "epoch": 0.4892638036809816, "grad_norm": 1.845633864402771, "learning_rate": 2.854708138048573e-05, "loss": 0.005, "step": 1276 }, { "epoch": 0.48964723926380366, "grad_norm": 2.848792552947998, "learning_rate": 2.852577758841074e-05, "loss": 0.0059, "step": 1277 }, { "epoch": 0.49003067484662577, "grad_norm": 9.916733741760254, "learning_rate": 2.8504473796335753e-05, "loss": 0.0439, "step": 1278 }, { "epoch": 0.4904141104294479, "grad_norm": 22.210588455200195, "learning_rate": 2.8483170004260762e-05, "loss": 0.5889, "step": 1279 }, { "epoch": 
0.49079754601226994, "grad_norm": 21.33092498779297, "learning_rate": 2.846186621218577e-05, "loss": 0.5547, "step": 1280 }, { "epoch": 0.49118098159509205, "grad_norm": 0.24328042566776276, "learning_rate": 2.8440562420110784e-05, "loss": 0.0028, "step": 1281 }, { "epoch": 0.4915644171779141, "grad_norm": 11.243773460388184, "learning_rate": 2.8419258628035793e-05, "loss": 0.1562, "step": 1282 }, { "epoch": 0.4919478527607362, "grad_norm": 8.533361434936523, "learning_rate": 2.8397954835960806e-05, "loss": 0.4775, "step": 1283 }, { "epoch": 0.49233128834355827, "grad_norm": 41.438018798828125, "learning_rate": 2.8376651043885815e-05, "loss": 0.8608, "step": 1284 }, { "epoch": 0.4927147239263804, "grad_norm": 100.1050796508789, "learning_rate": 2.835534725181082e-05, "loss": 1.001, "step": 1285 }, { "epoch": 0.49309815950920244, "grad_norm": 46.85498046875, "learning_rate": 2.833404345973583e-05, "loss": 0.5293, "step": 1286 }, { "epoch": 0.49348159509202455, "grad_norm": 1.5562477111816406, "learning_rate": 2.8312739667660843e-05, "loss": 0.006, "step": 1287 }, { "epoch": 0.4938650306748466, "grad_norm": 215.5096893310547, "learning_rate": 2.8291435875585853e-05, "loss": 4.0902, "step": 1288 }, { "epoch": 0.4942484662576687, "grad_norm": 2.6404812335968018, "learning_rate": 2.8270132083510865e-05, "loss": 0.0129, "step": 1289 }, { "epoch": 0.4946319018404908, "grad_norm": 86.96484375, "learning_rate": 2.8248828291435875e-05, "loss": 1.3555, "step": 1290 }, { "epoch": 0.4950153374233129, "grad_norm": 10.088623046875, "learning_rate": 2.8227524499360887e-05, "loss": 0.2534, "step": 1291 }, { "epoch": 0.495398773006135, "grad_norm": 87.9999008178711, "learning_rate": 2.8206220707285897e-05, "loss": 1.25, "step": 1292 }, { "epoch": 0.49578220858895705, "grad_norm": 0.6408212184906006, "learning_rate": 2.8184916915210906e-05, "loss": 0.0028, "step": 1293 }, { "epoch": 0.49616564417177916, "grad_norm": 0.568196177482605, "learning_rate": 2.816361312313592e-05, "loss": 
0.0036, "step": 1294 }, { "epoch": 0.4965490797546012, "grad_norm": 21.775447845458984, "learning_rate": 2.8142309331060928e-05, "loss": 0.6846, "step": 1295 }, { "epoch": 0.49693251533742333, "grad_norm": 2.9825246334075928, "learning_rate": 2.812100553898594e-05, "loss": 0.0079, "step": 1296 }, { "epoch": 0.4973159509202454, "grad_norm": 1.3889049291610718, "learning_rate": 2.809970174691095e-05, "loss": 0.0087, "step": 1297 }, { "epoch": 0.4976993865030675, "grad_norm": 0.6602429151535034, "learning_rate": 2.8078397954835963e-05, "loss": 0.0024, "step": 1298 }, { "epoch": 0.49808282208588955, "grad_norm": 84.68866729736328, "learning_rate": 2.8057094162760972e-05, "loss": 0.8604, "step": 1299 }, { "epoch": 0.49846625766871167, "grad_norm": 1.651374340057373, "learning_rate": 2.803579037068598e-05, "loss": 0.0067, "step": 1300 }, { "epoch": 0.4988496932515337, "grad_norm": 15.142035484313965, "learning_rate": 2.8014486578610994e-05, "loss": 0.5489, "step": 1301 }, { "epoch": 0.49923312883435583, "grad_norm": 31.190401077270508, "learning_rate": 2.7993182786536003e-05, "loss": 0.1139, "step": 1302 }, { "epoch": 0.4996165644171779, "grad_norm": 4.73783540725708, "learning_rate": 2.7971878994461016e-05, "loss": 0.02, "step": 1303 }, { "epoch": 0.5, "grad_norm": 100.45794677734375, "learning_rate": 2.7950575202386025e-05, "loss": 1.3066, "step": 1304 }, { "epoch": 0.5003834355828221, "grad_norm": 6.176492214202881, "learning_rate": 2.7929271410311038e-05, "loss": 0.4216, "step": 1305 }, { "epoch": 0.5007668711656442, "grad_norm": 242.9879608154297, "learning_rate": 2.7907967618236047e-05, "loss": 2.3107, "step": 1306 }, { "epoch": 0.5011503067484663, "grad_norm": 105.03369903564453, "learning_rate": 2.7886663826161057e-05, "loss": 1.8819, "step": 1307 }, { "epoch": 0.5015337423312883, "grad_norm": 19.288230895996094, "learning_rate": 2.786536003408607e-05, "loss": 0.0878, "step": 1308 }, { "epoch": 0.5019171779141104, "grad_norm": 149.70127868652344, "learning_rate": 
2.784405624201108e-05, "loss": 6.0527, "step": 1309 }, { "epoch": 0.5023006134969326, "grad_norm": 105.20108032226562, "learning_rate": 2.782275244993609e-05, "loss": 1.4551, "step": 1310 }, { "epoch": 0.5026840490797546, "grad_norm": 17.483646392822266, "learning_rate": 2.78014486578611e-05, "loss": 0.0961, "step": 1311 }, { "epoch": 0.5030674846625767, "grad_norm": 12.938019752502441, "learning_rate": 2.7780144865786113e-05, "loss": 0.4529, "step": 1312 }, { "epoch": 0.5034509202453987, "grad_norm": 19.03183364868164, "learning_rate": 2.7758841073711123e-05, "loss": 0.0877, "step": 1313 }, { "epoch": 0.5038343558282209, "grad_norm": 95.82111358642578, "learning_rate": 2.7737537281636132e-05, "loss": 1.2188, "step": 1314 }, { "epoch": 0.504217791411043, "grad_norm": 0.7974203824996948, "learning_rate": 2.7716233489561145e-05, "loss": 0.0022, "step": 1315 }, { "epoch": 0.504601226993865, "grad_norm": 21.486732482910156, "learning_rate": 2.7694929697486154e-05, "loss": 0.542, "step": 1316 }, { "epoch": 0.5049846625766872, "grad_norm": 12.884425163269043, "learning_rate": 2.7673625905411167e-05, "loss": 0.1489, "step": 1317 }, { "epoch": 0.5053680981595092, "grad_norm": 367.4250183105469, "learning_rate": 2.7652322113336176e-05, "loss": 1.3174, "step": 1318 }, { "epoch": 0.5057515337423313, "grad_norm": 6.886001110076904, "learning_rate": 2.763101832126119e-05, "loss": 0.0208, "step": 1319 }, { "epoch": 0.5061349693251533, "grad_norm": 4.112507343292236, "learning_rate": 2.7609714529186198e-05, "loss": 0.0295, "step": 1320 }, { "epoch": 0.5065184049079755, "grad_norm": 8.974945068359375, "learning_rate": 2.7588410737111207e-05, "loss": 0.0565, "step": 1321 }, { "epoch": 0.5069018404907976, "grad_norm": 5.564464092254639, "learning_rate": 2.756710694503622e-05, "loss": 0.4058, "step": 1322 }, { "epoch": 0.5072852760736196, "grad_norm": 8.100415229797363, "learning_rate": 2.754580315296123e-05, "loss": 0.4504, "step": 1323 }, { "epoch": 0.5076687116564417, "grad_norm": 
0.20162974298000336, "learning_rate": 2.7524499360886242e-05, "loss": 0.0014, "step": 1324 }, { "epoch": 0.5080521472392638, "grad_norm": 4.389342784881592, "learning_rate": 2.750319556881125e-05, "loss": 0.012, "step": 1325 }, { "epoch": 0.5084355828220859, "grad_norm": 1.5896692276000977, "learning_rate": 2.7481891776736264e-05, "loss": 0.0022, "step": 1326 }, { "epoch": 0.508819018404908, "grad_norm": 100.52224731445312, "learning_rate": 2.7460587984661273e-05, "loss": 1.8311, "step": 1327 }, { "epoch": 0.50920245398773, "grad_norm": 3.928161382675171, "learning_rate": 2.7439284192586283e-05, "loss": 0.0204, "step": 1328 }, { "epoch": 0.5095858895705522, "grad_norm": 13.133651733398438, "learning_rate": 2.7417980400511295e-05, "loss": 0.0135, "step": 1329 }, { "epoch": 0.5099693251533742, "grad_norm": 32.4322624206543, "learning_rate": 2.73966766084363e-05, "loss": 0.5806, "step": 1330 }, { "epoch": 0.5103527607361963, "grad_norm": 0.5560774207115173, "learning_rate": 2.737537281636131e-05, "loss": 0.0034, "step": 1331 }, { "epoch": 0.5107361963190185, "grad_norm": 275.88140869140625, "learning_rate": 2.7354069024286323e-05, "loss": 5.6875, "step": 1332 }, { "epoch": 0.5111196319018405, "grad_norm": 10.551005363464355, "learning_rate": 2.7332765232211333e-05, "loss": 0.071, "step": 1333 }, { "epoch": 0.5115030674846626, "grad_norm": 128.31874084472656, "learning_rate": 2.7311461440136342e-05, "loss": 5.5439, "step": 1334 }, { "epoch": 0.5118865030674846, "grad_norm": 168.2147979736328, "learning_rate": 2.7290157648061355e-05, "loss": 1.3653, "step": 1335 }, { "epoch": 0.5122699386503068, "grad_norm": 3.8666961193084717, "learning_rate": 2.7268853855986364e-05, "loss": 0.0106, "step": 1336 }, { "epoch": 0.5126533742331288, "grad_norm": 1.494707703590393, "learning_rate": 2.7247550063911377e-05, "loss": 0.0083, "step": 1337 }, { "epoch": 0.5130368098159509, "grad_norm": 20.960519790649414, "learning_rate": 2.7226246271836386e-05, "loss": 0.4636, "step": 1338 }, { 
"epoch": 0.513420245398773, "grad_norm": 165.2379913330078, "learning_rate": 2.72049424797614e-05, "loss": 5.5386, "step": 1339 }, { "epoch": 0.5138036809815951, "grad_norm": 2.861499071121216, "learning_rate": 2.7183638687686408e-05, "loss": 0.0108, "step": 1340 }, { "epoch": 0.5141871165644172, "grad_norm": 23.677654266357422, "learning_rate": 2.7162334895611417e-05, "loss": 0.1123, "step": 1341 }, { "epoch": 0.5145705521472392, "grad_norm": 118.72830963134766, "learning_rate": 2.714103110353643e-05, "loss": 1.2725, "step": 1342 }, { "epoch": 0.5149539877300614, "grad_norm": 0.7612152099609375, "learning_rate": 2.711972731146144e-05, "loss": 0.005, "step": 1343 }, { "epoch": 0.5153374233128835, "grad_norm": 167.5882568359375, "learning_rate": 2.7098423519386452e-05, "loss": 2.1641, "step": 1344 }, { "epoch": 0.5157208588957055, "grad_norm": 5.0922956466674805, "learning_rate": 2.707711972731146e-05, "loss": 0.0371, "step": 1345 }, { "epoch": 0.5161042944785276, "grad_norm": 13.625322341918945, "learning_rate": 2.7055815935236474e-05, "loss": 0.5049, "step": 1346 }, { "epoch": 0.5164877300613497, "grad_norm": 1.7424520254135132, "learning_rate": 2.7034512143161483e-05, "loss": 0.005, "step": 1347 }, { "epoch": 0.5168711656441718, "grad_norm": 0.6367583274841309, "learning_rate": 2.7013208351086493e-05, "loss": 0.0036, "step": 1348 }, { "epoch": 0.5172546012269938, "grad_norm": 3.3440346717834473, "learning_rate": 2.6991904559011505e-05, "loss": 0.0164, "step": 1349 }, { "epoch": 0.5176380368098159, "grad_norm": 131.81895446777344, "learning_rate": 2.6970600766936515e-05, "loss": 6.2031, "step": 1350 }, { "epoch": 0.5180214723926381, "grad_norm": 32.145870208740234, "learning_rate": 2.6949296974861527e-05, "loss": 0.8281, "step": 1351 }, { "epoch": 0.5184049079754601, "grad_norm": 15.580379486083984, "learning_rate": 2.6927993182786537e-05, "loss": 0.1283, "step": 1352 }, { "epoch": 0.5187883435582822, "grad_norm": 8.98824405670166, "learning_rate": 
2.690668939071155e-05, "loss": 0.1846, "step": 1353 }, { "epoch": 0.5191717791411042, "grad_norm": 18.761882781982422, "learning_rate": 2.688538559863656e-05, "loss": 0.5318, "step": 1354 }, { "epoch": 0.5195552147239264, "grad_norm": 7.901307106018066, "learning_rate": 2.686408180656157e-05, "loss": 0.4761, "step": 1355 }, { "epoch": 0.5199386503067485, "grad_norm": 183.93434143066406, "learning_rate": 2.684277801448658e-05, "loss": 1.3828, "step": 1356 }, { "epoch": 0.5203220858895705, "grad_norm": 32.8038444519043, "learning_rate": 2.682147422241159e-05, "loss": 0.6572, "step": 1357 }, { "epoch": 0.5207055214723927, "grad_norm": 3.186523675918579, "learning_rate": 2.6800170430336603e-05, "loss": 0.0134, "step": 1358 }, { "epoch": 0.5210889570552147, "grad_norm": 1.7899644374847412, "learning_rate": 2.6778866638261612e-05, "loss": 0.0081, "step": 1359 }, { "epoch": 0.5214723926380368, "grad_norm": 6.219476699829102, "learning_rate": 2.6757562846186625e-05, "loss": 0.0469, "step": 1360 }, { "epoch": 0.5218558282208589, "grad_norm": 3.5063388347625732, "learning_rate": 2.6736259054111634e-05, "loss": 0.3955, "step": 1361 }, { "epoch": 0.522239263803681, "grad_norm": 58.13272476196289, "learning_rate": 2.6714955262036647e-05, "loss": 0.7842, "step": 1362 }, { "epoch": 0.5226226993865031, "grad_norm": 216.25241088867188, "learning_rate": 2.6693651469961656e-05, "loss": 1.1358, "step": 1363 }, { "epoch": 0.5230061349693251, "grad_norm": 0.674191415309906, "learning_rate": 2.6672347677886665e-05, "loss": 0.0041, "step": 1364 }, { "epoch": 0.5233895705521472, "grad_norm": 36.996612548828125, "learning_rate": 2.6651043885811678e-05, "loss": 0.6734, "step": 1365 }, { "epoch": 0.5237730061349694, "grad_norm": 0.2353329062461853, "learning_rate": 2.6629740093736687e-05, "loss": 0.0021, "step": 1366 }, { "epoch": 0.5241564417177914, "grad_norm": 19.37664031982422, "learning_rate": 2.66084363016617e-05, "loss": 0.1535, "step": 1367 }, { "epoch": 0.5245398773006135, 
"grad_norm": 573.6376953125, "learning_rate": 2.658713250958671e-05, "loss": 1.7295, "step": 1368 }, { "epoch": 0.5249233128834356, "grad_norm": 153.51812744140625, "learning_rate": 2.6565828717511722e-05, "loss": 4.3696, "step": 1369 }, { "epoch": 0.5253067484662577, "grad_norm": 0.6202207803726196, "learning_rate": 2.654452492543673e-05, "loss": 0.003, "step": 1370 }, { "epoch": 0.5256901840490797, "grad_norm": 121.04489135742188, "learning_rate": 2.652322113336174e-05, "loss": 1.6329, "step": 1371 }, { "epoch": 0.5260736196319018, "grad_norm": 2.2874927520751953, "learning_rate": 2.6501917341286753e-05, "loss": 0.0081, "step": 1372 }, { "epoch": 0.526457055214724, "grad_norm": 0.22468091547489166, "learning_rate": 2.6480613549211763e-05, "loss": 0.0013, "step": 1373 }, { "epoch": 0.526840490797546, "grad_norm": 2.073918342590332, "learning_rate": 2.6459309757136775e-05, "loss": 0.0083, "step": 1374 }, { "epoch": 0.5272239263803681, "grad_norm": 5.445145130157471, "learning_rate": 2.6438005965061778e-05, "loss": 0.0729, "step": 1375 }, { "epoch": 0.5276073619631901, "grad_norm": 53.28449249267578, "learning_rate": 2.641670217298679e-05, "loss": 0.8887, "step": 1376 }, { "epoch": 0.5279907975460123, "grad_norm": 135.62582397460938, "learning_rate": 2.63953983809118e-05, "loss": 2.7295, "step": 1377 }, { "epoch": 0.5283742331288344, "grad_norm": 1.8507696390151978, "learning_rate": 2.6374094588836813e-05, "loss": 0.0066, "step": 1378 }, { "epoch": 0.5287576687116564, "grad_norm": 3.9919838905334473, "learning_rate": 2.6352790796761822e-05, "loss": 0.03, "step": 1379 }, { "epoch": 0.5291411042944786, "grad_norm": 10.276567459106445, "learning_rate": 2.6331487004686835e-05, "loss": 0.4624, "step": 1380 }, { "epoch": 0.5295245398773006, "grad_norm": 16.75113296508789, "learning_rate": 2.6310183212611844e-05, "loss": 0.1008, "step": 1381 }, { "epoch": 0.5299079754601227, "grad_norm": 5.067356109619141, "learning_rate": 2.6288879420536857e-05, "loss": 0.0235, "step": 
1382 }, { "epoch": 0.5302914110429447, "grad_norm": 1.2853585481643677, "learning_rate": 2.6267575628461866e-05, "loss": 0.0039, "step": 1383 }, { "epoch": 0.5306748466257669, "grad_norm": 60.08701705932617, "learning_rate": 2.6246271836386875e-05, "loss": 0.8955, "step": 1384 }, { "epoch": 0.531058282208589, "grad_norm": 1.2853145599365234, "learning_rate": 2.6224968044311888e-05, "loss": 0.0051, "step": 1385 }, { "epoch": 0.531441717791411, "grad_norm": 63.4505615234375, "learning_rate": 2.6203664252236897e-05, "loss": 0.8331, "step": 1386 }, { "epoch": 0.5318251533742331, "grad_norm": 14.749077796936035, "learning_rate": 2.618236046016191e-05, "loss": 0.4832, "step": 1387 }, { "epoch": 0.5322085889570553, "grad_norm": 13.824406623840332, "learning_rate": 2.616105666808692e-05, "loss": 0.0809, "step": 1388 }, { "epoch": 0.5325920245398773, "grad_norm": 2.1554923057556152, "learning_rate": 2.6139752876011932e-05, "loss": 0.0092, "step": 1389 }, { "epoch": 0.5329754601226994, "grad_norm": 78.30574798583984, "learning_rate": 2.611844908393694e-05, "loss": 1.3371, "step": 1390 }, { "epoch": 0.5333588957055214, "grad_norm": 3.4257280826568604, "learning_rate": 2.609714529186195e-05, "loss": 0.0098, "step": 1391 }, { "epoch": 0.5337423312883436, "grad_norm": 16.018634796142578, "learning_rate": 2.6075841499786963e-05, "loss": 0.1732, "step": 1392 }, { "epoch": 0.5341257668711656, "grad_norm": 119.55533599853516, "learning_rate": 2.6054537707711973e-05, "loss": 0.752, "step": 1393 }, { "epoch": 0.5345092024539877, "grad_norm": 2.6413443088531494, "learning_rate": 2.6033233915636985e-05, "loss": 0.0117, "step": 1394 }, { "epoch": 0.5348926380368099, "grad_norm": 0.46682795882225037, "learning_rate": 2.6011930123561995e-05, "loss": 0.0039, "step": 1395 }, { "epoch": 0.5352760736196319, "grad_norm": 79.32655334472656, "learning_rate": 2.5990626331487007e-05, "loss": 0.8901, "step": 1396 }, { "epoch": 0.535659509202454, "grad_norm": 107.98719024658203, "learning_rate": 
2.5969322539412017e-05, "loss": 1.2768, "step": 1397 }, { "epoch": 0.536042944785276, "grad_norm": 83.03223419189453, "learning_rate": 2.5948018747337026e-05, "loss": 1.0703, "step": 1398 }, { "epoch": 0.5364263803680982, "grad_norm": 2.588785171508789, "learning_rate": 2.592671495526204e-05, "loss": 0.0098, "step": 1399 }, { "epoch": 0.5368098159509203, "grad_norm": 0.832665741443634, "learning_rate": 2.5905411163187048e-05, "loss": 0.0029, "step": 1400 }, { "epoch": 0.5371932515337423, "grad_norm": 8.87395191192627, "learning_rate": 2.588410737111206e-05, "loss": 0.0303, "step": 1401 }, { "epoch": 0.5375766871165644, "grad_norm": 97.84419250488281, "learning_rate": 2.586280357903707e-05, "loss": 1.2237, "step": 1402 }, { "epoch": 0.5379601226993865, "grad_norm": 0.29744938015937805, "learning_rate": 2.5841499786962083e-05, "loss": 0.0024, "step": 1403 }, { "epoch": 0.5383435582822086, "grad_norm": 0.6852608919143677, "learning_rate": 2.5820195994887092e-05, "loss": 0.002, "step": 1404 }, { "epoch": 0.5387269938650306, "grad_norm": 53.20299530029297, "learning_rate": 2.57988922028121e-05, "loss": 0.9683, "step": 1405 }, { "epoch": 0.5391104294478528, "grad_norm": 2.4528932571411133, "learning_rate": 2.5777588410737114e-05, "loss": 0.0096, "step": 1406 }, { "epoch": 0.5394938650306749, "grad_norm": 0.36206158995628357, "learning_rate": 2.5756284618662123e-05, "loss": 0.0019, "step": 1407 }, { "epoch": 0.5398773006134969, "grad_norm": 0.45718395709991455, "learning_rate": 2.5734980826587136e-05, "loss": 0.0025, "step": 1408 }, { "epoch": 0.540260736196319, "grad_norm": 83.47513580322266, "learning_rate": 2.5713677034512145e-05, "loss": 6.1074, "step": 1409 }, { "epoch": 0.5406441717791411, "grad_norm": 199.39964294433594, "learning_rate": 2.5692373242437158e-05, "loss": 3.4777, "step": 1410 }, { "epoch": 0.5410276073619632, "grad_norm": 51.30986785888672, "learning_rate": 2.5671069450362167e-05, "loss": 0.8536, "step": 1411 }, { "epoch": 0.5414110429447853, 
"grad_norm": 7.736695766448975, "learning_rate": 2.5649765658287177e-05, "loss": 0.4246, "step": 1412 }, { "epoch": 0.5417944785276073, "grad_norm": 4.35651159286499, "learning_rate": 2.562846186621219e-05, "loss": 0.0247, "step": 1413 }, { "epoch": 0.5421779141104295, "grad_norm": 8.150458335876465, "learning_rate": 2.56071580741372e-05, "loss": 0.0403, "step": 1414 }, { "epoch": 0.5425613496932515, "grad_norm": 1.985188603401184, "learning_rate": 2.558585428206221e-05, "loss": 0.0082, "step": 1415 }, { "epoch": 0.5429447852760736, "grad_norm": 2.916771411895752, "learning_rate": 2.556455048998722e-05, "loss": 0.0088, "step": 1416 }, { "epoch": 0.5433282208588958, "grad_norm": 113.31708526611328, "learning_rate": 2.5543246697912233e-05, "loss": 1.5412, "step": 1417 }, { "epoch": 0.5437116564417178, "grad_norm": 138.85861206054688, "learning_rate": 2.5521942905837243e-05, "loss": 1.7139, "step": 1418 }, { "epoch": 0.5440950920245399, "grad_norm": 8.595303535461426, "learning_rate": 2.5500639113762252e-05, "loss": 0.0212, "step": 1419 }, { "epoch": 0.5444785276073619, "grad_norm": 245.66258239746094, "learning_rate": 2.5479335321687258e-05, "loss": 1.1719, "step": 1420 }, { "epoch": 0.5448619631901841, "grad_norm": 20.04214859008789, "learning_rate": 2.545803152961227e-05, "loss": 0.1265, "step": 1421 }, { "epoch": 0.5452453987730062, "grad_norm": 8.85911750793457, "learning_rate": 2.543672773753728e-05, "loss": 0.4402, "step": 1422 }, { "epoch": 0.5456288343558282, "grad_norm": 13.186134338378906, "learning_rate": 2.5415423945462293e-05, "loss": 0.0499, "step": 1423 }, { "epoch": 0.5460122699386503, "grad_norm": 55.921443939208984, "learning_rate": 2.5394120153387302e-05, "loss": 0.8443, "step": 1424 }, { "epoch": 0.5463957055214724, "grad_norm": 0.4546794295310974, "learning_rate": 2.537281636131231e-05, "loss": 0.0023, "step": 1425 }, { "epoch": 0.5467791411042945, "grad_norm": 288.71466064453125, "learning_rate": 2.5351512569237324e-05, "loss": 5.0381, "step": 
1426 }, { "epoch": 0.5471625766871165, "grad_norm": 128.4453582763672, "learning_rate": 2.5330208777162333e-05, "loss": 5.7852, "step": 1427 }, { "epoch": 0.5475460122699386, "grad_norm": 3.204813003540039, "learning_rate": 2.5308904985087346e-05, "loss": 0.0066, "step": 1428 }, { "epoch": 0.5479294478527608, "grad_norm": 93.51945495605469, "learning_rate": 2.5287601193012355e-05, "loss": 1.4854, "step": 1429 }, { "epoch": 0.5483128834355828, "grad_norm": 300.50006103515625, "learning_rate": 2.5266297400937368e-05, "loss": 5.624, "step": 1430 }, { "epoch": 0.5486963190184049, "grad_norm": 52.6341438293457, "learning_rate": 2.5244993608862377e-05, "loss": 0.7945, "step": 1431 }, { "epoch": 0.549079754601227, "grad_norm": 1.61876380443573, "learning_rate": 2.5223689816787386e-05, "loss": 0.0069, "step": 1432 }, { "epoch": 0.5494631901840491, "grad_norm": 157.72666931152344, "learning_rate": 2.52023860247124e-05, "loss": 5.9502, "step": 1433 }, { "epoch": 0.5498466257668712, "grad_norm": 9.354452133178711, "learning_rate": 2.518108223263741e-05, "loss": 0.0192, "step": 1434 }, { "epoch": 0.5502300613496932, "grad_norm": 0.2993928790092468, "learning_rate": 2.515977844056242e-05, "loss": 0.0025, "step": 1435 }, { "epoch": 0.5506134969325154, "grad_norm": 136.23651123046875, "learning_rate": 2.513847464848743e-05, "loss": 1.8184, "step": 1436 }, { "epoch": 0.5509969325153374, "grad_norm": 1.0809216499328613, "learning_rate": 2.5117170856412443e-05, "loss": 0.0021, "step": 1437 }, { "epoch": 0.5513803680981595, "grad_norm": 39.01406478881836, "learning_rate": 2.5095867064337453e-05, "loss": 0.5596, "step": 1438 }, { "epoch": 0.5517638036809815, "grad_norm": 0.9668958187103271, "learning_rate": 2.5074563272262462e-05, "loss": 0.0034, "step": 1439 }, { "epoch": 0.5521472392638037, "grad_norm": 0.5503262877464294, "learning_rate": 2.5053259480187475e-05, "loss": 0.0023, "step": 1440 }, { "epoch": 0.5525306748466258, "grad_norm": 2.4875385761260986, "learning_rate": 
2.5031955688112484e-05, "loss": 0.009, "step": 1441 }, { "epoch": 0.5529141104294478, "grad_norm": 0.723963737487793, "learning_rate": 2.5010651896037497e-05, "loss": 0.004, "step": 1442 }, { "epoch": 0.55329754601227, "grad_norm": 0.2790910005569458, "learning_rate": 2.4989348103962506e-05, "loss": 0.0018, "step": 1443 }, { "epoch": 0.553680981595092, "grad_norm": 1.9345526695251465, "learning_rate": 2.496804431188752e-05, "loss": 0.0034, "step": 1444 }, { "epoch": 0.5540644171779141, "grad_norm": 1.0085608959197998, "learning_rate": 2.4946740519812528e-05, "loss": 0.0036, "step": 1445 }, { "epoch": 0.5544478527607362, "grad_norm": 10.536890029907227, "learning_rate": 2.492543672773754e-05, "loss": 0.0603, "step": 1446 }, { "epoch": 0.5548312883435583, "grad_norm": 30.43031120300293, "learning_rate": 2.490413293566255e-05, "loss": 0.9595, "step": 1447 }, { "epoch": 0.5552147239263804, "grad_norm": 9.614569664001465, "learning_rate": 2.488282914358756e-05, "loss": 0.4419, "step": 1448 }, { "epoch": 0.5555981595092024, "grad_norm": 21.122318267822266, "learning_rate": 2.4861525351512572e-05, "loss": 0.5918, "step": 1449 }, { "epoch": 0.5559815950920245, "grad_norm": 174.86630249023438, "learning_rate": 2.484022155943758e-05, "loss": 5.9844, "step": 1450 }, { "epoch": 0.5563650306748467, "grad_norm": 123.41751861572266, "learning_rate": 2.4818917767362594e-05, "loss": 2.4866, "step": 1451 }, { "epoch": 0.5567484662576687, "grad_norm": 76.07979583740234, "learning_rate": 2.4797613975287603e-05, "loss": 0.8784, "step": 1452 }, { "epoch": 0.5571319018404908, "grad_norm": 20.474328994750977, "learning_rate": 2.4776310183212616e-05, "loss": 0.1867, "step": 1453 }, { "epoch": 0.5575153374233128, "grad_norm": 29.993284225463867, "learning_rate": 2.4755006391137622e-05, "loss": 0.5933, "step": 1454 }, { "epoch": 0.557898773006135, "grad_norm": 179.69223022460938, "learning_rate": 2.4733702599062635e-05, "loss": 5.0938, "step": 1455 }, { "epoch": 0.558282208588957, 
"grad_norm": 4.559651851654053, "learning_rate": 2.4712398806987644e-05, "loss": 0.0206, "step": 1456 }, { "epoch": 0.5586656441717791, "grad_norm": 56.0554084777832, "learning_rate": 2.4691095014912653e-05, "loss": 0.8277, "step": 1457 }, { "epoch": 0.5590490797546013, "grad_norm": 44.82016372680664, "learning_rate": 2.4669791222837666e-05, "loss": 0.9175, "step": 1458 }, { "epoch": 0.5594325153374233, "grad_norm": 22.089797973632812, "learning_rate": 2.4648487430762675e-05, "loss": 0.5376, "step": 1459 }, { "epoch": 0.5598159509202454, "grad_norm": 34.83670425415039, "learning_rate": 2.4627183638687688e-05, "loss": 0.5234, "step": 1460 }, { "epoch": 0.5601993865030674, "grad_norm": 63.4261589050293, "learning_rate": 2.4605879846612697e-05, "loss": 1.0977, "step": 1461 }, { "epoch": 0.5605828220858896, "grad_norm": 49.81941604614258, "learning_rate": 2.458457605453771e-05, "loss": 0.7837, "step": 1462 }, { "epoch": 0.5609662576687117, "grad_norm": 22.516889572143555, "learning_rate": 2.456327226246272e-05, "loss": 0.1992, "step": 1463 }, { "epoch": 0.5613496932515337, "grad_norm": 0.19163483381271362, "learning_rate": 2.454196847038773e-05, "loss": 0.0019, "step": 1464 }, { "epoch": 0.5617331288343558, "grad_norm": 0.478487104177475, "learning_rate": 2.452066467831274e-05, "loss": 0.0027, "step": 1465 }, { "epoch": 0.5621165644171779, "grad_norm": 0.2832421362400055, "learning_rate": 2.449936088623775e-05, "loss": 0.0017, "step": 1466 }, { "epoch": 0.5625, "grad_norm": 1.0812827348709106, "learning_rate": 2.4478057094162763e-05, "loss": 0.0055, "step": 1467 }, { "epoch": 0.5628834355828221, "grad_norm": 128.22760009765625, "learning_rate": 2.4456753302087773e-05, "loss": 2.5939, "step": 1468 }, { "epoch": 0.5632668711656442, "grad_norm": 87.38990020751953, "learning_rate": 2.4435449510012785e-05, "loss": 1.8282, "step": 1469 }, { "epoch": 0.5636503067484663, "grad_norm": 1.4836554527282715, "learning_rate": 2.4414145717937795e-05, "loss": 0.0054, "step": 1470 }, { 
"epoch": 0.5640337423312883, "grad_norm": 7.705040454864502, "learning_rate": 2.4392841925862804e-05, "loss": 0.0478, "step": 1471 }, { "epoch": 0.5644171779141104, "grad_norm": 50.90473556518555, "learning_rate": 2.4371538133787817e-05, "loss": 0.6255, "step": 1472 }, { "epoch": 0.5648006134969326, "grad_norm": 111.48176574707031, "learning_rate": 2.4350234341712826e-05, "loss": 2.2913, "step": 1473 }, { "epoch": 0.5651840490797546, "grad_norm": 140.29307556152344, "learning_rate": 2.432893054963784e-05, "loss": 3.7265, "step": 1474 }, { "epoch": 0.5655674846625767, "grad_norm": 1.5388623476028442, "learning_rate": 2.4307626757562848e-05, "loss": 0.006, "step": 1475 }, { "epoch": 0.5659509202453987, "grad_norm": 19.834774017333984, "learning_rate": 2.428632296548786e-05, "loss": 0.5942, "step": 1476 }, { "epoch": 0.5663343558282209, "grad_norm": 156.98837280273438, "learning_rate": 2.4265019173412866e-05, "loss": 6.3779, "step": 1477 }, { "epoch": 0.566717791411043, "grad_norm": 0.3030416965484619, "learning_rate": 2.424371538133788e-05, "loss": 0.0015, "step": 1478 }, { "epoch": 0.567101226993865, "grad_norm": 10.810173034667969, "learning_rate": 2.422241158926289e-05, "loss": 0.1487, "step": 1479 }, { "epoch": 0.5674846625766872, "grad_norm": 42.8617057800293, "learning_rate": 2.42011077971879e-05, "loss": 0.6021, "step": 1480 }, { "epoch": 0.5678680981595092, "grad_norm": 11.066485404968262, "learning_rate": 2.417980400511291e-05, "loss": 0.0384, "step": 1481 }, { "epoch": 0.5682515337423313, "grad_norm": 14.02347469329834, "learning_rate": 2.415850021303792e-05, "loss": 0.428, "step": 1482 }, { "epoch": 0.5686349693251533, "grad_norm": 105.50628662109375, "learning_rate": 2.4137196420962933e-05, "loss": 1.8076, "step": 1483 }, { "epoch": 0.5690184049079755, "grad_norm": 0.9760513305664062, "learning_rate": 2.4115892628887942e-05, "loss": 0.0036, "step": 1484 }, { "epoch": 0.5694018404907976, "grad_norm": 1.0313464403152466, "learning_rate": 
2.4094588836812955e-05, "loss": 0.0047, "step": 1485 }, { "epoch": 0.5697852760736196, "grad_norm": 16.453495025634766, "learning_rate": 2.4073285044737964e-05, "loss": 0.4883, "step": 1486 }, { "epoch": 0.5701687116564417, "grad_norm": 0.4924452304840088, "learning_rate": 2.4051981252662977e-05, "loss": 0.0016, "step": 1487 }, { "epoch": 0.5705521472392638, "grad_norm": 30.859689712524414, "learning_rate": 2.4030677460587986e-05, "loss": 0.6582, "step": 1488 }, { "epoch": 0.5709355828220859, "grad_norm": 0.24691864848136902, "learning_rate": 2.4009373668512995e-05, "loss": 0.0016, "step": 1489 }, { "epoch": 0.571319018404908, "grad_norm": 5.21722412109375, "learning_rate": 2.3988069876438008e-05, "loss": 0.0255, "step": 1490 }, { "epoch": 0.57170245398773, "grad_norm": 13.898541450500488, "learning_rate": 2.3966766084363017e-05, "loss": 0.4402, "step": 1491 }, { "epoch": 0.5720858895705522, "grad_norm": 11.03413200378418, "learning_rate": 2.394546229228803e-05, "loss": 0.1442, "step": 1492 }, { "epoch": 0.5724693251533742, "grad_norm": 131.2899169921875, "learning_rate": 2.392415850021304e-05, "loss": 1.1788, "step": 1493 }, { "epoch": 0.5728527607361963, "grad_norm": 0.2610747218132019, "learning_rate": 2.3902854708138052e-05, "loss": 0.0022, "step": 1494 }, { "epoch": 0.5732361963190185, "grad_norm": 85.44371032714844, "learning_rate": 2.388155091606306e-05, "loss": 0.7159, "step": 1495 }, { "epoch": 0.5736196319018405, "grad_norm": 99.33592224121094, "learning_rate": 2.386024712398807e-05, "loss": 1.1123, "step": 1496 }, { "epoch": 0.5740030674846626, "grad_norm": 6.815636157989502, "learning_rate": 2.3838943331913083e-05, "loss": 0.2409, "step": 1497 }, { "epoch": 0.5743865030674846, "grad_norm": 61.178688049316406, "learning_rate": 2.3817639539838093e-05, "loss": 0.7031, "step": 1498 }, { "epoch": 0.5747699386503068, "grad_norm": 2.902374744415283, "learning_rate": 2.3796335747763102e-05, "loss": 0.0147, "step": 1499 }, { "epoch": 0.5751533742331288, 
"grad_norm": 3.590909481048584, "learning_rate": 2.377503195568811e-05, "loss": 0.0159, "step": 1500 }, { "epoch": 0.5755368098159509, "grad_norm": 0.2199438512325287, "learning_rate": 2.3753728163613124e-05, "loss": 0.0016, "step": 1501 }, { "epoch": 0.575920245398773, "grad_norm": 39.01709747314453, "learning_rate": 2.3732424371538133e-05, "loss": 0.6152, "step": 1502 }, { "epoch": 0.5763036809815951, "grad_norm": 4.145489692687988, "learning_rate": 2.3711120579463146e-05, "loss": 0.3848, "step": 1503 }, { "epoch": 0.5766871165644172, "grad_norm": 1.525215744972229, "learning_rate": 2.3689816787388155e-05, "loss": 0.0019, "step": 1504 }, { "epoch": 0.5770705521472392, "grad_norm": 36.02094650268555, "learning_rate": 2.3668512995313168e-05, "loss": 0.6113, "step": 1505 }, { "epoch": 0.5774539877300614, "grad_norm": 56.19095993041992, "learning_rate": 2.3647209203238177e-05, "loss": 0.5518, "step": 1506 }, { "epoch": 0.5778374233128835, "grad_norm": 1.4983949661254883, "learning_rate": 2.3625905411163186e-05, "loss": 0.0049, "step": 1507 }, { "epoch": 0.5782208588957055, "grad_norm": 58.191192626953125, "learning_rate": 2.36046016190882e-05, "loss": 0.9834, "step": 1508 }, { "epoch": 0.5786042944785276, "grad_norm": 9.700529098510742, "learning_rate": 2.358329782701321e-05, "loss": 0.0304, "step": 1509 }, { "epoch": 0.5789877300613497, "grad_norm": 75.30133056640625, "learning_rate": 2.356199403493822e-05, "loss": 0.7432, "step": 1510 }, { "epoch": 0.5793711656441718, "grad_norm": 16.457170486450195, "learning_rate": 2.354069024286323e-05, "loss": 0.1741, "step": 1511 }, { "epoch": 0.5797546012269938, "grad_norm": 12.593084335327148, "learning_rate": 2.3519386450788243e-05, "loss": 0.0602, "step": 1512 }, { "epoch": 0.5801380368098159, "grad_norm": 2.239633083343506, "learning_rate": 2.3498082658713253e-05, "loss": 0.0063, "step": 1513 }, { "epoch": 0.5805214723926381, "grad_norm": 69.36461639404297, "learning_rate": 2.3476778866638262e-05, "loss": 0.7149, "step": 
1514 }, { "epoch": 0.5809049079754601, "grad_norm": 186.72915649414062, "learning_rate": 2.3455475074563275e-05, "loss": 6.3867, "step": 1515 }, { "epoch": 0.5812883435582822, "grad_norm": 205.23544311523438, "learning_rate": 2.3434171282488284e-05, "loss": 1.1739, "step": 1516 }, { "epoch": 0.5816717791411042, "grad_norm": 23.364177703857422, "learning_rate": 2.3412867490413297e-05, "loss": 0.071, "step": 1517 }, { "epoch": 0.5820552147239264, "grad_norm": 37.77939224243164, "learning_rate": 2.3391563698338306e-05, "loss": 0.7114, "step": 1518 }, { "epoch": 0.5824386503067485, "grad_norm": 14.460637092590332, "learning_rate": 2.337025990626332e-05, "loss": 0.0953, "step": 1519 }, { "epoch": 0.5828220858895705, "grad_norm": 12.150667190551758, "learning_rate": 2.3348956114188328e-05, "loss": 0.0507, "step": 1520 }, { "epoch": 0.5832055214723927, "grad_norm": 6.960376739501953, "learning_rate": 2.3327652322113337e-05, "loss": 0.0439, "step": 1521 }, { "epoch": 0.5835889570552147, "grad_norm": 2.1229488849639893, "learning_rate": 2.3306348530038346e-05, "loss": 0.0077, "step": 1522 }, { "epoch": 0.5839723926380368, "grad_norm": 0.3395184278488159, "learning_rate": 2.3285044737963356e-05, "loss": 0.0015, "step": 1523 }, { "epoch": 0.5843558282208589, "grad_norm": 0.09832275658845901, "learning_rate": 2.326374094588837e-05, "loss": 0.0009, "step": 1524 }, { "epoch": 0.584739263803681, "grad_norm": 0.6131349802017212, "learning_rate": 2.3242437153813378e-05, "loss": 0.0028, "step": 1525 }, { "epoch": 0.5851226993865031, "grad_norm": 0.6060605049133301, "learning_rate": 2.322113336173839e-05, "loss": 0.003, "step": 1526 }, { "epoch": 0.5855061349693251, "grad_norm": 17.108949661254883, "learning_rate": 2.31998295696634e-05, "loss": 0.1991, "step": 1527 }, { "epoch": 0.5858895705521472, "grad_norm": 1.113153338432312, "learning_rate": 2.3178525777588413e-05, "loss": 0.0061, "step": 1528 }, { "epoch": 0.5862730061349694, "grad_norm": 28.444774627685547, "learning_rate": 
2.3157221985513422e-05, "loss": 0.522, "step": 1529 }, { "epoch": 0.5866564417177914, "grad_norm": 12.834977149963379, "learning_rate": 2.313591819343843e-05, "loss": 0.1884, "step": 1530 }, { "epoch": 0.5870398773006135, "grad_norm": 0.5456647872924805, "learning_rate": 2.3114614401363444e-05, "loss": 0.0032, "step": 1531 }, { "epoch": 0.5874233128834356, "grad_norm": 11.738081932067871, "learning_rate": 2.3093310609288453e-05, "loss": 0.4417, "step": 1532 }, { "epoch": 0.5878067484662577, "grad_norm": 16.152618408203125, "learning_rate": 2.3072006817213466e-05, "loss": 0.5591, "step": 1533 }, { "epoch": 0.5881901840490797, "grad_norm": 18.996780395507812, "learning_rate": 2.3050703025138475e-05, "loss": 0.1611, "step": 1534 }, { "epoch": 0.5885736196319018, "grad_norm": 0.45010751485824585, "learning_rate": 2.3029399233063488e-05, "loss": 0.0021, "step": 1535 }, { "epoch": 0.588957055214724, "grad_norm": 0.9772094488143921, "learning_rate": 2.3008095440988497e-05, "loss": 0.005, "step": 1536 }, { "epoch": 0.589340490797546, "grad_norm": 10.670825004577637, "learning_rate": 2.298679164891351e-05, "loss": 0.0729, "step": 1537 }, { "epoch": 0.5897239263803681, "grad_norm": 12.155583381652832, "learning_rate": 2.296548785683852e-05, "loss": 0.1005, "step": 1538 }, { "epoch": 0.5901073619631901, "grad_norm": 62.7037239074707, "learning_rate": 2.294418406476353e-05, "loss": 0.9976, "step": 1539 }, { "epoch": 0.5904907975460123, "grad_norm": 128.63902282714844, "learning_rate": 2.292288027268854e-05, "loss": 1.2705, "step": 1540 }, { "epoch": 0.5908742331288344, "grad_norm": 97.0986099243164, "learning_rate": 2.290157648061355e-05, "loss": 1.1673, "step": 1541 }, { "epoch": 0.5912576687116564, "grad_norm": 0.7080488801002502, "learning_rate": 2.2880272688538563e-05, "loss": 0.0027, "step": 1542 }, { "epoch": 0.5916411042944786, "grad_norm": 5.879316329956055, "learning_rate": 2.2858968896463573e-05, "loss": 0.0479, "step": 1543 }, { "epoch": 0.5920245398773006, 
"grad_norm": 187.53341674804688, "learning_rate": 2.2837665104388585e-05, "loss": 3.2312, "step": 1544 }, { "epoch": 0.5924079754601227, "grad_norm": 73.92506408691406, "learning_rate": 2.281636131231359e-05, "loss": 1.2617, "step": 1545 }, { "epoch": 0.5927914110429447, "grad_norm": 98.77815246582031, "learning_rate": 2.2795057520238604e-05, "loss": 1.798, "step": 1546 }, { "epoch": 0.5931748466257669, "grad_norm": 13.581368446350098, "learning_rate": 2.2773753728163613e-05, "loss": 0.0398, "step": 1547 }, { "epoch": 0.593558282208589, "grad_norm": 0.6549582481384277, "learning_rate": 2.2752449936088622e-05, "loss": 0.003, "step": 1548 }, { "epoch": 0.593941717791411, "grad_norm": 203.76071166992188, "learning_rate": 2.2731146144013635e-05, "loss": 2.1231, "step": 1549 }, { "epoch": 0.5943251533742331, "grad_norm": 0.31085968017578125, "learning_rate": 2.2709842351938644e-05, "loss": 0.0024, "step": 1550 }, { "epoch": 0.5947085889570553, "grad_norm": 0.24026542901992798, "learning_rate": 2.2688538559863657e-05, "loss": 0.0011, "step": 1551 }, { "epoch": 0.5950920245398773, "grad_norm": 134.61109924316406, "learning_rate": 2.2667234767788666e-05, "loss": 1.0635, "step": 1552 }, { "epoch": 0.5954754601226994, "grad_norm": 51.36643981933594, "learning_rate": 2.264593097571368e-05, "loss": 1.0771, "step": 1553 }, { "epoch": 0.5958588957055214, "grad_norm": 89.78507995605469, "learning_rate": 2.262462718363869e-05, "loss": 0.814, "step": 1554 }, { "epoch": 0.5962423312883436, "grad_norm": 152.59800720214844, "learning_rate": 2.2603323391563698e-05, "loss": 1.3302, "step": 1555 }, { "epoch": 0.5966257668711656, "grad_norm": 16.452253341674805, "learning_rate": 2.258201959948871e-05, "loss": 0.1604, "step": 1556 }, { "epoch": 0.5970092024539877, "grad_norm": 1.21446692943573, "learning_rate": 2.256071580741372e-05, "loss": 0.0063, "step": 1557 }, { "epoch": 0.5973926380368099, "grad_norm": 0.23559719324111938, "learning_rate": 2.2539412015338732e-05, "loss": 0.0012, 
"step": 1558 }, { "epoch": 0.5977760736196319, "grad_norm": 5.033271312713623, "learning_rate": 2.2518108223263742e-05, "loss": 0.0282, "step": 1559 }, { "epoch": 0.598159509202454, "grad_norm": 25.347471237182617, "learning_rate": 2.2496804431188755e-05, "loss": 0.5996, "step": 1560 }, { "epoch": 0.598542944785276, "grad_norm": 4.496798992156982, "learning_rate": 2.2475500639113764e-05, "loss": 0.0331, "step": 1561 }, { "epoch": 0.5989263803680982, "grad_norm": 74.6218490600586, "learning_rate": 2.2454196847038773e-05, "loss": 0.5161, "step": 1562 }, { "epoch": 0.5993098159509203, "grad_norm": 0.9332700967788696, "learning_rate": 2.2432893054963786e-05, "loss": 0.0054, "step": 1563 }, { "epoch": 0.5996932515337423, "grad_norm": 2.685960292816162, "learning_rate": 2.2411589262888795e-05, "loss": 0.0127, "step": 1564 }, { "epoch": 0.6000766871165644, "grad_norm": 3.394599676132202, "learning_rate": 2.2390285470813808e-05, "loss": 0.0126, "step": 1565 }, { "epoch": 0.6004601226993865, "grad_norm": 7.925353050231934, "learning_rate": 2.2368981678738817e-05, "loss": 0.0096, "step": 1566 }, { "epoch": 0.6008435582822086, "grad_norm": 65.36844635009766, "learning_rate": 2.2347677886663826e-05, "loss": 0.9502, "step": 1567 }, { "epoch": 0.6012269938650306, "grad_norm": 1.044142246246338, "learning_rate": 2.2326374094588836e-05, "loss": 0.0031, "step": 1568 }, { "epoch": 0.6016104294478528, "grad_norm": 1.1998786926269531, "learning_rate": 2.230507030251385e-05, "loss": 0.0049, "step": 1569 }, { "epoch": 0.6019938650306749, "grad_norm": 7.696629524230957, "learning_rate": 2.2283766510438858e-05, "loss": 0.0278, "step": 1570 }, { "epoch": 0.6023773006134969, "grad_norm": 2.3067355155944824, "learning_rate": 2.226246271836387e-05, "loss": 0.0082, "step": 1571 }, { "epoch": 0.602760736196319, "grad_norm": 8.440646171569824, "learning_rate": 2.224115892628888e-05, "loss": 0.2568, "step": 1572 }, { "epoch": 0.6031441717791411, "grad_norm": 23.990575790405273, "learning_rate": 
2.221985513421389e-05, "loss": 0.5166, "step": 1573 }, { "epoch": 0.6035276073619632, "grad_norm": 72.5779037475586, "learning_rate": 2.2198551342138902e-05, "loss": 1.2823, "step": 1574 }, { "epoch": 0.6039110429447853, "grad_norm": 0.22823019325733185, "learning_rate": 2.217724755006391e-05, "loss": 0.0019, "step": 1575 }, { "epoch": 0.6042944785276073, "grad_norm": 18.47128677368164, "learning_rate": 2.2155943757988924e-05, "loss": 0.4653, "step": 1576 }, { "epoch": 0.6046779141104295, "grad_norm": 1.0686135292053223, "learning_rate": 2.2134639965913933e-05, "loss": 0.0038, "step": 1577 }, { "epoch": 0.6050613496932515, "grad_norm": 34.65068435668945, "learning_rate": 2.2113336173838946e-05, "loss": 0.6939, "step": 1578 }, { "epoch": 0.6054447852760736, "grad_norm": 21.57245635986328, "learning_rate": 2.2092032381763955e-05, "loss": 0.1171, "step": 1579 }, { "epoch": 0.6058282208588958, "grad_norm": 0.4185841381549835, "learning_rate": 2.2070728589688964e-05, "loss": 0.0025, "step": 1580 }, { "epoch": 0.6062116564417178, "grad_norm": 110.56187438964844, "learning_rate": 2.2049424797613977e-05, "loss": 1.5998, "step": 1581 }, { "epoch": 0.6065950920245399, "grad_norm": 38.00716781616211, "learning_rate": 2.2028121005538986e-05, "loss": 0.6504, "step": 1582 }, { "epoch": 0.6069785276073619, "grad_norm": 0.40848639607429504, "learning_rate": 2.2006817213464e-05, "loss": 0.0018, "step": 1583 }, { "epoch": 0.6073619631901841, "grad_norm": 0.4053003787994385, "learning_rate": 2.198551342138901e-05, "loss": 0.0019, "step": 1584 }, { "epoch": 0.6077453987730062, "grad_norm": 3.820105791091919, "learning_rate": 2.196420962931402e-05, "loss": 0.0084, "step": 1585 }, { "epoch": 0.6081288343558282, "grad_norm": 0.306547075510025, "learning_rate": 2.194290583723903e-05, "loss": 0.0018, "step": 1586 }, { "epoch": 0.6085122699386503, "grad_norm": 230.79635620117188, "learning_rate": 2.192160204516404e-05, "loss": 3.7719, "step": 1587 }, { "epoch": 0.6088957055214724, 
"grad_norm": 52.079227447509766, "learning_rate": 2.1900298253089052e-05, "loss": 0.7021, "step": 1588 }, { "epoch": 0.6092791411042945, "grad_norm": 1.4306854009628296, "learning_rate": 2.1878994461014062e-05, "loss": 0.0045, "step": 1589 }, { "epoch": 0.6096625766871165, "grad_norm": 29.657840728759766, "learning_rate": 2.185769066893907e-05, "loss": 0.7041, "step": 1590 }, { "epoch": 0.6100460122699386, "grad_norm": 0.2373381108045578, "learning_rate": 2.183638687686408e-05, "loss": 0.0016, "step": 1591 }, { "epoch": 0.6104294478527608, "grad_norm": 76.70428466796875, "learning_rate": 2.1815083084789093e-05, "loss": 2.3907, "step": 1592 }, { "epoch": 0.6108128834355828, "grad_norm": 25.90323829650879, "learning_rate": 2.1793779292714102e-05, "loss": 0.4868, "step": 1593 }, { "epoch": 0.6111963190184049, "grad_norm": 3.877763271331787, "learning_rate": 2.1772475500639115e-05, "loss": 0.0123, "step": 1594 }, { "epoch": 0.611579754601227, "grad_norm": 31.096912384033203, "learning_rate": 2.1751171708564124e-05, "loss": 0.7109, "step": 1595 }, { "epoch": 0.6119631901840491, "grad_norm": 306.73468017578125, "learning_rate": 2.1729867916489137e-05, "loss": 4.0736, "step": 1596 }, { "epoch": 0.6123466257668712, "grad_norm": 5.6847028732299805, "learning_rate": 2.1708564124414146e-05, "loss": 0.2793, "step": 1597 }, { "epoch": 0.6127300613496932, "grad_norm": 0.3299265205860138, "learning_rate": 2.1687260332339156e-05, "loss": 0.0015, "step": 1598 }, { "epoch": 0.6131134969325154, "grad_norm": 4.540491580963135, "learning_rate": 2.166595654026417e-05, "loss": 0.013, "step": 1599 }, { "epoch": 0.6134969325153374, "grad_norm": 13.582939147949219, "learning_rate": 2.1644652748189178e-05, "loss": 0.5747, "step": 1600 }, { "epoch": 0.6138803680981595, "grad_norm": 0.24872423708438873, "learning_rate": 2.162334895611419e-05, "loss": 0.002, "step": 1601 }, { "epoch": 0.6142638036809815, "grad_norm": 14.802531242370605, "learning_rate": 2.16020451640392e-05, "loss": 0.0799, 
"step": 1602 }, { "epoch": 0.6146472392638037, "grad_norm": 0.7813965678215027, "learning_rate": 2.1580741371964212e-05, "loss": 0.0032, "step": 1603 }, { "epoch": 0.6150306748466258, "grad_norm": 137.3957061767578, "learning_rate": 2.1559437579889222e-05, "loss": 1.9415, "step": 1604 }, { "epoch": 0.6154141104294478, "grad_norm": 0.37478020787239075, "learning_rate": 2.153813378781423e-05, "loss": 0.0012, "step": 1605 }, { "epoch": 0.61579754601227, "grad_norm": 313.3479309082031, "learning_rate": 2.1516829995739244e-05, "loss": 6.2529, "step": 1606 }, { "epoch": 0.616180981595092, "grad_norm": 0.25419455766677856, "learning_rate": 2.1495526203664253e-05, "loss": 0.0014, "step": 1607 }, { "epoch": 0.6165644171779141, "grad_norm": 4.846649646759033, "learning_rate": 2.1474222411589266e-05, "loss": 0.0219, "step": 1608 }, { "epoch": 0.6169478527607362, "grad_norm": 37.852149963378906, "learning_rate": 2.1452918619514275e-05, "loss": 0.5962, "step": 1609 }, { "epoch": 0.6173312883435583, "grad_norm": 1.2913243770599365, "learning_rate": 2.1431614827439288e-05, "loss": 0.003, "step": 1610 }, { "epoch": 0.6177147239263804, "grad_norm": 0.35508644580841064, "learning_rate": 2.1410311035364297e-05, "loss": 0.0011, "step": 1611 }, { "epoch": 0.6180981595092024, "grad_norm": 3.2686877250671387, "learning_rate": 2.1389007243289306e-05, "loss": 0.0142, "step": 1612 }, { "epoch": 0.6184815950920245, "grad_norm": 1.7194098234176636, "learning_rate": 2.1367703451214316e-05, "loss": 0.0058, "step": 1613 }, { "epoch": 0.6188650306748467, "grad_norm": 0.6371323466300964, "learning_rate": 2.1346399659139325e-05, "loss": 0.0014, "step": 1614 }, { "epoch": 0.6192484662576687, "grad_norm": 330.1398010253906, "learning_rate": 2.1325095867064338e-05, "loss": 3.541, "step": 1615 }, { "epoch": 0.6196319018404908, "grad_norm": 145.93212890625, "learning_rate": 2.1303792074989347e-05, "loss": 2.0606, "step": 1616 }, { "epoch": 0.6200153374233128, "grad_norm": 8.430885314941406, 
"learning_rate": 2.128248828291436e-05, "loss": 0.4036, "step": 1617 }, { "epoch": 0.620398773006135, "grad_norm": 22.9518985748291, "learning_rate": 2.126118449083937e-05, "loss": 0.5459, "step": 1618 }, { "epoch": 0.620782208588957, "grad_norm": 4.829712867736816, "learning_rate": 2.1239880698764382e-05, "loss": 0.031, "step": 1619 }, { "epoch": 0.6211656441717791, "grad_norm": 7.079444408416748, "learning_rate": 2.121857690668939e-05, "loss": 0.0195, "step": 1620 }, { "epoch": 0.6215490797546013, "grad_norm": 31.619905471801758, "learning_rate": 2.1197273114614404e-05, "loss": 0.71, "step": 1621 }, { "epoch": 0.6219325153374233, "grad_norm": 176.59059143066406, "learning_rate": 2.1175969322539413e-05, "loss": 1.6428, "step": 1622 }, { "epoch": 0.6223159509202454, "grad_norm": 2.9753220081329346, "learning_rate": 2.1154665530464422e-05, "loss": 0.0036, "step": 1623 }, { "epoch": 0.6226993865030674, "grad_norm": 14.047575950622559, "learning_rate": 2.1133361738389435e-05, "loss": 0.5679, "step": 1624 }, { "epoch": 0.6230828220858896, "grad_norm": 15.567380905151367, "learning_rate": 2.1112057946314444e-05, "loss": 0.0282, "step": 1625 }, { "epoch": 0.6234662576687117, "grad_norm": 0.5999993681907654, "learning_rate": 2.1090754154239457e-05, "loss": 0.0019, "step": 1626 }, { "epoch": 0.6238496932515337, "grad_norm": 48.12718963623047, "learning_rate": 2.1069450362164466e-05, "loss": 0.7349, "step": 1627 }, { "epoch": 0.6242331288343558, "grad_norm": 1.9771808385849, "learning_rate": 2.104814657008948e-05, "loss": 0.0071, "step": 1628 }, { "epoch": 0.6246165644171779, "grad_norm": 0.9278380870819092, "learning_rate": 2.102684277801449e-05, "loss": 0.0045, "step": 1629 }, { "epoch": 0.625, "grad_norm": 0.28498128056526184, "learning_rate": 2.1005538985939498e-05, "loss": 0.0013, "step": 1630 }, { "epoch": 0.6253834355828221, "grad_norm": 177.34898376464844, "learning_rate": 2.098423519386451e-05, "loss": 2.348, "step": 1631 }, { "epoch": 0.6257668711656442, 
"grad_norm": 60.71388244628906, "learning_rate": 2.096293140178952e-05, "loss": 0.7481, "step": 1632 }, { "epoch": 0.6261503067484663, "grad_norm": 13.655220031738281, "learning_rate": 2.0941627609714532e-05, "loss": 0.0885, "step": 1633 }, { "epoch": 0.6265337423312883, "grad_norm": 205.313232421875, "learning_rate": 2.0920323817639542e-05, "loss": 4.107, "step": 1634 }, { "epoch": 0.6269171779141104, "grad_norm": 2.9884133338928223, "learning_rate": 2.089902002556455e-05, "loss": 0.0125, "step": 1635 }, { "epoch": 0.6273006134969326, "grad_norm": 170.3273162841797, "learning_rate": 2.087771623348956e-05, "loss": 6.5684, "step": 1636 }, { "epoch": 0.6276840490797546, "grad_norm": 14.023516654968262, "learning_rate": 2.0856412441414573e-05, "loss": 0.1242, "step": 1637 }, { "epoch": 0.6280674846625767, "grad_norm": 17.409425735473633, "learning_rate": 2.0835108649339582e-05, "loss": 0.3997, "step": 1638 }, { "epoch": 0.6284509202453987, "grad_norm": 31.173315048217773, "learning_rate": 2.0813804857264592e-05, "loss": 0.6865, "step": 1639 }, { "epoch": 0.6288343558282209, "grad_norm": 1.5950876474380493, "learning_rate": 2.0792501065189604e-05, "loss": 0.0053, "step": 1640 }, { "epoch": 0.629217791411043, "grad_norm": 13.480043411254883, "learning_rate": 2.0771197273114614e-05, "loss": 0.0521, "step": 1641 }, { "epoch": 0.629601226993865, "grad_norm": 14.539470672607422, "learning_rate": 2.0749893481039626e-05, "loss": 0.1625, "step": 1642 }, { "epoch": 0.6299846625766872, "grad_norm": 26.961544036865234, "learning_rate": 2.0728589688964636e-05, "loss": 0.0735, "step": 1643 }, { "epoch": 0.6303680981595092, "grad_norm": 59.08195114135742, "learning_rate": 2.070728589688965e-05, "loss": 1.04, "step": 1644 }, { "epoch": 0.6307515337423313, "grad_norm": 0.13883554935455322, "learning_rate": 2.0685982104814658e-05, "loss": 0.0013, "step": 1645 }, { "epoch": 0.6311349693251533, "grad_norm": 54.861446380615234, "learning_rate": 2.0664678312739667e-05, "loss": 0.8756, 
"step": 1646 }, { "epoch": 0.6315184049079755, "grad_norm": 0.09707655757665634, "learning_rate": 2.064337452066468e-05, "loss": 0.0009, "step": 1647 }, { "epoch": 0.6319018404907976, "grad_norm": 5.110586643218994, "learning_rate": 2.062207072858969e-05, "loss": 0.0163, "step": 1648 }, { "epoch": 0.6322852760736196, "grad_norm": 17.940378189086914, "learning_rate": 2.0600766936514702e-05, "loss": 0.4658, "step": 1649 }, { "epoch": 0.6326687116564417, "grad_norm": 0.4684121012687683, "learning_rate": 2.057946314443971e-05, "loss": 0.0021, "step": 1650 }, { "epoch": 0.6330521472392638, "grad_norm": 2.591216564178467, "learning_rate": 2.0558159352364724e-05, "loss": 0.011, "step": 1651 }, { "epoch": 0.6334355828220859, "grad_norm": 1.9084800481796265, "learning_rate": 2.0536855560289733e-05, "loss": 0.0063, "step": 1652 }, { "epoch": 0.633819018404908, "grad_norm": 101.3776626586914, "learning_rate": 2.0515551768214742e-05, "loss": 1.0938, "step": 1653 }, { "epoch": 0.63420245398773, "grad_norm": 2.34978985786438, "learning_rate": 2.0494247976139755e-05, "loss": 0.0095, "step": 1654 }, { "epoch": 0.6345858895705522, "grad_norm": 116.3980941772461, "learning_rate": 2.0472944184064764e-05, "loss": 0.9522, "step": 1655 }, { "epoch": 0.6349693251533742, "grad_norm": 48.77947998046875, "learning_rate": 2.0451640391989777e-05, "loss": 0.6523, "step": 1656 }, { "epoch": 0.6353527607361963, "grad_norm": 91.40596771240234, "learning_rate": 2.0430336599914783e-05, "loss": 1.8175, "step": 1657 }, { "epoch": 0.6357361963190185, "grad_norm": 9.864049911499023, "learning_rate": 2.0409032807839796e-05, "loss": 0.1552, "step": 1658 }, { "epoch": 0.6361196319018405, "grad_norm": 226.86497497558594, "learning_rate": 2.0387729015764805e-05, "loss": 6.8369, "step": 1659 }, { "epoch": 0.6365030674846626, "grad_norm": 153.96597290039062, "learning_rate": 2.0366425223689818e-05, "loss": 4.1322, "step": 1660 }, { "epoch": 0.6368865030674846, "grad_norm": 154.2721710205078, "learning_rate": 
2.0345121431614827e-05, "loss": 5.7915, "step": 1661 }, { "epoch": 0.6372699386503068, "grad_norm": 135.22979736328125, "learning_rate": 2.032381763953984e-05, "loss": 0.6553, "step": 1662 }, { "epoch": 0.6376533742331288, "grad_norm": 14.17731761932373, "learning_rate": 2.030251384746485e-05, "loss": 0.0685, "step": 1663 }, { "epoch": 0.6380368098159509, "grad_norm": 8.351577758789062, "learning_rate": 2.028121005538986e-05, "loss": 0.405, "step": 1664 }, { "epoch": 0.638420245398773, "grad_norm": 159.18130493164062, "learning_rate": 2.025990626331487e-05, "loss": 1.1095, "step": 1665 }, { "epoch": 0.6388036809815951, "grad_norm": 22.646175384521484, "learning_rate": 2.023860247123988e-05, "loss": 0.0969, "step": 1666 }, { "epoch": 0.6391871165644172, "grad_norm": 477.4721374511719, "learning_rate": 2.0217298679164893e-05, "loss": 1.6495, "step": 1667 }, { "epoch": 0.6395705521472392, "grad_norm": 0.9542309045791626, "learning_rate": 2.0195994887089902e-05, "loss": 0.0026, "step": 1668 }, { "epoch": 0.6399539877300614, "grad_norm": 2.2312052249908447, "learning_rate": 2.0174691095014915e-05, "loss": 0.0112, "step": 1669 }, { "epoch": 0.6403374233128835, "grad_norm": 2.4044041633605957, "learning_rate": 2.0153387302939924e-05, "loss": 0.0085, "step": 1670 }, { "epoch": 0.6407208588957055, "grad_norm": 2.774695873260498, "learning_rate": 2.0132083510864934e-05, "loss": 0.0101, "step": 1671 }, { "epoch": 0.6411042944785276, "grad_norm": 0.4862213134765625, "learning_rate": 2.0110779718789946e-05, "loss": 0.0024, "step": 1672 }, { "epoch": 0.6414877300613497, "grad_norm": 20.64777374267578, "learning_rate": 2.0089475926714956e-05, "loss": 0.4646, "step": 1673 }, { "epoch": 0.6418711656441718, "grad_norm": 79.48463439941406, "learning_rate": 2.006817213463997e-05, "loss": 1.1094, "step": 1674 }, { "epoch": 0.6422546012269938, "grad_norm": 6.128467082977295, "learning_rate": 2.0046868342564978e-05, "loss": 0.0144, "step": 1675 }, { "epoch": 0.6426380368098159, 
"grad_norm": 157.76174926757812, "learning_rate": 2.002556455048999e-05, "loss": 1.3321, "step": 1676 }, { "epoch": 0.6430214723926381, "grad_norm": 11.371692657470703, "learning_rate": 2.0004260758415e-05, "loss": 0.4241, "step": 1677 }, { "epoch": 0.6434049079754601, "grad_norm": 9.441778182983398, "learning_rate": 1.998295696634001e-05, "loss": 0.1427, "step": 1678 }, { "epoch": 0.6437883435582822, "grad_norm": 12.599998474121094, "learning_rate": 1.9961653174265022e-05, "loss": 0.1801, "step": 1679 }, { "epoch": 0.6441717791411042, "grad_norm": 1.525307059288025, "learning_rate": 1.994034938219003e-05, "loss": 0.0042, "step": 1680 }, { "epoch": 0.6445552147239264, "grad_norm": 0.44260019063949585, "learning_rate": 1.991904559011504e-05, "loss": 0.0021, "step": 1681 }, { "epoch": 0.6449386503067485, "grad_norm": 10.011754989624023, "learning_rate": 1.989774179804005e-05, "loss": 0.3962, "step": 1682 }, { "epoch": 0.6453220858895705, "grad_norm": 5.811450004577637, "learning_rate": 1.9876438005965062e-05, "loss": 0.3892, "step": 1683 }, { "epoch": 0.6457055214723927, "grad_norm": 0.6379611492156982, "learning_rate": 1.9855134213890072e-05, "loss": 0.0036, "step": 1684 }, { "epoch": 0.6460889570552147, "grad_norm": 0.11190050095319748, "learning_rate": 1.9833830421815084e-05, "loss": 0.0009, "step": 1685 }, { "epoch": 0.6464723926380368, "grad_norm": 4.65255069732666, "learning_rate": 1.9812526629740094e-05, "loss": 0.0162, "step": 1686 }, { "epoch": 0.6468558282208589, "grad_norm": 44.55350875854492, "learning_rate": 1.9791222837665106e-05, "loss": 0.8569, "step": 1687 }, { "epoch": 0.647239263803681, "grad_norm": 0.41566142439842224, "learning_rate": 1.9769919045590116e-05, "loss": 0.0015, "step": 1688 }, { "epoch": 0.6476226993865031, "grad_norm": 5.981119155883789, "learning_rate": 1.9748615253515125e-05, "loss": 0.3823, "step": 1689 }, { "epoch": 0.6480061349693251, "grad_norm": 2.8890583515167236, "learning_rate": 1.9727311461440138e-05, "loss": 0.0058, 
"step": 1690 }, { "epoch": 0.6483895705521472, "grad_norm": 180.98516845703125, "learning_rate": 1.9706007669365147e-05, "loss": 6.533, "step": 1691 }, { "epoch": 0.6487730061349694, "grad_norm": 0.5231181979179382, "learning_rate": 1.968470387729016e-05, "loss": 0.0025, "step": 1692 }, { "epoch": 0.6491564417177914, "grad_norm": 0.8823917508125305, "learning_rate": 1.966340008521517e-05, "loss": 0.0038, "step": 1693 }, { "epoch": 0.6495398773006135, "grad_norm": 9.24533462524414, "learning_rate": 1.9642096293140182e-05, "loss": 0.4578, "step": 1694 }, { "epoch": 0.6499233128834356, "grad_norm": 258.1440734863281, "learning_rate": 1.962079250106519e-05, "loss": 3.4919, "step": 1695 }, { "epoch": 0.6503067484662577, "grad_norm": 12.331808090209961, "learning_rate": 1.95994887089902e-05, "loss": 0.0584, "step": 1696 }, { "epoch": 0.6506901840490797, "grad_norm": 1.3217711448669434, "learning_rate": 1.9578184916915213e-05, "loss": 0.0049, "step": 1697 }, { "epoch": 0.6510736196319018, "grad_norm": 32.48533248901367, "learning_rate": 1.9556881124840222e-05, "loss": 0.6284, "step": 1698 }, { "epoch": 0.651457055214724, "grad_norm": 0.46559828519821167, "learning_rate": 1.9535577332765235e-05, "loss": 0.002, "step": 1699 }, { "epoch": 0.651840490797546, "grad_norm": 2.718585968017578, "learning_rate": 1.9514273540690244e-05, "loss": 0.0072, "step": 1700 }, { "epoch": 0.6522239263803681, "grad_norm": 5.244730472564697, "learning_rate": 1.9492969748615257e-05, "loss": 0.0048, "step": 1701 }, { "epoch": 0.6526073619631901, "grad_norm": 114.31527709960938, "learning_rate": 1.9471665956540266e-05, "loss": 0.8994, "step": 1702 }, { "epoch": 0.6529907975460123, "grad_norm": 28.290483474731445, "learning_rate": 1.9450362164465276e-05, "loss": 0.5547, "step": 1703 }, { "epoch": 0.6533742331288344, "grad_norm": 96.13031005859375, "learning_rate": 1.9429058372390285e-05, "loss": 1.3057, "step": 1704 }, { "epoch": 0.6537576687116564, "grad_norm": 0.8256839513778687, "learning_rate": 
1.9407754580315294e-05, "loss": 0.0038, "step": 1705 }, { "epoch": 0.6541411042944786, "grad_norm": 69.43433380126953, "learning_rate": 1.9386450788240307e-05, "loss": 0.5801, "step": 1706 }, { "epoch": 0.6545245398773006, "grad_norm": 5.129863739013672, "learning_rate": 1.9365146996165316e-05, "loss": 0.0082, "step": 1707 }, { "epoch": 0.6549079754601227, "grad_norm": 290.7584533691406, "learning_rate": 1.934384320409033e-05, "loss": 3.4702, "step": 1708 }, { "epoch": 0.6552914110429447, "grad_norm": 15.134051322937012, "learning_rate": 1.932253941201534e-05, "loss": 0.4878, "step": 1709 }, { "epoch": 0.6556748466257669, "grad_norm": 7.622095108032227, "learning_rate": 1.930123561994035e-05, "loss": 0.0488, "step": 1710 }, { "epoch": 0.656058282208589, "grad_norm": 1.2107186317443848, "learning_rate": 1.927993182786536e-05, "loss": 0.0041, "step": 1711 }, { "epoch": 0.656441717791411, "grad_norm": 0.566555380821228, "learning_rate": 1.9258628035790373e-05, "loss": 0.0017, "step": 1712 }, { "epoch": 0.6568251533742331, "grad_norm": 0.2017064243555069, "learning_rate": 1.9237324243715382e-05, "loss": 0.0016, "step": 1713 }, { "epoch": 0.6572085889570553, "grad_norm": 42.69878387451172, "learning_rate": 1.9216020451640392e-05, "loss": 0.5176, "step": 1714 }, { "epoch": 0.6575920245398773, "grad_norm": 116.39155578613281, "learning_rate": 1.9194716659565404e-05, "loss": 1.8694, "step": 1715 }, { "epoch": 0.6579754601226994, "grad_norm": 9.295954704284668, "learning_rate": 1.9173412867490414e-05, "loss": 0.0294, "step": 1716 }, { "epoch": 0.6583588957055214, "grad_norm": 0.10093100368976593, "learning_rate": 1.9152109075415426e-05, "loss": 0.0007, "step": 1717 }, { "epoch": 0.6587423312883436, "grad_norm": 5.128528118133545, "learning_rate": 1.9130805283340436e-05, "loss": 0.0175, "step": 1718 }, { "epoch": 0.6591257668711656, "grad_norm": 46.20321273803711, "learning_rate": 1.910950149126545e-05, "loss": 0.7852, "step": 1719 }, { "epoch": 0.6595092024539877, 
"grad_norm": 96.1789779663086, "learning_rate": 1.9088197699190458e-05, "loss": 1.1777, "step": 1720 }, { "epoch": 0.6598926380368099, "grad_norm": 170.0279083251953, "learning_rate": 1.9066893907115467e-05, "loss": 5.8379, "step": 1721 }, { "epoch": 0.6602760736196319, "grad_norm": 15.705221176147461, "learning_rate": 1.904559011504048e-05, "loss": 0.4824, "step": 1722 }, { "epoch": 0.660659509202454, "grad_norm": 1.9702868461608887, "learning_rate": 1.902428632296549e-05, "loss": 0.0045, "step": 1723 }, { "epoch": 0.661042944785276, "grad_norm": 66.49562072753906, "learning_rate": 1.9002982530890502e-05, "loss": 1.2412, "step": 1724 }, { "epoch": 0.6614263803680982, "grad_norm": 9.27974796295166, "learning_rate": 1.8981678738815508e-05, "loss": 0.4216, "step": 1725 }, { "epoch": 0.6618098159509203, "grad_norm": 2.411372423171997, "learning_rate": 1.896037494674052e-05, "loss": 0.008, "step": 1726 }, { "epoch": 0.6621932515337423, "grad_norm": 32.30564880371094, "learning_rate": 1.893907115466553e-05, "loss": 0.1776, "step": 1727 }, { "epoch": 0.6625766871165644, "grad_norm": 2.3772690296173096, "learning_rate": 1.8917767362590542e-05, "loss": 0.0133, "step": 1728 }, { "epoch": 0.6629601226993865, "grad_norm": 39.052059173583984, "learning_rate": 1.8896463570515552e-05, "loss": 1.125, "step": 1729 }, { "epoch": 0.6633435582822086, "grad_norm": 0.2642825245857239, "learning_rate": 1.887515977844056e-05, "loss": 0.0021, "step": 1730 }, { "epoch": 0.6637269938650306, "grad_norm": 84.0382308959961, "learning_rate": 1.8853855986365574e-05, "loss": 1.0918, "step": 1731 }, { "epoch": 0.6641104294478528, "grad_norm": 45.17191696166992, "learning_rate": 1.8832552194290583e-05, "loss": 0.5645, "step": 1732 }, { "epoch": 0.6644938650306749, "grad_norm": 0.3179865777492523, "learning_rate": 1.8811248402215596e-05, "loss": 0.0017, "step": 1733 }, { "epoch": 0.6648773006134969, "grad_norm": 198.32339477539062, "learning_rate": 1.8789944610140605e-05, "loss": 6.1274, "step": 
1734 }, { "epoch": 0.665260736196319, "grad_norm": 107.39860534667969, "learning_rate": 1.8768640818065618e-05, "loss": 0.8614, "step": 1735 }, { "epoch": 0.6656441717791411, "grad_norm": 2.4909374713897705, "learning_rate": 1.8747337025990627e-05, "loss": 0.014, "step": 1736 }, { "epoch": 0.6660276073619632, "grad_norm": 9.335639953613281, "learning_rate": 1.8726033233915636e-05, "loss": 0.0574, "step": 1737 }, { "epoch": 0.6664110429447853, "grad_norm": 14.218742370605469, "learning_rate": 1.870472944184065e-05, "loss": 0.5034, "step": 1738 }, { "epoch": 0.6667944785276073, "grad_norm": 0.4584377706050873, "learning_rate": 1.868342564976566e-05, "loss": 0.0018, "step": 1739 }, { "epoch": 0.6671779141104295, "grad_norm": 10.58724308013916, "learning_rate": 1.866212185769067e-05, "loss": 0.0159, "step": 1740 }, { "epoch": 0.6675613496932515, "grad_norm": 7.719966411590576, "learning_rate": 1.864081806561568e-05, "loss": 0.0215, "step": 1741 }, { "epoch": 0.6679447852760736, "grad_norm": 73.7376708984375, "learning_rate": 1.8619514273540693e-05, "loss": 0.8589, "step": 1742 }, { "epoch": 0.6683282208588958, "grad_norm": 4.980079174041748, "learning_rate": 1.8598210481465702e-05, "loss": 0.0174, "step": 1743 }, { "epoch": 0.6687116564417178, "grad_norm": 12.377878189086914, "learning_rate": 1.8576906689390715e-05, "loss": 0.4548, "step": 1744 }, { "epoch": 0.6690950920245399, "grad_norm": 134.05860900878906, "learning_rate": 1.8555602897315724e-05, "loss": 6.9141, "step": 1745 }, { "epoch": 0.6694785276073619, "grad_norm": 0.6603541374206543, "learning_rate": 1.8534299105240734e-05, "loss": 0.0032, "step": 1746 }, { "epoch": 0.6698619631901841, "grad_norm": 268.3301696777344, "learning_rate": 1.8512995313165746e-05, "loss": 5.8032, "step": 1747 }, { "epoch": 0.6702453987730062, "grad_norm": 12.854681968688965, "learning_rate": 1.8491691521090752e-05, "loss": 0.0681, "step": 1748 }, { "epoch": 0.6706288343558282, "grad_norm": 0.8064194321632385, "learning_rate": 
1.8470387729015765e-05, "loss": 0.0032, "step": 1749 }, { "epoch": 0.6710122699386503, "grad_norm": 29.416452407836914, "learning_rate": 1.8449083936940774e-05, "loss": 0.5005, "step": 1750 }, { "epoch": 0.6713957055214724, "grad_norm": 204.04847717285156, "learning_rate": 1.8427780144865787e-05, "loss": 5.6729, "step": 1751 }, { "epoch": 0.6717791411042945, "grad_norm": 12.783069610595703, "learning_rate": 1.8406476352790796e-05, "loss": 0.5464, "step": 1752 }, { "epoch": 0.6721625766871165, "grad_norm": 193.79493713378906, "learning_rate": 1.838517256071581e-05, "loss": 2.2484, "step": 1753 }, { "epoch": 0.6725460122699386, "grad_norm": 4.909018516540527, "learning_rate": 1.836386876864082e-05, "loss": 0.0157, "step": 1754 }, { "epoch": 0.6729294478527608, "grad_norm": 124.53752136230469, "learning_rate": 1.8342564976565828e-05, "loss": 1.0596, "step": 1755 }, { "epoch": 0.6733128834355828, "grad_norm": 3.7626724243164062, "learning_rate": 1.832126118449084e-05, "loss": 0.0155, "step": 1756 }, { "epoch": 0.6736963190184049, "grad_norm": 18.932302474975586, "learning_rate": 1.829995739241585e-05, "loss": 0.4568, "step": 1757 }, { "epoch": 0.674079754601227, "grad_norm": 186.578857421875, "learning_rate": 1.8278653600340862e-05, "loss": 2.893, "step": 1758 }, { "epoch": 0.6744631901840491, "grad_norm": 195.05938720703125, "learning_rate": 1.8257349808265872e-05, "loss": 1.1065, "step": 1759 }, { "epoch": 0.6748466257668712, "grad_norm": 2.9090867042541504, "learning_rate": 1.8236046016190884e-05, "loss": 0.0057, "step": 1760 }, { "epoch": 0.6752300613496932, "grad_norm": 2.2006096839904785, "learning_rate": 1.8214742224115894e-05, "loss": 0.0067, "step": 1761 }, { "epoch": 0.6756134969325154, "grad_norm": 2.0305356979370117, "learning_rate": 1.8193438432040903e-05, "loss": 0.0048, "step": 1762 }, { "epoch": 0.6759969325153374, "grad_norm": 0.5942795872688293, "learning_rate": 1.8172134639965916e-05, "loss": 0.0025, "step": 1763 }, { "epoch": 0.6763803680981595, 
"grad_norm": 113.49565887451172, "learning_rate": 1.8150830847890925e-05, "loss": 0.855, "step": 1764 }, { "epoch": 0.6767638036809815, "grad_norm": 4.834312915802002, "learning_rate": 1.8129527055815938e-05, "loss": 0.0242, "step": 1765 }, { "epoch": 0.6771472392638037, "grad_norm": 15.199250221252441, "learning_rate": 1.8108223263740947e-05, "loss": 0.0306, "step": 1766 }, { "epoch": 0.6775306748466258, "grad_norm": 6.489785671234131, "learning_rate": 1.808691947166596e-05, "loss": 0.3616, "step": 1767 }, { "epoch": 0.6779141104294478, "grad_norm": 3.202645778656006, "learning_rate": 1.806561567959097e-05, "loss": 0.0137, "step": 1768 }, { "epoch": 0.67829754601227, "grad_norm": 49.37078857421875, "learning_rate": 1.804431188751598e-05, "loss": 0.5967, "step": 1769 }, { "epoch": 0.678680981595092, "grad_norm": 28.319705963134766, "learning_rate": 1.8023008095440988e-05, "loss": 0.4783, "step": 1770 }, { "epoch": 0.6790644171779141, "grad_norm": 8.28916072845459, "learning_rate": 1.8001704303366e-05, "loss": 0.0615, "step": 1771 }, { "epoch": 0.6794478527607362, "grad_norm": 6.079871654510498, "learning_rate": 1.798040051129101e-05, "loss": 0.0111, "step": 1772 }, { "epoch": 0.6798312883435583, "grad_norm": 188.42694091796875, "learning_rate": 1.795909671921602e-05, "loss": 1.5587, "step": 1773 }, { "epoch": 0.6802147239263804, "grad_norm": 20.358118057250977, "learning_rate": 1.7937792927141032e-05, "loss": 0.5034, "step": 1774 }, { "epoch": 0.6805981595092024, "grad_norm": 56.0653190612793, "learning_rate": 1.791648913506604e-05, "loss": 0.7251, "step": 1775 }, { "epoch": 0.6809815950920245, "grad_norm": 5.859439373016357, "learning_rate": 1.7895185342991054e-05, "loss": 0.0251, "step": 1776 }, { "epoch": 0.6813650306748467, "grad_norm": 13.532238006591797, "learning_rate": 1.7873881550916063e-05, "loss": 0.5747, "step": 1777 }, { "epoch": 0.6817484662576687, "grad_norm": 104.95072937011719, "learning_rate": 1.7852577758841076e-05, "loss": 1.2217, "step": 1778 
}, { "epoch": 0.6821319018404908, "grad_norm": 84.7751235961914, "learning_rate": 1.7831273966766085e-05, "loss": 5.8945, "step": 1779 }, { "epoch": 0.6825153374233128, "grad_norm": 0.8566222786903381, "learning_rate": 1.7809970174691094e-05, "loss": 0.0023, "step": 1780 }, { "epoch": 0.682898773006135, "grad_norm": 5.526364803314209, "learning_rate": 1.7788666382616107e-05, "loss": 0.0271, "step": 1781 }, { "epoch": 0.683282208588957, "grad_norm": 65.81211853027344, "learning_rate": 1.7767362590541116e-05, "loss": 1.0322, "step": 1782 }, { "epoch": 0.6836656441717791, "grad_norm": 6.39776611328125, "learning_rate": 1.774605879846613e-05, "loss": 0.0161, "step": 1783 }, { "epoch": 0.6840490797546013, "grad_norm": 2.077016592025757, "learning_rate": 1.772475500639114e-05, "loss": 0.0071, "step": 1784 }, { "epoch": 0.6844325153374233, "grad_norm": 183.04539489746094, "learning_rate": 1.770345121431615e-05, "loss": 2.599, "step": 1785 }, { "epoch": 0.6848159509202454, "grad_norm": 165.1880340576172, "learning_rate": 1.768214742224116e-05, "loss": 1.0938, "step": 1786 }, { "epoch": 0.6851993865030674, "grad_norm": 181.1474151611328, "learning_rate": 1.766084363016617e-05, "loss": 3.0854, "step": 1787 }, { "epoch": 0.6855828220858896, "grad_norm": 0.4776207506656647, "learning_rate": 1.7639539838091182e-05, "loss": 0.0018, "step": 1788 }, { "epoch": 0.6859662576687117, "grad_norm": 0.3010045289993286, "learning_rate": 1.7618236046016192e-05, "loss": 0.0013, "step": 1789 }, { "epoch": 0.6863496932515337, "grad_norm": 4.359722137451172, "learning_rate": 1.7596932253941204e-05, "loss": 0.0199, "step": 1790 }, { "epoch": 0.6867331288343558, "grad_norm": 5.467195510864258, "learning_rate": 1.7575628461866214e-05, "loss": 0.021, "step": 1791 }, { "epoch": 0.6871165644171779, "grad_norm": 126.13877868652344, "learning_rate": 1.7554324669791226e-05, "loss": 1.8438, "step": 1792 }, { "epoch": 0.6875, "grad_norm": 39.74203872680664, "learning_rate": 1.7533020877716232e-05, 
"loss": 0.0711, "step": 1793 }, { "epoch": 0.6878834355828221, "grad_norm": 1.5761584043502808, "learning_rate": 1.7511717085641245e-05, "loss": 0.0063, "step": 1794 }, { "epoch": 0.6882668711656442, "grad_norm": 6.92527961730957, "learning_rate": 1.7490413293566254e-05, "loss": 0.0504, "step": 1795 }, { "epoch": 0.6886503067484663, "grad_norm": 14.018022537231445, "learning_rate": 1.7469109501491264e-05, "loss": 0.0411, "step": 1796 }, { "epoch": 0.6890337423312883, "grad_norm": 45.85357666015625, "learning_rate": 1.7447805709416276e-05, "loss": 0.6416, "step": 1797 }, { "epoch": 0.6894171779141104, "grad_norm": 7.583892345428467, "learning_rate": 1.7426501917341286e-05, "loss": 0.4094, "step": 1798 }, { "epoch": 0.6898006134969326, "grad_norm": 9.997591972351074, "learning_rate": 1.74051981252663e-05, "loss": 0.021, "step": 1799 }, { "epoch": 0.6901840490797546, "grad_norm": 37.529869079589844, "learning_rate": 1.7383894333191308e-05, "loss": 0.207, "step": 1800 }, { "epoch": 0.6905674846625767, "grad_norm": 0.7633021473884583, "learning_rate": 1.736259054111632e-05, "loss": 0.0027, "step": 1801 }, { "epoch": 0.6909509202453987, "grad_norm": 31.646526336669922, "learning_rate": 1.734128674904133e-05, "loss": 0.1269, "step": 1802 }, { "epoch": 0.6913343558282209, "grad_norm": 0.8677123188972473, "learning_rate": 1.7319982956966342e-05, "loss": 0.0021, "step": 1803 }, { "epoch": 0.691717791411043, "grad_norm": 94.05793762207031, "learning_rate": 1.7298679164891352e-05, "loss": 0.8174, "step": 1804 }, { "epoch": 0.692101226993865, "grad_norm": 0.7277902364730835, "learning_rate": 1.727737537281636e-05, "loss": 0.0035, "step": 1805 }, { "epoch": 0.6924846625766872, "grad_norm": 1.0158634185791016, "learning_rate": 1.7256071580741374e-05, "loss": 0.0061, "step": 1806 }, { "epoch": 0.6928680981595092, "grad_norm": 8.044918060302734, "learning_rate": 1.7234767788666383e-05, "loss": 0.4128, "step": 1807 }, { "epoch": 0.6932515337423313, "grad_norm": 1.4011287689208984, 
"learning_rate": 1.7213463996591396e-05, "loss": 0.0076, "step": 1808 }, { "epoch": 0.6936349693251533, "grad_norm": 11.000798225402832, "learning_rate": 1.7192160204516405e-05, "loss": 0.1869, "step": 1809 }, { "epoch": 0.6940184049079755, "grad_norm": 0.7866613864898682, "learning_rate": 1.7170856412441418e-05, "loss": 0.0037, "step": 1810 }, { "epoch": 0.6944018404907976, "grad_norm": 76.51680755615234, "learning_rate": 1.7149552620366427e-05, "loss": 0.9775, "step": 1811 }, { "epoch": 0.6947852760736196, "grad_norm": 20.117338180541992, "learning_rate": 1.7128248828291436e-05, "loss": 0.1213, "step": 1812 }, { "epoch": 0.6951687116564417, "grad_norm": 115.73905181884766, "learning_rate": 1.710694503621645e-05, "loss": 2.1172, "step": 1813 }, { "epoch": 0.6955521472392638, "grad_norm": 80.10689544677734, "learning_rate": 1.708564124414146e-05, "loss": 0.5376, "step": 1814 }, { "epoch": 0.6959355828220859, "grad_norm": 8.569589614868164, "learning_rate": 1.7064337452066468e-05, "loss": 0.4336, "step": 1815 }, { "epoch": 0.696319018404908, "grad_norm": 337.0047607421875, "learning_rate": 1.7043033659991477e-05, "loss": 3.3758, "step": 1816 }, { "epoch": 0.69670245398773, "grad_norm": 117.974853515625, "learning_rate": 1.702172986791649e-05, "loss": 6.3911, "step": 1817 }, { "epoch": 0.6970858895705522, "grad_norm": 198.02735900878906, "learning_rate": 1.70004260758415e-05, "loss": 5.3398, "step": 1818 }, { "epoch": 0.6974693251533742, "grad_norm": 18.306970596313477, "learning_rate": 1.6979122283766512e-05, "loss": 0.1182, "step": 1819 }, { "epoch": 0.6978527607361963, "grad_norm": 1.6636066436767578, "learning_rate": 1.695781849169152e-05, "loss": 0.0093, "step": 1820 }, { "epoch": 0.6982361963190185, "grad_norm": 38.072540283203125, "learning_rate": 1.693651469961653e-05, "loss": 0.9209, "step": 1821 }, { "epoch": 0.6986196319018405, "grad_norm": 38.3408317565918, "learning_rate": 1.6915210907541543e-05, "loss": 0.6152, "step": 1822 }, { "epoch": 
0.6990030674846626, "grad_norm": 12.642842292785645, "learning_rate": 1.6893907115466552e-05, "loss": 0.0765, "step": 1823 }, { "epoch": 0.6993865030674846, "grad_norm": 7.309633255004883, "learning_rate": 1.6872603323391565e-05, "loss": 0.0517, "step": 1824 }, { "epoch": 0.6997699386503068, "grad_norm": 133.3367919921875, "learning_rate": 1.6851299531316574e-05, "loss": 1.2656, "step": 1825 }, { "epoch": 0.7001533742331288, "grad_norm": 4.016531467437744, "learning_rate": 1.6829995739241587e-05, "loss": 0.0179, "step": 1826 }, { "epoch": 0.7005368098159509, "grad_norm": 63.89706039428711, "learning_rate": 1.6808691947166596e-05, "loss": 0.9761, "step": 1827 }, { "epoch": 0.700920245398773, "grad_norm": 3.106642961502075, "learning_rate": 1.6787388155091606e-05, "loss": 0.0108, "step": 1828 }, { "epoch": 0.7013036809815951, "grad_norm": 199.66604614257812, "learning_rate": 1.676608436301662e-05, "loss": 5.8223, "step": 1829 }, { "epoch": 0.7016871165644172, "grad_norm": 7.568203926086426, "learning_rate": 1.6744780570941628e-05, "loss": 0.0156, "step": 1830 }, { "epoch": 0.7020705521472392, "grad_norm": 3.329444408416748, "learning_rate": 1.672347677886664e-05, "loss": 0.0109, "step": 1831 }, { "epoch": 0.7024539877300614, "grad_norm": 110.65080261230469, "learning_rate": 1.670217298679165e-05, "loss": 2.1547, "step": 1832 }, { "epoch": 0.7028374233128835, "grad_norm": 33.216392517089844, "learning_rate": 1.6680869194716662e-05, "loss": 0.7285, "step": 1833 }, { "epoch": 0.7032208588957055, "grad_norm": 20.46763801574707, "learning_rate": 1.6659565402641672e-05, "loss": 0.1825, "step": 1834 }, { "epoch": 0.7036042944785276, "grad_norm": 207.61228942871094, "learning_rate": 1.6638261610566684e-05, "loss": 5.3691, "step": 1835 }, { "epoch": 0.7039877300613497, "grad_norm": 10.169426918029785, "learning_rate": 1.6616957818491694e-05, "loss": 0.0364, "step": 1836 }, { "epoch": 0.7043711656441718, "grad_norm": 103.93666076660156, "learning_rate": 1.6595654026416703e-05, 
"loss": 1.1836, "step": 1837 }, { "epoch": 0.7047546012269938, "grad_norm": 83.81804656982422, "learning_rate": 1.6574350234341712e-05, "loss": 1.1963, "step": 1838 }, { "epoch": 0.7051380368098159, "grad_norm": 213.241943359375, "learning_rate": 1.655304644226672e-05, "loss": 2.1487, "step": 1839 }, { "epoch": 0.7055214723926381, "grad_norm": 1.7387597560882568, "learning_rate": 1.6531742650191734e-05, "loss": 0.0067, "step": 1840 }, { "epoch": 0.7059049079754601, "grad_norm": 183.66021728515625, "learning_rate": 1.6510438858116744e-05, "loss": 2.3087, "step": 1841 }, { "epoch": 0.7062883435582822, "grad_norm": 0.41703689098358154, "learning_rate": 1.6489135066041756e-05, "loss": 0.0019, "step": 1842 }, { "epoch": 0.7066717791411042, "grad_norm": 1.1585429906845093, "learning_rate": 1.6467831273966766e-05, "loss": 0.0027, "step": 1843 }, { "epoch": 0.7070552147239264, "grad_norm": 4.839847564697266, "learning_rate": 1.644652748189178e-05, "loss": 0.011, "step": 1844 }, { "epoch": 0.7074386503067485, "grad_norm": 87.01338195800781, "learning_rate": 1.6425223689816788e-05, "loss": 1.1348, "step": 1845 }, { "epoch": 0.7078220858895705, "grad_norm": 138.19476318359375, "learning_rate": 1.6403919897741797e-05, "loss": 2.0764, "step": 1846 }, { "epoch": 0.7082055214723927, "grad_norm": 0.5520815253257751, "learning_rate": 1.638261610566681e-05, "loss": 0.0023, "step": 1847 }, { "epoch": 0.7085889570552147, "grad_norm": 0.19649668037891388, "learning_rate": 1.636131231359182e-05, "loss": 0.0028, "step": 1848 }, { "epoch": 0.7089723926380368, "grad_norm": 12.944412231445312, "learning_rate": 1.6340008521516832e-05, "loss": 0.0554, "step": 1849 }, { "epoch": 0.7093558282208589, "grad_norm": 8.3750581741333, "learning_rate": 1.631870472944184e-05, "loss": 0.066, "step": 1850 }, { "epoch": 0.709739263803681, "grad_norm": 4.875927925109863, "learning_rate": 1.6297400937366854e-05, "loss": 0.0098, "step": 1851 }, { "epoch": 0.7101226993865031, "grad_norm": 6.181133270263672, 
"learning_rate": 1.6276097145291863e-05, "loss": 0.0357, "step": 1852 }, { "epoch": 0.7105061349693251, "grad_norm": 1.3159886598587036, "learning_rate": 1.6254793353216872e-05, "loss": 0.0032, "step": 1853 }, { "epoch": 0.7108895705521472, "grad_norm": 12.367630004882812, "learning_rate": 1.6233489561141885e-05, "loss": 0.0281, "step": 1854 }, { "epoch": 0.7112730061349694, "grad_norm": 0.16365942358970642, "learning_rate": 1.6212185769066894e-05, "loss": 0.0015, "step": 1855 }, { "epoch": 0.7116564417177914, "grad_norm": 0.538750946521759, "learning_rate": 1.6190881976991907e-05, "loss": 0.0025, "step": 1856 }, { "epoch": 0.7120398773006135, "grad_norm": 15.509056091308594, "learning_rate": 1.6169578184916916e-05, "loss": 0.5142, "step": 1857 }, { "epoch": 0.7124233128834356, "grad_norm": 0.3068455159664154, "learning_rate": 1.614827439284193e-05, "loss": 0.0024, "step": 1858 }, { "epoch": 0.7128067484662577, "grad_norm": 17.366540908813477, "learning_rate": 1.612697060076694e-05, "loss": 0.0475, "step": 1859 }, { "epoch": 0.7131901840490797, "grad_norm": 19.56414031982422, "learning_rate": 1.6105666808691948e-05, "loss": 0.4644, "step": 1860 }, { "epoch": 0.7135736196319018, "grad_norm": 138.43270874023438, "learning_rate": 1.6084363016616957e-05, "loss": 1.3018, "step": 1861 }, { "epoch": 0.713957055214724, "grad_norm": 4.688930511474609, "learning_rate": 1.606305922454197e-05, "loss": 0.3123, "step": 1862 }, { "epoch": 0.714340490797546, "grad_norm": 0.8955931067466736, "learning_rate": 1.604175543246698e-05, "loss": 0.0038, "step": 1863 }, { "epoch": 0.7147239263803681, "grad_norm": 0.4238760471343994, "learning_rate": 1.6020451640391988e-05, "loss": 0.0015, "step": 1864 }, { "epoch": 0.7151073619631901, "grad_norm": 37.68019485473633, "learning_rate": 1.5999147848317e-05, "loss": 0.5611, "step": 1865 }, { "epoch": 0.7154907975460123, "grad_norm": 1.6976826190948486, "learning_rate": 1.597784405624201e-05, "loss": 0.005, "step": 1866 }, { "epoch": 
0.7158742331288344, "grad_norm": 257.343994140625, "learning_rate": 1.5956540264167023e-05, "loss": 1.7706, "step": 1867 }, { "epoch": 0.7162576687116564, "grad_norm": 11.13989543914795, "learning_rate": 1.5935236472092032e-05, "loss": 0.039, "step": 1868 }, { "epoch": 0.7166411042944786, "grad_norm": 60.56459426879883, "learning_rate": 1.5913932680017045e-05, "loss": 0.7422, "step": 1869 }, { "epoch": 0.7170245398773006, "grad_norm": 3.938284397125244, "learning_rate": 1.5892628887942054e-05, "loss": 0.0161, "step": 1870 }, { "epoch": 0.7174079754601227, "grad_norm": 0.5595266222953796, "learning_rate": 1.5871325095867064e-05, "loss": 0.0023, "step": 1871 }, { "epoch": 0.7177914110429447, "grad_norm": 18.529457092285156, "learning_rate": 1.5850021303792076e-05, "loss": 0.4453, "step": 1872 }, { "epoch": 0.7181748466257669, "grad_norm": 36.88288116455078, "learning_rate": 1.5828717511717086e-05, "loss": 0.6768, "step": 1873 }, { "epoch": 0.718558282208589, "grad_norm": 28.450756072998047, "learning_rate": 1.58074137196421e-05, "loss": 0.5103, "step": 1874 }, { "epoch": 0.718941717791411, "grad_norm": 11.081512451171875, "learning_rate": 1.5786109927567108e-05, "loss": 0.2457, "step": 1875 }, { "epoch": 0.7193251533742331, "grad_norm": 185.0625, "learning_rate": 1.576480613549212e-05, "loss": 6.2939, "step": 1876 }, { "epoch": 0.7197085889570553, "grad_norm": 11.565069198608398, "learning_rate": 1.574350234341713e-05, "loss": 0.2269, "step": 1877 }, { "epoch": 0.7200920245398773, "grad_norm": 1.4721031188964844, "learning_rate": 1.572219855134214e-05, "loss": 0.0078, "step": 1878 }, { "epoch": 0.7204754601226994, "grad_norm": 0.3564552366733551, "learning_rate": 1.570089475926715e-05, "loss": 0.0016, "step": 1879 }, { "epoch": 0.7208588957055214, "grad_norm": 1.6091878414154053, "learning_rate": 1.567959096719216e-05, "loss": 0.0045, "step": 1880 }, { "epoch": 0.7212423312883436, "grad_norm": 24.424392700195312, "learning_rate": 1.5658287175117174e-05, "loss": 
0.563, "step": 1881 }, { "epoch": 0.7216257668711656, "grad_norm": 1.67574942111969, "learning_rate": 1.5636983383042183e-05, "loss": 0.0074, "step": 1882 }, { "epoch": 0.7220092024539877, "grad_norm": 12.126558303833008, "learning_rate": 1.5615679590967192e-05, "loss": 0.3853, "step": 1883 }, { "epoch": 0.7223926380368099, "grad_norm": 53.929473876953125, "learning_rate": 1.55943757988922e-05, "loss": 0.5855, "step": 1884 }, { "epoch": 0.7227760736196319, "grad_norm": 0.36175084114074707, "learning_rate": 1.5573072006817214e-05, "loss": 0.002, "step": 1885 }, { "epoch": 0.723159509202454, "grad_norm": 7.037576675415039, "learning_rate": 1.5551768214742224e-05, "loss": 0.0256, "step": 1886 }, { "epoch": 0.723542944785276, "grad_norm": 42.27104949951172, "learning_rate": 1.5530464422667236e-05, "loss": 0.5713, "step": 1887 }, { "epoch": 0.7239263803680982, "grad_norm": 7.669439315795898, "learning_rate": 1.5509160630592246e-05, "loss": 0.3967, "step": 1888 }, { "epoch": 0.7243098159509203, "grad_norm": 9.883378028869629, "learning_rate": 1.5487856838517255e-05, "loss": 0.1054, "step": 1889 }, { "epoch": 0.7246932515337423, "grad_norm": 5.921607494354248, "learning_rate": 1.5466553046442268e-05, "loss": 0.0218, "step": 1890 }, { "epoch": 0.7250766871165644, "grad_norm": 0.9740287065505981, "learning_rate": 1.5445249254367277e-05, "loss": 0.0042, "step": 1891 }, { "epoch": 0.7254601226993865, "grad_norm": 2.2117183208465576, "learning_rate": 1.542394546229229e-05, "loss": 0.007, "step": 1892 }, { "epoch": 0.7258435582822086, "grad_norm": 6.486509323120117, "learning_rate": 1.54026416702173e-05, "loss": 0.0383, "step": 1893 }, { "epoch": 0.7262269938650306, "grad_norm": 101.74858856201172, "learning_rate": 1.538133787814231e-05, "loss": 1.1943, "step": 1894 }, { "epoch": 0.7266104294478528, "grad_norm": 17.435060501098633, "learning_rate": 1.536003408606732e-05, "loss": 0.1314, "step": 1895 }, { "epoch": 0.7269938650306749, "grad_norm": 11.206552505493164, 
"learning_rate": 1.533873029399233e-05, "loss": 0.0237, "step": 1896 }, { "epoch": 0.7273773006134969, "grad_norm": 86.99957275390625, "learning_rate": 1.5317426501917343e-05, "loss": 0.5923, "step": 1897 }, { "epoch": 0.727760736196319, "grad_norm": 0.337234765291214, "learning_rate": 1.5296122709842352e-05, "loss": 0.0026, "step": 1898 }, { "epoch": 0.7281441717791411, "grad_norm": 5.304370403289795, "learning_rate": 1.5274818917767365e-05, "loss": 0.0127, "step": 1899 }, { "epoch": 0.7285276073619632, "grad_norm": 0.41007453203201294, "learning_rate": 1.5253515125692374e-05, "loss": 0.0018, "step": 1900 }, { "epoch": 0.7289110429447853, "grad_norm": 1.1569985151290894, "learning_rate": 1.5232211333617385e-05, "loss": 0.0064, "step": 1901 }, { "epoch": 0.7292944785276073, "grad_norm": 321.8655700683594, "learning_rate": 1.5210907541542396e-05, "loss": 6.2959, "step": 1902 }, { "epoch": 0.7296779141104295, "grad_norm": 1.9195574522018433, "learning_rate": 1.5189603749467407e-05, "loss": 0.0049, "step": 1903 }, { "epoch": 0.7300613496932515, "grad_norm": 17.68069076538086, "learning_rate": 1.5168299957392418e-05, "loss": 0.2111, "step": 1904 }, { "epoch": 0.7304447852760736, "grad_norm": 0.6155018210411072, "learning_rate": 1.514699616531743e-05, "loss": 0.0021, "step": 1905 }, { "epoch": 0.7308282208588958, "grad_norm": 5.575611114501953, "learning_rate": 1.5125692373242437e-05, "loss": 0.0278, "step": 1906 }, { "epoch": 0.7312116564417178, "grad_norm": 78.27565002441406, "learning_rate": 1.5104388581167448e-05, "loss": 0.8199, "step": 1907 }, { "epoch": 0.7315950920245399, "grad_norm": 0.3089018762111664, "learning_rate": 1.5083084789092459e-05, "loss": 0.0022, "step": 1908 }, { "epoch": 0.7319785276073619, "grad_norm": 6.002476692199707, "learning_rate": 1.5061780997017468e-05, "loss": 0.4094, "step": 1909 }, { "epoch": 0.7323619631901841, "grad_norm": 198.84535217285156, "learning_rate": 1.504047720494248e-05, "loss": 3.6476, "step": 1910 }, { "epoch": 
0.7327453987730062, "grad_norm": 10.360122680664062, "learning_rate": 1.501917341286749e-05, "loss": 0.4385, "step": 1911 }, { "epoch": 0.7331288343558282, "grad_norm": 19.18494987487793, "learning_rate": 1.4997869620792501e-05, "loss": 0.0489, "step": 1912 }, { "epoch": 0.7335122699386503, "grad_norm": 0.40411585569381714, "learning_rate": 1.4976565828717512e-05, "loss": 0.0019, "step": 1913 }, { "epoch": 0.7338957055214724, "grad_norm": 5.745697021484375, "learning_rate": 1.4955262036642523e-05, "loss": 0.0181, "step": 1914 }, { "epoch": 0.7342791411042945, "grad_norm": 5.817564487457275, "learning_rate": 1.4933958244567534e-05, "loss": 0.0186, "step": 1915 }, { "epoch": 0.7346625766871165, "grad_norm": 0.47785699367523193, "learning_rate": 1.4912654452492544e-05, "loss": 0.0024, "step": 1916 }, { "epoch": 0.7350460122699386, "grad_norm": 0.995089590549469, "learning_rate": 1.4891350660417555e-05, "loss": 0.0047, "step": 1917 }, { "epoch": 0.7354294478527608, "grad_norm": 4.376603126525879, "learning_rate": 1.4870046868342566e-05, "loss": 0.3303, "step": 1918 }, { "epoch": 0.7358128834355828, "grad_norm": 2.5001165866851807, "learning_rate": 1.4848743076267577e-05, "loss": 0.0135, "step": 1919 }, { "epoch": 0.7361963190184049, "grad_norm": 1.037470817565918, "learning_rate": 1.4827439284192588e-05, "loss": 0.0028, "step": 1920 }, { "epoch": 0.736579754601227, "grad_norm": 33.8574333190918, "learning_rate": 1.4806135492117599e-05, "loss": 0.6431, "step": 1921 }, { "epoch": 0.7369631901840491, "grad_norm": 11.763459205627441, "learning_rate": 1.478483170004261e-05, "loss": 0.0238, "step": 1922 }, { "epoch": 0.7373466257668712, "grad_norm": 8.176692962646484, "learning_rate": 1.4763527907967619e-05, "loss": 0.0393, "step": 1923 }, { "epoch": 0.7377300613496932, "grad_norm": 95.25994873046875, "learning_rate": 1.474222411589263e-05, "loss": 1.3507, "step": 1924 }, { "epoch": 0.7381134969325154, "grad_norm": 6.803265571594238, "learning_rate": 1.4720920323817641e-05, 
"loss": 0.0887, "step": 1925 }, { "epoch": 0.7384969325153374, "grad_norm": 235.39866638183594, "learning_rate": 1.4699616531742652e-05, "loss": 2.6448, "step": 1926 }, { "epoch": 0.7388803680981595, "grad_norm": 157.25706481933594, "learning_rate": 1.4678312739667663e-05, "loss": 2.5195, "step": 1927 }, { "epoch": 0.7392638036809815, "grad_norm": 105.20527648925781, "learning_rate": 1.465700894759267e-05, "loss": 6.0176, "step": 1928 }, { "epoch": 0.7396472392638037, "grad_norm": 101.31459045410156, "learning_rate": 1.4635705155517682e-05, "loss": 0.9268, "step": 1929 }, { "epoch": 0.7400306748466258, "grad_norm": 74.20117950439453, "learning_rate": 1.4614401363442693e-05, "loss": 1.3682, "step": 1930 }, { "epoch": 0.7404141104294478, "grad_norm": 1.142648458480835, "learning_rate": 1.4593097571367704e-05, "loss": 0.003, "step": 1931 }, { "epoch": 0.74079754601227, "grad_norm": 0.8568863868713379, "learning_rate": 1.4571793779292715e-05, "loss": 0.0027, "step": 1932 }, { "epoch": 0.741180981595092, "grad_norm": 2.354698896408081, "learning_rate": 1.4550489987217724e-05, "loss": 0.0083, "step": 1933 }, { "epoch": 0.7415644171779141, "grad_norm": 137.51414489746094, "learning_rate": 1.4529186195142735e-05, "loss": 6.1094, "step": 1934 }, { "epoch": 0.7419478527607362, "grad_norm": 0.9444566369056702, "learning_rate": 1.4507882403067746e-05, "loss": 0.0025, "step": 1935 }, { "epoch": 0.7423312883435583, "grad_norm": 0.28314322233200073, "learning_rate": 1.4486578610992757e-05, "loss": 0.0026, "step": 1936 }, { "epoch": 0.7427147239263804, "grad_norm": 24.82479476928711, "learning_rate": 1.4465274818917768e-05, "loss": 0.5044, "step": 1937 }, { "epoch": 0.7430981595092024, "grad_norm": 12.466592788696289, "learning_rate": 1.4443971026842779e-05, "loss": 0.4556, "step": 1938 }, { "epoch": 0.7434815950920245, "grad_norm": 13.750520706176758, "learning_rate": 1.442266723476779e-05, "loss": 0.1217, "step": 1939 }, { "epoch": 0.7438650306748467, "grad_norm": 
0.13975299894809723, "learning_rate": 1.44013634426928e-05, "loss": 0.0011, "step": 1940 }, { "epoch": 0.7442484662576687, "grad_norm": 6.503437519073486, "learning_rate": 1.438005965061781e-05, "loss": 0.0409, "step": 1941 }, { "epoch": 0.7446319018404908, "grad_norm": 9.731389999389648, "learning_rate": 1.4358755858542821e-05, "loss": 0.0243, "step": 1942 }, { "epoch": 0.7450153374233128, "grad_norm": 13.670307159423828, "learning_rate": 1.4337452066467832e-05, "loss": 0.1563, "step": 1943 }, { "epoch": 0.745398773006135, "grad_norm": 1.3887224197387695, "learning_rate": 1.4316148274392843e-05, "loss": 0.006, "step": 1944 }, { "epoch": 0.745782208588957, "grad_norm": 19.59745216369629, "learning_rate": 1.4294844482317854e-05, "loss": 0.6211, "step": 1945 }, { "epoch": 0.7461656441717791, "grad_norm": 0.4769805371761322, "learning_rate": 1.4273540690242865e-05, "loss": 0.0017, "step": 1946 }, { "epoch": 0.7465490797546013, "grad_norm": 5.039165019989014, "learning_rate": 1.4252236898167876e-05, "loss": 0.0082, "step": 1947 }, { "epoch": 0.7469325153374233, "grad_norm": 1.004677414894104, "learning_rate": 1.4230933106092886e-05, "loss": 0.0035, "step": 1948 }, { "epoch": 0.7473159509202454, "grad_norm": 0.9467623233795166, "learning_rate": 1.4209629314017897e-05, "loss": 0.0019, "step": 1949 }, { "epoch": 0.7476993865030674, "grad_norm": 8.143989562988281, "learning_rate": 1.4188325521942908e-05, "loss": 0.0665, "step": 1950 }, { "epoch": 0.7480828220858896, "grad_norm": 0.25524386763572693, "learning_rate": 1.4167021729867915e-05, "loss": 0.0008, "step": 1951 }, { "epoch": 0.7484662576687117, "grad_norm": 223.53726196289062, "learning_rate": 1.4145717937792926e-05, "loss": 1.3477, "step": 1952 }, { "epoch": 0.7488496932515337, "grad_norm": 214.31077575683594, "learning_rate": 1.4124414145717937e-05, "loss": 2.3165, "step": 1953 }, { "epoch": 0.7492331288343558, "grad_norm": 15.331868171691895, "learning_rate": 1.4103110353642948e-05, "loss": 0.4204, "step": 1954 
}, { "epoch": 0.7496165644171779, "grad_norm": 139.1454315185547, "learning_rate": 1.408180656156796e-05, "loss": 6.4492, "step": 1955 }, { "epoch": 0.75, "grad_norm": 5.3760151863098145, "learning_rate": 1.406050276949297e-05, "loss": 0.0166, "step": 1956 }, { "epoch": 0.7503834355828221, "grad_norm": 2.1321780681610107, "learning_rate": 1.4039198977417981e-05, "loss": 0.0044, "step": 1957 }, { "epoch": 0.7507668711656442, "grad_norm": 76.46648406982422, "learning_rate": 1.401789518534299e-05, "loss": 0.9727, "step": 1958 }, { "epoch": 0.7511503067484663, "grad_norm": 151.1408233642578, "learning_rate": 1.3996591393268002e-05, "loss": 1.4248, "step": 1959 }, { "epoch": 0.7515337423312883, "grad_norm": 24.7465877532959, "learning_rate": 1.3975287601193013e-05, "loss": 0.4456, "step": 1960 }, { "epoch": 0.7519171779141104, "grad_norm": 11.28043270111084, "learning_rate": 1.3953983809118024e-05, "loss": 0.1874, "step": 1961 }, { "epoch": 0.7523006134969326, "grad_norm": 226.95289611816406, "learning_rate": 1.3932680017043035e-05, "loss": 1.3262, "step": 1962 }, { "epoch": 0.7526840490797546, "grad_norm": 0.2464362531900406, "learning_rate": 1.3911376224968046e-05, "loss": 0.0022, "step": 1963 }, { "epoch": 0.7530674846625767, "grad_norm": 12.830216407775879, "learning_rate": 1.3890072432893057e-05, "loss": 0.0542, "step": 1964 }, { "epoch": 0.7534509202453987, "grad_norm": 149.88232421875, "learning_rate": 1.3868768640818066e-05, "loss": 2.9808, "step": 1965 }, { "epoch": 0.7538343558282209, "grad_norm": 18.341276168823242, "learning_rate": 1.3847464848743077e-05, "loss": 0.1652, "step": 1966 }, { "epoch": 0.754217791411043, "grad_norm": 29.104957580566406, "learning_rate": 1.3826161056668088e-05, "loss": 0.0528, "step": 1967 }, { "epoch": 0.754601226993865, "grad_norm": 78.68212890625, "learning_rate": 1.3804857264593099e-05, "loss": 0.9668, "step": 1968 }, { "epoch": 0.7549846625766872, "grad_norm": 15.954132080078125, "learning_rate": 1.378355347251811e-05, 
"loss": 0.0438, "step": 1969 }, { "epoch": 0.7553680981595092, "grad_norm": 7.746146202087402, "learning_rate": 1.3762249680443121e-05, "loss": 0.0301, "step": 1970 }, { "epoch": 0.7557515337423313, "grad_norm": 0.9864992499351501, "learning_rate": 1.3740945888368132e-05, "loss": 0.0029, "step": 1971 }, { "epoch": 0.7561349693251533, "grad_norm": 197.49974060058594, "learning_rate": 1.3719642096293141e-05, "loss": 5.2236, "step": 1972 }, { "epoch": 0.7565184049079755, "grad_norm": 0.28314509987831116, "learning_rate": 1.369833830421815e-05, "loss": 0.0017, "step": 1973 }, { "epoch": 0.7569018404907976, "grad_norm": 1.3988046646118164, "learning_rate": 1.3677034512143162e-05, "loss": 0.0048, "step": 1974 }, { "epoch": 0.7572852760736196, "grad_norm": 9.20186996459961, "learning_rate": 1.3655730720068171e-05, "loss": 0.0245, "step": 1975 }, { "epoch": 0.7576687116564417, "grad_norm": 11.838817596435547, "learning_rate": 1.3634426927993182e-05, "loss": 0.0832, "step": 1976 }, { "epoch": 0.7580521472392638, "grad_norm": 138.5713348388672, "learning_rate": 1.3613123135918193e-05, "loss": 6.6934, "step": 1977 }, { "epoch": 0.7584355828220859, "grad_norm": 3.0601112842559814, "learning_rate": 1.3591819343843204e-05, "loss": 0.0071, "step": 1978 }, { "epoch": 0.758819018404908, "grad_norm": 0.6360490918159485, "learning_rate": 1.3570515551768215e-05, "loss": 0.0027, "step": 1979 }, { "epoch": 0.75920245398773, "grad_norm": 162.20603942871094, "learning_rate": 1.3549211759693226e-05, "loss": 5.6367, "step": 1980 }, { "epoch": 0.7595858895705522, "grad_norm": 11.104833602905273, "learning_rate": 1.3527907967618237e-05, "loss": 0.1571, "step": 1981 }, { "epoch": 0.7599693251533742, "grad_norm": 12.747593879699707, "learning_rate": 1.3506604175543246e-05, "loss": 0.1888, "step": 1982 }, { "epoch": 0.7603527607361963, "grad_norm": 199.33963012695312, "learning_rate": 1.3485300383468257e-05, "loss": 2.0549, "step": 1983 }, { "epoch": 0.7607361963190185, "grad_norm": 
43.54447555541992, "learning_rate": 1.3463996591393268e-05, "loss": 0.8364, "step": 1984 }, { "epoch": 0.7611196319018405, "grad_norm": 2.030282974243164, "learning_rate": 1.344269279931828e-05, "loss": 0.0062, "step": 1985 }, { "epoch": 0.7615030674846626, "grad_norm": 21.328853607177734, "learning_rate": 1.342138900724329e-05, "loss": 0.4646, "step": 1986 }, { "epoch": 0.7618865030674846, "grad_norm": 194.71197509765625, "learning_rate": 1.3400085215168301e-05, "loss": 6.418, "step": 1987 }, { "epoch": 0.7622699386503068, "grad_norm": 1.1269176006317139, "learning_rate": 1.3378781423093312e-05, "loss": 0.0029, "step": 1988 }, { "epoch": 0.7626533742331288, "grad_norm": 15.9646635055542, "learning_rate": 1.3357477631018323e-05, "loss": 0.4321, "step": 1989 }, { "epoch": 0.7630368098159509, "grad_norm": 7.329847812652588, "learning_rate": 1.3336173838943333e-05, "loss": 0.027, "step": 1990 }, { "epoch": 0.763420245398773, "grad_norm": 71.15471649169922, "learning_rate": 1.3314870046868344e-05, "loss": 0.6953, "step": 1991 }, { "epoch": 0.7638036809815951, "grad_norm": 7.559596538543701, "learning_rate": 1.3293566254793355e-05, "loss": 0.0185, "step": 1992 }, { "epoch": 0.7641871165644172, "grad_norm": 122.88265228271484, "learning_rate": 1.3272262462718366e-05, "loss": 6.1523, "step": 1993 }, { "epoch": 0.7645705521472392, "grad_norm": 0.2372620701789856, "learning_rate": 1.3250958670643377e-05, "loss": 0.0013, "step": 1994 }, { "epoch": 0.7649539877300614, "grad_norm": 1.6263588666915894, "learning_rate": 1.3229654878568388e-05, "loss": 0.0038, "step": 1995 }, { "epoch": 0.7653374233128835, "grad_norm": 27.143592834472656, "learning_rate": 1.3208351086493395e-05, "loss": 0.429, "step": 1996 }, { "epoch": 0.7657208588957055, "grad_norm": 3.7754852771759033, "learning_rate": 1.3187047294418406e-05, "loss": 0.0113, "step": 1997 }, { "epoch": 0.7661042944785276, "grad_norm": 12.764897346496582, "learning_rate": 1.3165743502343417e-05, "loss": 0.0658, "step": 1998 }, { 
"epoch": 0.7664877300613497, "grad_norm": 1.2359267473220825, "learning_rate": 1.3144439710268428e-05, "loss": 0.0053, "step": 1999 }, { "epoch": 0.7668711656441718, "grad_norm": 49.90631103515625, "learning_rate": 1.3123135918193438e-05, "loss": 0.5952, "step": 2000 }, { "epoch": 0.7672546012269938, "grad_norm": 253.21788024902344, "learning_rate": 1.3101832126118449e-05, "loss": 2.6388, "step": 2001 }, { "epoch": 0.7676380368098159, "grad_norm": 11.39282512664795, "learning_rate": 1.308052833404346e-05, "loss": 0.0591, "step": 2002 }, { "epoch": 0.7680214723926381, "grad_norm": 14.903972625732422, "learning_rate": 1.305922454196847e-05, "loss": 0.0207, "step": 2003 }, { "epoch": 0.7684049079754601, "grad_norm": 130.27838134765625, "learning_rate": 1.3037920749893482e-05, "loss": 6.0723, "step": 2004 }, { "epoch": 0.7687883435582822, "grad_norm": 20.10736083984375, "learning_rate": 1.3016616957818493e-05, "loss": 0.542, "step": 2005 }, { "epoch": 0.7691717791411042, "grad_norm": 133.43751525878906, "learning_rate": 1.2995313165743504e-05, "loss": 6.8203, "step": 2006 }, { "epoch": 0.7695552147239264, "grad_norm": 37.334869384765625, "learning_rate": 1.2974009373668513e-05, "loss": 0.6221, "step": 2007 }, { "epoch": 0.7699386503067485, "grad_norm": 2.647223949432373, "learning_rate": 1.2952705581593524e-05, "loss": 0.0089, "step": 2008 }, { "epoch": 0.7703220858895705, "grad_norm": 172.29574584960938, "learning_rate": 1.2931401789518535e-05, "loss": 6.0957, "step": 2009 }, { "epoch": 0.7707055214723927, "grad_norm": 2.514612913131714, "learning_rate": 1.2910097997443546e-05, "loss": 0.0052, "step": 2010 }, { "epoch": 0.7710889570552147, "grad_norm": 13.704341888427734, "learning_rate": 1.2888794205368557e-05, "loss": 0.4009, "step": 2011 }, { "epoch": 0.7714723926380368, "grad_norm": 59.756534576416016, "learning_rate": 1.2867490413293568e-05, "loss": 0.5493, "step": 2012 }, { "epoch": 0.7718558282208589, "grad_norm": 8.976195335388184, "learning_rate": 
1.2846186621218579e-05, "loss": 0.0307, "step": 2013 }, { "epoch": 0.772239263803681, "grad_norm": 17.59994125366211, "learning_rate": 1.2824882829143588e-05, "loss": 0.5464, "step": 2014 }, { "epoch": 0.7726226993865031, "grad_norm": 5.109958648681641, "learning_rate": 1.28035790370686e-05, "loss": 0.0102, "step": 2015 }, { "epoch": 0.7730061349693251, "grad_norm": 134.91262817382812, "learning_rate": 1.278227524499361e-05, "loss": 6.0898, "step": 2016 }, { "epoch": 0.7733895705521472, "grad_norm": 3.405291795730591, "learning_rate": 1.2760971452918621e-05, "loss": 0.0038, "step": 2017 }, { "epoch": 0.7737730061349694, "grad_norm": 69.19807434082031, "learning_rate": 1.2739667660843629e-05, "loss": 0.667, "step": 2018 }, { "epoch": 0.7741564417177914, "grad_norm": 0.4252760112285614, "learning_rate": 1.271836386876864e-05, "loss": 0.0025, "step": 2019 }, { "epoch": 0.7745398773006135, "grad_norm": 20.130861282348633, "learning_rate": 1.2697060076693651e-05, "loss": 0.1615, "step": 2020 }, { "epoch": 0.7749233128834356, "grad_norm": 0.6197576522827148, "learning_rate": 1.2675756284618662e-05, "loss": 0.0023, "step": 2021 }, { "epoch": 0.7753067484662577, "grad_norm": 8.245320320129395, "learning_rate": 1.2654452492543673e-05, "loss": 0.0094, "step": 2022 }, { "epoch": 0.7756901840490797, "grad_norm": 3.970463752746582, "learning_rate": 1.2633148700468684e-05, "loss": 0.0144, "step": 2023 }, { "epoch": 0.7760736196319018, "grad_norm": 206.5851287841797, "learning_rate": 1.2611844908393693e-05, "loss": 2.6232, "step": 2024 }, { "epoch": 0.776457055214724, "grad_norm": 62.07779312133789, "learning_rate": 1.2590541116318704e-05, "loss": 0.8525, "step": 2025 }, { "epoch": 0.776840490797546, "grad_norm": 98.0234375, "learning_rate": 1.2569237324243715e-05, "loss": 1.0723, "step": 2026 }, { "epoch": 0.7772239263803681, "grad_norm": 24.69891357421875, "learning_rate": 1.2547933532168726e-05, "loss": 0.5298, "step": 2027 }, { "epoch": 0.7776073619631901, "grad_norm": 
37.78566360473633, "learning_rate": 1.2526629740093737e-05, "loss": 0.8819, "step": 2028 }, { "epoch": 0.7779907975460123, "grad_norm": 7.116847038269043, "learning_rate": 1.2505325948018748e-05, "loss": 0.3687, "step": 2029 }, { "epoch": 0.7783742331288344, "grad_norm": 8.491434097290039, "learning_rate": 1.248402215594376e-05, "loss": 0.0429, "step": 2030 }, { "epoch": 0.7787576687116564, "grad_norm": 1.4491527080535889, "learning_rate": 1.246271836386877e-05, "loss": 0.0033, "step": 2031 }, { "epoch": 0.7791411042944786, "grad_norm": 2.5249433517456055, "learning_rate": 1.244141457179378e-05, "loss": 0.0051, "step": 2032 }, { "epoch": 0.7795245398773006, "grad_norm": 57.93564224243164, "learning_rate": 1.242011077971879e-05, "loss": 0.6123, "step": 2033 }, { "epoch": 0.7799079754601227, "grad_norm": 1.0227478742599487, "learning_rate": 1.2398806987643802e-05, "loss": 0.0024, "step": 2034 }, { "epoch": 0.7802914110429447, "grad_norm": 0.39101430773735046, "learning_rate": 1.2377503195568811e-05, "loss": 0.0028, "step": 2035 }, { "epoch": 0.7806748466257669, "grad_norm": 38.51777267456055, "learning_rate": 1.2356199403493822e-05, "loss": 0.7461, "step": 2036 }, { "epoch": 0.781058282208589, "grad_norm": 2.0663833618164062, "learning_rate": 1.2334895611418833e-05, "loss": 0.0059, "step": 2037 }, { "epoch": 0.781441717791411, "grad_norm": 0.12430649995803833, "learning_rate": 1.2313591819343844e-05, "loss": 0.0014, "step": 2038 }, { "epoch": 0.7818251533742331, "grad_norm": 1.2589157819747925, "learning_rate": 1.2292288027268855e-05, "loss": 0.0031, "step": 2039 }, { "epoch": 0.7822085889570553, "grad_norm": 3.417670726776123, "learning_rate": 1.2270984235193864e-05, "loss": 0.0119, "step": 2040 }, { "epoch": 0.7825920245398773, "grad_norm": 16.39604949951172, "learning_rate": 1.2249680443118875e-05, "loss": 0.1658, "step": 2041 }, { "epoch": 0.7829754601226994, "grad_norm": 0.9449207782745361, "learning_rate": 1.2228376651043886e-05, "loss": 0.0032, "step": 2042 }, 
{ "epoch": 0.7833588957055214, "grad_norm": 106.27434539794922, "learning_rate": 1.2207072858968897e-05, "loss": 0.7173, "step": 2043 }, { "epoch": 0.7837423312883436, "grad_norm": 12.998677253723145, "learning_rate": 1.2185769066893908e-05, "loss": 0.0501, "step": 2044 }, { "epoch": 0.7841257668711656, "grad_norm": 2.242809534072876, "learning_rate": 1.216446527481892e-05, "loss": 0.0047, "step": 2045 }, { "epoch": 0.7845092024539877, "grad_norm": 299.3255615234375, "learning_rate": 1.214316148274393e-05, "loss": 5.7402, "step": 2046 }, { "epoch": 0.7848926380368099, "grad_norm": 0.0981670394539833, "learning_rate": 1.212185769066894e-05, "loss": 0.0011, "step": 2047 }, { "epoch": 0.7852760736196319, "grad_norm": 109.51537322998047, "learning_rate": 1.210055389859395e-05, "loss": 1.4248, "step": 2048 }, { "epoch": 0.785659509202454, "grad_norm": 243.7086944580078, "learning_rate": 1.207925010651896e-05, "loss": 2.7435, "step": 2049 }, { "epoch": 0.786042944785276, "grad_norm": 1.342392086982727, "learning_rate": 1.2057946314443971e-05, "loss": 0.0056, "step": 2050 }, { "epoch": 0.7864263803680982, "grad_norm": 12.527939796447754, "learning_rate": 1.2036642522368982e-05, "loss": 0.406, "step": 2051 }, { "epoch": 0.7868098159509203, "grad_norm": 149.59002685546875, "learning_rate": 1.2015338730293993e-05, "loss": 1.8253, "step": 2052 }, { "epoch": 0.7871932515337423, "grad_norm": 2.531747817993164, "learning_rate": 1.1994034938219004e-05, "loss": 0.0104, "step": 2053 }, { "epoch": 0.7875766871165644, "grad_norm": 0.16375118494033813, "learning_rate": 1.1972731146144015e-05, "loss": 0.0012, "step": 2054 }, { "epoch": 0.7879601226993865, "grad_norm": 7.788699150085449, "learning_rate": 1.1951427354069026e-05, "loss": 0.3816, "step": 2055 }, { "epoch": 0.7883435582822086, "grad_norm": 36.510345458984375, "learning_rate": 1.1930123561994035e-05, "loss": 0.1997, "step": 2056 }, { "epoch": 0.7887269938650306, "grad_norm": 3.886620044708252, "learning_rate": 
1.1908819769919046e-05, "loss": 0.0128, "step": 2057 }, { "epoch": 0.7891104294478528, "grad_norm": 0.6223607063293457, "learning_rate": 1.1887515977844056e-05, "loss": 0.0023, "step": 2058 }, { "epoch": 0.7894938650306749, "grad_norm": 1.6074047088623047, "learning_rate": 1.1866212185769067e-05, "loss": 0.0034, "step": 2059 }, { "epoch": 0.7898773006134969, "grad_norm": 19.636877059936523, "learning_rate": 1.1844908393694078e-05, "loss": 0.0944, "step": 2060 }, { "epoch": 0.790260736196319, "grad_norm": 7.295237064361572, "learning_rate": 1.1823604601619089e-05, "loss": 0.0228, "step": 2061 }, { "epoch": 0.7906441717791411, "grad_norm": 15.375859260559082, "learning_rate": 1.18023008095441e-05, "loss": 0.0322, "step": 2062 }, { "epoch": 0.7910276073619632, "grad_norm": 116.54905700683594, "learning_rate": 1.178099701746911e-05, "loss": 6.8379, "step": 2063 }, { "epoch": 0.7914110429447853, "grad_norm": 4.101083755493164, "learning_rate": 1.1759693225394122e-05, "loss": 0.016, "step": 2064 }, { "epoch": 0.7917944785276073, "grad_norm": 290.10479736328125, "learning_rate": 1.1738389433319131e-05, "loss": 5.3223, "step": 2065 }, { "epoch": 0.7921779141104295, "grad_norm": 10.354349136352539, "learning_rate": 1.1717085641244142e-05, "loss": 0.0312, "step": 2066 }, { "epoch": 0.7925613496932515, "grad_norm": 55.29102325439453, "learning_rate": 1.1695781849169153e-05, "loss": 0.6509, "step": 2067 }, { "epoch": 0.7929447852760736, "grad_norm": 37.20539855957031, "learning_rate": 1.1674478057094164e-05, "loss": 0.6289, "step": 2068 }, { "epoch": 0.7933282208588958, "grad_norm": 19.556257247924805, "learning_rate": 1.1653174265019173e-05, "loss": 0.1942, "step": 2069 }, { "epoch": 0.7937116564417178, "grad_norm": 76.98854064941406, "learning_rate": 1.1631870472944184e-05, "loss": 0.8726, "step": 2070 }, { "epoch": 0.7940950920245399, "grad_norm": 10.885735511779785, "learning_rate": 1.1610566680869195e-05, "loss": 0.2725, "step": 2071 }, { "epoch": 0.7944785276073619, 
"grad_norm": 259.9325256347656, "learning_rate": 1.1589262888794206e-05, "loss": 1.6543, "step": 2072 }, { "epoch": 0.7948619631901841, "grad_norm": 23.60029411315918, "learning_rate": 1.1567959096719216e-05, "loss": 0.0136, "step": 2073 }, { "epoch": 0.7952453987730062, "grad_norm": 11.054368019104004, "learning_rate": 1.1546655304644227e-05, "loss": 0.428, "step": 2074 }, { "epoch": 0.7956288343558282, "grad_norm": 6.6344451904296875, "learning_rate": 1.1525351512569238e-05, "loss": 0.0208, "step": 2075 }, { "epoch": 0.7960122699386503, "grad_norm": 131.8979034423828, "learning_rate": 1.1504047720494249e-05, "loss": 2.4476, "step": 2076 }, { "epoch": 0.7963957055214724, "grad_norm": 138.93003845214844, "learning_rate": 1.148274392841926e-05, "loss": 6.1064, "step": 2077 }, { "epoch": 0.7967791411042945, "grad_norm": 72.7630844116211, "learning_rate": 1.146144013634427e-05, "loss": 1.6818, "step": 2078 }, { "epoch": 0.7971625766871165, "grad_norm": 0.30345648527145386, "learning_rate": 1.1440136344269282e-05, "loss": 0.0009, "step": 2079 }, { "epoch": 0.7975460122699386, "grad_norm": 80.4941635131836, "learning_rate": 1.1418832552194293e-05, "loss": 0.544, "step": 2080 }, { "epoch": 0.7979294478527608, "grad_norm": 21.229745864868164, "learning_rate": 1.1397528760119302e-05, "loss": 0.4605, "step": 2081 }, { "epoch": 0.7983128834355828, "grad_norm": 72.00467681884766, "learning_rate": 1.1376224968044311e-05, "loss": 0.8608, "step": 2082 }, { "epoch": 0.7986963190184049, "grad_norm": 57.26301193237305, "learning_rate": 1.1354921175969322e-05, "loss": 0.5435, "step": 2083 }, { "epoch": 0.799079754601227, "grad_norm": 107.73514556884766, "learning_rate": 1.1333617383894333e-05, "loss": 1.4268, "step": 2084 }, { "epoch": 0.7994631901840491, "grad_norm": 91.19815826416016, "learning_rate": 1.1312313591819344e-05, "loss": 0.7488, "step": 2085 }, { "epoch": 0.7998466257668712, "grad_norm": 1.1510038375854492, "learning_rate": 1.1291009799744355e-05, "loss": 0.005, 
"step": 2086 }, { "epoch": 0.8002300613496932, "grad_norm": 2.4116382598876953, "learning_rate": 1.1269706007669366e-05, "loss": 0.0055, "step": 2087 }, { "epoch": 0.8006134969325154, "grad_norm": 0.2294168919324875, "learning_rate": 1.1248402215594377e-05, "loss": 0.0013, "step": 2088 }, { "epoch": 0.8009969325153374, "grad_norm": 0.37558597326278687, "learning_rate": 1.1227098423519387e-05, "loss": 0.002, "step": 2089 }, { "epoch": 0.8013803680981595, "grad_norm": 325.6514892578125, "learning_rate": 1.1205794631444398e-05, "loss": 5.7324, "step": 2090 }, { "epoch": 0.8017638036809815, "grad_norm": 8.776749610900879, "learning_rate": 1.1184490839369409e-05, "loss": 0.0286, "step": 2091 }, { "epoch": 0.8021472392638037, "grad_norm": 78.64884185791016, "learning_rate": 1.1163187047294418e-05, "loss": 0.8096, "step": 2092 }, { "epoch": 0.8025306748466258, "grad_norm": 48.145835876464844, "learning_rate": 1.1141883255219429e-05, "loss": 0.8326, "step": 2093 }, { "epoch": 0.8029141104294478, "grad_norm": 159.64549255371094, "learning_rate": 1.112057946314444e-05, "loss": 6.3145, "step": 2094 }, { "epoch": 0.80329754601227, "grad_norm": 4.064866065979004, "learning_rate": 1.1099275671069451e-05, "loss": 0.0089, "step": 2095 }, { "epoch": 0.803680981595092, "grad_norm": 5.229132652282715, "learning_rate": 1.1077971878994462e-05, "loss": 0.0324, "step": 2096 }, { "epoch": 0.8040644171779141, "grad_norm": 0.9916634559631348, "learning_rate": 1.1056668086919473e-05, "loss": 0.0044, "step": 2097 }, { "epoch": 0.8044478527607362, "grad_norm": 4.942611217498779, "learning_rate": 1.1035364294844482e-05, "loss": 0.0221, "step": 2098 }, { "epoch": 0.8048312883435583, "grad_norm": 45.87492370605469, "learning_rate": 1.1014060502769493e-05, "loss": 0.7393, "step": 2099 }, { "epoch": 0.8052147239263804, "grad_norm": 90.27005767822266, "learning_rate": 1.0992756710694504e-05, "loss": 1.5313, "step": 2100 }, { "epoch": 0.8055981595092024, "grad_norm": 0.21876457333564758, 
"learning_rate": 1.0971452918619515e-05, "loss": 0.0013, "step": 2101 }, { "epoch": 0.8059815950920245, "grad_norm": 0.13131088018417358, "learning_rate": 1.0950149126544526e-05, "loss": 0.0011, "step": 2102 }, { "epoch": 0.8063650306748467, "grad_norm": 1.5603348016738892, "learning_rate": 1.0928845334469536e-05, "loss": 0.0047, "step": 2103 }, { "epoch": 0.8067484662576687, "grad_norm": 0.4653047025203705, "learning_rate": 1.0907541542394547e-05, "loss": 0.0027, "step": 2104 }, { "epoch": 0.8071319018404908, "grad_norm": 0.2466188371181488, "learning_rate": 1.0886237750319558e-05, "loss": 0.001, "step": 2105 }, { "epoch": 0.8075153374233128, "grad_norm": 2.751034736633301, "learning_rate": 1.0864933958244569e-05, "loss": 0.0074, "step": 2106 }, { "epoch": 0.807898773006135, "grad_norm": 16.325443267822266, "learning_rate": 1.0843630166169578e-05, "loss": 0.0557, "step": 2107 }, { "epoch": 0.808282208588957, "grad_norm": 1.1431059837341309, "learning_rate": 1.0822326374094589e-05, "loss": 0.0033, "step": 2108 }, { "epoch": 0.8086656441717791, "grad_norm": 5.046448707580566, "learning_rate": 1.08010225820196e-05, "loss": 0.0111, "step": 2109 }, { "epoch": 0.8090490797546013, "grad_norm": 0.3879411518573761, "learning_rate": 1.0779718789944611e-05, "loss": 0.0015, "step": 2110 }, { "epoch": 0.8094325153374233, "grad_norm": 7.425095081329346, "learning_rate": 1.0758414997869622e-05, "loss": 0.3953, "step": 2111 }, { "epoch": 0.8098159509202454, "grad_norm": 15.652329444885254, "learning_rate": 1.0737111205794633e-05, "loss": 0.0942, "step": 2112 }, { "epoch": 0.8101993865030674, "grad_norm": 9.081522941589355, "learning_rate": 1.0715807413719644e-05, "loss": 0.0416, "step": 2113 }, { "epoch": 0.8105828220858896, "grad_norm": 1.3029454946517944, "learning_rate": 1.0694503621644653e-05, "loss": 0.0029, "step": 2114 }, { "epoch": 0.8109662576687117, "grad_norm": 8.73009967803955, "learning_rate": 1.0673199829569663e-05, "loss": 0.0344, "step": 2115 }, { "epoch": 
0.8113496932515337, "grad_norm": 1.926862359046936, "learning_rate": 1.0651896037494674e-05, "loss": 0.0036, "step": 2116 }, { "epoch": 0.8117331288343558, "grad_norm": 1.280436396598816, "learning_rate": 1.0630592245419685e-05, "loss": 0.0032, "step": 2117 }, { "epoch": 0.8121165644171779, "grad_norm": 0.5277543067932129, "learning_rate": 1.0609288453344696e-05, "loss": 0.0021, "step": 2118 }, { "epoch": 0.8125, "grad_norm": 21.588167190551758, "learning_rate": 1.0587984661269707e-05, "loss": 0.1061, "step": 2119 }, { "epoch": 0.8128834355828221, "grad_norm": 3.551272392272949, "learning_rate": 1.0566680869194718e-05, "loss": 0.0067, "step": 2120 }, { "epoch": 0.8132668711656442, "grad_norm": 0.41435983777046204, "learning_rate": 1.0545377077119729e-05, "loss": 0.0031, "step": 2121 }, { "epoch": 0.8136503067484663, "grad_norm": 301.73583984375, "learning_rate": 1.052407328504474e-05, "loss": 7.0078, "step": 2122 }, { "epoch": 0.8140337423312883, "grad_norm": 9.311138153076172, "learning_rate": 1.0502769492969749e-05, "loss": 0.0557, "step": 2123 }, { "epoch": 0.8144171779141104, "grad_norm": 0.23725460469722748, "learning_rate": 1.048146570089476e-05, "loss": 0.002, "step": 2124 }, { "epoch": 0.8148006134969326, "grad_norm": 7.481473922729492, "learning_rate": 1.0460161908819771e-05, "loss": 0.0067, "step": 2125 }, { "epoch": 0.8151840490797546, "grad_norm": 0.6002553105354309, "learning_rate": 1.043885811674478e-05, "loss": 0.0023, "step": 2126 }, { "epoch": 0.8155674846625767, "grad_norm": 93.15229797363281, "learning_rate": 1.0417554324669791e-05, "loss": 1.0733, "step": 2127 }, { "epoch": 0.8159509202453987, "grad_norm": 11.985151290893555, "learning_rate": 1.0396250532594802e-05, "loss": 0.0348, "step": 2128 }, { "epoch": 0.8163343558282209, "grad_norm": 11.536222457885742, "learning_rate": 1.0374946740519813e-05, "loss": 0.4173, "step": 2129 }, { "epoch": 0.816717791411043, "grad_norm": 8.33538818359375, "learning_rate": 1.0353642948444824e-05, "loss": 
0.1705, "step": 2130 }, { "epoch": 0.817101226993865, "grad_norm": 3.2708263397216797, "learning_rate": 1.0332339156369834e-05, "loss": 0.375, "step": 2131 }, { "epoch": 0.8174846625766872, "grad_norm": 148.76092529296875, "learning_rate": 1.0311035364294845e-05, "loss": 6.4873, "step": 2132 }, { "epoch": 0.8178680981595092, "grad_norm": 1.8550126552581787, "learning_rate": 1.0289731572219856e-05, "loss": 0.0058, "step": 2133 }, { "epoch": 0.8182515337423313, "grad_norm": 25.314233779907227, "learning_rate": 1.0268427780144867e-05, "loss": 0.6489, "step": 2134 }, { "epoch": 0.8186349693251533, "grad_norm": 19.60906219482422, "learning_rate": 1.0247123988069878e-05, "loss": 0.4819, "step": 2135 }, { "epoch": 0.8190184049079755, "grad_norm": 14.061655044555664, "learning_rate": 1.0225820195994889e-05, "loss": 0.0454, "step": 2136 }, { "epoch": 0.8194018404907976, "grad_norm": 5.1389923095703125, "learning_rate": 1.0204516403919898e-05, "loss": 0.0107, "step": 2137 }, { "epoch": 0.8197852760736196, "grad_norm": 1.9320876598358154, "learning_rate": 1.0183212611844909e-05, "loss": 0.0059, "step": 2138 }, { "epoch": 0.8201687116564417, "grad_norm": 0.8540799021720886, "learning_rate": 1.016190881976992e-05, "loss": 0.0031, "step": 2139 }, { "epoch": 0.8205521472392638, "grad_norm": 11.2532377243042, "learning_rate": 1.014060502769493e-05, "loss": 0.4314, "step": 2140 }, { "epoch": 0.8209355828220859, "grad_norm": 146.77346801757812, "learning_rate": 1.011930123561994e-05, "loss": 4.4738, "step": 2141 }, { "epoch": 0.821319018404908, "grad_norm": 0.3676358163356781, "learning_rate": 1.0097997443544951e-05, "loss": 0.0019, "step": 2142 }, { "epoch": 0.82170245398773, "grad_norm": 101.42230224609375, "learning_rate": 1.0076693651469962e-05, "loss": 1.8779, "step": 2143 }, { "epoch": 0.8220858895705522, "grad_norm": 0.34330686926841736, "learning_rate": 1.0055389859394973e-05, "loss": 0.0016, "step": 2144 }, { "epoch": 0.8224693251533742, "grad_norm": 178.65760803222656, 
"learning_rate": 1.0034086067319984e-05, "loss": 1.7715, "step": 2145 }, { "epoch": 0.8228527607361963, "grad_norm": 485.91131591796875, "learning_rate": 1.0012782275244995e-05, "loss": 5.7656, "step": 2146 }, { "epoch": 0.8232361963190185, "grad_norm": 7.863787651062012, "learning_rate": 9.991478483170005e-06, "loss": 0.3987, "step": 2147 }, { "epoch": 0.8236196319018405, "grad_norm": 34.286102294921875, "learning_rate": 9.970174691095016e-06, "loss": 0.625, "step": 2148 }, { "epoch": 0.8240030674846626, "grad_norm": 48.19886016845703, "learning_rate": 9.948870899020025e-06, "loss": 0.6436, "step": 2149 }, { "epoch": 0.8243865030674846, "grad_norm": 202.91395568847656, "learning_rate": 9.927567106945036e-06, "loss": 3.4361, "step": 2150 }, { "epoch": 0.8247699386503068, "grad_norm": 14.648388862609863, "learning_rate": 9.906263314870047e-06, "loss": 0.4126, "step": 2151 }, { "epoch": 0.8251533742331288, "grad_norm": 16.33935546875, "learning_rate": 9.884959522795058e-06, "loss": 0.0564, "step": 2152 }, { "epoch": 0.8255368098159509, "grad_norm": 86.1637954711914, "learning_rate": 9.863655730720069e-06, "loss": 1.294, "step": 2153 }, { "epoch": 0.825920245398773, "grad_norm": 3.0902631282806396, "learning_rate": 9.84235193864508e-06, "loss": 0.0195, "step": 2154 }, { "epoch": 0.8263036809815951, "grad_norm": 0.3026873767375946, "learning_rate": 9.821048146570091e-06, "loss": 0.0021, "step": 2155 }, { "epoch": 0.8266871165644172, "grad_norm": 198.00852966308594, "learning_rate": 9.7997443544951e-06, "loss": 3.0872, "step": 2156 }, { "epoch": 0.8270705521472392, "grad_norm": 261.3690490722656, "learning_rate": 9.778440562420111e-06, "loss": 1.6778, "step": 2157 }, { "epoch": 0.8274539877300614, "grad_norm": 4.099433422088623, "learning_rate": 9.757136770345122e-06, "loss": 0.0214, "step": 2158 }, { "epoch": 0.8278374233128835, "grad_norm": 1.4283417463302612, "learning_rate": 9.735832978270133e-06, "loss": 0.0041, "step": 2159 }, { "epoch": 0.8282208588957055, 
"grad_norm": 181.2616424560547, "learning_rate": 9.714529186195143e-06, "loss": 1.8028, "step": 2160 }, { "epoch": 0.8286042944785276, "grad_norm": 7.175016403198242, "learning_rate": 9.693225394120154e-06, "loss": 0.0357, "step": 2161 }, { "epoch": 0.8289877300613497, "grad_norm": 1.671427845954895, "learning_rate": 9.671921602045165e-06, "loss": 0.0034, "step": 2162 }, { "epoch": 0.8293711656441718, "grad_norm": 16.254331588745117, "learning_rate": 9.650617809970176e-06, "loss": 0.42, "step": 2163 }, { "epoch": 0.8297546012269938, "grad_norm": 213.69357299804688, "learning_rate": 9.629314017895187e-06, "loss": 6.9487, "step": 2164 }, { "epoch": 0.8301380368098159, "grad_norm": 4.822175025939941, "learning_rate": 9.608010225820196e-06, "loss": 0.017, "step": 2165 }, { "epoch": 0.8305214723926381, "grad_norm": 5.070052146911621, "learning_rate": 9.586706433745207e-06, "loss": 0.0058, "step": 2166 }, { "epoch": 0.8309049079754601, "grad_norm": 3.2388699054718018, "learning_rate": 9.565402641670218e-06, "loss": 0.0139, "step": 2167 }, { "epoch": 0.8312883435582822, "grad_norm": 15.923335075378418, "learning_rate": 9.544098849595229e-06, "loss": 0.0522, "step": 2168 }, { "epoch": 0.8316717791411042, "grad_norm": 0.178175151348114, "learning_rate": 9.52279505752024e-06, "loss": 0.0014, "step": 2169 }, { "epoch": 0.8320552147239264, "grad_norm": 9.746631622314453, "learning_rate": 9.501491265445251e-06, "loss": 0.0461, "step": 2170 }, { "epoch": 0.8324386503067485, "grad_norm": 0.3492185175418854, "learning_rate": 9.48018747337026e-06, "loss": 0.0013, "step": 2171 }, { "epoch": 0.8328220858895705, "grad_norm": 22.301040649414062, "learning_rate": 9.458883681295271e-06, "loss": 0.141, "step": 2172 }, { "epoch": 0.8332055214723927, "grad_norm": 211.20962524414062, "learning_rate": 9.43757988922028e-06, "loss": 2.256, "step": 2173 }, { "epoch": 0.8335889570552147, "grad_norm": 27.95379066467285, "learning_rate": 9.416276097145292e-06, "loss": 0.4873, "step": 2174 }, { 
"epoch": 0.8339723926380368, "grad_norm": 224.555419921875, "learning_rate": 9.394972305070303e-06, "loss": 1.7041, "step": 2175 }, { "epoch": 0.8343558282208589, "grad_norm": 68.85265350341797, "learning_rate": 9.373668512995314e-06, "loss": 0.9019, "step": 2176 }, { "epoch": 0.834739263803681, "grad_norm": 118.31787872314453, "learning_rate": 9.352364720920325e-06, "loss": 1.0137, "step": 2177 }, { "epoch": 0.8351226993865031, "grad_norm": 33.278621673583984, "learning_rate": 9.331060928845336e-06, "loss": 0.6021, "step": 2178 }, { "epoch": 0.8355061349693251, "grad_norm": 0.8133134245872498, "learning_rate": 9.309757136770347e-06, "loss": 0.0033, "step": 2179 }, { "epoch": 0.8358895705521472, "grad_norm": 0.19997651875019073, "learning_rate": 9.288453344695358e-06, "loss": 0.0019, "step": 2180 }, { "epoch": 0.8362730061349694, "grad_norm": 35.009544372558594, "learning_rate": 9.267149552620367e-06, "loss": 0.2026, "step": 2181 }, { "epoch": 0.8366564417177914, "grad_norm": 11.999231338500977, "learning_rate": 9.245845760545376e-06, "loss": 0.0838, "step": 2182 }, { "epoch": 0.8370398773006135, "grad_norm": 12.60776138305664, "learning_rate": 9.224541968470387e-06, "loss": 0.0487, "step": 2183 }, { "epoch": 0.8374233128834356, "grad_norm": 0.29287654161453247, "learning_rate": 9.203238176395398e-06, "loss": 0.0014, "step": 2184 }, { "epoch": 0.8378067484662577, "grad_norm": 74.42601013183594, "learning_rate": 9.18193438432041e-06, "loss": 1.1533, "step": 2185 }, { "epoch": 0.8381901840490797, "grad_norm": 8.74606704711914, "learning_rate": 9.16063059224542e-06, "loss": 0.0355, "step": 2186 }, { "epoch": 0.8385736196319018, "grad_norm": 47.63106155395508, "learning_rate": 9.139326800170431e-06, "loss": 0.5376, "step": 2187 }, { "epoch": 0.838957055214724, "grad_norm": 0.6941925287246704, "learning_rate": 9.118023008095442e-06, "loss": 0.0024, "step": 2188 }, { "epoch": 0.839340490797546, "grad_norm": 7.404911994934082, "learning_rate": 9.096719216020452e-06, 
"loss": 0.3897, "step": 2189 }, { "epoch": 0.8397239263803681, "grad_norm": 0.8754544854164124, "learning_rate": 9.075415423945463e-06, "loss": 0.0029, "step": 2190 }, { "epoch": 0.8401073619631901, "grad_norm": 134.01356506347656, "learning_rate": 9.054111631870474e-06, "loss": 6.3232, "step": 2191 }, { "epoch": 0.8404907975460123, "grad_norm": 0.8344331383705139, "learning_rate": 9.032807839795485e-06, "loss": 0.0023, "step": 2192 }, { "epoch": 0.8408742331288344, "grad_norm": 0.300471693277359, "learning_rate": 9.011504047720494e-06, "loss": 0.0011, "step": 2193 }, { "epoch": 0.8412576687116564, "grad_norm": 585.5133056640625, "learning_rate": 8.990200255645505e-06, "loss": 5.4443, "step": 2194 }, { "epoch": 0.8416411042944786, "grad_norm": 6.138875484466553, "learning_rate": 8.968896463570516e-06, "loss": 0.0332, "step": 2195 }, { "epoch": 0.8420245398773006, "grad_norm": 4.308825492858887, "learning_rate": 8.947592671495527e-06, "loss": 0.0201, "step": 2196 }, { "epoch": 0.8424079754601227, "grad_norm": 64.83824157714844, "learning_rate": 8.926288879420538e-06, "loss": 0.7031, "step": 2197 }, { "epoch": 0.8427914110429447, "grad_norm": 45.091739654541016, "learning_rate": 8.904985087345547e-06, "loss": 0.5522, "step": 2198 }, { "epoch": 0.8431748466257669, "grad_norm": 15.416338920593262, "learning_rate": 8.883681295270558e-06, "loss": 0.4653, "step": 2199 }, { "epoch": 0.843558282208589, "grad_norm": 74.4372787475586, "learning_rate": 8.86237750319557e-06, "loss": 1.0098, "step": 2200 }, { "epoch": 0.843941717791411, "grad_norm": 36.97453308105469, "learning_rate": 8.84107371112058e-06, "loss": 0.5796, "step": 2201 }, { "epoch": 0.8443251533742331, "grad_norm": 29.4422607421875, "learning_rate": 8.819769919045591e-06, "loss": 0.5654, "step": 2202 }, { "epoch": 0.8447085889570553, "grad_norm": 4.072035312652588, "learning_rate": 8.798466126970602e-06, "loss": 0.0087, "step": 2203 }, { "epoch": 0.8450920245398773, "grad_norm": 0.17604690790176392, 
"learning_rate": 8.777162334895613e-06, "loss": 0.0011, "step": 2204 }, { "epoch": 0.8454754601226994, "grad_norm": 0.0971691682934761, "learning_rate": 8.755858542820623e-06, "loss": 0.0007, "step": 2205 }, { "epoch": 0.8458588957055214, "grad_norm": 0.4734111726284027, "learning_rate": 8.734554750745632e-06, "loss": 0.0015, "step": 2206 }, { "epoch": 0.8462423312883436, "grad_norm": 0.7215316295623779, "learning_rate": 8.713250958670643e-06, "loss": 0.0026, "step": 2207 }, { "epoch": 0.8466257668711656, "grad_norm": 11.359012603759766, "learning_rate": 8.691947166595654e-06, "loss": 0.0755, "step": 2208 }, { "epoch": 0.8470092024539877, "grad_norm": 36.86509323120117, "learning_rate": 8.670643374520665e-06, "loss": 0.2292, "step": 2209 }, { "epoch": 0.8473926380368099, "grad_norm": 14.986333847045898, "learning_rate": 8.649339582445676e-06, "loss": 0.2059, "step": 2210 }, { "epoch": 0.8477760736196319, "grad_norm": 14.543253898620605, "learning_rate": 8.628035790370687e-06, "loss": 0.0939, "step": 2211 }, { "epoch": 0.848159509202454, "grad_norm": 133.95114135742188, "learning_rate": 8.606731998295698e-06, "loss": 1.671, "step": 2212 }, { "epoch": 0.848542944785276, "grad_norm": 9.932710647583008, "learning_rate": 8.585428206220709e-06, "loss": 0.0348, "step": 2213 }, { "epoch": 0.8489263803680982, "grad_norm": 0.16129763424396515, "learning_rate": 8.564124414145718e-06, "loss": 0.0012, "step": 2214 }, { "epoch": 0.8493098159509203, "grad_norm": 0.8886722922325134, "learning_rate": 8.54282062207073e-06, "loss": 0.0034, "step": 2215 }, { "epoch": 0.8496932515337423, "grad_norm": 82.60255432128906, "learning_rate": 8.521516829995738e-06, "loss": 1.5283, "step": 2216 }, { "epoch": 0.8500766871165644, "grad_norm": 17.767559051513672, "learning_rate": 8.50021303792075e-06, "loss": 0.4824, "step": 2217 }, { "epoch": 0.8504601226993865, "grad_norm": 110.89055633544922, "learning_rate": 8.47890924584576e-06, "loss": 5.498, "step": 2218 }, { "epoch": 0.8508435582822086, 
"grad_norm": 1.0662866830825806, "learning_rate": 8.457605453770772e-06, "loss": 0.0029, "step": 2219 }, { "epoch": 0.8512269938650306, "grad_norm": 2.306121587753296, "learning_rate": 8.436301661695783e-06, "loss": 0.0028, "step": 2220 }, { "epoch": 0.8516104294478528, "grad_norm": 30.38901710510254, "learning_rate": 8.414997869620794e-06, "loss": 0.6836, "step": 2221 }, { "epoch": 0.8519938650306749, "grad_norm": 9.880842208862305, "learning_rate": 8.393694077545803e-06, "loss": 0.2307, "step": 2222 }, { "epoch": 0.8523773006134969, "grad_norm": 0.7096598148345947, "learning_rate": 8.372390285470814e-06, "loss": 0.0046, "step": 2223 }, { "epoch": 0.852760736196319, "grad_norm": 19.66064453125, "learning_rate": 8.351086493395825e-06, "loss": 0.4373, "step": 2224 }, { "epoch": 0.8531441717791411, "grad_norm": 40.124759674072266, "learning_rate": 8.329782701320836e-06, "loss": 0.4824, "step": 2225 }, { "epoch": 0.8535276073619632, "grad_norm": 1.8273738622665405, "learning_rate": 8.308478909245847e-06, "loss": 0.0027, "step": 2226 }, { "epoch": 0.8539110429447853, "grad_norm": 0.40088343620300293, "learning_rate": 8.287175117170856e-06, "loss": 0.0023, "step": 2227 }, { "epoch": 0.8542944785276073, "grad_norm": 1.148898959159851, "learning_rate": 8.265871325095867e-06, "loss": 0.0057, "step": 2228 }, { "epoch": 0.8546779141104295, "grad_norm": 6.235866546630859, "learning_rate": 8.244567533020878e-06, "loss": 0.398, "step": 2229 }, { "epoch": 0.8550613496932515, "grad_norm": 1.2773715257644653, "learning_rate": 8.22326374094589e-06, "loss": 0.0041, "step": 2230 }, { "epoch": 0.8554447852760736, "grad_norm": 5.057631015777588, "learning_rate": 8.201959948870898e-06, "loss": 0.0131, "step": 2231 }, { "epoch": 0.8558282208588958, "grad_norm": 1.29682195186615, "learning_rate": 8.18065615679591e-06, "loss": 0.0036, "step": 2232 }, { "epoch": 0.8562116564417178, "grad_norm": 166.40093994140625, "learning_rate": 8.15935236472092e-06, "loss": 1.7452, "step": 2233 }, { 
"epoch": 0.8565950920245399, "grad_norm": 15.131058692932129, "learning_rate": 8.138048572645932e-06, "loss": 0.1754, "step": 2234 }, { "epoch": 0.8569785276073619, "grad_norm": 63.33060836791992, "learning_rate": 8.116744780570943e-06, "loss": 0.9424, "step": 2235 }, { "epoch": 0.8573619631901841, "grad_norm": 34.5869140625, "learning_rate": 8.095440988495954e-06, "loss": 0.2004, "step": 2236 }, { "epoch": 0.8577453987730062, "grad_norm": 0.4424964189529419, "learning_rate": 8.074137196420965e-06, "loss": 0.0014, "step": 2237 }, { "epoch": 0.8581288343558282, "grad_norm": 0.6975522637367249, "learning_rate": 8.052833404345974e-06, "loss": 0.0038, "step": 2238 }, { "epoch": 0.8585122699386503, "grad_norm": 1.3421223163604736, "learning_rate": 8.031529612270985e-06, "loss": 0.0017, "step": 2239 }, { "epoch": 0.8588957055214724, "grad_norm": 1.981398344039917, "learning_rate": 8.010225820195994e-06, "loss": 0.0048, "step": 2240 }, { "epoch": 0.8592791411042945, "grad_norm": 130.21043395996094, "learning_rate": 7.988922028121005e-06, "loss": 6.0957, "step": 2241 }, { "epoch": 0.8596625766871165, "grad_norm": 132.2084197998047, "learning_rate": 7.967618236046016e-06, "loss": 1.5684, "step": 2242 }, { "epoch": 0.8600460122699386, "grad_norm": 120.15467834472656, "learning_rate": 7.946314443971027e-06, "loss": 6.2344, "step": 2243 }, { "epoch": 0.8604294478527608, "grad_norm": 30.702177047729492, "learning_rate": 7.925010651896038e-06, "loss": 0.1836, "step": 2244 }, { "epoch": 0.8608128834355828, "grad_norm": 5.360868453979492, "learning_rate": 7.90370685982105e-06, "loss": 0.0084, "step": 2245 }, { "epoch": 0.8611963190184049, "grad_norm": 284.1966247558594, "learning_rate": 7.88240306774606e-06, "loss": 2.4767, "step": 2246 }, { "epoch": 0.861579754601227, "grad_norm": 0.34775063395500183, "learning_rate": 7.86109927567107e-06, "loss": 0.0014, "step": 2247 }, { "epoch": 0.8619631901840491, "grad_norm": 27.3027286529541, "learning_rate": 7.83979548359608e-06, "loss": 
0.1621, "step": 2248 }, { "epoch": 0.8623466257668712, "grad_norm": 71.96947479248047, "learning_rate": 7.818491691521092e-06, "loss": 0.6729, "step": 2249 }, { "epoch": 0.8627300613496932, "grad_norm": 172.47738647460938, "learning_rate": 7.7971878994461e-06, "loss": 3.5972, "step": 2250 }, { "epoch": 0.8631134969325154, "grad_norm": 0.444509893655777, "learning_rate": 7.775884107371112e-06, "loss": 0.0018, "step": 2251 }, { "epoch": 0.8634969325153374, "grad_norm": 102.0246810913086, "learning_rate": 7.754580315296123e-06, "loss": 1.4697, "step": 2252 }, { "epoch": 0.8638803680981595, "grad_norm": 4.7411394119262695, "learning_rate": 7.733276523221134e-06, "loss": 0.0111, "step": 2253 }, { "epoch": 0.8642638036809815, "grad_norm": 25.057506561279297, "learning_rate": 7.711972731146145e-06, "loss": 0.4507, "step": 2254 }, { "epoch": 0.8646472392638037, "grad_norm": 26.792598724365234, "learning_rate": 7.690668939071156e-06, "loss": 0.4717, "step": 2255 }, { "epoch": 0.8650306748466258, "grad_norm": 32.75210189819336, "learning_rate": 7.669365146996165e-06, "loss": 0.6104, "step": 2256 }, { "epoch": 0.8654141104294478, "grad_norm": 291.1597900390625, "learning_rate": 7.648061354921176e-06, "loss": 7.042, "step": 2257 }, { "epoch": 0.86579754601227, "grad_norm": 0.7177117466926575, "learning_rate": 7.626757562846187e-06, "loss": 0.0017, "step": 2258 }, { "epoch": 0.866180981595092, "grad_norm": 0.7613211870193481, "learning_rate": 7.605453770771198e-06, "loss": 0.0028, "step": 2259 }, { "epoch": 0.8665644171779141, "grad_norm": 0.4335547983646393, "learning_rate": 7.584149978696209e-06, "loss": 0.002, "step": 2260 }, { "epoch": 0.8669478527607362, "grad_norm": 0.32038646936416626, "learning_rate": 7.5628461866212185e-06, "loss": 0.0015, "step": 2261 }, { "epoch": 0.8673312883435583, "grad_norm": 84.02080535888672, "learning_rate": 7.5415423945462295e-06, "loss": 0.9361, "step": 2262 }, { "epoch": 0.8677147239263804, "grad_norm": 2.1479883193969727, "learning_rate": 
7.52023860247124e-06, "loss": 0.0058, "step": 2263 }, { "epoch": 0.8680981595092024, "grad_norm": 87.77676391601562, "learning_rate": 7.498934810396251e-06, "loss": 0.9238, "step": 2264 }, { "epoch": 0.8684815950920245, "grad_norm": 38.867977142333984, "learning_rate": 7.477631018321262e-06, "loss": 0.6719, "step": 2265 }, { "epoch": 0.8688650306748467, "grad_norm": 4.981602191925049, "learning_rate": 7.456327226246272e-06, "loss": 0.0198, "step": 2266 }, { "epoch": 0.8692484662576687, "grad_norm": 115.20155334472656, "learning_rate": 7.435023434171283e-06, "loss": 1.0859, "step": 2267 }, { "epoch": 0.8696319018404908, "grad_norm": 81.47464752197266, "learning_rate": 7.413719642096294e-06, "loss": 0.7612, "step": 2268 }, { "epoch": 0.8700153374233128, "grad_norm": 1.5404905080795288, "learning_rate": 7.392415850021305e-06, "loss": 0.0041, "step": 2269 }, { "epoch": 0.870398773006135, "grad_norm": 18.807405471801758, "learning_rate": 7.371112057946315e-06, "loss": 0.469, "step": 2270 }, { "epoch": 0.870782208588957, "grad_norm": 3.0587081909179688, "learning_rate": 7.349808265871326e-06, "loss": 0.0128, "step": 2271 }, { "epoch": 0.8711656441717791, "grad_norm": 4.244951248168945, "learning_rate": 7.328504473796335e-06, "loss": 0.363, "step": 2272 }, { "epoch": 0.8715490797546013, "grad_norm": 176.03610229492188, "learning_rate": 7.307200681721346e-06, "loss": 0.9551, "step": 2273 }, { "epoch": 0.8719325153374233, "grad_norm": 8.542078971862793, "learning_rate": 7.285896889646357e-06, "loss": 0.0167, "step": 2274 }, { "epoch": 0.8723159509202454, "grad_norm": 0.5152387619018555, "learning_rate": 7.2645930975713675e-06, "loss": 0.0023, "step": 2275 }, { "epoch": 0.8726993865030674, "grad_norm": 1.4831758737564087, "learning_rate": 7.2432893054963785e-06, "loss": 0.0041, "step": 2276 }, { "epoch": 0.8730828220858896, "grad_norm": 43.16259765625, "learning_rate": 7.2219855134213895e-06, "loss": 1.0479, "step": 2277 }, { "epoch": 0.8734662576687117, "grad_norm": 
145.6817169189453, "learning_rate": 7.2006817213464e-06, "loss": 6.7085, "step": 2278 }, { "epoch": 0.8738496932515337, "grad_norm": 64.62080383300781, "learning_rate": 7.179377929271411e-06, "loss": 0.6709, "step": 2279 }, { "epoch": 0.8742331288343558, "grad_norm": 164.92628479003906, "learning_rate": 7.158074137196422e-06, "loss": 2.2658, "step": 2280 }, { "epoch": 0.8746165644171779, "grad_norm": 1.5775574445724487, "learning_rate": 7.136770345121433e-06, "loss": 0.0039, "step": 2281 }, { "epoch": 0.875, "grad_norm": 84.53050994873047, "learning_rate": 7.115466553046443e-06, "loss": 1.6895, "step": 2282 }, { "epoch": 0.8753834355828221, "grad_norm": 109.07520294189453, "learning_rate": 7.094162760971454e-06, "loss": 1.4405, "step": 2283 }, { "epoch": 0.8757668711656442, "grad_norm": 22.942350387573242, "learning_rate": 7.072858968896463e-06, "loss": 0.5737, "step": 2284 }, { "epoch": 0.8761503067484663, "grad_norm": 17.40251350402832, "learning_rate": 7.051555176821474e-06, "loss": 0.2432, "step": 2285 }, { "epoch": 0.8765337423312883, "grad_norm": 84.14927673339844, "learning_rate": 7.030251384746485e-06, "loss": 0.9087, "step": 2286 }, { "epoch": 0.8769171779141104, "grad_norm": 11.85714340209961, "learning_rate": 7.008947592671495e-06, "loss": 0.2301, "step": 2287 }, { "epoch": 0.8773006134969326, "grad_norm": 125.55137634277344, "learning_rate": 6.987643800596506e-06, "loss": 6.3281, "step": 2288 }, { "epoch": 0.8776840490797546, "grad_norm": 231.6448211669922, "learning_rate": 6.966340008521517e-06, "loss": 3.7931, "step": 2289 }, { "epoch": 0.8780674846625767, "grad_norm": 0.2826296389102936, "learning_rate": 6.945036216446528e-06, "loss": 0.0011, "step": 2290 }, { "epoch": 0.8784509202453987, "grad_norm": 20.809093475341797, "learning_rate": 6.9237324243715385e-06, "loss": 0.4322, "step": 2291 }, { "epoch": 0.8788343558282209, "grad_norm": 0.10539169609546661, "learning_rate": 6.9024286322965495e-06, "loss": 0.0009, "step": 2292 }, { "epoch": 
0.879217791411043, "grad_norm": 2.3163046836853027, "learning_rate": 6.8811248402215605e-06, "loss": 0.0087, "step": 2293 }, { "epoch": 0.879601226993865, "grad_norm": 48.467830657958984, "learning_rate": 6.859821048146571e-06, "loss": 1.0156, "step": 2294 }, { "epoch": 0.8799846625766872, "grad_norm": 2.4319138526916504, "learning_rate": 6.838517256071581e-06, "loss": 0.0056, "step": 2295 }, { "epoch": 0.8803680981595092, "grad_norm": 28.968769073486328, "learning_rate": 6.817213463996591e-06, "loss": 0.4573, "step": 2296 }, { "epoch": 0.8807515337423313, "grad_norm": 2.2300775051116943, "learning_rate": 6.795909671921602e-06, "loss": 0.0071, "step": 2297 }, { "epoch": 0.8811349693251533, "grad_norm": 0.5528029799461365, "learning_rate": 6.774605879846613e-06, "loss": 0.0016, "step": 2298 }, { "epoch": 0.8815184049079755, "grad_norm": 2.7337145805358887, "learning_rate": 6.753302087771623e-06, "loss": 0.0087, "step": 2299 }, { "epoch": 0.8819018404907976, "grad_norm": 0.2805922031402588, "learning_rate": 6.731998295696634e-06, "loss": 0.0018, "step": 2300 }, { "epoch": 0.8822852760736196, "grad_norm": 94.86458587646484, "learning_rate": 6.710694503621645e-06, "loss": 1.3252, "step": 2301 }, { "epoch": 0.8826687116564417, "grad_norm": 12.665270805358887, "learning_rate": 6.689390711546656e-06, "loss": 0.0423, "step": 2302 }, { "epoch": 0.8830521472392638, "grad_norm": 91.64434051513672, "learning_rate": 6.668086919471666e-06, "loss": 1.9256, "step": 2303 }, { "epoch": 0.8834355828220859, "grad_norm": 29.66964340209961, "learning_rate": 6.646783127396677e-06, "loss": 0.0693, "step": 2304 }, { "epoch": 0.883819018404908, "grad_norm": 56.94545364379883, "learning_rate": 6.625479335321688e-06, "loss": 0.8106, "step": 2305 }, { "epoch": 0.88420245398773, "grad_norm": 0.6294219493865967, "learning_rate": 6.604175543246698e-06, "loss": 0.0022, "step": 2306 }, { "epoch": 0.8845858895705522, "grad_norm": 4.835546493530273, "learning_rate": 6.582871751171709e-06, "loss": 
0.3618, "step": 2307 }, { "epoch": 0.8849693251533742, "grad_norm": 21.996061325073242, "learning_rate": 6.561567959096719e-06, "loss": 0.4268, "step": 2308 }, { "epoch": 0.8853527607361963, "grad_norm": 189.7275848388672, "learning_rate": 6.54026416702173e-06, "loss": 5.3691, "step": 2309 }, { "epoch": 0.8857361963190185, "grad_norm": 368.8897399902344, "learning_rate": 6.518960374946741e-06, "loss": 3.4177, "step": 2310 }, { "epoch": 0.8861196319018405, "grad_norm": 18.67096519470215, "learning_rate": 6.497656582871752e-06, "loss": 0.543, "step": 2311 }, { "epoch": 0.8865030674846626, "grad_norm": 0.8498499989509583, "learning_rate": 6.476352790796762e-06, "loss": 0.0019, "step": 2312 }, { "epoch": 0.8868865030674846, "grad_norm": 61.36403274536133, "learning_rate": 6.455048998721773e-06, "loss": 0.5713, "step": 2313 }, { "epoch": 0.8872699386503068, "grad_norm": 92.98079681396484, "learning_rate": 6.433745206646784e-06, "loss": 7.3711, "step": 2314 }, { "epoch": 0.8876533742331288, "grad_norm": 0.798412561416626, "learning_rate": 6.412441414571794e-06, "loss": 0.002, "step": 2315 }, { "epoch": 0.8880368098159509, "grad_norm": 16.507095336914062, "learning_rate": 6.391137622496805e-06, "loss": 0.4097, "step": 2316 }, { "epoch": 0.888420245398773, "grad_norm": 5.5403289794921875, "learning_rate": 6.3698338304218145e-06, "loss": 0.0322, "step": 2317 }, { "epoch": 0.8888036809815951, "grad_norm": 92.69943237304688, "learning_rate": 6.3485300383468255e-06, "loss": 0.5801, "step": 2318 }, { "epoch": 0.8891871165644172, "grad_norm": 103.48361206054688, "learning_rate": 6.3272262462718365e-06, "loss": 6.1143, "step": 2319 }, { "epoch": 0.8895705521472392, "grad_norm": 0.7315687537193298, "learning_rate": 6.305922454196847e-06, "loss": 0.002, "step": 2320 }, { "epoch": 0.8899539877300614, "grad_norm": 69.4375, "learning_rate": 6.284618662121858e-06, "loss": 0.6387, "step": 2321 }, { "epoch": 0.8903374233128835, "grad_norm": 1.154506802558899, "learning_rate": 
6.263314870046869e-06, "loss": 0.004, "step": 2322 }, { "epoch": 0.8907208588957055, "grad_norm": 125.8946304321289, "learning_rate": 6.24201107797188e-06, "loss": 2.3558, "step": 2323 }, { "epoch": 0.8911042944785276, "grad_norm": 0.334873229265213, "learning_rate": 6.22070728589689e-06, "loss": 0.0023, "step": 2324 }, { "epoch": 0.8914877300613497, "grad_norm": 115.25357055664062, "learning_rate": 6.199403493821901e-06, "loss": 1.4082, "step": 2325 }, { "epoch": 0.8918711656441718, "grad_norm": 113.96891784667969, "learning_rate": 6.178099701746911e-06, "loss": 0.8809, "step": 2326 }, { "epoch": 0.8922546012269938, "grad_norm": 45.48662567138672, "learning_rate": 6.156795909671922e-06, "loss": 0.5483, "step": 2327 }, { "epoch": 0.8926380368098159, "grad_norm": 121.83149719238281, "learning_rate": 6.135492117596932e-06, "loss": 3.0171, "step": 2328 }, { "epoch": 0.8930214723926381, "grad_norm": 4.581501483917236, "learning_rate": 6.114188325521943e-06, "loss": 0.0171, "step": 2329 }, { "epoch": 0.8934049079754601, "grad_norm": 9.469815254211426, "learning_rate": 6.092884533446954e-06, "loss": 0.0296, "step": 2330 }, { "epoch": 0.8937883435582822, "grad_norm": 0.4207109212875366, "learning_rate": 6.071580741371965e-06, "loss": 0.0019, "step": 2331 }, { "epoch": 0.8941717791411042, "grad_norm": 6.05441427230835, "learning_rate": 6.050276949296975e-06, "loss": 0.0243, "step": 2332 }, { "epoch": 0.8945552147239264, "grad_norm": 15.64470100402832, "learning_rate": 6.0289731572219855e-06, "loss": 0.4124, "step": 2333 }, { "epoch": 0.8949386503067485, "grad_norm": 118.96612548828125, "learning_rate": 6.0076693651469965e-06, "loss": 0.9951, "step": 2334 }, { "epoch": 0.8953220858895705, "grad_norm": 395.6971740722656, "learning_rate": 5.9863655730720075e-06, "loss": 2.096, "step": 2335 }, { "epoch": 0.8957055214723927, "grad_norm": 352.2525939941406, "learning_rate": 5.965061780997018e-06, "loss": 5.1981, "step": 2336 }, { "epoch": 0.8960889570552147, "grad_norm": 
0.6346721649169922, "learning_rate": 5.943757988922028e-06, "loss": 0.002, "step": 2337 }, { "epoch": 0.8964723926380368, "grad_norm": 1.3291672468185425, "learning_rate": 5.922454196847039e-06, "loss": 0.0031, "step": 2338 }, { "epoch": 0.8968558282208589, "grad_norm": 206.52493286132812, "learning_rate": 5.90115040477205e-06, "loss": 5.8926, "step": 2339 }, { "epoch": 0.897239263803681, "grad_norm": 389.3257141113281, "learning_rate": 5.879846612697061e-06, "loss": 5.8574, "step": 2340 }, { "epoch": 0.8976226993865031, "grad_norm": 1.5397624969482422, "learning_rate": 5.858542820622071e-06, "loss": 0.0041, "step": 2341 }, { "epoch": 0.8980061349693251, "grad_norm": 0.9912972450256348, "learning_rate": 5.837239028547082e-06, "loss": 0.0024, "step": 2342 }, { "epoch": 0.8983895705521472, "grad_norm": 0.4512677490711212, "learning_rate": 5.815935236472092e-06, "loss": 0.0024, "step": 2343 }, { "epoch": 0.8987730061349694, "grad_norm": 115.8443603515625, "learning_rate": 5.794631444397103e-06, "loss": 6.1021, "step": 2344 }, { "epoch": 0.8991564417177914, "grad_norm": 0.5459240674972534, "learning_rate": 5.773327652322113e-06, "loss": 0.0024, "step": 2345 }, { "epoch": 0.8995398773006135, "grad_norm": 98.77571105957031, "learning_rate": 5.752023860247124e-06, "loss": 0.94, "step": 2346 }, { "epoch": 0.8999233128834356, "grad_norm": 26.973379135131836, "learning_rate": 5.730720068172135e-06, "loss": 0.4268, "step": 2347 }, { "epoch": 0.9003067484662577, "grad_norm": 76.31385803222656, "learning_rate": 5.709416276097146e-06, "loss": 0.8687, "step": 2348 }, { "epoch": 0.9006901840490797, "grad_norm": 6.279634952545166, "learning_rate": 5.688112484022156e-06, "loss": 0.3914, "step": 2349 }, { "epoch": 0.9010736196319018, "grad_norm": 32.16206359863281, "learning_rate": 5.666808691947167e-06, "loss": 0.5913, "step": 2350 }, { "epoch": 0.901457055214724, "grad_norm": 84.63687896728516, "learning_rate": 5.645504899872178e-06, "loss": 0.9142, "step": 2351 }, { "epoch": 
0.901840490797546, "grad_norm": 9.987971305847168, "learning_rate": 5.624201107797189e-06, "loss": 0.3919, "step": 2352 }, { "epoch": 0.9022239263803681, "grad_norm": 100.80982208251953, "learning_rate": 5.602897315722199e-06, "loss": 6.0352, "step": 2353 }, { "epoch": 0.9026073619631901, "grad_norm": 211.78219604492188, "learning_rate": 5.581593523647209e-06, "loss": 5.8379, "step": 2354 }, { "epoch": 0.9029907975460123, "grad_norm": 411.9369201660156, "learning_rate": 5.56028973157222e-06, "loss": 5.6138, "step": 2355 }, { "epoch": 0.9033742331288344, "grad_norm": 14.870763778686523, "learning_rate": 5.538985939497231e-06, "loss": 0.0817, "step": 2356 }, { "epoch": 0.9037576687116564, "grad_norm": 52.021644592285156, "learning_rate": 5.517682147422241e-06, "loss": 0.1079, "step": 2357 }, { "epoch": 0.9041411042944786, "grad_norm": 0.7164086699485779, "learning_rate": 5.496378355347252e-06, "loss": 0.0031, "step": 2358 }, { "epoch": 0.9045245398773006, "grad_norm": 5.322601318359375, "learning_rate": 5.475074563272263e-06, "loss": 0.0064, "step": 2359 }, { "epoch": 0.9049079754601227, "grad_norm": 5.817986488342285, "learning_rate": 5.453770771197273e-06, "loss": 0.3792, "step": 2360 }, { "epoch": 0.9052914110429447, "grad_norm": 1.5054324865341187, "learning_rate": 5.432466979122284e-06, "loss": 0.0029, "step": 2361 }, { "epoch": 0.9056748466257669, "grad_norm": 0.9984419345855713, "learning_rate": 5.4111631870472944e-06, "loss": 0.0023, "step": 2362 }, { "epoch": 0.906058282208589, "grad_norm": 17.932889938354492, "learning_rate": 5.3898593949723054e-06, "loss": 0.1219, "step": 2363 }, { "epoch": 0.906441717791411, "grad_norm": 0.13186410069465637, "learning_rate": 5.3685556028973165e-06, "loss": 0.0011, "step": 2364 }, { "epoch": 0.9068251533742331, "grad_norm": 15.44273853302002, "learning_rate": 5.347251810822327e-06, "loss": 0.2459, "step": 2365 }, { "epoch": 0.9072085889570553, "grad_norm": 152.20364379882812, "learning_rate": 5.325948018747337e-06, "loss": 
1.9581, "step": 2366 }, { "epoch": 0.9075920245398773, "grad_norm": 282.3381042480469, "learning_rate": 5.304644226672348e-06, "loss": 1.8594, "step": 2367 }, { "epoch": 0.9079754601226994, "grad_norm": 9.226381301879883, "learning_rate": 5.283340434597359e-06, "loss": 0.0276, "step": 2368 }, { "epoch": 0.9083588957055214, "grad_norm": 38.01323318481445, "learning_rate": 5.26203664252237e-06, "loss": 0.5596, "step": 2369 }, { "epoch": 0.9087423312883436, "grad_norm": 11.03457260131836, "learning_rate": 5.24073285044738e-06, "loss": 0.0641, "step": 2370 }, { "epoch": 0.9091257668711656, "grad_norm": 4.320954322814941, "learning_rate": 5.21942905837239e-06, "loss": 0.0196, "step": 2371 }, { "epoch": 0.9095092024539877, "grad_norm": 3.1540722846984863, "learning_rate": 5.198125266297401e-06, "loss": 0.0105, "step": 2372 }, { "epoch": 0.9098926380368099, "grad_norm": 13.682193756103516, "learning_rate": 5.176821474222412e-06, "loss": 0.0806, "step": 2373 }, { "epoch": 0.9102760736196319, "grad_norm": 50.47124099731445, "learning_rate": 5.155517682147422e-06, "loss": 0.9649, "step": 2374 }, { "epoch": 0.910659509202454, "grad_norm": 7.538661003112793, "learning_rate": 5.134213890072433e-06, "loss": 0.0341, "step": 2375 }, { "epoch": 0.911042944785276, "grad_norm": 17.855735778808594, "learning_rate": 5.112910097997444e-06, "loss": 0.4041, "step": 2376 }, { "epoch": 0.9114263803680982, "grad_norm": 145.78993225097656, "learning_rate": 5.0916063059224544e-06, "loss": 0.8418, "step": 2377 }, { "epoch": 0.9118098159509203, "grad_norm": 37.60247039794922, "learning_rate": 5.070302513847465e-06, "loss": 0.6544, "step": 2378 }, { "epoch": 0.9121932515337423, "grad_norm": 109.55523681640625, "learning_rate": 5.048998721772476e-06, "loss": 2.5608, "step": 2379 }, { "epoch": 0.9125766871165644, "grad_norm": 176.50656127929688, "learning_rate": 5.027694929697487e-06, "loss": 6.3423, "step": 2380 }, { "epoch": 0.9129601226993865, "grad_norm": 0.9892469048500061, "learning_rate": 
5.006391137622498e-06, "loss": 0.0066, "step": 2381 }, { "epoch": 0.9133435582822086, "grad_norm": 21.42610740661621, "learning_rate": 4.985087345547508e-06, "loss": 0.47, "step": 2382 }, { "epoch": 0.9137269938650306, "grad_norm": 16.929384231567383, "learning_rate": 4.963783553472518e-06, "loss": 0.5884, "step": 2383 }, { "epoch": 0.9141104294478528, "grad_norm": 33.386165618896484, "learning_rate": 4.942479761397529e-06, "loss": 0.6704, "step": 2384 }, { "epoch": 0.9144938650306749, "grad_norm": 0.8747240304946899, "learning_rate": 4.92117596932254e-06, "loss": 0.0036, "step": 2385 }, { "epoch": 0.9148773006134969, "grad_norm": 173.33103942871094, "learning_rate": 4.89987217724755e-06, "loss": 6.1895, "step": 2386 }, { "epoch": 0.915260736196319, "grad_norm": 142.9227294921875, "learning_rate": 4.878568385172561e-06, "loss": 1.5088, "step": 2387 }, { "epoch": 0.9156441717791411, "grad_norm": 0.28370898962020874, "learning_rate": 4.857264593097571e-06, "loss": 0.0019, "step": 2388 }, { "epoch": 0.9160276073619632, "grad_norm": 5.686431407928467, "learning_rate": 4.835960801022582e-06, "loss": 0.0222, "step": 2389 }, { "epoch": 0.9164110429447853, "grad_norm": 37.061012268066406, "learning_rate": 4.814657008947593e-06, "loss": 0.5405, "step": 2390 }, { "epoch": 0.9167944785276073, "grad_norm": 0.753302276134491, "learning_rate": 4.7933532168726034e-06, "loss": 0.0015, "step": 2391 }, { "epoch": 0.9171779141104295, "grad_norm": 11.971282005310059, "learning_rate": 4.7720494247976144e-06, "loss": 0.0361, "step": 2392 }, { "epoch": 0.9175613496932515, "grad_norm": 4.824380874633789, "learning_rate": 4.7507456327226254e-06, "loss": 0.0121, "step": 2393 }, { "epoch": 0.9179447852760736, "grad_norm": 0.9249712228775024, "learning_rate": 4.729441840647636e-06, "loss": 0.0036, "step": 2394 }, { "epoch": 0.9183282208588958, "grad_norm": 62.19675827026367, "learning_rate": 4.708138048572646e-06, "loss": 0.7403, "step": 2395 }, { "epoch": 0.9187116564417178, "grad_norm": 
1.465177059173584, "learning_rate": 4.686834256497657e-06, "loss": 0.0041, "step": 2396 }, { "epoch": 0.9190950920245399, "grad_norm": 214.34622192382812, "learning_rate": 4.665530464422668e-06, "loss": 1.5811, "step": 2397 }, { "epoch": 0.9194785276073619, "grad_norm": 84.20104217529297, "learning_rate": 4.644226672347679e-06, "loss": 0.9165, "step": 2398 }, { "epoch": 0.9198619631901841, "grad_norm": 13.95626163482666, "learning_rate": 4.622922880272688e-06, "loss": 0.4883, "step": 2399 }, { "epoch": 0.9202453987730062, "grad_norm": 102.75283813476562, "learning_rate": 4.601619088197699e-06, "loss": 1.0986, "step": 2400 }, { "epoch": 0.9206288343558282, "grad_norm": 237.07943725585938, "learning_rate": 4.58031529612271e-06, "loss": 3.1571, "step": 2401 }, { "epoch": 0.9210122699386503, "grad_norm": 250.6201934814453, "learning_rate": 4.559011504047721e-06, "loss": 6.2051, "step": 2402 }, { "epoch": 0.9213957055214724, "grad_norm": 0.19728828966617584, "learning_rate": 4.537707711972731e-06, "loss": 0.0014, "step": 2403 }, { "epoch": 0.9217791411042945, "grad_norm": 15.691020965576172, "learning_rate": 4.516403919897742e-06, "loss": 0.04, "step": 2404 }, { "epoch": 0.9221625766871165, "grad_norm": 78.75389862060547, "learning_rate": 4.4951001278227524e-06, "loss": 1.0703, "step": 2405 }, { "epoch": 0.9225460122699386, "grad_norm": 128.8955078125, "learning_rate": 4.4737963357477634e-06, "loss": 1.9014, "step": 2406 }, { "epoch": 0.9229294478527608, "grad_norm": 120.89571380615234, "learning_rate": 4.452492543672774e-06, "loss": 1.8448, "step": 2407 }, { "epoch": 0.9233128834355828, "grad_norm": 3.4830448627471924, "learning_rate": 4.431188751597785e-06, "loss": 0.0076, "step": 2408 }, { "epoch": 0.9236963190184049, "grad_norm": 3.8731465339660645, "learning_rate": 4.409884959522796e-06, "loss": 0.3518, "step": 2409 }, { "epoch": 0.924079754601227, "grad_norm": 190.44200134277344, "learning_rate": 4.388581167447807e-06, "loss": 6.1836, "step": 2410 }, { "epoch": 
0.9244631901840491, "grad_norm": 8.610749244689941, "learning_rate": 4.367277375372816e-06, "loss": 0.2986, "step": 2411 }, { "epoch": 0.9248466257668712, "grad_norm": 0.6672891974449158, "learning_rate": 4.345973583297827e-06, "loss": 0.0024, "step": 2412 }, { "epoch": 0.9252300613496932, "grad_norm": 2.1213881969451904, "learning_rate": 4.324669791222838e-06, "loss": 0.0089, "step": 2413 }, { "epoch": 0.9256134969325154, "grad_norm": 97.69578552246094, "learning_rate": 4.303365999147849e-06, "loss": 1.3272, "step": 2414 }, { "epoch": 0.9259969325153374, "grad_norm": 2.741866111755371, "learning_rate": 4.282062207072859e-06, "loss": 0.0073, "step": 2415 }, { "epoch": 0.9263803680981595, "grad_norm": 165.419921875, "learning_rate": 4.260758414997869e-06, "loss": 1.989, "step": 2416 }, { "epoch": 0.9267638036809815, "grad_norm": 1.4724115133285522, "learning_rate": 4.23945462292288e-06, "loss": 0.0049, "step": 2417 }, { "epoch": 0.9271472392638037, "grad_norm": 141.52371215820312, "learning_rate": 4.218150830847891e-06, "loss": 6.0625, "step": 2418 }, { "epoch": 0.9275306748466258, "grad_norm": 8.115018844604492, "learning_rate": 4.196847038772901e-06, "loss": 0.0194, "step": 2419 }, { "epoch": 0.9279141104294478, "grad_norm": 30.14399528503418, "learning_rate": 4.175543246697912e-06, "loss": 0.4922, "step": 2420 }, { "epoch": 0.92829754601227, "grad_norm": 117.81703186035156, "learning_rate": 4.1542394546229234e-06, "loss": 6.6914, "step": 2421 }, { "epoch": 0.928680981595092, "grad_norm": 245.2737579345703, "learning_rate": 4.132935662547934e-06, "loss": 1.753, "step": 2422 }, { "epoch": 0.9290644171779141, "grad_norm": 1.8317362070083618, "learning_rate": 4.111631870472945e-06, "loss": 0.0031, "step": 2423 }, { "epoch": 0.9294478527607362, "grad_norm": 16.254554748535156, "learning_rate": 4.090328078397955e-06, "loss": 0.0554, "step": 2424 }, { "epoch": 0.9298312883435583, "grad_norm": 3.539659023284912, "learning_rate": 4.069024286322966e-06, "loss": 0.0079, 
"step": 2425 }, { "epoch": 0.9302147239263804, "grad_norm": 25.596466064453125, "learning_rate": 4.047720494247977e-06, "loss": 0.0656, "step": 2426 }, { "epoch": 0.9305981595092024, "grad_norm": 10.608080863952637, "learning_rate": 4.026416702172987e-06, "loss": 0.0469, "step": 2427 }, { "epoch": 0.9309815950920245, "grad_norm": 1.3304612636566162, "learning_rate": 4.005112910097997e-06, "loss": 0.004, "step": 2428 }, { "epoch": 0.9313650306748467, "grad_norm": 100.94953155517578, "learning_rate": 3.983809118023008e-06, "loss": 6.1836, "step": 2429 }, { "epoch": 0.9317484662576687, "grad_norm": 9.298829078674316, "learning_rate": 3.962505325948019e-06, "loss": 0.0185, "step": 2430 }, { "epoch": 0.9321319018404908, "grad_norm": 1.1112534999847412, "learning_rate": 3.94120153387303e-06, "loss": 0.0054, "step": 2431 }, { "epoch": 0.9325153374233128, "grad_norm": 2.8512754440307617, "learning_rate": 3.91989774179804e-06, "loss": 0.0169, "step": 2432 }, { "epoch": 0.932898773006135, "grad_norm": 174.24310302734375, "learning_rate": 3.89859394972305e-06, "loss": 2.583, "step": 2433 }, { "epoch": 0.933282208588957, "grad_norm": 146.72923278808594, "learning_rate": 3.877290157648061e-06, "loss": 1.752, "step": 2434 }, { "epoch": 0.9336656441717791, "grad_norm": 30.88206672668457, "learning_rate": 3.855986365573072e-06, "loss": 0.1167, "step": 2435 }, { "epoch": 0.9340490797546013, "grad_norm": 132.73995971679688, "learning_rate": 3.834682573498083e-06, "loss": 5.4766, "step": 2436 }, { "epoch": 0.9344325153374233, "grad_norm": 3.126117706298828, "learning_rate": 3.8133787814230936e-06, "loss": 0.0162, "step": 2437 }, { "epoch": 0.9348159509202454, "grad_norm": 20.657733917236328, "learning_rate": 3.7920749893481046e-06, "loss": 0.4814, "step": 2438 }, { "epoch": 0.9351993865030674, "grad_norm": 202.00848388671875, "learning_rate": 3.7707711972731147e-06, "loss": 2.4808, "step": 2439 }, { "epoch": 0.9355828220858896, "grad_norm": 2.9972264766693115, "learning_rate": 
3.7494674051981253e-06, "loss": 0.0074, "step": 2440 }, { "epoch": 0.9359662576687117, "grad_norm": 0.44538336992263794, "learning_rate": 3.728163613123136e-06, "loss": 0.0021, "step": 2441 }, { "epoch": 0.9363496932515337, "grad_norm": 6.419764518737793, "learning_rate": 3.706859821048147e-06, "loss": 0.0383, "step": 2442 }, { "epoch": 0.9367331288343558, "grad_norm": 205.5599365234375, "learning_rate": 3.6855560289731575e-06, "loss": 2.389, "step": 2443 }, { "epoch": 0.9371165644171779, "grad_norm": 1.176452398300171, "learning_rate": 3.6642522368981677e-06, "loss": 0.0046, "step": 2444 }, { "epoch": 0.9375, "grad_norm": 144.3276824951172, "learning_rate": 3.6429484448231787e-06, "loss": 1.5157, "step": 2445 }, { "epoch": 0.9378834355828221, "grad_norm": 7.984076499938965, "learning_rate": 3.6216446527481892e-06, "loss": 0.3045, "step": 2446 }, { "epoch": 0.9382668711656442, "grad_norm": 26.80510902404785, "learning_rate": 3.6003408606732e-06, "loss": 0.502, "step": 2447 }, { "epoch": 0.9386503067484663, "grad_norm": 183.5045623779297, "learning_rate": 3.579037068598211e-06, "loss": 4.8023, "step": 2448 }, { "epoch": 0.9390337423312883, "grad_norm": 2.254589080810547, "learning_rate": 3.5577332765232214e-06, "loss": 0.0066, "step": 2449 }, { "epoch": 0.9394171779141104, "grad_norm": 2.363844156265259, "learning_rate": 3.5364294844482316e-06, "loss": 0.0128, "step": 2450 }, { "epoch": 0.9398006134969326, "grad_norm": 0.25697576999664307, "learning_rate": 3.5151256923732426e-06, "loss": 0.0016, "step": 2451 }, { "epoch": 0.9401840490797546, "grad_norm": 145.4252471923828, "learning_rate": 3.493821900298253e-06, "loss": 6.5972, "step": 2452 }, { "epoch": 0.9405674846625767, "grad_norm": 25.66444969177246, "learning_rate": 3.472518108223264e-06, "loss": 0.4866, "step": 2453 }, { "epoch": 0.9409509202453987, "grad_norm": 14.369718551635742, "learning_rate": 3.4512143161482747e-06, "loss": 0.0793, "step": 2454 }, { "epoch": 0.9413343558282209, "grad_norm": 
4.101546287536621, "learning_rate": 3.4299105240732853e-06, "loss": 0.0145, "step": 2455 }, { "epoch": 0.941717791411043, "grad_norm": 15.712298393249512, "learning_rate": 3.4086067319982955e-06, "loss": 0.4119, "step": 2456 }, { "epoch": 0.942101226993865, "grad_norm": 3.878542423248291, "learning_rate": 3.3873029399233065e-06, "loss": 0.0136, "step": 2457 }, { "epoch": 0.9424846625766872, "grad_norm": 9.528055191040039, "learning_rate": 3.365999147848317e-06, "loss": 0.3865, "step": 2458 }, { "epoch": 0.9428680981595092, "grad_norm": 0.7963228225708008, "learning_rate": 3.344695355773328e-06, "loss": 0.0031, "step": 2459 }, { "epoch": 0.9432515337423313, "grad_norm": 1.9365051984786987, "learning_rate": 3.3233915636983387e-06, "loss": 0.0049, "step": 2460 }, { "epoch": 0.9436349693251533, "grad_norm": 13.211302757263184, "learning_rate": 3.302087771623349e-06, "loss": 0.4617, "step": 2461 }, { "epoch": 0.9440184049079755, "grad_norm": 2.683659315109253, "learning_rate": 3.2807839795483594e-06, "loss": 0.0094, "step": 2462 }, { "epoch": 0.9444018404907976, "grad_norm": 9.588644981384277, "learning_rate": 3.2594801874733704e-06, "loss": 0.405, "step": 2463 }, { "epoch": 0.9447852760736196, "grad_norm": 2.5389938354492188, "learning_rate": 3.238176395398381e-06, "loss": 0.01, "step": 2464 }, { "epoch": 0.9451687116564417, "grad_norm": 66.77586364746094, "learning_rate": 3.216872603323392e-06, "loss": 0.6187, "step": 2465 }, { "epoch": 0.9455521472392638, "grad_norm": 7.38784122467041, "learning_rate": 3.1955688112484026e-06, "loss": 0.0334, "step": 2466 }, { "epoch": 0.9459355828220859, "grad_norm": 139.38377380371094, "learning_rate": 3.1742650191734127e-06, "loss": 2.5042, "step": 2467 }, { "epoch": 0.946319018404908, "grad_norm": 79.56858825683594, "learning_rate": 3.1529612270984233e-06, "loss": 0.8926, "step": 2468 }, { "epoch": 0.94670245398773, "grad_norm": 0.7099560499191284, "learning_rate": 3.1316574350234343e-06, "loss": 0.0025, "step": 2469 }, { "epoch": 
0.9470858895705522, "grad_norm": 39.421669006347656, "learning_rate": 3.110353642948445e-06, "loss": 0.6362, "step": 2470 }, { "epoch": 0.9474693251533742, "grad_norm": 20.225257873535156, "learning_rate": 3.0890498508734555e-06, "loss": 0.1151, "step": 2471 }, { "epoch": 0.9478527607361963, "grad_norm": 134.07919311523438, "learning_rate": 3.067746058798466e-06, "loss": 4.6895, "step": 2472 }, { "epoch": 0.9482361963190185, "grad_norm": 1.9007248878479004, "learning_rate": 3.046442266723477e-06, "loss": 0.0071, "step": 2473 }, { "epoch": 0.9486196319018405, "grad_norm": 139.70526123046875, "learning_rate": 3.0251384746484876e-06, "loss": 1.2463, "step": 2474 }, { "epoch": 0.9490030674846626, "grad_norm": 2.7501935958862305, "learning_rate": 3.0038346825734982e-06, "loss": 0.0124, "step": 2475 }, { "epoch": 0.9493865030674846, "grad_norm": 14.394262313842773, "learning_rate": 2.982530890498509e-06, "loss": 0.1665, "step": 2476 }, { "epoch": 0.9497699386503068, "grad_norm": 171.37869262695312, "learning_rate": 2.9612270984235194e-06, "loss": 5.6914, "step": 2477 }, { "epoch": 0.9501533742331288, "grad_norm": 187.74874877929688, "learning_rate": 2.9399233063485304e-06, "loss": 2.7542, "step": 2478 }, { "epoch": 0.9505368098159509, "grad_norm": 140.52798461914062, "learning_rate": 2.918619514273541e-06, "loss": 6.0078, "step": 2479 }, { "epoch": 0.950920245398773, "grad_norm": 32.75605773925781, "learning_rate": 2.8973157221985516e-06, "loss": 0.1206, "step": 2480 }, { "epoch": 0.9513036809815951, "grad_norm": 6.467891693115234, "learning_rate": 2.876011930123562e-06, "loss": 0.0302, "step": 2481 }, { "epoch": 0.9516871165644172, "grad_norm": 83.42328643798828, "learning_rate": 2.854708138048573e-06, "loss": 0.9303, "step": 2482 }, { "epoch": 0.9520705521472392, "grad_norm": 204.3867950439453, "learning_rate": 2.8334043459735833e-06, "loss": 5.5215, "step": 2483 }, { "epoch": 0.9524539877300614, "grad_norm": 4.0695905685424805, "learning_rate": 2.8121005538985943e-06, 
"loss": 0.0194, "step": 2484 }, { "epoch": 0.9528374233128835, "grad_norm": 132.18507385253906, "learning_rate": 2.7907967618236045e-06, "loss": 5.9141, "step": 2485 }, { "epoch": 0.9532208588957055, "grad_norm": 32.51949691772461, "learning_rate": 2.7694929697486155e-06, "loss": 0.1285, "step": 2486 }, { "epoch": 0.9536042944785276, "grad_norm": 0.47829169034957886, "learning_rate": 2.748189177673626e-06, "loss": 0.0027, "step": 2487 }, { "epoch": 0.9539877300613497, "grad_norm": 3.1473779678344727, "learning_rate": 2.7268853855986366e-06, "loss": 0.0097, "step": 2488 }, { "epoch": 0.9543711656441718, "grad_norm": 74.50848388671875, "learning_rate": 2.7055815935236472e-06, "loss": 1.0918, "step": 2489 }, { "epoch": 0.9547546012269938, "grad_norm": 102.51834106445312, "learning_rate": 2.6842778014486582e-06, "loss": 0.9526, "step": 2490 }, { "epoch": 0.9551380368098159, "grad_norm": 0.42178601026535034, "learning_rate": 2.6629740093736684e-06, "loss": 0.0024, "step": 2491 }, { "epoch": 0.9555214723926381, "grad_norm": 2.300245523452759, "learning_rate": 2.6416702172986794e-06, "loss": 0.0092, "step": 2492 }, { "epoch": 0.9559049079754601, "grad_norm": 24.434701919555664, "learning_rate": 2.62036642522369e-06, "loss": 0.0971, "step": 2493 }, { "epoch": 0.9562883435582822, "grad_norm": 0.3528355658054352, "learning_rate": 2.5990626331487006e-06, "loss": 0.0016, "step": 2494 }, { "epoch": 0.9566717791411042, "grad_norm": 15.437116622924805, "learning_rate": 2.577758841073711e-06, "loss": 0.1912, "step": 2495 }, { "epoch": 0.9570552147239264, "grad_norm": 107.94813537597656, "learning_rate": 2.556455048998722e-06, "loss": 1.1475, "step": 2496 }, { "epoch": 0.9574386503067485, "grad_norm": 1.5852389335632324, "learning_rate": 2.5351512569237323e-06, "loss": 0.0063, "step": 2497 }, { "epoch": 0.9578220858895705, "grad_norm": 41.83260726928711, "learning_rate": 2.5138474648487433e-06, "loss": 0.5923, "step": 2498 }, { "epoch": 0.9582055214723927, "grad_norm": 
0.9283663630485535, "learning_rate": 2.492543672773754e-06, "loss": 0.0036, "step": 2499 }, { "epoch": 0.9585889570552147, "grad_norm": 50.718238830566406, "learning_rate": 2.4712398806987645e-06, "loss": 0.7407, "step": 2500 }, { "epoch": 0.9589723926380368, "grad_norm": 3.917775869369507, "learning_rate": 2.449936088623775e-06, "loss": 0.3538, "step": 2501 }, { "epoch": 0.9593558282208589, "grad_norm": 1.7006443738937378, "learning_rate": 2.4286322965487856e-06, "loss": 0.0052, "step": 2502 }, { "epoch": 0.959739263803681, "grad_norm": 181.96173095703125, "learning_rate": 2.4073285044737966e-06, "loss": 1.8067, "step": 2503 }, { "epoch": 0.9601226993865031, "grad_norm": 96.75506591796875, "learning_rate": 2.3860247123988072e-06, "loss": 0.6309, "step": 2504 }, { "epoch": 0.9605061349693251, "grad_norm": 0.08275123685598373, "learning_rate": 2.364720920323818e-06, "loss": 0.0006, "step": 2505 }, { "epoch": 0.9608895705521472, "grad_norm": 65.58185577392578, "learning_rate": 2.3434171282488284e-06, "loss": 1.1084, "step": 2506 }, { "epoch": 0.9612730061349694, "grad_norm": 19.927858352661133, "learning_rate": 2.3221133361738394e-06, "loss": 0.1521, "step": 2507 }, { "epoch": 0.9616564417177914, "grad_norm": 30.809091567993164, "learning_rate": 2.3008095440988495e-06, "loss": 0.5117, "step": 2508 }, { "epoch": 0.9620398773006135, "grad_norm": 117.1274642944336, "learning_rate": 2.2795057520238606e-06, "loss": 0.8424, "step": 2509 }, { "epoch": 0.9624233128834356, "grad_norm": 1.3715885877609253, "learning_rate": 2.258201959948871e-06, "loss": 0.0061, "step": 2510 }, { "epoch": 0.9628067484662577, "grad_norm": 0.5358007550239563, "learning_rate": 2.2368981678738817e-06, "loss": 0.0028, "step": 2511 }, { "epoch": 0.9631901840490797, "grad_norm": 7.49700403213501, "learning_rate": 2.2155943757988923e-06, "loss": 0.2952, "step": 2512 }, { "epoch": 0.9635736196319018, "grad_norm": 0.17554740607738495, "learning_rate": 2.1942905837239033e-06, "loss": 0.0012, "step": 2513 
}, { "epoch": 0.963957055214724, "grad_norm": 86.48233795166016, "learning_rate": 2.1729867916489135e-06, "loss": 0.9214, "step": 2514 }, { "epoch": 0.964340490797546, "grad_norm": 1.360700011253357, "learning_rate": 2.1516829995739245e-06, "loss": 0.0034, "step": 2515 }, { "epoch": 0.9647239263803681, "grad_norm": 0.373241662979126, "learning_rate": 2.1303792074989346e-06, "loss": 0.0017, "step": 2516 }, { "epoch": 0.9651073619631901, "grad_norm": 0.5386194586753845, "learning_rate": 2.1090754154239456e-06, "loss": 0.0038, "step": 2517 }, { "epoch": 0.9654907975460123, "grad_norm": 0.5087804198265076, "learning_rate": 2.087771623348956e-06, "loss": 0.0014, "step": 2518 }, { "epoch": 0.9658742331288344, "grad_norm": 2.500549077987671, "learning_rate": 2.066467831273967e-06, "loss": 0.0079, "step": 2519 }, { "epoch": 0.9662576687116564, "grad_norm": 16.712366104125977, "learning_rate": 2.0451640391989774e-06, "loss": 0.4468, "step": 2520 }, { "epoch": 0.9666411042944786, "grad_norm": 29.929351806640625, "learning_rate": 2.0238602471239884e-06, "loss": 0.0795, "step": 2521 }, { "epoch": 0.9670245398773006, "grad_norm": 121.40982055664062, "learning_rate": 2.0025564550489985e-06, "loss": 1.1143, "step": 2522 }, { "epoch": 0.9674079754601227, "grad_norm": 1.408011555671692, "learning_rate": 1.9812526629740095e-06, "loss": 0.0031, "step": 2523 }, { "epoch": 0.9677914110429447, "grad_norm": 136.08197021484375, "learning_rate": 1.95994887089902e-06, "loss": 2.3304, "step": 2524 }, { "epoch": 0.9681748466257669, "grad_norm": 1.030416488647461, "learning_rate": 1.9386450788240307e-06, "loss": 0.0036, "step": 2525 }, { "epoch": 0.968558282208589, "grad_norm": 10.150887489318848, "learning_rate": 1.9173412867490413e-06, "loss": 0.4077, "step": 2526 }, { "epoch": 0.968941717791411, "grad_norm": 2.4118311405181885, "learning_rate": 1.8960374946740523e-06, "loss": 0.0071, "step": 2527 }, { "epoch": 0.9693251533742331, "grad_norm": 244.69618225097656, "learning_rate": 
1.8747337025990627e-06, "loss": 2.379, "step": 2528 }, { "epoch": 0.9697085889570553, "grad_norm": 36.92804718017578, "learning_rate": 1.8534299105240735e-06, "loss": 0.6929, "step": 2529 }, { "epoch": 0.9700920245398773, "grad_norm": 36.10527420043945, "learning_rate": 1.8321261184490838e-06, "loss": 0.0977, "step": 2530 }, { "epoch": 0.9704754601226994, "grad_norm": 71.33319854736328, "learning_rate": 1.8108223263740946e-06, "loss": 0.8179, "step": 2531 }, { "epoch": 0.9708588957055214, "grad_norm": 1.6740525960922241, "learning_rate": 1.7895185342991054e-06, "loss": 0.0041, "step": 2532 }, { "epoch": 0.9712423312883436, "grad_norm": 5.817898273468018, "learning_rate": 1.7682147422241158e-06, "loss": 0.3701, "step": 2533 }, { "epoch": 0.9716257668711656, "grad_norm": 38.433372497558594, "learning_rate": 1.7469109501491266e-06, "loss": 0.5977, "step": 2534 }, { "epoch": 0.9720092024539877, "grad_norm": 8.080559730529785, "learning_rate": 1.7256071580741374e-06, "loss": 0.0315, "step": 2535 }, { "epoch": 0.9723926380368099, "grad_norm": 40.872737884521484, "learning_rate": 1.7043033659991477e-06, "loss": 0.7588, "step": 2536 }, { "epoch": 0.9727760736196319, "grad_norm": 123.65471649169922, "learning_rate": 1.6829995739241585e-06, "loss": 5.457, "step": 2537 }, { "epoch": 0.973159509202454, "grad_norm": 1.4213929176330566, "learning_rate": 1.6616957818491693e-06, "loss": 0.0041, "step": 2538 }, { "epoch": 0.973542944785276, "grad_norm": 17.070253372192383, "learning_rate": 1.6403919897741797e-06, "loss": 0.027, "step": 2539 }, { "epoch": 0.9739263803680982, "grad_norm": 124.78246307373047, "learning_rate": 1.6190881976991905e-06, "loss": 1.8664, "step": 2540 }, { "epoch": 0.9743098159509203, "grad_norm": 182.86981201171875, "learning_rate": 1.5977844056242013e-06, "loss": 6.3516, "step": 2541 }, { "epoch": 0.9746932515337423, "grad_norm": 1.20124089717865, "learning_rate": 1.5764806135492117e-06, "loss": 0.0052, "step": 2542 }, { "epoch": 0.9750766871165644, 
"grad_norm": 36.85075759887695, "learning_rate": 1.5551768214742224e-06, "loss": 0.4468, "step": 2543 }, { "epoch": 0.9754601226993865, "grad_norm": 5.178058624267578, "learning_rate": 1.533873029399233e-06, "loss": 0.0122, "step": 2544 }, { "epoch": 0.9758435582822086, "grad_norm": 0.5216243863105774, "learning_rate": 1.5125692373242438e-06, "loss": 0.0021, "step": 2545 }, { "epoch": 0.9762269938650306, "grad_norm": 1.5058411359786987, "learning_rate": 1.4912654452492544e-06, "loss": 0.0042, "step": 2546 }, { "epoch": 0.9766104294478528, "grad_norm": 1.4786838293075562, "learning_rate": 1.4699616531742652e-06, "loss": 0.0051, "step": 2547 }, { "epoch": 0.9769938650306749, "grad_norm": 0.48048126697540283, "learning_rate": 1.4486578610992758e-06, "loss": 0.0025, "step": 2548 }, { "epoch": 0.9773773006134969, "grad_norm": 0.6981642842292786, "learning_rate": 1.4273540690242866e-06, "loss": 0.0047, "step": 2549 }, { "epoch": 0.977760736196319, "grad_norm": 0.47611889243125916, "learning_rate": 1.4060502769492972e-06, "loss": 0.0037, "step": 2550 }, { "epoch": 0.9781441717791411, "grad_norm": 1.5145431756973267, "learning_rate": 1.3847464848743077e-06, "loss": 0.0091, "step": 2551 }, { "epoch": 0.9785276073619632, "grad_norm": 4.604262351989746, "learning_rate": 1.3634426927993183e-06, "loss": 0.0241, "step": 2552 }, { "epoch": 0.9789110429447853, "grad_norm": 221.05865478515625, "learning_rate": 1.3421389007243291e-06, "loss": 1.8107, "step": 2553 }, { "epoch": 0.9792944785276073, "grad_norm": 97.07608032226562, "learning_rate": 1.3208351086493397e-06, "loss": 0.9634, "step": 2554 }, { "epoch": 0.9796779141104295, "grad_norm": 16.28598976135254, "learning_rate": 1.2995313165743503e-06, "loss": 0.5479, "step": 2555 }, { "epoch": 0.9800613496932515, "grad_norm": 204.9497528076172, "learning_rate": 1.278227524499361e-06, "loss": 2.7604, "step": 2556 }, { "epoch": 0.9804447852760736, "grad_norm": 215.8769073486328, "learning_rate": 1.2569237324243717e-06, "loss": 6.1328, 
"step": 2557 }, { "epoch": 0.9808282208588958, "grad_norm": 22.42989730834961, "learning_rate": 1.2356199403493822e-06, "loss": 0.5039, "step": 2558 }, { "epoch": 0.9812116564417178, "grad_norm": 107.1098861694336, "learning_rate": 1.2143161482743928e-06, "loss": 2.715, "step": 2559 }, { "epoch": 0.9815950920245399, "grad_norm": 25.10755729675293, "learning_rate": 1.1930123561994036e-06, "loss": 0.1469, "step": 2560 }, { "epoch": 0.9819785276073619, "grad_norm": 12.362627983093262, "learning_rate": 1.1717085641244142e-06, "loss": 0.4625, "step": 2561 }, { "epoch": 0.9823619631901841, "grad_norm": 0.613105833530426, "learning_rate": 1.1504047720494248e-06, "loss": 0.0022, "step": 2562 }, { "epoch": 0.9827453987730062, "grad_norm": 155.92428588867188, "learning_rate": 1.1291009799744356e-06, "loss": 0.8775, "step": 2563 }, { "epoch": 0.9831288343558282, "grad_norm": 71.01866149902344, "learning_rate": 1.1077971878994461e-06, "loss": 0.8364, "step": 2564 }, { "epoch": 0.9835122699386503, "grad_norm": 5.93011474609375, "learning_rate": 1.0864933958244567e-06, "loss": 0.0209, "step": 2565 }, { "epoch": 0.9838957055214724, "grad_norm": 237.0297393798828, "learning_rate": 1.0651896037494673e-06, "loss": 2.2697, "step": 2566 }, { "epoch": 0.9842791411042945, "grad_norm": 1.2019493579864502, "learning_rate": 1.043885811674478e-06, "loss": 0.0051, "step": 2567 }, { "epoch": 0.9846625766871165, "grad_norm": 0.4159267544746399, "learning_rate": 1.0225820195994887e-06, "loss": 0.0015, "step": 2568 }, { "epoch": 0.9850460122699386, "grad_norm": 9.824392318725586, "learning_rate": 1.0012782275244993e-06, "loss": 0.397, "step": 2569 }, { "epoch": 0.9854294478527608, "grad_norm": 153.7998809814453, "learning_rate": 9.7997443544951e-07, "loss": 1.3145, "step": 2570 }, { "epoch": 0.9858128834355828, "grad_norm": 135.33055114746094, "learning_rate": 9.586706433745206e-07, "loss": 0.7295, "step": 2571 }, { "epoch": 0.9861963190184049, "grad_norm": 16.33224868774414, "learning_rate": 
9.373668512995313e-07, "loss": 0.1193, "step": 2572 }, { "epoch": 0.986579754601227, "grad_norm": 20.235445022583008, "learning_rate": 9.160630592245419e-07, "loss": 0.4543, "step": 2573 }, { "epoch": 0.9869631901840491, "grad_norm": 6.205273151397705, "learning_rate": 8.947592671495527e-07, "loss": 0.0212, "step": 2574 }, { "epoch": 0.9873466257668712, "grad_norm": 56.99029541015625, "learning_rate": 8.734554750745633e-07, "loss": 0.8604, "step": 2575 }, { "epoch": 0.9877300613496932, "grad_norm": 482.4648742675781, "learning_rate": 8.521516829995739e-07, "loss": 2.7152, "step": 2576 }, { "epoch": 0.9881134969325154, "grad_norm": 102.6148910522461, "learning_rate": 8.308478909245847e-07, "loss": 2.2579, "step": 2577 }, { "epoch": 0.9884969325153374, "grad_norm": 100.00586700439453, "learning_rate": 8.095440988495952e-07, "loss": 1.3702, "step": 2578 }, { "epoch": 0.9888803680981595, "grad_norm": 0.8538267016410828, "learning_rate": 7.882403067746058e-07, "loss": 0.002, "step": 2579 }, { "epoch": 0.9892638036809815, "grad_norm": 14.852790832519531, "learning_rate": 7.669365146996165e-07, "loss": 0.4805, "step": 2580 }, { "epoch": 0.9896472392638037, "grad_norm": 0.8749407529830933, "learning_rate": 7.456327226246272e-07, "loss": 0.0018, "step": 2581 }, { "epoch": 0.9900306748466258, "grad_norm": 183.30038452148438, "learning_rate": 7.243289305496379e-07, "loss": 0.9742, "step": 2582 }, { "epoch": 0.9904141104294478, "grad_norm": 87.85253143310547, "learning_rate": 7.030251384746486e-07, "loss": 0.7261, "step": 2583 }, { "epoch": 0.99079754601227, "grad_norm": 53.84426498413086, "learning_rate": 6.817213463996592e-07, "loss": 0.9238, "step": 2584 }, { "epoch": 0.991180981595092, "grad_norm": 12.457634925842285, "learning_rate": 6.604175543246698e-07, "loss": 0.0502, "step": 2585 }, { "epoch": 0.9915644171779141, "grad_norm": 2.8092494010925293, "learning_rate": 6.391137622496805e-07, "loss": 0.0063, "step": 2586 }, { "epoch": 0.9919478527607362, "grad_norm": 
24.783527374267578, "learning_rate": 6.178099701746911e-07, "loss": 0.0685, "step": 2587 }, { "epoch": 0.9923312883435583, "grad_norm": 13.74468994140625, "learning_rate": 5.965061780997018e-07, "loss": 0.1213, "step": 2588 }, { "epoch": 0.9927147239263804, "grad_norm": 1.3093582391738892, "learning_rate": 5.752023860247124e-07, "loss": 0.0043, "step": 2589 }, { "epoch": 0.9930981595092024, "grad_norm": 152.93663024902344, "learning_rate": 5.538985939497231e-07, "loss": 3.19, "step": 2590 }, { "epoch": 0.9934815950920245, "grad_norm": 31.667722702026367, "learning_rate": 5.325948018747337e-07, "loss": 0.4617, "step": 2591 }, { "epoch": 0.9938650306748467, "grad_norm": 46.76748275756836, "learning_rate": 5.112910097997443e-07, "loss": 0.5439, "step": 2592 }, { "epoch": 0.9942484662576687, "grad_norm": 2.7833364009857178, "learning_rate": 4.89987217724755e-07, "loss": 0.0067, "step": 2593 }, { "epoch": 0.9946319018404908, "grad_norm": 49.33405685424805, "learning_rate": 4.6868342564976567e-07, "loss": 0.0971, "step": 2594 }, { "epoch": 0.9950153374233128, "grad_norm": 137.8705596923828, "learning_rate": 4.4737963357477635e-07, "loss": 2.4572, "step": 2595 }, { "epoch": 0.995398773006135, "grad_norm": 160.5074005126953, "learning_rate": 4.2607584149978694e-07, "loss": 6.1523, "step": 2596 }, { "epoch": 0.995782208588957, "grad_norm": 16.433561325073242, "learning_rate": 4.047720494247976e-07, "loss": 0.0363, "step": 2597 }, { "epoch": 0.9961656441717791, "grad_norm": 3.9944512844085693, "learning_rate": 3.8346825734980826e-07, "loss": 0.0101, "step": 2598 }, { "epoch": 0.9965490797546013, "grad_norm": 0.3866381347179413, "learning_rate": 3.6216446527481895e-07, "loss": 0.0019, "step": 2599 }, { "epoch": 0.9969325153374233, "grad_norm": 0.9906994700431824, "learning_rate": 3.408606731998296e-07, "loss": 0.0035, "step": 2600 }, { "epoch": 0.9973159509202454, "grad_norm": 0.24632364511489868, "learning_rate": 3.1955688112484027e-07, "loss": 0.0017, "step": 2601 }, { 
"epoch": 0.9976993865030674, "grad_norm": 2.1847798824310303, "learning_rate": 2.982530890498509e-07, "loss": 0.0062, "step": 2602 }, { "epoch": 0.9980828220858896, "grad_norm": 22.774734497070312, "learning_rate": 2.7694929697486154e-07, "loss": 0.1135, "step": 2603 }, { "epoch": 0.9984662576687117, "grad_norm": 13.302017211914062, "learning_rate": 2.5564550489987217e-07, "loss": 0.0401, "step": 2604 }, { "epoch": 0.9988496932515337, "grad_norm": 3.474975109100342, "learning_rate": 2.3434171282488283e-07, "loss": 0.0167, "step": 2605 }, { "epoch": 0.9992331288343558, "grad_norm": 64.99967956542969, "learning_rate": 2.1303792074989347e-07, "loss": 0.8428, "step": 2606 }, { "epoch": 0.9996165644171779, "grad_norm": 15.43187427520752, "learning_rate": 1.9173412867490413e-07, "loss": 0.1441, "step": 2607 }, { "epoch": 1.0, "grad_norm": 18.857643127441406, "learning_rate": 1.704303365999148e-07, "loss": 0.0876, "step": 2608 }, { "epoch": 1.0, "eval_accuracy": 0.5, "eval_auc": 0.8125, "eval_f1": 0.6666666666666666, "eval_loss": 1.8160488605499268, "eval_precision": 0.5, "eval_recall": 1.0, "eval_runtime": 0.3919, "eval_samples_per_second": 40.827, "eval_steps_per_second": 10.207, "step": 2608 } ], "logging_steps": 1, "max_steps": 2608, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1076386427961344e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }