{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1389, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021598272138228943, "grad_norm": 2.878943681716919, "learning_rate": 7.194244604316547e-08, "loss": 0.7835, "step": 1 }, { "epoch": 0.004319654427645789, "grad_norm": 2.902249813079834, "learning_rate": 1.4388489208633095e-07, "loss": 0.7896, "step": 2 }, { "epoch": 0.0064794816414686825, "grad_norm": 2.8640873432159424, "learning_rate": 2.1582733812949643e-07, "loss": 0.7588, "step": 3 }, { "epoch": 0.008639308855291577, "grad_norm": 2.825040102005005, "learning_rate": 2.877697841726619e-07, "loss": 0.779, "step": 4 }, { "epoch": 0.01079913606911447, "grad_norm": 2.9926884174346924, "learning_rate": 3.5971223021582736e-07, "loss": 0.7816, "step": 5 }, { "epoch": 0.012958963282937365, "grad_norm": 2.8692467212677, "learning_rate": 4.3165467625899287e-07, "loss": 0.7695, "step": 6 }, { "epoch": 0.01511879049676026, "grad_norm": 2.79731822013855, "learning_rate": 5.035971223021583e-07, "loss": 0.7683, "step": 7 }, { "epoch": 0.017278617710583154, "grad_norm": 2.832988739013672, "learning_rate": 5.755395683453238e-07, "loss": 0.7865, "step": 8 }, { "epoch": 0.019438444924406047, "grad_norm": 2.787931442260742, "learning_rate": 6.474820143884893e-07, "loss": 0.7716, "step": 9 }, { "epoch": 0.02159827213822894, "grad_norm": 2.6542158126831055, "learning_rate": 7.194244604316547e-07, "loss": 0.7708, "step": 10 }, { "epoch": 0.023758099352051837, "grad_norm": 2.5756170749664307, "learning_rate": 7.913669064748202e-07, "loss": 0.7548, "step": 11 }, { "epoch": 0.02591792656587473, "grad_norm": 2.2221007347106934, "learning_rate": 8.633093525179857e-07, "loss": 0.7544, "step": 12 }, { "epoch": 0.028077753779697623, "grad_norm": 2.165950298309326, "learning_rate": 9.352517985611512e-07, "loss": 0.7345, "step": 13 }, { "epoch": 0.03023758099352052, "grad_norm": 2.1415212154388428, "learning_rate": 1.0071942446043167e-06, "loss": 0.7375, "step": 14 }, { "epoch": 0.032397408207343416, "grad_norm": 2.045217275619507, "learning_rate": 1.079136690647482e-06, "loss": 0.7251, "step": 15 }, { "epoch": 0.03455723542116631, "grad_norm": 1.8833245038986206, "learning_rate": 1.1510791366906476e-06, "loss": 0.734, "step": 16 }, { "epoch": 0.0367170626349892, "grad_norm": 1.4383106231689453, "learning_rate": 1.2230215827338131e-06, "loss": 0.7126, "step": 17 }, { "epoch": 0.038876889848812095, "grad_norm": 1.3764389753341675, "learning_rate": 1.2949640287769785e-06, "loss": 0.69, "step": 18 }, { "epoch": 0.04103671706263499, "grad_norm": 1.3699392080307007, "learning_rate": 1.366906474820144e-06, "loss": 0.7071, "step": 19 }, { "epoch": 0.04319654427645788, "grad_norm": 1.2943273782730103, "learning_rate": 1.4388489208633094e-06, "loss": 0.686, "step": 20 }, { "epoch": 0.04535637149028078, "grad_norm": 1.2634108066558838, "learning_rate": 1.510791366906475e-06, "loss": 0.6902, "step": 21 }, { "epoch": 0.047516198704103674, "grad_norm": 1.066751480102539, "learning_rate": 1.5827338129496403e-06, "loss": 0.6644, "step": 22 }, { "epoch": 0.04967602591792657, "grad_norm": 1.004930019378662, "learning_rate": 1.654676258992806e-06, "loss": 0.6602, "step": 23 }, { "epoch": 0.05183585313174946, "grad_norm": 0.9834485054016113, "learning_rate": 1.7266187050359715e-06, "loss": 0.6525, "step": 24 }, { "epoch": 0.05399568034557235, "grad_norm": 0.9758538007736206, "learning_rate": 1.7985611510791368e-06, "loss": 0.6452, "step": 25 }, { "epoch": 0.056155507559395246, "grad_norm": 0.9222759008407593, "learning_rate": 1.8705035971223024e-06, "loss": 0.6485, "step": 26 }, { "epoch": 0.058315334773218146, "grad_norm": 0.8775356411933899, "learning_rate": 1.942446043165468e-06, "loss": 0.6388, "step": 27 }, { "epoch": 0.06047516198704104, "grad_norm": 0.8008519411087036, "learning_rate": 2.0143884892086333e-06, "loss": 0.6328, "step": 28 }, { "epoch": 0.06263498920086392, "grad_norm": 0.7609057426452637, "learning_rate": 2.0863309352517987e-06, "loss": 0.6253, "step": 29 }, { "epoch": 0.06479481641468683, "grad_norm": 0.6197890043258667, "learning_rate": 2.158273381294964e-06, "loss": 0.6253, "step": 30 }, { "epoch": 0.06695464362850972, "grad_norm": 0.6675652265548706, "learning_rate": 2.23021582733813e-06, "loss": 0.605, "step": 31 }, { "epoch": 0.06911447084233262, "grad_norm": 0.6976248621940613, "learning_rate": 2.302158273381295e-06, "loss": 0.6077, "step": 32 }, { "epoch": 0.07127429805615551, "grad_norm": 0.6653661131858826, "learning_rate": 2.3741007194244605e-06, "loss": 0.6021, "step": 33 }, { "epoch": 0.0734341252699784, "grad_norm": 0.6243202090263367, "learning_rate": 2.4460431654676263e-06, "loss": 0.6147, "step": 34 }, { "epoch": 0.0755939524838013, "grad_norm": 0.5303459167480469, "learning_rate": 2.5179856115107916e-06, "loss": 0.6, "step": 35 }, { "epoch": 0.07775377969762419, "grad_norm": 0.48958107829093933, "learning_rate": 2.589928057553957e-06, "loss": 0.5829, "step": 36 }, { "epoch": 0.07991360691144708, "grad_norm": 0.4979974031448364, "learning_rate": 2.6618705035971228e-06, "loss": 0.5852, "step": 37 }, { "epoch": 0.08207343412526998, "grad_norm": 0.508642852306366, "learning_rate": 2.733812949640288e-06, "loss": 0.5827, "step": 38 }, { "epoch": 0.08423326133909287, "grad_norm": 0.5054506063461304, "learning_rate": 2.805755395683453e-06, "loss": 0.5627, "step": 39 }, { "epoch": 0.08639308855291576, "grad_norm": 0.42791351675987244, "learning_rate": 2.877697841726619e-06, "loss": 0.557, "step": 40 }, { "epoch": 0.08855291576673865, "grad_norm": 0.3770763874053955, "learning_rate": 2.949640287769784e-06, "loss": 0.5452, "step": 41 }, { "epoch": 0.09071274298056156, "grad_norm": 0.38157957792282104, "learning_rate": 3.02158273381295e-06, "loss": 0.5552, "step": 42 }, { "epoch": 0.09287257019438445, "grad_norm": 0.4018012583255768, "learning_rate": 3.0935251798561158e-06, "loss": 0.5559, "step": 43 }, { "epoch": 0.09503239740820735, "grad_norm": 0.3959904611110687, "learning_rate": 3.1654676258992807e-06, "loss": 0.5493, "step": 44 }, { "epoch": 0.09719222462203024, "grad_norm": 0.38622933626174927, "learning_rate": 3.237410071942446e-06, "loss": 0.5512, "step": 45 }, { "epoch": 0.09935205183585313, "grad_norm": 0.3973333239555359, "learning_rate": 3.309352517985612e-06, "loss": 0.5413, "step": 46 }, { "epoch": 0.10151187904967603, "grad_norm": 0.3897247910499573, "learning_rate": 3.381294964028777e-06, "loss": 0.5223, "step": 47 }, { "epoch": 0.10367170626349892, "grad_norm": 0.37678107619285583, "learning_rate": 3.453237410071943e-06, "loss": 0.5296, "step": 48 }, { "epoch": 0.10583153347732181, "grad_norm": 0.33324435353279114, "learning_rate": 3.525179856115108e-06, "loss": 0.5184, "step": 49 }, { "epoch": 0.1079913606911447, "grad_norm": 0.303320974111557, "learning_rate": 3.5971223021582737e-06, "loss": 0.5331, "step": 50 }, { "epoch": 0.1101511879049676, "grad_norm": 0.30076754093170166, "learning_rate": 3.669064748201439e-06, "loss": 0.5331, "step": 51 }, { "epoch": 0.11231101511879049, "grad_norm": 0.2589012086391449, "learning_rate": 3.741007194244605e-06, "loss": 0.5109, "step": 52 }, { "epoch": 0.11447084233261338, "grad_norm": 0.2596394121646881, "learning_rate": 3.81294964028777e-06, "loss": 0.5227, "step": 53 }, { "epoch": 0.11663066954643629, "grad_norm": 0.255307137966156, "learning_rate": 3.884892086330936e-06, "loss": 0.5169, "step": 54 }, { "epoch": 0.11879049676025918, "grad_norm": 0.2433944046497345, "learning_rate": 3.956834532374101e-06, "loss": 0.5161, "step": 55 }, { "epoch": 0.12095032397408208, "grad_norm": 0.2333260476589203, "learning_rate": 4.028776978417267e-06, "loss": 0.5096, "step": 56 }, { "epoch": 0.12311015118790497, "grad_norm": 0.22751125693321228, "learning_rate": 4.100719424460432e-06, "loss": 0.5115, "step": 57 }, { "epoch": 0.12526997840172785, "grad_norm": 0.2149927169084549, "learning_rate": 4.172661870503597e-06, "loss": 0.5132, "step": 58 }, { "epoch": 0.12742980561555076, "grad_norm": 0.22358939051628113, "learning_rate": 4.244604316546763e-06, "loss": 0.5057, "step": 59 }, { "epoch": 0.12958963282937366, "grad_norm": 0.19954045116901398, "learning_rate": 4.316546762589928e-06, "loss": 0.4994, "step": 60 }, { "epoch": 0.13174946004319654, "grad_norm": 0.1936485469341278, "learning_rate": 4.388489208633094e-06, "loss": 0.4954, "step": 61 }, { "epoch": 0.13390928725701945, "grad_norm": 0.1977352499961853, "learning_rate": 4.46043165467626e-06, "loss": 0.4919, "step": 62 }, { "epoch": 0.13606911447084233, "grad_norm": 0.19697633385658264, "learning_rate": 4.5323741007194245e-06, "loss": 0.4895, "step": 63 }, { "epoch": 0.13822894168466524, "grad_norm": 0.2068362534046173, "learning_rate": 4.60431654676259e-06, "loss": 0.4848, "step": 64 }, { "epoch": 0.14038876889848811, "grad_norm": 0.2056417018175125, "learning_rate": 4.676258992805755e-06, "loss": 0.4901, "step": 65 }, { "epoch": 0.14254859611231102, "grad_norm": 0.20445196330547333, "learning_rate": 4.748201438848921e-06, "loss": 0.4986, "step": 66 }, { "epoch": 0.1447084233261339, "grad_norm": 0.17678698897361755, "learning_rate": 4.820143884892087e-06, "loss": 0.4784, "step": 67 }, { "epoch": 0.1468682505399568, "grad_norm": 0.17606988549232483, "learning_rate": 4.892086330935253e-06, "loss": 0.4818, "step": 68 }, { "epoch": 0.1490280777537797, "grad_norm": 0.1764959990978241, "learning_rate": 4.9640287769784175e-06, "loss": 0.4832, "step": 69 }, { "epoch": 0.1511879049676026, "grad_norm": 0.18899278342723846, "learning_rate": 5.035971223021583e-06, "loss": 0.4832, "step": 70 }, { "epoch": 0.15334773218142547, "grad_norm": 0.18127930164337158, "learning_rate": 5.107913669064749e-06, "loss": 0.4781, "step": 71 }, { "epoch": 0.15550755939524838, "grad_norm": 0.15677423775196075, "learning_rate": 5.179856115107914e-06, "loss": 0.4795, "step": 72 }, { "epoch": 0.15766738660907129, "grad_norm": 0.17852047085762024, "learning_rate": 5.251798561151079e-06, "loss": 0.4802, "step": 73 }, { "epoch": 0.15982721382289417, "grad_norm": 0.16051283478736877, "learning_rate": 5.3237410071942456e-06, "loss": 0.4758, "step": 74 }, { "epoch": 0.16198704103671707, "grad_norm": 0.15272092819213867, "learning_rate": 5.3956834532374105e-06, "loss": 0.4742, "step": 75 }, { "epoch": 0.16414686825053995, "grad_norm": 0.18069250881671906, "learning_rate": 5.467625899280576e-06, "loss": 0.4788, "step": 76 }, { "epoch": 0.16630669546436286, "grad_norm": 0.18495260179042816, "learning_rate": 5.539568345323741e-06, "loss": 0.477, "step": 77 }, { "epoch": 0.16846652267818574, "grad_norm": 0.15244323015213013, "learning_rate": 5.611510791366906e-06, "loss": 0.4738, "step": 78 }, { "epoch": 0.17062634989200864, "grad_norm": 0.15029869973659515, "learning_rate": 5.683453237410073e-06, "loss": 0.4809, "step": 79 }, { "epoch": 0.17278617710583152, "grad_norm": 0.15908615291118622, "learning_rate": 5.755395683453238e-06, "loss": 0.4682, "step": 80 }, { "epoch": 0.17494600431965443, "grad_norm": 0.16395969688892365, "learning_rate": 5.8273381294964035e-06, "loss": 0.4786, "step": 81 }, { "epoch": 0.1771058315334773, "grad_norm": 0.15997102856636047, "learning_rate": 5.899280575539568e-06, "loss": 0.4728, "step": 82 }, { "epoch": 0.17926565874730022, "grad_norm": 0.15442821383476257, "learning_rate": 5.971223021582734e-06, "loss": 0.4693, "step": 83 }, { "epoch": 0.18142548596112312, "grad_norm": 0.17457455396652222, "learning_rate": 6.0431654676259e-06, "loss": 0.4535, "step": 84 }, { "epoch": 0.183585313174946, "grad_norm": 0.17761239409446716, "learning_rate": 6.115107913669065e-06, "loss": 0.4615, "step": 85 }, { "epoch": 0.1857451403887689, "grad_norm": 0.15749000012874603, "learning_rate": 6.1870503597122315e-06, "loss": 0.4757, "step": 86 }, { "epoch": 0.1879049676025918, "grad_norm": 0.1500880867242813, "learning_rate": 6.2589928057553964e-06, "loss": 0.468, "step": 87 }, { "epoch": 0.1900647948164147, "grad_norm": 0.16475360095500946, "learning_rate": 6.330935251798561e-06, "loss": 0.453, "step": 88 }, { "epoch": 0.19222462203023757, "grad_norm": 0.15528172254562378, "learning_rate": 6.402877697841727e-06, "loss": 0.4606, "step": 89 }, { "epoch": 0.19438444924406048, "grad_norm": 0.18330231308937073, "learning_rate": 6.474820143884892e-06, "loss": 0.4645, "step": 90 }, { "epoch": 0.19654427645788336, "grad_norm": 0.15349973738193512, "learning_rate": 6.546762589928059e-06, "loss": 0.4589, "step": 91 }, { "epoch": 0.19870410367170627, "grad_norm": 0.17889103293418884, "learning_rate": 6.618705035971224e-06, "loss": 0.4698, "step": 92 }, { "epoch": 0.20086393088552915, "grad_norm": 0.16917382180690765, "learning_rate": 6.6906474820143886e-06, "loss": 0.45, "step": 93 }, { "epoch": 0.20302375809935205, "grad_norm": 0.15472815930843353, "learning_rate": 6.762589928057554e-06, "loss": 0.4554, "step": 94 }, { "epoch": 0.20518358531317496, "grad_norm": 0.15166456997394562, "learning_rate": 6.834532374100719e-06, "loss": 0.4603, "step": 95 }, { "epoch": 0.20734341252699784, "grad_norm": 0.15480853617191315, "learning_rate": 6.906474820143886e-06, "loss": 0.4527, "step": 96 }, { "epoch": 0.20950323974082075, "grad_norm": 0.18076568841934204, "learning_rate": 6.978417266187051e-06, "loss": 0.4543, "step": 97 }, { "epoch": 0.21166306695464362, "grad_norm": 0.14898645877838135, "learning_rate": 7.050359712230216e-06, "loss": 0.4645, "step": 98 }, { "epoch": 0.21382289416846653, "grad_norm": 0.16191677749156952, "learning_rate": 7.122302158273382e-06, "loss": 0.4556, "step": 99 }, { "epoch": 0.2159827213822894, "grad_norm": 0.15693144500255585, "learning_rate": 7.194244604316547e-06, "loss": 0.4636, "step": 100 }, { "epoch": 0.21814254859611232, "grad_norm": 0.1577419489622116, "learning_rate": 7.266187050359713e-06, "loss": 0.445, "step": 101 }, { "epoch": 0.2203023758099352, "grad_norm": 0.1567850261926651, "learning_rate": 7.338129496402878e-06, "loss": 0.4579, "step": 102 }, { "epoch": 0.2224622030237581, "grad_norm": 0.15102896094322205, "learning_rate": 7.410071942446043e-06, "loss": 0.4381, "step": 103 }, { "epoch": 0.22462203023758098, "grad_norm": 0.18107765913009644, "learning_rate": 7.48201438848921e-06, "loss": 0.4479, "step": 104 }, { "epoch": 0.2267818574514039, "grad_norm": 0.15492849051952362, "learning_rate": 7.5539568345323745e-06, "loss": 0.4466, "step": 105 }, { "epoch": 0.22894168466522677, "grad_norm": 0.16862063109874725, "learning_rate": 7.62589928057554e-06, "loss": 0.4556, "step": 106 }, { "epoch": 0.23110151187904968, "grad_norm": 0.1701633483171463, "learning_rate": 7.697841726618706e-06, "loss": 0.4483, "step": 107 }, { "epoch": 0.23326133909287258, "grad_norm": 0.18902191519737244, "learning_rate": 7.769784172661872e-06, "loss": 0.4383, "step": 108 }, { "epoch": 0.23542116630669546, "grad_norm": 0.16331182420253754, "learning_rate": 7.841726618705036e-06, "loss": 0.4438, "step": 109 }, { "epoch": 0.23758099352051837, "grad_norm": 0.18327923119068146, "learning_rate": 7.913669064748202e-06, "loss": 0.4535, "step": 110 }, { "epoch": 0.23974082073434125, "grad_norm": 0.16586214303970337, "learning_rate": 7.985611510791367e-06, "loss": 0.452, "step": 111 }, { "epoch": 0.24190064794816415, "grad_norm": 0.1756211370229721, "learning_rate": 8.057553956834533e-06, "loss": 0.4461, "step": 112 }, { "epoch": 0.24406047516198703, "grad_norm": 0.17397738993167877, "learning_rate": 8.129496402877699e-06, "loss": 0.4444, "step": 113 }, { "epoch": 0.24622030237580994, "grad_norm": 0.1517469584941864, "learning_rate": 8.201438848920865e-06, "loss": 0.4423, "step": 114 }, { "epoch": 0.24838012958963282, "grad_norm": 0.15296703577041626, "learning_rate": 8.273381294964029e-06, "loss": 0.4434, "step": 115 }, { "epoch": 0.2505399568034557, "grad_norm": 0.17677851021289825, "learning_rate": 8.345323741007195e-06, "loss": 0.4352, "step": 116 }, { "epoch": 0.2526997840172786, "grad_norm": 0.1546233892440796, "learning_rate": 8.41726618705036e-06, "loss": 0.4416, "step": 117 }, { "epoch": 0.2548596112311015, "grad_norm": 0.17565761506557465, "learning_rate": 8.489208633093526e-06, "loss": 0.4484, "step": 118 }, { "epoch": 0.2570194384449244, "grad_norm": 0.1443185657262802, "learning_rate": 8.561151079136692e-06, "loss": 0.4291, "step": 119 }, { "epoch": 0.2591792656587473, "grad_norm": 0.17720922827720642, "learning_rate": 8.633093525179856e-06, "loss": 0.4356, "step": 120 }, { "epoch": 0.2613390928725702, "grad_norm": 0.17487414181232452, "learning_rate": 8.705035971223022e-06, "loss": 0.4465, "step": 121 }, { "epoch": 0.2634989200863931, "grad_norm": 0.16723576188087463, "learning_rate": 8.776978417266188e-06, "loss": 0.4463, "step": 122 }, { "epoch": 0.265658747300216, "grad_norm": 0.19939404726028442, "learning_rate": 8.848920863309353e-06, "loss": 0.4387, "step": 123 }, { "epoch": 0.2678185745140389, "grad_norm": 0.1569490283727646, "learning_rate": 8.92086330935252e-06, "loss": 0.4332, "step": 124 }, { "epoch": 0.26997840172786175, "grad_norm": 0.17922881245613098, "learning_rate": 8.992805755395683e-06, "loss": 0.4404, "step": 125 }, { "epoch": 0.27213822894168466, "grad_norm": 0.17273768782615662, "learning_rate": 9.064748201438849e-06, "loss": 0.4445, "step": 126 }, { "epoch": 0.27429805615550756, "grad_norm": 0.16782942414283752, "learning_rate": 9.136690647482015e-06, "loss": 0.4316, "step": 127 }, { "epoch": 0.27645788336933047, "grad_norm": 0.17636790871620178, "learning_rate": 9.20863309352518e-06, "loss": 0.4361, "step": 128 }, { "epoch": 0.2786177105831533, "grad_norm": 0.18042488396167755, "learning_rate": 9.280575539568346e-06, "loss": 0.4316, "step": 129 }, { "epoch": 0.28077753779697623, "grad_norm": 0.21798282861709595, "learning_rate": 9.35251798561151e-06, "loss": 0.4384, "step": 130 }, { "epoch": 0.28293736501079914, "grad_norm": 0.18524324893951416, "learning_rate": 9.424460431654678e-06, "loss": 0.4434, "step": 131 }, { "epoch": 0.28509719222462204, "grad_norm": 0.19849282503128052, "learning_rate": 9.496402877697842e-06, "loss": 0.4454, "step": 132 }, { "epoch": 0.28725701943844495, "grad_norm": 0.17093205451965332, "learning_rate": 9.568345323741008e-06, "loss": 0.4449, "step": 133 }, { "epoch": 0.2894168466522678, "grad_norm": 0.19003981351852417, "learning_rate": 9.640287769784174e-06, "loss": 0.4244, "step": 134 }, { "epoch": 0.2915766738660907, "grad_norm": 0.2193020135164261, "learning_rate": 9.712230215827338e-06, "loss": 0.434, "step": 135 }, { "epoch": 0.2937365010799136, "grad_norm": 0.19183115661144257, "learning_rate": 9.784172661870505e-06, "loss": 0.4259, "step": 136 }, { "epoch": 0.2958963282937365, "grad_norm": 0.17214708030223846, "learning_rate": 9.85611510791367e-06, "loss": 0.4433, "step": 137 }, { "epoch": 0.2980561555075594, "grad_norm": 0.16226549446582794, "learning_rate": 9.928057553956835e-06, "loss": 0.4389, "step": 138 }, { "epoch": 0.3002159827213823, "grad_norm": 0.17609405517578125, "learning_rate": 1e-05, "loss": 0.4387, "step": 139 }, { "epoch": 0.3023758099352052, "grad_norm": 0.15736715495586395, "learning_rate": 9.999984208641271e-06, "loss": 0.4324, "step": 140 }, { "epoch": 0.3045356371490281, "grad_norm": 0.2223547101020813, "learning_rate": 9.99993683466483e-06, "loss": 0.4245, "step": 141 }, { "epoch": 0.30669546436285094, "grad_norm": 0.17344172298908234, "learning_rate": 9.999857878369917e-06, "loss": 0.4302, "step": 142 }, { "epoch": 0.30885529157667385, "grad_norm": 0.16877353191375732, "learning_rate": 9.99974734025526e-06, "loss": 0.4497, "step": 143 }, { "epoch": 0.31101511879049676, "grad_norm": 0.1692124605178833, "learning_rate": 9.999605221019082e-06, "loss": 0.4414, "step": 144 }, { "epoch": 0.31317494600431967, "grad_norm": 0.18339934945106506, "learning_rate": 9.999431521559081e-06, "loss": 0.4392, "step": 145 }, { "epoch": 0.31533477321814257, "grad_norm": 0.19719652831554413, "learning_rate": 9.999226242972445e-06, "loss": 0.4331, "step": 146 }, { "epoch": 0.3174946004319654, "grad_norm": 0.14894719421863556, "learning_rate": 9.998989386555815e-06, "loss": 0.4344, "step": 147 }, { "epoch": 0.31965442764578833, "grad_norm": 0.20450158417224884, "learning_rate": 9.998720953805312e-06, "loss": 0.4397, "step": 148 }, { "epoch": 0.32181425485961124, "grad_norm": 0.1889685094356537, "learning_rate": 9.9984209464165e-06, "loss": 0.4297, "step": 149 }, { "epoch": 0.32397408207343414, "grad_norm": 0.16375744342803955, "learning_rate": 9.998089366284392e-06, "loss": 0.4228, "step": 150 }, { "epoch": 0.326133909287257, "grad_norm": 0.15281440317630768, "learning_rate": 9.997726215503422e-06, "loss": 0.4264, "step": 151 }, { "epoch": 0.3282937365010799, "grad_norm": 0.16808317601680756, "learning_rate": 9.997331496367455e-06, "loss": 0.4247, "step": 152 }, { "epoch": 0.3304535637149028, "grad_norm": 0.168230339884758, "learning_rate": 9.996905211369748e-06, "loss": 0.4245, "step": 153 }, { "epoch": 0.3326133909287257, "grad_norm": 0.1692979782819748, "learning_rate": 9.996447363202947e-06, "loss": 0.4309, "step": 154 }, { "epoch": 0.3347732181425486, "grad_norm": 0.190389946103096, "learning_rate": 9.995957954759073e-06, "loss": 0.4239, "step": 155 }, { "epoch": 0.3369330453563715, "grad_norm": 0.18425118923187256, "learning_rate": 9.995436989129495e-06, "loss": 0.4316, "step": 156 }, { "epoch": 0.3390928725701944, "grad_norm": 0.1809394657611847, "learning_rate": 9.994884469604913e-06, "loss": 0.4276, "step": 157 }, { "epoch": 0.3412526997840173, "grad_norm": 0.20476803183555603, "learning_rate": 9.994300399675342e-06, "loss": 0.4375, "step": 158 }, { "epoch": 0.3434125269978402, "grad_norm": 0.16786271333694458, "learning_rate": 9.99368478303009e-06, "loss": 0.4352, "step": 159 }, { "epoch": 0.34557235421166305, "grad_norm": 0.16701987385749817, "learning_rate": 9.993037623557716e-06, "loss": 0.4193, "step": 160 }, { "epoch": 0.34773218142548595, "grad_norm": 0.19547881186008453, "learning_rate": 9.99235892534604e-06, "loss": 0.4264, "step": 161 }, { "epoch": 0.34989200863930886, "grad_norm": 0.16596215963363647, "learning_rate": 9.991648692682083e-06, "loss": 0.4347, "step": 162 }, { "epoch": 0.35205183585313177, "grad_norm": 0.1804916262626648, "learning_rate": 9.990906930052065e-06, "loss": 0.4168, "step": 163 }, { "epoch": 0.3542116630669546, "grad_norm": 0.16082727909088135, "learning_rate": 9.990133642141359e-06, "loss": 0.4281, "step": 164 }, { "epoch": 0.3563714902807775, "grad_norm": 0.180884450674057, "learning_rate": 9.989328833834472e-06, "loss": 0.4318, "step": 165 }, { "epoch": 0.35853131749460043, "grad_norm": 0.16864454746246338, "learning_rate": 9.988492510215011e-06, "loss": 0.4306, "step": 166 }, { "epoch": 0.36069114470842334, "grad_norm": 0.17244219779968262, "learning_rate": 9.987624676565652e-06, "loss": 0.4282, "step": 167 }, { "epoch": 0.36285097192224625, "grad_norm": 0.1679103672504425, "learning_rate": 9.986725338368103e-06, "loss": 0.4195, "step": 168 }, { "epoch": 0.3650107991360691, "grad_norm": 0.16607263684272766, "learning_rate": 9.98579450130307e-06, "loss": 0.4288, "step": 169 }, { "epoch": 0.367170626349892, "grad_norm": 0.16679252684116364, "learning_rate": 9.98483217125023e-06, "loss": 0.418, "step": 170 }, { "epoch": 0.3693304535637149, "grad_norm": 0.15752755105495453, "learning_rate": 9.983838354288181e-06, "loss": 0.438, "step": 171 }, { "epoch": 0.3714902807775378, "grad_norm": 0.1747094839811325, "learning_rate": 9.982813056694411e-06, "loss": 0.4316, "step": 172 }, { "epoch": 0.37365010799136067, "grad_norm": 0.1854209452867508, "learning_rate": 9.981756284945256e-06, "loss": 0.424, "step": 173 }, { "epoch": 0.3758099352051836, "grad_norm": 0.1754128485918045, "learning_rate": 9.980668045715864e-06, "loss": 0.4115, "step": 174 }, { "epoch": 0.3779697624190065, "grad_norm": 0.17791932821273804, "learning_rate": 9.979548345880142e-06, "loss": 0.4272, "step": 175 }, { "epoch": 0.3801295896328294, "grad_norm": 0.15502074360847473, "learning_rate": 9.978397192510722e-06, "loss": 0.4161, "step": 176 }, { "epoch": 0.38228941684665224, "grad_norm": 0.1928102672100067, "learning_rate": 9.977214592878917e-06, "loss": 0.4202, "step": 177 }, { "epoch": 0.38444924406047515, "grad_norm": 0.1752733737230301, "learning_rate": 9.976000554454668e-06, "loss": 0.4251, "step": 178 }, { "epoch": 0.38660907127429806, "grad_norm": 0.15899012982845306, "learning_rate": 9.974755084906503e-06, "loss": 0.4212, "step": 179 }, { "epoch": 0.38876889848812096, "grad_norm": 0.20840278267860413, "learning_rate": 9.97347819210148e-06, "loss": 0.4193, "step": 180 }, { "epoch": 0.39092872570194387, "grad_norm": 0.16049212217330933, "learning_rate": 9.972169884105155e-06, "loss": 0.4222, "step": 181 }, { "epoch": 0.3930885529157667, "grad_norm": 0.21681340038776398, "learning_rate": 9.970830169181504e-06, "loss": 0.4221, "step": 182 }, { "epoch": 0.3952483801295896, "grad_norm": 0.20257696509361267, "learning_rate": 9.969459055792903e-06, "loss": 0.412, "step": 183 }, { "epoch": 0.39740820734341253, "grad_norm": 0.1784621775150299, "learning_rate": 9.968056552600043e-06, "loss": 0.4308, "step": 184 }, { "epoch": 0.39956803455723544, "grad_norm": 0.21011000871658325, "learning_rate": 9.966622668461899e-06, "loss": 0.4196, "step": 185 }, { "epoch": 0.4017278617710583, "grad_norm": 0.17967236042022705, "learning_rate": 9.965157412435663e-06, "loss": 0.4171, "step": 186 }, { "epoch": 0.4038876889848812, "grad_norm": 0.21680930256843567, "learning_rate": 9.963660793776689e-06, "loss": 0.4188, "step": 187 }, { "epoch": 0.4060475161987041, "grad_norm": 0.1738550364971161, "learning_rate": 9.96213282193843e-06, "loss": 0.4207, "step": 188 }, { "epoch": 0.408207343412527, "grad_norm": 0.1727888137102127, "learning_rate": 9.960573506572391e-06, "loss": 0.4244, "step": 189 }, { "epoch": 0.4103671706263499, "grad_norm": 0.19244728982448578, "learning_rate": 9.958982857528053e-06, "loss": 0.4162, "step": 190 }, { "epoch": 0.41252699784017277, "grad_norm": 0.1902923285961151, "learning_rate": 9.957360884852819e-06, "loss": 0.4272, "step": 191 }, { "epoch": 0.4146868250539957, "grad_norm": 0.15449483692646027, "learning_rate": 9.955707598791952e-06, "loss": 0.4103, "step": 192 }, { "epoch": 0.4168466522678186, "grad_norm": 0.16577541828155518, "learning_rate": 9.954023009788505e-06, "loss": 0.4262, "step": 193 }, { "epoch": 0.4190064794816415, "grad_norm": 0.17047494649887085, "learning_rate": 9.952307128483257e-06, "loss": 0.416, "step": 194 }, { "epoch": 0.42116630669546434, "grad_norm": 0.16605447232723236, "learning_rate": 9.950559965714647e-06, "loss": 0.4118, "step": 195 }, { "epoch": 0.42332613390928725, "grad_norm": 0.17296399176120758, "learning_rate": 9.948781532518706e-06, "loss": 0.415, "step": 196 }, { "epoch": 0.42548596112311016, "grad_norm": 0.16557732224464417, "learning_rate": 9.946971840128982e-06, "loss": 0.399, "step": 197 }, { "epoch": 0.42764578833693306, "grad_norm": 0.1601681262254715, "learning_rate": 9.945130899976477e-06, "loss": 0.4091, "step": 198 }, { "epoch": 0.4298056155507559, "grad_norm": 0.17228373885154724, "learning_rate": 9.94325872368957e-06, "loss": 0.4169, "step": 199 }, { "epoch": 0.4319654427645788, "grad_norm": 0.1871252954006195, "learning_rate": 9.941355323093944e-06, "loss": 0.4064, "step": 200 }, { "epoch": 0.43412526997840173, "grad_norm": 0.16557608544826508, "learning_rate": 9.939420710212511e-06, "loss": 0.4022, "step": 201 }, { "epoch": 0.43628509719222464, "grad_norm": 0.19201870262622833, "learning_rate": 9.937454897265338e-06, "loss": 0.4106, "step": 202 }, { "epoch": 0.43844492440604754, "grad_norm": 0.20729224383831024, "learning_rate": 9.935457896669568e-06, "loss": 0.4231, "step": 203 }, { "epoch": 0.4406047516198704, "grad_norm": 0.1870211958885193, "learning_rate": 9.93342972103934e-06, "loss": 0.4048, "step": 204 }, { "epoch": 0.4427645788336933, "grad_norm": 0.17580384016036987, "learning_rate": 9.931370383185717e-06, "loss": 0.4088, "step": 205 }, { "epoch": 0.4449244060475162, "grad_norm": 0.20925194025039673, "learning_rate": 9.929279896116595e-06, "loss": 0.4148, "step": 206 }, { "epoch": 0.4470842332613391, "grad_norm": 0.2229665368795395, "learning_rate": 9.927158273036624e-06, "loss": 0.4185, "step": 207 }, { "epoch": 0.44924406047516197, "grad_norm": 0.1665569692850113, "learning_rate": 9.925005527347132e-06, "loss": 0.4137, "step": 208 }, { "epoch": 0.4514038876889849, "grad_norm": 0.18300025165081024, "learning_rate": 9.922821672646028e-06, "loss": 0.4098, "step": 209 }, { "epoch": 0.4535637149028078, "grad_norm": 0.21622765064239502, "learning_rate": 9.920606722727726e-06, "loss": 0.413, "step": 210 }, { "epoch": 0.4557235421166307, "grad_norm": 0.1885174661874771, "learning_rate": 9.918360691583056e-06, "loss": 0.4156, "step": 211 }, { "epoch": 0.45788336933045354, "grad_norm": 0.20590178668498993, "learning_rate": 9.916083593399167e-06, "loss": 0.4192, "step": 212 }, { "epoch": 0.46004319654427644, "grad_norm": 0.19168834388256073, "learning_rate": 9.913775442559451e-06, "loss": 0.42, "step": 213 }, { "epoch": 0.46220302375809935, "grad_norm": 0.2033228576183319, "learning_rate": 9.911436253643445e-06, "loss": 0.4294, "step": 214 }, { "epoch": 0.46436285097192226, "grad_norm": 0.1603459119796753, "learning_rate": 9.909066041426733e-06, "loss": 0.4257, "step": 215 }, { "epoch": 0.46652267818574517, "grad_norm": 0.17825163900852203, "learning_rate": 9.906664820880869e-06, "loss": 0.4196, "step": 216 }, { "epoch": 0.468682505399568, "grad_norm": 0.19199080765247345, "learning_rate": 9.904232607173262e-06, "loss": 0.4213, "step": 217 }, { "epoch": 0.4708423326133909, "grad_norm": 0.16068002581596375, "learning_rate": 9.9017694156671e-06, "loss": 0.4178, "step": 218 }, { "epoch": 0.47300215982721383, "grad_norm": 0.18598708510398865, "learning_rate": 9.899275261921236e-06, "loss": 0.4119, "step": 219 }, { "epoch": 0.47516198704103674, "grad_norm": 0.17735686898231506, "learning_rate": 9.8967501616901e-06, "loss": 0.4159, "step": 220 }, { "epoch": 0.4773218142548596, "grad_norm": 0.20054501295089722, "learning_rate": 9.894194130923602e-06, "loss": 0.4228, "step": 221 }, { "epoch": 0.4794816414686825, "grad_norm": 0.1531924605369568, "learning_rate": 9.891607185767018e-06, "loss": 0.4182, "step": 222 }, { "epoch": 0.4816414686825054, "grad_norm": 0.20048613846302032, "learning_rate": 9.8889893425609e-06, "loss": 0.4184, "step": 223 }, { "epoch": 0.4838012958963283, "grad_norm": 0.16016018390655518, "learning_rate": 9.886340617840968e-06, "loss": 0.4162, "step": 224 }, { "epoch": 0.48596112311015116, "grad_norm": 0.17939400672912598, "learning_rate": 9.883661028338009e-06, "loss": 0.4216, "step": 225 }, { "epoch": 0.48812095032397407, "grad_norm": 0.17419300973415375, "learning_rate": 9.880950590977764e-06, "loss": 0.4165, "step": 226 }, { "epoch": 0.490280777537797, "grad_norm": 0.18040290474891663, "learning_rate": 9.87820932288083e-06, "loss": 0.4148, "step": 227 }, { "epoch": 0.4924406047516199, "grad_norm": 0.2009221911430359, "learning_rate": 9.875437241362546e-06, "loss": 0.4088, "step": 228 }, { "epoch": 0.4946004319654428, "grad_norm": 0.16184359788894653, "learning_rate": 9.872634363932887e-06, "loss": 0.4246, "step": 229 }, { "epoch": 0.49676025917926564, "grad_norm": 0.19945880770683289, "learning_rate": 9.869800708296347e-06, "loss": 0.415, "step": 230 }, { "epoch": 0.49892008639308855, "grad_norm": 0.1834121197462082, "learning_rate": 9.866936292351837e-06, "loss": 0.413, "step": 231 }, { "epoch": 0.5010799136069114, "grad_norm": 0.19155217707157135, "learning_rate": 9.864041134192563e-06, "loss": 0.4145, "step": 232 }, { "epoch": 0.5032397408207343, "grad_norm": 0.16527444124221802, "learning_rate": 9.861115252105922e-06, "loss": 0.411, "step": 233 }, { "epoch": 0.5053995680345572, "grad_norm": 0.2018229365348816, "learning_rate": 9.85815866457337e-06, "loss": 0.4144, "step": 234 }, { "epoch": 0.5075593952483801, "grad_norm": 0.18045048415660858, "learning_rate": 9.855171390270325e-06, "loss": 0.4173, "step": 235 }, { "epoch": 0.509719222462203, "grad_norm": 0.16335050761699677, "learning_rate": 9.852153448066031e-06, "loss": 0.4184, "step": 236 }, { "epoch": 0.5118790496760259, "grad_norm": 0.1876971423625946, "learning_rate": 9.849104857023455e-06, "loss": 0.4149, "step": 237 }, { "epoch": 0.5140388768898488, "grad_norm": 0.1632338911294937, "learning_rate": 9.846025636399152e-06, "loss": 0.4281, "step": 238 }, { "epoch": 0.5161987041036717, "grad_norm": 0.1685461848974228, "learning_rate": 9.842915805643156e-06, "loss": 0.4168, "step": 239 }, { "epoch": 0.5183585313174947, "grad_norm": 0.1753380447626114, "learning_rate": 9.839775384398846e-06, "loss": 0.4163, "step": 240 }, { "epoch": 0.5205183585313174, "grad_norm": 0.17196574807167053, "learning_rate": 9.836604392502829e-06, "loss": 0.4264, "step": 241 }, { "epoch": 0.5226781857451404, "grad_norm": 0.22572582960128784, "learning_rate": 9.833402849984815e-06, "loss": 0.4116, "step": 242 }, { "epoch": 0.5248380129589633, "grad_norm": 0.16782043874263763, "learning_rate": 9.830170777067486e-06, "loss": 0.416, "step": 243 }, { "epoch": 0.5269978401727862, "grad_norm": 0.1964120864868164, "learning_rate": 9.82690819416637e-06, "loss": 0.4189, "step": 244 }, { "epoch": 0.5291576673866091, "grad_norm": 0.16382494568824768, "learning_rate": 9.823615121889716e-06, "loss": 0.4216, "step": 245 }, { "epoch": 0.531317494600432, "grad_norm": 0.19145262241363525, "learning_rate": 9.820291581038354e-06, "loss": 0.4069, "step": 246 }, { "epoch": 0.5334773218142549, "grad_norm": 0.1793445199728012, "learning_rate": 9.81693759260558e-06, "loss": 0.4073, "step": 247 }, { "epoch": 0.5356371490280778, "grad_norm": 0.20790617167949677, "learning_rate": 9.813553177777005e-06, "loss": 0.4098, "step": 248 }, { "epoch": 0.5377969762419006, "grad_norm": 0.17739000916481018, "learning_rate": 9.81013835793043e-06, "loss": 0.416, "step": 249 }, { "epoch": 0.5399568034557235, "grad_norm": 0.1868736743927002, "learning_rate": 9.806693154635719e-06, "loss": 0.4192, "step": 250 }, { "epoch": 0.5421166306695464, "grad_norm": 0.2077651023864746, "learning_rate": 9.803217589654642e-06, "loss": 0.4001, "step": 251 }, { "epoch": 0.5442764578833693, "grad_norm": 0.17842766642570496, "learning_rate": 9.79971168494076e-06, "loss": 0.4122, "step": 252 }, { "epoch": 0.5464362850971922, "grad_norm": 0.20299148559570312, "learning_rate": 9.796175462639273e-06, "loss": 0.4164, "step": 253 }, { "epoch": 0.5485961123110151, "grad_norm": 0.20451399683952332, "learning_rate": 9.79260894508688e-06, "loss": 0.4194, "step": 254 }, { "epoch": 0.550755939524838, "grad_norm": 0.16164958477020264, "learning_rate": 9.789012154811648e-06, "loss": 0.4037, "step": 255 }, { "epoch": 0.5529157667386609, "grad_norm": 0.19269876182079315, "learning_rate": 9.785385114532858e-06, "loss": 0.4086, "step": 256 }, { "epoch": 0.5550755939524838, "grad_norm": 0.22143694758415222, "learning_rate": 9.781727847160865e-06, "loss": 0.4205, "step": 257 }, { "epoch": 0.5572354211663066, "grad_norm": 0.20241893827915192, "learning_rate": 9.77804037579696e-06, "loss": 0.4135, "step": 258 }, { "epoch": 0.5593952483801296, "grad_norm": 0.19745588302612305, "learning_rate": 9.774322723733216e-06, "loss": 0.4129, "step": 259 }, { "epoch": 0.5615550755939525, "grad_norm": 0.1914190798997879, "learning_rate": 9.770574914452343e-06, "loss": 0.4153, "step": 260 }, { "epoch": 0.5637149028077754, "grad_norm": 0.18239063024520874, "learning_rate": 9.766796971627543e-06, "loss": 0.4183, "step": 261 }, { "epoch": 0.5658747300215983, "grad_norm": 0.18472230434417725, "learning_rate": 9.762988919122354e-06, "loss": 0.4129, "step": 262 }, { "epoch": 0.5680345572354212, "grad_norm": 0.18945036828517914, "learning_rate": 9.759150780990508e-06, "loss": 0.4145, "step": 263 }, { "epoch": 0.5701943844492441, "grad_norm": 0.18478325009346008, "learning_rate": 9.755282581475769e-06, "loss": 0.4057, "step": 264 }, { "epoch": 0.572354211663067, "grad_norm": 0.21743497252464294, "learning_rate": 9.751384345011787e-06, "loss": 0.4161, "step": 265 }, { "epoch": 0.5745140388768899, "grad_norm": 0.17114116251468658, "learning_rate": 9.747456096221946e-06, "loss": 0.4007, "step": 266 }, { "epoch": 0.5766738660907127, "grad_norm": 0.187269389629364, "learning_rate": 9.743497859919196e-06, "loss": 0.4048, "step": 267 }, { "epoch": 0.5788336933045356, "grad_norm": 0.16859321296215057, "learning_rate": 9.739509661105912e-06, "loss": 0.4109, "step": 268 }, { "epoch": 0.5809935205183585, "grad_norm": 0.1999719887971878, "learning_rate": 9.735491524973723e-06, "loss": 0.3952, "step": 269 }, { "epoch": 0.5831533477321814, "grad_norm": 0.176213800907135, "learning_rate": 9.73144347690336e-06, "loss": 0.4177, "step": 270 }, { "epoch": 0.5853131749460043, "grad_norm": 0.1951010376214981, "learning_rate": 9.727365542464498e-06, "loss": 0.4164, "step": 271 }, { "epoch": 0.5874730021598272, "grad_norm": 0.17570029199123383, "learning_rate": 9.723257747415584e-06, "loss": 0.4094, "step": 272 }, { "epoch": 0.5896328293736501, "grad_norm": 0.179957315325737, "learning_rate": 9.719120117703688e-06, "loss": 0.406, "step": 273 }, { "epoch": 0.591792656587473, "grad_norm": 0.17086850106716156, "learning_rate": 9.714952679464324e-06, "loss": 0.403, "step": 274 }, { "epoch": 0.593952483801296, "grad_norm": 0.17228035628795624, "learning_rate": 9.710755459021297e-06, "loss": 0.4109, "step": 275 }, { "epoch": 0.5961123110151187, "grad_norm": 0.19265015423297882, "learning_rate": 9.706528482886535e-06, "loss": 0.4209, "step": 276 }, { "epoch": 0.5982721382289417, "grad_norm": 0.16357901692390442, "learning_rate": 9.702271777759915e-06, "loss": 0.4061, "step": 277 }, { "epoch": 0.6004319654427646, "grad_norm": 0.17083071172237396, "learning_rate": 9.697985370529101e-06, "loss": 0.3996, "step": 278 }, { "epoch": 0.6025917926565875, "grad_norm": 0.1811821013689041, "learning_rate": 9.693669288269371e-06, "loss": 0.4182, "step": 279 }, { "epoch": 0.6047516198704104, "grad_norm": 0.1488206833600998, "learning_rate": 9.689323558243446e-06, "loss": 0.3981, "step": 280 }, { "epoch": 0.6069114470842333, "grad_norm": 0.185231015086174, "learning_rate": 9.684948207901315e-06, "loss": 0.4132, "step": 281 }, { "epoch": 0.6090712742980562, "grad_norm": 0.14964009821414948, "learning_rate": 9.680543264880075e-06, "loss": 0.4098, "step": 282 }, { "epoch": 0.6112311015118791, "grad_norm": 0.16769437491893768, "learning_rate": 9.676108757003735e-06, "loss": 0.418, "step": 283 }, { "epoch": 0.6133909287257019, "grad_norm": 0.1763710230588913, "learning_rate": 9.671644712283061e-06, "loss": 0.4111, "step": 284 }, { "epoch": 0.6155507559395248, "grad_norm": 0.17033055424690247, "learning_rate": 9.667151158915382e-06, "loss": 0.4138, "step": 285 }, { "epoch": 0.6177105831533477, "grad_norm": 0.21479171514511108, "learning_rate": 9.662628125284426e-06, "loss": 0.4164, "step": 286 }, { "epoch": 0.6198704103671706, "grad_norm": 0.18288952112197876, "learning_rate": 9.65807563996013e-06, "loss": 0.416, "step": 287 }, { "epoch": 0.6220302375809935, "grad_norm": 0.20399482548236847, "learning_rate": 9.653493731698467e-06, "loss": 0.4145, "step": 288 }, { "epoch": 0.6241900647948164, "grad_norm": 0.19287261366844177, "learning_rate": 9.648882429441258e-06, "loss": 0.4131, "step": 289 }, { "epoch": 0.6263498920086393, "grad_norm": 0.17563579976558685, "learning_rate": 9.644241762315995e-06, "loss": 0.4097, "step": 290 }, { "epoch": 0.6285097192224622, "grad_norm": 0.18624839186668396, "learning_rate": 9.639571759635655e-06, "loss": 0.4176, "step": 291 }, { "epoch": 0.6306695464362851, "grad_norm": 0.18379148840904236, "learning_rate": 9.634872450898511e-06, "loss": 0.4035, "step": 292 }, { "epoch": 0.6328293736501079, "grad_norm": 0.1886526644229889, "learning_rate": 9.630143865787951e-06, "loss": 0.4068, "step": 293 }, { "epoch": 0.6349892008639308, "grad_norm": 0.16463734209537506, "learning_rate": 9.62538603417229e-06, "loss": 0.4163, "step": 294 }, { "epoch": 0.6371490280777538, "grad_norm": 0.1974654197692871, "learning_rate": 9.620598986104578e-06, "loss": 0.4039, "step": 295 }, { "epoch": 0.6393088552915767, "grad_norm": 0.1882481724023819, "learning_rate": 9.615782751822413e-06, "loss": 0.4115, "step": 296 }, { "epoch": 0.6414686825053996, "grad_norm": 0.15222138166427612, "learning_rate": 9.610937361747747e-06, "loss": 0.4045, "step": 297 }, { "epoch": 0.6436285097192225, "grad_norm": 0.17053523659706116, "learning_rate": 9.606062846486698e-06, "loss": 0.4119, "step": 298 }, { "epoch": 0.6457883369330454, "grad_norm": 0.15987005829811096, "learning_rate": 9.601159236829353e-06, "loss": 0.3964, "step": 299 }, { "epoch": 0.6479481641468683, "grad_norm": 0.16534611582756042, "learning_rate": 9.596226563749575e-06, "loss": 0.4115, "step": 300 }, { "epoch": 0.6501079913606912, "grad_norm": 0.1743890643119812, "learning_rate": 9.591264858404809e-06, "loss": 0.4241, "step": 301 }, { "epoch": 0.652267818574514, "grad_norm": 0.14473925530910492, "learning_rate": 9.586274152135883e-06, "loss": 0.4011, "step": 302 }, { "epoch": 0.6544276457883369, "grad_norm": 0.1717105656862259, "learning_rate": 9.58125447646681e-06, "loss": 0.4128, "step": 303 }, { "epoch": 0.6565874730021598, "grad_norm": 0.16893403232097626, "learning_rate": 9.576205863104588e-06, "loss": 0.3984, "step": 304 }, { "epoch": 0.6587473002159827, "grad_norm": 0.19387434422969818, "learning_rate": 9.571128343939006e-06, "loss": 0.4086, "step": 305 }, { "epoch": 0.6609071274298056, "grad_norm": 0.1532067507505417, "learning_rate": 9.566021951042432e-06, "loss": 0.413, "step": 306 }, { "epoch": 0.6630669546436285, "grad_norm": 0.19082939624786377, "learning_rate": 9.56088671666962e-06, "loss": 0.4028, "step": 307 }, { "epoch": 0.6652267818574514, "grad_norm": 0.1660735309123993, "learning_rate": 9.555722673257502e-06, "loss": 0.4048, "step": 308 }, { "epoch": 0.6673866090712743, "grad_norm": 0.1646290272474289, "learning_rate": 9.550529853424979e-06, "loss": 0.401, "step": 309 }, { "epoch": 0.6695464362850972, "grad_norm": 0.1946524977684021, "learning_rate": 9.545308289972727e-06, "loss": 0.3999, "step": 310 }, { "epoch": 0.67170626349892, "grad_norm": 0.1731284260749817, "learning_rate": 9.54005801588298e-06, "loss": 0.4056, "step": 311 }, { "epoch": 0.673866090712743, "grad_norm": 0.1577766239643097, "learning_rate": 9.534779064319318e-06, "loss": 0.3952, "step": 312 }, { "epoch": 0.6760259179265659, "grad_norm": 0.20560896396636963, "learning_rate": 9.529471468626472e-06, "loss": 0.4082, "step": 313 }, { "epoch": 0.6781857451403888, "grad_norm": 0.16146545112133026, "learning_rate": 9.524135262330098e-06, "loss": 0.4044, "step": 314 }, { "epoch": 0.6803455723542117, "grad_norm": 0.18924373388290405, "learning_rate": 9.51877047913658e-06, "loss": 0.3949, "step": 315 }, { "epoch": 0.6825053995680346, "grad_norm": 0.20120824873447418, "learning_rate": 9.513377152932796e-06, "loss": 0.4098, "step": 316 }, { "epoch": 0.6846652267818575, "grad_norm": 0.17236529290676117, "learning_rate": 9.507955317785935e-06, "loss": 0.4005, "step": 317 }, { "epoch": 0.6868250539956804, "grad_norm": 0.19479617476463318, "learning_rate": 9.502505007943248e-06, "loss": 0.4115, "step": 318 }, { "epoch": 0.6889848812095032, "grad_norm": 0.18137769401073456, "learning_rate": 9.497026257831856e-06, "loss": 0.4006, "step": 319 }, { "epoch": 0.6911447084233261, "grad_norm": 0.18386590480804443, "learning_rate": 9.491519102058523e-06, "loss": 0.4045, "step": 320 }, { "epoch": 0.693304535637149, "grad_norm": 0.18597717583179474, "learning_rate": 9.48598357540944e-06, "loss": 0.3974, "step": 321 }, { "epoch": 0.6954643628509719, "grad_norm": 0.19069334864616394, "learning_rate": 9.480419712849996e-06, "loss": 0.4139, "step": 322 }, { "epoch": 0.6976241900647948, "grad_norm": 0.18793267011642456, "learning_rate": 9.474827549524574e-06, "loss": 0.4105, "step": 323 }, { "epoch": 0.6997840172786177, "grad_norm": 0.19101367890834808, "learning_rate": 9.46920712075632e-06, "loss": 0.3988, "step": 324 }, { "epoch": 0.7019438444924406, "grad_norm": 0.15915150940418243, "learning_rate": 9.463558462046912e-06, "loss": 0.4052, "step": 325 }, { "epoch": 0.7041036717062635, "grad_norm": 0.20149967074394226, "learning_rate": 9.457881609076352e-06, "loss": 0.4039, "step": 326 }, { "epoch": 0.7062634989200864, "grad_norm": 0.1692509800195694, "learning_rate": 9.452176597702724e-06, "loss": 0.4146, "step": 327 }, { "epoch": 0.7084233261339092, "grad_norm": 0.16798816621303558, "learning_rate": 9.446443463961986e-06, "loss": 0.3943, "step": 328 }, { "epoch": 0.7105831533477321, "grad_norm": 0.16925224661827087, "learning_rate": 9.440682244067724e-06, "loss": 0.3992, "step": 329 }, { "epoch": 0.712742980561555, "grad_norm": 0.19609062373638153, "learning_rate": 9.434892974410932e-06, "loss": 0.4094, "step": 330 }, { "epoch": 0.714902807775378, "grad_norm": 0.19346529245376587, "learning_rate": 9.429075691559788e-06, "loss": 0.4018, "step": 331 }, { "epoch": 0.7170626349892009, "grad_norm": 0.18897579610347748, "learning_rate": 9.423230432259409e-06, "loss": 0.4012, "step": 332 }, { "epoch": 0.7192224622030238, "grad_norm": 0.16114945709705353, "learning_rate": 9.41735723343163e-06, "loss": 0.3988, "step": 333 }, { "epoch": 0.7213822894168467, "grad_norm": 0.1889643669128418, "learning_rate": 9.411456132174768e-06, "loss": 0.3912, "step": 334 }, { "epoch": 0.7235421166306696, "grad_norm": 0.20438078045845032, "learning_rate": 9.405527165763384e-06, "loss": 0.4036, "step": 335 }, { "epoch": 0.7257019438444925, "grad_norm": 0.1676449030637741, "learning_rate": 9.399570371648052e-06, "loss": 0.4085, "step": 336 }, { "epoch": 0.7278617710583153, "grad_norm": 0.23122479021549225, "learning_rate": 9.393585787455125e-06, "loss": 0.4075, "step": 337 }, { "epoch": 0.7300215982721382, "grad_norm": 0.15428000688552856, "learning_rate": 9.387573450986485e-06, "loss": 0.3979, "step": 338 }, { "epoch": 0.7321814254859611, "grad_norm": 0.1889045536518097, "learning_rate": 9.381533400219319e-06, "loss": 0.4004, "step": 339 }, { "epoch": 0.734341252699784, "grad_norm": 0.16855069994926453, "learning_rate": 9.37546567330587e-06, "loss": 0.4021, "step": 340 }, { "epoch": 0.7365010799136069, "grad_norm": 0.15914303064346313, "learning_rate": 9.369370308573198e-06, "loss": 0.4147, "step": 341 }, { "epoch": 0.7386609071274298, "grad_norm": 0.18533971905708313, "learning_rate": 9.363247344522939e-06, "loss": 0.4025, "step": 342 }, { "epoch": 0.7408207343412527, "grad_norm": 0.15280672907829285, "learning_rate": 9.357096819831065e-06, "loss": 0.4061, "step": 343 }, { "epoch": 0.7429805615550756, "grad_norm": 0.1812913715839386, "learning_rate": 9.35091877334763e-06, "loss": 0.4008, "step": 344 }, { "epoch": 0.7451403887688985, "grad_norm": 0.19496847689151764, "learning_rate": 9.344713244096533e-06, "loss": 0.4063, "step": 345 }, { "epoch": 0.7473002159827213, "grad_norm": 0.15390554070472717, "learning_rate": 9.33848027127527e-06, "loss": 0.3943, "step": 346 }, { "epoch": 0.7494600431965442, "grad_norm": 0.18108762800693512, "learning_rate": 9.332219894254686e-06, "loss": 0.4037, "step": 347 }, { "epoch": 0.7516198704103672, "grad_norm": 0.172384575009346, "learning_rate": 9.325932152578726e-06, "loss": 0.404, "step": 348 }, { "epoch": 0.7537796976241901, "grad_norm": 0.1718224287033081, "learning_rate": 9.319617085964177e-06, "loss": 0.4098, "step": 349 }, { "epoch": 0.755939524838013, "grad_norm": 0.16733084619045258, "learning_rate": 9.31327473430044e-06, "loss": 0.41, "step": 350 }, { "epoch": 0.7580993520518359, "grad_norm": 0.15835174918174744, "learning_rate": 9.30690513764925e-06, "loss": 0.4108, "step": 351 }, { "epoch": 0.7602591792656588, "grad_norm": 0.16416366398334503, "learning_rate": 9.300508336244443e-06, "loss": 0.4123, "step": 352 }, { "epoch": 0.7624190064794817, "grad_norm": 0.15685053169727325, "learning_rate": 9.294084370491695e-06, "loss": 0.4026, "step": 353 }, { "epoch": 0.7645788336933045, "grad_norm": 0.17324267327785492, "learning_rate": 9.287633280968263e-06, "loss": 0.4043, "step": 354 }, { "epoch": 0.7667386609071274, "grad_norm": 0.16480839252471924, "learning_rate": 9.281155108422732e-06, "loss": 0.3903, "step": 355 }, { "epoch": 0.7688984881209503, "grad_norm": 0.155819833278656, "learning_rate": 9.274649893774768e-06, "loss": 0.4163, "step": 356 }, { "epoch": 0.7710583153347732, "grad_norm": 0.1437472552061081, "learning_rate": 9.268117678114833e-06, "loss": 0.3983, "step": 357 }, { "epoch": 0.7732181425485961, "grad_norm": 0.1644992083311081, "learning_rate": 9.26155850270396e-06, "loss": 0.4143, "step": 358 }, { "epoch": 0.775377969762419, "grad_norm": 0.15442179143428802, "learning_rate": 9.25497240897346e-06, "loss": 0.4186, "step": 359 }, { "epoch": 0.7775377969762419, "grad_norm": 0.16961856186389923, "learning_rate": 9.248359438524683e-06, "loss": 0.4056, "step": 360 }, { "epoch": 0.7796976241900648, "grad_norm": 0.14529763162136078, "learning_rate": 9.241719633128743e-06, "loss": 0.4081, "step": 361 }, { "epoch": 0.7818574514038877, "grad_norm": 0.17451095581054688, "learning_rate": 9.235053034726261e-06, "loss": 0.4011, "step": 362 }, { "epoch": 0.7840172786177105, "grad_norm": 0.16993848979473114, "learning_rate": 9.228359685427095e-06, "loss": 0.4126, "step": 363 }, { "epoch": 0.7861771058315334, "grad_norm": 0.1698153018951416, "learning_rate": 9.221639627510076e-06, "loss": 0.3983, "step": 364 }, { "epoch": 0.7883369330453563, "grad_norm": 0.15617668628692627, "learning_rate": 9.214892903422745e-06, "loss": 0.3894, "step": 365 }, { "epoch": 0.7904967602591793, "grad_norm": 0.1748441755771637, "learning_rate": 9.208119555781074e-06, "loss": 0.4042, "step": 366 }, { "epoch": 0.7926565874730022, "grad_norm": 0.18701235949993134, "learning_rate": 9.201319627369211e-06, "loss": 0.4166, "step": 367 }, { "epoch": 0.7948164146868251, "grad_norm": 0.15359680354595184, "learning_rate": 9.1944931611392e-06, "loss": 0.4025, "step": 368 }, { "epoch": 0.796976241900648, "grad_norm": 0.17842437326908112, "learning_rate": 9.18764020021071e-06, "loss": 0.4157, "step": 369 }, { "epoch": 0.7991360691144709, "grad_norm": 0.16838903725147247, "learning_rate": 9.180760787870766e-06, "loss": 0.4058, "step": 370 }, { "epoch": 0.8012958963282938, "grad_norm": 0.17230413854122162, "learning_rate": 9.173854967573479e-06, "loss": 0.4063, "step": 371 }, { "epoch": 0.8034557235421166, "grad_norm": 0.17813710868358612, "learning_rate": 9.166922782939759e-06, "loss": 0.4122, "step": 372 }, { "epoch": 0.8056155507559395, "grad_norm": 0.19047455489635468, "learning_rate": 9.159964277757054e-06, "loss": 0.4026, "step": 373 }, { "epoch": 0.8077753779697624, "grad_norm": 0.15476709604263306, "learning_rate": 9.152979495979064e-06, "loss": 0.3872, "step": 374 }, { "epoch": 0.8099352051835853, "grad_norm": 0.15130369365215302, "learning_rate": 9.145968481725466e-06, "loss": 0.4018, "step": 375 }, { "epoch": 0.8120950323974082, "grad_norm": 0.1687459796667099, "learning_rate": 9.13893127928164e-06, "loss": 0.3983, "step": 376 }, { "epoch": 0.8142548596112311, "grad_norm": 0.1546049863100052, "learning_rate": 9.131867933098379e-06, "loss": 0.4109, "step": 377 }, { "epoch": 0.816414686825054, "grad_norm": 0.1616266667842865, "learning_rate": 9.124778487791615e-06, "loss": 0.4039, "step": 378 }, { "epoch": 0.8185745140388769, "grad_norm": 0.1830417811870575, "learning_rate": 9.117662988142138e-06, "loss": 0.4053, "step": 379 }, { "epoch": 0.8207343412526998, "grad_norm": 0.15087199211120605, "learning_rate": 9.110521479095314e-06, "loss": 0.4111, "step": 380 }, { "epoch": 0.8228941684665226, "grad_norm": 0.15791049599647522, "learning_rate": 9.10335400576079e-06, "loss": 0.3882, "step": 381 }, { "epoch": 0.8250539956803455, "grad_norm": 0.16011568903923035, "learning_rate": 9.096160613412228e-06, "loss": 0.4101, "step": 382 }, { "epoch": 0.8272138228941684, "grad_norm": 0.1656263768672943, "learning_rate": 9.088941347487004e-06, "loss": 0.394, "step": 383 }, { "epoch": 0.8293736501079914, "grad_norm": 0.15749986469745636, "learning_rate": 9.08169625358592e-06, "loss": 0.3972, "step": 384 }, { "epoch": 0.8315334773218143, "grad_norm": 0.15940222144126892, "learning_rate": 9.074425377472932e-06, "loss": 0.4003, "step": 385 }, { "epoch": 0.8336933045356372, "grad_norm": 0.17559286952018738, "learning_rate": 9.067128765074842e-06, "loss": 0.4046, "step": 386 }, { "epoch": 0.8358531317494601, "grad_norm": 0.1646784096956253, "learning_rate": 9.059806462481022e-06, "loss": 0.3968, "step": 387 }, { "epoch": 0.838012958963283, "grad_norm": 0.16697706282138824, "learning_rate": 9.052458515943112e-06, "loss": 0.4146, "step": 388 }, { "epoch": 0.8401727861771058, "grad_norm": 0.17301729321479797, "learning_rate": 9.045084971874738e-06, "loss": 0.4037, "step": 389 }, { "epoch": 0.8423326133909287, "grad_norm": 0.1766882836818695, "learning_rate": 9.037685876851211e-06, "loss": 0.4019, "step": 390 }, { "epoch": 0.8444924406047516, "grad_norm": 0.16974475979804993, "learning_rate": 9.030261277609235e-06, "loss": 0.3978, "step": 391 }, { "epoch": 0.8466522678185745, "grad_norm": 0.17788070440292358, "learning_rate": 9.022811221046618e-06, "loss": 0.4062, "step": 392 }, { "epoch": 0.8488120950323974, "grad_norm": 0.16667339205741882, "learning_rate": 9.015335754221964e-06, "loss": 0.4167, "step": 393 }, { "epoch": 0.8509719222462203, "grad_norm": 0.15693309903144836, "learning_rate": 9.007834924354384e-06, "loss": 0.3988, "step": 394 }, { "epoch": 0.8531317494600432, "grad_norm": 0.16362878680229187, "learning_rate": 9.000308778823196e-06, "loss": 0.3995, "step": 395 }, { "epoch": 0.8552915766738661, "grad_norm": 0.14635585248470306, "learning_rate": 8.992757365167625e-06, "loss": 0.4028, "step": 396 }, { "epoch": 0.857451403887689, "grad_norm": 0.16527874767780304, "learning_rate": 8.985180731086505e-06, "loss": 0.406, "step": 397 }, { "epoch": 0.8596112311015118, "grad_norm": 0.2163344919681549, "learning_rate": 8.977578924437976e-06, "loss": 0.3985, "step": 398 }, { "epoch": 0.8617710583153347, "grad_norm": 0.14798112213611603, "learning_rate": 8.969951993239177e-06, "loss": 0.4011, "step": 399 }, { "epoch": 0.8639308855291576, "grad_norm": 0.16196613013744354, "learning_rate": 8.962299985665955e-06, "loss": 0.4057, "step": 400 }, { "epoch": 0.8660907127429806, "grad_norm": 0.15940962731838226, "learning_rate": 8.954622950052543e-06, "loss": 0.4027, "step": 401 }, { "epoch": 0.8682505399568035, "grad_norm": 0.16603127121925354, "learning_rate": 8.946920934891274e-06, "loss": 0.4106, "step": 402 }, { "epoch": 0.8704103671706264, "grad_norm": 0.16625916957855225, "learning_rate": 8.939193988832261e-06, "loss": 0.3997, "step": 403 }, { "epoch": 0.8725701943844493, "grad_norm": 0.17211325466632843, "learning_rate": 8.931442160683094e-06, "loss": 0.4036, "step": 404 }, { "epoch": 0.8747300215982722, "grad_norm": 0.17657049000263214, "learning_rate": 8.923665499408535e-06, "loss": 0.393, "step": 405 }, { "epoch": 0.8768898488120951, "grad_norm": 0.18346846103668213, "learning_rate": 8.915864054130203e-06, "loss": 0.3911, "step": 406 }, { "epoch": 0.8790496760259179, "grad_norm": 0.17051193118095398, "learning_rate": 8.908037874126263e-06, "loss": 0.3916, "step": 407 }, { "epoch": 0.8812095032397408, "grad_norm": 0.15643054246902466, "learning_rate": 8.900187008831124e-06, "loss": 0.3957, "step": 408 }, { "epoch": 0.8833693304535637, "grad_norm": 0.18112455308437347, "learning_rate": 8.892311507835118e-06, "loss": 0.4006, "step": 409 }, { "epoch": 0.8855291576673866, "grad_norm": 0.1472531408071518, "learning_rate": 8.88441142088419e-06, "loss": 0.3969, "step": 410 }, { "epoch": 0.8876889848812095, "grad_norm": 0.16634514927864075, "learning_rate": 8.87648679787958e-06, "loss": 0.4052, "step": 411 }, { "epoch": 0.8898488120950324, "grad_norm": 0.16606342792510986, "learning_rate": 8.868537688877516e-06, "loss": 0.3999, "step": 412 }, { "epoch": 0.8920086393088553, "grad_norm": 0.16223309934139252, "learning_rate": 8.860564144088891e-06, "loss": 0.4053, "step": 413 }, { "epoch": 0.8941684665226782, "grad_norm": 0.17775796353816986, "learning_rate": 8.852566213878947e-06, "loss": 0.3996, "step": 414 }, { "epoch": 0.896328293736501, "grad_norm": 0.16113241016864777, "learning_rate": 8.844543948766958e-06, "loss": 0.3874, "step": 415 }, { "epoch": 0.8984881209503239, "grad_norm": 0.19586795568466187, "learning_rate": 8.83649739942591e-06, "loss": 0.4012, "step": 416 }, { "epoch": 0.9006479481641468, "grad_norm": 0.18052950501441956, "learning_rate": 8.828426616682184e-06, "loss": 0.3973, "step": 417 }, { "epoch": 0.9028077753779697, "grad_norm": 0.16518956422805786, "learning_rate": 8.820331651515226e-06, "loss": 0.3997, "step": 418 }, { "epoch": 0.9049676025917927, "grad_norm": 0.1827470362186432, "learning_rate": 8.81221255505724e-06, "loss": 0.4008, "step": 419 }, { "epoch": 0.9071274298056156, "grad_norm": 0.18678082525730133, "learning_rate": 8.80406937859285e-06, "loss": 0.3953, "step": 420 }, { "epoch": 0.9092872570194385, "grad_norm": 0.1759604662656784, "learning_rate": 8.795902173558784e-06, "loss": 0.4037, "step": 421 }, { "epoch": 0.9114470842332614, "grad_norm": 0.1986621916294098, "learning_rate": 8.787710991543547e-06, "loss": 0.4125, "step": 422 }, { "epoch": 0.9136069114470843, "grad_norm": 0.19601307809352875, "learning_rate": 8.779495884287099e-06, "loss": 0.4018, "step": 423 }, { "epoch": 0.9157667386609071, "grad_norm": 0.16699747741222382, "learning_rate": 8.77125690368052e-06, "loss": 0.4029, "step": 424 }, { "epoch": 0.91792656587473, "grad_norm": 0.16781239211559296, "learning_rate": 8.76299410176569e-06, "loss": 0.3956, "step": 425 }, { "epoch": 0.9200863930885529, "grad_norm": 0.17204856872558594, "learning_rate": 8.754707530734958e-06, "loss": 0.4033, "step": 426 }, { "epoch": 0.9222462203023758, "grad_norm": 0.1568082720041275, "learning_rate": 8.74639724293081e-06, "loss": 0.3937, "step": 427 }, { "epoch": 0.9244060475161987, "grad_norm": 0.18325375020503998, "learning_rate": 8.738063290845536e-06, "loss": 0.4077, "step": 428 }, { "epoch": 0.9265658747300216, "grad_norm": 0.15343928337097168, "learning_rate": 8.729705727120911e-06, "loss": 0.3997, "step": 429 }, { "epoch": 0.9287257019438445, "grad_norm": 0.1750892996788025, "learning_rate": 8.721324604547851e-06, "loss": 0.4151, "step": 430 }, { "epoch": 0.9308855291576674, "grad_norm": 0.17041905224323273, "learning_rate": 8.712919976066078e-06, "loss": 0.4051, "step": 431 }, { "epoch": 0.9330453563714903, "grad_norm": 0.17677395045757294, "learning_rate": 8.704491894763794e-06, "loss": 0.4031, "step": 432 }, { "epoch": 0.9352051835853131, "grad_norm": 0.2149730920791626, "learning_rate": 8.696040413877344e-06, "loss": 0.4029, "step": 433 }, { "epoch": 0.937365010799136, "grad_norm": 0.17261390388011932, "learning_rate": 8.68756558679087e-06, "loss": 0.3998, "step": 434 }, { "epoch": 0.9395248380129589, "grad_norm": 0.17588981986045837, "learning_rate": 8.679067467035989e-06, "loss": 0.4127, "step": 435 }, { "epoch": 0.9416846652267818, "grad_norm": 0.18429699540138245, "learning_rate": 8.670546108291443e-06, "loss": 0.3987, "step": 436 }, { "epoch": 0.9438444924406048, "grad_norm": 0.15987183153629303, "learning_rate": 8.662001564382768e-06, "loss": 0.3911, "step": 437 }, { "epoch": 0.9460043196544277, "grad_norm": 0.17549017071723938, "learning_rate": 8.65343388928194e-06, "loss": 0.4068, "step": 438 }, { "epoch": 0.9481641468682506, "grad_norm": 0.1644325852394104, "learning_rate": 8.644843137107058e-06, "loss": 0.3938, "step": 439 }, { "epoch": 0.9503239740820735, "grad_norm": 0.18092772364616394, "learning_rate": 8.636229362121979e-06, "loss": 0.4036, "step": 440 }, { "epoch": 0.9524838012958964, "grad_norm": 0.19745442271232605, "learning_rate": 8.627592618735989e-06, "loss": 0.4131, "step": 441 }, { "epoch": 0.9546436285097192, "grad_norm": 0.15399040281772614, "learning_rate": 8.618932961503452e-06, "loss": 0.3956, "step": 442 }, { "epoch": 0.9568034557235421, "grad_norm": 0.21613968908786774, "learning_rate": 8.610250445123472e-06, "loss": 0.3957, "step": 443 }, { "epoch": 0.958963282937365, "grad_norm": 0.15756168961524963, "learning_rate": 8.601545124439535e-06, "loss": 0.401, "step": 444 }, { "epoch": 0.9611231101511879, "grad_norm": 0.16475795209407806, "learning_rate": 8.592817054439184e-06, "loss": 0.4091, "step": 445 }, { "epoch": 0.9632829373650108, "grad_norm": 0.17942647635936737, "learning_rate": 8.584066290253649e-06, "loss": 0.3818, "step": 446 }, { "epoch": 0.9654427645788337, "grad_norm": 0.1804707795381546, "learning_rate": 8.575292887157515e-06, "loss": 0.4036, "step": 447 }, { "epoch": 0.9676025917926566, "grad_norm": 0.1610308587551117, "learning_rate": 8.566496900568364e-06, "loss": 0.4046, "step": 448 }, { "epoch": 0.9697624190064795, "grad_norm": 0.17367208003997803, "learning_rate": 8.557678386046429e-06, "loss": 0.399, "step": 449 }, { "epoch": 0.9719222462203023, "grad_norm": 0.16975344717502594, "learning_rate": 8.548837399294235e-06, "loss": 0.3973, "step": 450 }, { "epoch": 0.9740820734341252, "grad_norm": 0.16336052119731903, "learning_rate": 8.539973996156265e-06, "loss": 0.4077, "step": 451 }, { "epoch": 0.9762419006479481, "grad_norm": 0.16016145050525665, "learning_rate": 8.531088232618587e-06, "loss": 0.4005, "step": 452 }, { "epoch": 0.978401727861771, "grad_norm": 0.15805621445178986, "learning_rate": 8.522180164808515e-06, "loss": 0.3885, "step": 453 }, { "epoch": 0.980561555075594, "grad_norm": 0.15626148879528046, "learning_rate": 8.513249848994248e-06, "loss": 0.3912, "step": 454 }, { "epoch": 0.9827213822894169, "grad_norm": 0.1786354035139084, "learning_rate": 8.504297341584509e-06, "loss": 0.4034, "step": 455 }, { "epoch": 0.9848812095032398, "grad_norm": 0.1438089907169342, "learning_rate": 8.495322699128206e-06, "loss": 0.4003, "step": 456 }, { "epoch": 0.9870410367170627, "grad_norm": 0.16011767089366913, "learning_rate": 8.486325978314054e-06, "loss": 0.3985, "step": 457 }, { "epoch": 0.9892008639308856, "grad_norm": 0.18413770198822021, "learning_rate": 8.477307235970235e-06, "loss": 0.3855, "step": 458 }, { "epoch": 0.9913606911447084, "grad_norm": 0.15895338356494904, "learning_rate": 8.468266529064025e-06, "loss": 0.3918, "step": 459 }, { "epoch": 0.9935205183585313, "grad_norm": 0.172573059797287, "learning_rate": 8.459203914701444e-06, "loss": 0.3903, "step": 460 }, { "epoch": 0.9956803455723542, "grad_norm": 0.1525600552558899, "learning_rate": 8.450119450126889e-06, "loss": 0.4066, "step": 461 }, { "epoch": 0.9978401727861771, "grad_norm": 0.1875782459974289, "learning_rate": 8.441013192722774e-06, "loss": 0.405, "step": 462 }, { "epoch": 1.0, "grad_norm": 0.18118026852607727, "learning_rate": 8.431885200009172e-06, "loss": 0.402, "step": 463 }, { "epoch": 1.0021598272138228, "grad_norm": 0.1752985566854477, "learning_rate": 8.422735529643445e-06, "loss": 0.3926, "step": 464 }, { "epoch": 1.0043196544276458, "grad_norm": 0.1703169196844101, "learning_rate": 8.413564239419883e-06, "loss": 0.3838, "step": 465 }, { "epoch": 1.0064794816414686, "grad_norm": 0.181954026222229, "learning_rate": 8.404371387269341e-06, "loss": 0.3863, "step": 466 }, { "epoch": 1.0086393088552916, "grad_norm": 0.16215139627456665, "learning_rate": 8.39515703125887e-06, "loss": 0.3849, "step": 467 }, { "epoch": 1.0107991360691144, "grad_norm": 0.23999503254890442, "learning_rate": 8.385921229591351e-06, "loss": 0.3917, "step": 468 }, { "epoch": 1.0129589632829374, "grad_norm": 0.1752462089061737, "learning_rate": 8.376664040605122e-06, "loss": 0.3812, "step": 469 }, { "epoch": 1.0151187904967602, "grad_norm": 0.17159010469913483, "learning_rate": 8.367385522773625e-06, "loss": 0.386, "step": 470 }, { "epoch": 1.0172786177105833, "grad_norm": 0.19381286203861237, "learning_rate": 8.358085734705021e-06, "loss": 0.3958, "step": 471 }, { "epoch": 1.019438444924406, "grad_norm": 0.17137818038463593, "learning_rate": 8.348764735141823e-06, "loss": 0.3867, "step": 472 }, { "epoch": 1.0215982721382288, "grad_norm": 0.18011374771595, "learning_rate": 8.339422582960533e-06, "loss": 0.3974, "step": 473 }, { "epoch": 1.0237580993520519, "grad_norm": 0.18092289566993713, "learning_rate": 8.33005933717126e-06, "loss": 0.3697, "step": 474 }, { "epoch": 1.0259179265658747, "grad_norm": 0.15359187126159668, "learning_rate": 8.320675056917353e-06, "loss": 0.3813, "step": 475 }, { "epoch": 1.0280777537796977, "grad_norm": 0.16927658021450043, "learning_rate": 8.311269801475026e-06, "loss": 0.3834, "step": 476 }, { "epoch": 1.0302375809935205, "grad_norm": 0.17222736775875092, "learning_rate": 8.301843630252986e-06, "loss": 0.3869, "step": 477 }, { "epoch": 1.0323974082073435, "grad_norm": 0.17333416640758514, "learning_rate": 8.29239660279205e-06, "loss": 0.3853, "step": 478 }, { "epoch": 1.0345572354211663, "grad_norm": 0.18697193264961243, "learning_rate": 8.282928778764783e-06, "loss": 0.3974, "step": 479 }, { "epoch": 1.0367170626349893, "grad_norm": 0.1769992560148239, "learning_rate": 8.273440217975103e-06, "loss": 0.39, "step": 480 }, { "epoch": 1.038876889848812, "grad_norm": 0.1826915144920349, "learning_rate": 8.26393098035792e-06, "loss": 0.383, "step": 481 }, { "epoch": 1.041036717062635, "grad_norm": 0.18807494640350342, "learning_rate": 8.254401125978744e-06, "loss": 0.3875, "step": 482 }, { "epoch": 1.043196544276458, "grad_norm": 0.1729234904050827, "learning_rate": 8.244850715033316e-06, "loss": 0.3888, "step": 483 }, { "epoch": 1.0453563714902807, "grad_norm": 0.18379338085651398, "learning_rate": 8.235279807847223e-06, "loss": 0.3867, "step": 484 }, { "epoch": 1.0475161987041037, "grad_norm": 0.1450575441122055, "learning_rate": 8.225688464875514e-06, "loss": 0.3895, "step": 485 }, { "epoch": 1.0496760259179265, "grad_norm": 0.15889526903629303, "learning_rate": 8.216076746702327e-06, "loss": 0.3817, "step": 486 }, { "epoch": 1.0518358531317495, "grad_norm": 0.16916847229003906, "learning_rate": 8.206444714040496e-06, "loss": 0.382, "step": 487 }, { "epoch": 1.0539956803455723, "grad_norm": 0.1597558856010437, "learning_rate": 8.196792427731175e-06, "loss": 0.3905, "step": 488 }, { "epoch": 1.0561555075593954, "grad_norm": 0.1566566675901413, "learning_rate": 8.18711994874345e-06, "loss": 0.3841, "step": 489 }, { "epoch": 1.0583153347732182, "grad_norm": 0.17559486627578735, "learning_rate": 8.177427338173955e-06, "loss": 0.3792, "step": 490 }, { "epoch": 1.060475161987041, "grad_norm": 0.15165618062019348, "learning_rate": 8.167714657246486e-06, "loss": 0.3804, "step": 491 }, { "epoch": 1.062634989200864, "grad_norm": 0.15612950921058655, "learning_rate": 8.157981967311614e-06, "loss": 0.382, "step": 492 }, { "epoch": 1.0647948164146868, "grad_norm": 0.16774365305900574, "learning_rate": 8.1482293298463e-06, "loss": 0.3905, "step": 493 }, { "epoch": 1.0669546436285098, "grad_norm": 0.1574973613023758, "learning_rate": 8.138456806453503e-06, "loss": 0.3881, "step": 494 }, { "epoch": 1.0691144708423326, "grad_norm": 0.20335029065608978, "learning_rate": 8.12866445886179e-06, "loss": 0.3752, "step": 495 }, { "epoch": 1.0712742980561556, "grad_norm": 0.15830448269844055, "learning_rate": 8.118852348924951e-06, "loss": 0.3814, "step": 496 }, { "epoch": 1.0734341252699784, "grad_norm": 0.20952075719833374, "learning_rate": 8.109020538621607e-06, "loss": 0.3798, "step": 497 }, { "epoch": 1.0755939524838012, "grad_norm": 0.18261830508708954, "learning_rate": 8.099169090054812e-06, "loss": 0.3895, "step": 498 }, { "epoch": 1.0777537796976242, "grad_norm": 0.20644772052764893, "learning_rate": 8.089298065451673e-06, "loss": 0.3744, "step": 499 }, { "epoch": 1.079913606911447, "grad_norm": 0.17039693892002106, "learning_rate": 8.079407527162944e-06, "loss": 0.385, "step": 500 }, { "epoch": 1.08207343412527, "grad_norm": 0.1829117089509964, "learning_rate": 8.069497537662638e-06, "loss": 0.3745, "step": 501 }, { "epoch": 1.0842332613390928, "grad_norm": 0.16001296043395996, "learning_rate": 8.05956815954764e-06, "loss": 0.3796, "step": 502 }, { "epoch": 1.0863930885529158, "grad_norm": 0.1937176138162613, "learning_rate": 8.049619455537296e-06, "loss": 0.3814, "step": 503 }, { "epoch": 1.0885529157667386, "grad_norm": 0.15796703100204468, "learning_rate": 8.039651488473028e-06, "loss": 0.3804, "step": 504 }, { "epoch": 1.0907127429805616, "grad_norm": 0.21041610836982727, "learning_rate": 8.029664321317932e-06, "loss": 0.3862, "step": 505 }, { "epoch": 1.0928725701943844, "grad_norm": 0.18780550360679626, "learning_rate": 8.019658017156384e-06, "loss": 0.3807, "step": 506 }, { "epoch": 1.0950323974082075, "grad_norm": 0.1692945808172226, "learning_rate": 8.009632639193643e-06, "loss": 0.3845, "step": 507 }, { "epoch": 1.0971922246220303, "grad_norm": 0.18981167674064636, "learning_rate": 7.999588250755442e-06, "loss": 0.3848, "step": 508 }, { "epoch": 1.099352051835853, "grad_norm": 0.15760387480258942, "learning_rate": 7.989524915287595e-06, "loss": 0.3757, "step": 509 }, { "epoch": 1.101511879049676, "grad_norm": 0.140371173620224, "learning_rate": 7.979442696355601e-06, "loss": 0.3825, "step": 510 }, { "epoch": 1.1036717062634989, "grad_norm": 0.15832389891147614, "learning_rate": 7.969341657644236e-06, "loss": 0.3863, "step": 511 }, { "epoch": 1.1058315334773219, "grad_norm": 0.14990824460983276, "learning_rate": 7.959221862957149e-06, "loss": 0.3917, "step": 512 }, { "epoch": 1.1079913606911447, "grad_norm": 0.15084333717823029, "learning_rate": 7.94908337621646e-06, "loss": 0.3863, "step": 513 }, { "epoch": 1.1101511879049677, "grad_norm": 0.15440189838409424, "learning_rate": 7.938926261462366e-06, "loss": 0.3785, "step": 514 }, { "epoch": 1.1123110151187905, "grad_norm": 0.16310814023017883, "learning_rate": 7.928750582852722e-06, "loss": 0.3796, "step": 515 }, { "epoch": 1.1144708423326133, "grad_norm": 0.15400250256061554, "learning_rate": 7.918556404662645e-06, "loss": 0.3913, "step": 516 }, { "epoch": 1.1166306695464363, "grad_norm": 0.16480040550231934, "learning_rate": 7.908343791284104e-06, "loss": 0.3817, "step": 517 }, { "epoch": 1.118790496760259, "grad_norm": 0.14894555509090424, "learning_rate": 7.898112807225517e-06, "loss": 0.3797, "step": 518 }, { "epoch": 1.1209503239740821, "grad_norm": 0.16937804222106934, "learning_rate": 7.887863517111337e-06, "loss": 0.3832, "step": 519 }, { "epoch": 1.123110151187905, "grad_norm": 0.1606750190258026, "learning_rate": 7.877595985681656e-06, "loss": 0.3735, "step": 520 }, { "epoch": 1.125269978401728, "grad_norm": 0.1703948825597763, "learning_rate": 7.867310277791778e-06, "loss": 0.3754, "step": 521 }, { "epoch": 1.1274298056155507, "grad_norm": 0.1625399887561798, "learning_rate": 7.857006458411826e-06, "loss": 0.3773, "step": 522 }, { "epoch": 1.1295896328293737, "grad_norm": 0.17872779071331024, "learning_rate": 7.846684592626324e-06, "loss": 0.3867, "step": 523 }, { "epoch": 1.1317494600431965, "grad_norm": 0.14789296686649323, "learning_rate": 7.836344745633785e-06, "loss": 0.3794, "step": 524 }, { "epoch": 1.1339092872570196, "grad_norm": 0.15560902655124664, "learning_rate": 7.8259869827463e-06, "loss": 0.3795, "step": 525 }, { "epoch": 1.1360691144708424, "grad_norm": 0.1677931696176529, "learning_rate": 7.815611369389134e-06, "loss": 0.3921, "step": 526 }, { "epoch": 1.1382289416846652, "grad_norm": 0.15654879808425903, "learning_rate": 7.805217971100295e-06, "loss": 0.3893, "step": 527 }, { "epoch": 1.1403887688984882, "grad_norm": 0.15903332829475403, "learning_rate": 7.794806853530139e-06, "loss": 0.3791, "step": 528 }, { "epoch": 1.142548596112311, "grad_norm": 0.1683822125196457, "learning_rate": 7.78437808244094e-06, "loss": 0.3877, "step": 529 }, { "epoch": 1.144708423326134, "grad_norm": 0.15309610962867737, "learning_rate": 7.773931723706487e-06, "loss": 0.3746, "step": 530 }, { "epoch": 1.1468682505399568, "grad_norm": 0.14578138291835785, "learning_rate": 7.763467843311658e-06, "loss": 0.3767, "step": 531 }, { "epoch": 1.1490280777537798, "grad_norm": 0.16950742900371552, "learning_rate": 7.752986507352009e-06, "loss": 0.3873, "step": 532 }, { "epoch": 1.1511879049676026, "grad_norm": 0.1471281796693802, "learning_rate": 7.742487782033352e-06, "loss": 0.3837, "step": 533 }, { "epoch": 1.1533477321814254, "grad_norm": 0.14385339617729187, "learning_rate": 7.731971733671347e-06, "loss": 0.3944, "step": 534 }, { "epoch": 1.1555075593952484, "grad_norm": 0.14128537476062775, "learning_rate": 7.721438428691065e-06, "loss": 0.3802, "step": 535 }, { "epoch": 1.1576673866090712, "grad_norm": 0.1677146852016449, "learning_rate": 7.71088793362659e-06, "loss": 0.3812, "step": 536 }, { "epoch": 1.1598272138228942, "grad_norm": 0.14564774930477142, "learning_rate": 7.70032031512058e-06, "loss": 0.3827, "step": 537 }, { "epoch": 1.161987041036717, "grad_norm": 0.15598656237125397, "learning_rate": 7.689735639923857e-06, "loss": 0.3829, "step": 538 }, { "epoch": 1.16414686825054, "grad_norm": 0.14980514347553253, "learning_rate": 7.679133974894984e-06, "loss": 0.3767, "step": 539 }, { "epoch": 1.1663066954643628, "grad_norm": 0.15688128769397736, "learning_rate": 7.668515386999837e-06, "loss": 0.3931, "step": 540 }, { "epoch": 1.1684665226781856, "grad_norm": 0.15419645607471466, "learning_rate": 7.65787994331119e-06, "loss": 0.375, "step": 541 }, { "epoch": 1.1706263498920086, "grad_norm": 0.15213316679000854, "learning_rate": 7.647227711008288e-06, "loss": 0.3841, "step": 542 }, { "epoch": 1.1727861771058314, "grad_norm": 0.14635787904262543, "learning_rate": 7.636558757376413e-06, "loss": 0.379, "step": 543 }, { "epoch": 1.1749460043196545, "grad_norm": 0.1601177304983139, "learning_rate": 7.6258731498064796e-06, "loss": 0.3741, "step": 544 }, { "epoch": 1.1771058315334773, "grad_norm": 0.15203504264354706, "learning_rate": 7.615170955794592e-06, "loss": 0.3764, "step": 545 }, { "epoch": 1.1792656587473003, "grad_norm": 0.1715112179517746, "learning_rate": 7.604452242941622e-06, "loss": 0.3811, "step": 546 }, { "epoch": 1.181425485961123, "grad_norm": 0.17397920787334442, "learning_rate": 7.593717078952788e-06, "loss": 0.3826, "step": 547 }, { "epoch": 1.183585313174946, "grad_norm": 0.14259247481822968, "learning_rate": 7.582965531637221e-06, "loss": 0.3725, "step": 548 }, { "epoch": 1.1857451403887689, "grad_norm": 0.16911283135414124, "learning_rate": 7.572197668907533e-06, "loss": 0.3915, "step": 549 }, { "epoch": 1.187904967602592, "grad_norm": 0.1575639694929123, "learning_rate": 7.561413558779401e-06, "loss": 0.3719, "step": 550 }, { "epoch": 1.1900647948164147, "grad_norm": 0.15729346871376038, "learning_rate": 7.550613269371124e-06, "loss": 0.3802, "step": 551 }, { "epoch": 1.1922246220302375, "grad_norm": 0.16103574633598328, "learning_rate": 7.5397968689032e-06, "loss": 0.379, "step": 552 }, { "epoch": 1.1943844492440605, "grad_norm": 0.16614358127117157, "learning_rate": 7.528964425697895e-06, "loss": 0.3874, "step": 553 }, { "epoch": 1.1965442764578833, "grad_norm": 0.14216990768909454, "learning_rate": 7.518116008178805e-06, "loss": 0.3791, "step": 554 }, { "epoch": 1.1987041036717063, "grad_norm": 0.15424562990665436, "learning_rate": 7.507251684870433e-06, "loss": 0.3855, "step": 555 }, { "epoch": 1.2008639308855291, "grad_norm": 0.15728497505187988, "learning_rate": 7.496371524397747e-06, "loss": 0.3767, "step": 556 }, { "epoch": 1.2030237580993521, "grad_norm": 0.16239339113235474, "learning_rate": 7.485475595485756e-06, "loss": 0.39, "step": 557 }, { "epoch": 1.205183585313175, "grad_norm": 0.18078574538230896, "learning_rate": 7.474563966959068e-06, "loss": 0.3805, "step": 558 }, { "epoch": 1.2073434125269977, "grad_norm": 0.1507551670074463, "learning_rate": 7.463636707741458e-06, "loss": 0.385, "step": 559 }, { "epoch": 1.2095032397408207, "grad_norm": 0.1794394552707672, "learning_rate": 7.452693886855438e-06, "loss": 0.3869, "step": 560 }, { "epoch": 1.2116630669546435, "grad_norm": 0.17479896545410156, "learning_rate": 7.4417355734218085e-06, "loss": 0.3763, "step": 561 }, { "epoch": 1.2138228941684666, "grad_norm": 0.15585078299045563, "learning_rate": 7.430761836659235e-06, "loss": 0.3893, "step": 562 }, { "epoch": 1.2159827213822894, "grad_norm": 0.17647355794906616, "learning_rate": 7.4197727458837995e-06, "loss": 0.3858, "step": 563 }, { "epoch": 1.2181425485961124, "grad_norm": 0.1657349020242691, "learning_rate": 7.408768370508577e-06, "loss": 0.3787, "step": 564 }, { "epoch": 1.2203023758099352, "grad_norm": 0.15990415215492249, "learning_rate": 7.397748780043179e-06, "loss": 0.3816, "step": 565 }, { "epoch": 1.2224622030237582, "grad_norm": 0.16552990674972534, "learning_rate": 7.386714044093331e-06, "loss": 0.3818, "step": 566 }, { "epoch": 1.224622030237581, "grad_norm": 0.17762261629104614, "learning_rate": 7.375664232360421e-06, "loss": 0.3823, "step": 567 }, { "epoch": 1.226781857451404, "grad_norm": 0.17362867295742035, "learning_rate": 7.364599414641064e-06, "loss": 0.3796, "step": 568 }, { "epoch": 1.2289416846652268, "grad_norm": 0.15305167436599731, "learning_rate": 7.353519660826665e-06, "loss": 0.3816, "step": 569 }, { "epoch": 1.2311015118790496, "grad_norm": 0.1919698268175125, "learning_rate": 7.342425040902967e-06, "loss": 0.3927, "step": 570 }, { "epoch": 1.2332613390928726, "grad_norm": 0.15654806792736053, "learning_rate": 7.331315624949624e-06, "loss": 0.3844, "step": 571 }, { "epoch": 1.2354211663066954, "grad_norm": 0.1898239254951477, "learning_rate": 7.320191483139742e-06, "loss": 0.3935, "step": 572 }, { "epoch": 1.2375809935205184, "grad_norm": 0.15385276079177856, "learning_rate": 7.309052685739448e-06, "loss": 0.3731, "step": 573 }, { "epoch": 1.2397408207343412, "grad_norm": 0.15585872530937195, "learning_rate": 7.297899303107441e-06, "loss": 0.3802, "step": 574 }, { "epoch": 1.2419006479481642, "grad_norm": 0.14450909197330475, "learning_rate": 7.286731405694544e-06, "loss": 0.368, "step": 575 }, { "epoch": 1.244060475161987, "grad_norm": 0.15306542813777924, "learning_rate": 7.275549064043269e-06, "loss": 0.3827, "step": 576 }, { "epoch": 1.2462203023758098, "grad_norm": 0.15712149441242218, "learning_rate": 7.264352348787364e-06, "loss": 0.3933, "step": 577 }, { "epoch": 1.2483801295896328, "grad_norm": 0.16853763163089752, "learning_rate": 7.253141330651367e-06, "loss": 0.3886, "step": 578 }, { "epoch": 1.2505399568034556, "grad_norm": 0.15141934156417847, "learning_rate": 7.241916080450163e-06, "loss": 0.373, "step": 579 }, { "epoch": 1.2526997840172787, "grad_norm": 0.16748425364494324, "learning_rate": 7.23067666908853e-06, "loss": 0.3779, "step": 580 }, { "epoch": 1.2548596112311015, "grad_norm": 0.15394426882266998, "learning_rate": 7.219423167560701e-06, "loss": 0.3803, "step": 581 }, { "epoch": 1.2570194384449245, "grad_norm": 0.15716637670993805, "learning_rate": 7.208155646949908e-06, "loss": 0.3903, "step": 582 }, { "epoch": 1.2591792656587473, "grad_norm": 0.17571674287319183, "learning_rate": 7.196874178427933e-06, "loss": 0.3693, "step": 583 }, { "epoch": 1.26133909287257, "grad_norm": 0.16210925579071045, "learning_rate": 7.185578833254665e-06, "loss": 0.3806, "step": 584 }, { "epoch": 1.263498920086393, "grad_norm": 0.17312122881412506, "learning_rate": 7.1742696827776415e-06, "loss": 0.3867, "step": 585 }, { "epoch": 1.265658747300216, "grad_norm": 0.16945572197437286, "learning_rate": 7.162946798431605e-06, "loss": 0.3834, "step": 586 }, { "epoch": 1.267818574514039, "grad_norm": 0.15858979523181915, "learning_rate": 7.151610251738045e-06, "loss": 0.3837, "step": 587 }, { "epoch": 1.2699784017278617, "grad_norm": 0.14600925147533417, "learning_rate": 7.1402601143047514e-06, "loss": 0.3797, "step": 588 }, { "epoch": 1.2721382289416847, "grad_norm": 0.15963494777679443, "learning_rate": 7.128896457825364e-06, "loss": 0.3904, "step": 589 }, { "epoch": 1.2742980561555075, "grad_norm": 0.1409822553396225, "learning_rate": 7.11751935407891e-06, "loss": 0.384, "step": 590 }, { "epoch": 1.2764578833693305, "grad_norm": 0.1461641937494278, "learning_rate": 7.106128874929364e-06, "loss": 0.3769, "step": 591 }, { "epoch": 1.2786177105831533, "grad_norm": 0.1487351655960083, "learning_rate": 7.094725092325177e-06, "loss": 0.3766, "step": 592 }, { "epoch": 1.2807775377969763, "grad_norm": 0.1428721696138382, "learning_rate": 7.08330807829884e-06, "loss": 0.3833, "step": 593 }, { "epoch": 1.2829373650107991, "grad_norm": 0.14245618879795074, "learning_rate": 7.071877904966422e-06, "loss": 0.382, "step": 594 }, { "epoch": 1.285097192224622, "grad_norm": 0.1549312025308609, "learning_rate": 7.060434644527105e-06, "loss": 0.3723, "step": 595 }, { "epoch": 1.287257019438445, "grad_norm": 0.14332742989063263, "learning_rate": 7.048978369262747e-06, "loss": 0.385, "step": 596 }, { "epoch": 1.2894168466522677, "grad_norm": 0.15278279781341553, "learning_rate": 7.037509151537404e-06, "loss": 0.3715, "step": 597 }, { "epoch": 1.2915766738660908, "grad_norm": 0.14458084106445312, "learning_rate": 7.026027063796891e-06, "loss": 0.3708, "step": 598 }, { "epoch": 1.2937365010799136, "grad_norm": 0.15547068417072296, "learning_rate": 7.014532178568314e-06, "loss": 0.3784, "step": 599 }, { "epoch": 1.2958963282937366, "grad_norm": 0.15412218868732452, "learning_rate": 7.003024568459614e-06, "loss": 0.3785, "step": 600 }, { "epoch": 1.2980561555075594, "grad_norm": 0.15792393684387207, "learning_rate": 6.991504306159115e-06, "loss": 0.3912, "step": 601 }, { "epoch": 1.3002159827213822, "grad_norm": 0.1512409746646881, "learning_rate": 6.9799714644350504e-06, "loss": 0.3822, "step": 602 }, { "epoch": 1.3023758099352052, "grad_norm": 0.15624138712882996, "learning_rate": 6.968426116135118e-06, "loss": 0.3786, "step": 603 }, { "epoch": 1.3045356371490282, "grad_norm": 0.1699935495853424, "learning_rate": 6.9568683341860135e-06, "loss": 0.382, "step": 604 }, { "epoch": 1.306695464362851, "grad_norm": 0.1427888125181198, "learning_rate": 6.945298191592967e-06, "loss": 0.3694, "step": 605 }, { "epoch": 1.3088552915766738, "grad_norm": 0.15631450712680817, "learning_rate": 6.93371576143929e-06, "loss": 0.3846, "step": 606 }, { "epoch": 1.3110151187904968, "grad_norm": 0.15259280800819397, "learning_rate": 6.922121116885905e-06, "loss": 0.378, "step": 607 }, { "epoch": 1.3131749460043196, "grad_norm": 0.13901083171367645, "learning_rate": 6.910514331170888e-06, "loss": 0.3852, "step": 608 }, { "epoch": 1.3153347732181426, "grad_norm": 0.15216070413589478, "learning_rate": 6.898895477609007e-06, "loss": 0.3852, "step": 609 }, { "epoch": 1.3174946004319654, "grad_norm": 0.13873577117919922, "learning_rate": 6.887264629591254e-06, "loss": 0.3677, "step": 610 }, { "epoch": 1.3196544276457884, "grad_norm": 0.15047885477542877, "learning_rate": 6.875621860584389e-06, "loss": 0.3811, "step": 611 }, { "epoch": 1.3218142548596112, "grad_norm": 0.13761691749095917, "learning_rate": 6.863967244130467e-06, "loss": 0.3766, "step": 612 }, { "epoch": 1.323974082073434, "grad_norm": 0.14068377017974854, "learning_rate": 6.852300853846381e-06, "loss": 0.3768, "step": 613 }, { "epoch": 1.326133909287257, "grad_norm": 0.14240694046020508, "learning_rate": 6.840622763423391e-06, "loss": 0.3804, "step": 614 }, { "epoch": 1.3282937365010798, "grad_norm": 0.14259974658489227, "learning_rate": 6.8289330466266635e-06, "loss": 0.3796, "step": 615 }, { "epoch": 1.3304535637149029, "grad_norm": 0.13572795689105988, "learning_rate": 6.817231777294804e-06, "loss": 0.3791, "step": 616 }, { "epoch": 1.3326133909287257, "grad_norm": 0.14472903311252594, "learning_rate": 6.805519029339388e-06, "loss": 0.3825, "step": 617 }, { "epoch": 1.3347732181425487, "grad_norm": 0.13924075663089752, "learning_rate": 6.793794876744499e-06, "loss": 0.3822, "step": 618 }, { "epoch": 1.3369330453563715, "grad_norm": 0.1507209688425064, "learning_rate": 6.782059393566254e-06, "loss": 0.3799, "step": 619 }, { "epoch": 1.3390928725701943, "grad_norm": 0.15504410862922668, "learning_rate": 6.770312653932346e-06, "loss": 0.396, "step": 620 }, { "epoch": 1.3412526997840173, "grad_norm": 0.14195454120635986, "learning_rate": 6.758554732041564e-06, "loss": 0.3797, "step": 621 }, { "epoch": 1.3434125269978403, "grad_norm": 0.158910870552063, "learning_rate": 6.7467857021633354e-06, "loss": 0.3923, "step": 622 }, { "epoch": 1.345572354211663, "grad_norm": 0.1433819979429245, "learning_rate": 6.7350056386372485e-06, "loss": 0.3819, "step": 623 }, { "epoch": 1.347732181425486, "grad_norm": 0.1474255919456482, "learning_rate": 6.723214615872585e-06, "loss": 0.3819, "step": 624 }, { "epoch": 1.349892008639309, "grad_norm": 0.15845805406570435, "learning_rate": 6.711412708347857e-06, "loss": 0.39, "step": 625 }, { "epoch": 1.3520518358531317, "grad_norm": 0.12925179302692413, "learning_rate": 6.699599990610324e-06, "loss": 0.3779, "step": 626 }, { "epoch": 1.3542116630669545, "grad_norm": 0.1499335616827011, "learning_rate": 6.68777653727553e-06, "loss": 0.3804, "step": 627 }, { "epoch": 1.3563714902807775, "grad_norm": 0.15274159610271454, "learning_rate": 6.675942423026834e-06, "loss": 0.3783, "step": 628 }, { "epoch": 1.3585313174946005, "grad_norm": 0.13748182356357574, "learning_rate": 6.664097722614934e-06, "loss": 0.3735, "step": 629 }, { "epoch": 1.3606911447084233, "grad_norm": 0.14609220623970032, "learning_rate": 6.652242510857395e-06, "loss": 0.392, "step": 630 }, { "epoch": 1.3628509719222461, "grad_norm": 0.1698596030473709, "learning_rate": 6.640376862638176e-06, "loss": 0.3832, "step": 631 }, { "epoch": 1.3650107991360692, "grad_norm": 0.1435316503047943, "learning_rate": 6.6285008529071615e-06, "loss": 0.3819, "step": 632 }, { "epoch": 1.367170626349892, "grad_norm": 0.13530634343624115, "learning_rate": 6.616614556679684e-06, "loss": 0.3809, "step": 633 }, { "epoch": 1.369330453563715, "grad_norm": 0.18893133103847504, "learning_rate": 6.604718049036047e-06, "loss": 0.3828, "step": 634 }, { "epoch": 1.3714902807775378, "grad_norm": 0.15310388803482056, "learning_rate": 6.592811405121064e-06, "loss": 0.3831, "step": 635 }, { "epoch": 1.3736501079913608, "grad_norm": 0.14660876989364624, "learning_rate": 6.580894700143565e-06, "loss": 0.3781, "step": 636 }, { "epoch": 1.3758099352051836, "grad_norm": 0.14833448827266693, "learning_rate": 6.568968009375938e-06, "loss": 0.3775, "step": 637 }, { "epoch": 1.3779697624190064, "grad_norm": 0.1682375818490982, "learning_rate": 6.557031408153642e-06, "loss": 0.3758, "step": 638 }, { "epoch": 1.3801295896328294, "grad_norm": 0.1533748358488083, "learning_rate": 6.545084971874738e-06, "loss": 0.3793, "step": 639 }, { "epoch": 1.3822894168466522, "grad_norm": 0.14140866696834564, "learning_rate": 6.533128775999411e-06, "loss": 0.384, "step": 640 }, { "epoch": 1.3844492440604752, "grad_norm": 0.14936432242393494, "learning_rate": 6.521162896049491e-06, "loss": 0.3891, "step": 641 }, { "epoch": 1.386609071274298, "grad_norm": 0.15731281042099, "learning_rate": 6.509187407607981e-06, "loss": 0.3841, "step": 642 }, { "epoch": 1.388768898488121, "grad_norm": 0.15000282227993011, "learning_rate": 6.497202386318573e-06, "loss": 0.3851, "step": 643 }, { "epoch": 1.3909287257019438, "grad_norm": 0.14697906374931335, "learning_rate": 6.485207907885175e-06, "loss": 0.3773, "step": 644 }, { "epoch": 1.3930885529157666, "grad_norm": 0.1559685468673706, "learning_rate": 6.473204048071433e-06, "loss": 0.3821, "step": 645 }, { "epoch": 1.3952483801295896, "grad_norm": 0.15039733052253723, "learning_rate": 6.4611908827002504e-06, "loss": 0.3847, "step": 646 }, { "epoch": 1.3974082073434126, "grad_norm": 0.16008631885051727, "learning_rate": 6.449168487653305e-06, "loss": 0.3802, "step": 647 }, { "epoch": 1.3995680345572354, "grad_norm": 0.14514020085334778, "learning_rate": 6.437136938870583e-06, "loss": 0.3841, "step": 648 }, { "epoch": 1.4017278617710582, "grad_norm": 0.15419939160346985, "learning_rate": 6.425096312349881e-06, "loss": 0.3903, "step": 649 }, { "epoch": 1.4038876889848813, "grad_norm": 0.1530395895242691, "learning_rate": 6.413046684146343e-06, "loss": 0.3794, "step": 650 }, { "epoch": 1.406047516198704, "grad_norm": 0.15808750689029694, "learning_rate": 6.400988130371969e-06, "loss": 0.3766, "step": 651 }, { "epoch": 1.408207343412527, "grad_norm": 0.14891669154167175, "learning_rate": 6.388920727195138e-06, "loss": 0.3781, "step": 652 }, { "epoch": 1.4103671706263499, "grad_norm": 0.14925065636634827, "learning_rate": 6.376844550840126e-06, "loss": 0.3906, "step": 653 }, { "epoch": 1.4125269978401729, "grad_norm": 0.16382241249084473, "learning_rate": 6.364759677586627e-06, "loss": 0.3771, "step": 654 }, { "epoch": 1.4146868250539957, "grad_norm": 0.1546749770641327, "learning_rate": 6.352666183769269e-06, "loss": 0.3863, "step": 655 }, { "epoch": 1.4168466522678185, "grad_norm": 0.14334626495838165, "learning_rate": 6.340564145777131e-06, "loss": 0.3742, "step": 656 }, { "epoch": 1.4190064794816415, "grad_norm": 0.15877507627010345, "learning_rate": 6.328453640053264e-06, "loss": 0.3779, "step": 657 }, { "epoch": 1.4211663066954643, "grad_norm": 0.1556321382522583, "learning_rate": 6.316334743094201e-06, "loss": 0.3739, "step": 658 }, { "epoch": 1.4233261339092873, "grad_norm": 0.14519956707954407, "learning_rate": 6.304207531449486e-06, "loss": 0.3786, "step": 659 }, { "epoch": 1.42548596112311, "grad_norm": 0.14613112807273865, "learning_rate": 6.292072081721173e-06, "loss": 0.381, "step": 660 }, { "epoch": 1.4276457883369331, "grad_norm": 0.15813447535037994, "learning_rate": 6.279928470563365e-06, "loss": 0.3866, "step": 661 }, { "epoch": 1.429805615550756, "grad_norm": 0.15421751141548157, "learning_rate": 6.267776774681703e-06, "loss": 0.3796, "step": 662 }, { "epoch": 1.4319654427645787, "grad_norm": 0.15891966223716736, "learning_rate": 6.255617070832908e-06, "loss": 0.3717, "step": 663 }, { "epoch": 1.4341252699784017, "grad_norm": 0.15779848396778107, "learning_rate": 6.243449435824276e-06, "loss": 0.3701, "step": 664 }, { "epoch": 1.4362850971922247, "grad_norm": 0.14361926913261414, "learning_rate": 6.231273946513201e-06, "loss": 0.3698, "step": 665 }, { "epoch": 1.4384449244060475, "grad_norm": 0.1553213894367218, "learning_rate": 6.219090679806694e-06, "loss": 0.381, "step": 666 }, { "epoch": 1.4406047516198703, "grad_norm": 0.14260952174663544, "learning_rate": 6.206899712660887e-06, "loss": 0.3734, "step": 667 }, { "epoch": 1.4427645788336934, "grad_norm": 0.147933229804039, "learning_rate": 6.1947011220805535e-06, "loss": 0.3799, "step": 668 }, { "epoch": 1.4449244060475162, "grad_norm": 0.15127696096897125, "learning_rate": 6.182494985118625e-06, "loss": 0.3792, "step": 669 }, { "epoch": 1.4470842332613392, "grad_norm": 0.14316388964653015, "learning_rate": 6.170281378875692e-06, "loss": 0.3727, "step": 670 }, { "epoch": 1.449244060475162, "grad_norm": 0.14327897131443024, "learning_rate": 6.158060380499533e-06, "loss": 0.3823, "step": 671 }, { "epoch": 1.451403887688985, "grad_norm": 0.1345098912715912, "learning_rate": 6.145832067184614e-06, "loss": 0.3924, "step": 672 }, { "epoch": 1.4535637149028078, "grad_norm": 0.13999171555042267, "learning_rate": 6.133596516171609e-06, "loss": 0.3809, "step": 673 }, { "epoch": 1.4557235421166306, "grad_norm": 0.12059102207422256, "learning_rate": 6.121353804746907e-06, "loss": 0.3788, "step": 674 }, { "epoch": 1.4578833693304536, "grad_norm": 0.14187489449977875, "learning_rate": 6.109104010242127e-06, "loss": 0.3845, "step": 675 }, { "epoch": 1.4600431965442764, "grad_norm": 0.14432717859745026, "learning_rate": 6.09684721003363e-06, "loss": 0.3801, "step": 676 }, { "epoch": 1.4622030237580994, "grad_norm": 0.1373838186264038, "learning_rate": 6.084583481542028e-06, "loss": 0.3731, "step": 677 }, { "epoch": 1.4643628509719222, "grad_norm": 0.15109464526176453, "learning_rate": 6.072312902231692e-06, "loss": 0.3895, "step": 678 }, { "epoch": 1.4665226781857452, "grad_norm": 0.15525732934474945, "learning_rate": 6.060035549610275e-06, "loss": 0.3785, "step": 679 }, { "epoch": 1.468682505399568, "grad_norm": 0.14632560312747955, "learning_rate": 6.047751501228203e-06, "loss": 0.3793, "step": 680 }, { "epoch": 1.4708423326133908, "grad_norm": 0.1495695561170578, "learning_rate": 6.0354608346782075e-06, "loss": 0.3817, "step": 681 }, { "epoch": 1.4730021598272138, "grad_norm": 0.1651640236377716, "learning_rate": 6.023163627594813e-06, "loss": 0.386, "step": 682 }, { "epoch": 1.4751619870410368, "grad_norm": 0.14565086364746094, "learning_rate": 6.010859957653869e-06, "loss": 0.3749, "step": 683 }, { "epoch": 1.4773218142548596, "grad_norm": 0.1538180410861969, "learning_rate": 5.9985499025720354e-06, "loss": 0.3769, "step": 684 }, { "epoch": 1.4794816414686824, "grad_norm": 0.12949156761169434, "learning_rate": 5.986233540106315e-06, "loss": 0.3721, "step": 685 }, { "epoch": 1.4816414686825055, "grad_norm": 0.14916200935840607, "learning_rate": 5.973910948053545e-06, "loss": 0.386, "step": 686 }, { "epoch": 1.4838012958963283, "grad_norm": 0.1727481335401535, "learning_rate": 5.961582204249915e-06, "loss": 0.3769, "step": 687 }, { "epoch": 1.485961123110151, "grad_norm": 0.1516205221414566, "learning_rate": 5.949247386570471e-06, "loss": 0.3865, "step": 688 }, { "epoch": 1.488120950323974, "grad_norm": 0.164317786693573, "learning_rate": 5.936906572928625e-06, "loss": 0.3803, "step": 689 }, { "epoch": 1.490280777537797, "grad_norm": 0.15949031710624695, "learning_rate": 5.924559841275661e-06, "loss": 0.3819, "step": 690 }, { "epoch": 1.4924406047516199, "grad_norm": 0.13948768377304077, "learning_rate": 5.912207269600252e-06, "loss": 0.381, "step": 691 }, { "epoch": 1.4946004319654427, "grad_norm": 0.17031709849834442, "learning_rate": 5.89984893592795e-06, "loss": 0.3837, "step": 692 }, { "epoch": 1.4967602591792657, "grad_norm": 0.13423483073711395, "learning_rate": 5.887484918320708e-06, "loss": 0.3824, "step": 693 }, { "epoch": 1.4989200863930885, "grad_norm": 0.15468288958072662, "learning_rate": 5.8751152948763815e-06, "loss": 0.372, "step": 694 }, { "epoch": 1.5010799136069113, "grad_norm": 0.16139718890190125, "learning_rate": 5.8627401437282334e-06, "loss": 0.3775, "step": 695 }, { "epoch": 1.5032397408207343, "grad_norm": 0.15016594529151917, "learning_rate": 5.850359543044446e-06, "loss": 0.3781, "step": 696 }, { "epoch": 1.5053995680345573, "grad_norm": 0.1405385285615921, "learning_rate": 5.837973571027621e-06, "loss": 0.3789, "step": 697 }, { "epoch": 1.5075593952483801, "grad_norm": 0.15115734934806824, "learning_rate": 5.82558230591429e-06, "loss": 0.384, "step": 698 }, { "epoch": 1.509719222462203, "grad_norm": 0.14200249314308167, "learning_rate": 5.813185825974419e-06, "loss": 0.3846, "step": 699 }, { "epoch": 1.511879049676026, "grad_norm": 0.147098109126091, "learning_rate": 5.80078420951091e-06, "loss": 0.3839, "step": 700 }, { "epoch": 1.514038876889849, "grad_norm": 0.1505637764930725, "learning_rate": 5.7883775348591146e-06, "loss": 0.3795, "step": 701 }, { "epoch": 1.5161987041036717, "grad_norm": 0.14681459963321686, "learning_rate": 5.77596588038633e-06, "loss": 0.3879, "step": 702 }, { "epoch": 1.5183585313174945, "grad_norm": 0.1480962336063385, "learning_rate": 5.763549324491317e-06, "loss": 0.3851, "step": 703 }, { "epoch": 1.5205183585313176, "grad_norm": 0.1447979360818863, "learning_rate": 5.751127945603786e-06, "loss": 0.379, "step": 704 }, { "epoch": 1.5226781857451404, "grad_norm": 0.14420117437839508, "learning_rate": 5.7387018221839195e-06, "loss": 0.3844, "step": 705 }, { "epoch": 1.5248380129589632, "grad_norm": 0.1400950700044632, "learning_rate": 5.726271032721864e-06, "loss": 0.3854, "step": 706 }, { "epoch": 1.5269978401727862, "grad_norm": 0.14526066184043884, "learning_rate": 5.7138356557372444e-06, "loss": 0.3815, "step": 707 }, { "epoch": 1.5291576673866092, "grad_norm": 0.15576431155204773, "learning_rate": 5.70139576977866e-06, "loss": 0.3866, "step": 708 }, { "epoch": 1.531317494600432, "grad_norm": 0.1484983265399933, "learning_rate": 5.68895145342319e-06, "loss": 0.3695, "step": 709 }, { "epoch": 1.5334773218142548, "grad_norm": 0.14368119835853577, "learning_rate": 5.6765027852759015e-06, "loss": 0.3751, "step": 710 }, { "epoch": 1.5356371490280778, "grad_norm": 0.14985284209251404, "learning_rate": 5.664049843969348e-06, "loss": 0.3759, "step": 711 }, { "epoch": 1.5377969762419006, "grad_norm": 0.14019222557544708, "learning_rate": 5.651592708163074e-06, "loss": 0.3768, "step": 712 }, { "epoch": 1.5399568034557234, "grad_norm": 0.15098613500595093, "learning_rate": 5.639131456543119e-06, "loss": 0.3755, "step": 713 }, { "epoch": 1.5421166306695464, "grad_norm": 0.13311173021793365, "learning_rate": 5.626666167821522e-06, "loss": 0.3727, "step": 714 }, { "epoch": 1.5442764578833694, "grad_norm": 0.1466924548149109, "learning_rate": 5.614196920735822e-06, "loss": 0.3816, "step": 715 }, { "epoch": 1.5464362850971922, "grad_norm": 0.14080245792865753, "learning_rate": 5.601723794048558e-06, "loss": 0.3808, "step": 716 }, { "epoch": 1.548596112311015, "grad_norm": 0.13415026664733887, "learning_rate": 5.58924686654678e-06, "loss": 0.3846, "step": 717 }, { "epoch": 1.550755939524838, "grad_norm": 0.14663489162921906, "learning_rate": 5.576766217041541e-06, "loss": 0.3728, "step": 718 }, { "epoch": 1.552915766738661, "grad_norm": 0.14009855687618256, "learning_rate": 5.5642819243674085e-06, "loss": 0.3661, "step": 719 }, { "epoch": 1.5550755939524838, "grad_norm": 0.12949179112911224, "learning_rate": 5.551794067381959e-06, "loss": 0.3766, "step": 720 }, { "epoch": 1.5572354211663066, "grad_norm": 0.1387072652578354, "learning_rate": 5.5393027249652844e-06, "loss": 0.3863, "step": 721 }, { "epoch": 1.5593952483801297, "grad_norm": 0.13852353394031525, "learning_rate": 5.526807976019492e-06, "loss": 0.3777, "step": 722 }, { "epoch": 1.5615550755939525, "grad_norm": 0.144193634390831, "learning_rate": 5.514309899468209e-06, "loss": 0.3708, "step": 723 }, { "epoch": 1.5637149028077753, "grad_norm": 0.13979433476924896, "learning_rate": 5.5018085742560745e-06, "loss": 0.3827, "step": 724 }, { "epoch": 1.5658747300215983, "grad_norm": 0.1412208378314972, "learning_rate": 5.489304079348259e-06, "loss": 0.3819, "step": 725 }, { "epoch": 1.5680345572354213, "grad_norm": 0.14092062413692474, "learning_rate": 5.476796493729943e-06, "loss": 0.38, "step": 726 }, { "epoch": 1.570194384449244, "grad_norm": 0.15495967864990234, "learning_rate": 5.46428589640584e-06, "loss": 0.3941, "step": 727 }, { "epoch": 1.5723542116630669, "grad_norm": 0.14512132108211517, "learning_rate": 5.451772366399678e-06, "loss": 0.3912, "step": 728 }, { "epoch": 1.57451403887689, "grad_norm": 0.15392383933067322, "learning_rate": 5.439255982753717e-06, "loss": 0.3751, "step": 729 }, { "epoch": 1.5766738660907127, "grad_norm": 0.14311961829662323, "learning_rate": 5.426736824528236e-06, "loss": 0.379, "step": 730 }, { "epoch": 1.5788336933045355, "grad_norm": 0.14618806540966034, "learning_rate": 5.414214970801041e-06, "loss": 0.3794, "step": 731 }, { "epoch": 1.5809935205183585, "grad_norm": 0.13308942317962646, "learning_rate": 5.401690500666972e-06, "loss": 0.3823, "step": 732 }, { "epoch": 1.5831533477321815, "grad_norm": 0.14792852103710175, "learning_rate": 5.389163493237382e-06, "loss": 0.379, "step": 733 }, { "epoch": 1.5853131749460043, "grad_norm": 0.15516813099384308, "learning_rate": 5.376634027639664e-06, "loss": 0.381, "step": 734 }, { "epoch": 1.5874730021598271, "grad_norm": 0.13429994881153107, "learning_rate": 5.36410218301673e-06, "loss": 0.3848, "step": 735 }, { "epoch": 1.5896328293736501, "grad_norm": 0.13838984072208405, "learning_rate": 5.35156803852652e-06, "loss": 0.3802, "step": 736 }, { "epoch": 1.5917926565874732, "grad_norm": 0.14528213441371918, "learning_rate": 5.339031673341505e-06, "loss": 0.3677, "step": 737 }, { "epoch": 1.593952483801296, "grad_norm": 0.14330457150936127, "learning_rate": 5.326493166648179e-06, "loss": 0.3754, "step": 738 }, { "epoch": 1.5961123110151187, "grad_norm": 0.150346577167511, "learning_rate": 5.3139525976465675e-06, "loss": 0.3867, "step": 739 }, { "epoch": 1.5982721382289418, "grad_norm": 0.13384312391281128, "learning_rate": 5.301410045549719e-06, "loss": 0.3807, "step": 740 }, { "epoch": 1.6004319654427646, "grad_norm": 0.13676618039608002, "learning_rate": 5.2888655895832075e-06, "loss": 0.3776, "step": 741 }, { "epoch": 1.6025917926565874, "grad_norm": 0.1279810667037964, "learning_rate": 5.276319308984637e-06, "loss": 0.3701, "step": 742 }, { "epoch": 1.6047516198704104, "grad_norm": 0.14258207380771637, "learning_rate": 5.263771283003133e-06, "loss": 0.3724, "step": 743 }, { "epoch": 1.6069114470842334, "grad_norm": 0.14498306810855865, "learning_rate": 5.251221590898848e-06, "loss": 0.3716, "step": 744 }, { "epoch": 1.6090712742980562, "grad_norm": 0.14295688271522522, "learning_rate": 5.238670311942459e-06, "loss": 0.3877, "step": 745 }, { "epoch": 1.611231101511879, "grad_norm": 0.12707604467868805, "learning_rate": 5.226117525414663e-06, "loss": 0.3724, "step": 746 }, { "epoch": 1.613390928725702, "grad_norm": 0.14804835617542267, "learning_rate": 5.213563310605686e-06, "loss": 0.3827, "step": 747 }, { "epoch": 1.6155507559395248, "grad_norm": 0.12879379093647003, "learning_rate": 5.201007746814767e-06, "loss": 0.3706, "step": 748 }, { "epoch": 1.6177105831533476, "grad_norm": 0.14704205095767975, "learning_rate": 5.188450913349674e-06, "loss": 0.3869, "step": 749 }, { "epoch": 1.6198704103671706, "grad_norm": 0.1534765362739563, "learning_rate": 5.175892889526189e-06, "loss": 0.3736, "step": 750 }, { "epoch": 1.6220302375809936, "grad_norm": 0.1547222137451172, "learning_rate": 5.16333375466762e-06, "loss": 0.3796, "step": 751 }, { "epoch": 1.6241900647948164, "grad_norm": 0.13741885125637054, "learning_rate": 5.150773588104284e-06, "loss": 0.3817, "step": 752 }, { "epoch": 1.6263498920086392, "grad_norm": 0.13468679785728455, "learning_rate": 5.138212469173022e-06, "loss": 0.3781, "step": 753 }, { "epoch": 1.6285097192224622, "grad_norm": 0.1447237730026245, "learning_rate": 5.1256504772166885e-06, "loss": 0.3609, "step": 754 }, { "epoch": 1.6306695464362853, "grad_norm": 0.14089703559875488, "learning_rate": 5.1130876915836495e-06, "loss": 0.3609, "step": 755 }, { "epoch": 1.6328293736501078, "grad_norm": 0.13828714191913605, "learning_rate": 5.100524191627289e-06, "loss": 0.377, "step": 756 }, { "epoch": 1.6349892008639308, "grad_norm": 0.14428219199180603, "learning_rate": 5.087960056705499e-06, "loss": 0.3702, "step": 757 }, { "epoch": 1.6371490280777539, "grad_norm": 0.15890643000602722, "learning_rate": 5.075395366180186e-06, "loss": 0.3838, "step": 758 }, { "epoch": 1.6393088552915767, "grad_norm": 0.1514156460762024, "learning_rate": 5.062830199416764e-06, "loss": 0.3852, "step": 759 }, { "epoch": 1.6414686825053995, "grad_norm": 0.13208477199077606, "learning_rate": 5.050264635783654e-06, "loss": 0.3925, "step": 760 }, { "epoch": 1.6436285097192225, "grad_norm": 0.13601286709308624, "learning_rate": 5.037698754651786e-06, "loss": 0.3847, "step": 761 }, { "epoch": 1.6457883369330455, "grad_norm": 0.1418139487504959, "learning_rate": 5.025132635394095e-06, "loss": 0.3744, "step": 762 }, { "epoch": 1.6479481641468683, "grad_norm": 0.14959508180618286, "learning_rate": 5.0125663573850204e-06, "loss": 0.3712, "step": 763 }, { "epoch": 1.650107991360691, "grad_norm": 0.12993188202381134, "learning_rate": 5e-06, "loss": 0.38, "step": 764 }, { "epoch": 1.652267818574514, "grad_norm": 0.14041666686534882, "learning_rate": 4.987433642614981e-06, "loss": 0.3751, "step": 765 }, { "epoch": 1.654427645788337, "grad_norm": 0.1548304408788681, "learning_rate": 4.974867364605906e-06, "loss": 0.3588, "step": 766 }, { "epoch": 1.6565874730021597, "grad_norm": 0.12369633466005325, "learning_rate": 4.962301245348215e-06, "loss": 0.3822, "step": 767 }, { "epoch": 1.6587473002159827, "grad_norm": 0.13229462504386902, "learning_rate": 4.949735364216348e-06, "loss": 0.3631, "step": 768 }, { "epoch": 1.6609071274298057, "grad_norm": 0.13191936910152435, "learning_rate": 4.937169800583237e-06, "loss": 0.3783, "step": 769 }, { "epoch": 1.6630669546436285, "grad_norm": 0.14189469814300537, "learning_rate": 4.924604633819815e-06, "loss": 0.3724, "step": 770 }, { "epoch": 1.6652267818574513, "grad_norm": 0.1306021511554718, "learning_rate": 4.912039943294502e-06, "loss": 0.3736, "step": 771 }, { "epoch": 1.6673866090712743, "grad_norm": 0.1423332244157791, "learning_rate": 4.899475808372714e-06, "loss": 0.3735, "step": 772 }, { "epoch": 1.6695464362850974, "grad_norm": 0.13784444332122803, "learning_rate": 4.886912308416353e-06, "loss": 0.3737, "step": 773 }, { "epoch": 1.67170626349892, "grad_norm": 0.13520213961601257, "learning_rate": 4.874349522783313e-06, "loss": 0.3678, "step": 774 }, { "epoch": 1.673866090712743, "grad_norm": 0.15318076312541962, "learning_rate": 4.861787530826979e-06, "loss": 0.3716, "step": 775 }, { "epoch": 1.676025917926566, "grad_norm": 0.12309125065803528, "learning_rate": 4.8492264118957165e-06, "loss": 0.386, "step": 776 }, { "epoch": 1.6781857451403888, "grad_norm": 0.1470710188150406, "learning_rate": 4.8366662453323826e-06, "loss": 0.3848, "step": 777 }, { "epoch": 1.6803455723542116, "grad_norm": 0.12907086312770844, "learning_rate": 4.8241071104738115e-06, "loss": 0.3689, "step": 778 }, { "epoch": 1.6825053995680346, "grad_norm": 0.13970084488391876, "learning_rate": 4.811549086650327e-06, "loss": 0.3814, "step": 779 }, { "epoch": 1.6846652267818576, "grad_norm": 0.13439306616783142, "learning_rate": 4.798992253185233e-06, "loss": 0.3717, "step": 780 }, { "epoch": 1.6868250539956804, "grad_norm": 0.13519345223903656, "learning_rate": 4.786436689394317e-06, "loss": 0.3765, "step": 781 }, { "epoch": 1.6889848812095032, "grad_norm": 0.13258984684944153, "learning_rate": 4.773882474585338e-06, "loss": 0.3809, "step": 782 }, { "epoch": 1.6911447084233262, "grad_norm": 0.12966322898864746, "learning_rate": 4.761329688057543e-06, "loss": 0.3782, "step": 783 }, { "epoch": 1.693304535637149, "grad_norm": 0.13643068075180054, "learning_rate": 4.748778409101153e-06, "loss": 0.3796, "step": 784 }, { "epoch": 1.6954643628509718, "grad_norm": 0.1507895290851593, "learning_rate": 4.736228716996868e-06, "loss": 0.3789, "step": 785 }, { "epoch": 1.6976241900647948, "grad_norm": 0.14031574130058289, "learning_rate": 4.723680691015366e-06, "loss": 0.3816, "step": 786 }, { "epoch": 1.6997840172786178, "grad_norm": 0.13055071234703064, "learning_rate": 4.711134410416794e-06, "loss": 0.3643, "step": 787 }, { "epoch": 1.7019438444924406, "grad_norm": 0.15579389035701752, "learning_rate": 4.6985899544502835e-06, "loss": 0.3797, "step": 788 }, { "epoch": 1.7041036717062634, "grad_norm": 0.1301419883966446, "learning_rate": 4.686047402353433e-06, "loss": 0.3793, "step": 789 }, { "epoch": 1.7062634989200864, "grad_norm": 0.13526466488838196, "learning_rate": 4.673506833351821e-06, "loss": 0.3911, "step": 790 }, { "epoch": 1.7084233261339092, "grad_norm": 0.1373325139284134, "learning_rate": 4.660968326658497e-06, "loss": 0.3774, "step": 791 }, { "epoch": 1.710583153347732, "grad_norm": 0.1474619358778, "learning_rate": 4.648431961473482e-06, "loss": 0.368, "step": 792 }, { "epoch": 1.712742980561555, "grad_norm": 0.14143545925617218, "learning_rate": 4.635897816983272e-06, "loss": 0.3779, "step": 793 }, { "epoch": 1.714902807775378, "grad_norm": 0.14204931259155273, "learning_rate": 4.6233659723603374e-06, "loss": 0.3667, "step": 794 }, { "epoch": 1.7170626349892009, "grad_norm": 0.13979306817054749, "learning_rate": 4.610836506762618e-06, "loss": 0.3782, "step": 795 }, { "epoch": 1.7192224622030237, "grad_norm": 0.14510124921798706, "learning_rate": 4.59830949933303e-06, "loss": 0.3705, "step": 796 }, { "epoch": 1.7213822894168467, "grad_norm": 0.14503952860832214, "learning_rate": 4.5857850291989596e-06, "loss": 0.3804, "step": 797 }, { "epoch": 1.7235421166306697, "grad_norm": 0.13347502052783966, "learning_rate": 4.573263175471766e-06, "loss": 0.3706, "step": 798 }, { "epoch": 1.7257019438444925, "grad_norm": 0.12476824969053268, "learning_rate": 4.560744017246284e-06, "loss": 0.3756, "step": 799 }, { "epoch": 1.7278617710583153, "grad_norm": 0.13821221888065338, "learning_rate": 4.548227633600322e-06, "loss": 0.3802, "step": 800 }, { "epoch": 1.7300215982721383, "grad_norm": 0.13669945299625397, "learning_rate": 4.535714103594162e-06, "loss": 0.3818, "step": 801 }, { "epoch": 1.732181425485961, "grad_norm": 0.1308770775794983, "learning_rate": 4.523203506270058e-06, "loss": 0.3836, "step": 802 }, { "epoch": 1.734341252699784, "grad_norm": 0.1351071000099182, "learning_rate": 4.510695920651742e-06, "loss": 0.3757, "step": 803 }, { "epoch": 1.736501079913607, "grad_norm": 0.1277286410331726, "learning_rate": 4.4981914257439254e-06, "loss": 0.387, "step": 804 }, { "epoch": 1.73866090712743, "grad_norm": 0.1272444725036621, "learning_rate": 4.485690100531793e-06, "loss": 0.3829, "step": 805 }, { "epoch": 1.7408207343412527, "grad_norm": 0.14064733684062958, "learning_rate": 4.473192023980509e-06, "loss": 0.3822, "step": 806 }, { "epoch": 1.7429805615550755, "grad_norm": 0.13635459542274475, "learning_rate": 4.460697275034717e-06, "loss": 0.38, "step": 807 }, { "epoch": 1.7451403887688985, "grad_norm": 0.136144757270813, "learning_rate": 4.448205932618042e-06, "loss": 0.3794, "step": 808 }, { "epoch": 1.7473002159827213, "grad_norm": 0.14044472575187683, "learning_rate": 4.4357180756325915e-06, "loss": 0.3741, "step": 809 }, { "epoch": 1.7494600431965441, "grad_norm": 0.13555637001991272, "learning_rate": 4.423233782958459e-06, "loss": 0.369, "step": 810 }, { "epoch": 1.7516198704103672, "grad_norm": 0.1326342225074768, "learning_rate": 4.410753133453222e-06, "loss": 0.3784, "step": 811 }, { "epoch": 1.7537796976241902, "grad_norm": 0.13601535558700562, "learning_rate": 4.398276205951443e-06, "loss": 0.3821, "step": 812 }, { "epoch": 1.755939524838013, "grad_norm": 0.13336274027824402, "learning_rate": 4.38580307926418e-06, "loss": 0.3713, "step": 813 }, { "epoch": 1.7580993520518358, "grad_norm": 0.14658118784427643, "learning_rate": 4.373333832178478e-06, "loss": 0.3825, "step": 814 }, { "epoch": 1.7602591792656588, "grad_norm": 0.14051300287246704, "learning_rate": 4.360868543456883e-06, "loss": 0.3685, "step": 815 }, { "epoch": 1.7624190064794818, "grad_norm": 0.1231897696852684, "learning_rate": 4.348407291836928e-06, "loss": 0.37, "step": 816 }, { "epoch": 1.7645788336933044, "grad_norm": 0.13528205454349518, "learning_rate": 4.335950156030653e-06, "loss": 0.3855, "step": 817 }, { "epoch": 1.7667386609071274, "grad_norm": 0.14070774614810944, "learning_rate": 4.323497214724099e-06, "loss": 0.3752, "step": 818 }, { "epoch": 1.7688984881209504, "grad_norm": 0.1284414529800415, "learning_rate": 4.31104854657681e-06, "loss": 0.3659, "step": 819 }, { "epoch": 1.7710583153347732, "grad_norm": 0.13247400522232056, "learning_rate": 4.298604230221341e-06, "loss": 0.3727, "step": 820 }, { "epoch": 1.773218142548596, "grad_norm": 0.12880460917949677, "learning_rate": 4.286164344262756e-06, "loss": 0.3867, "step": 821 }, { "epoch": 1.775377969762419, "grad_norm": 0.12950289249420166, "learning_rate": 4.273728967278137e-06, "loss": 0.3685, "step": 822 }, { "epoch": 1.777537796976242, "grad_norm": 0.1209382489323616, "learning_rate": 4.261298177816082e-06, "loss": 0.3658, "step": 823 }, { "epoch": 1.7796976241900648, "grad_norm": 0.1271076798439026, "learning_rate": 4.248872054396215e-06, "loss": 0.3801, "step": 824 }, { "epoch": 1.7818574514038876, "grad_norm": 0.1265021562576294, "learning_rate": 4.2364506755086856e-06, "loss": 0.3719, "step": 825 }, { "epoch": 1.7840172786177106, "grad_norm": 0.12241175025701523, "learning_rate": 4.224034119613671e-06, "loss": 0.3744, "step": 826 }, { "epoch": 1.7861771058315334, "grad_norm": 0.12162206321954727, "learning_rate": 4.211622465140887e-06, "loss": 0.3797, "step": 827 }, { "epoch": 1.7883369330453562, "grad_norm": 0.12623707950115204, "learning_rate": 4.199215790489091e-06, "loss": 0.3859, "step": 828 }, { "epoch": 1.7904967602591793, "grad_norm": 0.13991111516952515, "learning_rate": 4.186814174025582e-06, "loss": 0.3736, "step": 829 }, { "epoch": 1.7926565874730023, "grad_norm": 0.12621738016605377, "learning_rate": 4.174417694085711e-06, "loss": 0.3743, "step": 830 }, { "epoch": 1.794816414686825, "grad_norm": 0.12811291217803955, "learning_rate": 4.16202642897238e-06, "loss": 0.3782, "step": 831 }, { "epoch": 1.7969762419006479, "grad_norm": 0.12236473709344864, "learning_rate": 4.149640456955555e-06, "loss": 0.3764, "step": 832 }, { "epoch": 1.7991360691144709, "grad_norm": 0.142435684800148, "learning_rate": 4.137259856271767e-06, "loss": 0.3719, "step": 833 }, { "epoch": 1.801295896328294, "grad_norm": 0.12946586310863495, "learning_rate": 4.124884705123619e-06, "loss": 0.3852, "step": 834 }, { "epoch": 1.8034557235421165, "grad_norm": 0.1189626008272171, "learning_rate": 4.112515081679295e-06, "loss": 0.3751, "step": 835 }, { "epoch": 1.8056155507559395, "grad_norm": 0.13230590522289276, "learning_rate": 4.1001510640720525e-06, "loss": 0.3688, "step": 836 }, { "epoch": 1.8077753779697625, "grad_norm": 0.13355307281017303, "learning_rate": 4.087792730399749e-06, "loss": 0.3885, "step": 837 }, { "epoch": 1.8099352051835853, "grad_norm": 0.13115477561950684, "learning_rate": 4.075440158724339e-06, "loss": 0.3807, "step": 838 }, { "epoch": 1.812095032397408, "grad_norm": 0.11709022521972656, "learning_rate": 4.063093427071376e-06, "loss": 0.3715, "step": 839 }, { "epoch": 1.8142548596112311, "grad_norm": 0.13939060270786285, "learning_rate": 4.0507526134295314e-06, "loss": 0.3718, "step": 840 }, { "epoch": 1.8164146868250541, "grad_norm": 0.13002587854862213, "learning_rate": 4.038417795750086e-06, "loss": 0.378, "step": 841 }, { "epoch": 1.818574514038877, "grad_norm": 0.13568507134914398, "learning_rate": 4.0260890519464565e-06, "loss": 0.3715, "step": 842 }, { "epoch": 1.8207343412526997, "grad_norm": 0.13115161657333374, "learning_rate": 4.013766459893686e-06, "loss": 0.374, "step": 843 }, { "epoch": 1.8228941684665227, "grad_norm": 0.1360725313425064, "learning_rate": 4.001450097427965e-06, "loss": 0.3915, "step": 844 }, { "epoch": 1.8250539956803455, "grad_norm": 0.14375773072242737, "learning_rate": 3.989140042346134e-06, "loss": 0.3823, "step": 845 }, { "epoch": 1.8272138228941683, "grad_norm": 0.14056192338466644, "learning_rate": 3.9768363724051875e-06, "loss": 0.3797, "step": 846 }, { "epoch": 1.8293736501079914, "grad_norm": 0.13145223259925842, "learning_rate": 3.964539165321795e-06, "loss": 0.3651, "step": 847 }, { "epoch": 1.8315334773218144, "grad_norm": 0.1401747614145279, "learning_rate": 3.952248498771797e-06, "loss": 0.3803, "step": 848 }, { "epoch": 1.8336933045356372, "grad_norm": 0.1457161009311676, "learning_rate": 3.939964450389728e-06, "loss": 0.3875, "step": 849 }, { "epoch": 1.83585313174946, "grad_norm": 0.1399625837802887, "learning_rate": 3.927687097768309e-06, "loss": 0.3855, "step": 850 }, { "epoch": 1.838012958963283, "grad_norm": 0.12442053109407425, "learning_rate": 3.915416518457974e-06, "loss": 0.3885, "step": 851 }, { "epoch": 1.8401727861771058, "grad_norm": 0.12682035565376282, "learning_rate": 3.9031527899663705e-06, "loss": 0.3708, "step": 852 }, { "epoch": 1.8423326133909286, "grad_norm": 0.12829378247261047, "learning_rate": 3.890895989757874e-06, "loss": 0.376, "step": 853 }, { "epoch": 1.8444924406047516, "grad_norm": 0.14053881168365479, "learning_rate": 3.8786461952530955e-06, "loss": 0.373, "step": 854 }, { "epoch": 1.8466522678185746, "grad_norm": 0.1281130015850067, "learning_rate": 3.866403483828392e-06, "loss": 0.3773, "step": 855 }, { "epoch": 1.8488120950323974, "grad_norm": 0.12932966649532318, "learning_rate": 3.854167932815387e-06, "loss": 0.383, "step": 856 }, { "epoch": 1.8509719222462202, "grad_norm": 0.1419646143913269, "learning_rate": 3.841939619500468e-06, "loss": 0.3674, "step": 857 }, { "epoch": 1.8531317494600432, "grad_norm": 0.12675082683563232, "learning_rate": 3.8297186211243085e-06, "loss": 0.3814, "step": 858 }, { "epoch": 1.8552915766738662, "grad_norm": 0.11979357898235321, "learning_rate": 3.817505014881378e-06, "loss": 0.38, "step": 859 }, { "epoch": 1.857451403887689, "grad_norm": 0.13714280724525452, "learning_rate": 3.8052988779194478e-06, "loss": 0.3823, "step": 860 }, { "epoch": 1.8596112311015118, "grad_norm": 0.13954536616802216, "learning_rate": 3.7931002873391156e-06, "loss": 0.3796, "step": 861 }, { "epoch": 1.8617710583153348, "grad_norm": 0.11842264980077744, "learning_rate": 3.7809093201933078e-06, "loss": 0.3761, "step": 862 }, { "epoch": 1.8639308855291576, "grad_norm": 0.12878850102424622, "learning_rate": 3.7687260534868e-06, "loss": 0.3821, "step": 863 }, { "epoch": 1.8660907127429804, "grad_norm": 0.14155155420303345, "learning_rate": 3.756550564175727e-06, "loss": 0.3748, "step": 864 }, { "epoch": 1.8682505399568035, "grad_norm": 0.12996132671833038, "learning_rate": 3.744382929167094e-06, "loss": 0.3741, "step": 865 }, { "epoch": 1.8704103671706265, "grad_norm": 0.12898430228233337, "learning_rate": 3.7322232253182984e-06, "loss": 0.3763, "step": 866 }, { "epoch": 1.8725701943844493, "grad_norm": 0.12044209241867065, "learning_rate": 3.7200715294366376e-06, "loss": 0.3747, "step": 867 }, { "epoch": 1.874730021598272, "grad_norm": 0.12121162563562393, "learning_rate": 3.7079279182788263e-06, "loss": 0.381, "step": 868 }, { "epoch": 1.876889848812095, "grad_norm": 0.12106562405824661, "learning_rate": 3.695792468550517e-06, "loss": 0.3767, "step": 869 }, { "epoch": 1.8790496760259179, "grad_norm": 0.118615061044693, "learning_rate": 3.6836652569057994e-06, "loss": 0.3708, "step": 870 }, { "epoch": 1.8812095032397407, "grad_norm": 0.12058837711811066, "learning_rate": 3.6715463599467372e-06, "loss": 0.3778, "step": 871 }, { "epoch": 1.8833693304535637, "grad_norm": 0.12485583126544952, "learning_rate": 3.659435854222869e-06, "loss": 0.3679, "step": 872 }, { "epoch": 1.8855291576673867, "grad_norm": 0.11957580596208572, "learning_rate": 3.6473338162307314e-06, "loss": 0.3709, "step": 873 }, { "epoch": 1.8876889848812095, "grad_norm": 0.12649306654930115, "learning_rate": 3.635240322413375e-06, "loss": 0.3803, "step": 874 }, { "epoch": 1.8898488120950323, "grad_norm": 0.12188448011875153, "learning_rate": 3.6231554491598766e-06, "loss": 0.3753, "step": 875 }, { "epoch": 1.8920086393088553, "grad_norm": 0.12264645844697952, "learning_rate": 3.6110792728048636e-06, "loss": 0.3736, "step": 876 }, { "epoch": 1.8941684665226783, "grad_norm": 0.12633198499679565, "learning_rate": 3.599011869628033e-06, "loss": 0.3734, "step": 877 }, { "epoch": 1.896328293736501, "grad_norm": 0.12245716154575348, "learning_rate": 3.5869533158536583e-06, "loss": 0.3661, "step": 878 }, { "epoch": 1.898488120950324, "grad_norm": 0.11652278900146484, "learning_rate": 3.5749036876501196e-06, "loss": 0.3775, "step": 879 }, { "epoch": 1.900647948164147, "grad_norm": 0.1323150098323822, "learning_rate": 3.562863061129419e-06, "loss": 0.3736, "step": 880 }, { "epoch": 1.9028077753779697, "grad_norm": 0.1193445473909378, "learning_rate": 3.550831512346695e-06, "loss": 0.3756, "step": 881 }, { "epoch": 1.9049676025917925, "grad_norm": 0.11626636981964111, "learning_rate": 3.538809117299751e-06, "loss": 0.3771, "step": 882 }, { "epoch": 1.9071274298056156, "grad_norm": 0.13352340459823608, "learning_rate": 3.526795951928569e-06, "loss": 0.3828, "step": 883 }, { "epoch": 1.9092872570194386, "grad_norm": 0.13351115584373474, "learning_rate": 3.5147920921148267e-06, "loss": 0.3645, "step": 884 }, { "epoch": 1.9114470842332614, "grad_norm": 0.11943700909614563, "learning_rate": 3.502797613681429e-06, "loss": 0.386, "step": 885 }, { "epoch": 1.9136069114470842, "grad_norm": 0.1416894644498825, "learning_rate": 3.4908125923920204e-06, "loss": 0.3771, "step": 886 }, { "epoch": 1.9157667386609072, "grad_norm": 0.12883397936820984, "learning_rate": 3.478837103950509e-06, "loss": 0.38, "step": 887 }, { "epoch": 1.91792656587473, "grad_norm": 0.12375036627054214, "learning_rate": 3.4668712240005912e-06, "loss": 0.3771, "step": 888 }, { "epoch": 1.9200863930885528, "grad_norm": 0.13057532906532288, "learning_rate": 3.4549150281252635e-06, "loss": 0.3867, "step": 889 }, { "epoch": 1.9222462203023758, "grad_norm": 0.13063670694828033, "learning_rate": 3.442968591846359e-06, "loss": 0.3746, "step": 890 }, { "epoch": 1.9244060475161988, "grad_norm": 0.11987043917179108, "learning_rate": 3.431031990624063e-06, "loss": 0.3733, "step": 891 }, { "epoch": 1.9265658747300216, "grad_norm": 0.13609716296195984, "learning_rate": 3.4191052998564344e-06, "loss": 0.3766, "step": 892 }, { "epoch": 1.9287257019438444, "grad_norm": 0.1286943554878235, "learning_rate": 3.407188594878938e-06, "loss": 0.3777, "step": 893 }, { "epoch": 1.9308855291576674, "grad_norm": 0.15128813683986664, "learning_rate": 3.3952819509639534e-06, "loss": 0.3729, "step": 894 }, { "epoch": 1.9330453563714904, "grad_norm": 0.12446796149015427, "learning_rate": 3.3833854433203185e-06, "loss": 0.3773, "step": 895 }, { "epoch": 1.935205183585313, "grad_norm": 0.1316557675600052, "learning_rate": 3.3714991470928393e-06, "loss": 0.3843, "step": 896 }, { "epoch": 1.937365010799136, "grad_norm": 0.12782582640647888, "learning_rate": 3.359623137361825e-06, "loss": 0.3787, "step": 897 }, { "epoch": 1.939524838012959, "grad_norm": 0.13275963068008423, "learning_rate": 3.347757489142608e-06, "loss": 0.3809, "step": 898 }, { "epoch": 1.9416846652267818, "grad_norm": 0.1355566680431366, "learning_rate": 3.3359022773850673e-06, "loss": 0.3798, "step": 899 }, { "epoch": 1.9438444924406046, "grad_norm": 0.12694980204105377, "learning_rate": 3.3240575769731662e-06, "loss": 0.3825, "step": 900 }, { "epoch": 1.9460043196544277, "grad_norm": 0.13038381934165955, "learning_rate": 3.312223462724472e-06, "loss": 0.3861, "step": 901 }, { "epoch": 1.9481641468682507, "grad_norm": 0.1373014897108078, "learning_rate": 3.300400009389678e-06, "loss": 0.3828, "step": 902 }, { "epoch": 1.9503239740820735, "grad_norm": 0.1347927749156952, "learning_rate": 3.2885872916521445e-06, "loss": 0.3701, "step": 903 }, { "epoch": 1.9524838012958963, "grad_norm": 0.12417130172252655, "learning_rate": 3.2767853841274154e-06, "loss": 0.3823, "step": 904 }, { "epoch": 1.9546436285097193, "grad_norm": 0.1381063610315323, "learning_rate": 3.264994361362753e-06, "loss": 0.3768, "step": 905 }, { "epoch": 1.956803455723542, "grad_norm": 0.12305869907140732, "learning_rate": 3.2532142978366654e-06, "loss": 0.3803, "step": 906 }, { "epoch": 1.9589632829373649, "grad_norm": 0.12444626539945602, "learning_rate": 3.241445267958438e-06, "loss": 0.3717, "step": 907 }, { "epoch": 1.961123110151188, "grad_norm": 0.12498262524604797, "learning_rate": 3.2296873460676557e-06, "loss": 0.3739, "step": 908 }, { "epoch": 1.963282937365011, "grad_norm": 0.11730080097913742, "learning_rate": 3.217940606433747e-06, "loss": 0.379, "step": 909 }, { "epoch": 1.9654427645788337, "grad_norm": 0.13885721564292908, "learning_rate": 3.2062051232555024e-06, "loss": 0.3693, "step": 910 }, { "epoch": 1.9676025917926565, "grad_norm": 0.12217065691947937, "learning_rate": 3.1944809706606123e-06, "loss": 0.3739, "step": 911 }, { "epoch": 1.9697624190064795, "grad_norm": 0.1336861401796341, "learning_rate": 3.182768222705198e-06, "loss": 0.3747, "step": 912 }, { "epoch": 1.9719222462203023, "grad_norm": 0.12711098790168762, "learning_rate": 3.171066953373338e-06, "loss": 0.3821, "step": 913 }, { "epoch": 1.9740820734341251, "grad_norm": 0.1329392045736313, "learning_rate": 3.1593772365766107e-06, "loss": 0.376, "step": 914 }, { "epoch": 1.9762419006479481, "grad_norm": 0.12040119618177414, "learning_rate": 3.147699146153621e-06, "loss": 0.3738, "step": 915 }, { "epoch": 1.9784017278617712, "grad_norm": 0.23164218664169312, "learning_rate": 3.1360327558695336e-06, "loss": 0.3802, "step": 916 }, { "epoch": 1.980561555075594, "grad_norm": 0.11707053333520889, "learning_rate": 3.1243781394156138e-06, "loss": 0.3813, "step": 917 }, { "epoch": 1.9827213822894167, "grad_norm": 0.1273183971643448, "learning_rate": 3.1127353704087477e-06, "loss": 0.3779, "step": 918 }, { "epoch": 1.9848812095032398, "grad_norm": 0.1234814003109932, "learning_rate": 3.1011045223909954e-06, "loss": 0.3804, "step": 919 }, { "epoch": 1.9870410367170628, "grad_norm": 0.1258484572172165, "learning_rate": 3.089485668829113e-06, "loss": 0.3811, "step": 920 }, { "epoch": 1.9892008639308856, "grad_norm": 0.1231926903128624, "learning_rate": 3.077878883114096e-06, "loss": 0.3831, "step": 921 }, { "epoch": 1.9913606911447084, "grad_norm": 0.12332677841186523, "learning_rate": 3.066284238560713e-06, "loss": 0.3698, "step": 922 }, { "epoch": 1.9935205183585314, "grad_norm": 0.12334899604320526, "learning_rate": 3.0547018084070344e-06, "loss": 0.3768, "step": 923 }, { "epoch": 1.9956803455723542, "grad_norm": 0.11958076804876328, "learning_rate": 3.043131665813988e-06, "loss": 0.3684, "step": 924 }, { "epoch": 1.997840172786177, "grad_norm": 0.12707985937595367, "learning_rate": 3.031573883864882e-06, "loss": 0.382, "step": 925 }, { "epoch": 2.0, "grad_norm": 0.14072422683238983, "learning_rate": 3.0200285355649504e-06, "loss": 0.3729, "step": 926 }, { "epoch": 2.002159827213823, "grad_norm": 0.1437283754348755, "learning_rate": 3.0084956938408873e-06, "loss": 0.3623, "step": 927 }, { "epoch": 2.0043196544276456, "grad_norm": 0.12962134182453156, "learning_rate": 2.9969754315403865e-06, "loss": 0.3649, "step": 928 }, { "epoch": 2.0064794816414686, "grad_norm": 0.13019190728664398, "learning_rate": 2.9854678214316875e-06, "loss": 0.3626, "step": 929 }, { "epoch": 2.0086393088552916, "grad_norm": 0.12730036675930023, "learning_rate": 2.97397293620311e-06, "loss": 0.3572, "step": 930 }, { "epoch": 2.0107991360691146, "grad_norm": 0.13056515157222748, "learning_rate": 2.962490848462596e-06, "loss": 0.3474, "step": 931 }, { "epoch": 2.012958963282937, "grad_norm": 0.13034315407276154, "learning_rate": 2.951021630737255e-06, "loss": 0.3679, "step": 932 }, { "epoch": 2.0151187904967602, "grad_norm": 0.14478375017642975, "learning_rate": 2.9395653554728955e-06, "loss": 0.3579, "step": 933 }, { "epoch": 2.0172786177105833, "grad_norm": 0.1417471021413803, "learning_rate": 2.92812209503358e-06, "loss": 0.365, "step": 934 }, { "epoch": 2.019438444924406, "grad_norm": 0.13237528502941132, "learning_rate": 2.91669192170116e-06, "loss": 0.3658, "step": 935 }, { "epoch": 2.021598272138229, "grad_norm": 0.1314917653799057, "learning_rate": 2.9052749076748266e-06, "loss": 0.3687, "step": 936 }, { "epoch": 2.023758099352052, "grad_norm": 0.11867891997098923, "learning_rate": 2.8938711250706397e-06, "loss": 0.3643, "step": 937 }, { "epoch": 2.025917926565875, "grad_norm": 0.13976238667964935, "learning_rate": 2.8824806459210907e-06, "loss": 0.3678, "step": 938 }, { "epoch": 2.0280777537796975, "grad_norm": 0.1445903778076172, "learning_rate": 2.871103542174637e-06, "loss": 0.3574, "step": 939 }, { "epoch": 2.0302375809935205, "grad_norm": 0.11427946388721466, "learning_rate": 2.8597398856952473e-06, "loss": 0.3569, "step": 940 }, { "epoch": 2.0323974082073435, "grad_norm": 0.13496039807796478, "learning_rate": 2.8483897482619566e-06, "loss": 0.3717, "step": 941 }, { "epoch": 2.0345572354211665, "grad_norm": 0.13540537655353546, "learning_rate": 2.837053201568396e-06, "loss": 0.3667, "step": 942 }, { "epoch": 2.036717062634989, "grad_norm": 0.12281273305416107, "learning_rate": 2.825730317222358e-06, "loss": 0.3541, "step": 943 }, { "epoch": 2.038876889848812, "grad_norm": 0.12640658020973206, "learning_rate": 2.814421166745337e-06, "loss": 0.3641, "step": 944 }, { "epoch": 2.041036717062635, "grad_norm": 0.12110286951065063, "learning_rate": 2.803125821572068e-06, "loss": 0.3597, "step": 945 }, { "epoch": 2.0431965442764577, "grad_norm": 0.13443802297115326, "learning_rate": 2.791844353050094e-06, "loss": 0.3709, "step": 946 }, { "epoch": 2.0453563714902807, "grad_norm": 0.11386435478925705, "learning_rate": 2.7805768324393017e-06, "loss": 0.3681, "step": 947 }, { "epoch": 2.0475161987041037, "grad_norm": 0.13114500045776367, "learning_rate": 2.769323330911472e-06, "loss": 0.3602, "step": 948 }, { "epoch": 2.0496760259179267, "grad_norm": 0.13121016323566437, "learning_rate": 2.7580839195498397e-06, "loss": 0.3567, "step": 949 }, { "epoch": 2.0518358531317493, "grad_norm": 0.11939337104558945, "learning_rate": 2.746858669348634e-06, "loss": 0.3611, "step": 950 }, { "epoch": 2.0539956803455723, "grad_norm": 0.11663561314344406, "learning_rate": 2.7356476512126386e-06, "loss": 0.3557, "step": 951 }, { "epoch": 2.0561555075593954, "grad_norm": 0.11576730012893677, "learning_rate": 2.724450935956733e-06, "loss": 0.3723, "step": 952 }, { "epoch": 2.058315334773218, "grad_norm": 0.1251356601715088, "learning_rate": 2.713268594305458e-06, "loss": 0.3637, "step": 953 }, { "epoch": 2.060475161987041, "grad_norm": 0.12093979120254517, "learning_rate": 2.7021006968925613e-06, "loss": 0.3752, "step": 954 }, { "epoch": 2.062634989200864, "grad_norm": 0.12214156985282898, "learning_rate": 2.6909473142605522e-06, "loss": 0.3638, "step": 955 }, { "epoch": 2.064794816414687, "grad_norm": 0.12628315389156342, "learning_rate": 2.6798085168602595e-06, "loss": 0.3667, "step": 956 }, { "epoch": 2.0669546436285096, "grad_norm": 0.12237667292356491, "learning_rate": 2.668684375050378e-06, "loss": 0.3653, "step": 957 }, { "epoch": 2.0691144708423326, "grad_norm": 0.1107870489358902, "learning_rate": 2.6575749590970336e-06, "loss": 0.3558, "step": 958 }, { "epoch": 2.0712742980561556, "grad_norm": 0.1208115965127945, "learning_rate": 2.646480339173337e-06, "loss": 0.3733, "step": 959 }, { "epoch": 2.0734341252699786, "grad_norm": 0.12692323327064514, "learning_rate": 2.635400585358937e-06, "loss": 0.3663, "step": 960 }, { "epoch": 2.075593952483801, "grad_norm": 0.11760963499546051, "learning_rate": 2.624335767639582e-06, "loss": 0.3638, "step": 961 }, { "epoch": 2.077753779697624, "grad_norm": 0.12751303613185883, "learning_rate": 2.6132859559066704e-06, "loss": 0.3547, "step": 962 }, { "epoch": 2.079913606911447, "grad_norm": 0.12997639179229736, "learning_rate": 2.6022512199568205e-06, "loss": 0.3558, "step": 963 }, { "epoch": 2.08207343412527, "grad_norm": 0.12291760742664337, "learning_rate": 2.5912316294914232e-06, "loss": 0.3506, "step": 964 }, { "epoch": 2.084233261339093, "grad_norm": 0.11633151024580002, "learning_rate": 2.580227254116199e-06, "loss": 0.3648, "step": 965 }, { "epoch": 2.086393088552916, "grad_norm": 0.12379375100135803, "learning_rate": 2.5692381633407672e-06, "loss": 0.3652, "step": 966 }, { "epoch": 2.088552915766739, "grad_norm": 0.12270376831293106, "learning_rate": 2.558264426578192e-06, "loss": 0.3625, "step": 967 }, { "epoch": 2.0907127429805614, "grad_norm": 0.12057667225599289, "learning_rate": 2.547306113144564e-06, "loss": 0.3712, "step": 968 }, { "epoch": 2.0928725701943844, "grad_norm": 0.1182745024561882, "learning_rate": 2.536363292258543e-06, "loss": 0.3686, "step": 969 }, { "epoch": 2.0950323974082075, "grad_norm": 0.12089554965496063, "learning_rate": 2.5254360330409343e-06, "loss": 0.3603, "step": 970 }, { "epoch": 2.09719222462203, "grad_norm": 0.12302310764789581, "learning_rate": 2.514524404514248e-06, "loss": 0.3599, "step": 971 }, { "epoch": 2.099352051835853, "grad_norm": 0.1283075213432312, "learning_rate": 2.503628475602256e-06, "loss": 0.3685, "step": 972 }, { "epoch": 2.101511879049676, "grad_norm": 0.11500417441129684, "learning_rate": 2.49274831512957e-06, "loss": 0.3657, "step": 973 }, { "epoch": 2.103671706263499, "grad_norm": 0.11335953325033188, "learning_rate": 2.4818839918211963e-06, "loss": 0.3689, "step": 974 }, { "epoch": 2.1058315334773217, "grad_norm": 0.12606894969940186, "learning_rate": 2.4710355743021077e-06, "loss": 0.359, "step": 975 }, { "epoch": 2.1079913606911447, "grad_norm": 0.11400944739580154, "learning_rate": 2.4602031310968013e-06, "loss": 0.3661, "step": 976 }, { "epoch": 2.1101511879049677, "grad_norm": 0.11969246715307236, "learning_rate": 2.4493867306288772e-06, "loss": 0.3618, "step": 977 }, { "epoch": 2.1123110151187907, "grad_norm": 0.11956711113452911, "learning_rate": 2.4385864412206e-06, "loss": 0.3516, "step": 978 }, { "epoch": 2.1144708423326133, "grad_norm": 0.11470730602741241, "learning_rate": 2.4278023310924676e-06, "loss": 0.3651, "step": 979 }, { "epoch": 2.1166306695464363, "grad_norm": 0.12043334543704987, "learning_rate": 2.417034468362782e-06, "loss": 0.3702, "step": 980 }, { "epoch": 2.1187904967602593, "grad_norm": 0.11915960907936096, "learning_rate": 2.406282921047213e-06, "loss": 0.3609, "step": 981 }, { "epoch": 2.120950323974082, "grad_norm": 0.1116413027048111, "learning_rate": 2.395547757058379e-06, "loss": 0.3576, "step": 982 }, { "epoch": 2.123110151187905, "grad_norm": 0.11029747128486633, "learning_rate": 2.3848290442054096e-06, "loss": 0.3618, "step": 983 }, { "epoch": 2.125269978401728, "grad_norm": 0.12164044380187988, "learning_rate": 2.3741268501935212e-06, "loss": 0.3557, "step": 984 }, { "epoch": 2.127429805615551, "grad_norm": 0.11805900186300278, "learning_rate": 2.3634412426235886e-06, "loss": 0.3665, "step": 985 }, { "epoch": 2.1295896328293735, "grad_norm": 0.12578925490379333, "learning_rate": 2.3527722889917147e-06, "loss": 0.3617, "step": 986 }, { "epoch": 2.1317494600431965, "grad_norm": 0.11140415817499161, "learning_rate": 2.3421200566888096e-06, "loss": 0.3529, "step": 987 }, { "epoch": 2.1339092872570196, "grad_norm": 0.12330644577741623, "learning_rate": 2.3314846130001622e-06, "loss": 0.3512, "step": 988 }, { "epoch": 2.136069114470842, "grad_norm": 0.11442252993583679, "learning_rate": 2.320866025105016e-06, "loss": 0.3527, "step": 989 }, { "epoch": 2.138228941684665, "grad_norm": 0.11933194845914841, "learning_rate": 2.3102643600761445e-06, "loss": 0.3481, "step": 990 }, { "epoch": 2.140388768898488, "grad_norm": 0.11264543980360031, "learning_rate": 2.299679684879421e-06, "loss": 0.3583, "step": 991 }, { "epoch": 2.142548596112311, "grad_norm": 0.11280512809753418, "learning_rate": 2.289112066373411e-06, "loss": 0.3629, "step": 992 }, { "epoch": 2.1447084233261338, "grad_norm": 0.11324049532413483, "learning_rate": 2.2785615713089363e-06, "loss": 0.3609, "step": 993 }, { "epoch": 2.146868250539957, "grad_norm": 0.10841232538223267, "learning_rate": 2.268028266328655e-06, "loss": 0.3613, "step": 994 }, { "epoch": 2.14902807775378, "grad_norm": 0.1152244582772255, "learning_rate": 2.25751221796665e-06, "loss": 0.3571, "step": 995 }, { "epoch": 2.1511879049676024, "grad_norm": 0.11110089719295502, "learning_rate": 2.247013492647994e-06, "loss": 0.3548, "step": 996 }, { "epoch": 2.1533477321814254, "grad_norm": 0.11328666657209396, "learning_rate": 2.2365321566883437e-06, "loss": 0.3586, "step": 997 }, { "epoch": 2.1555075593952484, "grad_norm": 0.11004538089036942, "learning_rate": 2.2260682762935137e-06, "loss": 0.3565, "step": 998 }, { "epoch": 2.1576673866090714, "grad_norm": 0.11562500894069672, "learning_rate": 2.2156219175590623e-06, "loss": 0.3619, "step": 999 }, { "epoch": 2.159827213822894, "grad_norm": 0.11296035349369049, "learning_rate": 2.2051931464698636e-06, "loss": 0.3656, "step": 1000 }, { "epoch": 2.161987041036717, "grad_norm": 0.11270337551832199, "learning_rate": 2.1947820288997067e-06, "loss": 0.3496, "step": 1001 }, { "epoch": 2.16414686825054, "grad_norm": 0.11527759581804276, "learning_rate": 2.1843886306108686e-06, "loss": 0.3695, "step": 1002 }, { "epoch": 2.166306695464363, "grad_norm": 0.11408324539661407, "learning_rate": 2.174013017253701e-06, "loss": 0.3651, "step": 1003 }, { "epoch": 2.1684665226781856, "grad_norm": 0.10843408107757568, "learning_rate": 2.1636552543662187e-06, "loss": 0.3692, "step": 1004 }, { "epoch": 2.1706263498920086, "grad_norm": 0.11223003268241882, "learning_rate": 2.153315407373679e-06, "loss": 0.3545, "step": 1005 }, { "epoch": 2.1727861771058317, "grad_norm": 0.11480898410081863, "learning_rate": 2.1429935415881753e-06, "loss": 0.3609, "step": 1006 }, { "epoch": 2.1749460043196542, "grad_norm": 0.1133100613951683, "learning_rate": 2.132689722208223e-06, "loss": 0.361, "step": 1007 }, { "epoch": 2.1771058315334773, "grad_norm": 0.11355537176132202, "learning_rate": 2.1224040143183444e-06, "loss": 0.3681, "step": 1008 }, { "epoch": 2.1792656587473003, "grad_norm": 0.11831656098365784, "learning_rate": 2.112136482888663e-06, "loss": 0.3555, "step": 1009 }, { "epoch": 2.1814254859611233, "grad_norm": 0.11772197484970093, "learning_rate": 2.1018871927744844e-06, "loss": 0.3604, "step": 1010 }, { "epoch": 2.183585313174946, "grad_norm": 0.10822444409132004, "learning_rate": 2.0916562087158964e-06, "loss": 0.3583, "step": 1011 }, { "epoch": 2.185745140388769, "grad_norm": 0.21270522475242615, "learning_rate": 2.0814435953373554e-06, "loss": 0.3651, "step": 1012 }, { "epoch": 2.187904967602592, "grad_norm": 0.11271930485963821, "learning_rate": 2.0712494171472776e-06, "loss": 0.367, "step": 1013 }, { "epoch": 2.190064794816415, "grad_norm": 0.1191214919090271, "learning_rate": 2.061073738537635e-06, "loss": 0.3566, "step": 1014 }, { "epoch": 2.1922246220302375, "grad_norm": 0.1228100061416626, "learning_rate": 2.0509166237835398e-06, "loss": 0.3553, "step": 1015 }, { "epoch": 2.1943844492440605, "grad_norm": 0.11371961981058121, "learning_rate": 2.040778137042852e-06, "loss": 0.3621, "step": 1016 }, { "epoch": 2.1965442764578835, "grad_norm": 0.10948773473501205, "learning_rate": 2.030658342355765e-06, "loss": 0.3612, "step": 1017 }, { "epoch": 2.198704103671706, "grad_norm": 0.10944036394357681, "learning_rate": 2.0205573036443994e-06, "loss": 0.3619, "step": 1018 }, { "epoch": 2.200863930885529, "grad_norm": 0.11753126233816147, "learning_rate": 2.0104750847124075e-06, "loss": 0.3636, "step": 1019 }, { "epoch": 2.203023758099352, "grad_norm": 0.12510347366333008, "learning_rate": 2.0004117492445614e-06, "loss": 0.3789, "step": 1020 }, { "epoch": 2.205183585313175, "grad_norm": 0.1162487342953682, "learning_rate": 1.990367360806359e-06, "loss": 0.3595, "step": 1021 }, { "epoch": 2.2073434125269977, "grad_norm": 0.12260331958532333, "learning_rate": 1.980341982843616e-06, "loss": 0.3659, "step": 1022 }, { "epoch": 2.2095032397408207, "grad_norm": 0.11793196201324463, "learning_rate": 1.9703356786820687e-06, "loss": 0.3644, "step": 1023 }, { "epoch": 2.2116630669546438, "grad_norm": 0.11070533841848373, "learning_rate": 1.9603485115269743e-06, "loss": 0.3587, "step": 1024 }, { "epoch": 2.2138228941684663, "grad_norm": 0.10772062093019485, "learning_rate": 1.9503805444627054e-06, "loss": 0.358, "step": 1025 }, { "epoch": 2.2159827213822894, "grad_norm": 0.11722833663225174, "learning_rate": 1.9404318404523605e-06, "loss": 0.3529, "step": 1026 }, { "epoch": 2.2181425485961124, "grad_norm": 0.11525849252939224, "learning_rate": 1.930502462337362e-06, "loss": 0.3526, "step": 1027 }, { "epoch": 2.2203023758099354, "grad_norm": 0.12186475098133087, "learning_rate": 1.920592472837057e-06, "loss": 0.3642, "step": 1028 }, { "epoch": 2.222462203023758, "grad_norm": 0.11602187156677246, "learning_rate": 1.910701934548329e-06, "loss": 0.3741, "step": 1029 }, { "epoch": 2.224622030237581, "grad_norm": 0.12122868001461029, "learning_rate": 1.900830909945189e-06, "loss": 0.3658, "step": 1030 }, { "epoch": 2.226781857451404, "grad_norm": 0.11481517553329468, "learning_rate": 1.8909794613783943e-06, "loss": 0.3586, "step": 1031 }, { "epoch": 2.2289416846652266, "grad_norm": 0.10677429288625717, "learning_rate": 1.8811476510750486e-06, "loss": 0.367, "step": 1032 }, { "epoch": 2.2311015118790496, "grad_norm": 0.11565054953098297, "learning_rate": 1.8713355411382117e-06, "loss": 0.3629, "step": 1033 }, { "epoch": 2.2332613390928726, "grad_norm": 0.11869722604751587, "learning_rate": 1.8615431935464984e-06, "loss": 0.3455, "step": 1034 }, { "epoch": 2.2354211663066956, "grad_norm": 0.12298930436372757, "learning_rate": 1.8517706701536998e-06, "loss": 0.377, "step": 1035 }, { "epoch": 2.237580993520518, "grad_norm": 0.11223292350769043, "learning_rate": 1.8420180326883857e-06, "loss": 0.3611, "step": 1036 }, { "epoch": 2.239740820734341, "grad_norm": 0.10755477845668793, "learning_rate": 1.8322853427535148e-06, "loss": 0.3636, "step": 1037 }, { "epoch": 2.2419006479481642, "grad_norm": 0.11490552872419357, "learning_rate": 1.822572661826047e-06, "loss": 0.3606, "step": 1038 }, { "epoch": 2.2440604751619873, "grad_norm": 0.11000396311283112, "learning_rate": 1.8128800512565514e-06, "loss": 0.3643, "step": 1039 }, { "epoch": 2.24622030237581, "grad_norm": 0.10895387083292007, "learning_rate": 1.803207572268826e-06, "loss": 0.3623, "step": 1040 }, { "epoch": 2.248380129589633, "grad_norm": 0.11881309747695923, "learning_rate": 1.7935552859595058e-06, "loss": 0.3598, "step": 1041 }, { "epoch": 2.250539956803456, "grad_norm": 0.11568914353847504, "learning_rate": 1.7839232532976746e-06, "loss": 0.3652, "step": 1042 }, { "epoch": 2.2526997840172784, "grad_norm": 0.10827185958623886, "learning_rate": 1.7743115351244883e-06, "loss": 0.3616, "step": 1043 }, { "epoch": 2.2548596112311015, "grad_norm": 0.12083268910646439, "learning_rate": 1.7647201921527802e-06, "loss": 0.3696, "step": 1044 }, { "epoch": 2.2570194384449245, "grad_norm": 0.11744555830955505, "learning_rate": 1.7551492849666857e-06, "loss": 0.3547, "step": 1045 }, { "epoch": 2.2591792656587475, "grad_norm": 0.11333145946264267, "learning_rate": 1.7455988740212576e-06, "loss": 0.3648, "step": 1046 }, { "epoch": 2.26133909287257, "grad_norm": 0.1083984524011612, "learning_rate": 1.7360690196420816e-06, "loss": 0.3609, "step": 1047 }, { "epoch": 2.263498920086393, "grad_norm": 0.12069600075483322, "learning_rate": 1.7265597820248987e-06, "loss": 0.3617, "step": 1048 }, { "epoch": 2.265658747300216, "grad_norm": 0.11563380807638168, "learning_rate": 1.7170712212352187e-06, "loss": 0.3554, "step": 1049 }, { "epoch": 2.267818574514039, "grad_norm": 0.11244919896125793, "learning_rate": 1.7076033972079503e-06, "loss": 0.3526, "step": 1050 }, { "epoch": 2.2699784017278617, "grad_norm": 0.11943572014570236, "learning_rate": 1.698156369747016e-06, "loss": 0.3639, "step": 1051 }, { "epoch": 2.2721382289416847, "grad_norm": 0.11513727903366089, "learning_rate": 1.6887301985249754e-06, "loss": 0.3622, "step": 1052 }, { "epoch": 2.2742980561555077, "grad_norm": 0.11251917481422424, "learning_rate": 1.6793249430826502e-06, "loss": 0.3606, "step": 1053 }, { "epoch": 2.2764578833693303, "grad_norm": 0.11887813359498978, "learning_rate": 1.6699406628287423e-06, "loss": 0.3602, "step": 1054 }, { "epoch": 2.2786177105831533, "grad_norm": 0.1043018326163292, "learning_rate": 1.6605774170394683e-06, "loss": 0.3597, "step": 1055 }, { "epoch": 2.2807775377969763, "grad_norm": 0.11690463870763779, "learning_rate": 1.651235264858177e-06, "loss": 0.3706, "step": 1056 }, { "epoch": 2.282937365010799, "grad_norm": 0.11119679361581802, "learning_rate": 1.6419142652949793e-06, "loss": 0.3755, "step": 1057 }, { "epoch": 2.285097192224622, "grad_norm": 0.12275518476963043, "learning_rate": 1.6326144772263752e-06, "loss": 0.3617, "step": 1058 }, { "epoch": 2.287257019438445, "grad_norm": 0.11455702781677246, "learning_rate": 1.6233359593948777e-06, "loss": 0.3561, "step": 1059 }, { "epoch": 2.289416846652268, "grad_norm": 0.1072060838341713, "learning_rate": 1.6140787704086502e-06, "loss": 0.3595, "step": 1060 }, { "epoch": 2.2915766738660905, "grad_norm": 0.11446718126535416, "learning_rate": 1.6048429687411294e-06, "loss": 0.3579, "step": 1061 }, { "epoch": 2.2937365010799136, "grad_norm": 0.1233833059668541, "learning_rate": 1.5956286127306591e-06, "loss": 0.3571, "step": 1062 }, { "epoch": 2.2958963282937366, "grad_norm": 0.11054225265979767, "learning_rate": 1.586435760580118e-06, "loss": 0.3592, "step": 1063 }, { "epoch": 2.2980561555075596, "grad_norm": 0.11470706015825272, "learning_rate": 1.5772644703565564e-06, "loss": 0.3602, "step": 1064 }, { "epoch": 2.300215982721382, "grad_norm": 0.1131376326084137, "learning_rate": 1.5681147999908308e-06, "loss": 0.3579, "step": 1065 }, { "epoch": 2.302375809935205, "grad_norm": 0.1124383881688118, "learning_rate": 1.5589868072772279e-06, "loss": 0.3592, "step": 1066 }, { "epoch": 2.304535637149028, "grad_norm": 0.13568998873233795, "learning_rate": 1.5498805498731146e-06, "loss": 0.3687, "step": 1067 }, { "epoch": 2.306695464362851, "grad_norm": 0.11868295818567276, "learning_rate": 1.5407960852985582e-06, "loss": 0.3741, "step": 1068 }, { "epoch": 2.308855291576674, "grad_norm": 0.11386443674564362, "learning_rate": 1.531733470935976e-06, "loss": 0.3702, "step": 1069 }, { "epoch": 2.311015118790497, "grad_norm": 0.11155420541763306, "learning_rate": 1.5226927640297663e-06, "loss": 0.3543, "step": 1070 }, { "epoch": 2.31317494600432, "grad_norm": 0.11469469219446182, "learning_rate": 1.5136740216859464e-06, "loss": 0.3718, "step": 1071 }, { "epoch": 2.3153347732181424, "grad_norm": 0.10761052370071411, "learning_rate": 1.5046773008717968e-06, "loss": 0.3728, "step": 1072 }, { "epoch": 2.3174946004319654, "grad_norm": 0.10855443775653839, "learning_rate": 1.4957026584154926e-06, "loss": 0.3612, "step": 1073 }, { "epoch": 2.3196544276457884, "grad_norm": 0.11300813406705856, "learning_rate": 1.4867501510057548e-06, "loss": 0.3629, "step": 1074 }, { "epoch": 2.3218142548596115, "grad_norm": 0.1190650686621666, "learning_rate": 1.4778198351914853e-06, "loss": 0.358, "step": 1075 }, { "epoch": 2.323974082073434, "grad_norm": 0.11081087589263916, "learning_rate": 1.4689117673814135e-06, "loss": 0.3579, "step": 1076 }, { "epoch": 2.326133909287257, "grad_norm": 0.10845163464546204, "learning_rate": 1.4600260038437376e-06, "loss": 0.3547, "step": 1077 }, { "epoch": 2.32829373650108, "grad_norm": 0.10712606459856033, "learning_rate": 1.4511626007057667e-06, "loss": 0.3702, "step": 1078 }, { "epoch": 2.3304535637149026, "grad_norm": 0.10121244937181473, "learning_rate": 1.4423216139535735e-06, "loss": 0.3701, "step": 1079 }, { "epoch": 2.3326133909287257, "grad_norm": 0.10943249613046646, "learning_rate": 1.4335030994316357e-06, "loss": 0.3673, "step": 1080 }, { "epoch": 2.3347732181425487, "grad_norm": 0.11610903590917587, "learning_rate": 1.4247071128424838e-06, "loss": 0.3603, "step": 1081 }, { "epoch": 2.3369330453563713, "grad_norm": 0.11263252794742584, "learning_rate": 1.4159337097463515e-06, "loss": 0.3646, "step": 1082 }, { "epoch": 2.3390928725701943, "grad_norm": 0.11808553338050842, "learning_rate": 1.407182945560817e-06, "loss": 0.3551, "step": 1083 }, { "epoch": 2.3412526997840173, "grad_norm": 0.11071130633354187, "learning_rate": 1.3984548755604655e-06, "loss": 0.3591, "step": 1084 }, { "epoch": 2.3434125269978403, "grad_norm": 0.10774732381105423, "learning_rate": 1.38974955487653e-06, "loss": 0.3701, "step": 1085 }, { "epoch": 2.345572354211663, "grad_norm": 0.10596179217100143, "learning_rate": 1.3810670384965469e-06, "loss": 0.3619, "step": 1086 }, { "epoch": 2.347732181425486, "grad_norm": 0.10586302727460861, "learning_rate": 1.372407381264011e-06, "loss": 0.3671, "step": 1087 }, { "epoch": 2.349892008639309, "grad_norm": 0.11271238327026367, "learning_rate": 1.3637706378780209e-06, "loss": 0.369, "step": 1088 }, { "epoch": 2.352051835853132, "grad_norm": 0.11300753057003021, "learning_rate": 1.3551568628929434e-06, "loss": 0.366, "step": 1089 }, { "epoch": 2.3542116630669545, "grad_norm": 0.10634942352771759, "learning_rate": 1.346566110718061e-06, "loss": 0.3608, "step": 1090 }, { "epoch": 2.3563714902807775, "grad_norm": 0.11670755594968796, "learning_rate": 1.337998435617235e-06, "loss": 0.3649, "step": 1091 }, { "epoch": 2.3585313174946005, "grad_norm": 0.11227838695049286, "learning_rate": 1.3294538917085586e-06, "loss": 0.3496, "step": 1092 }, { "epoch": 2.360691144708423, "grad_norm": 0.11369525641202927, "learning_rate": 1.3209325329640126e-06, "loss": 0.367, "step": 1093 }, { "epoch": 2.362850971922246, "grad_norm": 0.10753148049116135, "learning_rate": 1.312434413209131e-06, "loss": 0.3654, "step": 1094 }, { "epoch": 2.365010799136069, "grad_norm": 0.11079417914152145, "learning_rate": 1.3039595861226579e-06, "loss": 0.3535, "step": 1095 }, { "epoch": 2.367170626349892, "grad_norm": 0.10849615931510925, "learning_rate": 1.2955081052362072e-06, "loss": 0.3584, "step": 1096 }, { "epoch": 2.3693304535637147, "grad_norm": 0.10960622876882553, "learning_rate": 1.2870800239339237e-06, "loss": 0.3578, "step": 1097 }, { "epoch": 2.3714902807775378, "grad_norm": 0.11225436627864838, "learning_rate": 1.2786753954521508e-06, "loss": 0.3645, "step": 1098 }, { "epoch": 2.373650107991361, "grad_norm": 0.11186996102333069, "learning_rate": 1.2702942728790897e-06, "loss": 0.3564, "step": 1099 }, { "epoch": 2.375809935205184, "grad_norm": 0.10800560563802719, "learning_rate": 1.2619367091544654e-06, "loss": 0.3595, "step": 1100 }, { "epoch": 2.3779697624190064, "grad_norm": 0.11503534764051437, "learning_rate": 1.2536027570691938e-06, "loss": 0.363, "step": 1101 }, { "epoch": 2.3801295896328294, "grad_norm": 0.1053680032491684, "learning_rate": 1.2452924692650443e-06, "loss": 0.3668, "step": 1102 }, { "epoch": 2.3822894168466524, "grad_norm": 0.10837449133396149, "learning_rate": 1.2370058982343109e-06, "loss": 0.3646, "step": 1103 }, { "epoch": 2.384449244060475, "grad_norm": 0.10401103645563126, "learning_rate": 1.2287430963194807e-06, "loss": 0.3523, "step": 1104 }, { "epoch": 2.386609071274298, "grad_norm": 0.1130133643746376, "learning_rate": 1.2205041157129017e-06, "loss": 0.3522, "step": 1105 }, { "epoch": 2.388768898488121, "grad_norm": 0.11143437772989273, "learning_rate": 1.2122890084564542e-06, "loss": 0.3622, "step": 1106 }, { "epoch": 2.390928725701944, "grad_norm": 0.1088298037648201, "learning_rate": 1.204097826441218e-06, "loss": 0.3524, "step": 1107 }, { "epoch": 2.3930885529157666, "grad_norm": 0.11658685654401779, "learning_rate": 1.1959306214071508e-06, "loss": 0.3649, "step": 1108 }, { "epoch": 2.3952483801295896, "grad_norm": 0.10530900955200195, "learning_rate": 1.18778744494276e-06, "loss": 0.3732, "step": 1109 }, { "epoch": 2.3974082073434126, "grad_norm": 0.10576412826776505, "learning_rate": 1.1796683484847731e-06, "loss": 0.3528, "step": 1110 }, { "epoch": 2.3995680345572357, "grad_norm": 0.10664583742618561, "learning_rate": 1.1715733833178178e-06, "loss": 0.3638, "step": 1111 }, { "epoch": 2.4017278617710582, "grad_norm": 0.11170324683189392, "learning_rate": 1.1635026005740902e-06, "loss": 0.3632, "step": 1112 }, { "epoch": 2.4038876889848813, "grad_norm": 0.10899297147989273, "learning_rate": 1.1554560512330437e-06, "loss": 0.3717, "step": 1113 }, { "epoch": 2.4060475161987043, "grad_norm": 0.10355883091688156, "learning_rate": 1.1474337861210543e-06, "loss": 0.3669, "step": 1114 }, { "epoch": 2.408207343412527, "grad_norm": 0.11601343005895615, "learning_rate": 1.1394358559111101e-06, "loss": 0.3675, "step": 1115 }, { "epoch": 2.41036717062635, "grad_norm": 0.10625651478767395, "learning_rate": 1.1314623111224865e-06, "loss": 0.3696, "step": 1116 }, { "epoch": 2.412526997840173, "grad_norm": 0.1087704598903656, "learning_rate": 1.1235132021204226e-06, "loss": 0.3678, "step": 1117 }, { "epoch": 2.4146868250539955, "grad_norm": 0.1125335842370987, "learning_rate": 1.1155885791158128e-06, "loss": 0.3676, "step": 1118 }, { "epoch": 2.4168466522678185, "grad_norm": 0.10977572947740555, "learning_rate": 1.1076884921648834e-06, "loss": 0.3597, "step": 1119 }, { "epoch": 2.4190064794816415, "grad_norm": 0.11624909937381744, "learning_rate": 1.0998129911688766e-06, "loss": 0.3645, "step": 1120 }, { "epoch": 2.4211663066954645, "grad_norm": 0.11193333566188812, "learning_rate": 1.0919621258737384e-06, "loss": 0.3679, "step": 1121 }, { "epoch": 2.423326133909287, "grad_norm": 0.10702624171972275, "learning_rate": 1.0841359458697986e-06, "loss": 0.3675, "step": 1122 }, { "epoch": 2.42548596112311, "grad_norm": 0.11081477999687195, "learning_rate": 1.0763345005914649e-06, "loss": 0.3733, "step": 1123 }, { "epoch": 2.427645788336933, "grad_norm": 0.11152873933315277, "learning_rate": 1.0685578393169054e-06, "loss": 0.3634, "step": 1124 }, { "epoch": 2.429805615550756, "grad_norm": 0.11278684437274933, "learning_rate": 1.0608060111677409e-06, "loss": 0.3646, "step": 1125 }, { "epoch": 2.4319654427645787, "grad_norm": 0.10329707711935043, "learning_rate": 1.053079065108728e-06, "loss": 0.3616, "step": 1126 }, { "epoch": 2.4341252699784017, "grad_norm": 0.11579885333776474, "learning_rate": 1.0453770499474585e-06, "loss": 0.3642, "step": 1127 }, { "epoch": 2.4362850971922247, "grad_norm": 0.11287212371826172, "learning_rate": 1.037700014334047e-06, "loss": 0.3588, "step": 1128 }, { "epoch": 2.4384449244060473, "grad_norm": 0.10435645282268524, "learning_rate": 1.0300480067608232e-06, "loss": 0.3621, "step": 1129 }, { "epoch": 2.4406047516198703, "grad_norm": 0.1117047443985939, "learning_rate": 1.0224210755620257e-06, "loss": 0.3665, "step": 1130 }, { "epoch": 2.4427645788336934, "grad_norm": 0.11821126937866211, "learning_rate": 1.014819268913495e-06, "loss": 0.3659, "step": 1131 }, { "epoch": 2.4449244060475164, "grad_norm": 0.11257217824459076, "learning_rate": 1.0072426348323754e-06, "loss": 0.3629, "step": 1132 }, { "epoch": 2.447084233261339, "grad_norm": 0.10960426181554794, "learning_rate": 9.99691221176805e-07, "loss": 0.3702, "step": 1133 }, { "epoch": 2.449244060475162, "grad_norm": 0.11274091899394989, "learning_rate": 9.921650756456164e-07, "loss": 0.3552, "step": 1134 }, { "epoch": 2.451403887688985, "grad_norm": 0.11033818125724792, "learning_rate": 9.84664245778037e-07, "loss": 0.3622, "step": 1135 }, { "epoch": 2.453563714902808, "grad_norm": 0.11202115565538406, "learning_rate": 9.771887789533818e-07, "loss": 0.3641, "step": 1136 }, { "epoch": 2.4557235421166306, "grad_norm": 0.10436037182807922, "learning_rate": 9.69738722390765e-07, "loss": 0.3722, "step": 1137 }, { "epoch": 2.4578833693304536, "grad_norm": 0.11260079592466354, "learning_rate": 9.623141231487904e-07, "loss": 0.3664, "step": 1138 }, { "epoch": 2.4600431965442766, "grad_norm": 0.10981511324644089, "learning_rate": 9.549150281252633e-07, "loss": 0.3729, "step": 1139 }, { "epoch": 2.462203023758099, "grad_norm": 0.11340730637311935, "learning_rate": 9.475414840568903e-07, "loss": 0.3614, "step": 1140 }, { "epoch": 2.464362850971922, "grad_norm": 0.10501902550458908, "learning_rate": 9.401935375189802e-07, "loss": 0.3601, "step": 1141 }, { "epoch": 2.466522678185745, "grad_norm": 0.11369086056947708, "learning_rate": 9.32871234925159e-07, "loss": 0.3669, "step": 1142 }, { "epoch": 2.468682505399568, "grad_norm": 0.10842647403478622, "learning_rate": 9.255746225270689e-07, "loss": 0.3582, "step": 1143 }, { "epoch": 2.470842332613391, "grad_norm": 0.1089843288064003, "learning_rate": 9.183037464140804e-07, "loss": 0.3532, "step": 1144 }, { "epoch": 2.473002159827214, "grad_norm": 0.1023058295249939, "learning_rate": 9.110586525129988e-07, "loss": 0.3473, "step": 1145 }, { "epoch": 2.475161987041037, "grad_norm": 0.1110844761133194, "learning_rate": 9.038393865877725e-07, "loss": 0.3629, "step": 1146 }, { "epoch": 2.4773218142548594, "grad_norm": 0.10468819737434387, "learning_rate": 8.966459942392108e-07, "loss": 0.3631, "step": 1147 }, { "epoch": 2.4794816414686824, "grad_norm": 0.11002985388040543, "learning_rate": 8.894785209046886e-07, "loss": 0.3584, "step": 1148 }, { "epoch": 2.4816414686825055, "grad_norm": 0.10573374480009079, "learning_rate": 8.823370118578628e-07, "loss": 0.3681, "step": 1149 }, { "epoch": 2.4838012958963285, "grad_norm": 0.11796517670154572, "learning_rate": 8.752215122083874e-07, "loss": 0.3617, "step": 1150 }, { "epoch": 2.485961123110151, "grad_norm": 0.1184302419424057, "learning_rate": 8.68132066901623e-07, "loss": 0.3672, "step": 1151 }, { "epoch": 2.488120950323974, "grad_norm": 0.13177676498889923, "learning_rate": 8.610687207183604e-07, "loss": 0.3573, "step": 1152 }, { "epoch": 2.490280777537797, "grad_norm": 0.11671025305986404, "learning_rate": 8.540315182745329e-07, "loss": 0.3569, "step": 1153 }, { "epoch": 2.4924406047516197, "grad_norm": 0.10741881281137466, "learning_rate": 8.470205040209362e-07, "loss": 0.3558, "step": 1154 }, { "epoch": 2.4946004319654427, "grad_norm": 0.12825675308704376, "learning_rate": 8.400357222429473e-07, "loss": 0.3575, "step": 1155 }, { "epoch": 2.4967602591792657, "grad_norm": 0.10776403546333313, "learning_rate": 8.330772170602424e-07, "loss": 0.3589, "step": 1156 }, { "epoch": 2.4989200863930887, "grad_norm": 0.11745335906744003, "learning_rate": 8.261450324265225e-07, "loss": 0.3617, "step": 1157 }, { "epoch": 2.5010799136069113, "grad_norm": 0.10803595185279846, "learning_rate": 8.192392121292336e-07, "loss": 0.3636, "step": 1158 }, { "epoch": 2.5032397408207343, "grad_norm": 0.11620043963193893, "learning_rate": 8.123597997892918e-07, "loss": 0.3688, "step": 1159 }, { "epoch": 2.5053995680345573, "grad_norm": 0.11771270632743835, "learning_rate": 8.055068388608011e-07, "loss": 0.3633, "step": 1160 }, { "epoch": 2.5075593952483803, "grad_norm": 0.11002473533153534, "learning_rate": 7.986803726307901e-07, "loss": 0.3649, "step": 1161 }, { "epoch": 2.509719222462203, "grad_norm": 0.11476074159145355, "learning_rate": 7.918804442189271e-07, "loss": 0.3482, "step": 1162 }, { "epoch": 2.511879049676026, "grad_norm": 0.10824240744113922, "learning_rate": 7.851070965772572e-07, "loss": 0.3502, "step": 1163 }, { "epoch": 2.514038876889849, "grad_norm": 0.11206220835447311, "learning_rate": 7.783603724899258e-07, "loss": 0.3668, "step": 1164 }, { "epoch": 2.5161987041036715, "grad_norm": 0.11207690834999084, "learning_rate": 7.716403145729073e-07, "loss": 0.3585, "step": 1165 }, { "epoch": 2.5183585313174945, "grad_norm": 0.10834087431430817, "learning_rate": 7.649469652737407e-07, "loss": 0.3557, "step": 1166 }, { "epoch": 2.5205183585313176, "grad_norm": 0.11165751516819, "learning_rate": 7.582803668712579e-07, "loss": 0.3654, "step": 1167 }, { "epoch": 2.52267818574514, "grad_norm": 0.10847879201173782, "learning_rate": 7.51640561475318e-07, "loss": 0.362, "step": 1168 }, { "epoch": 2.524838012958963, "grad_norm": 0.11347544938325882, "learning_rate": 7.450275910265415e-07, "loss": 0.3631, "step": 1169 }, { "epoch": 2.526997840172786, "grad_norm": 0.11547064036130905, "learning_rate": 7.384414972960419e-07, "loss": 0.3613, "step": 1170 }, { "epoch": 2.529157667386609, "grad_norm": 0.11166190356016159, "learning_rate": 7.318823218851668e-07, "loss": 0.3664, "step": 1171 }, { "epoch": 2.531317494600432, "grad_norm": 0.11519124358892441, "learning_rate": 7.253501062252338e-07, "loss": 0.3715, "step": 1172 }, { "epoch": 2.533477321814255, "grad_norm": 0.12818704545497894, "learning_rate": 7.188448915772673e-07, "loss": 0.3568, "step": 1173 }, { "epoch": 2.535637149028078, "grad_norm": 0.11333166062831879, "learning_rate": 7.123667190317396e-07, "loss": 0.366, "step": 1174 }, { "epoch": 2.537796976241901, "grad_norm": 0.11098440736532211, "learning_rate": 7.059156295083064e-07, "loss": 0.3651, "step": 1175 }, { "epoch": 2.5399568034557234, "grad_norm": 0.11005040258169174, "learning_rate": 6.994916637555571e-07, "loss": 0.3658, "step": 1176 }, { "epoch": 2.5421166306695464, "grad_norm": 0.10551054775714874, "learning_rate": 6.930948623507505e-07, "loss": 0.3654, "step": 1177 }, { "epoch": 2.5442764578833694, "grad_norm": 0.10704758763313293, "learning_rate": 6.86725265699561e-07, "loss": 0.3562, "step": 1178 }, { "epoch": 2.546436285097192, "grad_norm": 0.1092720702290535, "learning_rate": 6.803829140358237e-07, "loss": 0.3619, "step": 1179 }, { "epoch": 2.548596112311015, "grad_norm": 0.10640691220760345, "learning_rate": 6.74067847421277e-07, "loss": 0.3674, "step": 1180 }, { "epoch": 2.550755939524838, "grad_norm": 0.10517946630716324, "learning_rate": 6.677801057453143e-07, "loss": 0.3556, "step": 1181 }, { "epoch": 2.552915766738661, "grad_norm": 0.10489367693662643, "learning_rate": 6.615197287247299e-07, "loss": 0.3766, "step": 1182 }, { "epoch": 2.555075593952484, "grad_norm": 0.11467967927455902, "learning_rate": 6.552867559034687e-07, "loss": 0.3569, "step": 1183 }, { "epoch": 2.5572354211663066, "grad_norm": 0.11009713262319565, "learning_rate": 6.490812266523716e-07, "loss": 0.3654, "step": 1184 }, { "epoch": 2.5593952483801297, "grad_norm": 0.10729658603668213, "learning_rate": 6.429031801689362e-07, "loss": 0.3564, "step": 1185 }, { "epoch": 2.5615550755939527, "grad_norm": 0.1073872372508049, "learning_rate": 6.36752655477062e-07, "loss": 0.3606, "step": 1186 }, { "epoch": 2.5637149028077753, "grad_norm": 0.10580222308635712, "learning_rate": 6.30629691426804e-07, "loss": 0.371, "step": 1187 }, { "epoch": 2.5658747300215983, "grad_norm": 0.11771810799837112, "learning_rate": 6.245343266941328e-07, "loss": 0.3597, "step": 1188 }, { "epoch": 2.5680345572354213, "grad_norm": 0.11992885917425156, "learning_rate": 6.184665997806832e-07, "loss": 0.3559, "step": 1189 }, { "epoch": 2.570194384449244, "grad_norm": 0.11079053580760956, "learning_rate": 6.124265490135161e-07, "loss": 0.3635, "step": 1190 }, { "epoch": 2.572354211663067, "grad_norm": 0.10871004313230515, "learning_rate": 6.064142125448763e-07, "loss": 0.3625, "step": 1191 }, { "epoch": 2.57451403887689, "grad_norm": 0.11944089829921722, "learning_rate": 6.004296283519478e-07, "loss": 0.3531, "step": 1192 }, { "epoch": 2.5766738660907125, "grad_norm": 0.11835870891809464, "learning_rate": 5.944728342366179e-07, "loss": 0.3596, "step": 1193 }, { "epoch": 2.5788336933045355, "grad_norm": 0.10851329565048218, "learning_rate": 5.885438678252342e-07, "loss": 0.3692, "step": 1194 }, { "epoch": 2.5809935205183585, "grad_norm": 0.10725897550582886, "learning_rate": 5.826427665683715e-07, "loss": 0.3621, "step": 1195 }, { "epoch": 2.5831533477321815, "grad_norm": 0.10977955162525177, "learning_rate": 5.767695677405921e-07, "loss": 0.3536, "step": 1196 }, { "epoch": 2.5853131749460045, "grad_norm": 0.11643577367067337, "learning_rate": 5.709243084402128e-07, "loss": 0.3624, "step": 1197 }, { "epoch": 2.587473002159827, "grad_norm": 0.11957161873579025, "learning_rate": 5.651070255890689e-07, "loss": 0.3567, "step": 1198 }, { "epoch": 2.58963282937365, "grad_norm": 0.11547524482011795, "learning_rate": 5.593177559322776e-07, "loss": 0.3526, "step": 1199 }, { "epoch": 2.591792656587473, "grad_norm": 0.10810908675193787, "learning_rate": 5.535565360380146e-07, "loss": 0.3627, "step": 1200 }, { "epoch": 2.5939524838012957, "grad_norm": 0.10978656262159348, "learning_rate": 5.478234022972756e-07, "loss": 0.3689, "step": 1201 }, { "epoch": 2.5961123110151187, "grad_norm": 0.11710033565759659, "learning_rate": 5.421183909236494e-07, "loss": 0.354, "step": 1202 }, { "epoch": 2.5982721382289418, "grad_norm": 0.10731150209903717, "learning_rate": 5.364415379530891e-07, "loss": 0.3672, "step": 1203 }, { "epoch": 2.6004319654427643, "grad_norm": 0.10609705001115799, "learning_rate": 5.307928792436812e-07, "loss": 0.3541, "step": 1204 }, { "epoch": 2.6025917926565874, "grad_norm": 0.11076472699642181, "learning_rate": 5.251724504754258e-07, "loss": 0.3651, "step": 1205 }, { "epoch": 2.6047516198704104, "grad_norm": 0.11111797392368317, "learning_rate": 5.19580287150005e-07, "loss": 0.3557, "step": 1206 }, { "epoch": 2.6069114470842334, "grad_norm": 0.10651623457670212, "learning_rate": 5.140164245905633e-07, "loss": 0.3537, "step": 1207 }, { "epoch": 2.6090712742980564, "grad_norm": 0.11073900759220123, "learning_rate": 5.084808979414779e-07, "loss": 0.3623, "step": 1208 }, { "epoch": 2.611231101511879, "grad_norm": 0.11509796231985092, "learning_rate": 5.029737421681446e-07, "loss": 0.3669, "step": 1209 }, { "epoch": 2.613390928725702, "grad_norm": 0.11190790683031082, "learning_rate": 4.97494992056754e-07, "loss": 0.3662, "step": 1210 }, { "epoch": 2.615550755939525, "grad_norm": 0.11598829925060272, "learning_rate": 4.920446822140673e-07, "loss": 0.3617, "step": 1211 }, { "epoch": 2.6177105831533476, "grad_norm": 0.11533954739570618, "learning_rate": 4.866228470672041e-07, "loss": 0.3589, "step": 1212 }, { "epoch": 2.6198704103671706, "grad_norm": 0.10564534366130829, "learning_rate": 4.812295208634238e-07, "loss": 0.3626, "step": 1213 }, { "epoch": 2.6220302375809936, "grad_norm": 0.11170712113380432, "learning_rate": 4.758647376699033e-07, "loss": 0.3672, "step": 1214 }, { "epoch": 2.624190064794816, "grad_norm": 0.11519314348697662, "learning_rate": 4.705285313735297e-07, "loss": 0.3666, "step": 1215 }, { "epoch": 2.626349892008639, "grad_norm": 0.11605649441480637, "learning_rate": 4.6522093568068307e-07, "loss": 0.3484, "step": 1216 }, { "epoch": 2.6285097192224622, "grad_norm": 0.11404189467430115, "learning_rate": 4.599419841170216e-07, "loss": 0.3555, "step": 1217 }, { "epoch": 2.6306695464362853, "grad_norm": 0.11835578829050064, "learning_rate": 4.546917100272735e-07, "loss": 0.3552, "step": 1218 }, { "epoch": 2.632829373650108, "grad_norm": 0.11513664573431015, "learning_rate": 4.494701465750217e-07, "loss": 0.3522, "step": 1219 }, { "epoch": 2.634989200863931, "grad_norm": 0.11740648001432419, "learning_rate": 4.4427732674250045e-07, "loss": 0.3625, "step": 1220 }, { "epoch": 2.637149028077754, "grad_norm": 0.12071909755468369, "learning_rate": 4.391132833303807e-07, "loss": 0.3684, "step": 1221 }, { "epoch": 2.639308855291577, "grad_norm": 0.1136975884437561, "learning_rate": 4.3397804895756957e-07, "loss": 0.3684, "step": 1222 }, { "epoch": 2.6414686825053995, "grad_norm": 0.11149821430444717, "learning_rate": 4.2887165606099513e-07, "loss": 0.3603, "step": 1223 }, { "epoch": 2.6436285097192225, "grad_norm": 0.12100395560264587, "learning_rate": 4.237941368954124e-07, "loss": 0.3624, "step": 1224 }, { "epoch": 2.6457883369330455, "grad_norm": 0.1222655400633812, "learning_rate": 4.1874552353319107e-07, "loss": 0.3526, "step": 1225 }, { "epoch": 2.647948164146868, "grad_norm": 0.11921314895153046, "learning_rate": 4.137258478641176e-07, "loss": 0.3647, "step": 1226 }, { "epoch": 2.650107991360691, "grad_norm": 0.11398887634277344, "learning_rate": 4.087351415951918e-07, "loss": 0.3593, "step": 1227 }, { "epoch": 2.652267818574514, "grad_norm": 0.11155658215284348, "learning_rate": 4.0377343625042587e-07, "loss": 0.37, "step": 1228 }, { "epoch": 2.6544276457883367, "grad_norm": 0.1191490963101387, "learning_rate": 3.9884076317064813e-07, "loss": 0.3588, "step": 1229 }, { "epoch": 2.6565874730021597, "grad_norm": 0.12826910614967346, "learning_rate": 3.9393715351330243e-07, "loss": 0.3566, "step": 1230 }, { "epoch": 2.6587473002159827, "grad_norm": 0.11224586516618729, "learning_rate": 3.890626382522539e-07, "loss": 0.3604, "step": 1231 }, { "epoch": 2.6609071274298057, "grad_norm": 0.11304951459169388, "learning_rate": 3.8421724817758745e-07, "loss": 0.3719, "step": 1232 }, { "epoch": 2.6630669546436287, "grad_norm": 0.10955885052680969, "learning_rate": 3.794010138954213e-07, "loss": 0.3611, "step": 1233 }, { "epoch": 2.6652267818574513, "grad_norm": 0.11885318905115128, "learning_rate": 3.7461396582771035e-07, "loss": 0.3732, "step": 1234 }, { "epoch": 2.6673866090712743, "grad_norm": 0.11816181242465973, "learning_rate": 3.698561342120499e-07, "loss": 0.3577, "step": 1235 }, { "epoch": 2.6695464362850974, "grad_norm": 0.11143229156732559, "learning_rate": 3.651275491014905e-07, "loss": 0.3561, "step": 1236 }, { "epoch": 2.67170626349892, "grad_norm": 0.113620825111866, "learning_rate": 3.604282403643472e-07, "loss": 0.3659, "step": 1237 }, { "epoch": 2.673866090712743, "grad_norm": 0.11192460358142853, "learning_rate": 3.557582376840063e-07, "loss": 0.3627, "step": 1238 }, { "epoch": 2.676025917926566, "grad_norm": 0.11559736728668213, "learning_rate": 3.511175705587433e-07, "loss": 0.3632, "step": 1239 }, { "epoch": 2.6781857451403885, "grad_norm": 0.11298345029354095, "learning_rate": 3.465062683015341e-07, "loss": 0.3617, "step": 1240 }, { "epoch": 2.6803455723542116, "grad_norm": 0.1136719286441803, "learning_rate": 3.419243600398703e-07, "loss": 0.3534, "step": 1241 }, { "epoch": 2.6825053995680346, "grad_norm": 0.11135457456111908, "learning_rate": 3.373718747155752e-07, "loss": 0.3723, "step": 1242 }, { "epoch": 2.6846652267818576, "grad_norm": 0.10721197724342346, "learning_rate": 3.328488410846187e-07, "loss": 0.3551, "step": 1243 }, { "epoch": 2.6868250539956806, "grad_norm": 0.11308667808771133, "learning_rate": 3.283552877169399e-07, "loss": 0.3667, "step": 1244 }, { "epoch": 2.688984881209503, "grad_norm": 0.10848429799079895, "learning_rate": 3.2389124299626483e-07, "loss": 0.3643, "step": 1245 }, { "epoch": 2.691144708423326, "grad_norm": 0.11723221838474274, "learning_rate": 3.194567351199257e-07, "loss": 0.3717, "step": 1246 }, { "epoch": 2.693304535637149, "grad_norm": 0.12472040206193924, "learning_rate": 3.150517920986851e-07, "loss": 0.3608, "step": 1247 }, { "epoch": 2.695464362850972, "grad_norm": 0.11016938090324402, "learning_rate": 3.106764417565561e-07, "loss": 0.3588, "step": 1248 }, { "epoch": 2.697624190064795, "grad_norm": 0.11815854161977768, "learning_rate": 3.0633071173062966e-07, "loss": 0.3617, "step": 1249 }, { "epoch": 2.699784017278618, "grad_norm": 0.1177084818482399, "learning_rate": 3.0201462947089865e-07, "loss": 0.3576, "step": 1250 }, { "epoch": 2.7019438444924404, "grad_norm": 0.11179111897945404, "learning_rate": 2.9772822224008515e-07, "loss": 0.3667, "step": 1251 }, { "epoch": 2.7041036717062634, "grad_norm": 0.11454194784164429, "learning_rate": 2.9347151711346556e-07, "loss": 0.3707, "step": 1252 }, { "epoch": 2.7062634989200864, "grad_norm": 0.10757472366094589, "learning_rate": 2.892445409787037e-07, "loss": 0.3628, "step": 1253 }, { "epoch": 2.708423326133909, "grad_norm": 0.11914849281311035, "learning_rate": 2.850473205356774e-07, "loss": 0.3468, "step": 1254 }, { "epoch": 2.710583153347732, "grad_norm": 0.1173713430762291, "learning_rate": 2.8087988229631325e-07, "loss": 0.3668, "step": 1255 }, { "epoch": 2.712742980561555, "grad_norm": 0.11365855485200882, "learning_rate": 2.76742252584416e-07, "loss": 0.359, "step": 1256 }, { "epoch": 2.714902807775378, "grad_norm": 0.11546127498149872, "learning_rate": 2.7263445753550275e-07, "loss": 0.364, "step": 1257 }, { "epoch": 2.717062634989201, "grad_norm": 0.11186777800321579, "learning_rate": 2.685565230966408e-07, "loss": 0.3526, "step": 1258 }, { "epoch": 2.7192224622030237, "grad_norm": 0.10442403703927994, "learning_rate": 2.6450847502627883e-07, "loss": 0.3551, "step": 1259 }, { "epoch": 2.7213822894168467, "grad_norm": 0.12204797565937042, "learning_rate": 2.604903388940899e-07, "loss": 0.3587, "step": 1260 }, { "epoch": 2.7235421166306697, "grad_norm": 0.11084363609552383, "learning_rate": 2.5650214008080544e-07, "loss": 0.3679, "step": 1261 }, { "epoch": 2.7257019438444923, "grad_norm": 0.10979737341403961, "learning_rate": 2.525439037780558e-07, "loss": 0.3717, "step": 1262 }, { "epoch": 2.7278617710583153, "grad_norm": 0.11145438998937607, "learning_rate": 2.486156549882135e-07, "loss": 0.3613, "step": 1263 }, { "epoch": 2.7300215982721383, "grad_norm": 0.11015837639570236, "learning_rate": 2.447174185242324e-07, "loss": 0.3652, "step": 1264 }, { "epoch": 2.732181425485961, "grad_norm": 0.1096833273768425, "learning_rate": 2.40849219009493e-07, "loss": 0.3531, "step": 1265 }, { "epoch": 2.734341252699784, "grad_norm": 0.109636589884758, "learning_rate": 2.3701108087764657e-07, "loss": 0.3596, "step": 1266 }, { "epoch": 2.736501079913607, "grad_norm": 0.11428305506706238, "learning_rate": 2.3320302837245846e-07, "loss": 0.3659, "step": 1267 }, { "epoch": 2.73866090712743, "grad_norm": 0.11387787014245987, "learning_rate": 2.2942508554765764e-07, "loss": 0.3726, "step": 1268 }, { "epoch": 2.740820734341253, "grad_norm": 0.10690239071846008, "learning_rate": 2.2567727626678527e-07, "loss": 0.3651, "step": 1269 }, { "epoch": 2.7429805615550755, "grad_norm": 0.10845934599637985, "learning_rate": 2.2195962420304083e-07, "loss": 0.3608, "step": 1270 }, { "epoch": 2.7451403887688985, "grad_norm": 0.11751694232225418, "learning_rate": 2.1827215283913683e-07, "loss": 0.3659, "step": 1271 }, { "epoch": 2.7473002159827216, "grad_norm": 0.10652041435241699, "learning_rate": 2.1461488546714425e-07, "loss": 0.3634, "step": 1272 }, { "epoch": 2.749460043196544, "grad_norm": 0.10296986997127533, "learning_rate": 2.1098784518835292e-07, "loss": 0.3632, "step": 1273 }, { "epoch": 2.751619870410367, "grad_norm": 0.10827996581792831, "learning_rate": 2.0739105491312028e-07, "loss": 0.3624, "step": 1274 }, { "epoch": 2.75377969762419, "grad_norm": 0.11208463460206985, "learning_rate": 2.0382453736072838e-07, "loss": 0.3552, "step": 1275 }, { "epoch": 2.7559395248380127, "grad_norm": 0.11274047195911407, "learning_rate": 2.0028831505924162e-07, "loss": 0.3613, "step": 1276 }, { "epoch": 2.7580993520518358, "grad_norm": 0.10478544235229492, "learning_rate": 1.967824103453597e-07, "loss": 0.3592, "step": 1277 }, { "epoch": 2.760259179265659, "grad_norm": 0.10351528972387314, "learning_rate": 1.9330684536428335e-07, "loss": 0.3693, "step": 1278 }, { "epoch": 2.762419006479482, "grad_norm": 0.11334282159805298, "learning_rate": 1.8986164206957037e-07, "loss": 0.3615, "step": 1279 }, { "epoch": 2.7645788336933044, "grad_norm": 0.10871846228837967, "learning_rate": 1.8644682222299703e-07, "loss": 0.3644, "step": 1280 }, { "epoch": 2.7667386609071274, "grad_norm": 0.10826321691274643, "learning_rate": 1.8306240739442094e-07, "loss": 0.3599, "step": 1281 }, { "epoch": 2.7688984881209504, "grad_norm": 0.1105961948633194, "learning_rate": 1.7970841896164658e-07, "loss": 0.3652, "step": 1282 }, { "epoch": 2.7710583153347734, "grad_norm": 0.10997821390628815, "learning_rate": 1.7638487811028616e-07, "loss": 0.3675, "step": 1283 }, { "epoch": 2.773218142548596, "grad_norm": 0.1074373796582222, "learning_rate": 1.7309180583363062e-07, "loss": 0.3542, "step": 1284 }, { "epoch": 2.775377969762419, "grad_norm": 0.10459216684103012, "learning_rate": 1.6982922293251548e-07, "loss": 0.3538, "step": 1285 }, { "epoch": 2.777537796976242, "grad_norm": 0.10451044887304306, "learning_rate": 1.6659715001518583e-07, "loss": 0.367, "step": 1286 }, { "epoch": 2.7796976241900646, "grad_norm": 0.10947411507368088, "learning_rate": 1.6339560749717154e-07, "loss": 0.3515, "step": 1287 }, { "epoch": 2.7818574514038876, "grad_norm": 0.11110340058803558, "learning_rate": 1.6022461560115498e-07, "loss": 0.3603, "step": 1288 }, { "epoch": 2.7840172786177106, "grad_norm": 0.10515395551919937, "learning_rate": 1.5708419435684463e-07, "loss": 0.3547, "step": 1289 }, { "epoch": 2.786177105831533, "grad_norm": 0.10683929920196533, "learning_rate": 1.5397436360084784e-07, "loss": 0.3617, "step": 1290 }, { "epoch": 2.7883369330453562, "grad_norm": 0.10624652355909348, "learning_rate": 1.5089514297654594e-07, "loss": 0.3553, "step": 1291 }, { "epoch": 2.7904967602591793, "grad_norm": 0.11002147197723389, "learning_rate": 1.4784655193396947e-07, "loss": 0.3557, "step": 1292 }, { "epoch": 2.7926565874730023, "grad_norm": 0.1125330999493599, "learning_rate": 1.448286097296764e-07, "loss": 0.3544, "step": 1293 }, { "epoch": 2.7948164146868253, "grad_norm": 0.11160624772310257, "learning_rate": 1.4184133542663014e-07, "loss": 0.3694, "step": 1294 }, { "epoch": 2.796976241900648, "grad_norm": 0.10507107526063919, "learning_rate": 1.388847478940797e-07, "loss": 0.3713, "step": 1295 }, { "epoch": 2.799136069114471, "grad_norm": 0.107913538813591, "learning_rate": 1.3595886580743677e-07, "loss": 0.3698, "step": 1296 }, { "epoch": 2.801295896328294, "grad_norm": 0.11146403104066849, "learning_rate": 1.330637076481639e-07, "loss": 0.36, "step": 1297 }, { "epoch": 2.8034557235421165, "grad_norm": 0.10874520242214203, "learning_rate": 1.3019929170365376e-07, "loss": 0.3639, "step": 1298 }, { "epoch": 2.8056155507559395, "grad_norm": 0.11767850816249847, "learning_rate": 1.2736563606711384e-07, "loss": 0.3618, "step": 1299 }, { "epoch": 2.8077753779697625, "grad_norm": 0.10746905952692032, "learning_rate": 1.2456275863745426e-07, "loss": 0.3624, "step": 1300 }, { "epoch": 2.809935205183585, "grad_norm": 0.10965242981910706, "learning_rate": 1.2179067711917015e-07, "loss": 0.3732, "step": 1301 }, { "epoch": 2.812095032397408, "grad_norm": 0.10720682889223099, "learning_rate": 1.1904940902223661e-07, "loss": 0.3661, "step": 1302 }, { "epoch": 2.814254859611231, "grad_norm": 0.11190472543239594, "learning_rate": 1.1633897166199227e-07, "loss": 0.3572, "step": 1303 }, { "epoch": 2.816414686825054, "grad_norm": 0.10630635917186737, "learning_rate": 1.136593821590326e-07, "loss": 0.3587, "step": 1304 }, { "epoch": 2.818574514038877, "grad_norm": 0.10910697281360626, "learning_rate": 1.1101065743910122e-07, "loss": 0.3666, "step": 1305 }, { "epoch": 2.8207343412526997, "grad_norm": 0.11752592027187347, "learning_rate": 1.0839281423298375e-07, "loss": 0.3638, "step": 1306 }, { "epoch": 2.8228941684665227, "grad_norm": 0.11391156911849976, "learning_rate": 1.0580586907639912e-07, "loss": 0.3605, "step": 1307 }, { "epoch": 2.8250539956803458, "grad_norm": 0.11459757387638092, "learning_rate": 1.032498383099001e-07, "loss": 0.365, "step": 1308 }, { "epoch": 2.8272138228941683, "grad_norm": 0.10249683260917664, "learning_rate": 1.007247380787657e-07, "loss": 0.3609, "step": 1309 }, { "epoch": 2.8293736501079914, "grad_norm": 0.11776190996170044, "learning_rate": 9.823058433290178e-08, "loss": 0.3667, "step": 1310 }, { "epoch": 2.8315334773218144, "grad_norm": 0.10746931284666061, "learning_rate": 9.576739282673886e-08, "loss": 0.3598, "step": 1311 }, { "epoch": 2.833693304535637, "grad_norm": 0.1106642559170723, "learning_rate": 9.333517911913281e-08, "loss": 0.3627, "step": 1312 }, { "epoch": 2.83585313174946, "grad_norm": 0.1114298552274704, "learning_rate": 9.093395857326714e-08, "loss": 0.3521, "step": 1313 }, { "epoch": 2.838012958963283, "grad_norm": 0.11040709167718887, "learning_rate": 8.856374635655696e-08, "loss": 0.3618, "step": 1314 }, { "epoch": 2.8401727861771056, "grad_norm": 0.10548478364944458, "learning_rate": 8.622455744054958e-08, "loss": 0.3574, "step": 1315 }, { "epoch": 2.8423326133909286, "grad_norm": 0.1121056005358696, "learning_rate": 8.391640660083411e-08, "loss": 0.3693, "step": 1316 }, { "epoch": 2.8444924406047516, "grad_norm": 0.11348962038755417, "learning_rate": 8.163930841694589e-08, "loss": 0.3569, "step": 1317 }, { "epoch": 2.8466522678185746, "grad_norm": 0.10726695507764816, "learning_rate": 7.939327727227441e-08, "loss": 0.3667, "step": 1318 }, { "epoch": 2.8488120950323976, "grad_norm": 0.10446982830762863, "learning_rate": 7.717832735397335e-08, "loss": 0.3685, "step": 1319 }, { "epoch": 2.85097192224622, "grad_norm": 0.11472396552562714, "learning_rate": 7.499447265286952e-08, "loss": 0.364, "step": 1320 }, { "epoch": 2.853131749460043, "grad_norm": 0.10750308632850647, "learning_rate": 7.284172696337688e-08, "loss": 0.3626, "step": 1321 }, { "epoch": 2.8552915766738662, "grad_norm": 0.11191460490226746, "learning_rate": 7.072010388340656e-08, "loss": 0.3623, "step": 1322 }, { "epoch": 2.857451403887689, "grad_norm": 0.10993245989084244, "learning_rate": 6.862961681428304e-08, "loss": 0.3549, "step": 1323 }, { "epoch": 2.859611231101512, "grad_norm": 0.11314646899700165, "learning_rate": 6.657027896065982e-08, "loss": 0.3542, "step": 1324 }, { "epoch": 2.861771058315335, "grad_norm": 0.1285964399576187, "learning_rate": 6.454210333043275e-08, "loss": 0.3572, "step": 1325 }, { "epoch": 2.8639308855291574, "grad_norm": 0.10818547010421753, "learning_rate": 6.254510273466186e-08, "loss": 0.3676, "step": 1326 }, { "epoch": 2.8660907127429804, "grad_norm": 0.10412049293518066, "learning_rate": 6.057928978748906e-08, "loss": 0.3685, "step": 1327 }, { "epoch": 2.8682505399568035, "grad_norm": 0.10978944599628448, "learning_rate": 5.864467690605613e-08, "loss": 0.3671, "step": 1328 }, { "epoch": 2.8704103671706265, "grad_norm": 0.1174926683306694, "learning_rate": 5.674127631043025e-08, "loss": 0.3658, "step": 1329 }, { "epoch": 2.8725701943844495, "grad_norm": 0.11143560707569122, "learning_rate": 5.4869100023523526e-08, "loss": 0.3624, "step": 1330 }, { "epoch": 2.874730021598272, "grad_norm": 0.10805241763591766, "learning_rate": 5.302815987101917e-08, "loss": 0.3636, "step": 1331 }, { "epoch": 2.876889848812095, "grad_norm": 0.11456768959760666, "learning_rate": 5.121846748129544e-08, "loss": 0.3537, "step": 1332 }, { "epoch": 2.879049676025918, "grad_norm": 0.1143973246216774, "learning_rate": 4.944003428535349e-08, "loss": 0.361, "step": 1333 }, { "epoch": 2.8812095032397407, "grad_norm": 0.11492909491062164, "learning_rate": 4.769287151674407e-08, "loss": 0.3529, "step": 1334 }, { "epoch": 2.8833693304535637, "grad_norm": 0.11312732100486755, "learning_rate": 4.597699021149649e-08, "loss": 0.3604, "step": 1335 }, { "epoch": 2.8855291576673867, "grad_norm": 0.11396172642707825, "learning_rate": 4.429240120804923e-08, "loss": 0.3601, "step": 1336 }, { "epoch": 2.8876889848812093, "grad_norm": 0.10564181953668594, "learning_rate": 4.263911514718222e-08, "loss": 0.365, "step": 1337 }, { "epoch": 2.8898488120950323, "grad_norm": 0.11512638628482819, "learning_rate": 4.10171424719491e-08, "loss": 0.3658, "step": 1338 }, { "epoch": 2.8920086393088553, "grad_norm": 0.11602869629859924, "learning_rate": 3.9426493427611177e-08, "loss": 0.3611, "step": 1339 }, { "epoch": 2.8941684665226783, "grad_norm": 0.11228124052286148, "learning_rate": 3.786717806157136e-08, "loss": 0.3615, "step": 1340 }, { "epoch": 2.896328293736501, "grad_norm": 0.1036510244011879, "learning_rate": 3.633920622331311e-08, "loss": 0.3621, "step": 1341 }, { "epoch": 2.898488120950324, "grad_norm": 0.11727307736873627, "learning_rate": 3.4842587564337674e-08, "loss": 0.3569, "step": 1342 }, { "epoch": 2.900647948164147, "grad_norm": 0.10487374663352966, "learning_rate": 3.337733153810141e-08, "loss": 0.362, "step": 1343 }, { "epoch": 2.90280777537797, "grad_norm": 0.10877780616283417, "learning_rate": 3.194344739995803e-08, "loss": 0.349, "step": 1344 }, { "epoch": 2.9049676025917925, "grad_norm": 0.11223684251308441, "learning_rate": 3.054094420709863e-08, "loss": 0.365, "step": 1345 }, { "epoch": 2.9071274298056156, "grad_norm": 0.103155717253685, "learning_rate": 2.9169830818496226e-08, "loss": 0.3592, "step": 1346 }, { "epoch": 2.9092872570194386, "grad_norm": 0.11470197141170502, "learning_rate": 2.783011589484741e-08, "loss": 0.3578, "step": 1347 }, { "epoch": 2.911447084233261, "grad_norm": 0.12167331576347351, "learning_rate": 2.6521807898520214e-08, "loss": 0.353, "step": 1348 }, { "epoch": 2.913606911447084, "grad_norm": 0.1081666648387909, "learning_rate": 2.5244915093499134e-08, "loss": 0.3703, "step": 1349 }, { "epoch": 2.915766738660907, "grad_norm": 0.11052247881889343, "learning_rate": 2.3999445545332955e-08, "loss": 0.3593, "step": 1350 }, { "epoch": 2.9179265658747298, "grad_norm": 0.10058227181434631, "learning_rate": 2.2785407121084236e-08, "loss": 0.371, "step": 1351 }, { "epoch": 2.920086393088553, "grad_norm": 0.11320126056671143, "learning_rate": 2.1602807489279344e-08, "loss": 0.3549, "step": 1352 }, { "epoch": 2.922246220302376, "grad_norm": 0.11561363190412521, "learning_rate": 2.0451654119860164e-08, "loss": 0.3578, "step": 1353 }, { "epoch": 2.924406047516199, "grad_norm": 0.10961954295635223, "learning_rate": 1.9331954284137476e-08, "loss": 0.3676, "step": 1354 }, { "epoch": 2.926565874730022, "grad_norm": 0.10924555361270905, "learning_rate": 1.8243715054744315e-08, "loss": 0.3726, "step": 1355 }, { "epoch": 2.9287257019438444, "grad_norm": 0.10880248993635178, "learning_rate": 1.71869433055899e-08, "loss": 0.3547, "step": 1356 }, { "epoch": 2.9308855291576674, "grad_norm": 0.10798300057649612, "learning_rate": 1.6161645711819664e-08, "loss": 0.3569, "step": 1357 }, { "epoch": 2.9330453563714904, "grad_norm": 0.11236506700515747, "learning_rate": 1.5167828749770853e-08, "loss": 0.3681, "step": 1358 }, { "epoch": 2.935205183585313, "grad_norm": 0.10835976153612137, "learning_rate": 1.4205498696930332e-08, "loss": 0.3613, "step": 1359 }, { "epoch": 2.937365010799136, "grad_norm": 0.12566576898097992, "learning_rate": 1.3274661631899055e-08, "loss": 0.3637, "step": 1360 }, { "epoch": 2.939524838012959, "grad_norm": 0.10957465320825577, "learning_rate": 1.2375323434348773e-08, "loss": 0.3504, "step": 1361 }, { "epoch": 2.9416846652267816, "grad_norm": 0.10701923072338104, "learning_rate": 1.1507489784989278e-08, "loss": 0.3607, "step": 1362 }, { "epoch": 2.9438444924406046, "grad_norm": 0.10923349112272263, "learning_rate": 1.067116616552899e-08, "loss": 0.3706, "step": 1363 }, { "epoch": 2.9460043196544277, "grad_norm": 0.11076433211565018, "learning_rate": 9.866357858642206e-09, "loss": 0.3746, "step": 1364 }, { "epoch": 2.9481641468682507, "grad_norm": 0.10817807167768478, "learning_rate": 9.09306994793635e-09, "loss": 0.3648, "step": 1365 }, { "epoch": 2.9503239740820737, "grad_norm": 0.10961637645959854, "learning_rate": 8.351307317917002e-09, "loss": 0.3571, "step": 1366 }, { "epoch": 2.9524838012958963, "grad_norm": 0.10122332721948624, "learning_rate": 7.641074653961244e-09, "loss": 0.3681, "step": 1367 }, { "epoch": 2.9546436285097193, "grad_norm": 0.10331834852695465, "learning_rate": 6.962376442284368e-09, "loss": 0.3566, "step": 1368 }, { "epoch": 2.9568034557235423, "grad_norm": 0.10392733663320541, "learning_rate": 6.315216969912663e-09, "loss": 0.3422, "step": 1369 }, { "epoch": 2.958963282937365, "grad_norm": 0.1083427146077156, "learning_rate": 5.699600324657328e-09, "loss": 0.3711, "step": 1370 }, { "epoch": 2.961123110151188, "grad_norm": 0.1106184870004654, "learning_rate": 5.115530395087276e-09, "loss": 0.3639, "step": 1371 }, { "epoch": 2.963282937365011, "grad_norm": 0.10812865942716599, "learning_rate": 4.5630108705063684e-09, "loss": 0.3647, "step": 1372 }, { "epoch": 2.9654427645788335, "grad_norm": 0.11043433845043182, "learning_rate": 4.042045240927883e-09, "loss": 0.3706, "step": 1373 }, { "epoch": 2.9676025917926565, "grad_norm": 0.1146334782242775, "learning_rate": 3.5526367970539765e-09, "loss": 0.3564, "step": 1374 }, { "epoch": 2.9697624190064795, "grad_norm": 0.11209335923194885, "learning_rate": 3.094788630254031e-09, "loss": 0.369, "step": 1375 }, { "epoch": 2.971922246220302, "grad_norm": 0.10334824025630951, "learning_rate": 2.6685036325457826e-09, "loss": 0.3614, "step": 1376 }, { "epoch": 2.974082073434125, "grad_norm": 0.12000252306461334, "learning_rate": 2.2737844965775578e-09, "loss": 0.3677, "step": 1377 }, { "epoch": 2.976241900647948, "grad_norm": 0.10969026386737823, "learning_rate": 1.9106337156099553e-09, "loss": 0.3506, "step": 1378 }, { "epoch": 2.978401727861771, "grad_norm": 0.11796250939369202, "learning_rate": 1.5790535835003006e-09, "loss": 0.3698, "step": 1379 }, { "epoch": 2.980561555075594, "grad_norm": 0.11176804453134537, "learning_rate": 1.2790461946887712e-09, "loss": 0.3574, "step": 1380 }, { "epoch": 2.9827213822894167, "grad_norm": 0.10805931687355042, "learning_rate": 1.0106134441850712e-09, "loss": 0.3732, "step": 1381 }, { "epoch": 2.9848812095032398, "grad_norm": 0.11747419834136963, "learning_rate": 7.737570275573314e-10, "loss": 0.3544, "step": 1382 }, { "epoch": 2.987041036717063, "grad_norm": 0.11195072531700134, "learning_rate": 5.684784409182298e-10, "loss": 0.3743, "step": 1383 }, { "epoch": 2.9892008639308854, "grad_norm": 0.11737102270126343, "learning_rate": 3.9477898091944135e-10, "loss": 0.3672, "step": 1384 }, { "epoch": 2.9913606911447084, "grad_norm": 0.1034155786037445, "learning_rate": 2.5265974474109054e-10, "loss": 0.3586, "step": 1385 }, { "epoch": 2.9935205183585314, "grad_norm": 0.11616694182157516, "learning_rate": 1.4212163008509028e-10, "loss": 0.36, "step": 1386 }, { "epoch": 2.995680345572354, "grad_norm": 0.11564143747091293, "learning_rate": 6.316533517125578e-11, "loss": 0.3624, "step": 1387 }, { "epoch": 2.997840172786177, "grad_norm": 0.11294636130332947, "learning_rate": 1.57913587295333e-11, "loss": 0.3607, "step": 1388 }, { "epoch": 3.0, "grad_norm": 0.10668095201253891, "learning_rate": 0.0, "loss": 0.3576, "step": 1389 }, { "epoch": 3.0, "step": 1389, "total_flos": 2.853679693771571e+16, "train_loss": 0.023199697249737295, "train_runtime": 5911.0235, "train_samples_per_second": 90.117, "train_steps_per_second": 0.235 } ], "logging_steps": 1, "max_steps": 1389, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.853679693771571e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }