{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2431216824020422, "eval_steps": 375, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00016208112160136148, "grad_norm": 0.32745468616485596, "learning_rate": 2e-05, "loss": 3.9495, "step": 1 }, { "epoch": 0.00032416224320272295, "grad_norm": 0.3682727813720703, "learning_rate": 4e-05, "loss": 4.3546, "step": 2 }, { "epoch": 0.00048624336480408443, "grad_norm": 0.3981589674949646, "learning_rate": 6e-05, "loss": 4.6652, "step": 3 }, { "epoch": 0.0006483244864054459, "grad_norm": 0.3928574323654175, "learning_rate": 8e-05, "loss": 4.4432, "step": 4 }, { "epoch": 0.0008104056080068074, "grad_norm": 0.4036743640899658, "learning_rate": 0.0001, "loss": 4.3653, "step": 5 }, { "epoch": 0.0009724867296081689, "grad_norm": 0.48630794882774353, "learning_rate": 0.00012, "loss": 4.258, "step": 6 }, { "epoch": 0.0011345678512095303, "grad_norm": 0.5550399422645569, "learning_rate": 0.00014, "loss": 4.3999, "step": 7 }, { "epoch": 0.0012966489728108918, "grad_norm": 0.6449398994445801, "learning_rate": 0.00016, "loss": 4.4376, "step": 8 }, { "epoch": 0.0014587300944122533, "grad_norm": 0.6104982495307922, "learning_rate": 0.00018, "loss": 4.3146, "step": 9 }, { "epoch": 0.0016208112160136148, "grad_norm": 0.509041965007782, "learning_rate": 0.0002, "loss": 3.9339, "step": 10 }, { "epoch": 0.0017828923376149762, "grad_norm": 0.6153712272644043, "learning_rate": 0.00019999977772170748, "loss": 3.9375, "step": 11 }, { "epoch": 0.0019449734592163377, "grad_norm": 0.8663574457168579, "learning_rate": 0.00019999911088781805, "loss": 3.9669, "step": 12 }, { "epoch": 0.002107054580817699, "grad_norm": 1.0472509860992432, "learning_rate": 0.0001999979995012962, "loss": 3.9359, "step": 13 }, { "epoch": 0.0022691357024190607, "grad_norm": 1.0902113914489746, "learning_rate": 0.00019999644356708261, "loss": 3.7759, "step": 14 }, { "epoch": 0.002431216824020422, "grad_norm": 0.8225506544113159, "learning_rate": 0.00019999444309209432, "loss": 3.8277, "step": 15 }, { "epoch": 0.0025932979456217836, "grad_norm": 0.6225056052207947, "learning_rate": 0.0001999919980852246, "loss": 3.7358, "step": 16 }, { "epoch": 0.002755379067223145, "grad_norm": 0.7404794692993164, "learning_rate": 0.00019998910855734288, "loss": 3.8308, "step": 17 }, { "epoch": 0.0029174601888245066, "grad_norm": 1.1214481592178345, "learning_rate": 0.0001999857745212947, "loss": 3.8571, "step": 18 }, { "epoch": 0.003079541310425868, "grad_norm": 1.0956681966781616, "learning_rate": 0.00019998199599190178, "loss": 3.6167, "step": 19 }, { "epoch": 0.0032416224320272295, "grad_norm": 1.0617244243621826, "learning_rate": 0.0001999777729859618, "loss": 3.6155, "step": 20 }, { "epoch": 0.003403703553628591, "grad_norm": 0.872593104839325, "learning_rate": 0.00019997310552224846, "loss": 3.5173, "step": 21 }, { "epoch": 0.0035657846752299525, "grad_norm": 0.6966594457626343, "learning_rate": 0.00019996799362151122, "loss": 3.4923, "step": 22 }, { "epoch": 0.003727865796831314, "grad_norm": 0.8658596277236938, "learning_rate": 0.00019996243730647538, "loss": 3.5702, "step": 23 }, { "epoch": 0.0038899469184326754, "grad_norm": 0.9274637699127197, "learning_rate": 0.00019995643660184191, "loss": 3.2378, "step": 24 }, { "epoch": 0.004052028040034037, "grad_norm": 0.8372480273246765, "learning_rate": 0.00019994999153428737, "loss": 3.4719, "step": 25 }, { "epoch": 0.004214109161635398, "grad_norm": 0.8468143939971924, "learning_rate": 0.00019994310213246368, "loss": 3.4335, "step": 26 }, { "epoch": 0.00437619028323676, "grad_norm": 0.6658130884170532, "learning_rate": 0.00019993576842699816, "loss": 3.3862, "step": 27 }, { "epoch": 0.004538271404838121, "grad_norm": 0.9544421434402466, "learning_rate": 0.0001999279904504933, "loss": 3.4167, "step": 28 }, { "epoch": 0.004700352526439483, "grad_norm": 1.2153635025024414, "learning_rate": 0.00019991976823752653, "loss": 3.4523, "step": 29 }, { "epoch": 0.004862433648040844, "grad_norm": 1.0208470821380615, "learning_rate": 0.00019991110182465032, "loss": 3.2573, "step": 30 }, { "epoch": 0.005024514769642206, "grad_norm": 0.7645734548568726, "learning_rate": 0.00019990199125039174, "loss": 3.2797, "step": 31 }, { "epoch": 0.005186595891243567, "grad_norm": 0.7118749022483826, "learning_rate": 0.00019989243655525247, "loss": 3.2598, "step": 32 }, { "epoch": 0.005348677012844929, "grad_norm": 1.0173993110656738, "learning_rate": 0.00019988243778170853, "loss": 3.2678, "step": 33 }, { "epoch": 0.00551075813444629, "grad_norm": 1.3724397420883179, "learning_rate": 0.0001998719949742101, "loss": 3.2351, "step": 34 }, { "epoch": 0.005672839256047652, "grad_norm": 1.1269340515136719, "learning_rate": 0.0001998611081791814, "loss": 3.2311, "step": 35 }, { "epoch": 0.005834920377649013, "grad_norm": 0.7900427579879761, "learning_rate": 0.00019984977744502038, "loss": 3.1872, "step": 36 }, { "epoch": 0.005997001499250375, "grad_norm": 0.996651828289032, "learning_rate": 0.00019983800282209857, "loss": 3.272, "step": 37 }, { "epoch": 0.006159082620851736, "grad_norm": 1.2565242052078247, "learning_rate": 0.00019982578436276082, "loss": 3.2472, "step": 38 }, { "epoch": 0.006321163742453098, "grad_norm": 1.2510484457015991, "learning_rate": 0.00019981312212132512, "loss": 3.4406, "step": 39 }, { "epoch": 0.006483244864054459, "grad_norm": 0.872576117515564, "learning_rate": 0.00019980001615408228, "loss": 3.2457, "step": 40 }, { "epoch": 0.0066453259856558205, "grad_norm": 1.4087495803833008, "learning_rate": 0.00019978646651929572, "loss": 3.6152, "step": 41 }, { "epoch": 0.006807407107257182, "grad_norm": 1.4746192693710327, "learning_rate": 0.00019977247327720128, "loss": 3.4374, "step": 42 }, { "epoch": 0.0069694882288585435, "grad_norm": 1.278622031211853, "learning_rate": 0.0001997580364900068, "loss": 3.3347, "step": 43 }, { "epoch": 0.007131569350459905, "grad_norm": 1.1743550300598145, "learning_rate": 0.000199743156221892, "loss": 3.4706, "step": 44 }, { "epoch": 0.007293650472061266, "grad_norm": 1.4559597969055176, "learning_rate": 0.00019972783253900808, "loss": 3.3314, "step": 45 }, { "epoch": 0.007455731593662628, "grad_norm": 1.3842203617095947, "learning_rate": 0.00019971206550947748, "loss": 3.477, "step": 46 }, { "epoch": 0.007617812715263989, "grad_norm": 1.3693876266479492, "learning_rate": 0.00019969585520339354, "loss": 3.4538, "step": 47 }, { "epoch": 0.007779893836865351, "grad_norm": 1.9597094058990479, "learning_rate": 0.0001996792016928203, "loss": 3.5043, "step": 48 }, { "epoch": 0.007941974958466713, "grad_norm": 1.8300191164016724, "learning_rate": 0.00019966210505179197, "loss": 3.7964, "step": 49 }, { "epoch": 0.008104056080068074, "grad_norm": 2.7875523567199707, "learning_rate": 0.00019964456535631286, "loss": 4.3156, "step": 50 }, { "epoch": 0.008266137201669436, "grad_norm": 1.066572666168213, "learning_rate": 0.0001996265826843568, "loss": 2.8682, "step": 51 }, { "epoch": 0.008428218323270797, "grad_norm": 1.0621248483657837, "learning_rate": 0.00019960815711586696, "loss": 3.0677, "step": 52 }, { "epoch": 0.008590299444872159, "grad_norm": 0.9207605123519897, "learning_rate": 0.00019958928873275539, "loss": 2.8688, "step": 53 }, { "epoch": 0.00875238056647352, "grad_norm": 0.7903249263763428, "learning_rate": 0.00019956997761890277, "loss": 2.7778, "step": 54 }, { "epoch": 0.008914461688074882, "grad_norm": 0.8444027900695801, "learning_rate": 0.00019955022386015792, "loss": 2.6653, "step": 55 }, { "epoch": 0.009076542809676243, "grad_norm": 0.8025749921798706, "learning_rate": 0.00019953002754433743, "loss": 2.6123, "step": 56 }, { "epoch": 0.009238623931277605, "grad_norm": 0.7850587964057922, "learning_rate": 0.00019950938876122542, "loss": 2.7537, "step": 57 }, { "epoch": 0.009400705052878966, "grad_norm": 0.7457495927810669, "learning_rate": 0.00019948830760257291, "loss": 2.7807, "step": 58 }, { "epoch": 0.009562786174480328, "grad_norm": 0.7835890650749207, "learning_rate": 0.0001994667841620976, "loss": 2.6427, "step": 59 }, { "epoch": 0.009724867296081689, "grad_norm": 0.8000785708427429, "learning_rate": 0.00019944481853548335, "loss": 2.5705, "step": 60 }, { "epoch": 0.009886948417683051, "grad_norm": 1.0432358980178833, "learning_rate": 0.00019942241082037982, "loss": 2.8235, "step": 61 }, { "epoch": 0.010049029539284412, "grad_norm": 0.7705084681510925, "learning_rate": 0.00019939956111640197, "loss": 2.5888, "step": 62 }, { "epoch": 0.010211110660885774, "grad_norm": 0.8200328350067139, "learning_rate": 0.00019937626952512964, "loss": 2.7773, "step": 63 }, { "epoch": 0.010373191782487134, "grad_norm": 0.6848300099372864, "learning_rate": 0.0001993525361501072, "loss": 2.7091, "step": 64 }, { "epoch": 0.010535272904088497, "grad_norm": 0.7549343705177307, "learning_rate": 0.00019932836109684286, "loss": 2.4635, "step": 65 }, { "epoch": 0.010697354025689857, "grad_norm": 0.7910590171813965, "learning_rate": 0.00019930374447280845, "loss": 2.6742, "step": 66 }, { "epoch": 0.01085943514729122, "grad_norm": 0.8571573495864868, "learning_rate": 0.00019927868638743875, "loss": 2.6027, "step": 67 }, { "epoch": 0.01102151626889258, "grad_norm": 0.7997406721115112, "learning_rate": 0.0001992531869521312, "loss": 2.6535, "step": 68 }, { "epoch": 0.011183597390493943, "grad_norm": 0.8311846852302551, "learning_rate": 0.00019922724628024515, "loss": 2.743, "step": 69 }, { "epoch": 0.011345678512095303, "grad_norm": 0.7518640756607056, "learning_rate": 0.0001992008644871016, "loss": 2.5151, "step": 70 }, { "epoch": 0.011507759633696666, "grad_norm": 0.890500545501709, "learning_rate": 0.00019917404168998256, "loss": 2.7083, "step": 71 }, { "epoch": 0.011669840755298026, "grad_norm": 0.7954429984092712, "learning_rate": 0.0001991467780081305, "loss": 2.6477, "step": 72 }, { "epoch": 0.011831921876899389, "grad_norm": 0.7375217080116272, "learning_rate": 0.00019911907356274795, "loss": 2.4982, "step": 73 }, { "epoch": 0.01199400299850075, "grad_norm": 0.7944385409355164, "learning_rate": 0.00019909092847699683, "loss": 2.6131, "step": 74 }, { "epoch": 0.012156084120102112, "grad_norm": 0.881007194519043, "learning_rate": 0.00019906234287599798, "loss": 2.7245, "step": 75 }, { "epoch": 0.012318165241703472, "grad_norm": 0.8365350961685181, "learning_rate": 0.00019903331688683057, "loss": 2.6156, "step": 76 }, { "epoch": 0.012480246363304835, "grad_norm": 0.8571576476097107, "learning_rate": 0.00019900385063853154, "loss": 2.5997, "step": 77 }, { "epoch": 0.012642327484906195, "grad_norm": 0.8759616017341614, "learning_rate": 0.00019897394426209505, "loss": 2.5906, "step": 78 }, { "epoch": 0.012804408606507557, "grad_norm": 0.8624269962310791, "learning_rate": 0.00019894359789047187, "loss": 2.7459, "step": 79 }, { "epoch": 0.012966489728108918, "grad_norm": 0.9585336446762085, "learning_rate": 0.00019891281165856873, "loss": 2.4964, "step": 80 }, { "epoch": 0.01312857084971028, "grad_norm": 0.9535956978797913, "learning_rate": 0.00019888158570324795, "loss": 2.6338, "step": 81 }, { "epoch": 0.013290651971311641, "grad_norm": 0.8365891575813293, "learning_rate": 0.0001988499201633265, "loss": 2.6502, "step": 82 }, { "epoch": 0.013452733092913003, "grad_norm": 1.0698297023773193, "learning_rate": 0.00019881781517957562, "loss": 2.6211, "step": 83 }, { "epoch": 0.013614814214514364, "grad_norm": 0.9466975331306458, "learning_rate": 0.0001987852708947202, "loss": 2.69, "step": 84 }, { "epoch": 0.013776895336115726, "grad_norm": 1.2203270196914673, "learning_rate": 0.00019875228745343794, "loss": 2.741, "step": 85 }, { "epoch": 0.013938976457717087, "grad_norm": 1.1437451839447021, "learning_rate": 0.0001987188650023589, "loss": 2.8166, "step": 86 }, { "epoch": 0.01410105757931845, "grad_norm": 1.4375919103622437, "learning_rate": 0.0001986850036900648, "loss": 2.6542, "step": 87 }, { "epoch": 0.01426313870091981, "grad_norm": 1.3663667440414429, "learning_rate": 0.00019865070366708836, "loss": 2.597, "step": 88 }, { "epoch": 0.014425219822521172, "grad_norm": 1.2260905504226685, "learning_rate": 0.00019861596508591255, "loss": 2.9777, "step": 89 }, { "epoch": 0.014587300944122533, "grad_norm": 1.7002184391021729, "learning_rate": 0.00019858078810097002, "loss": 2.6876, "step": 90 }, { "epoch": 0.014749382065723895, "grad_norm": 1.382377028465271, "learning_rate": 0.00019854517286864245, "loss": 2.9024, "step": 91 }, { "epoch": 0.014911463187325256, "grad_norm": 1.7291725873947144, "learning_rate": 0.0001985091195472596, "loss": 2.8279, "step": 92 }, { "epoch": 0.015073544308926618, "grad_norm": 1.690996766090393, "learning_rate": 0.0001984726282970989, "loss": 2.9042, "step": 93 }, { "epoch": 0.015235625430527979, "grad_norm": 1.3773317337036133, "learning_rate": 0.0001984356992803847, "loss": 2.8591, "step": 94 }, { "epoch": 0.015397706552129341, "grad_norm": 2.045140027999878, "learning_rate": 0.00019839833266128724, "loss": 3.0208, "step": 95 }, { "epoch": 0.015559787673730702, "grad_norm": 1.5940877199172974, "learning_rate": 0.00019836052860592237, "loss": 3.0601, "step": 96 }, { "epoch": 0.015721868795332064, "grad_norm": 1.720646858215332, "learning_rate": 0.0001983222872823505, "loss": 2.9597, "step": 97 }, { "epoch": 0.015883949916933426, "grad_norm": 2.037874460220337, "learning_rate": 0.00019828360886057594, "loss": 3.1805, "step": 98 }, { "epoch": 0.016046031038534785, "grad_norm": 2.515458822250366, "learning_rate": 0.00019824449351254616, "loss": 3.2132, "step": 99 }, { "epoch": 0.016208112160136148, "grad_norm": 3.2635490894317627, "learning_rate": 0.00019820494141215104, "loss": 3.7978, "step": 100 }, { "epoch": 0.01637019328173751, "grad_norm": 1.3721284866333008, "learning_rate": 0.000198164952735222, "loss": 2.7209, "step": 101 }, { "epoch": 0.016532274403338872, "grad_norm": 0.8304929733276367, "learning_rate": 0.00019812452765953135, "loss": 2.4077, "step": 102 }, { "epoch": 0.01669435552494023, "grad_norm": 1.0842708349227905, "learning_rate": 0.00019808366636479147, "loss": 2.4252, "step": 103 }, { "epoch": 0.016856436646541594, "grad_norm": 0.8582240343093872, "learning_rate": 0.00019804236903265388, "loss": 2.2418, "step": 104 }, { "epoch": 0.017018517768142956, "grad_norm": 0.8859397172927856, "learning_rate": 0.00019800063584670863, "loss": 2.3411, "step": 105 }, { "epoch": 0.017180598889744318, "grad_norm": 0.9900501370429993, "learning_rate": 0.00019795846699248332, "loss": 2.4639, "step": 106 }, { "epoch": 0.017342680011345677, "grad_norm": 1.003010869026184, "learning_rate": 0.00019791586265744237, "loss": 2.5822, "step": 107 }, { "epoch": 0.01750476113294704, "grad_norm": 0.8580072522163391, "learning_rate": 0.00019787282303098617, "loss": 2.3763, "step": 108 }, { "epoch": 0.017666842254548402, "grad_norm": 0.9203416109085083, "learning_rate": 0.0001978293483044502, "loss": 2.416, "step": 109 }, { "epoch": 0.017828923376149764, "grad_norm": 0.7686523199081421, "learning_rate": 0.00019778543867110426, "loss": 2.3347, "step": 110 }, { "epoch": 0.017991004497751123, "grad_norm": 0.7696153521537781, "learning_rate": 0.00019774109432615147, "loss": 2.1202, "step": 111 }, { "epoch": 0.018153085619352485, "grad_norm": 0.8893393278121948, "learning_rate": 0.00019769631546672756, "loss": 2.4122, "step": 112 }, { "epoch": 0.018315166740953848, "grad_norm": 0.9954109787940979, "learning_rate": 0.00019765110229189988, "loss": 2.3672, "step": 113 }, { "epoch": 0.01847724786255521, "grad_norm": 0.9527397751808167, "learning_rate": 0.00019760545500266657, "loss": 2.3611, "step": 114 }, { "epoch": 0.01863932898415657, "grad_norm": 0.9039053320884705, "learning_rate": 0.00019755937380195568, "loss": 2.3882, "step": 115 }, { "epoch": 0.01880141010575793, "grad_norm": 0.8039470911026001, "learning_rate": 0.00019751285889462423, "loss": 2.3428, "step": 116 }, { "epoch": 0.018963491227359294, "grad_norm": 0.8728412389755249, "learning_rate": 0.0001974659104874573, "loss": 2.4066, "step": 117 }, { "epoch": 0.019125572348960656, "grad_norm": 0.847098708152771, "learning_rate": 0.0001974185287891671, "loss": 2.465, "step": 118 }, { "epoch": 0.019287653470562015, "grad_norm": 0.8857790231704712, "learning_rate": 0.0001973707140103921, "loss": 2.4859, "step": 119 }, { "epoch": 0.019449734592163377, "grad_norm": 0.9171362519264221, "learning_rate": 0.00019732246636369605, "loss": 2.3492, "step": 120 }, { "epoch": 0.01961181571376474, "grad_norm": 0.8632153868675232, "learning_rate": 0.00019727378606356703, "loss": 2.3529, "step": 121 }, { "epoch": 0.019773896835366102, "grad_norm": 1.0254154205322266, "learning_rate": 0.00019722467332641656, "loss": 2.4594, "step": 122 }, { "epoch": 0.01993597795696746, "grad_norm": 0.9391173720359802, "learning_rate": 0.00019717512837057855, "loss": 2.1394, "step": 123 }, { "epoch": 0.020098059078568823, "grad_norm": 1.1696561574935913, "learning_rate": 0.0001971251514163083, "loss": 2.4457, "step": 124 }, { "epoch": 0.020260140200170185, "grad_norm": 1.1914767026901245, "learning_rate": 0.0001970747426857817, "loss": 2.4778, "step": 125 }, { "epoch": 0.020422221321771548, "grad_norm": 1.1912813186645508, "learning_rate": 0.00019702390240309404, "loss": 2.5327, "step": 126 }, { "epoch": 0.020584302443372907, "grad_norm": 1.272786021232605, "learning_rate": 0.0001969726307942592, "loss": 2.5287, "step": 127 }, { "epoch": 0.02074638356497427, "grad_norm": 0.9330551028251648, "learning_rate": 0.00019692092808720846, "loss": 2.5395, "step": 128 }, { "epoch": 0.02090846468657563, "grad_norm": 1.370786428451538, "learning_rate": 0.0001968687945117896, "loss": 2.4154, "step": 129 }, { "epoch": 0.021070545808176994, "grad_norm": 1.041229009628296, "learning_rate": 0.00019681623029976588, "loss": 2.435, "step": 130 }, { "epoch": 0.021232626929778353, "grad_norm": 1.0356900691986084, "learning_rate": 0.00019676323568481498, "loss": 2.603, "step": 131 }, { "epoch": 0.021394708051379715, "grad_norm": 1.4157955646514893, "learning_rate": 0.00019670981090252792, "loss": 2.4694, "step": 132 }, { "epoch": 0.021556789172981077, "grad_norm": 1.2365660667419434, "learning_rate": 0.00019665595619040808, "loss": 2.4311, "step": 133 }, { "epoch": 0.02171887029458244, "grad_norm": 1.5523958206176758, "learning_rate": 0.0001966016717878702, "loss": 2.5472, "step": 134 }, { "epoch": 0.0218809514161838, "grad_norm": 1.3933662176132202, "learning_rate": 0.00019654695793623907, "loss": 2.5602, "step": 135 }, { "epoch": 0.02204303253778516, "grad_norm": 1.3399230241775513, "learning_rate": 0.0001964918148787488, "loss": 2.3793, "step": 136 }, { "epoch": 0.022205113659386523, "grad_norm": 1.1035332679748535, "learning_rate": 0.00019643624286054144, "loss": 2.6046, "step": 137 }, { "epoch": 0.022367194780987885, "grad_norm": 1.1267286539077759, "learning_rate": 0.00019638024212866606, "loss": 2.5393, "step": 138 }, { "epoch": 0.022529275902589244, "grad_norm": 1.1763005256652832, "learning_rate": 0.0001963238129320776, "loss": 2.5834, "step": 139 }, { "epoch": 0.022691357024190607, "grad_norm": 1.3848681449890137, "learning_rate": 0.00019626695552163578, "loss": 2.8858, "step": 140 }, { "epoch": 0.02285343814579197, "grad_norm": 1.1526983976364136, "learning_rate": 0.00019620967015010395, "loss": 2.6724, "step": 141 }, { "epoch": 0.02301551926739333, "grad_norm": 1.6136480569839478, "learning_rate": 0.00019615195707214803, "loss": 2.7808, "step": 142 }, { "epoch": 0.02317760038899469, "grad_norm": 1.360098123550415, "learning_rate": 0.0001960938165443353, "loss": 2.923, "step": 143 }, { "epoch": 0.023339681510596053, "grad_norm": 1.4082938432693481, "learning_rate": 0.00019603524882513327, "loss": 2.744, "step": 144 }, { "epoch": 0.023501762632197415, "grad_norm": 1.5163816213607788, "learning_rate": 0.0001959762541749086, "loss": 2.8498, "step": 145 }, { "epoch": 0.023663843753798777, "grad_norm": 1.6904058456420898, "learning_rate": 0.00019591683285592593, "loss": 2.8353, "step": 146 }, { "epoch": 0.023825924875400136, "grad_norm": 1.7553011178970337, "learning_rate": 0.00019585698513234663, "loss": 2.8577, "step": 147 }, { "epoch": 0.0239880059970015, "grad_norm": 2.2253310680389404, "learning_rate": 0.0001957967112702277, "loss": 3.1318, "step": 148 }, { "epoch": 0.02415008711860286, "grad_norm": 2.3773317337036133, "learning_rate": 0.00019573601153752052, "loss": 3.1695, "step": 149 }, { "epoch": 0.024312168240204223, "grad_norm": 3.46358585357666, "learning_rate": 0.00019567488620406983, "loss": 3.66, "step": 150 }, { "epoch": 0.024474249361805582, "grad_norm": 1.1002161502838135, "learning_rate": 0.00019561333554161224, "loss": 2.5787, "step": 151 }, { "epoch": 0.024636330483406944, "grad_norm": 0.8970770835876465, "learning_rate": 0.0001955513598237753, "loss": 2.3198, "step": 152 }, { "epoch": 0.024798411605008307, "grad_norm": 0.9427694082260132, "learning_rate": 0.00019548895932607621, "loss": 2.4208, "step": 153 }, { "epoch": 0.02496049272660967, "grad_norm": 0.936225414276123, "learning_rate": 0.00019542613432592038, "loss": 2.3963, "step": 154 }, { "epoch": 0.025122573848211028, "grad_norm": 0.8925445675849915, "learning_rate": 0.00019536288510260056, "loss": 2.2791, "step": 155 }, { "epoch": 0.02528465496981239, "grad_norm": 1.1286380290985107, "learning_rate": 0.00019529921193729534, "loss": 2.3948, "step": 156 }, { "epoch": 0.025446736091413753, "grad_norm": 0.8829410672187805, "learning_rate": 0.00019523511511306793, "loss": 2.3171, "step": 157 }, { "epoch": 0.025608817213015115, "grad_norm": 0.9467886686325073, "learning_rate": 0.000195170594914865, "loss": 2.2482, "step": 158 }, { "epoch": 0.025770898334616474, "grad_norm": 1.098661184310913, "learning_rate": 0.00019510565162951537, "loss": 2.2541, "step": 159 }, { "epoch": 0.025932979456217836, "grad_norm": 1.0854690074920654, "learning_rate": 0.00019504028554572864, "loss": 2.261, "step": 160 }, { "epoch": 0.0260950605778192, "grad_norm": 0.887253999710083, "learning_rate": 0.00019497449695409408, "loss": 2.3605, "step": 161 }, { "epoch": 0.02625714169942056, "grad_norm": 0.8699051737785339, "learning_rate": 0.00019490828614707916, "loss": 2.1781, "step": 162 }, { "epoch": 0.02641922282102192, "grad_norm": 0.9914195537567139, "learning_rate": 0.00019484165341902845, "loss": 2.3487, "step": 163 }, { "epoch": 0.026581303942623282, "grad_norm": 0.7977816462516785, "learning_rate": 0.00019477459906616206, "loss": 2.2499, "step": 164 }, { "epoch": 0.026743385064224644, "grad_norm": 1.0885521173477173, "learning_rate": 0.00019470712338657458, "loss": 2.3353, "step": 165 }, { "epoch": 0.026905466185826007, "grad_norm": 0.9307372570037842, "learning_rate": 0.0001946392266802336, "loss": 2.349, "step": 166 }, { "epoch": 0.02706754730742737, "grad_norm": 0.8721451163291931, "learning_rate": 0.0001945709092489783, "loss": 2.1844, "step": 167 }, { "epoch": 0.027229628429028728, "grad_norm": 0.8243699669837952, "learning_rate": 0.00019450217139651844, "loss": 2.3944, "step": 168 }, { "epoch": 0.02739170955063009, "grad_norm": 0.8237115144729614, "learning_rate": 0.0001944330134284326, "loss": 2.3017, "step": 169 }, { "epoch": 0.027553790672231453, "grad_norm": 1.0215017795562744, "learning_rate": 0.00019436343565216711, "loss": 2.377, "step": 170 }, { "epoch": 0.027715871793832815, "grad_norm": 1.0807178020477295, "learning_rate": 0.00019429343837703455, "loss": 2.3705, "step": 171 }, { "epoch": 0.027877952915434174, "grad_norm": 0.8931332230567932, "learning_rate": 0.0001942230219142124, "loss": 2.3758, "step": 172 }, { "epoch": 0.028040034037035536, "grad_norm": 0.9059082865715027, "learning_rate": 0.0001941521865767417, "loss": 2.3395, "step": 173 }, { "epoch": 0.0282021151586369, "grad_norm": 0.8843547701835632, "learning_rate": 0.0001940809326795256, "loss": 2.3348, "step": 174 }, { "epoch": 0.02836419628023826, "grad_norm": 0.9769134521484375, "learning_rate": 0.000194009260539328, "loss": 2.3773, "step": 175 }, { "epoch": 0.02852627740183962, "grad_norm": 0.9252240657806396, "learning_rate": 0.0001939371704747721, "loss": 2.2362, "step": 176 }, { "epoch": 0.028688358523440982, "grad_norm": 0.9405879974365234, "learning_rate": 0.00019386466280633906, "loss": 2.3078, "step": 177 }, { "epoch": 0.028850439645042344, "grad_norm": 0.9854640960693359, "learning_rate": 0.00019379173785636646, "loss": 2.4224, "step": 178 }, { "epoch": 0.029012520766643707, "grad_norm": 1.0185924768447876, "learning_rate": 0.000193718395949047, "loss": 2.3796, "step": 179 }, { "epoch": 0.029174601888245066, "grad_norm": 1.1267231702804565, "learning_rate": 0.00019364463741042694, "loss": 2.5936, "step": 180 }, { "epoch": 0.029336683009846428, "grad_norm": 1.0917437076568604, "learning_rate": 0.00019357046256840473, "loss": 2.436, "step": 181 }, { "epoch": 0.02949876413144779, "grad_norm": 1.4547231197357178, "learning_rate": 0.00019349587175272948, "loss": 2.4356, "step": 182 }, { "epoch": 0.029660845253049153, "grad_norm": 1.2778880596160889, "learning_rate": 0.0001934208652949996, "loss": 2.3618, "step": 183 }, { "epoch": 0.02982292637465051, "grad_norm": 1.2359460592269897, "learning_rate": 0.00019334544352866127, "loss": 2.2503, "step": 184 }, { "epoch": 0.029985007496251874, "grad_norm": 1.6187139749526978, "learning_rate": 0.00019326960678900688, "loss": 2.5739, "step": 185 }, { "epoch": 0.030147088617853236, "grad_norm": 1.077547311782837, "learning_rate": 0.00019319335541317361, "loss": 2.5456, "step": 186 }, { "epoch": 0.0303091697394546, "grad_norm": 1.2906684875488281, "learning_rate": 0.00019311668974014208, "loss": 2.4993, "step": 187 }, { "epoch": 0.030471250861055958, "grad_norm": 1.446529507637024, "learning_rate": 0.00019303961011073447, "loss": 2.4235, "step": 188 }, { "epoch": 0.03063333198265732, "grad_norm": 1.2975163459777832, "learning_rate": 0.00019296211686761346, "loss": 2.6774, "step": 189 }, { "epoch": 0.030795413104258682, "grad_norm": 1.6122586727142334, "learning_rate": 0.00019288421035528028, "loss": 2.5858, "step": 190 }, { "epoch": 0.030957494225860045, "grad_norm": 1.3004416227340698, "learning_rate": 0.00019280589092007352, "loss": 2.6111, "step": 191 }, { "epoch": 0.031119575347461403, "grad_norm": 1.5591881275177002, "learning_rate": 0.00019272715891016735, "loss": 2.7386, "step": 192 }, { "epoch": 0.031281656469062766, "grad_norm": 1.6340643167495728, "learning_rate": 0.00019264801467557007, "loss": 2.628, "step": 193 }, { "epoch": 0.03144373759066413, "grad_norm": 1.4968132972717285, "learning_rate": 0.00019256845856812266, "loss": 2.8749, "step": 194 }, { "epoch": 0.03160581871226549, "grad_norm": 1.6335184574127197, "learning_rate": 0.000192488490941497, "loss": 2.6994, "step": 195 }, { "epoch": 0.03176789983386685, "grad_norm": 1.96530020236969, "learning_rate": 0.00019240811215119448, "loss": 2.7515, "step": 196 }, { "epoch": 0.031929980955468215, "grad_norm": 1.7235907316207886, "learning_rate": 0.00019232732255454422, "loss": 2.7286, "step": 197 }, { "epoch": 0.03209206207706957, "grad_norm": 2.007321834564209, "learning_rate": 0.00019224612251070175, "loss": 2.8945, "step": 198 }, { "epoch": 0.03225414319867093, "grad_norm": 2.7299695014953613, "learning_rate": 0.0001921645123806472, "loss": 3.2715, "step": 199 }, { "epoch": 0.032416224320272295, "grad_norm": 3.4719176292419434, "learning_rate": 0.0001920824925271838, "loss": 3.5109, "step": 200 }, { "epoch": 0.03257830544187366, "grad_norm": 1.11237370967865, "learning_rate": 0.0001920000633149362, "loss": 2.4583, "step": 201 }, { "epoch": 0.03274038656347502, "grad_norm": 1.248706579208374, "learning_rate": 0.00019191722511034884, "loss": 2.1951, "step": 202 }, { "epoch": 0.03290246768507638, "grad_norm": 1.2184851169586182, "learning_rate": 0.00019183397828168448, "loss": 2.1751, "step": 203 }, { "epoch": 0.033064548806677745, "grad_norm": 1.3734302520751953, "learning_rate": 0.00019175032319902234, "loss": 2.2204, "step": 204 }, { "epoch": 0.03322662992827911, "grad_norm": 1.0059350728988647, "learning_rate": 0.00019166626023425662, "loss": 2.2202, "step": 205 }, { "epoch": 0.03338871104988046, "grad_norm": 0.9674267172813416, "learning_rate": 0.00019158178976109476, "loss": 2.2254, "step": 206 }, { "epoch": 0.033550792171481825, "grad_norm": 1.008048415184021, "learning_rate": 0.0001914969121550558, "loss": 2.3269, "step": 207 }, { "epoch": 0.03371287329308319, "grad_norm": 0.890281081199646, "learning_rate": 0.00019141162779346874, "loss": 2.0735, "step": 208 }, { "epoch": 0.03387495441468455, "grad_norm": 0.901919424533844, "learning_rate": 0.00019132593705547082, "loss": 2.2881, "step": 209 }, { "epoch": 0.03403703553628591, "grad_norm": 1.0526825189590454, "learning_rate": 0.00019123984032200586, "loss": 2.1952, "step": 210 }, { "epoch": 0.034199116657887274, "grad_norm": 0.7686760425567627, "learning_rate": 0.00019115333797582254, "loss": 2.1207, "step": 211 }, { "epoch": 0.034361197779488636, "grad_norm": 1.027368426322937, "learning_rate": 0.00019106643040147278, "loss": 2.1631, "step": 212 }, { "epoch": 0.03452327890109, "grad_norm": 1.0761582851409912, "learning_rate": 0.00019097911798530987, "loss": 2.2614, "step": 213 }, { "epoch": 0.034685360022691354, "grad_norm": 0.8369147777557373, "learning_rate": 0.00019089140111548696, "loss": 2.1758, "step": 214 }, { "epoch": 0.034847441144292716, "grad_norm": 0.8233863115310669, "learning_rate": 0.00019080328018195513, "loss": 2.1106, "step": 215 }, { "epoch": 0.03500952226589408, "grad_norm": 1.033157229423523, "learning_rate": 0.0001907147555764618, "loss": 2.2292, "step": 216 }, { "epoch": 0.03517160338749544, "grad_norm": 0.8740055561065674, "learning_rate": 0.00019062582769254895, "loss": 2.1657, "step": 217 }, { "epoch": 0.035333684509096804, "grad_norm": 1.2374004125595093, "learning_rate": 0.00019053649692555135, "loss": 2.2105, "step": 218 }, { "epoch": 0.035495765630698166, "grad_norm": 0.9455053806304932, "learning_rate": 0.00019044676367259476, "loss": 2.2436, "step": 219 }, { "epoch": 0.03565784675229953, "grad_norm": 1.1991380453109741, "learning_rate": 0.00019035662833259432, "loss": 2.2907, "step": 220 }, { "epoch": 0.03581992787390089, "grad_norm": 1.0990439653396606, "learning_rate": 0.00019026609130625257, "loss": 2.0842, "step": 221 }, { "epoch": 0.035982008995502246, "grad_norm": 0.9472472667694092, "learning_rate": 0.00019017515299605788, "loss": 2.189, "step": 222 }, { "epoch": 0.03614409011710361, "grad_norm": 1.0272729396820068, "learning_rate": 0.00019008381380628247, "loss": 2.1383, "step": 223 }, { "epoch": 0.03630617123870497, "grad_norm": 1.0076351165771484, "learning_rate": 0.00018999207414298067, "loss": 2.2219, "step": 224 }, { "epoch": 0.03646825236030633, "grad_norm": 0.9722249507904053, "learning_rate": 0.00018989993441398726, "loss": 2.2792, "step": 225 }, { "epoch": 0.036630333481907695, "grad_norm": 0.9026159048080444, "learning_rate": 0.00018980739502891546, "loss": 2.2208, "step": 226 }, { "epoch": 0.03679241460350906, "grad_norm": 1.2710939645767212, "learning_rate": 0.0001897144563991552, "loss": 2.1999, "step": 227 }, { "epoch": 0.03695449572511042, "grad_norm": 1.0243679285049438, "learning_rate": 0.00018962111893787128, "loss": 2.3907, "step": 228 }, { "epoch": 0.03711657684671178, "grad_norm": 1.4581419229507446, "learning_rate": 0.00018952738306000151, "loss": 2.3261, "step": 229 }, { "epoch": 0.03727865796831314, "grad_norm": 1.0644052028656006, "learning_rate": 0.00018943324918225494, "loss": 2.2427, "step": 230 }, { "epoch": 0.0374407390899145, "grad_norm": 0.9349438548088074, "learning_rate": 0.0001893387177231099, "loss": 2.2227, "step": 231 }, { "epoch": 0.03760282021151586, "grad_norm": 1.7584396600723267, "learning_rate": 0.0001892437891028122, "loss": 2.5222, "step": 232 }, { "epoch": 0.037764901333117225, "grad_norm": 1.1246528625488281, "learning_rate": 0.0001891484637433733, "loss": 2.3478, "step": 233 }, { "epoch": 0.03792698245471859, "grad_norm": 1.7504336833953857, "learning_rate": 0.00018905274206856837, "loss": 2.4176, "step": 234 }, { "epoch": 0.03808906357631995, "grad_norm": 1.7333769798278809, "learning_rate": 0.00018895662450393438, "loss": 2.3627, "step": 235 }, { "epoch": 0.03825114469792131, "grad_norm": 1.1291919946670532, "learning_rate": 0.00018886011147676833, "loss": 2.2326, "step": 236 }, { "epoch": 0.038413225819522674, "grad_norm": 2.3500661849975586, "learning_rate": 0.00018876320341612522, "loss": 2.3338, "step": 237 }, { "epoch": 0.03857530694112403, "grad_norm": 1.9575247764587402, "learning_rate": 0.00018866590075281624, "loss": 2.4039, "step": 238 }, { "epoch": 0.03873738806272539, "grad_norm": 1.1364452838897705, "learning_rate": 0.00018856820391940674, "loss": 2.4962, "step": 239 }, { "epoch": 0.038899469184326754, "grad_norm": 1.8569177389144897, "learning_rate": 0.00018847011335021449, "loss": 2.5976, "step": 240 }, { "epoch": 0.03906155030592812, "grad_norm": 1.4362958669662476, "learning_rate": 0.00018837162948130752, "loss": 2.5504, "step": 241 }, { "epoch": 0.03922363142752948, "grad_norm": 1.2726794481277466, "learning_rate": 0.00018827275275050233, "loss": 2.5475, "step": 242 }, { "epoch": 0.03938571254913084, "grad_norm": 1.394042730331421, "learning_rate": 0.00018817348359736203, "loss": 2.6246, "step": 243 }, { "epoch": 0.039547793670732204, "grad_norm": 1.6995958089828491, "learning_rate": 0.00018807382246319412, "loss": 2.6082, "step": 244 }, { "epoch": 0.039709874792333566, "grad_norm": 1.566512942314148, "learning_rate": 0.00018797376979104872, "loss": 2.6337, "step": 245 }, { "epoch": 0.03987195591393492, "grad_norm": 1.7819331884384155, "learning_rate": 0.00018787332602571662, "loss": 2.6822, "step": 246 }, { "epoch": 0.040034037035536284, "grad_norm": 1.922326922416687, "learning_rate": 0.00018777249161372713, "loss": 2.7644, "step": 247 }, { "epoch": 0.040196118157137646, "grad_norm": 2.2240724563598633, "learning_rate": 0.00018767126700334634, "loss": 3.0321, "step": 248 }, { "epoch": 0.04035819927873901, "grad_norm": 2.2767648696899414, "learning_rate": 0.0001875696526445749, "loss": 2.767, "step": 249 }, { "epoch": 0.04052028040034037, "grad_norm": 3.148064613342285, "learning_rate": 0.0001874676489891461, "loss": 3.5881, "step": 250 }, { "epoch": 0.04068236152194173, "grad_norm": 1.428133487701416, "learning_rate": 0.00018736525649052394, "loss": 2.3483, "step": 251 }, { "epoch": 0.040844442643543095, "grad_norm": 1.0642366409301758, "learning_rate": 0.00018726247560390099, "loss": 2.3377, "step": 252 }, { "epoch": 0.04100652376514446, "grad_norm": 1.277927279472351, "learning_rate": 0.00018715930678619644, "loss": 2.3101, "step": 253 }, { "epoch": 0.04116860488674581, "grad_norm": 1.297062873840332, "learning_rate": 0.00018705575049605413, "loss": 2.1857, "step": 254 }, { "epoch": 0.041330686008347176, "grad_norm": 1.0266833305358887, "learning_rate": 0.00018695180719384029, "loss": 2.2796, "step": 255 }, { "epoch": 0.04149276712994854, "grad_norm": 1.329106330871582, "learning_rate": 0.00018684747734164177, "loss": 2.097, "step": 256 }, { "epoch": 0.0416548482515499, "grad_norm": 1.1051158905029297, "learning_rate": 0.00018674276140326376, "loss": 2.1601, "step": 257 }, { "epoch": 0.04181692937315126, "grad_norm": 1.1395552158355713, "learning_rate": 0.00018663765984422786, "loss": 2.2773, "step": 258 }, { "epoch": 0.041979010494752625, "grad_norm": 1.0176094770431519, "learning_rate": 0.00018653217313177004, "loss": 2.1027, "step": 259 }, { "epoch": 0.04214109161635399, "grad_norm": 1.410070538520813, "learning_rate": 0.00018642630173483832, "loss": 2.2349, "step": 260 }, { "epoch": 0.04230317273795535, "grad_norm": 1.0913872718811035, "learning_rate": 0.00018632004612409103, "loss": 2.1952, "step": 261 }, { "epoch": 0.042465253859556705, "grad_norm": 1.0407989025115967, "learning_rate": 0.00018621340677189453, "loss": 2.0673, "step": 262 }, { "epoch": 0.04262733498115807, "grad_norm": 0.8925177454948425, "learning_rate": 0.00018610638415232097, "loss": 2.0325, "step": 263 }, { "epoch": 0.04278941610275943, "grad_norm": 1.2974311113357544, "learning_rate": 0.00018599897874114652, "loss": 2.2241, "step": 264 }, { "epoch": 0.04295149722436079, "grad_norm": 1.0207972526550293, "learning_rate": 0.00018589119101584898, "loss": 2.1243, "step": 265 }, { "epoch": 0.043113578345962154, "grad_norm": 0.9445371627807617, "learning_rate": 0.00018578302145560584, "loss": 2.1291, "step": 266 }, { "epoch": 0.04327565946756352, "grad_norm": 1.0071361064910889, "learning_rate": 0.00018567447054129195, "loss": 2.0553, "step": 267 }, { "epoch": 0.04343774058916488, "grad_norm": 0.9603691101074219, "learning_rate": 0.00018556553875547754, "loss": 2.1355, "step": 268 }, { "epoch": 0.04359982171076624, "grad_norm": 0.9493493437767029, "learning_rate": 0.00018545622658242607, "loss": 2.2426, "step": 269 }, { "epoch": 0.0437619028323676, "grad_norm": 1.3570773601531982, "learning_rate": 0.00018534653450809197, "loss": 2.17, "step": 270 }, { "epoch": 0.04392398395396896, "grad_norm": 1.1473950147628784, "learning_rate": 0.00018523646302011867, "loss": 2.1311, "step": 271 }, { "epoch": 0.04408606507557032, "grad_norm": 0.9925466775894165, "learning_rate": 0.00018512601260783606, "loss": 2.0384, "step": 272 }, { "epoch": 0.044248146197171684, "grad_norm": 1.0609676837921143, "learning_rate": 0.00018501518376225887, "loss": 2.2505, "step": 273 }, { "epoch": 0.044410227318773046, "grad_norm": 0.9563316702842712, "learning_rate": 0.00018490397697608395, "loss": 2.0907, "step": 274 }, { "epoch": 0.04457230844037441, "grad_norm": 0.998549222946167, "learning_rate": 0.0001847923927436884, "loss": 2.1951, "step": 275 }, { "epoch": 0.04473438956197577, "grad_norm": 0.9470577836036682, "learning_rate": 0.00018468043156112728, "loss": 2.1417, "step": 276 }, { "epoch": 0.04489647068357713, "grad_norm": 1.134791374206543, "learning_rate": 0.0001845680939261314, "loss": 2.1117, "step": 277 }, { "epoch": 0.04505855180517849, "grad_norm": 1.16586434841156, "learning_rate": 0.00018445538033810515, "loss": 2.2197, "step": 278 }, { "epoch": 0.04522063292677985, "grad_norm": 1.1667602062225342, "learning_rate": 0.00018434229129812418, "loss": 2.3319, "step": 279 }, { "epoch": 0.04538271404838121, "grad_norm": 1.3442655801773071, "learning_rate": 0.0001842288273089332, "loss": 2.1906, "step": 280 }, { "epoch": 0.045544795169982576, "grad_norm": 1.522715449333191, "learning_rate": 0.00018411498887494396, "loss": 2.2284, "step": 281 }, { "epoch": 0.04570687629158394, "grad_norm": 1.083176851272583, "learning_rate": 0.00018400077650223263, "loss": 2.4033, "step": 282 }, { "epoch": 0.0458689574131853, "grad_norm": 1.6272213459014893, "learning_rate": 0.0001838861906985379, "loss": 2.4303, "step": 283 }, { "epoch": 0.04603103853478666, "grad_norm": 1.0966017246246338, "learning_rate": 0.00018377123197325842, "loss": 2.4413, "step": 284 }, { "epoch": 0.046193119656388025, "grad_norm": 1.427307367324829, "learning_rate": 0.00018365590083745085, "loss": 2.2448, "step": 285 }, { "epoch": 0.04635520077798938, "grad_norm": 1.1614744663238525, "learning_rate": 0.00018354019780382735, "loss": 2.3916, "step": 286 }, { "epoch": 0.04651728189959074, "grad_norm": 1.4718098640441895, "learning_rate": 0.0001834241233867533, "loss": 2.3877, "step": 287 }, { "epoch": 0.046679363021192105, "grad_norm": 1.2141605615615845, "learning_rate": 0.00018330767810224524, "loss": 2.4906, "step": 288 }, { "epoch": 0.04684144414279347, "grad_norm": 1.296593427658081, "learning_rate": 0.0001831908624679683, "loss": 2.4217, "step": 289 }, { "epoch": 0.04700352526439483, "grad_norm": 1.238234519958496, "learning_rate": 0.0001830736770032341, "loss": 2.4124, "step": 290 }, { "epoch": 0.04716560638599619, "grad_norm": 1.3287264108657837, "learning_rate": 0.0001829561222289984, "loss": 2.5919, "step": 291 }, { "epoch": 0.047327687507597554, "grad_norm": 1.575232744216919, "learning_rate": 0.00018283819866785853, "loss": 2.6511, "step": 292 }, { "epoch": 0.04748976862919892, "grad_norm": 1.5590267181396484, "learning_rate": 0.0001827199068440516, "loss": 2.3753, "step": 293 }, { "epoch": 0.04765184975080027, "grad_norm": 1.3798424005508423, "learning_rate": 0.00018260124728345162, "loss": 2.6057, "step": 294 }, { "epoch": 0.047813930872401635, "grad_norm": 1.6804996728897095, "learning_rate": 0.00018248222051356754, "loss": 2.7774, "step": 295 }, { "epoch": 0.047976011994003, "grad_norm": 1.7314913272857666, "learning_rate": 0.00018236282706354063, "loss": 2.603, "step": 296 }, { "epoch": 0.04813809311560436, "grad_norm": 1.8370392322540283, "learning_rate": 0.00018224306746414238, "loss": 2.6394, "step": 297 }, { "epoch": 0.04830017423720572, "grad_norm": 2.3413655757904053, "learning_rate": 0.00018212294224777197, "loss": 2.8592, "step": 298 }, { "epoch": 0.048462255358807084, "grad_norm": 3.0159950256347656, "learning_rate": 0.00018200245194845399, "loss": 3.157, "step": 299 }, { "epoch": 0.048624336480408446, "grad_norm": 3.672441005706787, "learning_rate": 0.00018188159710183594, "loss": 3.2499, "step": 300 }, { "epoch": 0.04878641760200981, "grad_norm": 1.427798867225647, "learning_rate": 0.000181760378245186, "loss": 2.4282, "step": 301 }, { "epoch": 0.048948498723611164, "grad_norm": 1.2800281047821045, "learning_rate": 0.00018163879591739067, "loss": 2.1978, "step": 302 }, { "epoch": 0.049110579845212526, "grad_norm": 0.8815104961395264, "learning_rate": 0.0001815168506589521, "loss": 2.2418, "step": 303 }, { "epoch": 0.04927266096681389, "grad_norm": 1.0522781610488892, "learning_rate": 0.000181394543011986, "loss": 2.0928, "step": 304 }, { "epoch": 0.04943474208841525, "grad_norm": 1.028746485710144, "learning_rate": 0.00018127187352021907, "loss": 2.11, "step": 305 }, { "epoch": 0.04959682321001661, "grad_norm": 0.9874324798583984, "learning_rate": 0.0001811488427289866, "loss": 2.0513, "step": 306 }, { "epoch": 0.049758904331617976, "grad_norm": 0.8898067474365234, "learning_rate": 0.00018102545118523007, "loss": 2.0351, "step": 307 }, { "epoch": 0.04992098545321934, "grad_norm": 0.834274411201477, "learning_rate": 0.00018090169943749476, "loss": 1.9415, "step": 308 }, { "epoch": 0.0500830665748207, "grad_norm": 0.928421676158905, "learning_rate": 0.00018077758803592718, "loss": 2.1434, "step": 309 }, { "epoch": 0.050245147696422056, "grad_norm": 0.8722582459449768, "learning_rate": 0.00018065311753227273, "loss": 2.0717, "step": 310 }, { "epoch": 0.05040722881802342, "grad_norm": 0.7962062358856201, "learning_rate": 0.0001805282884798732, "loss": 2.0131, "step": 311 }, { "epoch": 0.05056930993962478, "grad_norm": 0.9616008400917053, "learning_rate": 0.00018040310143366446, "loss": 2.0606, "step": 312 }, { "epoch": 0.05073139106122614, "grad_norm": 0.811673641204834, "learning_rate": 0.00018027755695017368, "loss": 2.0475, "step": 313 }, { "epoch": 0.050893472182827505, "grad_norm": 0.8326351642608643, "learning_rate": 0.00018015165558751717, "loss": 2.2088, "step": 314 }, { "epoch": 0.05105555330442887, "grad_norm": 0.9125694632530212, "learning_rate": 0.00018002539790539773, "loss": 1.91, "step": 315 }, { "epoch": 0.05121763442603023, "grad_norm": 1.0221319198608398, "learning_rate": 0.00017989878446510215, "loss": 2.1462, "step": 316 }, { "epoch": 0.05137971554763159, "grad_norm": 0.8795247077941895, "learning_rate": 0.00017977181582949888, "loss": 2.1541, "step": 317 }, { "epoch": 0.05154179666923295, "grad_norm": 0.8685728311538696, "learning_rate": 0.0001796444925630353, "loss": 2.1562, "step": 318 }, { "epoch": 0.05170387779083431, "grad_norm": 1.1004655361175537, "learning_rate": 0.00017951681523173542, "loss": 2.1231, "step": 319 }, { "epoch": 0.05186595891243567, "grad_norm": 0.9287633299827576, "learning_rate": 0.0001793887844031972, "loss": 2.0784, "step": 320 }, { "epoch": 0.052028040034037035, "grad_norm": 1.0385360717773438, "learning_rate": 0.00017926040064659014, "loss": 2.086, "step": 321 }, { "epoch": 0.0521901211556384, "grad_norm": 0.8999946117401123, "learning_rate": 0.0001791316645326526, "loss": 2.0312, "step": 322 }, { "epoch": 0.05235220227723976, "grad_norm": 1.0374078750610352, "learning_rate": 0.00017900257663368963, "loss": 2.1407, "step": 323 }, { "epoch": 0.05251428339884112, "grad_norm": 1.061728835105896, "learning_rate": 0.0001788731375235698, "loss": 2.1352, "step": 324 }, { "epoch": 0.052676364520442484, "grad_norm": 1.033845067024231, "learning_rate": 0.00017874334777772327, "loss": 2.2804, "step": 325 }, { "epoch": 0.05283844564204384, "grad_norm": 1.2489559650421143, "learning_rate": 0.00017861320797313892, "loss": 2.1746, "step": 326 }, { "epoch": 0.0530005267636452, "grad_norm": 0.988294243812561, "learning_rate": 0.0001784827186883618, "loss": 2.1422, "step": 327 }, { "epoch": 0.053162607885246564, "grad_norm": 1.2897940874099731, "learning_rate": 0.00017835188050349064, "loss": 2.0962, "step": 328 }, { "epoch": 0.053324689006847927, "grad_norm": 1.1834853887557983, "learning_rate": 0.00017822069400017516, "loss": 2.2816, "step": 329 }, { "epoch": 0.05348677012844929, "grad_norm": 1.1888794898986816, "learning_rate": 0.00017808915976161362, "loss": 2.0314, "step": 330 }, { "epoch": 0.05364885125005065, "grad_norm": 1.5024569034576416, "learning_rate": 0.00017795727837255015, "loss": 2.1153, "step": 331 }, { "epoch": 0.053810932371652014, "grad_norm": 1.130858302116394, "learning_rate": 0.00017782505041927216, "loss": 2.3989, "step": 332 }, { "epoch": 0.053973013493253376, "grad_norm": 1.5125319957733154, "learning_rate": 0.00017769247648960774, "loss": 2.1236, "step": 333 }, { "epoch": 0.05413509461485474, "grad_norm": 1.5707167387008667, "learning_rate": 0.00017755955717292296, "loss": 2.1928, "step": 334 }, { "epoch": 0.054297175736456094, "grad_norm": 1.1016305685043335, "learning_rate": 0.00017742629306011944, "loss": 2.2474, "step": 335 }, { "epoch": 0.054459256858057456, "grad_norm": 1.2245185375213623, "learning_rate": 0.00017729268474363154, "loss": 2.2744, "step": 336 }, { "epoch": 0.05462133797965882, "grad_norm": 1.4685486555099487, "learning_rate": 0.0001771587328174239, "loss": 2.3349, "step": 337 }, { "epoch": 0.05478341910126018, "grad_norm": 1.1773123741149902, "learning_rate": 0.0001770244378769885, "loss": 2.3933, "step": 338 }, { "epoch": 0.05494550022286154, "grad_norm": 1.5868879556655884, "learning_rate": 0.0001768898005193425, "loss": 2.4543, "step": 339 }, { "epoch": 0.055107581344462905, "grad_norm": 1.4568393230438232, "learning_rate": 0.000176754821343025, "loss": 2.3851, "step": 340 }, { "epoch": 0.05526966246606427, "grad_norm": 1.3486446142196655, "learning_rate": 0.0001766195009480949, "loss": 2.4321, "step": 341 }, { "epoch": 0.05543174358766563, "grad_norm": 1.5296896696090698, "learning_rate": 0.0001764838399361279, "loss": 2.5294, "step": 342 }, { "epoch": 0.055593824709266985, "grad_norm": 1.6126346588134766, "learning_rate": 0.00017634783891021393, "loss": 2.6512, "step": 343 }, { "epoch": 0.05575590583086835, "grad_norm": 1.5726284980773926, "learning_rate": 0.00017621149847495458, "loss": 2.5767, "step": 344 }, { "epoch": 0.05591798695246971, "grad_norm": 2.3036086559295654, "learning_rate": 0.00017607481923646016, "loss": 2.6289, "step": 345 }, { "epoch": 0.05608006807407107, "grad_norm": 1.6342629194259644, "learning_rate": 0.0001759378018023473, "loss": 2.6854, "step": 346 }, { "epoch": 0.056242149195672435, "grad_norm": 1.9900621175765991, "learning_rate": 0.00017580044678173592, "loss": 2.9069, "step": 347 }, { "epoch": 0.0564042303172738, "grad_norm": 2.33109450340271, "learning_rate": 0.00017566275478524693, "loss": 2.5896, "step": 348 }, { "epoch": 0.05656631143887516, "grad_norm": 2.7142515182495117, "learning_rate": 0.0001755247264249991, "loss": 3.0566, "step": 349 }, { "epoch": 0.05672839256047652, "grad_norm": 3.5597503185272217, "learning_rate": 0.0001753863623146066, "loss": 3.4787, "step": 350 }, { "epoch": 0.05689047368207788, "grad_norm": 1.2078644037246704, "learning_rate": 0.00017524766306917618, "loss": 2.2079, "step": 351 }, { "epoch": 0.05705255480367924, "grad_norm": 0.9320293068885803, "learning_rate": 0.0001751086293053045, "loss": 2.2482, "step": 352 }, { "epoch": 0.0572146359252806, "grad_norm": 1.6044763326644897, "learning_rate": 0.0001749692616410753, "loss": 2.1207, "step": 353 }, { "epoch": 0.057376717046881964, "grad_norm": 0.9895527362823486, "learning_rate": 0.00017482956069605668, "loss": 2.1629, "step": 354 }, { "epoch": 0.05753879816848333, "grad_norm": 0.9529826641082764, "learning_rate": 0.00017468952709129846, "loss": 2.1446, "step": 355 }, { "epoch": 0.05770087929008469, "grad_norm": 1.0963599681854248, "learning_rate": 0.00017454916144932922, "loss": 2.0365, "step": 356 }, { "epoch": 0.05786296041168605, "grad_norm": 0.8234672546386719, "learning_rate": 0.0001744084643941536, "loss": 2.0966, "step": 357 }, { "epoch": 0.058025041533287414, "grad_norm": 0.9184057712554932, "learning_rate": 0.00017426743655124974, "loss": 2.0621, "step": 358 }, { "epoch": 0.05818712265488877, "grad_norm": 0.7979874014854431, "learning_rate": 0.0001741260785475661, "loss": 2.0949, "step": 359 }, { "epoch": 0.05834920377649013, "grad_norm": 1.0196961164474487, "learning_rate": 0.00017398439101151905, "loss": 2.0579, "step": 360 }, { "epoch": 0.058511284898091494, "grad_norm": 0.9489096403121948, "learning_rate": 0.00017384237457298987, "loss": 2.2265, "step": 361 }, { "epoch": 0.058673366019692856, "grad_norm": 0.8029397130012512, "learning_rate": 0.00017370002986332193, "loss": 2.087, "step": 362 }, { "epoch": 0.05883544714129422, "grad_norm": 0.9402124285697937, "learning_rate": 0.00017355735751531807, "loss": 2.1656, "step": 363 }, { "epoch": 0.05899752826289558, "grad_norm": 1.070616364479065, "learning_rate": 0.00017341435816323756, "loss": 2.2847, "step": 364 }, { "epoch": 0.05915960938449694, "grad_norm": 0.8646097183227539, "learning_rate": 0.00017327103244279348, "loss": 2.0102, "step": 365 }, { "epoch": 0.059321690506098305, "grad_norm": 0.9388591647148132, "learning_rate": 0.00017312738099114973, "loss": 2.1441, "step": 366 }, { "epoch": 0.05948377162769966, "grad_norm": 0.9204132556915283, "learning_rate": 0.00017298340444691835, "loss": 2.0562, "step": 367 }, { "epoch": 0.05964585274930102, "grad_norm": 0.9248064160346985, "learning_rate": 0.00017283910345015647, "loss": 2.0438, "step": 368 }, { "epoch": 0.059807933870902386, "grad_norm": 1.0187243223190308, "learning_rate": 0.0001726944786423637, "loss": 2.1806, "step": 369 }, { "epoch": 0.05997001499250375, "grad_norm": 0.9070429801940918, "learning_rate": 0.00017254953066647913, "loss": 2.0144, "step": 370 }, { "epoch": 0.06013209611410511, "grad_norm": 1.2213033437728882, "learning_rate": 0.00017240426016687863, "loss": 2.0817, "step": 371 }, { "epoch": 0.06029417723570647, "grad_norm": 0.9484167695045471, "learning_rate": 0.00017225866778937165, "loss": 2.0152, "step": 372 }, { "epoch": 0.060456258357307835, "grad_norm": 1.308074712753296, "learning_rate": 0.00017211275418119876, "loss": 2.1363, "step": 373 }, { "epoch": 0.0606183394789092, "grad_norm": 1.1076298952102661, "learning_rate": 0.0001719665199910285, "loss": 2.0591, "step": 374 }, { "epoch": 0.06078042060051055, "grad_norm": 1.1952176094055176, "learning_rate": 0.00017181996586895454, "loss": 2.0732, "step": 375 }, { "epoch": 0.06078042060051055, "eval_loss": 2.2569665908813477, "eval_runtime": 615.1133, "eval_samples_per_second": 16.894, "eval_steps_per_second": 8.447, "step": 375 }, { "epoch": 0.060942501722111915, "grad_norm": 1.1536272764205933, "learning_rate": 0.00017167309246649297, "loss": 2.1701, "step": 376 }, { "epoch": 0.06110458284371328, "grad_norm": 1.0178909301757812, "learning_rate": 0.0001715259004365791, "loss": 2.2424, "step": 377 }, { "epoch": 0.06126666396531464, "grad_norm": 1.2246919870376587, "learning_rate": 0.00017137839043356484, "loss": 2.1582, "step": 378 }, { "epoch": 0.061428745086916, "grad_norm": 1.0697070360183716, "learning_rate": 0.00017123056311321562, "loss": 2.1036, "step": 379 }, { "epoch": 0.061590826208517364, "grad_norm": 1.1001579761505127, "learning_rate": 0.0001710824191327075, "loss": 2.1875, "step": 380 }, { "epoch": 0.06175290733011873, "grad_norm": 1.2305734157562256, "learning_rate": 0.00017093395915062428, "loss": 2.2054, "step": 381 }, { "epoch": 0.06191498845172009, "grad_norm": 1.0534331798553467, "learning_rate": 0.00017078518382695465, "loss": 2.1096, "step": 382 }, { "epoch": 0.062077069573321444, "grad_norm": 1.355393886566162, "learning_rate": 0.00017063609382308908, "loss": 2.1579, "step": 383 }, { "epoch": 0.06223915069492281, "grad_norm": 1.642356514930725, "learning_rate": 0.00017048668980181698, "loss": 2.3534, "step": 384 }, { "epoch": 0.06240123181652417, "grad_norm": 1.2375314235687256, "learning_rate": 0.00017033697242732377, "loss": 2.2071, "step": 385 }, { "epoch": 0.06256331293812553, "grad_norm": 1.4431997537612915, "learning_rate": 0.0001701869423651879, "loss": 2.4343, "step": 386 }, { "epoch": 0.0627253940597269, "grad_norm": 1.9159278869628906, "learning_rate": 0.00017003660028237793, "loss": 2.4255, "step": 387 }, { "epoch": 0.06288747518132826, "grad_norm": 1.2807015180587769, "learning_rate": 0.00016988594684724947, "loss": 2.2231, "step": 388 }, { "epoch": 0.06304955630292962, "grad_norm": 1.5241148471832275, "learning_rate": 0.00016973498272954222, "loss": 2.3968, "step": 389 }, { "epoch": 0.06321163742453098, "grad_norm": 1.9623697996139526, "learning_rate": 0.00016958370860037717, "loss": 2.4719, "step": 390 }, { "epoch": 0.06337371854613234, "grad_norm": 1.771894097328186, "learning_rate": 0.00016943212513225345, "loss": 2.4695, "step": 391 }, { "epoch": 0.0635357996677337, "grad_norm": 1.5892380475997925, "learning_rate": 0.00016928023299904533, "loss": 2.6788, "step": 392 }, { "epoch": 0.06369788078933507, "grad_norm": 1.7588838338851929, "learning_rate": 0.0001691280328759992, "loss": 2.4339, "step": 393 }, { "epoch": 0.06385996191093643, "grad_norm": 1.7632874250411987, "learning_rate": 0.00016897552543973084, "loss": 2.6033, "step": 394 }, { "epoch": 0.06402204303253778, "grad_norm": 1.6468191146850586, "learning_rate": 0.00016882271136822206, "loss": 2.8283, "step": 395 }, { "epoch": 0.06418412415413914, "grad_norm": 2.1224801540374756, "learning_rate": 0.0001686695913408179, "loss": 2.5717, "step": 396 }, { "epoch": 0.0643462052757405, "grad_norm": 1.6829807758331299, "learning_rate": 0.0001685161660382235, "loss": 2.6187, "step": 397 }, { "epoch": 0.06450828639734187, "grad_norm": 1.9323954582214355, "learning_rate": 0.00016836243614250113, "loss": 2.8017, "step": 398 }, { "epoch": 0.06467036751894323, "grad_norm": 2.5498275756835938, "learning_rate": 0.00016820840233706719, "loss": 2.9575, "step": 399 }, { "epoch": 0.06483244864054459, "grad_norm": 3.1542487144470215, "learning_rate": 0.0001680540653066891, "loss": 3.0259, "step": 400 }, { "epoch": 0.06499452976214595, "grad_norm": 1.081676721572876, "learning_rate": 0.00016789942573748232, "loss": 2.2282, "step": 401 }, { "epoch": 0.06515661088374732, "grad_norm": 0.8676232695579529, "learning_rate": 0.0001677444843169072, "loss": 2.2736, "step": 402 }, { "epoch": 0.06531869200534868, "grad_norm": 1.4076144695281982, "learning_rate": 0.00016758924173376603, "loss": 2.3064, "step": 403 }, { "epoch": 0.06548077312695004, "grad_norm": 1.048743486404419, "learning_rate": 0.0001674336986781999, "loss": 2.1371, "step": 404 }, { "epoch": 0.0656428542485514, "grad_norm": 1.3138046264648438, "learning_rate": 0.00016727785584168581, "loss": 2.0911, "step": 405 }, { "epoch": 0.06580493537015276, "grad_norm": 1.2267826795578003, "learning_rate": 0.0001671217139170333, "loss": 2.0977, "step": 406 }, { "epoch": 0.06596701649175413, "grad_norm": 0.855918288230896, "learning_rate": 0.00016696527359838154, "loss": 2.2079, "step": 407 }, { "epoch": 0.06612909761335549, "grad_norm": 1.083713412284851, "learning_rate": 0.00016680853558119632, "loss": 2.3051, "step": 408 }, { "epoch": 0.06629117873495685, "grad_norm": 1.0036380290985107, "learning_rate": 0.0001666515005622668, "loss": 2.0158, "step": 409 }, { "epoch": 0.06645325985655821, "grad_norm": 0.942942202091217, "learning_rate": 0.0001664941692397025, "loss": 2.0838, "step": 410 }, { "epoch": 0.06661534097815956, "grad_norm": 0.9875555634498596, "learning_rate": 0.00016633654231293013, "loss": 2.0539, "step": 411 }, { "epoch": 0.06677742209976092, "grad_norm": 0.8408850431442261, "learning_rate": 0.00016617862048269065, "loss": 2.1277, "step": 412 }, { "epoch": 0.06693950322136229, "grad_norm": 0.9507250189781189, "learning_rate": 0.00016602040445103588, "loss": 2.0143, "step": 413 }, { "epoch": 0.06710158434296365, "grad_norm": 0.8782068490982056, "learning_rate": 0.00016586189492132566, "loss": 2.0381, "step": 414 }, { "epoch": 0.06726366546456501, "grad_norm": 0.8878350257873535, "learning_rate": 0.00016570309259822453, "loss": 2.0379, "step": 415 }, { "epoch": 0.06742574658616637, "grad_norm": 1.1589128971099854, "learning_rate": 0.0001655439981876987, "loss": 2.0535, "step": 416 }, { "epoch": 0.06758782770776774, "grad_norm": 0.877856433391571, "learning_rate": 0.00016538461239701277, "loss": 2.0592, "step": 417 }, { "epoch": 0.0677499088293691, "grad_norm": 1.0968033075332642, "learning_rate": 0.00016522493593472683, "loss": 2.071, "step": 418 }, { "epoch": 0.06791198995097046, "grad_norm": 0.8311752676963806, "learning_rate": 0.0001650649695106931, "loss": 1.9332, "step": 419 }, { "epoch": 0.06807407107257182, "grad_norm": 0.9262987375259399, "learning_rate": 0.00016490471383605288, "loss": 2.0505, "step": 420 }, { "epoch": 0.06823615219417319, "grad_norm": 0.8824668526649475, "learning_rate": 0.00016474416962323325, "loss": 2.047, "step": 421 }, { "epoch": 0.06839823331577455, "grad_norm": 0.9121283292770386, "learning_rate": 0.00016458333758594414, "loss": 2.19, "step": 422 }, { "epoch": 0.06856031443737591, "grad_norm": 0.9066396355628967, "learning_rate": 0.00016442221843917496, "loss": 2.125, "step": 423 }, { "epoch": 0.06872239555897727, "grad_norm": 0.9347816109657288, "learning_rate": 0.00016426081289919143, "loss": 2.12, "step": 424 }, { "epoch": 0.06888447668057864, "grad_norm": 0.9283341765403748, "learning_rate": 0.0001640991216835326, "loss": 2.0683, "step": 425 }, { "epoch": 0.06904655780218, "grad_norm": 1.2320733070373535, "learning_rate": 0.00016393714551100734, "loss": 2.2133, "step": 426 }, { "epoch": 0.06920863892378135, "grad_norm": 0.9498825669288635, "learning_rate": 0.0001637748851016914, "loss": 2.0915, "step": 427 }, { "epoch": 0.06937072004538271, "grad_norm": 1.025742530822754, "learning_rate": 0.00016361234117692413, "loss": 2.0515, "step": 428 }, { "epoch": 0.06953280116698407, "grad_norm": 1.0089415311813354, "learning_rate": 0.00016344951445930526, "loss": 2.1095, "step": 429 }, { "epoch": 0.06969488228858543, "grad_norm": 1.0522841215133667, "learning_rate": 0.0001632864056726917, "loss": 2.2381, "step": 430 }, { "epoch": 0.0698569634101868, "grad_norm": 1.0336872339248657, "learning_rate": 0.00016312301554219426, "loss": 2.1262, "step": 431 }, { "epoch": 0.07001904453178816, "grad_norm": 1.1000628471374512, "learning_rate": 0.00016295934479417453, "loss": 2.2625, "step": 432 }, { "epoch": 0.07018112565338952, "grad_norm": 1.1140820980072021, "learning_rate": 0.00016279539415624164, "loss": 2.27, "step": 433 }, { "epoch": 0.07034320677499088, "grad_norm": 1.0736558437347412, "learning_rate": 0.0001626311643572489, "loss": 2.1662, "step": 434 }, { "epoch": 0.07050528789659224, "grad_norm": 1.2580877542495728, "learning_rate": 0.00016246665612729074, "loss": 2.2225, "step": 435 }, { "epoch": 0.07066736901819361, "grad_norm": 1.2165358066558838, "learning_rate": 0.00016230187019769928, "loss": 2.1325, "step": 436 }, { "epoch": 0.07082945013979497, "grad_norm": 1.0291610956192017, "learning_rate": 0.00016213680730104124, "loss": 2.0592, "step": 437 }, { "epoch": 0.07099153126139633, "grad_norm": 1.574893832206726, "learning_rate": 0.0001619714681711146, "loss": 2.3681, "step": 438 }, { "epoch": 0.0711536123829977, "grad_norm": 1.2416809797286987, "learning_rate": 0.00016180585354294536, "loss": 2.2693, "step": 439 }, { "epoch": 0.07131569350459906, "grad_norm": 1.3693636655807495, "learning_rate": 0.00016163996415278424, "loss": 2.387, "step": 440 }, { "epoch": 0.07147777462620042, "grad_norm": 1.3710639476776123, "learning_rate": 0.00016147380073810346, "loss": 2.4192, "step": 441 }, { "epoch": 0.07163985574780178, "grad_norm": 1.2685997486114502, "learning_rate": 0.0001613073640375934, "loss": 2.2619, "step": 442 }, { "epoch": 0.07180193686940313, "grad_norm": 1.3018382787704468, "learning_rate": 0.00016114065479115946, "loss": 2.3602, "step": 443 }, { "epoch": 0.07196401799100449, "grad_norm": 1.4972432851791382, "learning_rate": 0.00016097367373991842, "loss": 2.4239, "step": 444 }, { "epoch": 0.07212609911260585, "grad_norm": 1.4277229309082031, "learning_rate": 0.00016080642162619565, "loss": 2.511, "step": 445 }, { "epoch": 0.07228818023420722, "grad_norm": 1.787883996963501, "learning_rate": 0.0001606388991935214, "loss": 2.682, "step": 446 }, { "epoch": 0.07245026135580858, "grad_norm": 1.7695914506912231, "learning_rate": 0.0001604711071866277, "loss": 2.5689, "step": 447 }, { "epoch": 0.07261234247740994, "grad_norm": 2.118831157684326, "learning_rate": 0.00016030304635144494, "loss": 2.5306, "step": 448 }, { "epoch": 0.0727744235990113, "grad_norm": 2.5734965801239014, "learning_rate": 0.00016013471743509862, "loss": 2.8471, "step": 449 }, { "epoch": 0.07293650472061267, "grad_norm": 3.6286282539367676, "learning_rate": 0.00015996612118590603, "loss": 3.0702, "step": 450 }, { "epoch": 0.07309858584221403, "grad_norm": 1.1082240343093872, "learning_rate": 0.00015979725835337294, "loss": 2.3496, "step": 451 }, { "epoch": 0.07326066696381539, "grad_norm": 0.9374189376831055, "learning_rate": 0.00015962812968819016, "loss": 2.1217, "step": 452 }, { "epoch": 0.07342274808541675, "grad_norm": 1.0230820178985596, "learning_rate": 0.0001594587359422303, "loss": 2.0076, "step": 453 }, { "epoch": 0.07358482920701812, "grad_norm": 0.959244966506958, "learning_rate": 0.0001592890778685444, "loss": 1.8903, "step": 454 }, { "epoch": 0.07374691032861948, "grad_norm": 0.9159017205238342, "learning_rate": 0.00015911915622135862, "loss": 2.0109, "step": 455 }, { "epoch": 0.07390899145022084, "grad_norm": 1.018968105316162, "learning_rate": 0.00015894897175607086, "loss": 2.0448, "step": 456 }, { "epoch": 0.0740710725718222, "grad_norm": 0.9271405935287476, "learning_rate": 0.00015877852522924732, "loss": 1.986, "step": 457 }, { "epoch": 0.07423315369342356, "grad_norm": 1.0381321907043457, "learning_rate": 0.00015860781739861928, "loss": 2.1234, "step": 458 }, { "epoch": 0.07439523481502491, "grad_norm": 0.9286422729492188, "learning_rate": 0.00015843684902307962, "loss": 1.9599, "step": 459 }, { "epoch": 0.07455731593662628, "grad_norm": 0.8693100810050964, "learning_rate": 0.00015826562086267956, "loss": 2.0004, "step": 460 }, { "epoch": 0.07471939705822764, "grad_norm": 0.8630943894386292, "learning_rate": 0.00015809413367862512, "loss": 1.9542, "step": 461 }, { "epoch": 0.074881478179829, "grad_norm": 0.8371545672416687, "learning_rate": 0.00015792238823327388, "loss": 1.9829, "step": 462 }, { "epoch": 0.07504355930143036, "grad_norm": 1.0650413036346436, "learning_rate": 0.00015775038529013152, "loss": 1.9757, "step": 463 }, { "epoch": 0.07520564042303172, "grad_norm": 0.930605411529541, "learning_rate": 0.0001575781256138485, "loss": 2.0803, "step": 464 }, { "epoch": 0.07536772154463309, "grad_norm": 0.9635065793991089, "learning_rate": 0.00015740560997021648, "loss": 2.0132, "step": 465 }, { "epoch": 0.07552980266623445, "grad_norm": 0.929509162902832, "learning_rate": 0.00015723283912616513, "loss": 2.0276, "step": 466 }, { "epoch": 0.07569188378783581, "grad_norm": 1.141801357269287, "learning_rate": 0.00015705981384975866, "loss": 2.043, "step": 467 }, { "epoch": 0.07585396490943717, "grad_norm": 0.9718663692474365, "learning_rate": 0.0001568865349101923, "loss": 2.0888, "step": 468 }, { "epoch": 0.07601604603103854, "grad_norm": 0.9537129998207092, "learning_rate": 0.00015671300307778898, "loss": 2.0603, "step": 469 }, { "epoch": 0.0761781271526399, "grad_norm": 0.9165297746658325, "learning_rate": 0.00015653921912399589, "loss": 2.0427, "step": 470 }, { "epoch": 0.07634020827424126, "grad_norm": 0.9288861751556396, "learning_rate": 0.00015636518382138107, "loss": 2.0051, "step": 471 }, { "epoch": 0.07650228939584262, "grad_norm": 0.9579792022705078, "learning_rate": 0.0001561908979436299, "loss": 2.0354, "step": 472 }, { "epoch": 0.07666437051744399, "grad_norm": 0.9760196208953857, "learning_rate": 0.00015601636226554168, "loss": 2.1132, "step": 473 }, { "epoch": 0.07682645163904535, "grad_norm": 0.9693097472190857, "learning_rate": 0.00015584157756302634, "loss": 2.0307, "step": 474 }, { "epoch": 0.0769885327606467, "grad_norm": 0.9873945116996765, "learning_rate": 0.0001556665446131007, "loss": 2.1233, "step": 475 }, { "epoch": 0.07715061388224806, "grad_norm": 0.9827134013175964, "learning_rate": 0.00015549126419388536, "loss": 1.9541, "step": 476 }, { "epoch": 0.07731269500384942, "grad_norm": 1.073141098022461, "learning_rate": 0.0001553157370846009, "loss": 2.0755, "step": 477 }, { "epoch": 0.07747477612545078, "grad_norm": 0.9997089505195618, "learning_rate": 0.00015513996406556465, "loss": 2.1132, "step": 478 }, { "epoch": 0.07763685724705215, "grad_norm": 1.0285980701446533, "learning_rate": 0.00015496394591818716, "loss": 2.1153, "step": 479 }, { "epoch": 0.07779893836865351, "grad_norm": 1.0112553834915161, "learning_rate": 0.0001547876834249687, "loss": 2.1579, "step": 480 }, { "epoch": 0.07796101949025487, "grad_norm": 1.0673810243606567, "learning_rate": 0.00015461117736949577, "loss": 2.1398, "step": 481 }, { "epoch": 0.07812310061185623, "grad_norm": 1.0069586038589478, "learning_rate": 0.00015443442853643762, "loss": 2.135, "step": 482 }, { "epoch": 0.0782851817334576, "grad_norm": 1.0911566019058228, "learning_rate": 0.00015425743771154294, "loss": 2.1956, "step": 483 }, { "epoch": 0.07844726285505896, "grad_norm": 1.1652536392211914, "learning_rate": 0.00015408020568163602, "loss": 2.1024, "step": 484 }, { "epoch": 0.07860934397666032, "grad_norm": 1.2288628816604614, "learning_rate": 0.00015390273323461352, "loss": 2.2401, "step": 485 }, { "epoch": 0.07877142509826168, "grad_norm": 1.0709394216537476, "learning_rate": 0.0001537250211594409, "loss": 2.0335, "step": 486 }, { "epoch": 0.07893350621986304, "grad_norm": 1.1880048513412476, "learning_rate": 0.0001535470702461489, "loss": 2.182, "step": 487 }, { "epoch": 0.07909558734146441, "grad_norm": 1.2499237060546875, "learning_rate": 0.00015336888128583, "loss": 2.408, "step": 488 }, { "epoch": 0.07925766846306577, "grad_norm": 1.2968357801437378, "learning_rate": 0.000153190455070635, "loss": 2.4277, "step": 489 }, { "epoch": 0.07941974958466713, "grad_norm": 1.2639665603637695, "learning_rate": 0.00015301179239376938, "loss": 2.3629, "step": 490 }, { "epoch": 0.0795818307062685, "grad_norm": 1.3755820989608765, "learning_rate": 0.00015283289404948976, "loss": 2.618, "step": 491 }, { "epoch": 0.07974391182786984, "grad_norm": 1.5569093227386475, "learning_rate": 0.0001526537608331006, "loss": 2.4789, "step": 492 }, { "epoch": 0.0799059929494712, "grad_norm": 1.4523049592971802, "learning_rate": 0.00015247439354095041, "loss": 2.7411, "step": 493 }, { "epoch": 0.08006807407107257, "grad_norm": 1.5402491092681885, "learning_rate": 0.00015229479297042823, "loss": 2.6944, "step": 494 }, { "epoch": 0.08023015519267393, "grad_norm": 1.6666029691696167, "learning_rate": 0.00015211495991996027, "loss": 2.5658, "step": 495 }, { "epoch": 0.08039223631427529, "grad_norm": 1.840347170829773, "learning_rate": 0.0001519348951890062, "loss": 2.8292, "step": 496 }, { "epoch": 0.08055431743587665, "grad_norm": 2.0858819484710693, "learning_rate": 0.0001517545995780556, "loss": 2.8278, "step": 497 }, { "epoch": 0.08071639855747802, "grad_norm": 2.174330711364746, "learning_rate": 0.00015157407388862452, "loss": 2.7114, "step": 498 }, { "epoch": 0.08087847967907938, "grad_norm": 2.6221251487731934, "learning_rate": 0.00015139331892325179, "loss": 2.9763, "step": 499 }, { "epoch": 0.08104056080068074, "grad_norm": 3.545398712158203, "learning_rate": 0.0001512123354854955, "loss": 3.303, "step": 500 }, { "epoch": 0.0812026419222821, "grad_norm": 1.182886004447937, "learning_rate": 0.0001510311243799295, "loss": 2.2017, "step": 501 }, { "epoch": 0.08136472304388347, "grad_norm": 1.0625380277633667, "learning_rate": 0.00015084968641213958, "loss": 2.0697, "step": 502 }, { "epoch": 0.08152680416548483, "grad_norm": 0.854444146156311, "learning_rate": 0.00015066802238872023, "loss": 1.9969, "step": 503 }, { "epoch": 0.08168888528708619, "grad_norm": 0.9453656673431396, "learning_rate": 0.0001504861331172709, "loss": 2.0006, "step": 504 }, { "epoch": 0.08185096640868755, "grad_norm": 0.9780434966087341, "learning_rate": 0.0001503040194063922, "loss": 1.9021, "step": 505 }, { "epoch": 0.08201304753028892, "grad_norm": 0.9137747287750244, "learning_rate": 0.00015012168206568268, "loss": 1.9835, "step": 506 }, { "epoch": 0.08217512865189028, "grad_norm": 0.9674559235572815, "learning_rate": 0.00014993912190573505, "loss": 2.1078, "step": 507 }, { "epoch": 0.08233720977349163, "grad_norm": 0.9159846901893616, "learning_rate": 0.00014975633973813242, "loss": 2.0036, "step": 508 }, { "epoch": 0.08249929089509299, "grad_norm": 0.9136930108070374, "learning_rate": 0.00014957333637544503, "loss": 1.9723, "step": 509 }, { "epoch": 0.08266137201669435, "grad_norm": 0.9637822508811951, "learning_rate": 0.00014939011263122634, "loss": 1.9701, "step": 510 }, { "epoch": 0.08282345313829571, "grad_norm": 0.8497329354286194, "learning_rate": 0.0001492066693200096, "loss": 1.9886, "step": 511 }, { "epoch": 0.08298553425989708, "grad_norm": 0.9407211542129517, "learning_rate": 0.00014902300725730413, "loss": 2.1275, "step": 512 }, { "epoch": 0.08314761538149844, "grad_norm": 0.9027053713798523, "learning_rate": 0.00014883912725959167, "loss": 2.1159, "step": 513 }, { "epoch": 0.0833096965030998, "grad_norm": 1.0457254648208618, "learning_rate": 0.00014865503014432292, "loss": 2.0784, "step": 514 }, { "epoch": 0.08347177762470116, "grad_norm": 1.1884881258010864, "learning_rate": 0.00014847071672991367, "loss": 2.0567, "step": 515 }, { "epoch": 0.08363385874630253, "grad_norm": 0.9630997776985168, "learning_rate": 0.0001482861878357414, "loss": 2.1544, "step": 516 }, { "epoch": 0.08379593986790389, "grad_norm": 0.8533034324645996, "learning_rate": 0.00014810144428214144, "loss": 1.8446, "step": 517 }, { "epoch": 0.08395802098950525, "grad_norm": 1.1944236755371094, "learning_rate": 0.0001479164868904034, "loss": 2.0542, "step": 518 }, { "epoch": 0.08412010211110661, "grad_norm": 0.9619053602218628, "learning_rate": 0.00014773131648276758, "loss": 1.9975, "step": 519 }, { "epoch": 0.08428218323270797, "grad_norm": 1.2170730829238892, "learning_rate": 0.00014754593388242117, "loss": 2.1551, "step": 520 }, { "epoch": 0.08444426435430934, "grad_norm": 0.8952792286872864, "learning_rate": 0.0001473603399134948, "loss": 1.9141, "step": 521 }, { "epoch": 0.0846063454759107, "grad_norm": 1.1837362051010132, "learning_rate": 0.0001471745354010586, "loss": 2.0866, "step": 522 }, { "epoch": 0.08476842659751206, "grad_norm": 1.183732032775879, "learning_rate": 0.00014698852117111884, "loss": 2.1341, "step": 523 }, { "epoch": 0.08493050771911341, "grad_norm": 1.1048567295074463, "learning_rate": 0.000146802298050614, "loss": 2.0361, "step": 524 }, { "epoch": 0.08509258884071477, "grad_norm": 1.001400113105774, "learning_rate": 0.0001466158668674112, "loss": 2.035, "step": 525 }, { "epoch": 0.08525466996231613, "grad_norm": 0.9850467443466187, "learning_rate": 0.00014642922845030257, "loss": 2.0974, "step": 526 }, { "epoch": 0.0854167510839175, "grad_norm": 1.166247010231018, "learning_rate": 0.0001462423836290015, "loss": 2.1661, "step": 527 }, { "epoch": 0.08557883220551886, "grad_norm": 0.9725326299667358, "learning_rate": 0.00014605533323413887, "loss": 1.9392, "step": 528 }, { "epoch": 0.08574091332712022, "grad_norm": 0.9696087837219238, "learning_rate": 0.00014586807809725962, "loss": 2.0367, "step": 529 }, { "epoch": 0.08590299444872158, "grad_norm": 1.0696322917938232, "learning_rate": 0.00014568061905081875, "loss": 2.1808, "step": 530 }, { "epoch": 0.08606507557032295, "grad_norm": 1.1029514074325562, "learning_rate": 0.00014549295692817778, "loss": 2.1904, "step": 531 }, { "epoch": 0.08622715669192431, "grad_norm": 1.0270493030548096, "learning_rate": 0.00014530509256360102, "loss": 2.0564, "step": 532 }, { "epoch": 0.08638923781352567, "grad_norm": 1.3101474046707153, "learning_rate": 0.00014511702679225193, "loss": 2.1255, "step": 533 }, { "epoch": 0.08655131893512703, "grad_norm": 1.2178043127059937, "learning_rate": 0.0001449287604501893, "loss": 2.1088, "step": 534 }, { "epoch": 0.0867134000567284, "grad_norm": 1.0532468557357788, "learning_rate": 0.00014474029437436348, "loss": 2.0873, "step": 535 }, { "epoch": 0.08687548117832976, "grad_norm": 1.3009625673294067, "learning_rate": 0.00014455162940261285, "loss": 2.0877, "step": 536 }, { "epoch": 0.08703756229993112, "grad_norm": 1.366796612739563, "learning_rate": 0.0001443627663736599, "loss": 2.1198, "step": 537 }, { "epoch": 0.08719964342153248, "grad_norm": 1.2497854232788086, "learning_rate": 0.00014417370612710778, "loss": 2.3931, "step": 538 }, { "epoch": 0.08736172454313385, "grad_norm": 1.2573349475860596, "learning_rate": 0.00014398444950343623, "loss": 2.3961, "step": 539 }, { "epoch": 0.0875238056647352, "grad_norm": 1.3254156112670898, "learning_rate": 0.00014379499734399798, "loss": 2.4867, "step": 540 }, { "epoch": 0.08768588678633656, "grad_norm": 1.6568411588668823, "learning_rate": 0.0001436053504910151, "loss": 2.4315, "step": 541 }, { "epoch": 0.08784796790793792, "grad_norm": 1.3078749179840088, "learning_rate": 0.0001434155097875752, "loss": 2.26, "step": 542 }, { "epoch": 0.08801004902953928, "grad_norm": 1.433417797088623, "learning_rate": 0.00014322547607762762, "loss": 2.4982, "step": 543 }, { "epoch": 0.08817213015114064, "grad_norm": 1.4297300577163696, "learning_rate": 0.0001430352502059797, "loss": 2.5665, "step": 544 }, { "epoch": 0.088334211272742, "grad_norm": 1.716572642326355, "learning_rate": 0.0001428448330182931, "loss": 2.7043, "step": 545 }, { "epoch": 0.08849629239434337, "grad_norm": 1.8622252941131592, "learning_rate": 0.00014265422536107993, "loss": 2.369, "step": 546 }, { "epoch": 0.08865837351594473, "grad_norm": 1.7725635766983032, "learning_rate": 0.00014246342808169914, "loss": 2.6181, "step": 547 }, { "epoch": 0.08882045463754609, "grad_norm": 1.9221594333648682, "learning_rate": 0.00014227244202835257, "loss": 2.5098, "step": 548 }, { "epoch": 0.08898253575914745, "grad_norm": 2.227409839630127, "learning_rate": 0.0001420812680500813, "loss": 2.6636, "step": 549 }, { "epoch": 0.08914461688074882, "grad_norm": 3.6198713779449463, "learning_rate": 0.00014188990699676184, "loss": 2.9657, "step": 550 }, { "epoch": 0.08930669800235018, "grad_norm": 0.9082000255584717, "learning_rate": 0.00014169835971910238, "loss": 2.1088, "step": 551 }, { "epoch": 0.08946877912395154, "grad_norm": 0.919455885887146, "learning_rate": 0.0001415066270686389, "loss": 2.106, "step": 552 }, { "epoch": 0.0896308602455529, "grad_norm": 0.8734371662139893, "learning_rate": 0.00014131470989773158, "loss": 1.9932, "step": 553 }, { "epoch": 0.08979294136715427, "grad_norm": 0.8582708239555359, "learning_rate": 0.0001411226090595608, "loss": 1.8642, "step": 554 }, { "epoch": 0.08995502248875563, "grad_norm": 0.8578787446022034, "learning_rate": 0.00014093032540812348, "loss": 1.8948, "step": 555 }, { "epoch": 0.09011710361035698, "grad_norm": 0.8347747325897217, "learning_rate": 0.0001407378597982293, "loss": 1.9868, "step": 556 }, { "epoch": 0.09027918473195834, "grad_norm": 0.8815526962280273, "learning_rate": 0.00014054521308549673, "loss": 2.0765, "step": 557 }, { "epoch": 0.0904412658535597, "grad_norm": 0.9284449219703674, "learning_rate": 0.0001403523861263495, "loss": 1.9122, "step": 558 }, { "epoch": 0.09060334697516106, "grad_norm": 0.8774821162223816, "learning_rate": 0.00014015937977801256, "loss": 1.907, "step": 559 }, { "epoch": 0.09076542809676243, "grad_norm": 0.927590012550354, "learning_rate": 0.00013996619489850822, "loss": 1.9296, "step": 560 }, { "epoch": 0.09092750921836379, "grad_norm": 1.0081753730773926, "learning_rate": 0.00013977283234665273, "loss": 2.0729, "step": 561 }, { "epoch": 0.09108959033996515, "grad_norm": 0.8960214257240295, "learning_rate": 0.00013957929298205195, "loss": 2.0994, "step": 562 }, { "epoch": 0.09125167146156651, "grad_norm": 1.0175071954727173, "learning_rate": 0.00013938557766509792, "loss": 2.0466, "step": 563 }, { "epoch": 0.09141375258316788, "grad_norm": 0.9070597290992737, "learning_rate": 0.0001391916872569648, "loss": 1.9866, "step": 564 }, { "epoch": 0.09157583370476924, "grad_norm": 0.979101836681366, "learning_rate": 0.00013899762261960518, "loss": 1.9367, "step": 565 }, { "epoch": 0.0917379148263706, "grad_norm": 1.0428049564361572, "learning_rate": 0.0001388033846157462, "loss": 1.9654, "step": 566 }, { "epoch": 0.09189999594797196, "grad_norm": 1.0069422721862793, "learning_rate": 0.0001386089741088857, "loss": 2.0254, "step": 567 }, { "epoch": 0.09206207706957333, "grad_norm": 0.9754946827888489, "learning_rate": 0.00013841439196328836, "loss": 2.0594, "step": 568 }, { "epoch": 0.09222415819117469, "grad_norm": 1.0194846391677856, "learning_rate": 0.00013821963904398193, "loss": 1.9216, "step": 569 }, { "epoch": 0.09238623931277605, "grad_norm": 0.9197936654090881, "learning_rate": 0.00013802471621675338, "loss": 1.8856, "step": 570 }, { "epoch": 0.09254832043437741, "grad_norm": 1.1789802312850952, "learning_rate": 0.00013782962434814492, "loss": 2.1333, "step": 571 }, { "epoch": 0.09271040155597876, "grad_norm": 1.0808286666870117, "learning_rate": 0.00013763436430545034, "loss": 2.0365, "step": 572 }, { "epoch": 0.09287248267758012, "grad_norm": 1.0070189237594604, "learning_rate": 0.00013743893695671096, "loss": 1.9297, "step": 573 }, { "epoch": 0.09303456379918149, "grad_norm": 1.1577085256576538, "learning_rate": 0.00013724334317071198, "loss": 2.0753, "step": 574 }, { "epoch": 0.09319664492078285, "grad_norm": 1.0902458429336548, "learning_rate": 0.00013704758381697844, "loss": 1.9515, "step": 575 }, { "epoch": 0.09335872604238421, "grad_norm": 1.0244091749191284, "learning_rate": 0.00013685165976577146, "loss": 2.0309, "step": 576 }, { "epoch": 0.09352080716398557, "grad_norm": 0.9980378150939941, "learning_rate": 0.0001366555718880843, "loss": 2.0269, "step": 577 }, { "epoch": 0.09368288828558693, "grad_norm": 1.028735876083374, "learning_rate": 0.00013645932105563844, "loss": 2.0516, "step": 578 }, { "epoch": 0.0938449694071883, "grad_norm": 1.1166259050369263, "learning_rate": 0.00013626290814088005, "loss": 2.1411, "step": 579 }, { "epoch": 0.09400705052878966, "grad_norm": 1.214897871017456, "learning_rate": 0.00013606633401697557, "loss": 1.9371, "step": 580 }, { "epoch": 0.09416913165039102, "grad_norm": 1.0801588296890259, "learning_rate": 0.00013586959955780824, "loss": 2.0996, "step": 581 }, { "epoch": 0.09433121277199238, "grad_norm": 1.081018090248108, "learning_rate": 0.00013567270563797398, "loss": 2.0832, "step": 582 }, { "epoch": 0.09449329389359375, "grad_norm": 1.1935162544250488, "learning_rate": 0.00013547565313277776, "loss": 2.1576, "step": 583 }, { "epoch": 0.09465537501519511, "grad_norm": 1.085982084274292, "learning_rate": 0.00013527844291822948, "loss": 2.0502, "step": 584 }, { "epoch": 0.09481745613679647, "grad_norm": 1.1478854417800903, "learning_rate": 0.0001350810758710401, "loss": 2.1914, "step": 585 }, { "epoch": 0.09497953725839783, "grad_norm": 1.228962779045105, "learning_rate": 0.00013488355286861783, "loss": 2.0726, "step": 586 }, { "epoch": 0.0951416183799992, "grad_norm": 1.2617672681808472, "learning_rate": 0.0001346858747890642, "loss": 2.189, "step": 587 }, { "epoch": 0.09530369950160054, "grad_norm": 1.2520960569381714, "learning_rate": 0.00013448804251117003, "loss": 2.2756, "step": 588 }, { "epoch": 0.0954657806232019, "grad_norm": 1.3662278652191162, "learning_rate": 0.0001342900569144119, "loss": 2.2647, "step": 589 }, { "epoch": 0.09562786174480327, "grad_norm": 1.2935956716537476, "learning_rate": 0.0001340919188789477, "loss": 2.5557, "step": 590 }, { "epoch": 0.09578994286640463, "grad_norm": 1.3315271139144897, "learning_rate": 0.00013389362928561317, "loss": 2.2852, "step": 591 }, { "epoch": 0.095952023988006, "grad_norm": 1.469378113746643, "learning_rate": 0.00013369518901591772, "loss": 2.3789, "step": 592 }, { "epoch": 0.09611410510960736, "grad_norm": 1.4220696687698364, "learning_rate": 0.00013349659895204067, "loss": 2.3764, "step": 593 }, { "epoch": 0.09627618623120872, "grad_norm": 1.6237775087356567, "learning_rate": 0.0001332978599768272, "loss": 2.407, "step": 594 }, { "epoch": 0.09643826735281008, "grad_norm": 1.833509922027588, "learning_rate": 0.00013309897297378455, "loss": 2.5151, "step": 595 }, { "epoch": 0.09660034847441144, "grad_norm": 1.6859548091888428, "learning_rate": 0.00013289993882707797, "loss": 2.715, "step": 596 }, { "epoch": 0.0967624295960128, "grad_norm": 2.123373508453369, "learning_rate": 0.00013270075842152678, "loss": 2.4141, "step": 597 }, { "epoch": 0.09692451071761417, "grad_norm": 2.3691325187683105, "learning_rate": 0.00013250143264260074, "loss": 2.7913, "step": 598 }, { "epoch": 0.09708659183921553, "grad_norm": 2.886382579803467, "learning_rate": 0.0001323019623764156, "loss": 2.8778, "step": 599 }, { "epoch": 0.09724867296081689, "grad_norm": 3.866049289703369, "learning_rate": 0.00013210234850972964, "loss": 3.4114, "step": 600 }, { "epoch": 0.09741075408241825, "grad_norm": 1.071763038635254, "learning_rate": 0.0001319025919299394, "loss": 2.2168, "step": 601 }, { "epoch": 0.09757283520401962, "grad_norm": 1.2165462970733643, "learning_rate": 0.00013170269352507597, "loss": 2.1014, "step": 602 }, { "epoch": 0.09773491632562098, "grad_norm": 0.9065914750099182, "learning_rate": 0.0001315026541838008, "loss": 2.0618, "step": 603 }, { "epoch": 0.09789699744722233, "grad_norm": 1.0970734357833862, "learning_rate": 0.00013130247479540202, "loss": 2.0379, "step": 604 }, { "epoch": 0.09805907856882369, "grad_norm": 1.0984913110733032, "learning_rate": 0.00013110215624979025, "loss": 2.0136, "step": 605 }, { "epoch": 0.09822115969042505, "grad_norm": 0.9368547797203064, "learning_rate": 0.00013090169943749476, "loss": 2.113, "step": 606 }, { "epoch": 0.09838324081202642, "grad_norm": 1.2285770177841187, "learning_rate": 0.00013070110524965954, "loss": 1.9133, "step": 607 }, { "epoch": 0.09854532193362778, "grad_norm": 0.9251296520233154, "learning_rate": 0.00013050037457803924, "loss": 1.995, "step": 608 }, { "epoch": 0.09870740305522914, "grad_norm": 0.9772763252258301, "learning_rate": 0.0001302995083149953, "loss": 1.8804, "step": 609 }, { "epoch": 0.0988694841768305, "grad_norm": 1.0714612007141113, "learning_rate": 0.0001300985073534919, "loss": 1.9533, "step": 610 }, { "epoch": 0.09903156529843186, "grad_norm": 0.8222501277923584, "learning_rate": 0.00012989737258709203, "loss": 1.8502, "step": 611 }, { "epoch": 0.09919364642003323, "grad_norm": 1.061041235923767, "learning_rate": 0.00012969610490995358, "loss": 1.8848, "step": 612 }, { "epoch": 0.09935572754163459, "grad_norm": 1.0173934698104858, "learning_rate": 0.00012949470521682528, "loss": 1.8873, "step": 613 }, { "epoch": 0.09951780866323595, "grad_norm": 0.8259528875350952, "learning_rate": 0.0001292931744030427, "loss": 1.876, "step": 614 }, { "epoch": 0.09967988978483731, "grad_norm": 0.8482165336608887, "learning_rate": 0.0001290915133645243, "loss": 2.0685, "step": 615 }, { "epoch": 0.09984197090643868, "grad_norm": 0.9458348155021667, "learning_rate": 0.00012888972299776754, "loss": 2.0443, "step": 616 }, { "epoch": 0.10000405202804004, "grad_norm": 0.9534991383552551, "learning_rate": 0.00012868780419984482, "loss": 1.9131, "step": 617 }, { "epoch": 0.1001661331496414, "grad_norm": 0.9313825368881226, "learning_rate": 0.00012848575786839943, "loss": 1.9108, "step": 618 }, { "epoch": 0.10032821427124276, "grad_norm": 0.8526967763900757, "learning_rate": 0.0001282835849016416, "loss": 1.9619, "step": 619 }, { "epoch": 0.10049029539284411, "grad_norm": 0.9859809279441833, "learning_rate": 0.00012808128619834461, "loss": 1.9927, "step": 620 }, { "epoch": 0.10065237651444547, "grad_norm": 1.0257471799850464, "learning_rate": 0.0001278788626578407, "loss": 2.0363, "step": 621 }, { "epoch": 0.10081445763604684, "grad_norm": 0.8856364488601685, "learning_rate": 0.00012767631518001698, "loss": 2.0012, "step": 622 }, { "epoch": 0.1009765387576482, "grad_norm": 0.979571521282196, "learning_rate": 0.00012747364466531163, "loss": 2.0095, "step": 623 }, { "epoch": 0.10113861987924956, "grad_norm": 1.0153757333755493, "learning_rate": 0.00012727085201470973, "loss": 1.886, "step": 624 }, { "epoch": 0.10130070100085092, "grad_norm": 0.9400056600570679, "learning_rate": 0.00012706793812973941, "loss": 1.9164, "step": 625 }, { "epoch": 0.10146278212245229, "grad_norm": 0.9391161799430847, "learning_rate": 0.0001268649039124677, "loss": 1.9919, "step": 626 }, { "epoch": 0.10162486324405365, "grad_norm": 0.9559611678123474, "learning_rate": 0.00012666175026549662, "loss": 2.0035, "step": 627 }, { "epoch": 0.10178694436565501, "grad_norm": 1.0190401077270508, "learning_rate": 0.000126458478091959, "loss": 1.972, "step": 628 }, { "epoch": 0.10194902548725637, "grad_norm": 1.0643559694290161, "learning_rate": 0.00012625508829551473, "loss": 2.1395, "step": 629 }, { "epoch": 0.10211110660885774, "grad_norm": 1.0885188579559326, "learning_rate": 0.00012605158178034654, "loss": 2.1064, "step": 630 }, { "epoch": 0.1022731877304591, "grad_norm": 1.1046760082244873, "learning_rate": 0.00012584795945115603, "loss": 2.0215, "step": 631 }, { "epoch": 0.10243526885206046, "grad_norm": 1.0968583822250366, "learning_rate": 0.0001256442222131597, "loss": 2.1134, "step": 632 }, { "epoch": 0.10259734997366182, "grad_norm": 1.0828778743743896, "learning_rate": 0.0001254403709720848, "loss": 2.067, "step": 633 }, { "epoch": 0.10275943109526318, "grad_norm": 1.1520904302597046, "learning_rate": 0.0001252364066341655, "loss": 2.2446, "step": 634 }, { "epoch": 0.10292151221686455, "grad_norm": 1.0999350547790527, "learning_rate": 0.00012503233010613865, "loss": 2.0136, "step": 635 }, { "epoch": 0.1030835933384659, "grad_norm": 1.2035555839538574, "learning_rate": 0.00012482814229523997, "loss": 2.1282, "step": 636 }, { "epoch": 0.10324567446006726, "grad_norm": 1.1753383874893188, "learning_rate": 0.00012462384410919975, "loss": 2.256, "step": 637 }, { "epoch": 0.10340775558166862, "grad_norm": 1.226996898651123, "learning_rate": 0.00012441943645623903, "loss": 2.0919, "step": 638 }, { "epoch": 0.10356983670326998, "grad_norm": 1.136850118637085, "learning_rate": 0.00012421492024506555, "loss": 2.1027, "step": 639 }, { "epoch": 0.10373191782487134, "grad_norm": 1.231503963470459, "learning_rate": 0.00012401029638486953, "loss": 2.3625, "step": 640 }, { "epoch": 0.10389399894647271, "grad_norm": 1.5343997478485107, "learning_rate": 0.0001238055657853198, "loss": 2.2502, "step": 641 }, { "epoch": 0.10405608006807407, "grad_norm": 1.4711858034133911, "learning_rate": 0.00012360072935655982, "loss": 2.4321, "step": 642 }, { "epoch": 0.10421816118967543, "grad_norm": 1.310766577720642, "learning_rate": 0.00012339578800920332, "loss": 2.2328, "step": 643 }, { "epoch": 0.1043802423112768, "grad_norm": 1.4926822185516357, "learning_rate": 0.00012319074265433063, "loss": 2.4092, "step": 644 }, { "epoch": 0.10454232343287816, "grad_norm": 1.4710139036178589, "learning_rate": 0.00012298559420348437, "loss": 2.481, "step": 645 }, { "epoch": 0.10470440455447952, "grad_norm": 1.7710152864456177, "learning_rate": 0.00012278034356866545, "loss": 2.4981, "step": 646 }, { "epoch": 0.10486648567608088, "grad_norm": 1.8097518682479858, "learning_rate": 0.00012257499166232907, "loss": 2.6765, "step": 647 }, { "epoch": 0.10502856679768224, "grad_norm": 2.0177907943725586, "learning_rate": 0.0001223695393973807, "loss": 2.6031, "step": 648 }, { "epoch": 0.1051906479192836, "grad_norm": 2.2055444717407227, "learning_rate": 0.0001221639876871719, "loss": 2.6634, "step": 649 }, { "epoch": 0.10535272904088497, "grad_norm": 3.3884146213531494, "learning_rate": 0.0001219583374454963, "loss": 3.1642, "step": 650 }, { "epoch": 0.10551481016248633, "grad_norm": 0.9355396628379822, "learning_rate": 0.00012175258958658564, "loss": 2.2199, "step": 651 }, { "epoch": 0.10567689128408768, "grad_norm": 0.9128090739250183, "learning_rate": 0.00012154674502510555, "loss": 2.2072, "step": 652 }, { "epoch": 0.10583897240568904, "grad_norm": 0.7615045309066772, "learning_rate": 0.00012134080467615159, "loss": 1.9128, "step": 653 }, { "epoch": 0.1060010535272904, "grad_norm": 0.7822588682174683, "learning_rate": 0.00012113476945524513, "loss": 1.8426, "step": 654 }, { "epoch": 0.10616313464889177, "grad_norm": 0.8321034908294678, "learning_rate": 0.00012092864027832933, "loss": 1.9809, "step": 655 }, { "epoch": 0.10632521577049313, "grad_norm": 0.8651915788650513, "learning_rate": 0.000120722418061765, "loss": 1.9756, "step": 656 }, { "epoch": 0.10648729689209449, "grad_norm": 0.9207678437232971, "learning_rate": 0.0001205161037223266, "loss": 1.9541, "step": 657 }, { "epoch": 0.10664937801369585, "grad_norm": 0.8429062366485596, "learning_rate": 0.00012030969817719808, "loss": 1.8338, "step": 658 }, { "epoch": 0.10681145913529722, "grad_norm": 0.8811150789260864, "learning_rate": 0.00012010320234396894, "loss": 1.8557, "step": 659 }, { "epoch": 0.10697354025689858, "grad_norm": 1.0047332048416138, "learning_rate": 0.00011989661714062999, "loss": 1.9776, "step": 660 }, { "epoch": 0.10713562137849994, "grad_norm": 0.9004315137863159, "learning_rate": 0.0001196899434855693, "loss": 2.0346, "step": 661 }, { "epoch": 0.1072977025001013, "grad_norm": 0.9530714750289917, "learning_rate": 0.00011948318229756827, "loss": 1.8889, "step": 662 }, { "epoch": 0.10745978362170266, "grad_norm": 1.1834092140197754, "learning_rate": 0.00011927633449579735, "loss": 1.9331, "step": 663 }, { "epoch": 0.10762186474330403, "grad_norm": 0.9460235834121704, "learning_rate": 0.0001190694009998121, "loss": 1.9315, "step": 664 }, { "epoch": 0.10778394586490539, "grad_norm": 1.031552791595459, "learning_rate": 0.00011886238272954897, "loss": 1.8497, "step": 665 }, { "epoch": 0.10794602698650675, "grad_norm": 1.3479676246643066, "learning_rate": 0.00011865528060532127, "loss": 2.0665, "step": 666 }, { "epoch": 0.10810810810810811, "grad_norm": 1.033022403717041, "learning_rate": 0.0001184480955478152, "loss": 1.9557, "step": 667 }, { "epoch": 0.10827018922970948, "grad_norm": 1.038385272026062, "learning_rate": 0.00011824082847808558, "loss": 1.8925, "step": 668 }, { "epoch": 0.10843227035131082, "grad_norm": 1.3555527925491333, "learning_rate": 0.00011803348031755179, "loss": 1.8727, "step": 669 }, { "epoch": 0.10859435147291219, "grad_norm": 1.0465887784957886, "learning_rate": 0.0001178260519879937, "loss": 1.8651, "step": 670 }, { "epoch": 0.10875643259451355, "grad_norm": 0.9604950547218323, "learning_rate": 0.00011761854441154767, "loss": 1.934, "step": 671 }, { "epoch": 0.10891851371611491, "grad_norm": 1.2489222288131714, "learning_rate": 0.00011741095851070228, "loss": 2.0186, "step": 672 }, { "epoch": 0.10908059483771627, "grad_norm": 1.0158320665359497, "learning_rate": 0.00011720329520829429, "loss": 1.8764, "step": 673 }, { "epoch": 0.10924267595931764, "grad_norm": 1.0656780004501343, "learning_rate": 0.0001169955554275046, "loss": 2.0056, "step": 674 }, { "epoch": 0.109404757080919, "grad_norm": 1.0877074003219604, "learning_rate": 0.0001167877400918541, "loss": 1.9513, "step": 675 }, { "epoch": 0.10956683820252036, "grad_norm": 1.2851334810256958, "learning_rate": 0.00011657985012519952, "loss": 1.9123, "step": 676 }, { "epoch": 0.10972891932412172, "grad_norm": 1.251997709274292, "learning_rate": 0.00011637188645172944, "loss": 2.0518, "step": 677 }, { "epoch": 0.10989100044572309, "grad_norm": 0.975654125213623, "learning_rate": 0.00011616384999596006, "loss": 2.0128, "step": 678 }, { "epoch": 0.11005308156732445, "grad_norm": 1.223952293395996, "learning_rate": 0.00011595574168273111, "loss": 1.9713, "step": 679 }, { "epoch": 0.11021516268892581, "grad_norm": 0.9663240313529968, "learning_rate": 0.0001157475624372018, "loss": 2.0079, "step": 680 }, { "epoch": 0.11037724381052717, "grad_norm": 1.295284390449524, "learning_rate": 0.0001155393131848467, "loss": 2.0931, "step": 681 }, { "epoch": 0.11053932493212854, "grad_norm": 1.20632803440094, "learning_rate": 0.00011533099485145155, "loss": 2.0594, "step": 682 }, { "epoch": 0.1107014060537299, "grad_norm": 1.0678186416625977, "learning_rate": 0.00011512260836310924, "loss": 1.9552, "step": 683 }, { "epoch": 0.11086348717533126, "grad_norm": 1.171813726425171, "learning_rate": 0.00011491415464621562, "loss": 2.0355, "step": 684 }, { "epoch": 0.11102556829693261, "grad_norm": 1.2411259412765503, "learning_rate": 0.00011470563462746541, "loss": 2.126, "step": 685 }, { "epoch": 0.11118764941853397, "grad_norm": 1.493956446647644, "learning_rate": 0.00011449704923384812, "loss": 2.0421, "step": 686 }, { "epoch": 0.11134973054013533, "grad_norm": 1.1940127611160278, "learning_rate": 0.00011428839939264382, "loss": 2.0215, "step": 687 }, { "epoch": 0.1115118116617367, "grad_norm": 1.3996338844299316, "learning_rate": 0.0001140796860314191, "loss": 2.1449, "step": 688 }, { "epoch": 0.11167389278333806, "grad_norm": 1.2596129179000854, "learning_rate": 0.00011387091007802297, "loss": 2.2701, "step": 689 }, { "epoch": 0.11183597390493942, "grad_norm": 1.4058315753936768, "learning_rate": 0.0001136620724605827, "loss": 2.2262, "step": 690 }, { "epoch": 0.11199805502654078, "grad_norm": 1.347208857536316, "learning_rate": 0.00011345317410749964, "loss": 2.3649, "step": 691 }, { "epoch": 0.11216013614814214, "grad_norm": 1.4130449295043945, "learning_rate": 0.00011324421594744516, "loss": 2.2562, "step": 692 }, { "epoch": 0.11232221726974351, "grad_norm": 1.5953857898712158, "learning_rate": 0.00011303519890935656, "loss": 2.3604, "step": 693 }, { "epoch": 0.11248429839134487, "grad_norm": 1.5131069421768188, "learning_rate": 0.00011282612392243286, "loss": 2.372, "step": 694 }, { "epoch": 0.11264637951294623, "grad_norm": 1.6171170473098755, "learning_rate": 0.00011261699191613066, "loss": 2.5048, "step": 695 }, { "epoch": 0.1128084606345476, "grad_norm": 1.5999062061309814, "learning_rate": 0.00011240780382016005, "loss": 2.3717, "step": 696 }, { "epoch": 0.11297054175614896, "grad_norm": 1.7249175310134888, "learning_rate": 0.00011219856056448051, "loss": 2.4459, "step": 697 }, { "epoch": 0.11313262287775032, "grad_norm": 1.9398958683013916, "learning_rate": 0.00011198926307929664, "loss": 2.4401, "step": 698 }, { "epoch": 0.11329470399935168, "grad_norm": 2.17580509185791, "learning_rate": 0.00011177991229505431, "loss": 2.4952, "step": 699 }, { "epoch": 0.11345678512095304, "grad_norm": 3.585874319076538, "learning_rate": 0.00011157050914243614, "loss": 3.1707, "step": 700 }, { "epoch": 0.11361886624255439, "grad_norm": 0.953359067440033, "learning_rate": 0.00011136105455235766, "loss": 2.2743, "step": 701 }, { "epoch": 0.11378094736415575, "grad_norm": 0.876708447933197, "learning_rate": 0.00011115154945596305, "loss": 2.0934, "step": 702 }, { "epoch": 0.11394302848575712, "grad_norm": 0.9024324417114258, "learning_rate": 0.00011094199478462095, "loss": 2.1287, "step": 703 }, { "epoch": 0.11410510960735848, "grad_norm": 0.8797346949577332, "learning_rate": 0.00011073239146992054, "loss": 1.9953, "step": 704 }, { "epoch": 0.11426719072895984, "grad_norm": 0.9407122135162354, "learning_rate": 0.00011052274044366711, "loss": 1.8401, "step": 705 }, { "epoch": 0.1144292718505612, "grad_norm": 0.910133957862854, "learning_rate": 0.00011031304263787812, "loss": 1.8964, "step": 706 }, { "epoch": 0.11459135297216257, "grad_norm": 0.7928838729858398, "learning_rate": 0.00011010329898477891, "loss": 1.8369, "step": 707 }, { "epoch": 0.11475343409376393, "grad_norm": 0.9891617894172668, "learning_rate": 0.0001098935104167988, "loss": 1.8453, "step": 708 }, { "epoch": 0.11491551521536529, "grad_norm": 0.8322477340698242, "learning_rate": 0.00010968367786656663, "loss": 1.9136, "step": 709 }, { "epoch": 0.11507759633696665, "grad_norm": 0.9456565380096436, "learning_rate": 0.00010947380226690684, "loss": 1.9338, "step": 710 }, { "epoch": 0.11523967745856802, "grad_norm": 0.9236729145050049, "learning_rate": 0.00010926388455083522, "loss": 2.0013, "step": 711 }, { "epoch": 0.11540175858016938, "grad_norm": 0.9143500924110413, "learning_rate": 0.00010905392565155477, "loss": 1.9021, "step": 712 }, { "epoch": 0.11556383970177074, "grad_norm": 0.8841348886489868, "learning_rate": 0.00010884392650245165, "loss": 1.995, "step": 713 }, { "epoch": 0.1157259208233721, "grad_norm": 1.009333848953247, "learning_rate": 0.00010863388803709089, "loss": 2.0329, "step": 714 }, { "epoch": 0.11588800194497346, "grad_norm": 0.9305991530418396, "learning_rate": 0.00010842381118921232, "loss": 1.944, "step": 715 }, { "epoch": 0.11605008306657483, "grad_norm": 1.111525535583496, "learning_rate": 0.00010821369689272638, "loss": 1.9559, "step": 716 }, { "epoch": 0.11621216418817618, "grad_norm": 0.9530056715011597, "learning_rate": 0.00010800354608171003, "loss": 1.9959, "step": 717 }, { "epoch": 0.11637424530977754, "grad_norm": 0.925412118434906, "learning_rate": 0.00010779335969040252, "loss": 1.9985, "step": 718 }, { "epoch": 0.1165363264313789, "grad_norm": 1.09770929813385, "learning_rate": 0.00010758313865320134, "loss": 1.9022, "step": 719 }, { "epoch": 0.11669840755298026, "grad_norm": 1.0506656169891357, "learning_rate": 0.00010737288390465792, "loss": 2.019, "step": 720 }, { "epoch": 0.11686048867458163, "grad_norm": 0.9373312592506409, "learning_rate": 0.00010716259637947357, "loss": 1.8382, "step": 721 }, { "epoch": 0.11702256979618299, "grad_norm": 1.0534740686416626, "learning_rate": 0.00010695227701249537, "loss": 1.9618, "step": 722 }, { "epoch": 0.11718465091778435, "grad_norm": 1.2244242429733276, "learning_rate": 0.00010674192673871191, "loss": 1.9127, "step": 723 }, { "epoch": 0.11734673203938571, "grad_norm": 0.9955419898033142, "learning_rate": 0.00010653154649324917, "loss": 1.8751, "step": 724 }, { "epoch": 0.11750881316098707, "grad_norm": 1.0729632377624512, "learning_rate": 0.00010632113721136636, "loss": 2.0526, "step": 725 }, { "epoch": 0.11767089428258844, "grad_norm": 1.1402127742767334, "learning_rate": 0.00010611069982845183, "loss": 2.0516, "step": 726 }, { "epoch": 0.1178329754041898, "grad_norm": 1.0431184768676758, "learning_rate": 0.00010590023528001884, "loss": 1.8726, "step": 727 }, { "epoch": 0.11799505652579116, "grad_norm": 1.1810261011123657, "learning_rate": 0.00010568974450170139, "loss": 2.031, "step": 728 }, { "epoch": 0.11815713764739252, "grad_norm": 1.0496701002120972, "learning_rate": 0.00010547922842925008, "loss": 2.1863, "step": 729 }, { "epoch": 0.11831921876899389, "grad_norm": 0.9929841756820679, "learning_rate": 0.00010526868799852796, "loss": 2.0269, "step": 730 }, { "epoch": 0.11848129989059525, "grad_norm": 0.9979134202003479, "learning_rate": 0.0001050581241455064, "loss": 1.9651, "step": 731 }, { "epoch": 0.11864338101219661, "grad_norm": 1.212095022201538, "learning_rate": 0.00010484753780626089, "loss": 2.135, "step": 732 }, { "epoch": 0.11880546213379796, "grad_norm": 1.0873068571090698, "learning_rate": 0.00010463692991696685, "loss": 2.0435, "step": 733 }, { "epoch": 0.11896754325539932, "grad_norm": 1.2885966300964355, "learning_rate": 0.00010442630141389549, "loss": 2.0093, "step": 734 }, { "epoch": 0.11912962437700068, "grad_norm": 1.0481879711151123, "learning_rate": 0.00010421565323340971, "loss": 2.0766, "step": 735 }, { "epoch": 0.11929170549860205, "grad_norm": 1.1832183599472046, "learning_rate": 0.00010400498631195992, "loss": 1.9279, "step": 736 }, { "epoch": 0.11945378662020341, "grad_norm": 1.4472031593322754, "learning_rate": 0.00010379430158607975, "loss": 2.2118, "step": 737 }, { "epoch": 0.11961586774180477, "grad_norm": 1.197808027267456, "learning_rate": 0.000103583599992382, "loss": 2.2347, "step": 738 }, { "epoch": 0.11977794886340613, "grad_norm": 1.1925451755523682, "learning_rate": 0.0001033728824675545, "loss": 2.1039, "step": 739 }, { "epoch": 0.1199400299850075, "grad_norm": 1.2716639041900635, "learning_rate": 0.0001031621499483559, "loss": 2.175, "step": 740 }, { "epoch": 0.12010211110660886, "grad_norm": 1.4327057600021362, "learning_rate": 0.00010295140337161146, "loss": 2.4568, "step": 741 }, { "epoch": 0.12026419222821022, "grad_norm": 1.3392750024795532, "learning_rate": 0.00010274064367420897, "loss": 2.3444, "step": 742 }, { "epoch": 0.12042627334981158, "grad_norm": 1.5073840618133545, "learning_rate": 0.00010252987179309459, "loss": 2.4889, "step": 743 }, { "epoch": 0.12058835447141295, "grad_norm": 1.556594967842102, "learning_rate": 0.00010231908866526851, "loss": 2.5008, "step": 744 }, { "epoch": 0.12075043559301431, "grad_norm": 1.4948514699935913, "learning_rate": 0.00010210829522778111, "loss": 2.433, "step": 745 }, { "epoch": 0.12091251671461567, "grad_norm": 1.7908929586410522, "learning_rate": 0.00010189749241772844, "loss": 2.4459, "step": 746 }, { "epoch": 0.12107459783621703, "grad_norm": 1.8812416791915894, "learning_rate": 0.00010168668117224825, "loss": 2.4916, "step": 747 }, { "epoch": 0.1212366789578184, "grad_norm": 2.7022805213928223, "learning_rate": 0.00010147586242851585, "loss": 2.5122, "step": 748 }, { "epoch": 0.12139876007941974, "grad_norm": 2.592996597290039, "learning_rate": 0.00010126503712373982, "loss": 2.7677, "step": 749 }, { "epoch": 0.1215608412010211, "grad_norm": 3.5674991607666016, "learning_rate": 0.00010105420619515798, "loss": 2.7586, "step": 750 }, { "epoch": 0.1215608412010211, "eval_loss": 2.100837469100952, "eval_runtime": 614.5477, "eval_samples_per_second": 16.91, "eval_steps_per_second": 8.455, "step": 750 }, { "epoch": 0.12172292232262247, "grad_norm": 1.6357475519180298, "learning_rate": 0.00010084337058003303, "loss": 2.0625, "step": 751 }, { "epoch": 0.12188500344422383, "grad_norm": 1.2438329458236694, "learning_rate": 0.00010063253121564868, "loss": 2.0012, "step": 752 }, { "epoch": 0.12204708456582519, "grad_norm": 0.8530017137527466, "learning_rate": 0.00010042168903930514, "loss": 2.0352, "step": 753 }, { "epoch": 0.12220916568742655, "grad_norm": 1.237705945968628, "learning_rate": 0.00010021084498831522, "loss": 2.051, "step": 754 }, { "epoch": 0.12237124680902792, "grad_norm": 1.1300758123397827, "learning_rate": 0.0001, "loss": 1.9153, "step": 755 }, { "epoch": 0.12253332793062928, "grad_norm": 0.8772503733634949, "learning_rate": 9.97891550116848e-05, "loss": 1.7685, "step": 756 }, { "epoch": 0.12269540905223064, "grad_norm": 1.1153267621994019, "learning_rate": 9.957831096069488e-05, "loss": 1.9516, "step": 757 }, { "epoch": 0.122857490173832, "grad_norm": 1.4257912635803223, "learning_rate": 9.936746878435136e-05, "loss": 1.8292, "step": 758 }, { "epoch": 0.12301957129543337, "grad_norm": 0.9260138273239136, "learning_rate": 9.915662941996699e-05, "loss": 1.8869, "step": 759 }, { "epoch": 0.12318165241703473, "grad_norm": 1.097367763519287, "learning_rate": 9.894579380484204e-05, "loss": 1.9747, "step": 760 }, { "epoch": 0.12334373353863609, "grad_norm": 1.1968477964401245, "learning_rate": 9.873496287626019e-05, "loss": 1.7757, "step": 761 }, { "epoch": 0.12350581466023745, "grad_norm": 1.0050510168075562, "learning_rate": 9.852413757148417e-05, "loss": 1.754, "step": 762 }, { "epoch": 0.12366789578183882, "grad_norm": 0.8327038884162903, "learning_rate": 9.831331882775178e-05, "loss": 1.8554, "step": 763 }, { "epoch": 0.12382997690344018, "grad_norm": 0.8313567042350769, "learning_rate": 9.81025075822716e-05, "loss": 1.7536, "step": 764 }, { "epoch": 0.12399205802504153, "grad_norm": 1.2827397584915161, "learning_rate": 9.789170477221891e-05, "loss": 1.8539, "step": 765 }, { "epoch": 0.12415413914664289, "grad_norm": 1.0376012325286865, "learning_rate": 9.76809113347315e-05, "loss": 1.9776, "step": 766 }, { "epoch": 0.12431622026824425, "grad_norm": 1.0492738485336304, "learning_rate": 9.747012820690543e-05, "loss": 2.1623, "step": 767 }, { "epoch": 0.12447830138984561, "grad_norm": 1.0391172170639038, "learning_rate": 9.725935632579104e-05, "loss": 1.9588, "step": 768 }, { "epoch": 0.12464038251144698, "grad_norm": 1.0550709962844849, "learning_rate": 9.704859662838855e-05, "loss": 1.8336, "step": 769 }, { "epoch": 0.12480246363304834, "grad_norm": 0.9650871157646179, "learning_rate": 9.683785005164411e-05, "loss": 1.9008, "step": 770 }, { "epoch": 0.1249645447546497, "grad_norm": 0.9164173603057861, "learning_rate": 9.662711753244551e-05, "loss": 1.8698, "step": 771 }, { "epoch": 0.12512662587625106, "grad_norm": 1.006953477859497, "learning_rate": 9.641640000761802e-05, "loss": 1.9802, "step": 772 }, { "epoch": 0.1252887069978524, "grad_norm": 1.0823261737823486, "learning_rate": 9.620569841392029e-05, "loss": 1.9954, "step": 773 }, { "epoch": 0.1254507881194538, "grad_norm": 0.9342384338378906, "learning_rate": 9.59950136880401e-05, "loss": 1.8948, "step": 774 }, { "epoch": 0.12561286924105514, "grad_norm": 1.1092231273651123, "learning_rate": 9.57843467665903e-05, "loss": 1.9223, "step": 775 }, { "epoch": 0.1257749503626565, "grad_norm": 1.1528615951538086, "learning_rate": 9.557369858610453e-05, "loss": 1.835, "step": 776 }, { "epoch": 0.12593703148425786, "grad_norm": 1.034740924835205, "learning_rate": 9.53630700830332e-05, "loss": 2.0088, "step": 777 }, { "epoch": 0.12609911260585924, "grad_norm": 1.0698503255844116, "learning_rate": 9.51524621937391e-05, "loss": 2.0756, "step": 778 }, { "epoch": 0.12626119372746059, "grad_norm": 1.0951178073883057, "learning_rate": 9.494187585449358e-05, "loss": 1.8632, "step": 779 }, { "epoch": 0.12642327484906196, "grad_norm": 1.0131338834762573, "learning_rate": 9.473131200147205e-05, "loss": 1.9508, "step": 780 }, { "epoch": 0.1265853559706633, "grad_norm": 1.1030675172805786, "learning_rate": 9.452077157074994e-05, "loss": 2.1828, "step": 781 }, { "epoch": 0.1267474370922647, "grad_norm": 1.1066761016845703, "learning_rate": 9.431025549829862e-05, "loss": 2.1577, "step": 782 }, { "epoch": 0.12690951821386603, "grad_norm": 1.1348315477371216, "learning_rate": 9.409976471998118e-05, "loss": 2.0495, "step": 783 }, { "epoch": 0.1270715993354674, "grad_norm": 1.144537091255188, "learning_rate": 9.388930017154819e-05, "loss": 2.0883, "step": 784 }, { "epoch": 0.12723368045706876, "grad_norm": 1.1519125699996948, "learning_rate": 9.367886278863366e-05, "loss": 1.9594, "step": 785 }, { "epoch": 0.12739576157867014, "grad_norm": 1.1154705286026, "learning_rate": 9.346845350675088e-05, "loss": 2.1635, "step": 786 }, { "epoch": 0.12755784270027148, "grad_norm": 1.2760833501815796, "learning_rate": 9.325807326128814e-05, "loss": 2.2134, "step": 787 }, { "epoch": 0.12771992382187286, "grad_norm": 1.2242107391357422, "learning_rate": 9.304772298750463e-05, "loss": 2.2297, "step": 788 }, { "epoch": 0.1278820049434742, "grad_norm": 1.2910743951797485, "learning_rate": 9.283740362052642e-05, "loss": 2.1904, "step": 789 }, { "epoch": 0.12804408606507556, "grad_norm": 1.3698585033416748, "learning_rate": 9.26271160953421e-05, "loss": 2.337, "step": 790 }, { "epoch": 0.12820616718667693, "grad_norm": 1.33097505569458, "learning_rate": 9.241686134679867e-05, "loss": 2.3994, "step": 791 }, { "epoch": 0.12836824830827828, "grad_norm": 1.4310002326965332, "learning_rate": 9.220664030959749e-05, "loss": 2.3221, "step": 792 }, { "epoch": 0.12853032942987966, "grad_norm": 1.4754676818847656, "learning_rate": 9.199645391828999e-05, "loss": 2.3487, "step": 793 }, { "epoch": 0.128692410551481, "grad_norm": 1.5954676866531372, "learning_rate": 9.178630310727365e-05, "loss": 2.3597, "step": 794 }, { "epoch": 0.12885449167308238, "grad_norm": 1.6153950691223145, "learning_rate": 9.157618881078772e-05, "loss": 2.4023, "step": 795 }, { "epoch": 0.12901657279468373, "grad_norm": 1.785256266593933, "learning_rate": 9.136611196290915e-05, "loss": 2.3939, "step": 796 }, { "epoch": 0.1291786539162851, "grad_norm": 1.7453643083572388, "learning_rate": 9.115607349754834e-05, "loss": 2.4206, "step": 797 }, { "epoch": 0.12934073503788646, "grad_norm": 2.04004168510437, "learning_rate": 9.094607434844523e-05, "loss": 2.6013, "step": 798 }, { "epoch": 0.12950281615948783, "grad_norm": 2.553853988647461, "learning_rate": 9.07361154491648e-05, "loss": 2.7647, "step": 799 }, { "epoch": 0.12966489728108918, "grad_norm": 3.740873098373413, "learning_rate": 9.052619773309317e-05, "loss": 3.2341, "step": 800 }, { "epoch": 0.12982697840269056, "grad_norm": 1.1410894393920898, "learning_rate": 9.031632213343339e-05, "loss": 2.0614, "step": 801 }, { "epoch": 0.1299890595242919, "grad_norm": 1.0453135967254639, "learning_rate": 9.01064895832012e-05, "loss": 2.0281, "step": 802 }, { "epoch": 0.13015114064589328, "grad_norm": 1.0344566106796265, "learning_rate": 8.98967010152211e-05, "loss": 1.8177, "step": 803 }, { "epoch": 0.13031322176749463, "grad_norm": 0.9780733585357666, "learning_rate": 8.968695736212193e-05, "loss": 1.8637, "step": 804 }, { "epoch": 0.13047530288909598, "grad_norm": 0.9210421442985535, "learning_rate": 8.947725955633294e-05, "loss": 1.8581, "step": 805 }, { "epoch": 0.13063738401069735, "grad_norm": 1.0712862014770508, "learning_rate": 8.926760853007946e-05, "loss": 1.9776, "step": 806 }, { "epoch": 0.1307994651322987, "grad_norm": 1.1466506719589233, "learning_rate": 8.905800521537905e-05, "loss": 1.8315, "step": 807 }, { "epoch": 0.13096154625390008, "grad_norm": 0.837581992149353, "learning_rate": 8.884845054403699e-05, "loss": 1.8584, "step": 808 }, { "epoch": 0.13112362737550143, "grad_norm": 0.9372288584709167, "learning_rate": 8.863894544764236e-05, "loss": 1.8704, "step": 809 }, { "epoch": 0.1312857084971028, "grad_norm": 1.1507923603057861, "learning_rate": 8.84294908575639e-05, "loss": 1.9021, "step": 810 }, { "epoch": 0.13144778961870415, "grad_norm": 1.080910086631775, "learning_rate": 8.822008770494572e-05, "loss": 1.8538, "step": 811 }, { "epoch": 0.13160987074030553, "grad_norm": 0.9674392342567444, "learning_rate": 8.801073692070337e-05, "loss": 1.9156, "step": 812 }, { "epoch": 0.13177195186190688, "grad_norm": 1.2446630001068115, "learning_rate": 8.780143943551954e-05, "loss": 1.8505, "step": 813 }, { "epoch": 0.13193403298350825, "grad_norm": 1.008480191230774, "learning_rate": 8.759219617983999e-05, "loss": 1.7613, "step": 814 }, { "epoch": 0.1320961141051096, "grad_norm": 0.9498804211616516, "learning_rate": 8.738300808386935e-05, "loss": 1.8792, "step": 815 }, { "epoch": 0.13225819522671098, "grad_norm": 1.1672688722610474, "learning_rate": 8.717387607756713e-05, "loss": 1.8739, "step": 816 }, { "epoch": 0.13242027634831233, "grad_norm": 1.2960506677627563, "learning_rate": 8.696480109064342e-05, "loss": 1.9081, "step": 817 }, { "epoch": 0.1325823574699137, "grad_norm": 1.1460671424865723, "learning_rate": 8.675578405255485e-05, "loss": 1.9553, "step": 818 }, { "epoch": 0.13274443859151505, "grad_norm": 1.061458945274353, "learning_rate": 8.654682589250038e-05, "loss": 1.7947, "step": 819 }, { "epoch": 0.13290651971311643, "grad_norm": 1.107877254486084, "learning_rate": 8.633792753941733e-05, "loss": 1.8459, "step": 820 }, { "epoch": 0.13306860083471778, "grad_norm": 1.3674486875534058, "learning_rate": 8.612908992197705e-05, "loss": 1.9907, "step": 821 }, { "epoch": 0.13323068195631912, "grad_norm": 0.8947020769119263, "learning_rate": 8.592031396858093e-05, "loss": 1.8471, "step": 822 }, { "epoch": 0.1333927630779205, "grad_norm": 0.9594246745109558, "learning_rate": 8.571160060735624e-05, "loss": 1.8078, "step": 823 }, { "epoch": 0.13355484419952185, "grad_norm": 1.1238514184951782, "learning_rate": 8.550295076615188e-05, "loss": 1.8941, "step": 824 }, { "epoch": 0.13371692532112323, "grad_norm": 1.0638455152511597, "learning_rate": 8.529436537253458e-05, "loss": 1.9647, "step": 825 }, { "epoch": 0.13387900644272457, "grad_norm": 0.9952811002731323, "learning_rate": 8.508584535378439e-05, "loss": 2.0224, "step": 826 }, { "epoch": 0.13404108756432595, "grad_norm": 1.0614765882492065, "learning_rate": 8.487739163689079e-05, "loss": 1.9339, "step": 827 }, { "epoch": 0.1342031686859273, "grad_norm": 0.9425110220909119, "learning_rate": 8.466900514854847e-05, "loss": 1.9202, "step": 828 }, { "epoch": 0.13436524980752867, "grad_norm": 0.9917240738868713, "learning_rate": 8.446068681515334e-05, "loss": 1.9207, "step": 829 }, { "epoch": 0.13452733092913002, "grad_norm": 1.0489814281463623, "learning_rate": 8.425243756279824e-05, "loss": 1.9558, "step": 830 }, { "epoch": 0.1346894120507314, "grad_norm": 1.122800588607788, "learning_rate": 8.404425831726894e-05, "loss": 2.0572, "step": 831 }, { "epoch": 0.13485149317233275, "grad_norm": 1.1429322957992554, "learning_rate": 8.383615000404e-05, "loss": 2.0046, "step": 832 }, { "epoch": 0.13501357429393412, "grad_norm": 1.120377779006958, "learning_rate": 8.362811354827059e-05, "loss": 2.0087, "step": 833 }, { "epoch": 0.13517565541553547, "grad_norm": 1.1296513080596924, "learning_rate": 8.342014987480047e-05, "loss": 1.9187, "step": 834 }, { "epoch": 0.13533773653713685, "grad_norm": 1.1733276844024658, "learning_rate": 8.321225990814591e-05, "loss": 1.9788, "step": 835 }, { "epoch": 0.1354998176587382, "grad_norm": 1.4258500337600708, "learning_rate": 8.300444457249543e-05, "loss": 2.132, "step": 836 }, { "epoch": 0.13566189878033957, "grad_norm": 1.315438985824585, "learning_rate": 8.279670479170573e-05, "loss": 2.0177, "step": 837 }, { "epoch": 0.13582397990194092, "grad_norm": 1.1546282768249512, "learning_rate": 8.258904148929775e-05, "loss": 2.2796, "step": 838 }, { "epoch": 0.13598606102354227, "grad_norm": 1.222131609916687, "learning_rate": 8.238145558845235e-05, "loss": 2.1542, "step": 839 }, { "epoch": 0.13614814214514365, "grad_norm": 1.361764669418335, "learning_rate": 8.217394801200631e-05, "loss": 2.084, "step": 840 }, { "epoch": 0.136310223266745, "grad_norm": 1.3947241306304932, "learning_rate": 8.196651968244826e-05, "loss": 2.3296, "step": 841 }, { "epoch": 0.13647230438834637, "grad_norm": 1.3678861856460571, "learning_rate": 8.175917152191447e-05, "loss": 2.2706, "step": 842 }, { "epoch": 0.13663438550994772, "grad_norm": 1.44639253616333, "learning_rate": 8.15519044521848e-05, "loss": 2.3269, "step": 843 }, { "epoch": 0.1367964666315491, "grad_norm": 1.5993499755859375, "learning_rate": 8.134471939467874e-05, "loss": 2.424, "step": 844 }, { "epoch": 0.13695854775315044, "grad_norm": 1.7641056776046753, "learning_rate": 8.113761727045105e-05, "loss": 2.3831, "step": 845 }, { "epoch": 0.13712062887475182, "grad_norm": 1.7237311601638794, "learning_rate": 8.093059900018792e-05, "loss": 2.4245, "step": 846 }, { "epoch": 0.13728270999635317, "grad_norm": 1.9228010177612305, "learning_rate": 8.072366550420266e-05, "loss": 2.5256, "step": 847 }, { "epoch": 0.13744479111795455, "grad_norm": 2.22377872467041, "learning_rate": 8.051681770243175e-05, "loss": 2.667, "step": 848 }, { "epoch": 0.1376068722395559, "grad_norm": 2.614419937133789, "learning_rate": 8.031005651443073e-05, "loss": 2.8375, "step": 849 }, { "epoch": 0.13776895336115727, "grad_norm": 3.5608818531036377, "learning_rate": 8.010338285937006e-05, "loss": 2.8056, "step": 850 }, { "epoch": 0.13793103448275862, "grad_norm": 0.9919871091842651, "learning_rate": 7.989679765603108e-05, "loss": 2.2277, "step": 851 }, { "epoch": 0.13809311560436, "grad_norm": 0.7670711874961853, "learning_rate": 7.969030182280192e-05, "loss": 1.9904, "step": 852 }, { "epoch": 0.13825519672596134, "grad_norm": 0.7723090052604675, "learning_rate": 7.948389627767343e-05, "loss": 1.753, "step": 853 }, { "epoch": 0.1384172778475627, "grad_norm": 0.8651127815246582, "learning_rate": 7.927758193823501e-05, "loss": 1.9254, "step": 854 }, { "epoch": 0.13857935896916407, "grad_norm": 0.8206538558006287, "learning_rate": 7.907135972167069e-05, "loss": 1.8283, "step": 855 }, { "epoch": 0.13874144009076542, "grad_norm": 0.9615094065666199, "learning_rate": 7.88652305447549e-05, "loss": 1.9, "step": 856 }, { "epoch": 0.1389035212123668, "grad_norm": 0.8335433006286621, "learning_rate": 7.865919532384844e-05, "loss": 1.8319, "step": 857 }, { "epoch": 0.13906560233396814, "grad_norm": 0.8094771504402161, "learning_rate": 7.845325497489449e-05, "loss": 1.8524, "step": 858 }, { "epoch": 0.13922768345556952, "grad_norm": 0.9305631518363953, "learning_rate": 7.82474104134144e-05, "loss": 2.0092, "step": 859 }, { "epoch": 0.13938976457717087, "grad_norm": 0.8830282092094421, "learning_rate": 7.804166255450373e-05, "loss": 1.9273, "step": 860 }, { "epoch": 0.13955184569877224, "grad_norm": 0.8910491466522217, "learning_rate": 7.783601231282812e-05, "loss": 1.9844, "step": 861 }, { "epoch": 0.1397139268203736, "grad_norm": 0.8072742223739624, "learning_rate": 7.763046060261932e-05, "loss": 1.7437, "step": 862 }, { "epoch": 0.13987600794197497, "grad_norm": 0.8262855410575867, "learning_rate": 7.742500833767094e-05, "loss": 1.9124, "step": 863 }, { "epoch": 0.14003808906357632, "grad_norm": 1.0997551679611206, "learning_rate": 7.721965643133458e-05, "loss": 1.864, "step": 864 }, { "epoch": 0.1402001701851777, "grad_norm": 0.9168811440467834, "learning_rate": 7.701440579651566e-05, "loss": 1.7489, "step": 865 }, { "epoch": 0.14036225130677904, "grad_norm": 0.9469778537750244, "learning_rate": 7.680925734566937e-05, "loss": 1.7726, "step": 866 }, { "epoch": 0.14052433242838042, "grad_norm": 1.01822829246521, "learning_rate": 7.660421199079669e-05, "loss": 1.783, "step": 867 }, { "epoch": 0.14068641354998176, "grad_norm": 0.9660579562187195, "learning_rate": 7.639927064344022e-05, "loss": 2.0246, "step": 868 }, { "epoch": 0.14084849467158314, "grad_norm": 1.0384860038757324, "learning_rate": 7.619443421468021e-05, "loss": 1.9049, "step": 869 }, { "epoch": 0.1410105757931845, "grad_norm": 0.9351313710212708, "learning_rate": 7.598970361513051e-05, "loss": 1.9748, "step": 870 }, { "epoch": 0.14117265691478584, "grad_norm": 0.8773226141929626, "learning_rate": 7.578507975493448e-05, "loss": 1.9263, "step": 871 }, { "epoch": 0.14133473803638721, "grad_norm": 0.9222373366355896, "learning_rate": 7.558056354376098e-05, "loss": 1.8738, "step": 872 }, { "epoch": 0.14149681915798856, "grad_norm": 0.9951638579368591, "learning_rate": 7.537615589080027e-05, "loss": 1.8731, "step": 873 }, { "epoch": 0.14165890027958994, "grad_norm": 0.9365777373313904, "learning_rate": 7.517185770476006e-05, "loss": 1.8431, "step": 874 }, { "epoch": 0.1418209814011913, "grad_norm": 0.962515652179718, "learning_rate": 7.496766989386136e-05, "loss": 2.0629, "step": 875 }, { "epoch": 0.14198306252279266, "grad_norm": 0.9871416687965393, "learning_rate": 7.476359336583454e-05, "loss": 1.9164, "step": 876 }, { "epoch": 0.142145143644394, "grad_norm": 0.9482909440994263, "learning_rate": 7.455962902791522e-05, "loss": 1.8526, "step": 877 }, { "epoch": 0.1423072247659954, "grad_norm": 1.0334808826446533, "learning_rate": 7.435577778684033e-05, "loss": 1.8418, "step": 878 }, { "epoch": 0.14246930588759674, "grad_norm": 0.9842519164085388, "learning_rate": 7.415204054884399e-05, "loss": 1.9821, "step": 879 }, { "epoch": 0.1426313870091981, "grad_norm": 1.0533347129821777, "learning_rate": 7.394841821965345e-05, "loss": 2.0263, "step": 880 }, { "epoch": 0.14279346813079946, "grad_norm": 1.0583339929580688, "learning_rate": 7.374491170448525e-05, "loss": 2.0331, "step": 881 }, { "epoch": 0.14295554925240084, "grad_norm": 1.0232335329055786, "learning_rate": 7.3541521908041e-05, "loss": 1.9572, "step": 882 }, { "epoch": 0.14311763037400219, "grad_norm": 1.0451996326446533, "learning_rate": 7.33382497345034e-05, "loss": 2.0074, "step": 883 }, { "epoch": 0.14327971149560356, "grad_norm": 1.051679253578186, "learning_rate": 7.313509608753231e-05, "loss": 1.8736, "step": 884 }, { "epoch": 0.1434417926172049, "grad_norm": 1.1811211109161377, "learning_rate": 7.293206187026061e-05, "loss": 1.9808, "step": 885 }, { "epoch": 0.14360387373880626, "grad_norm": 1.1472132205963135, "learning_rate": 7.27291479852903e-05, "loss": 2.0658, "step": 886 }, { "epoch": 0.14376595486040764, "grad_norm": 1.2269686460494995, "learning_rate": 7.252635533468843e-05, "loss": 2.032, "step": 887 }, { "epoch": 0.14392803598200898, "grad_norm": 1.235572338104248, "learning_rate": 7.232368481998309e-05, "loss": 2.1866, "step": 888 }, { "epoch": 0.14409011710361036, "grad_norm": 1.2191613912582397, "learning_rate": 7.212113734215932e-05, "loss": 2.04, "step": 889 }, { "epoch": 0.1442521982252117, "grad_norm": 1.2942798137664795, "learning_rate": 7.191871380165538e-05, "loss": 2.1666, "step": 890 }, { "epoch": 0.14441427934681308, "grad_norm": 1.4039545059204102, "learning_rate": 7.17164150983584e-05, "loss": 2.1723, "step": 891 }, { "epoch": 0.14457636046841443, "grad_norm": 1.400322675704956, "learning_rate": 7.151424213160061e-05, "loss": 2.1382, "step": 892 }, { "epoch": 0.1447384415900158, "grad_norm": 1.4236735105514526, "learning_rate": 7.131219580015521e-05, "loss": 2.3552, "step": 893 }, { "epoch": 0.14490052271161716, "grad_norm": 1.6621019840240479, "learning_rate": 7.11102770022325e-05, "loss": 2.4194, "step": 894 }, { "epoch": 0.14506260383321853, "grad_norm": 1.5574733018875122, "learning_rate": 7.090848663547574e-05, "loss": 2.3852, "step": 895 }, { "epoch": 0.14522468495481988, "grad_norm": 1.7908298969268799, "learning_rate": 7.070682559695736e-05, "loss": 2.3932, "step": 896 }, { "epoch": 0.14538676607642126, "grad_norm": 1.8378123044967651, "learning_rate": 7.050529478317476e-05, "loss": 2.2965, "step": 897 }, { "epoch": 0.1455488471980226, "grad_norm": 2.028446674346924, "learning_rate": 7.03038950900464e-05, "loss": 2.1769, "step": 898 }, { "epoch": 0.14571092831962398, "grad_norm": 2.4650120735168457, "learning_rate": 7.010262741290798e-05, "loss": 2.6098, "step": 899 }, { "epoch": 0.14587300944122533, "grad_norm": 3.8697285652160645, "learning_rate": 6.990149264650814e-05, "loss": 3.2206, "step": 900 }, { "epoch": 0.1460350905628267, "grad_norm": 1.0382612943649292, "learning_rate": 6.970049168500474e-05, "loss": 2.1867, "step": 901 }, { "epoch": 0.14619717168442806, "grad_norm": 0.8437471985816956, "learning_rate": 6.94996254219608e-05, "loss": 1.9932, "step": 902 }, { "epoch": 0.1463592528060294, "grad_norm": 0.8642414212226868, "learning_rate": 6.929889475034048e-05, "loss": 1.9621, "step": 903 }, { "epoch": 0.14652133392763078, "grad_norm": 0.829709529876709, "learning_rate": 6.909830056250527e-05, "loss": 1.7549, "step": 904 }, { "epoch": 0.14668341504923213, "grad_norm": 0.9023981094360352, "learning_rate": 6.889784375020978e-05, "loss": 1.7999, "step": 905 }, { "epoch": 0.1468454961708335, "grad_norm": 0.9434801340103149, "learning_rate": 6.869752520459803e-05, "loss": 1.9526, "step": 906 }, { "epoch": 0.14700757729243485, "grad_norm": 1.0058238506317139, "learning_rate": 6.849734581619918e-05, "loss": 1.8243, "step": 907 }, { "epoch": 0.14716965841403623, "grad_norm": 0.8264224529266357, "learning_rate": 6.829730647492404e-05, "loss": 1.8184, "step": 908 }, { "epoch": 0.14733173953563758, "grad_norm": 0.8843519687652588, "learning_rate": 6.80974080700606e-05, "loss": 1.958, "step": 909 }, { "epoch": 0.14749382065723896, "grad_norm": 0.8653523921966553, "learning_rate": 6.789765149027039e-05, "loss": 1.7968, "step": 910 }, { "epoch": 0.1476559017788403, "grad_norm": 0.899975597858429, "learning_rate": 6.769803762358443e-05, "loss": 1.9741, "step": 911 }, { "epoch": 0.14781798290044168, "grad_norm": 0.839066743850708, "learning_rate": 6.749856735739928e-05, "loss": 1.7658, "step": 912 }, { "epoch": 0.14798006402204303, "grad_norm": 0.8803154826164246, "learning_rate": 6.729924157847323e-05, "loss": 1.9916, "step": 913 }, { "epoch": 0.1481421451436444, "grad_norm": 0.8706285357475281, "learning_rate": 6.710006117292209e-05, "loss": 1.9442, "step": 914 }, { "epoch": 0.14830422626524575, "grad_norm": 0.9332308769226074, "learning_rate": 6.690102702621548e-05, "loss": 1.9825, "step": 915 }, { "epoch": 0.14846630738684713, "grad_norm": 0.9200189709663391, "learning_rate": 6.670214002317278e-05, "loss": 1.7899, "step": 916 }, { "epoch": 0.14862838850844848, "grad_norm": 0.9359891414642334, "learning_rate": 6.650340104795932e-05, "loss": 1.9061, "step": 917 }, { "epoch": 0.14879046963004983, "grad_norm": 0.9093737602233887, "learning_rate": 6.630481098408228e-05, "loss": 1.8754, "step": 918 }, { "epoch": 0.1489525507516512, "grad_norm": 0.9972536563873291, "learning_rate": 6.610637071438686e-05, "loss": 1.7495, "step": 919 }, { "epoch": 0.14911463187325255, "grad_norm": 1.0075587034225464, "learning_rate": 6.590808112105232e-05, "loss": 1.9349, "step": 920 }, { "epoch": 0.14927671299485393, "grad_norm": 0.9807758927345276, "learning_rate": 6.570994308558812e-05, "loss": 1.8885, "step": 921 }, { "epoch": 0.14943879411645528, "grad_norm": 1.0740059614181519, "learning_rate": 6.551195748882997e-05, "loss": 2.0256, "step": 922 }, { "epoch": 0.14960087523805665, "grad_norm": 1.0101613998413086, "learning_rate": 6.531412521093586e-05, "loss": 1.8977, "step": 923 }, { "epoch": 0.149762956359658, "grad_norm": 0.9367290139198303, "learning_rate": 6.51164471313822e-05, "loss": 1.9829, "step": 924 }, { "epoch": 0.14992503748125938, "grad_norm": 0.9805682897567749, "learning_rate": 6.491892412895995e-05, "loss": 1.9775, "step": 925 }, { "epoch": 0.15008711860286073, "grad_norm": 1.027248740196228, "learning_rate": 6.472155708177052e-05, "loss": 1.9416, "step": 926 }, { "epoch": 0.1502491997244621, "grad_norm": 1.0176628828048706, "learning_rate": 6.452434686722224e-05, "loss": 1.9728, "step": 927 }, { "epoch": 0.15041128084606345, "grad_norm": 1.0048375129699707, "learning_rate": 6.432729436202604e-05, "loss": 1.9948, "step": 928 }, { "epoch": 0.15057336196766483, "grad_norm": 1.0000700950622559, "learning_rate": 6.41304004421918e-05, "loss": 1.9813, "step": 929 }, { "epoch": 0.15073544308926617, "grad_norm": 1.0005961656570435, "learning_rate": 6.393366598302446e-05, "loss": 2.0137, "step": 930 }, { "epoch": 0.15089752421086755, "grad_norm": 1.1066198348999023, "learning_rate": 6.373709185911998e-05, "loss": 2.0346, "step": 931 }, { "epoch": 0.1510596053324689, "grad_norm": 1.2351223230361938, "learning_rate": 6.354067894436155e-05, "loss": 1.8956, "step": 932 }, { "epoch": 0.15122168645407028, "grad_norm": 1.2368437051773071, "learning_rate": 6.334442811191576e-05, "loss": 2.0858, "step": 933 }, { "epoch": 0.15138376757567162, "grad_norm": 1.1322507858276367, "learning_rate": 6.314834023422858e-05, "loss": 2.0543, "step": 934 }, { "epoch": 0.15154584869727297, "grad_norm": 1.1150531768798828, "learning_rate": 6.295241618302156e-05, "loss": 1.9735, "step": 935 }, { "epoch": 0.15170792981887435, "grad_norm": 1.1422393321990967, "learning_rate": 6.275665682928803e-05, "loss": 2.0114, "step": 936 }, { "epoch": 0.1518700109404757, "grad_norm": 1.155239224433899, "learning_rate": 6.256106304328905e-05, "loss": 2.0978, "step": 937 }, { "epoch": 0.15203209206207707, "grad_norm": 1.2192412614822388, "learning_rate": 6.23656356945497e-05, "loss": 2.2553, "step": 938 }, { "epoch": 0.15219417318367842, "grad_norm": 1.2727837562561035, "learning_rate": 6.21703756518551e-05, "loss": 2.0736, "step": 939 }, { "epoch": 0.1523562543052798, "grad_norm": 1.4809598922729492, "learning_rate": 6.197528378324665e-05, "loss": 2.2647, "step": 940 }, { "epoch": 0.15251833542688115, "grad_norm": 1.5518218278884888, "learning_rate": 6.17803609560181e-05, "loss": 2.2755, "step": 941 }, { "epoch": 0.15268041654848252, "grad_norm": 1.501102089881897, "learning_rate": 6.158560803671168e-05, "loss": 2.2923, "step": 942 }, { "epoch": 0.15284249767008387, "grad_norm": 1.478593111038208, "learning_rate": 6.139102589111435e-05, "loss": 2.3341, "step": 943 }, { "epoch": 0.15300457879168525, "grad_norm": 1.799355149269104, "learning_rate": 6.119661538425381e-05, "loss": 2.4506, "step": 944 }, { "epoch": 0.1531666599132866, "grad_norm": 1.601035237312317, "learning_rate": 6.100237738039484e-05, "loss": 2.4777, "step": 945 }, { "epoch": 0.15332874103488797, "grad_norm": 1.8279684782028198, "learning_rate": 6.0808312743035236e-05, "loss": 2.3588, "step": 946 }, { "epoch": 0.15349082215648932, "grad_norm": 1.9943811893463135, "learning_rate": 6.061442233490211e-05, "loss": 2.4191, "step": 947 }, { "epoch": 0.1536529032780907, "grad_norm": 2.4720005989074707, "learning_rate": 6.042070701794806e-05, "loss": 2.5547, "step": 948 }, { "epoch": 0.15381498439969205, "grad_norm": 2.8136916160583496, "learning_rate": 6.0227167653347305e-05, "loss": 3.0057, "step": 949 }, { "epoch": 0.1539770655212934, "grad_norm": 4.134425640106201, "learning_rate": 6.0033805101491794e-05, "loss": 3.487, "step": 950 }, { "epoch": 0.15413914664289477, "grad_norm": 0.7873668670654297, "learning_rate": 5.98406202219875e-05, "loss": 2.1287, "step": 951 }, { "epoch": 0.15430122776449612, "grad_norm": 0.8385956287384033, "learning_rate": 5.964761387365052e-05, "loss": 2.1194, "step": 952 }, { "epoch": 0.1544633088860975, "grad_norm": 0.8095195889472961, "learning_rate": 5.9454786914503255e-05, "loss": 1.9995, "step": 953 }, { "epoch": 0.15462539000769884, "grad_norm": 0.8198096752166748, "learning_rate": 5.926214020177074e-05, "loss": 1.8372, "step": 954 }, { "epoch": 0.15478747112930022, "grad_norm": 0.7827789783477783, "learning_rate": 5.9069674591876534e-05, "loss": 1.8768, "step": 955 }, { "epoch": 0.15494955225090157, "grad_norm": 0.8070412278175354, "learning_rate": 5.887739094043923e-05, "loss": 1.9643, "step": 956 }, { "epoch": 0.15511163337250294, "grad_norm": 0.7965165376663208, "learning_rate": 5.868529010226845e-05, "loss": 1.8869, "step": 957 }, { "epoch": 0.1552737144941043, "grad_norm": 0.9103376269340515, "learning_rate": 5.849337293136112e-05, "loss": 1.9411, "step": 958 }, { "epoch": 0.15543579561570567, "grad_norm": 0.8585078716278076, "learning_rate": 5.830164028089766e-05, "loss": 1.8523, "step": 959 }, { "epoch": 0.15559787673730702, "grad_norm": 0.9567294120788574, "learning_rate": 5.811009300323818e-05, "loss": 1.8103, "step": 960 }, { "epoch": 0.1557599578589084, "grad_norm": 0.8584802150726318, "learning_rate": 5.791873194991872e-05, "loss": 1.7882, "step": 961 }, { "epoch": 0.15592203898050974, "grad_norm": 0.8664056658744812, "learning_rate": 5.7727557971647427e-05, "loss": 1.8217, "step": 962 }, { "epoch": 0.15608412010211112, "grad_norm": 0.8807719349861145, "learning_rate": 5.7536571918300864e-05, "loss": 1.7432, "step": 963 }, { "epoch": 0.15624620122371247, "grad_norm": 0.8669372200965881, "learning_rate": 5.734577463892008e-05, "loss": 1.8452, "step": 964 }, { "epoch": 0.15640828234531384, "grad_norm": 0.9092172980308533, "learning_rate": 5.7155166981706956e-05, "loss": 1.9397, "step": 965 }, { "epoch": 0.1565703634669152, "grad_norm": 0.8645500540733337, "learning_rate": 5.6964749794020354e-05, "loss": 1.865, "step": 966 }, { "epoch": 0.15673244458851654, "grad_norm": 0.9376959800720215, "learning_rate": 5.6774523922372394e-05, "loss": 1.954, "step": 967 }, { "epoch": 0.15689452571011792, "grad_norm": 0.9202054738998413, "learning_rate": 5.6584490212424804e-05, "loss": 1.8206, "step": 968 }, { "epoch": 0.15705660683171926, "grad_norm": 0.974638044834137, "learning_rate": 5.639464950898491e-05, "loss": 1.88, "step": 969 }, { "epoch": 0.15721868795332064, "grad_norm": 0.9945436716079712, "learning_rate": 5.620500265600206e-05, "loss": 1.9128, "step": 970 }, { "epoch": 0.157380769074922, "grad_norm": 0.9271765947341919, "learning_rate": 5.601555049656382e-05, "loss": 2.0703, "step": 971 }, { "epoch": 0.15754285019652337, "grad_norm": 0.8724026083946228, "learning_rate": 5.58262938728922e-05, "loss": 1.831, "step": 972 }, { "epoch": 0.1577049313181247, "grad_norm": 0.8741132616996765, "learning_rate": 5.563723362634008e-05, "loss": 1.8377, "step": 973 }, { "epoch": 0.1578670124397261, "grad_norm": 0.9159821271896362, "learning_rate": 5.544837059738719e-05, "loss": 1.7115, "step": 974 }, { "epoch": 0.15802909356132744, "grad_norm": 0.9540914297103882, "learning_rate": 5.525970562563656e-05, "loss": 1.931, "step": 975 }, { "epoch": 0.15819117468292881, "grad_norm": 1.1752443313598633, "learning_rate": 5.507123954981073e-05, "loss": 1.9176, "step": 976 }, { "epoch": 0.15835325580453016, "grad_norm": 0.9616062641143799, "learning_rate": 5.488297320774807e-05, "loss": 1.9151, "step": 977 }, { "epoch": 0.15851533692613154, "grad_norm": 1.0276983976364136, "learning_rate": 5.4694907436399e-05, "loss": 1.9046, "step": 978 }, { "epoch": 0.1586774180477329, "grad_norm": 0.9765022397041321, "learning_rate": 5.4507043071822284e-05, "loss": 1.8811, "step": 979 }, { "epoch": 0.15883949916933426, "grad_norm": 1.085218906402588, "learning_rate": 5.431938094918132e-05, "loss": 1.988, "step": 980 }, { "epoch": 0.1590015802909356, "grad_norm": 1.1478166580200195, "learning_rate": 5.41319219027404e-05, "loss": 2.0078, "step": 981 }, { "epoch": 0.159163661412537, "grad_norm": 1.121962547302246, "learning_rate": 5.394466676586114e-05, "loss": 1.9425, "step": 982 }, { "epoch": 0.15932574253413834, "grad_norm": 1.0868014097213745, "learning_rate": 5.375761637099854e-05, "loss": 1.9219, "step": 983 }, { "epoch": 0.15948782365573969, "grad_norm": 1.1258161067962646, "learning_rate": 5.357077154969742e-05, "loss": 1.9258, "step": 984 }, { "epoch": 0.15964990477734106, "grad_norm": 1.1755454540252686, "learning_rate": 5.3384133132588784e-05, "loss": 2.1467, "step": 985 }, { "epoch": 0.1598119858989424, "grad_norm": 1.109211802482605, "learning_rate": 5.3197701949386e-05, "loss": 2.0517, "step": 986 }, { "epoch": 0.1599740670205438, "grad_norm": 1.1387169361114502, "learning_rate": 5.301147882888116e-05, "loss": 1.9207, "step": 987 }, { "epoch": 0.16013614814214513, "grad_norm": 1.1905651092529297, "learning_rate": 5.28254645989414e-05, "loss": 2.064, "step": 988 }, { "epoch": 0.1602982292637465, "grad_norm": 1.1580886840820312, "learning_rate": 5.2639660086505226e-05, "loss": 2.0172, "step": 989 }, { "epoch": 0.16046031038534786, "grad_norm": 1.363759160041809, "learning_rate": 5.2454066117578815e-05, "loss": 2.0947, "step": 990 }, { "epoch": 0.16062239150694924, "grad_norm": 1.3571370840072632, "learning_rate": 5.226868351723244e-05, "loss": 2.226, "step": 991 }, { "epoch": 0.16078447262855058, "grad_norm": 1.3612700700759888, "learning_rate": 5.2083513109596616e-05, "loss": 2.2807, "step": 992 }, { "epoch": 0.16094655375015196, "grad_norm": 1.4575715065002441, "learning_rate": 5.189855571785859e-05, "loss": 2.3312, "step": 993 }, { "epoch": 0.1611086348717533, "grad_norm": 1.4623538255691528, "learning_rate": 5.171381216425863e-05, "loss": 2.3451, "step": 994 }, { "epoch": 0.16127071599335469, "grad_norm": 1.7681406736373901, "learning_rate": 5.152928327008635e-05, "loss": 2.4868, "step": 995 }, { "epoch": 0.16143279711495603, "grad_norm": 1.645715594291687, "learning_rate": 5.134496985567714e-05, "loss": 2.4009, "step": 996 }, { "epoch": 0.1615948782365574, "grad_norm": 1.9408518075942993, "learning_rate": 5.116087274040837e-05, "loss": 2.3547, "step": 997 }, { "epoch": 0.16175695935815876, "grad_norm": 2.0233843326568604, "learning_rate": 5.0976992742695925e-05, "loss": 2.3173, "step": 998 }, { "epoch": 0.1619190404797601, "grad_norm": 2.2959163188934326, "learning_rate": 5.07933306799904e-05, "loss": 2.5881, "step": 999 }, { "epoch": 0.16208112160136148, "grad_norm": 3.336909532546997, "learning_rate": 5.060988736877366e-05, "loss": 2.8424, "step": 1000 }, { "epoch": 0.16224320272296283, "grad_norm": 0.8006635904312134, "learning_rate": 5.042666362455498e-05, "loss": 2.0874, "step": 1001 }, { "epoch": 0.1624052838445642, "grad_norm": 0.7417387366294861, "learning_rate": 5.024366026186755e-05, "loss": 2.0078, "step": 1002 }, { "epoch": 0.16256736496616556, "grad_norm": 0.8308349847793579, "learning_rate": 5.006087809426496e-05, "loss": 1.8976, "step": 1003 }, { "epoch": 0.16272944608776693, "grad_norm": 0.8369859457015991, "learning_rate": 4.987831793431731e-05, "loss": 1.8497, "step": 1004 }, { "epoch": 0.16289152720936828, "grad_norm": 0.7566342353820801, "learning_rate": 4.9695980593607817e-05, "loss": 1.7877, "step": 1005 }, { "epoch": 0.16305360833096966, "grad_norm": 0.7948352694511414, "learning_rate": 4.9513866882729146e-05, "loss": 1.8605, "step": 1006 }, { "epoch": 0.163215689452571, "grad_norm": 0.8169831037521362, "learning_rate": 4.9331977611279777e-05, "loss": 1.8545, "step": 1007 }, { "epoch": 0.16337777057417238, "grad_norm": 0.7623694539070129, "learning_rate": 4.9150313587860433e-05, "loss": 1.8054, "step": 1008 }, { "epoch": 0.16353985169577373, "grad_norm": 0.8043395280838013, "learning_rate": 4.896887562007054e-05, "loss": 1.8247, "step": 1009 }, { "epoch": 0.1637019328173751, "grad_norm": 0.8478509187698364, "learning_rate": 4.8787664514504504e-05, "loss": 1.8881, "step": 1010 }, { "epoch": 0.16386401393897645, "grad_norm": 0.8304833173751831, "learning_rate": 4.860668107674823e-05, "loss": 1.7478, "step": 1011 }, { "epoch": 0.16402609506057783, "grad_norm": 0.8268377780914307, "learning_rate": 4.8425926111375506e-05, "loss": 1.8382, "step": 1012 }, { "epoch": 0.16418817618217918, "grad_norm": 0.9541943073272705, "learning_rate": 4.824540042194443e-05, "loss": 1.8776, "step": 1013 }, { "epoch": 0.16435025730378056, "grad_norm": 0.8534770011901855, "learning_rate": 4.8065104810993856e-05, "loss": 1.9376, "step": 1014 }, { "epoch": 0.1645123384253819, "grad_norm": 0.8372848629951477, "learning_rate": 4.788504008003978e-05, "loss": 1.8229, "step": 1015 }, { "epoch": 0.16467441954698325, "grad_norm": 0.867513120174408, "learning_rate": 4.770520702957182e-05, "loss": 1.922, "step": 1016 }, { "epoch": 0.16483650066858463, "grad_norm": 0.8784647583961487, "learning_rate": 4.752560645904962e-05, "loss": 1.8011, "step": 1017 }, { "epoch": 0.16499858179018598, "grad_norm": 0.9386810660362244, "learning_rate": 4.734623916689941e-05, "loss": 1.8849, "step": 1018 }, { "epoch": 0.16516066291178735, "grad_norm": 0.8831779360771179, "learning_rate": 4.716710595051022e-05, "loss": 1.7498, "step": 1019 }, { "epoch": 0.1653227440333887, "grad_norm": 0.9589028358459473, "learning_rate": 4.698820760623064e-05, "loss": 1.8178, "step": 1020 }, { "epoch": 0.16548482515499008, "grad_norm": 1.0203953981399536, "learning_rate": 4.6809544929365004e-05, "loss": 1.951, "step": 1021 }, { "epoch": 0.16564690627659143, "grad_norm": 0.9648740887641907, "learning_rate": 4.663111871417e-05, "loss": 1.8394, "step": 1022 }, { "epoch": 0.1658089873981928, "grad_norm": 0.9202787280082703, "learning_rate": 4.645292975385111e-05, "loss": 1.894, "step": 1023 }, { "epoch": 0.16597106851979415, "grad_norm": 0.9346006512641907, "learning_rate": 4.627497884055912e-05, "loss": 1.8064, "step": 1024 }, { "epoch": 0.16613314964139553, "grad_norm": 1.0869497060775757, "learning_rate": 4.609726676538652e-05, "loss": 1.8659, "step": 1025 }, { "epoch": 0.16629523076299688, "grad_norm": 0.9703930616378784, "learning_rate": 4.591979431836402e-05, "loss": 1.8255, "step": 1026 }, { "epoch": 0.16645731188459825, "grad_norm": 1.0407397747039795, "learning_rate": 4.574256228845706e-05, "loss": 1.9679, "step": 1027 }, { "epoch": 0.1666193930061996, "grad_norm": 1.0780534744262695, "learning_rate": 4.5565571463562365e-05, "loss": 1.8632, "step": 1028 }, { "epoch": 0.16678147412780098, "grad_norm": 1.1128103733062744, "learning_rate": 4.5388822630504256e-05, "loss": 1.8397, "step": 1029 }, { "epoch": 0.16694355524940233, "grad_norm": 1.0159236192703247, "learning_rate": 4.521231657503132e-05, "loss": 1.8769, "step": 1030 }, { "epoch": 0.16710563637100367, "grad_norm": 1.0226553678512573, "learning_rate": 4.503605408181286e-05, "loss": 1.9147, "step": 1031 }, { "epoch": 0.16726771749260505, "grad_norm": 1.0692806243896484, "learning_rate": 4.486003593443537e-05, "loss": 1.9294, "step": 1032 }, { "epoch": 0.1674297986142064, "grad_norm": 1.0928372144699097, "learning_rate": 4.468426291539914e-05, "loss": 1.9216, "step": 1033 }, { "epoch": 0.16759187973580777, "grad_norm": 1.1192384958267212, "learning_rate": 4.4508735806114654e-05, "loss": 2.003, "step": 1034 }, { "epoch": 0.16775396085740912, "grad_norm": 1.1762419939041138, "learning_rate": 4.433345538689929e-05, "loss": 2.1035, "step": 1035 }, { "epoch": 0.1679160419790105, "grad_norm": 1.15769624710083, "learning_rate": 4.415842243697369e-05, "loss": 2.0355, "step": 1036 }, { "epoch": 0.16807812310061185, "grad_norm": 1.1447405815124512, "learning_rate": 4.39836377344583e-05, "loss": 2.065, "step": 1037 }, { "epoch": 0.16824020422221322, "grad_norm": 1.287532925605774, "learning_rate": 4.380910205637012e-05, "loss": 2.2201, "step": 1038 }, { "epoch": 0.16840228534381457, "grad_norm": 1.2180663347244263, "learning_rate": 4.363481617861893e-05, "loss": 2.0992, "step": 1039 }, { "epoch": 0.16856436646541595, "grad_norm": 1.2854613065719604, "learning_rate": 4.346078087600412e-05, "loss": 2.0934, "step": 1040 }, { "epoch": 0.1687264475870173, "grad_norm": 1.329726219177246, "learning_rate": 4.3286996922211034e-05, "loss": 2.2888, "step": 1041 }, { "epoch": 0.16888852870861867, "grad_norm": 1.4380382299423218, "learning_rate": 4.311346508980772e-05, "loss": 2.2728, "step": 1042 }, { "epoch": 0.16905060983022002, "grad_norm": 1.6394540071487427, "learning_rate": 4.2940186150241365e-05, "loss": 2.2854, "step": 1043 }, { "epoch": 0.1692126909518214, "grad_norm": 1.51077139377594, "learning_rate": 4.27671608738349e-05, "loss": 2.2677, "step": 1044 }, { "epoch": 0.16937477207342275, "grad_norm": 1.6722321510314941, "learning_rate": 4.2594390029783534e-05, "loss": 2.3743, "step": 1045 }, { "epoch": 0.16953685319502412, "grad_norm": 1.7964955568313599, "learning_rate": 4.242187438615153e-05, "loss": 2.5597, "step": 1046 }, { "epoch": 0.16969893431662547, "grad_norm": 1.8949179649353027, "learning_rate": 4.224961470986849e-05, "loss": 2.3538, "step": 1047 }, { "epoch": 0.16986101543822682, "grad_norm": 2.335127353668213, "learning_rate": 4.207761176672614e-05, "loss": 2.6971, "step": 1048 }, { "epoch": 0.1700230965598282, "grad_norm": 3.060535192489624, "learning_rate": 4.190586632137491e-05, "loss": 2.7979, "step": 1049 }, { "epoch": 0.17018517768142954, "grad_norm": 3.8930301666259766, "learning_rate": 4.173437913732048e-05, "loss": 3.2297, "step": 1050 }, { "epoch": 0.17034725880303092, "grad_norm": 0.7237864136695862, "learning_rate": 4.156315097692037e-05, "loss": 2.0246, "step": 1051 }, { "epoch": 0.17050933992463227, "grad_norm": 0.743639349937439, "learning_rate": 4.139218260138074e-05, "loss": 1.946, "step": 1052 }, { "epoch": 0.17067142104623365, "grad_norm": 0.9066917896270752, "learning_rate": 4.12214747707527e-05, "loss": 1.8558, "step": 1053 }, { "epoch": 0.170833502167835, "grad_norm": 0.7140637040138245, "learning_rate": 4.1051028243929125e-05, "loss": 1.8177, "step": 1054 }, { "epoch": 0.17099558328943637, "grad_norm": 0.8741153478622437, "learning_rate": 4.088084377864135e-05, "loss": 1.9727, "step": 1055 }, { "epoch": 0.17115766441103772, "grad_norm": 0.8254456520080566, "learning_rate": 4.07109221314556e-05, "loss": 1.7322, "step": 1056 }, { "epoch": 0.1713197455326391, "grad_norm": 0.897747278213501, "learning_rate": 4.054126405776971e-05, "loss": 1.9204, "step": 1057 }, { "epoch": 0.17148182665424044, "grad_norm": 0.811434805393219, "learning_rate": 4.037187031180985e-05, "loss": 1.8652, "step": 1058 }, { "epoch": 0.17164390777584182, "grad_norm": 0.8381838798522949, "learning_rate": 4.020274164662707e-05, "loss": 1.9816, "step": 1059 }, { "epoch": 0.17180598889744317, "grad_norm": 0.8337680101394653, "learning_rate": 4.003387881409397e-05, "loss": 1.845, "step": 1060 }, { "epoch": 0.17196807001904454, "grad_norm": 0.8741956949234009, "learning_rate": 3.986528256490141e-05, "loss": 1.8464, "step": 1061 }, { "epoch": 0.1721301511406459, "grad_norm": 0.8832194209098816, "learning_rate": 3.969695364855511e-05, "loss": 1.8506, "step": 1062 }, { "epoch": 0.17229223226224724, "grad_norm": 0.9787096977233887, "learning_rate": 3.952889281337235e-05, "loss": 1.9926, "step": 1063 }, { "epoch": 0.17245431338384862, "grad_norm": 0.9292280077934265, "learning_rate": 3.93611008064786e-05, "loss": 1.9342, "step": 1064 }, { "epoch": 0.17261639450544997, "grad_norm": 0.8793303966522217, "learning_rate": 3.9193578373804364e-05, "loss": 1.8892, "step": 1065 }, { "epoch": 0.17277847562705134, "grad_norm": 0.8778505921363831, "learning_rate": 3.90263262600816e-05, "loss": 1.6929, "step": 1066 }, { "epoch": 0.1729405567486527, "grad_norm": 0.8713865876197815, "learning_rate": 3.88593452088406e-05, "loss": 1.7941, "step": 1067 }, { "epoch": 0.17310263787025407, "grad_norm": 0.9508034586906433, "learning_rate": 3.869263596240661e-05, "loss": 1.9836, "step": 1068 }, { "epoch": 0.17326471899185542, "grad_norm": 0.9075029492378235, "learning_rate": 3.8526199261896544e-05, "loss": 1.8848, "step": 1069 }, { "epoch": 0.1734268001134568, "grad_norm": 0.9615143537521362, "learning_rate": 3.836003584721577e-05, "loss": 1.9272, "step": 1070 }, { "epoch": 0.17358888123505814, "grad_norm": 0.8568268418312073, "learning_rate": 3.8194146457054655e-05, "loss": 1.7457, "step": 1071 }, { "epoch": 0.17375096235665952, "grad_norm": 0.892228901386261, "learning_rate": 3.802853182888543e-05, "loss": 1.8869, "step": 1072 }, { "epoch": 0.17391304347826086, "grad_norm": 0.9012638330459595, "learning_rate": 3.786319269895877e-05, "loss": 1.882, "step": 1073 }, { "epoch": 0.17407512459986224, "grad_norm": 1.0043779611587524, "learning_rate": 3.769812980230074e-05, "loss": 1.8859, "step": 1074 }, { "epoch": 0.1742372057214636, "grad_norm": 1.0329103469848633, "learning_rate": 3.7533343872709294e-05, "loss": 1.9267, "step": 1075 }, { "epoch": 0.17439928684306497, "grad_norm": 1.0233429670333862, "learning_rate": 3.736883564275112e-05, "loss": 1.8735, "step": 1076 }, { "epoch": 0.17456136796466631, "grad_norm": 1.0237981081008911, "learning_rate": 3.7204605843758386e-05, "loss": 1.9477, "step": 1077 }, { "epoch": 0.1747234490862677, "grad_norm": 1.0849707126617432, "learning_rate": 3.704065520582549e-05, "loss": 1.9903, "step": 1078 }, { "epoch": 0.17488553020786904, "grad_norm": 1.0255253314971924, "learning_rate": 3.6876984457805786e-05, "loss": 1.8858, "step": 1079 }, { "epoch": 0.1750476113294704, "grad_norm": 1.0898571014404297, "learning_rate": 3.671359432730834e-05, "loss": 1.856, "step": 1080 }, { "epoch": 0.17520969245107176, "grad_norm": 1.0618890523910522, "learning_rate": 3.655048554069478e-05, "loss": 1.8924, "step": 1081 }, { "epoch": 0.1753717735726731, "grad_norm": 1.0454431772232056, "learning_rate": 3.638765882307589e-05, "loss": 1.9335, "step": 1082 }, { "epoch": 0.1755338546942745, "grad_norm": 1.0827393531799316, "learning_rate": 3.6225114898308634e-05, "loss": 1.8953, "step": 1083 }, { "epoch": 0.17569593581587584, "grad_norm": 1.1539843082427979, "learning_rate": 3.6062854488992714e-05, "loss": 2.0294, "step": 1084 }, { "epoch": 0.1758580169374772, "grad_norm": 1.3190221786499023, "learning_rate": 3.5900878316467454e-05, "loss": 1.9874, "step": 1085 }, { "epoch": 0.17602009805907856, "grad_norm": 1.1466480493545532, "learning_rate": 3.573918710080857e-05, "loss": 1.9573, "step": 1086 }, { "epoch": 0.17618217918067994, "grad_norm": 1.154457926750183, "learning_rate": 3.5577781560825066e-05, "loss": 1.9215, "step": 1087 }, { "epoch": 0.17634426030228129, "grad_norm": 1.1538069248199463, "learning_rate": 3.541666241405588e-05, "loss": 1.9959, "step": 1088 }, { "epoch": 0.17650634142388266, "grad_norm": 1.3226391077041626, "learning_rate": 3.5255830376766764e-05, "loss": 2.1634, "step": 1089 }, { "epoch": 0.176668422545484, "grad_norm": 1.226496934890747, "learning_rate": 3.509528616394716e-05, "loss": 2.0248, "step": 1090 }, { "epoch": 0.1768305036670854, "grad_norm": 1.3939621448516846, "learning_rate": 3.4935030489306883e-05, "loss": 2.3477, "step": 1091 }, { "epoch": 0.17699258478868674, "grad_norm": 1.3951406478881836, "learning_rate": 3.4775064065273165e-05, "loss": 2.4041, "step": 1092 }, { "epoch": 0.1771546659102881, "grad_norm": 1.4524469375610352, "learning_rate": 3.4615387602987236e-05, "loss": 2.3614, "step": 1093 }, { "epoch": 0.17731674703188946, "grad_norm": 1.6162487268447876, "learning_rate": 3.445600181230134e-05, "loss": 2.3496, "step": 1094 }, { "epoch": 0.1774788281534908, "grad_norm": 1.685415506362915, "learning_rate": 3.429690740177549e-05, "loss": 2.4703, "step": 1095 }, { "epoch": 0.17764090927509218, "grad_norm": 1.7755277156829834, "learning_rate": 3.413810507867436e-05, "loss": 2.4649, "step": 1096 }, { "epoch": 0.17780299039669353, "grad_norm": 1.8561453819274902, "learning_rate": 3.397959554896415e-05, "loss": 2.4339, "step": 1097 }, { "epoch": 0.1779650715182949, "grad_norm": 2.014500379562378, "learning_rate": 3.3821379517309405e-05, "loss": 2.4459, "step": 1098 }, { "epoch": 0.17812715263989626, "grad_norm": 2.5885419845581055, "learning_rate": 3.3663457687069924e-05, "loss": 2.6332, "step": 1099 }, { "epoch": 0.17828923376149763, "grad_norm": 3.712876319885254, "learning_rate": 3.350583076029754e-05, "loss": 3.0928, "step": 1100 }, { "epoch": 0.17845131488309898, "grad_norm": 0.785007655620575, "learning_rate": 3.334849943773323e-05, "loss": 2.1315, "step": 1101 }, { "epoch": 0.17861339600470036, "grad_norm": 0.713115394115448, "learning_rate": 3.319146441880371e-05, "loss": 2.1326, "step": 1102 }, { "epoch": 0.1787754771263017, "grad_norm": 0.7978166937828064, "learning_rate": 3.3034726401618444e-05, "loss": 1.8974, "step": 1103 }, { "epoch": 0.17893755824790308, "grad_norm": 0.7951827645301819, "learning_rate": 3.28782860829667e-05, "loss": 1.7932, "step": 1104 }, { "epoch": 0.17909963936950443, "grad_norm": 0.7696529030799866, "learning_rate": 3.272214415831418e-05, "loss": 1.7887, "step": 1105 }, { "epoch": 0.1792617204911058, "grad_norm": 0.8287764191627502, "learning_rate": 3.2566301321800085e-05, "loss": 1.8819, "step": 1106 }, { "epoch": 0.17942380161270716, "grad_norm": 0.7672280073165894, "learning_rate": 3.241075826623401e-05, "loss": 1.7566, "step": 1107 }, { "epoch": 0.17958588273430853, "grad_norm": 0.7949413657188416, "learning_rate": 3.225551568309284e-05, "loss": 1.8398, "step": 1108 }, { "epoch": 0.17974796385590988, "grad_norm": 0.802994966506958, "learning_rate": 3.210057426251773e-05, "loss": 1.7548, "step": 1109 }, { "epoch": 0.17991004497751126, "grad_norm": 0.8089221715927124, "learning_rate": 3.1945934693310896e-05, "loss": 1.7736, "step": 1110 }, { "epoch": 0.1800721260991126, "grad_norm": 0.8903050422668457, "learning_rate": 3.179159766293282e-05, "loss": 1.9156, "step": 1111 }, { "epoch": 0.18023420722071395, "grad_norm": 0.9210566878318787, "learning_rate": 3.163756385749889e-05, "loss": 1.8537, "step": 1112 }, { "epoch": 0.18039628834231533, "grad_norm": 0.8797653913497925, "learning_rate": 3.148383396177653e-05, "loss": 1.9036, "step": 1113 }, { "epoch": 0.18055836946391668, "grad_norm": 0.8586728572845459, "learning_rate": 3.133040865918213e-05, "loss": 1.9411, "step": 1114 }, { "epoch": 0.18072045058551806, "grad_norm": 0.84567791223526, "learning_rate": 3.117728863177796e-05, "loss": 1.8435, "step": 1115 }, { "epoch": 0.1808825317071194, "grad_norm": 0.8560618162155151, "learning_rate": 3.102447456026919e-05, "loss": 1.7905, "step": 1116 }, { "epoch": 0.18104461282872078, "grad_norm": 0.9532570242881775, "learning_rate": 3.0871967124000834e-05, "loss": 1.8291, "step": 1117 }, { "epoch": 0.18120669395032213, "grad_norm": 0.9259549379348755, "learning_rate": 3.0719767000954714e-05, "loss": 1.9017, "step": 1118 }, { "epoch": 0.1813687750719235, "grad_norm": 0.9935880899429321, "learning_rate": 3.056787486774656e-05, "loss": 1.9281, "step": 1119 }, { "epoch": 0.18153085619352485, "grad_norm": 0.9365888833999634, "learning_rate": 3.041629139962283e-05, "loss": 1.8554, "step": 1120 }, { "epoch": 0.18169293731512623, "grad_norm": 0.9560192823410034, "learning_rate": 3.0265017270457775e-05, "loss": 1.9175, "step": 1121 }, { "epoch": 0.18185501843672758, "grad_norm": 0.9258919954299927, "learning_rate": 3.0114053152750556e-05, "loss": 1.8881, "step": 1122 }, { "epoch": 0.18201709955832895, "grad_norm": 1.0171785354614258, "learning_rate": 2.9963399717622077e-05, "loss": 1.8769, "step": 1123 }, { "epoch": 0.1821791806799303, "grad_norm": 1.0115593671798706, "learning_rate": 2.98130576348121e-05, "loss": 1.9043, "step": 1124 }, { "epoch": 0.18234126180153168, "grad_norm": 0.9675939679145813, "learning_rate": 2.966302757267625e-05, "loss": 2.0133, "step": 1125 }, { "epoch": 0.18234126180153168, "eval_loss": 2.022845506668091, "eval_runtime": 614.5777, "eval_samples_per_second": 16.909, "eval_steps_per_second": 8.455, "step": 1125 }, { "epoch": 0.18250334292313303, "grad_norm": 0.9269835948944092, "learning_rate": 2.9513310198183065e-05, "loss": 1.9074, "step": 1126 }, { "epoch": 0.18266542404473438, "grad_norm": 1.075965404510498, "learning_rate": 2.936390617691097e-05, "loss": 1.8784, "step": 1127 }, { "epoch": 0.18282750516633575, "grad_norm": 1.0483362674713135, "learning_rate": 2.9214816173045356e-05, "loss": 1.9883, "step": 1128 }, { "epoch": 0.1829895862879371, "grad_norm": 1.0751255750656128, "learning_rate": 2.906604084937572e-05, "loss": 1.9139, "step": 1129 }, { "epoch": 0.18315166740953848, "grad_norm": 1.020979881286621, "learning_rate": 2.8917580867292526e-05, "loss": 1.9507, "step": 1130 }, { "epoch": 0.18331374853113983, "grad_norm": 1.0141757726669312, "learning_rate": 2.8769436886784408e-05, "loss": 1.8902, "step": 1131 }, { "epoch": 0.1834758296527412, "grad_norm": 1.2489327192306519, "learning_rate": 2.862160956643517e-05, "loss": 1.9329, "step": 1132 }, { "epoch": 0.18363791077434255, "grad_norm": 1.0572302341461182, "learning_rate": 2.847409956342092e-05, "loss": 2.0751, "step": 1133 }, { "epoch": 0.18379999189594393, "grad_norm": 1.0688960552215576, "learning_rate": 2.8326907533507074e-05, "loss": 2.0042, "step": 1134 }, { "epoch": 0.18396207301754527, "grad_norm": 1.0610847473144531, "learning_rate": 2.8180034131045464e-05, "loss": 2.0616, "step": 1135 }, { "epoch": 0.18412415413914665, "grad_norm": 1.1499392986297607, "learning_rate": 2.8033480008971546e-05, "loss": 2.0013, "step": 1136 }, { "epoch": 0.184286235260748, "grad_norm": 1.3222806453704834, "learning_rate": 2.7887245818801277e-05, "loss": 2.2092, "step": 1137 }, { "epoch": 0.18444831638234938, "grad_norm": 1.1725184917449951, "learning_rate": 2.7741332210628345e-05, "loss": 2.1719, "step": 1138 }, { "epoch": 0.18461039750395072, "grad_norm": 1.2436225414276123, "learning_rate": 2.759573983312138e-05, "loss": 2.1319, "step": 1139 }, { "epoch": 0.1847724786255521, "grad_norm": 1.25568687915802, "learning_rate": 2.7450469333520855e-05, "loss": 2.0577, "step": 1140 }, { "epoch": 0.18493455974715345, "grad_norm": 1.2200261354446411, "learning_rate": 2.730552135763632e-05, "loss": 2.0657, "step": 1141 }, { "epoch": 0.18509664086875482, "grad_norm": 1.3557932376861572, "learning_rate": 2.7160896549843562e-05, "loss": 2.1896, "step": 1142 }, { "epoch": 0.18525872199035617, "grad_norm": 1.3588851690292358, "learning_rate": 2.701659555308169e-05, "loss": 2.2572, "step": 1143 }, { "epoch": 0.18542080311195752, "grad_norm": 1.482229232788086, "learning_rate": 2.6872619008850274e-05, "loss": 2.4201, "step": 1144 }, { "epoch": 0.1855828842335589, "grad_norm": 1.6091341972351074, "learning_rate": 2.672896755720654e-05, "loss": 2.4863, "step": 1145 }, { "epoch": 0.18574496535516025, "grad_norm": 1.818866491317749, "learning_rate": 2.6585641836762433e-05, "loss": 2.41, "step": 1146 }, { "epoch": 0.18590704647676162, "grad_norm": 1.8941640853881836, "learning_rate": 2.6442642484681944e-05, "loss": 2.5292, "step": 1147 }, { "epoch": 0.18606912759836297, "grad_norm": 2.0472145080566406, "learning_rate": 2.6299970136678077e-05, "loss": 2.5577, "step": 1148 }, { "epoch": 0.18623120871996435, "grad_norm": 2.4445533752441406, "learning_rate": 2.6157625427010156e-05, "loss": 2.592, "step": 1149 }, { "epoch": 0.1863932898415657, "grad_norm": 4.089385986328125, "learning_rate": 2.6015608988480955e-05, "loss": 3.3517, "step": 1150 }, { "epoch": 0.18655537096316707, "grad_norm": 0.6572228074073792, "learning_rate": 2.5873921452433915e-05, "loss": 1.8484, "step": 1151 }, { "epoch": 0.18671745208476842, "grad_norm": 0.7550435662269592, "learning_rate": 2.57325634487503e-05, "loss": 2.0461, "step": 1152 }, { "epoch": 0.1868795332063698, "grad_norm": 0.7260937690734863, "learning_rate": 2.5591535605846383e-05, "loss": 1.7338, "step": 1153 }, { "epoch": 0.18704161432797115, "grad_norm": 0.8326082825660706, "learning_rate": 2.5450838550670808e-05, "loss": 1.8512, "step": 1154 }, { "epoch": 0.18720369544957252, "grad_norm": 0.7993731498718262, "learning_rate": 2.5310472908701555e-05, "loss": 1.9586, "step": 1155 }, { "epoch": 0.18736577657117387, "grad_norm": 0.7902320623397827, "learning_rate": 2.5170439303943294e-05, "loss": 1.8841, "step": 1156 }, { "epoch": 0.18752785769277525, "grad_norm": 0.7893832325935364, "learning_rate": 2.503073835892471e-05, "loss": 1.9109, "step": 1157 }, { "epoch": 0.1876899388143766, "grad_norm": 0.7799640893936157, "learning_rate": 2.4891370694695517e-05, "loss": 1.965, "step": 1158 }, { "epoch": 0.18785201993597797, "grad_norm": 0.7875563502311707, "learning_rate": 2.4752336930823837e-05, "loss": 1.8452, "step": 1159 }, { "epoch": 0.18801410105757932, "grad_norm": 0.9000365138053894, "learning_rate": 2.4613637685393432e-05, "loss": 1.8454, "step": 1160 }, { "epoch": 0.18817618217918067, "grad_norm": 0.8320596218109131, "learning_rate": 2.4475273575000936e-05, "loss": 1.7712, "step": 1161 }, { "epoch": 0.18833826330078204, "grad_norm": 0.8661396503448486, "learning_rate": 2.4337245214753103e-05, "loss": 1.8233, "step": 1162 }, { "epoch": 0.1885003444223834, "grad_norm": 0.8836917281150818, "learning_rate": 2.4199553218264093e-05, "loss": 1.7637, "step": 1163 }, { "epoch": 0.18866242554398477, "grad_norm": 0.9408859014511108, "learning_rate": 2.4062198197652752e-05, "loss": 1.8305, "step": 1164 }, { "epoch": 0.18882450666558612, "grad_norm": 0.8367508053779602, "learning_rate": 2.3925180763539844e-05, "loss": 1.8132, "step": 1165 }, { "epoch": 0.1889865877871875, "grad_norm": 0.9105957746505737, "learning_rate": 2.3788501525045438e-05, "loss": 1.7251, "step": 1166 }, { "epoch": 0.18914866890878884, "grad_norm": 0.8808852434158325, "learning_rate": 2.3652161089786086e-05, "loss": 1.7983, "step": 1167 }, { "epoch": 0.18931075003039022, "grad_norm": 0.8933706283569336, "learning_rate": 2.351616006387214e-05, "loss": 1.8802, "step": 1168 }, { "epoch": 0.18947283115199157, "grad_norm": 0.9381440281867981, "learning_rate": 2.3380499051905137e-05, "loss": 1.8908, "step": 1169 }, { "epoch": 0.18963491227359294, "grad_norm": 0.912348210811615, "learning_rate": 2.324517865697501e-05, "loss": 1.7839, "step": 1170 }, { "epoch": 0.1897969933951943, "grad_norm": 0.901702880859375, "learning_rate": 2.3110199480657525e-05, "loss": 1.8238, "step": 1171 }, { "epoch": 0.18995907451679567, "grad_norm": 0.9053418636322021, "learning_rate": 2.2975562123011495e-05, "loss": 1.8257, "step": 1172 }, { "epoch": 0.19012115563839702, "grad_norm": 0.9081056118011475, "learning_rate": 2.2841267182576143e-05, "loss": 1.7512, "step": 1173 }, { "epoch": 0.1902832367599984, "grad_norm": 0.9956914782524109, "learning_rate": 2.2707315256368433e-05, "loss": 1.8728, "step": 1174 }, { "epoch": 0.19044531788159974, "grad_norm": 1.0288313627243042, "learning_rate": 2.2573706939880555e-05, "loss": 1.8987, "step": 1175 }, { "epoch": 0.1906073990032011, "grad_norm": 0.9515791535377502, "learning_rate": 2.2440442827077045e-05, "loss": 1.7594, "step": 1176 }, { "epoch": 0.19076948012480247, "grad_norm": 0.9405137300491333, "learning_rate": 2.230752351039228e-05, "loss": 1.818, "step": 1177 }, { "epoch": 0.1909315612464038, "grad_norm": 1.0270601511001587, "learning_rate": 2.2174949580727832e-05, "loss": 1.8245, "step": 1178 }, { "epoch": 0.1910936423680052, "grad_norm": 1.0788053274154663, "learning_rate": 2.2042721627449846e-05, "loss": 2.0, "step": 1179 }, { "epoch": 0.19125572348960654, "grad_norm": 1.0930386781692505, "learning_rate": 2.1910840238386398e-05, "loss": 2.0082, "step": 1180 }, { "epoch": 0.19141780461120791, "grad_norm": 1.0199942588806152, "learning_rate": 2.1779305999824884e-05, "loss": 1.8675, "step": 1181 }, { "epoch": 0.19157988573280926, "grad_norm": 1.0651659965515137, "learning_rate": 2.164811949650942e-05, "loss": 2.0636, "step": 1182 }, { "epoch": 0.19174196685441064, "grad_norm": 1.0971477031707764, "learning_rate": 2.1517281311638217e-05, "loss": 2.022, "step": 1183 }, { "epoch": 0.191904047976012, "grad_norm": 1.1283248662948608, "learning_rate": 2.1386792026861103e-05, "loss": 2.0144, "step": 1184 }, { "epoch": 0.19206612909761336, "grad_norm": 1.119558334350586, "learning_rate": 2.125665222227675e-05, "loss": 2.1146, "step": 1185 }, { "epoch": 0.1922282102192147, "grad_norm": 1.1254726648330688, "learning_rate": 2.112686247643024e-05, "loss": 2.0595, "step": 1186 }, { "epoch": 0.1923902913408161, "grad_norm": 1.1798866987228394, "learning_rate": 2.09974233663104e-05, "loss": 2.1731, "step": 1187 }, { "epoch": 0.19255237246241744, "grad_norm": 1.185274362564087, "learning_rate": 2.0868335467347366e-05, "loss": 2.1197, "step": 1188 }, { "epoch": 0.1927144535840188, "grad_norm": 1.1812537908554077, "learning_rate": 2.073959935340988e-05, "loss": 2.0506, "step": 1189 }, { "epoch": 0.19287653470562016, "grad_norm": 1.2541913986206055, "learning_rate": 2.06112155968028e-05, "loss": 2.193, "step": 1190 }, { "epoch": 0.19303861582722154, "grad_norm": 1.4345555305480957, "learning_rate": 2.0483184768264596e-05, "loss": 2.2745, "step": 1191 }, { "epoch": 0.1932006969488229, "grad_norm": 1.3788560628890991, "learning_rate": 2.035550743696468e-05, "loss": 2.2807, "step": 1192 }, { "epoch": 0.19336277807042423, "grad_norm": 1.5256106853485107, "learning_rate": 2.022818417050113e-05, "loss": 2.3778, "step": 1193 }, { "epoch": 0.1935248591920256, "grad_norm": 1.5746982097625732, "learning_rate": 2.0101215534897855e-05, "loss": 2.4029, "step": 1194 }, { "epoch": 0.19368694031362696, "grad_norm": 1.6141338348388672, "learning_rate": 1.99746020946023e-05, "loss": 2.4174, "step": 1195 }, { "epoch": 0.19384902143522834, "grad_norm": 1.8641674518585205, "learning_rate": 1.9848344412482854e-05, "loss": 2.4308, "step": 1196 }, { "epoch": 0.19401110255682968, "grad_norm": 2.0193698406219482, "learning_rate": 1.9722443049826344e-05, "loss": 2.3566, "step": 1197 }, { "epoch": 0.19417318367843106, "grad_norm": 2.033904790878296, "learning_rate": 1.9596898566335576e-05, "loss": 2.478, "step": 1198 }, { "epoch": 0.1943352648000324, "grad_norm": 2.6513400077819824, "learning_rate": 1.9471711520126824e-05, "loss": 2.8799, "step": 1199 }, { "epoch": 0.19449734592163379, "grad_norm": 3.8469204902648926, "learning_rate": 1.9346882467727325e-05, "loss": 3.1931, "step": 1200 }, { "epoch": 0.19465942704323513, "grad_norm": 0.726023256778717, "learning_rate": 1.9222411964072884e-05, "loss": 2.1961, "step": 1201 }, { "epoch": 0.1948215081648365, "grad_norm": 0.7413092851638794, "learning_rate": 1.9098300562505266e-05, "loss": 1.9151, "step": 1202 }, { "epoch": 0.19498358928643786, "grad_norm": 0.8594390749931335, "learning_rate": 1.8974548814769944e-05, "loss": 1.8902, "step": 1203 }, { "epoch": 0.19514567040803923, "grad_norm": 0.8023420572280884, "learning_rate": 1.8851157271013442e-05, "loss": 1.7929, "step": 1204 }, { "epoch": 0.19530775152964058, "grad_norm": 0.8195720314979553, "learning_rate": 1.872812647978095e-05, "loss": 1.8281, "step": 1205 }, { "epoch": 0.19546983265124196, "grad_norm": 0.7723554372787476, "learning_rate": 1.8605456988014015e-05, "loss": 1.8732, "step": 1206 }, { "epoch": 0.1956319137728433, "grad_norm": 0.8346254229545593, "learning_rate": 1.8483149341047923e-05, "loss": 1.9075, "step": 1207 }, { "epoch": 0.19579399489444466, "grad_norm": 0.7793485522270203, "learning_rate": 1.8361204082609352e-05, "loss": 1.7659, "step": 1208 }, { "epoch": 0.19595607601604603, "grad_norm": 0.8150200843811035, "learning_rate": 1.8239621754813995e-05, "loss": 1.7809, "step": 1209 }, { "epoch": 0.19611815713764738, "grad_norm": 0.8918383121490479, "learning_rate": 1.811840289816409e-05, "loss": 1.7751, "step": 1210 }, { "epoch": 0.19628023825924876, "grad_norm": 0.842333197593689, "learning_rate": 1.799754805154603e-05, "loss": 1.8416, "step": 1211 }, { "epoch": 0.1964423193808501, "grad_norm": 0.8633593916893005, "learning_rate": 1.787705775222802e-05, "loss": 1.7984, "step": 1212 }, { "epoch": 0.19660440050245148, "grad_norm": 0.9862741827964783, "learning_rate": 1.775693253585763e-05, "loss": 1.8719, "step": 1213 }, { "epoch": 0.19676648162405283, "grad_norm": 0.8115241527557373, "learning_rate": 1.763717293645939e-05, "loss": 1.749, "step": 1214 }, { "epoch": 0.1969285627456542, "grad_norm": 0.8699769377708435, "learning_rate": 1.7517779486432495e-05, "loss": 1.8379, "step": 1215 }, { "epoch": 0.19709064386725555, "grad_norm": 0.8686104416847229, "learning_rate": 1.7398752716548395e-05, "loss": 1.7052, "step": 1216 }, { "epoch": 0.19725272498885693, "grad_norm": 0.9405919313430786, "learning_rate": 1.728009315594843e-05, "loss": 1.8254, "step": 1217 }, { "epoch": 0.19741480611045828, "grad_norm": 0.922093391418457, "learning_rate": 1.716180133214149e-05, "loss": 1.9641, "step": 1218 }, { "epoch": 0.19757688723205966, "grad_norm": 0.9219188094139099, "learning_rate": 1.704387777100165e-05, "loss": 1.9483, "step": 1219 }, { "epoch": 0.197738968353661, "grad_norm": 0.8527796268463135, "learning_rate": 1.6926322996765897e-05, "loss": 1.7142, "step": 1220 }, { "epoch": 0.19790104947526238, "grad_norm": 1.0064805746078491, "learning_rate": 1.6809137532031704e-05, "loss": 1.92, "step": 1221 }, { "epoch": 0.19806313059686373, "grad_norm": 0.9660975337028503, "learning_rate": 1.6692321897754758e-05, "loss": 1.9124, "step": 1222 }, { "epoch": 0.1982252117184651, "grad_norm": 1.0552252531051636, "learning_rate": 1.65758766132467e-05, "loss": 1.8532, "step": 1223 }, { "epoch": 0.19838729284006645, "grad_norm": 0.9916226267814636, "learning_rate": 1.6459802196172668e-05, "loss": 2.027, "step": 1224 }, { "epoch": 0.1985493739616678, "grad_norm": 0.9750329256057739, "learning_rate": 1.634409916254914e-05, "loss": 1.8228, "step": 1225 }, { "epoch": 0.19871145508326918, "grad_norm": 1.017592191696167, "learning_rate": 1.622876802674158e-05, "loss": 1.9509, "step": 1226 }, { "epoch": 0.19887353620487053, "grad_norm": 0.9426957964897156, "learning_rate": 1.6113809301462125e-05, "loss": 1.8686, "step": 1227 }, { "epoch": 0.1990356173264719, "grad_norm": 0.9949437975883484, "learning_rate": 1.599922349776738e-05, "loss": 1.8867, "step": 1228 }, { "epoch": 0.19919769844807325, "grad_norm": 0.9932520985603333, "learning_rate": 1.5885011125056047e-05, "loss": 1.9369, "step": 1229 }, { "epoch": 0.19935977956967463, "grad_norm": 1.1354806423187256, "learning_rate": 1.5771172691066794e-05, "loss": 1.8813, "step": 1230 }, { "epoch": 0.19952186069127598, "grad_norm": 1.0394784212112427, "learning_rate": 1.565770870187585e-05, "loss": 1.7944, "step": 1231 }, { "epoch": 0.19968394181287735, "grad_norm": 1.1596986055374146, "learning_rate": 1.5544619661894864e-05, "loss": 2.0071, "step": 1232 }, { "epoch": 0.1998460229344787, "grad_norm": 1.0981085300445557, "learning_rate": 1.543190607386861e-05, "loss": 2.0534, "step": 1233 }, { "epoch": 0.20000810405608008, "grad_norm": 1.0677471160888672, "learning_rate": 1.5319568438872745e-05, "loss": 2.1297, "step": 1234 }, { "epoch": 0.20017018517768143, "grad_norm": 1.1075794696807861, "learning_rate": 1.520760725631164e-05, "loss": 2.2143, "step": 1235 }, { "epoch": 0.2003322662992828, "grad_norm": 1.141735315322876, "learning_rate": 1.5096023023916094e-05, "loss": 1.9121, "step": 1236 }, { "epoch": 0.20049434742088415, "grad_norm": 1.2292425632476807, "learning_rate": 1.498481623774115e-05, "loss": 2.1336, "step": 1237 }, { "epoch": 0.20065642854248553, "grad_norm": 1.274285912513733, "learning_rate": 1.4873987392163947e-05, "loss": 2.1585, "step": 1238 }, { "epoch": 0.20081850966408687, "grad_norm": 1.2518714666366577, "learning_rate": 1.4763536979881354e-05, "loss": 2.1563, "step": 1239 }, { "epoch": 0.20098059078568822, "grad_norm": 1.3572841882705688, "learning_rate": 1.4653465491908003e-05, "loss": 2.2299, "step": 1240 }, { "epoch": 0.2011426719072896, "grad_norm": 1.37043297290802, "learning_rate": 1.4543773417573925e-05, "loss": 2.2334, "step": 1241 }, { "epoch": 0.20130475302889095, "grad_norm": 1.3926852941513062, "learning_rate": 1.4434461244522458e-05, "loss": 2.0816, "step": 1242 }, { "epoch": 0.20146683415049232, "grad_norm": 1.503570556640625, "learning_rate": 1.4325529458708065e-05, "loss": 2.3405, "step": 1243 }, { "epoch": 0.20162891527209367, "grad_norm": 1.5177767276763916, "learning_rate": 1.4216978544394177e-05, "loss": 2.2196, "step": 1244 }, { "epoch": 0.20179099639369505, "grad_norm": 1.644774079322815, "learning_rate": 1.4108808984151023e-05, "loss": 2.3824, "step": 1245 }, { "epoch": 0.2019530775152964, "grad_norm": 1.7863985300064087, "learning_rate": 1.4001021258853509e-05, "loss": 2.5351, "step": 1246 }, { "epoch": 0.20211515863689777, "grad_norm": 2.065764904022217, "learning_rate": 1.3893615847679065e-05, "loss": 2.6942, "step": 1247 }, { "epoch": 0.20227723975849912, "grad_norm": 2.243483543395996, "learning_rate": 1.3786593228105494e-05, "loss": 2.5715, "step": 1248 }, { "epoch": 0.2024393208801005, "grad_norm": 2.432351589202881, "learning_rate": 1.3679953875908957e-05, "loss": 2.6404, "step": 1249 }, { "epoch": 0.20260140200170185, "grad_norm": 3.3445770740509033, "learning_rate": 1.3573698265161683e-05, "loss": 3.0978, "step": 1250 }, { "epoch": 0.20276348312330322, "grad_norm": 0.8473957777023315, "learning_rate": 1.3467826868229994e-05, "loss": 2.2334, "step": 1251 }, { "epoch": 0.20292556424490457, "grad_norm": 0.7838537096977234, "learning_rate": 1.3362340155772146e-05, "loss": 1.9693, "step": 1252 }, { "epoch": 0.20308764536650595, "grad_norm": 0.7724992632865906, "learning_rate": 1.3257238596736266e-05, "loss": 1.7912, "step": 1253 }, { "epoch": 0.2032497264881073, "grad_norm": 0.8153484463691711, "learning_rate": 1.3152522658358245e-05, "loss": 1.9724, "step": 1254 }, { "epoch": 0.20341180760970867, "grad_norm": 0.8190444707870483, "learning_rate": 1.3048192806159721e-05, "loss": 1.9885, "step": 1255 }, { "epoch": 0.20357388873131002, "grad_norm": 0.7742248773574829, "learning_rate": 1.2944249503945894e-05, "loss": 1.7213, "step": 1256 }, { "epoch": 0.20373596985291137, "grad_norm": 0.7935106754302979, "learning_rate": 1.2840693213803545e-05, "loss": 1.7733, "step": 1257 }, { "epoch": 0.20389805097451275, "grad_norm": 0.9317451119422913, "learning_rate": 1.2737524396099032e-05, "loss": 1.8061, "step": 1258 }, { "epoch": 0.2040601320961141, "grad_norm": 0.8607384562492371, "learning_rate": 1.2634743509476088e-05, "loss": 1.8653, "step": 1259 }, { "epoch": 0.20422221321771547, "grad_norm": 0.8509252071380615, "learning_rate": 1.2532351010853916e-05, "loss": 1.7986, "step": 1260 }, { "epoch": 0.20438429433931682, "grad_norm": 0.8512972593307495, "learning_rate": 1.243034735542512e-05, "loss": 1.8779, "step": 1261 }, { "epoch": 0.2045463754609182, "grad_norm": 0.8634916543960571, "learning_rate": 1.2328732996653669e-05, "loss": 1.8112, "step": 1262 }, { "epoch": 0.20470845658251954, "grad_norm": 0.9004796147346497, "learning_rate": 1.2227508386272878e-05, "loss": 1.8701, "step": 1263 }, { "epoch": 0.20487053770412092, "grad_norm": 0.8926157355308533, "learning_rate": 1.212667397428342e-05, "loss": 1.9414, "step": 1264 }, { "epoch": 0.20503261882572227, "grad_norm": 0.9458131790161133, "learning_rate": 1.2026230208951306e-05, "loss": 1.8746, "step": 1265 }, { "epoch": 0.20519469994732364, "grad_norm": 0.914452075958252, "learning_rate": 1.1926177536805905e-05, "loss": 1.8194, "step": 1266 }, { "epoch": 0.205356781068925, "grad_norm": 1.0000059604644775, "learning_rate": 1.1826516402637989e-05, "loss": 1.9296, "step": 1267 }, { "epoch": 0.20551886219052637, "grad_norm": 0.922459065914154, "learning_rate": 1.1727247249497685e-05, "loss": 1.8448, "step": 1268 }, { "epoch": 0.20568094331212772, "grad_norm": 0.9379509687423706, "learning_rate": 1.1628370518692533e-05, "loss": 1.7451, "step": 1269 }, { "epoch": 0.2058430244337291, "grad_norm": 0.9158072471618652, "learning_rate": 1.152988664978556e-05, "loss": 1.8537, "step": 1270 }, { "epoch": 0.20600510555533044, "grad_norm": 0.9501444101333618, "learning_rate": 1.1431796080593283e-05, "loss": 1.7632, "step": 1271 }, { "epoch": 0.2061671866769318, "grad_norm": 0.9927570819854736, "learning_rate": 1.1334099247183783e-05, "loss": 1.9004, "step": 1272 }, { "epoch": 0.20632926779853317, "grad_norm": 1.088786244392395, "learning_rate": 1.1236796583874787e-05, "loss": 1.8845, "step": 1273 }, { "epoch": 0.20649134892013452, "grad_norm": 1.000855803489685, "learning_rate": 1.1139888523231678e-05, "loss": 1.8094, "step": 1274 }, { "epoch": 0.2066534300417359, "grad_norm": 1.023329734802246, "learning_rate": 1.1043375496065611e-05, "loss": 1.9246, "step": 1275 }, { "epoch": 0.20681551116333724, "grad_norm": 1.060957908630371, "learning_rate": 1.0947257931431642e-05, "loss": 1.9773, "step": 1276 }, { "epoch": 0.20697759228493862, "grad_norm": 1.0018396377563477, "learning_rate": 1.0851536256626705e-05, "loss": 1.8473, "step": 1277 }, { "epoch": 0.20713967340653996, "grad_norm": 1.023350715637207, "learning_rate": 1.0756210897187812e-05, "loss": 1.8725, "step": 1278 }, { "epoch": 0.20730175452814134, "grad_norm": 1.0155223608016968, "learning_rate": 1.0661282276890127e-05, "loss": 1.8691, "step": 1279 }, { "epoch": 0.2074638356497427, "grad_norm": 1.0554794073104858, "learning_rate": 1.0566750817745074e-05, "loss": 1.8774, "step": 1280 }, { "epoch": 0.20762591677134407, "grad_norm": 1.0788451433181763, "learning_rate": 1.0472616939998492e-05, "loss": 2.1153, "step": 1281 }, { "epoch": 0.20778799789294541, "grad_norm": 1.0743927955627441, "learning_rate": 1.0378881062128731e-05, "loss": 2.0141, "step": 1282 }, { "epoch": 0.2079500790145468, "grad_norm": 1.08610200881958, "learning_rate": 1.0285543600844804e-05, "loss": 1.9843, "step": 1283 }, { "epoch": 0.20811216013614814, "grad_norm": 1.157723307609558, "learning_rate": 1.019260497108453e-05, "loss": 2.0398, "step": 1284 }, { "epoch": 0.20827424125774952, "grad_norm": 1.2143625020980835, "learning_rate": 1.010006558601274e-05, "loss": 2.0191, "step": 1285 }, { "epoch": 0.20843632237935086, "grad_norm": 1.143126368522644, "learning_rate": 1.000792585701934e-05, "loss": 2.0874, "step": 1286 }, { "epoch": 0.20859840350095224, "grad_norm": 1.1845760345458984, "learning_rate": 9.91618619371757e-06, "loss": 2.0991, "step": 1287 }, { "epoch": 0.2087604846225536, "grad_norm": 1.1913238763809204, "learning_rate": 9.82484700394215e-06, "loss": 2.0549, "step": 1288 }, { "epoch": 0.20892256574415494, "grad_norm": 1.2370691299438477, "learning_rate": 9.73390869374743e-06, "loss": 2.2093, "step": 1289 }, { "epoch": 0.2090846468657563, "grad_norm": 1.3261677026748657, "learning_rate": 9.643371667405698e-06, "loss": 2.1468, "step": 1290 }, { "epoch": 0.20924672798735766, "grad_norm": 1.3013460636138916, "learning_rate": 9.553236327405246e-06, "loss": 2.2629, "step": 1291 }, { "epoch": 0.20940880910895904, "grad_norm": 1.3862487077713013, "learning_rate": 9.463503074448677e-06, "loss": 2.4751, "step": 1292 }, { "epoch": 0.20957089023056039, "grad_norm": 1.5185917615890503, "learning_rate": 9.374172307451068e-06, "loss": 2.238, "step": 1293 }, { "epoch": 0.20973297135216176, "grad_norm": 1.5628163814544678, "learning_rate": 9.285244423538197e-06, "loss": 2.4447, "step": 1294 }, { "epoch": 0.2098950524737631, "grad_norm": 1.6567641496658325, "learning_rate": 9.196719818044886e-06, "loss": 2.2381, "step": 1295 }, { "epoch": 0.2100571335953645, "grad_norm": 1.7779566049575806, "learning_rate": 9.108598884513053e-06, "loss": 2.3042, "step": 1296 }, { "epoch": 0.21021921471696584, "grad_norm": 2.01045298576355, "learning_rate": 9.020882014690136e-06, "loss": 2.4381, "step": 1297 }, { "epoch": 0.2103812958385672, "grad_norm": 2.1557350158691406, "learning_rate": 8.933569598527247e-06, "loss": 2.4348, "step": 1298 }, { "epoch": 0.21054337696016856, "grad_norm": 2.556826114654541, "learning_rate": 8.846662024177477e-06, "loss": 2.6481, "step": 1299 }, { "epoch": 0.21070545808176994, "grad_norm": 3.8834383487701416, "learning_rate": 8.760159677994172e-06, "loss": 3.0726, "step": 1300 }, { "epoch": 0.21086753920337128, "grad_norm": 0.6687748432159424, "learning_rate": 8.674062944529216e-06, "loss": 2.067, "step": 1301 }, { "epoch": 0.21102962032497266, "grad_norm": 0.7516258358955383, "learning_rate": 8.588372206531292e-06, "loss": 1.9569, "step": 1302 }, { "epoch": 0.211191701446574, "grad_norm": 0.7336524724960327, "learning_rate": 8.503087844944213e-06, "loss": 1.8829, "step": 1303 }, { "epoch": 0.21135378256817536, "grad_norm": 0.7719362378120422, "learning_rate": 8.418210238905256e-06, "loss": 1.8936, "step": 1304 }, { "epoch": 0.21151586368977673, "grad_norm": 0.8221637606620789, "learning_rate": 8.333739765743398e-06, "loss": 1.9578, "step": 1305 }, { "epoch": 0.21167794481137808, "grad_norm": 0.7503555417060852, "learning_rate": 8.249676800977658e-06, "loss": 1.8614, "step": 1306 }, { "epoch": 0.21184002593297946, "grad_norm": 0.7742947340011597, "learning_rate": 8.16602171831553e-06, "loss": 1.8019, "step": 1307 }, { "epoch": 0.2120021070545808, "grad_norm": 0.7885059714317322, "learning_rate": 8.082774889651168e-06, "loss": 1.8845, "step": 1308 }, { "epoch": 0.21216418817618218, "grad_norm": 0.838720977306366, "learning_rate": 7.999936685063835e-06, "loss": 1.8543, "step": 1309 }, { "epoch": 0.21232626929778353, "grad_norm": 0.8016371726989746, "learning_rate": 7.91750747281621e-06, "loss": 1.7823, "step": 1310 }, { "epoch": 0.2124883504193849, "grad_norm": 0.8562203049659729, "learning_rate": 7.835487619352811e-06, "loss": 1.8451, "step": 1311 }, { "epoch": 0.21265043154098626, "grad_norm": 0.8714253902435303, "learning_rate": 7.753877489298244e-06, "loss": 1.9546, "step": 1312 }, { "epoch": 0.21281251266258763, "grad_norm": 0.8354063630104065, "learning_rate": 7.67267744545579e-06, "loss": 1.8458, "step": 1313 }, { "epoch": 0.21297459378418898, "grad_norm": 0.8767425417900085, "learning_rate": 7.591887848805545e-06, "loss": 1.7214, "step": 1314 }, { "epoch": 0.21313667490579036, "grad_norm": 0.917818009853363, "learning_rate": 7.5115090585029966e-06, "loss": 1.9427, "step": 1315 }, { "epoch": 0.2132987560273917, "grad_norm": 0.8547624945640564, "learning_rate": 7.431541431877342e-06, "loss": 1.7485, "step": 1316 }, { "epoch": 0.21346083714899308, "grad_norm": 0.8887349963188171, "learning_rate": 7.351985324429933e-06, "loss": 1.8753, "step": 1317 }, { "epoch": 0.21362291827059443, "grad_norm": 0.9365783333778381, "learning_rate": 7.272841089832694e-06, "loss": 1.9225, "step": 1318 }, { "epoch": 0.2137849993921958, "grad_norm": 0.9322122931480408, "learning_rate": 7.194109079926514e-06, "loss": 1.8203, "step": 1319 }, { "epoch": 0.21394708051379716, "grad_norm": 0.9238874912261963, "learning_rate": 7.115789644719728e-06, "loss": 1.7718, "step": 1320 }, { "epoch": 0.2141091616353985, "grad_norm": 0.990264356136322, "learning_rate": 7.037883132386547e-06, "loss": 1.8244, "step": 1321 }, { "epoch": 0.21427124275699988, "grad_norm": 0.9237974286079407, "learning_rate": 6.960389889265517e-06, "loss": 1.8114, "step": 1322 }, { "epoch": 0.21443332387860123, "grad_norm": 0.927346408367157, "learning_rate": 6.883310259857944e-06, "loss": 1.7792, "step": 1323 }, { "epoch": 0.2145954050002026, "grad_norm": 0.9630454778671265, "learning_rate": 6.806644586826383e-06, "loss": 1.874, "step": 1324 }, { "epoch": 0.21475748612180395, "grad_norm": 0.9323937296867371, "learning_rate": 6.730393210993147e-06, "loss": 1.8647, "step": 1325 }, { "epoch": 0.21491956724340533, "grad_norm": 0.9704220294952393, "learning_rate": 6.654556471338746e-06, "loss": 1.9591, "step": 1326 }, { "epoch": 0.21508164836500668, "grad_norm": 0.9892690777778625, "learning_rate": 6.579134705000412e-06, "loss": 1.8956, "step": 1327 }, { "epoch": 0.21524372948660805, "grad_norm": 0.9901792407035828, "learning_rate": 6.504128247270546e-06, "loss": 1.9442, "step": 1328 }, { "epoch": 0.2154058106082094, "grad_norm": 1.0620304346084595, "learning_rate": 6.429537431595312e-06, "loss": 1.8508, "step": 1329 }, { "epoch": 0.21556789172981078, "grad_norm": 1.104321002960205, "learning_rate": 6.355362589573077e-06, "loss": 1.9958, "step": 1330 }, { "epoch": 0.21572997285141213, "grad_norm": 0.9812522530555725, "learning_rate": 6.2816040509530165e-06, "loss": 1.8696, "step": 1331 }, { "epoch": 0.2158920539730135, "grad_norm": 1.0644954442977905, "learning_rate": 6.2082621436335475e-06, "loss": 1.9394, "step": 1332 }, { "epoch": 0.21605413509461485, "grad_norm": 1.0426313877105713, "learning_rate": 6.135337193660962e-06, "loss": 1.8582, "step": 1333 }, { "epoch": 0.21621621621621623, "grad_norm": 1.0715755224227905, "learning_rate": 6.062829525227909e-06, "loss": 1.77, "step": 1334 }, { "epoch": 0.21637829733781758, "grad_norm": 1.1294608116149902, "learning_rate": 5.990739460672024e-06, "loss": 2.0644, "step": 1335 }, { "epoch": 0.21654037845941895, "grad_norm": 1.0986454486846924, "learning_rate": 5.9190673204744255e-06, "loss": 1.8724, "step": 1336 }, { "epoch": 0.2167024595810203, "grad_norm": 1.1359504461288452, "learning_rate": 5.84781342325833e-06, "loss": 1.9936, "step": 1337 }, { "epoch": 0.21686454070262165, "grad_norm": 1.2386897802352905, "learning_rate": 5.77697808578761e-06, "loss": 2.19, "step": 1338 }, { "epoch": 0.21702662182422303, "grad_norm": 1.248641848564148, "learning_rate": 5.706561622965467e-06, "loss": 2.2664, "step": 1339 }, { "epoch": 0.21718870294582437, "grad_norm": 1.2756218910217285, "learning_rate": 5.636564347832907e-06, "loss": 2.3229, "step": 1340 }, { "epoch": 0.21735078406742575, "grad_norm": 1.3369107246398926, "learning_rate": 5.566986571567401e-06, "loss": 2.2825, "step": 1341 }, { "epoch": 0.2175128651890271, "grad_norm": 1.4348537921905518, "learning_rate": 5.497828603481569e-06, "loss": 2.2434, "step": 1342 }, { "epoch": 0.21767494631062848, "grad_norm": 1.470051646232605, "learning_rate": 5.429090751021704e-06, "loss": 2.3144, "step": 1343 }, { "epoch": 0.21783702743222982, "grad_norm": 1.5419750213623047, "learning_rate": 5.3607733197664436e-06, "loss": 2.3097, "step": 1344 }, { "epoch": 0.2179991085538312, "grad_norm": 1.6469056606292725, "learning_rate": 5.2928766134254345e-06, "loss": 2.2236, "step": 1345 }, { "epoch": 0.21816118967543255, "grad_norm": 1.724802851676941, "learning_rate": 5.225400933837954e-06, "loss": 2.2516, "step": 1346 }, { "epoch": 0.21832327079703392, "grad_norm": 1.8932164907455444, "learning_rate": 5.158346580971573e-06, "loss": 2.4412, "step": 1347 }, { "epoch": 0.21848535191863527, "grad_norm": 2.1076700687408447, "learning_rate": 5.091713852920854e-06, "loss": 2.5295, "step": 1348 }, { "epoch": 0.21864743304023665, "grad_norm": 2.541060209274292, "learning_rate": 5.025503045905933e-06, "loss": 2.4928, "step": 1349 }, { "epoch": 0.218809514161838, "grad_norm": 3.9358601570129395, "learning_rate": 4.959714454271369e-06, "loss": 3.1011, "step": 1350 }, { "epoch": 0.21897159528343937, "grad_norm": 0.7298935055732727, "learning_rate": 4.8943483704846475e-06, "loss": 2.1909, "step": 1351 }, { "epoch": 0.21913367640504072, "grad_norm": 0.7388738989830017, "learning_rate": 4.829405085134997e-06, "loss": 2.1498, "step": 1352 }, { "epoch": 0.21929575752664207, "grad_norm": 0.7728161811828613, "learning_rate": 4.764884886932086e-06, "loss": 1.8913, "step": 1353 }, { "epoch": 0.21945783864824345, "grad_norm": 0.7349847555160522, "learning_rate": 4.700788062704687e-06, "loss": 1.8675, "step": 1354 }, { "epoch": 0.2196199197698448, "grad_norm": 0.76107257604599, "learning_rate": 4.6371148973994525e-06, "loss": 1.8459, "step": 1355 }, { "epoch": 0.21978200089144617, "grad_norm": 0.8307090401649475, "learning_rate": 4.573865674079625e-06, "loss": 1.8955, "step": 1356 }, { "epoch": 0.21994408201304752, "grad_norm": 0.789225697517395, "learning_rate": 4.511040673923828e-06, "loss": 1.9572, "step": 1357 }, { "epoch": 0.2201061631346489, "grad_norm": 0.804252564907074, "learning_rate": 4.448640176224694e-06, "loss": 1.8684, "step": 1358 }, { "epoch": 0.22026824425625025, "grad_norm": 0.8471936583518982, "learning_rate": 4.386664458387779e-06, "loss": 1.9508, "step": 1359 }, { "epoch": 0.22043032537785162, "grad_norm": 0.7995375990867615, "learning_rate": 4.325113795930203e-06, "loss": 1.7705, "step": 1360 }, { "epoch": 0.22059240649945297, "grad_norm": 0.8100654482841492, "learning_rate": 4.263988462479484e-06, "loss": 1.8373, "step": 1361 }, { "epoch": 0.22075448762105435, "grad_norm": 0.8469244837760925, "learning_rate": 4.203288729772326e-06, "loss": 1.7919, "step": 1362 }, { "epoch": 0.2209165687426557, "grad_norm": 0.8349213004112244, "learning_rate": 4.143014867653383e-06, "loss": 1.7147, "step": 1363 }, { "epoch": 0.22107864986425707, "grad_norm": 0.849981963634491, "learning_rate": 4.083167144074073e-06, "loss": 1.7979, "step": 1364 }, { "epoch": 0.22124073098585842, "grad_norm": 0.8309417963027954, "learning_rate": 4.023745825091407e-06, "loss": 1.772, "step": 1365 }, { "epoch": 0.2214028121074598, "grad_norm": 0.8737801313400269, "learning_rate": 3.964751174866765e-06, "loss": 1.8053, "step": 1366 }, { "epoch": 0.22156489322906114, "grad_norm": 0.8240771889686584, "learning_rate": 3.906183455664725e-06, "loss": 1.7799, "step": 1367 }, { "epoch": 0.22172697435066252, "grad_norm": 0.9528577923774719, "learning_rate": 3.84804292785198e-06, "loss": 1.8729, "step": 1368 }, { "epoch": 0.22188905547226387, "grad_norm": 0.8988291025161743, "learning_rate": 3.7903298498960572e-06, "loss": 1.8288, "step": 1369 }, { "epoch": 0.22205113659386522, "grad_norm": 0.8863739967346191, "learning_rate": 3.7330444783642338e-06, "loss": 1.8893, "step": 1370 }, { "epoch": 0.2222132177154666, "grad_norm": 0.8911713361740112, "learning_rate": 3.676187067922421e-06, "loss": 1.7313, "step": 1371 }, { "epoch": 0.22237529883706794, "grad_norm": 0.9464294910430908, "learning_rate": 3.619757871333973e-06, "loss": 1.8523, "step": 1372 }, { "epoch": 0.22253737995866932, "grad_norm": 0.9406479001045227, "learning_rate": 3.563757139458579e-06, "loss": 1.8836, "step": 1373 }, { "epoch": 0.22269946108027067, "grad_norm": 0.9911094307899475, "learning_rate": 3.5081851212512175e-06, "loss": 1.8432, "step": 1374 }, { "epoch": 0.22286154220187204, "grad_norm": 1.0339909791946411, "learning_rate": 3.4530420637609363e-06, "loss": 1.9514, "step": 1375 }, { "epoch": 0.2230236233234734, "grad_norm": 0.9484648704528809, "learning_rate": 3.3983282121298086e-06, "loss": 1.8593, "step": 1376 }, { "epoch": 0.22318570444507477, "grad_norm": 0.9661772847175598, "learning_rate": 3.3440438095919126e-06, "loss": 1.8627, "step": 1377 }, { "epoch": 0.22334778556667612, "grad_norm": 1.0593996047973633, "learning_rate": 3.290189097472096e-06, "loss": 1.8467, "step": 1378 }, { "epoch": 0.2235098666882775, "grad_norm": 0.9625633358955383, "learning_rate": 3.236764315185037e-06, "loss": 1.8541, "step": 1379 }, { "epoch": 0.22367194780987884, "grad_norm": 1.0560359954833984, "learning_rate": 3.1837697002341293e-06, "loss": 2.0551, "step": 1380 }, { "epoch": 0.22383402893148022, "grad_norm": 1.039581537246704, "learning_rate": 3.131205488210409e-06, "loss": 1.9046, "step": 1381 }, { "epoch": 0.22399611005308157, "grad_norm": 1.0384910106658936, "learning_rate": 3.0790719127915646e-06, "loss": 1.9562, "step": 1382 }, { "epoch": 0.22415819117468294, "grad_norm": 1.051658034324646, "learning_rate": 3.0273692057408265e-06, "loss": 1.9033, "step": 1383 }, { "epoch": 0.2243202722962843, "grad_norm": 1.0968042612075806, "learning_rate": 2.976097596905969e-06, "loss": 1.9182, "step": 1384 }, { "epoch": 0.22448235341788564, "grad_norm": 1.172212839126587, "learning_rate": 2.9252573142183326e-06, "loss": 1.9159, "step": 1385 }, { "epoch": 0.22464443453948701, "grad_norm": 1.1665847301483154, "learning_rate": 2.874848583691714e-06, "loss": 2.0066, "step": 1386 }, { "epoch": 0.22480651566108836, "grad_norm": 1.1886248588562012, "learning_rate": 2.8248716294214774e-06, "loss": 2.0135, "step": 1387 }, { "epoch": 0.22496859678268974, "grad_norm": 1.1656005382537842, "learning_rate": 2.7753266735834338e-06, "loss": 1.9567, "step": 1388 }, { "epoch": 0.2251306779042911, "grad_norm": 1.3580970764160156, "learning_rate": 2.7262139364329643e-06, "loss": 2.1888, "step": 1389 }, { "epoch": 0.22529275902589246, "grad_norm": 1.1902859210968018, "learning_rate": 2.677533636303964e-06, "loss": 2.2161, "step": 1390 }, { "epoch": 0.2254548401474938, "grad_norm": 1.2612886428833008, "learning_rate": 2.6292859896079213e-06, "loss": 2.1151, "step": 1391 }, { "epoch": 0.2256169212690952, "grad_norm": 1.338210940361023, "learning_rate": 2.581471210832931e-06, "loss": 2.1983, "step": 1392 }, { "epoch": 0.22577900239069654, "grad_norm": 1.3849469423294067, "learning_rate": 2.5340895125427364e-06, "loss": 2.1864, "step": 1393 }, { "epoch": 0.2259410835122979, "grad_norm": 1.5104336738586426, "learning_rate": 2.4871411053757898e-06, "loss": 2.2415, "step": 1394 }, { "epoch": 0.22610316463389926, "grad_norm": 1.6676409244537354, "learning_rate": 2.440626198044327e-06, "loss": 2.306, "step": 1395 }, { "epoch": 0.22626524575550064, "grad_norm": 1.7252272367477417, "learning_rate": 2.394544997333437e-06, "loss": 2.3587, "step": 1396 }, { "epoch": 0.226427326877102, "grad_norm": 1.8143377304077148, "learning_rate": 2.3488977081001394e-06, "loss": 2.397, "step": 1397 }, { "epoch": 0.22658940799870336, "grad_norm": 2.177941083908081, "learning_rate": 2.3036845332724543e-06, "loss": 2.3312, "step": 1398 }, { "epoch": 0.2267514891203047, "grad_norm": 2.6730029582977295, "learning_rate": 2.2589056738485324e-06, "loss": 2.4882, "step": 1399 }, { "epoch": 0.2269135702419061, "grad_norm": 3.5114686489105225, "learning_rate": 2.2145613288957478e-06, "loss": 2.8407, "step": 1400 }, { "epoch": 0.22707565136350744, "grad_norm": 0.7327190041542053, "learning_rate": 2.170651695549786e-06, "loss": 2.0815, "step": 1401 }, { "epoch": 0.22723773248510878, "grad_norm": 0.7992314100265503, "learning_rate": 2.1271769690138332e-06, "loss": 2.0828, "step": 1402 }, { "epoch": 0.22739981360671016, "grad_norm": 0.8525538444519043, "learning_rate": 2.084137342557646e-06, "loss": 1.8101, "step": 1403 }, { "epoch": 0.2275618947283115, "grad_norm": 0.7438859939575195, "learning_rate": 2.0415330075166937e-06, "loss": 1.8091, "step": 1404 }, { "epoch": 0.22772397584991289, "grad_norm": 0.8722172379493713, "learning_rate": 1.9993641532913833e-06, "loss": 1.7626, "step": 1405 }, { "epoch": 0.22788605697151423, "grad_norm": 0.7508912086486816, "learning_rate": 1.9576309673461357e-06, "loss": 1.8757, "step": 1406 }, { "epoch": 0.2280481380931156, "grad_norm": 0.8185262680053711, "learning_rate": 1.916333635208556e-06, "loss": 1.767, "step": 1407 }, { "epoch": 0.22821021921471696, "grad_norm": 0.8111989498138428, "learning_rate": 1.8754723404686425e-06, "loss": 1.9166, "step": 1408 }, { "epoch": 0.22837230033631833, "grad_norm": 0.8381231427192688, "learning_rate": 1.8350472647780116e-06, "loss": 1.9503, "step": 1409 }, { "epoch": 0.22853438145791968, "grad_norm": 0.8022873997688293, "learning_rate": 1.7950585878489856e-06, "loss": 1.7669, "step": 1410 }, { "epoch": 0.22869646257952106, "grad_norm": 0.8391844034194946, "learning_rate": 1.7555064874538397e-06, "loss": 1.7208, "step": 1411 }, { "epoch": 0.2288585437011224, "grad_norm": 0.8703323602676392, "learning_rate": 1.7163911394240672e-06, "loss": 1.9716, "step": 1412 }, { "epoch": 0.22902062482272378, "grad_norm": 0.8481823801994324, "learning_rate": 1.6777127176495043e-06, "loss": 1.8374, "step": 1413 }, { "epoch": 0.22918270594432513, "grad_norm": 0.8700101971626282, "learning_rate": 1.6394713940776296e-06, "loss": 1.8187, "step": 1414 }, { "epoch": 0.2293447870659265, "grad_norm": 0.8388266563415527, "learning_rate": 1.6016673387127646e-06, "loss": 1.808, "step": 1415 }, { "epoch": 0.22950686818752786, "grad_norm": 0.8589704632759094, "learning_rate": 1.5643007196153302e-06, "loss": 1.8178, "step": 1416 }, { "epoch": 0.2296689493091292, "grad_norm": 0.907537579536438, "learning_rate": 1.5273717029010925e-06, "loss": 1.9002, "step": 1417 }, { "epoch": 0.22983103043073058, "grad_norm": 0.8698744773864746, "learning_rate": 1.4908804527404286e-06, "loss": 1.8521, "step": 1418 }, { "epoch": 0.22999311155233193, "grad_norm": 0.931939423084259, "learning_rate": 1.4548271313575835e-06, "loss": 1.8762, "step": 1419 }, { "epoch": 0.2301551926739333, "grad_norm": 0.9257919192314148, "learning_rate": 1.4192118990299707e-06, "loss": 1.8341, "step": 1420 }, { "epoch": 0.23031727379553465, "grad_norm": 0.9721639752388, "learning_rate": 1.3840349140874619e-06, "loss": 1.8434, "step": 1421 }, { "epoch": 0.23047935491713603, "grad_norm": 0.9235073328018188, "learning_rate": 1.3492963329116537e-06, "loss": 1.7988, "step": 1422 }, { "epoch": 0.23064143603873738, "grad_norm": 0.9078055620193481, "learning_rate": 1.3149963099352014e-06, "loss": 1.9622, "step": 1423 }, { "epoch": 0.23080351716033876, "grad_norm": 0.934328019618988, "learning_rate": 1.2811349976411202e-06, "loss": 1.7909, "step": 1424 }, { "epoch": 0.2309655982819401, "grad_norm": 0.97508704662323, "learning_rate": 1.2477125465620853e-06, "loss": 1.9207, "step": 1425 }, { "epoch": 0.23112767940354148, "grad_norm": 1.02005934715271, "learning_rate": 1.2147291052798216e-06, "loss": 1.8721, "step": 1426 }, { "epoch": 0.23128976052514283, "grad_norm": 1.039638638496399, "learning_rate": 1.1821848204243814e-06, "loss": 1.8451, "step": 1427 }, { "epoch": 0.2314518416467442, "grad_norm": 1.014830470085144, "learning_rate": 1.1500798366735233e-06, "loss": 1.8414, "step": 1428 }, { "epoch": 0.23161392276834555, "grad_norm": 1.0277328491210938, "learning_rate": 1.1184142967520794e-06, "loss": 2.086, "step": 1429 }, { "epoch": 0.23177600388994693, "grad_norm": 1.05225670337677, "learning_rate": 1.0871883414312777e-06, "loss": 1.8629, "step": 1430 }, { "epoch": 0.23193808501154828, "grad_norm": 1.1301342248916626, "learning_rate": 1.0564021095281652e-06, "loss": 1.9919, "step": 1431 }, { "epoch": 0.23210016613314965, "grad_norm": 1.1018750667572021, "learning_rate": 1.0260557379049519e-06, "loss": 2.0352, "step": 1432 }, { "epoch": 0.232262247254751, "grad_norm": 1.0818148851394653, "learning_rate": 9.96149361468457e-07, "loss": 2.0119, "step": 1433 }, { "epoch": 0.23242432837635235, "grad_norm": 1.124988317489624, "learning_rate": 9.66683113169431e-07, "loss": 1.9938, "step": 1434 }, { "epoch": 0.23258640949795373, "grad_norm": 1.0845988988876343, "learning_rate": 9.376571240020227e-07, "loss": 1.8687, "step": 1435 }, { "epoch": 0.23274849061955508, "grad_norm": 1.131189227104187, "learning_rate": 9.090715230031688e-07, "loss": 1.9733, "step": 1436 }, { "epoch": 0.23291057174115645, "grad_norm": 1.1639673709869385, "learning_rate": 8.809264372520609e-07, "loss": 2.0066, "step": 1437 }, { "epoch": 0.2330726528627578, "grad_norm": 1.237915277481079, "learning_rate": 8.532219918695128e-07, "loss": 2.1668, "step": 1438 }, { "epoch": 0.23323473398435918, "grad_norm": 1.2396847009658813, "learning_rate": 8.259583100174606e-07, "loss": 2.1294, "step": 1439 }, { "epoch": 0.23339681510596053, "grad_norm": 1.2867481708526611, "learning_rate": 7.991355128984079e-07, "loss": 2.2187, "step": 1440 }, { "epoch": 0.2335588962275619, "grad_norm": 1.3112552165985107, "learning_rate": 7.727537197548707e-07, "loss": 2.2893, "step": 1441 }, { "epoch": 0.23372097734916325, "grad_norm": 1.3056278228759766, "learning_rate": 7.468130478688218e-07, "loss": 2.3824, "step": 1442 }, { "epoch": 0.23388305847076463, "grad_norm": 1.3893333673477173, "learning_rate": 7.213136125612586e-07, "loss": 2.3867, "step": 1443 }, { "epoch": 0.23404513959236597, "grad_norm": 1.5518338680267334, "learning_rate": 6.962555271915805e-07, "loss": 2.3255, "step": 1444 }, { "epoch": 0.23420722071396735, "grad_norm": 1.693285346031189, "learning_rate": 6.716389031571568e-07, "loss": 2.504, "step": 1445 }, { "epoch": 0.2343693018355687, "grad_norm": 1.7149142026901245, "learning_rate": 6.474638498928265e-07, "loss": 2.2711, "step": 1446 }, { "epoch": 0.23453138295717008, "grad_norm": 1.8721922636032104, "learning_rate": 6.237304748703543e-07, "loss": 2.427, "step": 1447 }, { "epoch": 0.23469346407877142, "grad_norm": 2.229055643081665, "learning_rate": 6.004388835980423e-07, "loss": 2.6288, "step": 1448 }, { "epoch": 0.23485554520037277, "grad_norm": 2.595604181289673, "learning_rate": 5.77589179620186e-07, "loss": 2.707, "step": 1449 }, { "epoch": 0.23501762632197415, "grad_norm": 3.945676565170288, "learning_rate": 5.55181464516652e-07, "loss": 3.1348, "step": 1450 }, { "epoch": 0.2351797074435755, "grad_norm": 0.6998302936553955, "learning_rate": 5.332158379024122e-07, "loss": 2.003, "step": 1451 }, { "epoch": 0.23534178856517687, "grad_norm": 0.6811542510986328, "learning_rate": 5.116923974270993e-07, "loss": 1.9607, "step": 1452 }, { "epoch": 0.23550386968677822, "grad_norm": 0.7369147539138794, "learning_rate": 4.906112387745965e-07, "loss": 1.6512, "step": 1453 }, { "epoch": 0.2356659508083796, "grad_norm": 0.701159656047821, "learning_rate": 4.6997245566257064e-07, "loss": 1.7235, "step": 1454 }, { "epoch": 0.23582803192998095, "grad_norm": 0.7372255325317383, "learning_rate": 4.497761398421063e-07, "loss": 1.6099, "step": 1455 }, { "epoch": 0.23599011305158232, "grad_norm": 0.8197316527366638, "learning_rate": 4.3002238109723927e-07, "loss": 1.8559, "step": 1456 }, { "epoch": 0.23615219417318367, "grad_norm": 0.787327229976654, "learning_rate": 4.107112672446123e-07, "loss": 1.7721, "step": 1457 }, { "epoch": 0.23631427529478505, "grad_norm": 0.8572624325752258, "learning_rate": 3.9184288413306456e-07, "loss": 1.7648, "step": 1458 }, { "epoch": 0.2364763564163864, "grad_norm": 0.8201079368591309, "learning_rate": 3.734173156432208e-07, "loss": 1.8707, "step": 1459 }, { "epoch": 0.23663843753798777, "grad_norm": 0.8340595364570618, "learning_rate": 3.554346436871581e-07, "loss": 1.9303, "step": 1460 }, { "epoch": 0.23680051865958912, "grad_norm": 0.8604863882064819, "learning_rate": 3.3789494820803957e-07, "loss": 1.9486, "step": 1461 }, { "epoch": 0.2369625997811905, "grad_norm": 0.8875858783721924, "learning_rate": 3.2079830717972606e-07, "loss": 1.9468, "step": 1462 }, { "epoch": 0.23712468090279185, "grad_norm": 0.9001452922821045, "learning_rate": 3.041447966064648e-07, "loss": 1.9463, "step": 1463 }, { "epoch": 0.23728676202439322, "grad_norm": 0.8194577693939209, "learning_rate": 2.8793449052254563e-07, "loss": 1.8026, "step": 1464 }, { "epoch": 0.23744884314599457, "grad_norm": 0.8343631625175476, "learning_rate": 2.721674609919345e-07, "loss": 1.7982, "step": 1465 }, { "epoch": 0.23761092426759592, "grad_norm": 0.8533897995948792, "learning_rate": 2.568437781080069e-07, "loss": 1.6482, "step": 1466 }, { "epoch": 0.2377730053891973, "grad_norm": 0.8781429529190063, "learning_rate": 2.4196350999320384e-07, "loss": 1.8709, "step": 1467 }, { "epoch": 0.23793508651079864, "grad_norm": 0.9017147421836853, "learning_rate": 2.275267227987321e-07, "loss": 1.8592, "step": 1468 }, { "epoch": 0.23809716763240002, "grad_norm": 0.8879401087760925, "learning_rate": 2.135334807042866e-07, "loss": 1.7503, "step": 1469 }, { "epoch": 0.23825924875400137, "grad_norm": 0.9443486928939819, "learning_rate": 1.9998384591773944e-07, "loss": 1.8992, "step": 1470 }, { "epoch": 0.23842132987560274, "grad_norm": 0.9002135396003723, "learning_rate": 1.8687787867489592e-07, "loss": 1.9903, "step": 1471 }, { "epoch": 0.2385834109972041, "grad_norm": 0.9264406561851501, "learning_rate": 1.7421563723919454e-07, "loss": 1.7793, "step": 1472 }, { "epoch": 0.23874549211880547, "grad_norm": 0.9791941046714783, "learning_rate": 1.6199717790145174e-07, "loss": 1.8952, "step": 1473 }, { "epoch": 0.23890757324040682, "grad_norm": 0.9434860944747925, "learning_rate": 1.5022255497962879e-07, "loss": 1.9194, "step": 1474 }, { "epoch": 0.2390696543620082, "grad_norm": 0.9502322673797607, "learning_rate": 1.3889182081860962e-07, "loss": 1.7819, "step": 1475 }, { "epoch": 0.23923173548360954, "grad_norm": 1.0040006637573242, "learning_rate": 1.2800502578991235e-07, "loss": 1.8933, "step": 1476 }, { "epoch": 0.23939381660521092, "grad_norm": 0.9272596836090088, "learning_rate": 1.1756221829148928e-07, "loss": 1.7617, "step": 1477 }, { "epoch": 0.23955589772681227, "grad_norm": 0.9766111969947815, "learning_rate": 1.0756344474753821e-07, "loss": 1.9032, "step": 1478 }, { "epoch": 0.23971797884841364, "grad_norm": 0.9846522808074951, "learning_rate": 9.800874960826933e-08, "loss": 1.8618, "step": 1479 }, { "epoch": 0.239880059970015, "grad_norm": 1.004815936088562, "learning_rate": 8.889817534969425e-08, "loss": 1.8648, "step": 1480 }, { "epoch": 0.24004214109161637, "grad_norm": 1.0023273229599, "learning_rate": 8.023176247348163e-08, "loss": 1.7953, "step": 1481 }, { "epoch": 0.24020422221321772, "grad_norm": 1.0628821849822998, "learning_rate": 7.200954950673522e-08, "loss": 2.0303, "step": 1482 }, { "epoch": 0.24036630333481906, "grad_norm": 1.106459379196167, "learning_rate": 6.423157300184946e-08, "loss": 2.13, "step": 1483 }, { "epoch": 0.24052838445642044, "grad_norm": 1.1170806884765625, "learning_rate": 5.6897867536331864e-08, "loss": 1.9977, "step": 1484 }, { "epoch": 0.2406904655780218, "grad_norm": 1.1586053371429443, "learning_rate": 5.000846571264761e-08, "loss": 2.0204, "step": 1485 }, { "epoch": 0.24085254669962317, "grad_norm": 1.1813981533050537, "learning_rate": 4.35633981580974e-08, "loss": 2.0133, "step": 1486 }, { "epoch": 0.24101462782122451, "grad_norm": 1.2310514450073242, "learning_rate": 3.756269352462871e-08, "loss": 2.2341, "step": 1487 }, { "epoch": 0.2411767089428259, "grad_norm": 1.1903209686279297, "learning_rate": 3.20063784888025e-08, "loss": 2.0273, "step": 1488 }, { "epoch": 0.24133879006442724, "grad_norm": 1.2221616506576538, "learning_rate": 2.6894477751548964e-08, "loss": 2.0331, "step": 1489 }, { "epoch": 0.24150087118602862, "grad_norm": 1.2321624755859375, "learning_rate": 2.222701403818972e-08, "loss": 2.1394, "step": 1490 }, { "epoch": 0.24166295230762996, "grad_norm": 1.2911142110824585, "learning_rate": 1.8004008098226887e-08, "loss": 2.1829, "step": 1491 }, { "epoch": 0.24182503342923134, "grad_norm": 1.4158598184585571, "learning_rate": 1.4225478705309769e-08, "loss": 2.4087, "step": 1492 }, { "epoch": 0.2419871145508327, "grad_norm": 1.4650459289550781, "learning_rate": 1.0891442657134932e-08, "loss": 2.2821, "step": 1493 }, { "epoch": 0.24214919567243406, "grad_norm": 1.5311766862869263, "learning_rate": 8.001914775401798e-09, "loss": 2.4504, "step": 1494 }, { "epoch": 0.2423112767940354, "grad_norm": 1.5992512702941895, "learning_rate": 5.5569079056794206e-09, "loss": 2.2785, "step": 1495 }, { "epoch": 0.2424733579156368, "grad_norm": 1.7037650346755981, "learning_rate": 3.5564329174064824e-09, "loss": 2.3511, "step": 1496 }, { "epoch": 0.24263543903723814, "grad_norm": 1.8804768323898315, "learning_rate": 2.0004987038246824e-09, "loss": 2.5542, "step": 1497 }, { "epoch": 0.24279752015883949, "grad_norm": 2.285545587539673, "learning_rate": 8.891121819565306e-10, "loss": 2.5937, "step": 1498 }, { "epoch": 0.24295960128044086, "grad_norm": 2.466541290283203, "learning_rate": 2.2227829252763344e-10, "loss": 2.5657, "step": 1499 }, { "epoch": 0.2431216824020422, "grad_norm": 3.7520334720611572, "learning_rate": 0.0, "loss": 3.0612, "step": 1500 }, { "epoch": 0.2431216824020422, "eval_loss": 2.0089547634124756, "eval_runtime": 613.2328, "eval_samples_per_second": 16.946, "eval_steps_per_second": 8.473, "step": 1500 } ], "logging_steps": 1, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 375, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2867741514819174e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }