{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007496251874062968, "grad_norm": 0.11279210702086269, "learning_rate": 4.987531172069825e-07, "loss": 0.0283, "step": 1 }, { "epoch": 0.0014992503748125937, "grad_norm": 0.0708025290604418, "learning_rate": 9.97506234413965e-07, "loss": 0.018, "step": 2 }, { "epoch": 0.0022488755622188904, "grad_norm": 0.08204012806077762, "learning_rate": 1.4962593516209476e-06, "loss": 0.0343, "step": 3 }, { "epoch": 0.0029985007496251873, "grad_norm": 0.1887406661276333, "learning_rate": 1.99501246882793e-06, "loss": 0.0264, "step": 4 }, { "epoch": 0.0037481259370314842, "grad_norm": 0.1702003667682934, "learning_rate": 2.493765586034913e-06, "loss": 0.0817, "step": 5 }, { "epoch": 0.004497751124437781, "grad_norm": 0.08633491698201652, "learning_rate": 2.992518703241895e-06, "loss": 0.0192, "step": 6 }, { "epoch": 0.005247376311844078, "grad_norm": 0.06668682918073623, "learning_rate": 3.4912718204488777e-06, "loss": 0.0269, "step": 7 }, { "epoch": 0.005997001499250375, "grad_norm": 0.0949644930191698, "learning_rate": 3.99002493765586e-06, "loss": 0.0214, "step": 8 }, { "epoch": 0.006746626686656672, "grad_norm": 0.05487535781427025, "learning_rate": 4.488778054862843e-06, "loss": 0.0159, "step": 9 }, { "epoch": 0.0074962518740629685, "grad_norm": 0.17217701524900897, "learning_rate": 4.987531172069826e-06, "loss": 0.0607, "step": 10 }, { "epoch": 0.008245877061469266, "grad_norm": 0.20182278003961085, "learning_rate": 5.486284289276808e-06, "loss": 0.0539, "step": 11 }, { "epoch": 0.008995502248875561, "grad_norm": 0.1312564555806688, "learning_rate": 5.98503740648379e-06, "loss": 0.0329, "step": 12 }, { "epoch": 0.009745127436281859, "grad_norm": 0.09508077532454609, "learning_rate": 6.483790523690773e-06, "loss": 0.1668, "step": 13 }, { "epoch": 0.010494752623688156, "grad_norm": 0.05902506464710472, "learning_rate": 6.982543640897755e-06, "loss": 0.0215, "step": 14 }, { "epoch": 0.011244377811094454, "grad_norm": 0.057580643912089995, "learning_rate": 7.481296758104739e-06, "loss": 0.0266, "step": 15 }, { "epoch": 0.01199400299850075, "grad_norm": 0.04716677617704238, "learning_rate": 7.98004987531172e-06, "loss": 0.0215, "step": 16 }, { "epoch": 0.012743628185907047, "grad_norm": 0.0795864015547478, "learning_rate": 8.478802992518704e-06, "loss": 0.0173, "step": 17 }, { "epoch": 0.013493253373313344, "grad_norm": 0.07671642061904087, "learning_rate": 8.977556109725686e-06, "loss": 0.0224, "step": 18 }, { "epoch": 0.01424287856071964, "grad_norm": 0.06337940392694962, "learning_rate": 9.476309226932668e-06, "loss": 0.0162, "step": 19 }, { "epoch": 0.014992503748125937, "grad_norm": 0.15247751367927734, "learning_rate": 9.975062344139652e-06, "loss": 0.0459, "step": 20 }, { "epoch": 0.015742128935532233, "grad_norm": 0.09470608329189008, "learning_rate": 1.0473815461346634e-05, "loss": 0.0076, "step": 21 }, { "epoch": 0.01649175412293853, "grad_norm": 0.06575899186194488, "learning_rate": 1.0972568578553616e-05, "loss": 0.0192, "step": 22 }, { "epoch": 0.017241379310344827, "grad_norm": 0.05557891385996695, "learning_rate": 1.1471321695760599e-05, "loss": 0.0163, "step": 23 }, { "epoch": 0.017991004497751123, "grad_norm": 0.07023715922511192, "learning_rate": 1.197007481296758e-05, "loss": 0.0299, "step": 24 }, { "epoch": 0.018740629685157422, "grad_norm": 0.05840741377060999, "learning_rate": 1.2468827930174564e-05, "loss": 0.0279, "step": 25 }, { "epoch": 0.019490254872563718, "grad_norm": 0.09879994131870222, "learning_rate": 1.2967581047381547e-05, "loss": 0.0417, "step": 26 }, { "epoch": 0.020239880059970013, "grad_norm": 0.09422607763934358, "learning_rate": 1.3466334164588529e-05, "loss": 0.0233, "step": 27 }, { "epoch": 0.020989505247376312, "grad_norm": 0.08381440041433495, "learning_rate": 1.396508728179551e-05, "loss": 0.0384, "step": 28 }, { "epoch": 0.021739130434782608, "grad_norm": 0.06866246125337262, "learning_rate": 1.4463840399002496e-05, "loss": 0.022, "step": 29 }, { "epoch": 0.022488755622188907, "grad_norm": 0.09579356805805937, "learning_rate": 1.4962593516209478e-05, "loss": 0.029, "step": 30 }, { "epoch": 0.023238380809595203, "grad_norm": 0.10676598627834114, "learning_rate": 1.546134663341646e-05, "loss": 0.0338, "step": 31 }, { "epoch": 0.0239880059970015, "grad_norm": 0.05483831376969332, "learning_rate": 1.596009975062344e-05, "loss": 0.017, "step": 32 }, { "epoch": 0.024737631184407798, "grad_norm": 0.08692197588794064, "learning_rate": 1.6458852867830423e-05, "loss": 0.0264, "step": 33 }, { "epoch": 0.025487256371814093, "grad_norm": 0.0925341267319359, "learning_rate": 1.695760598503741e-05, "loss": 0.0419, "step": 34 }, { "epoch": 0.02623688155922039, "grad_norm": 0.0671445291722069, "learning_rate": 1.745635910224439e-05, "loss": 0.0164, "step": 35 }, { "epoch": 0.026986506746626688, "grad_norm": 0.16026579884927714, "learning_rate": 1.7955112219451373e-05, "loss": 0.0221, "step": 36 }, { "epoch": 0.027736131934032984, "grad_norm": 0.18362459813157384, "learning_rate": 1.8453865336658355e-05, "loss": 0.0489, "step": 37 }, { "epoch": 0.02848575712143928, "grad_norm": 0.06518981670839212, "learning_rate": 1.8952618453865337e-05, "loss": 0.0174, "step": 38 }, { "epoch": 0.02923538230884558, "grad_norm": 0.06889355143618404, "learning_rate": 1.945137157107232e-05, "loss": 0.0191, "step": 39 }, { "epoch": 0.029985007496251874, "grad_norm": 0.21562757118442988, "learning_rate": 1.9950124688279304e-05, "loss": 0.0299, "step": 40 }, { "epoch": 0.03073463268365817, "grad_norm": 0.14245617300583052, "learning_rate": 2.0448877805486287e-05, "loss": 0.0442, "step": 41 }, { "epoch": 0.031484257871064465, "grad_norm": 0.09411252796667173, "learning_rate": 2.094763092269327e-05, "loss": 0.0319, "step": 42 }, { "epoch": 0.03223388305847077, "grad_norm": 0.08117541156848603, "learning_rate": 2.144638403990025e-05, "loss": 0.0225, "step": 43 }, { "epoch": 0.03298350824587706, "grad_norm": 0.08225245879431137, "learning_rate": 2.1945137157107233e-05, "loss": 0.0361, "step": 44 }, { "epoch": 0.03373313343328336, "grad_norm": 0.07830505167470701, "learning_rate": 2.2443890274314215e-05, "loss": 0.0315, "step": 45 }, { "epoch": 0.034482758620689655, "grad_norm": 0.06700726880908887, "learning_rate": 2.2942643391521197e-05, "loss": 0.0153, "step": 46 }, { "epoch": 0.03523238380809595, "grad_norm": 0.4031671176539563, "learning_rate": 2.344139650872818e-05, "loss": 0.0246, "step": 47 }, { "epoch": 0.035982008995502246, "grad_norm": 0.09991732387986536, "learning_rate": 2.394014962593516e-05, "loss": 0.0346, "step": 48 }, { "epoch": 0.03673163418290855, "grad_norm": 0.07162079160721155, "learning_rate": 2.4438902743142143e-05, "loss": 0.0314, "step": 49 }, { "epoch": 0.037481259370314844, "grad_norm": 0.0592546887993547, "learning_rate": 2.493765586034913e-05, "loss": 0.0154, "step": 50 }, { "epoch": 0.03823088455772114, "grad_norm": 0.08479532758764814, "learning_rate": 2.5436408977556108e-05, "loss": 0.0511, "step": 51 }, { "epoch": 0.038980509745127435, "grad_norm": 0.10538449374560438, "learning_rate": 2.5935162094763093e-05, "loss": 0.0296, "step": 52 }, { "epoch": 0.03973013493253373, "grad_norm": 0.05703635747748183, "learning_rate": 2.643391521197008e-05, "loss": 0.0191, "step": 53 }, { "epoch": 0.04047976011994003, "grad_norm": 0.07272076563350424, "learning_rate": 2.6932668329177057e-05, "loss": 0.0289, "step": 54 }, { "epoch": 0.04122938530734633, "grad_norm": 0.06914294384159929, "learning_rate": 2.7431421446384043e-05, "loss": 0.0216, "step": 55 }, { "epoch": 0.041979010494752625, "grad_norm": 0.12458369744795132, "learning_rate": 2.793017456359102e-05, "loss": 0.0521, "step": 56 }, { "epoch": 0.04272863568215892, "grad_norm": 0.03137816907539504, "learning_rate": 2.8428927680798007e-05, "loss": 0.01, "step": 57 }, { "epoch": 0.043478260869565216, "grad_norm": 0.04739555854735861, "learning_rate": 2.8927680798004992e-05, "loss": 0.0086, "step": 58 }, { "epoch": 0.04422788605697151, "grad_norm": 0.10036103034066238, "learning_rate": 2.942643391521197e-05, "loss": 0.0878, "step": 59 }, { "epoch": 0.044977511244377814, "grad_norm": 0.06748868600841415, "learning_rate": 2.9925187032418957e-05, "loss": 0.0248, "step": 60 }, { "epoch": 0.04572713643178411, "grad_norm": 0.07502502472970331, "learning_rate": 3.0423940149625935e-05, "loss": 0.0255, "step": 61 }, { "epoch": 0.046476761619190406, "grad_norm": 0.07710892432454533, "learning_rate": 3.092269326683292e-05, "loss": 0.0263, "step": 62 }, { "epoch": 0.0472263868065967, "grad_norm": 0.10538603215724578, "learning_rate": 3.14214463840399e-05, "loss": 0.0325, "step": 63 }, { "epoch": 0.047976011994003, "grad_norm": 0.07735064418610153, "learning_rate": 3.192019950124688e-05, "loss": 0.0378, "step": 64 }, { "epoch": 0.04872563718140929, "grad_norm": 0.03295359845033927, "learning_rate": 3.241895261845387e-05, "loss": 0.0105, "step": 65 }, { "epoch": 0.049475262368815595, "grad_norm": 0.1474088544285246, "learning_rate": 3.2917705735660846e-05, "loss": 0.026, "step": 66 }, { "epoch": 0.05022488755622189, "grad_norm": 0.05921704611745389, "learning_rate": 3.341645885286783e-05, "loss": 0.0139, "step": 67 }, { "epoch": 0.050974512743628186, "grad_norm": 0.14129659505130113, "learning_rate": 3.391521197007482e-05, "loss": 0.0301, "step": 68 }, { "epoch": 0.05172413793103448, "grad_norm": 0.10626253201464791, "learning_rate": 3.4413965087281796e-05, "loss": 0.0479, "step": 69 }, { "epoch": 0.05247376311844078, "grad_norm": 0.07763289594003318, "learning_rate": 3.491271820448878e-05, "loss": 0.0106, "step": 70 }, { "epoch": 0.05322338830584707, "grad_norm": 0.0691839287861348, "learning_rate": 3.541147132169576e-05, "loss": 0.0271, "step": 71 }, { "epoch": 0.053973013493253376, "grad_norm": 0.07070200939648912, "learning_rate": 3.5910224438902745e-05, "loss": 0.0223, "step": 72 }, { "epoch": 0.05472263868065967, "grad_norm": 0.19778882861726957, "learning_rate": 3.640897755610973e-05, "loss": 0.0465, "step": 73 }, { "epoch": 0.05547226386806597, "grad_norm": 0.060353823216797546, "learning_rate": 3.690773067331671e-05, "loss": 0.0246, "step": 74 }, { "epoch": 0.05622188905547226, "grad_norm": 0.07075868858224835, "learning_rate": 3.7406483790523695e-05, "loss": 0.0209, "step": 75 }, { "epoch": 0.05697151424287856, "grad_norm": 0.08762014555675333, "learning_rate": 3.7905236907730674e-05, "loss": 0.0311, "step": 76 }, { "epoch": 0.05772113943028486, "grad_norm": 0.1326222205107144, "learning_rate": 3.840399002493766e-05, "loss": 0.0407, "step": 77 }, { "epoch": 0.05847076461769116, "grad_norm": 0.0952714563149364, "learning_rate": 3.890274314214464e-05, "loss": 0.0479, "step": 78 }, { "epoch": 0.05922038980509745, "grad_norm": 0.07813721903130615, "learning_rate": 3.9401496259351623e-05, "loss": 0.0194, "step": 79 }, { "epoch": 0.05997001499250375, "grad_norm": 0.03586574592365885, "learning_rate": 3.990024937655861e-05, "loss": 0.0113, "step": 80 }, { "epoch": 0.060719640179910044, "grad_norm": 0.05312807875353562, "learning_rate": 4.039900249376559e-05, "loss": 0.0207, "step": 81 }, { "epoch": 0.06146926536731634, "grad_norm": 0.08206370101583557, "learning_rate": 4.089775561097257e-05, "loss": 0.0213, "step": 82 }, { "epoch": 0.06221889055472264, "grad_norm": 0.057566972449176544, "learning_rate": 4.139650872817955e-05, "loss": 0.0182, "step": 83 }, { "epoch": 0.06296851574212893, "grad_norm": 0.08521467516485813, "learning_rate": 4.189526184538654e-05, "loss": 0.0241, "step": 84 }, { "epoch": 0.06371814092953523, "grad_norm": 0.03571392648309694, "learning_rate": 4.239401496259352e-05, "loss": 0.0161, "step": 85 }, { "epoch": 0.06446776611694154, "grad_norm": 0.1230559997444316, "learning_rate": 4.28927680798005e-05, "loss": 0.0351, "step": 86 }, { "epoch": 0.06521739130434782, "grad_norm": 0.12512964537023197, "learning_rate": 4.339152119700749e-05, "loss": 0.0665, "step": 87 }, { "epoch": 0.06596701649175413, "grad_norm": 0.12217119750622546, "learning_rate": 4.3890274314214466e-05, "loss": 0.0136, "step": 88 }, { "epoch": 0.06671664167916042, "grad_norm": 0.0433698380582678, "learning_rate": 4.438902743142145e-05, "loss": 0.0323, "step": 89 }, { "epoch": 0.06746626686656672, "grad_norm": 0.12633070169699792, "learning_rate": 4.488778054862843e-05, "loss": 0.0179, "step": 90 }, { "epoch": 0.068215892053973, "grad_norm": 0.04740228242710997, "learning_rate": 4.5386533665835415e-05, "loss": 0.0202, "step": 91 }, { "epoch": 0.06896551724137931, "grad_norm": 0.061212399209499495, "learning_rate": 4.5885286783042394e-05, "loss": 0.0256, "step": 92 }, { "epoch": 0.06971514242878561, "grad_norm": 0.05192083675767972, "learning_rate": 4.638403990024938e-05, "loss": 0.016, "step": 93 }, { "epoch": 0.0704647676161919, "grad_norm": 0.1183986051919429, "learning_rate": 4.688279301745636e-05, "loss": 0.0392, "step": 94 }, { "epoch": 0.0712143928035982, "grad_norm": 0.0841259080165031, "learning_rate": 4.7381546134663344e-05, "loss": 0.0276, "step": 95 }, { "epoch": 0.07196401799100449, "grad_norm": 0.03569960655258622, "learning_rate": 4.788029925187032e-05, "loss": 0.0095, "step": 96 }, { "epoch": 0.0727136431784108, "grad_norm": 0.11869332056036354, "learning_rate": 4.837905236907731e-05, "loss": 0.0246, "step": 97 }, { "epoch": 0.0734632683658171, "grad_norm": 0.1133992731680118, "learning_rate": 4.887780548628429e-05, "loss": 0.0455, "step": 98 }, { "epoch": 0.07421289355322339, "grad_norm": 0.05434940900593147, "learning_rate": 4.937655860349127e-05, "loss": 0.0209, "step": 99 }, { "epoch": 0.07496251874062969, "grad_norm": 0.05742963314221326, "learning_rate": 4.987531172069826e-05, "loss": 0.0179, "step": 100 }, { "epoch": 0.07571214392803598, "grad_norm": 0.07358933301031388, "learning_rate": 5.037406483790524e-05, "loss": 0.0325, "step": 101 }, { "epoch": 0.07646176911544228, "grad_norm": 0.14545582068026564, "learning_rate": 5.0872817955112215e-05, "loss": 0.0264, "step": 102 }, { "epoch": 0.07721139430284858, "grad_norm": 0.09413188759154308, "learning_rate": 5.13715710723192e-05, "loss": 0.0253, "step": 103 }, { "epoch": 0.07796101949025487, "grad_norm": 0.03963600360457223, "learning_rate": 5.1870324189526186e-05, "loss": 0.0127, "step": 104 }, { "epoch": 0.07871064467766117, "grad_norm": 0.07599629125236715, "learning_rate": 5.236907730673317e-05, "loss": 0.0251, "step": 105 }, { "epoch": 0.07946026986506746, "grad_norm": 0.04480432796665436, "learning_rate": 5.286783042394016e-05, "loss": 0.0163, "step": 106 }, { "epoch": 0.08020989505247376, "grad_norm": 0.08201871045223541, "learning_rate": 5.336658354114713e-05, "loss": 0.0251, "step": 107 }, { "epoch": 0.08095952023988005, "grad_norm": 0.13488552993210706, "learning_rate": 5.3865336658354115e-05, "loss": 0.0216, "step": 108 }, { "epoch": 0.08170914542728636, "grad_norm": 0.16982953183139035, "learning_rate": 5.43640897755611e-05, "loss": 0.0721, "step": 109 }, { "epoch": 0.08245877061469266, "grad_norm": 0.07036861440927666, "learning_rate": 5.4862842892768086e-05, "loss": 0.0195, "step": 110 }, { "epoch": 0.08320839580209895, "grad_norm": 0.10363789462535822, "learning_rate": 5.536159600997507e-05, "loss": 0.0362, "step": 111 }, { "epoch": 0.08395802098950525, "grad_norm": 0.1389694465437715, "learning_rate": 5.586034912718204e-05, "loss": 0.0402, "step": 112 }, { "epoch": 0.08470764617691154, "grad_norm": 0.037148370829441675, "learning_rate": 5.635910224438903e-05, "loss": 0.0105, "step": 113 }, { "epoch": 0.08545727136431784, "grad_norm": 0.15980843277146153, "learning_rate": 5.6857855361596014e-05, "loss": 0.0581, "step": 114 }, { "epoch": 0.08620689655172414, "grad_norm": 0.12164293405284633, "learning_rate": 5.7356608478803e-05, "loss": 0.0336, "step": 115 }, { "epoch": 0.08695652173913043, "grad_norm": 0.10837151443746265, "learning_rate": 5.7855361596009985e-05, "loss": 0.0503, "step": 116 }, { "epoch": 0.08770614692653673, "grad_norm": 0.1354782049348464, "learning_rate": 5.835411471321696e-05, "loss": 0.0274, "step": 117 }, { "epoch": 0.08845577211394302, "grad_norm": 0.06563788761938176, "learning_rate": 5.885286783042394e-05, "loss": 0.0272, "step": 118 }, { "epoch": 0.08920539730134933, "grad_norm": 0.07104946785041905, "learning_rate": 5.935162094763093e-05, "loss": 0.0229, "step": 119 }, { "epoch": 0.08995502248875563, "grad_norm": 0.16694741445858008, "learning_rate": 5.985037406483791e-05, "loss": 0.0372, "step": 120 }, { "epoch": 0.09070464767616192, "grad_norm": 0.07864447323446397, "learning_rate": 6.03491271820449e-05, "loss": 0.0286, "step": 121 }, { "epoch": 0.09145427286356822, "grad_norm": 0.1366701828054804, "learning_rate": 6.084788029925187e-05, "loss": 0.0231, "step": 122 }, { "epoch": 0.09220389805097451, "grad_norm": 0.1070346979738295, "learning_rate": 6.134663341645886e-05, "loss": 0.0653, "step": 123 }, { "epoch": 0.09295352323838081, "grad_norm": 0.06698803113978262, "learning_rate": 6.184538653366583e-05, "loss": 0.0292, "step": 124 }, { "epoch": 0.0937031484257871, "grad_norm": 0.07097618623732171, "learning_rate": 6.234413965087283e-05, "loss": 0.0283, "step": 125 }, { "epoch": 0.0944527736131934, "grad_norm": 0.042448624191001115, "learning_rate": 6.28428927680798e-05, "loss": 0.0086, "step": 126 }, { "epoch": 0.0952023988005997, "grad_norm": 0.08465946332106826, "learning_rate": 6.334164588528678e-05, "loss": 0.0279, "step": 127 }, { "epoch": 0.095952023988006, "grad_norm": 0.089216378819531, "learning_rate": 6.384039900249376e-05, "loss": 0.031, "step": 128 }, { "epoch": 0.0967016491754123, "grad_norm": 0.07334432224365899, "learning_rate": 6.433915211970076e-05, "loss": 0.0138, "step": 129 }, { "epoch": 0.09745127436281859, "grad_norm": 0.0647148548610915, "learning_rate": 6.483790523690773e-05, "loss": 0.009, "step": 130 }, { "epoch": 0.09820089955022489, "grad_norm": 0.058673312076161714, "learning_rate": 6.533665835411473e-05, "loss": 0.0205, "step": 131 }, { "epoch": 0.09895052473763119, "grad_norm": 0.11603287692005247, "learning_rate": 6.583541147132169e-05, "loss": 0.0437, "step": 132 }, { "epoch": 0.09970014992503748, "grad_norm": 0.21370135684699681, "learning_rate": 6.633416458852868e-05, "loss": 0.0269, "step": 133 }, { "epoch": 0.10044977511244378, "grad_norm": 0.15923649990512292, "learning_rate": 6.683291770573566e-05, "loss": 0.0316, "step": 134 }, { "epoch": 0.10119940029985007, "grad_norm": 0.03278981431531208, "learning_rate": 6.733167082294266e-05, "loss": 0.0094, "step": 135 }, { "epoch": 0.10194902548725637, "grad_norm": 0.15833124285542596, "learning_rate": 6.783042394014963e-05, "loss": 0.0437, "step": 136 }, { "epoch": 0.10269865067466268, "grad_norm": 0.12400180816439312, "learning_rate": 6.832917705735661e-05, "loss": 0.0366, "step": 137 }, { "epoch": 0.10344827586206896, "grad_norm": 0.07377117070595124, "learning_rate": 6.882793017456359e-05, "loss": 0.0123, "step": 138 }, { "epoch": 0.10419790104947527, "grad_norm": 0.07122232015831038, "learning_rate": 6.932668329177058e-05, "loss": 0.0142, "step": 139 }, { "epoch": 0.10494752623688156, "grad_norm": 0.0332242122691683, "learning_rate": 6.982543640897756e-05, "loss": 0.0081, "step": 140 }, { "epoch": 0.10569715142428786, "grad_norm": 0.08678818831164349, "learning_rate": 7.032418952618454e-05, "loss": 0.0201, "step": 141 }, { "epoch": 0.10644677661169415, "grad_norm": 0.10250483247896906, "learning_rate": 7.082294264339152e-05, "loss": 0.0328, "step": 142 }, { "epoch": 0.10719640179910045, "grad_norm": 0.21159615399298012, "learning_rate": 7.132169576059851e-05, "loss": 0.0515, "step": 143 }, { "epoch": 0.10794602698650675, "grad_norm": 0.10522703720494737, "learning_rate": 7.182044887780549e-05, "loss": 0.0387, "step": 144 }, { "epoch": 0.10869565217391304, "grad_norm": 0.11820893519071421, "learning_rate": 7.231920199501247e-05, "loss": 0.0257, "step": 145 }, { "epoch": 0.10944527736131934, "grad_norm": 0.13986913155043815, "learning_rate": 7.281795511221946e-05, "loss": 0.0151, "step": 146 }, { "epoch": 0.11019490254872563, "grad_norm": 0.0559111712109555, "learning_rate": 7.331670822942644e-05, "loss": 0.0118, "step": 147 }, { "epoch": 0.11094452773613193, "grad_norm": 0.0335500919451813, "learning_rate": 7.381546134663342e-05, "loss": 0.0086, "step": 148 }, { "epoch": 0.11169415292353824, "grad_norm": 0.11033605798184272, "learning_rate": 7.43142144638404e-05, "loss": 0.0478, "step": 149 }, { "epoch": 0.11244377811094453, "grad_norm": 0.214427015965264, "learning_rate": 7.481296758104739e-05, "loss": 0.0353, "step": 150 }, { "epoch": 0.11319340329835083, "grad_norm": 0.02568191244810021, "learning_rate": 7.531172069825437e-05, "loss": 0.0043, "step": 151 }, { "epoch": 0.11394302848575712, "grad_norm": 0.09246008396243098, "learning_rate": 7.581047381546135e-05, "loss": 0.0192, "step": 152 }, { "epoch": 0.11469265367316342, "grad_norm": 0.08351131161755415, "learning_rate": 7.630922693266833e-05, "loss": 0.0277, "step": 153 }, { "epoch": 0.11544227886056972, "grad_norm": 0.11694367937691498, "learning_rate": 7.680798004987532e-05, "loss": 0.0307, "step": 154 }, { "epoch": 0.11619190404797601, "grad_norm": 0.09838886537023853, "learning_rate": 7.73067331670823e-05, "loss": 0.0275, "step": 155 }, { "epoch": 0.11694152923538231, "grad_norm": 0.12195844474057815, "learning_rate": 7.780548628428928e-05, "loss": 0.0242, "step": 156 }, { "epoch": 0.1176911544227886, "grad_norm": 0.08775886644144641, "learning_rate": 7.830423940149625e-05, "loss": 0.0339, "step": 157 }, { "epoch": 0.1184407796101949, "grad_norm": 0.0960095760790364, "learning_rate": 7.880299251870325e-05, "loss": 0.0305, "step": 158 }, { "epoch": 0.1191904047976012, "grad_norm": 0.056421431695985014, "learning_rate": 7.930174563591023e-05, "loss": 0.0167, "step": 159 }, { "epoch": 0.1199400299850075, "grad_norm": 0.03780015107287049, "learning_rate": 7.980049875311722e-05, "loss": 0.0132, "step": 160 }, { "epoch": 0.1206896551724138, "grad_norm": 0.06643939369941203, "learning_rate": 8.029925187032418e-05, "loss": 0.0253, "step": 161 }, { "epoch": 0.12143928035982009, "grad_norm": 0.07834551801424494, "learning_rate": 8.079800498753118e-05, "loss": 0.0243, "step": 162 }, { "epoch": 0.12218890554722639, "grad_norm": 0.3428236638579461, "learning_rate": 8.129675810473815e-05, "loss": 0.0761, "step": 163 }, { "epoch": 0.12293853073463268, "grad_norm": 0.25562499213943013, "learning_rate": 8.179551122194515e-05, "loss": 0.0606, "step": 164 }, { "epoch": 0.12368815592203898, "grad_norm": 0.2200505423062946, "learning_rate": 8.229426433915212e-05, "loss": 0.0413, "step": 165 }, { "epoch": 0.12443778110944528, "grad_norm": 0.17323521016059398, "learning_rate": 8.27930174563591e-05, "loss": 0.065, "step": 166 }, { "epoch": 0.12518740629685157, "grad_norm": 0.12386450278132788, "learning_rate": 8.329177057356608e-05, "loss": 0.0465, "step": 167 }, { "epoch": 0.12593703148425786, "grad_norm": 0.06513383821449026, "learning_rate": 8.379052369077307e-05, "loss": 0.0235, "step": 168 }, { "epoch": 0.12668665667166418, "grad_norm": 0.0786581795256923, "learning_rate": 8.428927680798005e-05, "loss": 0.0169, "step": 169 }, { "epoch": 0.12743628185907047, "grad_norm": 0.09510857719264296, "learning_rate": 8.478802992518705e-05, "loss": 0.0328, "step": 170 }, { "epoch": 0.12818590704647675, "grad_norm": 0.06482973979218357, "learning_rate": 8.528678304239401e-05, "loss": 0.0201, "step": 171 }, { "epoch": 0.12893553223388307, "grad_norm": 0.06813364768120031, "learning_rate": 8.5785536159601e-05, "loss": 0.0223, "step": 172 }, { "epoch": 0.12968515742128936, "grad_norm": 0.11763835146175822, "learning_rate": 8.628428927680798e-05, "loss": 0.0178, "step": 173 }, { "epoch": 0.13043478260869565, "grad_norm": 0.07098697087543453, "learning_rate": 8.678304239401497e-05, "loss": 0.0186, "step": 174 }, { "epoch": 0.13118440779610194, "grad_norm": 0.06061152025510322, "learning_rate": 8.728179551122195e-05, "loss": 0.0271, "step": 175 }, { "epoch": 0.13193403298350825, "grad_norm": 0.10686088781724719, "learning_rate": 8.778054862842893e-05, "loss": 0.0354, "step": 176 }, { "epoch": 0.13268365817091454, "grad_norm": 0.04808984107806594, "learning_rate": 8.827930174563591e-05, "loss": 0.0159, "step": 177 }, { "epoch": 0.13343328335832083, "grad_norm": 0.08321421485706715, "learning_rate": 8.87780548628429e-05, "loss": 0.0113, "step": 178 }, { "epoch": 0.13418290854572715, "grad_norm": 0.06279494949837901, "learning_rate": 8.927680798004988e-05, "loss": 0.0266, "step": 179 }, { "epoch": 0.13493253373313344, "grad_norm": 0.047898422343989944, "learning_rate": 8.977556109725686e-05, "loss": 0.0126, "step": 180 }, { "epoch": 0.13568215892053972, "grad_norm": 0.07528619816418361, "learning_rate": 9.027431421446384e-05, "loss": 0.0166, "step": 181 }, { "epoch": 0.136431784107946, "grad_norm": 0.1377475203565619, "learning_rate": 9.077306733167083e-05, "loss": 0.052, "step": 182 }, { "epoch": 0.13718140929535233, "grad_norm": 0.09048176822457338, "learning_rate": 9.127182044887781e-05, "loss": 0.0285, "step": 183 }, { "epoch": 0.13793103448275862, "grad_norm": 0.0809521483858753, "learning_rate": 9.177057356608479e-05, "loss": 0.0141, "step": 184 }, { "epoch": 0.1386806596701649, "grad_norm": 0.09636075780960374, "learning_rate": 9.226932668329178e-05, "loss": 0.0336, "step": 185 }, { "epoch": 0.13943028485757122, "grad_norm": 0.13053296398291478, "learning_rate": 9.276807980049876e-05, "loss": 0.0397, "step": 186 }, { "epoch": 0.1401799100449775, "grad_norm": 0.07863031021793394, "learning_rate": 9.326683291770574e-05, "loss": 0.0162, "step": 187 }, { "epoch": 0.1409295352323838, "grad_norm": 0.1228536793457434, "learning_rate": 9.376558603491272e-05, "loss": 0.044, "step": 188 }, { "epoch": 0.14167916041979012, "grad_norm": 0.09209027498232235, "learning_rate": 9.426433915211971e-05, "loss": 0.0334, "step": 189 }, { "epoch": 0.1424287856071964, "grad_norm": 0.14190775672822614, "learning_rate": 9.476309226932669e-05, "loss": 0.042, "step": 190 }, { "epoch": 0.1431784107946027, "grad_norm": 0.04023312981224354, "learning_rate": 9.526184538653367e-05, "loss": 0.0142, "step": 191 }, { "epoch": 0.14392803598200898, "grad_norm": 0.09583290305937744, "learning_rate": 9.576059850374065e-05, "loss": 0.0385, "step": 192 }, { "epoch": 0.1446776611694153, "grad_norm": 0.21954528231449671, "learning_rate": 9.625935162094764e-05, "loss": 0.1279, "step": 193 }, { "epoch": 0.1454272863568216, "grad_norm": 0.059750365210102005, "learning_rate": 9.675810473815462e-05, "loss": 0.0237, "step": 194 }, { "epoch": 0.14617691154422788, "grad_norm": 0.07514131332440416, "learning_rate": 9.725685785536161e-05, "loss": 0.0283, "step": 195 }, { "epoch": 0.1469265367316342, "grad_norm": 0.29649323772724173, "learning_rate": 9.775561097256857e-05, "loss": 0.0509, "step": 196 }, { "epoch": 0.14767616191904048, "grad_norm": 0.11218042264432664, "learning_rate": 9.825436408977557e-05, "loss": 0.0416, "step": 197 }, { "epoch": 0.14842578710644677, "grad_norm": 0.09965160873863997, "learning_rate": 9.875311720698254e-05, "loss": 0.0274, "step": 198 }, { "epoch": 0.14917541229385306, "grad_norm": 0.03997333482371075, "learning_rate": 9.925187032418954e-05, "loss": 0.01, "step": 199 }, { "epoch": 0.14992503748125938, "grad_norm": 0.07793966938286397, "learning_rate": 9.975062344139652e-05, "loss": 0.0179, "step": 200 }, { "epoch": 0.15067466266866567, "grad_norm": 0.05587033266697241, "learning_rate": 0.0001002493765586035, "loss": 0.0116, "step": 201 }, { "epoch": 0.15142428785607195, "grad_norm": 0.05231986537136687, "learning_rate": 0.00010074812967581049, "loss": 0.0161, "step": 202 }, { "epoch": 0.15217391304347827, "grad_norm": 0.10417060349041267, "learning_rate": 0.00010124688279301747, "loss": 0.0364, "step": 203 }, { "epoch": 0.15292353823088456, "grad_norm": 0.05377861825830805, "learning_rate": 0.00010174563591022443, "loss": 0.0195, "step": 204 }, { "epoch": 0.15367316341829085, "grad_norm": 0.11829981219141997, "learning_rate": 0.00010224438902743144, "loss": 0.018, "step": 205 }, { "epoch": 0.15442278860569716, "grad_norm": 0.07056012168980838, "learning_rate": 0.0001027431421446384, "loss": 0.0228, "step": 206 }, { "epoch": 0.15517241379310345, "grad_norm": 0.40082059596090186, "learning_rate": 0.00010324189526184541, "loss": 0.0755, "step": 207 }, { "epoch": 0.15592203898050974, "grad_norm": 0.05881080321052975, "learning_rate": 0.00010374064837905237, "loss": 0.0126, "step": 208 }, { "epoch": 0.15667166416791603, "grad_norm": 0.04136141605240806, "learning_rate": 0.00010423940149625935, "loss": 0.0072, "step": 209 }, { "epoch": 0.15742128935532235, "grad_norm": 0.1675625733681966, "learning_rate": 0.00010473815461346634, "loss": 0.0272, "step": 210 }, { "epoch": 0.15817091454272864, "grad_norm": 0.15430875264329627, "learning_rate": 0.00010523690773067332, "loss": 0.0446, "step": 211 }, { "epoch": 0.15892053973013492, "grad_norm": 0.09830541143118073, "learning_rate": 0.00010573566084788031, "loss": 0.0199, "step": 212 }, { "epoch": 0.15967016491754124, "grad_norm": 0.10076300729558095, "learning_rate": 0.00010623441396508729, "loss": 0.0259, "step": 213 }, { "epoch": 0.16041979010494753, "grad_norm": 0.34926458260254734, "learning_rate": 0.00010673316708229426, "loss": 0.0519, "step": 214 }, { "epoch": 0.16116941529235382, "grad_norm": 0.06281413567266797, "learning_rate": 0.00010723192019950125, "loss": 0.013, "step": 215 }, { "epoch": 0.1619190404797601, "grad_norm": 0.06672214691330823, "learning_rate": 0.00010773067331670823, "loss": 0.0149, "step": 216 }, { "epoch": 0.16266866566716642, "grad_norm": 0.07641994031425416, "learning_rate": 0.00010822942643391522, "loss": 0.026, "step": 217 }, { "epoch": 0.1634182908545727, "grad_norm": 0.09389436067817795, "learning_rate": 0.0001087281795511222, "loss": 0.0326, "step": 218 }, { "epoch": 0.164167916041979, "grad_norm": 0.0648179280805623, "learning_rate": 0.00010922693266832918, "loss": 0.0191, "step": 219 }, { "epoch": 0.16491754122938532, "grad_norm": 0.0474636418457805, "learning_rate": 0.00010972568578553617, "loss": 0.0132, "step": 220 }, { "epoch": 0.1656671664167916, "grad_norm": 0.06838399746911754, "learning_rate": 0.00011022443890274315, "loss": 0.01, "step": 221 }, { "epoch": 0.1664167916041979, "grad_norm": 0.17561036272482808, "learning_rate": 0.00011072319201995014, "loss": 0.0374, "step": 222 }, { "epoch": 0.1671664167916042, "grad_norm": 0.2569925810493672, "learning_rate": 0.00011122194513715711, "loss": 0.0516, "step": 223 }, { "epoch": 0.1679160419790105, "grad_norm": 0.08357598242934498, "learning_rate": 0.00011172069825436409, "loss": 0.0291, "step": 224 }, { "epoch": 0.1686656671664168, "grad_norm": 0.05291408301494247, "learning_rate": 0.00011221945137157108, "loss": 0.0201, "step": 225 }, { "epoch": 0.16941529235382308, "grad_norm": 0.06497075638907762, "learning_rate": 0.00011271820448877806, "loss": 0.0124, "step": 226 }, { "epoch": 0.1701649175412294, "grad_norm": 0.07406198574289732, "learning_rate": 0.00011321695760598505, "loss": 0.0237, "step": 227 }, { "epoch": 0.17091454272863568, "grad_norm": 0.05439821029505069, "learning_rate": 0.00011371571072319203, "loss": 0.0211, "step": 228 }, { "epoch": 0.17166416791604197, "grad_norm": 0.08111654982385613, "learning_rate": 0.00011421446384039899, "loss": 0.0285, "step": 229 }, { "epoch": 0.1724137931034483, "grad_norm": 0.12398138802704543, "learning_rate": 0.000114713216957606, "loss": 0.0425, "step": 230 }, { "epoch": 0.17316341829085458, "grad_norm": 0.2334451447921542, "learning_rate": 0.00011521197007481296, "loss": 0.0763, "step": 231 }, { "epoch": 0.17391304347826086, "grad_norm": 0.07553003516076283, "learning_rate": 0.00011571072319201997, "loss": 0.0255, "step": 232 }, { "epoch": 0.17466266866566715, "grad_norm": 0.09362267466853937, "learning_rate": 0.00011620947630922693, "loss": 0.0178, "step": 233 }, { "epoch": 0.17541229385307347, "grad_norm": 0.12066656741285244, "learning_rate": 0.00011670822942643391, "loss": 0.0269, "step": 234 }, { "epoch": 0.17616191904047976, "grad_norm": 0.050085879729023425, "learning_rate": 0.0001172069825436409, "loss": 0.0145, "step": 235 }, { "epoch": 0.17691154422788605, "grad_norm": 0.059337800135607535, "learning_rate": 0.00011770573566084788, "loss": 0.0218, "step": 236 }, { "epoch": 0.17766116941529236, "grad_norm": 0.2599879422217277, "learning_rate": 0.00011820448877805488, "loss": 0.0584, "step": 237 }, { "epoch": 0.17841079460269865, "grad_norm": 0.0642722077940849, "learning_rate": 0.00011870324189526186, "loss": 0.0168, "step": 238 }, { "epoch": 0.17916041979010494, "grad_norm": 0.06072818847877561, "learning_rate": 0.00011920199501246882, "loss": 0.0171, "step": 239 }, { "epoch": 0.17991004497751126, "grad_norm": 0.05000902530731167, "learning_rate": 0.00011970074812967583, "loss": 0.0093, "step": 240 }, { "epoch": 0.18065967016491755, "grad_norm": 0.10030548737751505, "learning_rate": 0.00012019950124688279, "loss": 0.0258, "step": 241 }, { "epoch": 0.18140929535232383, "grad_norm": 0.12845354750733506, "learning_rate": 0.0001206982543640898, "loss": 0.0385, "step": 242 }, { "epoch": 0.18215892053973012, "grad_norm": 0.11530518879949096, "learning_rate": 0.00012119700748129676, "loss": 0.0276, "step": 243 }, { "epoch": 0.18290854572713644, "grad_norm": 0.11115377213909522, "learning_rate": 0.00012169576059850374, "loss": 0.0339, "step": 244 }, { "epoch": 0.18365817091454273, "grad_norm": 0.08437903896927137, "learning_rate": 0.00012219451371571073, "loss": 0.0147, "step": 245 }, { "epoch": 0.18440779610194902, "grad_norm": 0.08333540488761688, "learning_rate": 0.0001226932668329177, "loss": 0.0207, "step": 246 }, { "epoch": 0.18515742128935533, "grad_norm": 0.0637836007664769, "learning_rate": 0.0001231920199501247, "loss": 0.0176, "step": 247 }, { "epoch": 0.18590704647676162, "grad_norm": 0.19541845065090152, "learning_rate": 0.00012369077306733167, "loss": 0.0294, "step": 248 }, { "epoch": 0.1866566716641679, "grad_norm": 0.09604182736199869, "learning_rate": 0.00012418952618453865, "loss": 0.0281, "step": 249 }, { "epoch": 0.1874062968515742, "grad_norm": 0.1656571821759516, "learning_rate": 0.00012468827930174565, "loss": 0.023, "step": 250 }, { "epoch": 0.18815592203898052, "grad_norm": 0.07893870621876838, "learning_rate": 0.00012518703241895263, "loss": 0.0149, "step": 251 }, { "epoch": 0.1889055472263868, "grad_norm": 0.08177991639723088, "learning_rate": 0.0001256857855361596, "loss": 0.0275, "step": 252 }, { "epoch": 0.1896551724137931, "grad_norm": 0.09139205799991805, "learning_rate": 0.0001261845386533666, "loss": 0.0316, "step": 253 }, { "epoch": 0.1904047976011994, "grad_norm": 0.13893142926140026, "learning_rate": 0.00012668329177057357, "loss": 0.0169, "step": 254 }, { "epoch": 0.1911544227886057, "grad_norm": 0.12412801138084277, "learning_rate": 0.00012718204488778055, "loss": 0.039, "step": 255 }, { "epoch": 0.191904047976012, "grad_norm": 0.11645301088929443, "learning_rate": 0.00012768079800498753, "loss": 0.0249, "step": 256 }, { "epoch": 0.1926536731634183, "grad_norm": 0.08789313821943712, "learning_rate": 0.00012817955112219453, "loss": 0.0163, "step": 257 }, { "epoch": 0.1934032983508246, "grad_norm": 0.27323828643444004, "learning_rate": 0.0001286783042394015, "loss": 0.0281, "step": 258 }, { "epoch": 0.19415292353823088, "grad_norm": 0.1778613775057513, "learning_rate": 0.0001291770573566085, "loss": 0.081, "step": 259 }, { "epoch": 0.19490254872563717, "grad_norm": 0.1731020084043939, "learning_rate": 0.00012967581047381547, "loss": 0.0432, "step": 260 }, { "epoch": 0.1956521739130435, "grad_norm": 0.12374766749581116, "learning_rate": 0.00013017456359102245, "loss": 0.0497, "step": 261 }, { "epoch": 0.19640179910044978, "grad_norm": 0.12677399108624252, "learning_rate": 0.00013067331670822945, "loss": 0.0266, "step": 262 }, { "epoch": 0.19715142428785606, "grad_norm": 0.18629924644455734, "learning_rate": 0.0001311720698254364, "loss": 0.0414, "step": 263 }, { "epoch": 0.19790104947526238, "grad_norm": 0.0829328057797884, "learning_rate": 0.00013167082294264338, "loss": 0.0205, "step": 264 }, { "epoch": 0.19865067466266867, "grad_norm": 0.08620872083309367, "learning_rate": 0.0001321695760598504, "loss": 0.0173, "step": 265 }, { "epoch": 0.19940029985007496, "grad_norm": 0.09317926758121065, "learning_rate": 0.00013266832917705737, "loss": 0.0077, "step": 266 }, { "epoch": 0.20014992503748125, "grad_norm": 0.08818698775422455, "learning_rate": 0.00013316708229426435, "loss": 0.0328, "step": 267 }, { "epoch": 0.20089955022488756, "grad_norm": 0.08994483021679653, "learning_rate": 0.00013366583541147133, "loss": 0.0348, "step": 268 }, { "epoch": 0.20164917541229385, "grad_norm": 0.09591148983377801, "learning_rate": 0.0001341645885286783, "loss": 0.0236, "step": 269 }, { "epoch": 0.20239880059970014, "grad_norm": 0.11320056714033523, "learning_rate": 0.0001346633416458853, "loss": 0.0218, "step": 270 }, { "epoch": 0.20314842578710646, "grad_norm": 0.5227925892105147, "learning_rate": 0.00013516209476309226, "loss": 0.0362, "step": 271 }, { "epoch": 0.20389805097451275, "grad_norm": 0.03669513253608481, "learning_rate": 0.00013566084788029927, "loss": 0.0124, "step": 272 }, { "epoch": 0.20464767616191903, "grad_norm": 0.07688426864930054, "learning_rate": 0.00013615960099750625, "loss": 0.0256, "step": 273 }, { "epoch": 0.20539730134932535, "grad_norm": 0.08079663825268356, "learning_rate": 0.00013665835411471322, "loss": 0.0265, "step": 274 }, { "epoch": 0.20614692653673164, "grad_norm": 0.19049040538670042, "learning_rate": 0.0001371571072319202, "loss": 0.0431, "step": 275 }, { "epoch": 0.20689655172413793, "grad_norm": 0.19674880634163666, "learning_rate": 0.00013765586034912718, "loss": 0.0259, "step": 276 }, { "epoch": 0.20764617691154422, "grad_norm": 0.2821751560105795, "learning_rate": 0.0001381546134663342, "loss": 0.057, "step": 277 }, { "epoch": 0.20839580209895053, "grad_norm": 0.10359108265397039, "learning_rate": 0.00013865336658354117, "loss": 0.0228, "step": 278 }, { "epoch": 0.20914542728635682, "grad_norm": 0.1806972746554061, "learning_rate": 0.00013915211970074812, "loss": 0.074, "step": 279 }, { "epoch": 0.2098950524737631, "grad_norm": 0.05580356582461233, "learning_rate": 0.00013965087281795512, "loss": 0.0198, "step": 280 }, { "epoch": 0.21064467766116943, "grad_norm": 0.0852610589365401, "learning_rate": 0.0001401496259351621, "loss": 0.0268, "step": 281 }, { "epoch": 0.21139430284857572, "grad_norm": 0.1500943752318949, "learning_rate": 0.00014064837905236908, "loss": 0.0248, "step": 282 }, { "epoch": 0.212143928035982, "grad_norm": 0.34676376255278374, "learning_rate": 0.00014114713216957606, "loss": 0.0455, "step": 283 }, { "epoch": 0.2128935532233883, "grad_norm": 0.07586618072679835, "learning_rate": 0.00014164588528678304, "loss": 0.0282, "step": 284 }, { "epoch": 0.2136431784107946, "grad_norm": 0.0680131198216817, "learning_rate": 0.00014214463840399004, "loss": 0.0168, "step": 285 }, { "epoch": 0.2143928035982009, "grad_norm": 0.15412546383230158, "learning_rate": 0.00014264339152119702, "loss": 0.0262, "step": 286 }, { "epoch": 0.2151424287856072, "grad_norm": 0.32409732407737396, "learning_rate": 0.000143142144638404, "loss": 0.0431, "step": 287 }, { "epoch": 0.2158920539730135, "grad_norm": 0.05689977495195114, "learning_rate": 0.00014364089775561098, "loss": 0.0098, "step": 288 }, { "epoch": 0.2166416791604198, "grad_norm": 0.10219232043950437, "learning_rate": 0.00014413965087281796, "loss": 0.0446, "step": 289 }, { "epoch": 0.21739130434782608, "grad_norm": 0.14753874873503348, "learning_rate": 0.00014463840399002494, "loss": 0.0338, "step": 290 }, { "epoch": 0.2181409295352324, "grad_norm": 0.09688329615506609, "learning_rate": 0.00014513715710723192, "loss": 0.0317, "step": 291 }, { "epoch": 0.21889055472263869, "grad_norm": 0.37462001623399555, "learning_rate": 0.00014563591022443892, "loss": 0.0488, "step": 292 }, { "epoch": 0.21964017991004497, "grad_norm": 0.1744644075575525, "learning_rate": 0.0001461346633416459, "loss": 0.0238, "step": 293 }, { "epoch": 0.22038980509745126, "grad_norm": 0.12911989638647245, "learning_rate": 0.00014663341645885288, "loss": 0.0262, "step": 294 }, { "epoch": 0.22113943028485758, "grad_norm": 0.15596073376994574, "learning_rate": 0.00014713216957605986, "loss": 0.038, "step": 295 }, { "epoch": 0.22188905547226387, "grad_norm": 0.20990059849544168, "learning_rate": 0.00014763092269326684, "loss": 0.0497, "step": 296 }, { "epoch": 0.22263868065967016, "grad_norm": 0.185998926544591, "learning_rate": 0.00014812967581047384, "loss": 0.0438, "step": 297 }, { "epoch": 0.22338830584707647, "grad_norm": 0.1472947653395365, "learning_rate": 0.0001486284289276808, "loss": 0.0482, "step": 298 }, { "epoch": 0.22413793103448276, "grad_norm": 0.11766072318652662, "learning_rate": 0.00014912718204488777, "loss": 0.0341, "step": 299 }, { "epoch": 0.22488755622188905, "grad_norm": 0.16151375968719706, "learning_rate": 0.00014962593516209478, "loss": 0.0235, "step": 300 }, { "epoch": 0.22563718140929534, "grad_norm": 0.13978947872038194, "learning_rate": 0.00015012468827930176, "loss": 0.0393, "step": 301 }, { "epoch": 0.22638680659670166, "grad_norm": 0.21401300347047775, "learning_rate": 0.00015062344139650874, "loss": 0.0178, "step": 302 }, { "epoch": 0.22713643178410794, "grad_norm": 0.24801232446587168, "learning_rate": 0.00015112219451371572, "loss": 0.0407, "step": 303 }, { "epoch": 0.22788605697151423, "grad_norm": 0.058632535114882166, "learning_rate": 0.0001516209476309227, "loss": 0.0206, "step": 304 }, { "epoch": 0.22863568215892055, "grad_norm": 0.0840227180439461, "learning_rate": 0.0001521197007481297, "loss": 0.0287, "step": 305 }, { "epoch": 0.22938530734632684, "grad_norm": 0.08132083789297478, "learning_rate": 0.00015261845386533665, "loss": 0.026, "step": 306 }, { "epoch": 0.23013493253373313, "grad_norm": 0.10781849659948611, "learning_rate": 0.00015311720698254363, "loss": 0.0149, "step": 307 }, { "epoch": 0.23088455772113944, "grad_norm": 0.15794412633172517, "learning_rate": 0.00015361596009975064, "loss": 0.0536, "step": 308 }, { "epoch": 0.23163418290854573, "grad_norm": 0.11903463083343918, "learning_rate": 0.00015411471321695762, "loss": 0.0261, "step": 309 }, { "epoch": 0.23238380809595202, "grad_norm": 0.08270710290559068, "learning_rate": 0.0001546134663341646, "loss": 0.0177, "step": 310 }, { "epoch": 0.2331334332833583, "grad_norm": 0.12039252632311984, "learning_rate": 0.00015511221945137157, "loss": 0.0326, "step": 311 }, { "epoch": 0.23388305847076463, "grad_norm": 0.09035121559893523, "learning_rate": 0.00015561097256857855, "loss": 0.0218, "step": 312 }, { "epoch": 0.23463268365817092, "grad_norm": 0.2608168886436339, "learning_rate": 0.00015610972568578556, "loss": 0.0298, "step": 313 }, { "epoch": 0.2353823088455772, "grad_norm": 0.08183654251092078, "learning_rate": 0.0001566084788029925, "loss": 0.0149, "step": 314 }, { "epoch": 0.23613193403298352, "grad_norm": 0.19660962857791126, "learning_rate": 0.00015710723192019951, "loss": 0.0485, "step": 315 }, { "epoch": 0.2368815592203898, "grad_norm": 0.37386494103393336, "learning_rate": 0.0001576059850374065, "loss": 0.0572, "step": 316 }, { "epoch": 0.2376311844077961, "grad_norm": 0.1704419836308577, "learning_rate": 0.00015810473815461347, "loss": 0.0315, "step": 317 }, { "epoch": 0.2383808095952024, "grad_norm": 0.13091652195928158, "learning_rate": 0.00015860349127182045, "loss": 0.0388, "step": 318 }, { "epoch": 0.2391304347826087, "grad_norm": 0.14223677525604458, "learning_rate": 0.00015910224438902743, "loss": 0.0253, "step": 319 }, { "epoch": 0.239880059970015, "grad_norm": 0.17823510302750656, "learning_rate": 0.00015960099750623444, "loss": 0.0207, "step": 320 }, { "epoch": 0.24062968515742128, "grad_norm": 0.11745599847225957, "learning_rate": 0.00016009975062344141, "loss": 0.0421, "step": 321 }, { "epoch": 0.2413793103448276, "grad_norm": 0.2474114622826013, "learning_rate": 0.00016059850374064837, "loss": 0.0256, "step": 322 }, { "epoch": 0.24212893553223389, "grad_norm": 0.17085562013724867, "learning_rate": 0.00016109725685785537, "loss": 0.0353, "step": 323 }, { "epoch": 0.24287856071964017, "grad_norm": 0.2372708323711952, "learning_rate": 0.00016159600997506235, "loss": 0.0571, "step": 324 }, { "epoch": 0.2436281859070465, "grad_norm": 0.2519151153575236, "learning_rate": 0.00016209476309226933, "loss": 0.06, "step": 325 }, { "epoch": 0.24437781109445278, "grad_norm": 0.24246496343523052, "learning_rate": 0.0001625935162094763, "loss": 0.0518, "step": 326 }, { "epoch": 0.24512743628185907, "grad_norm": 0.10872569583987493, "learning_rate": 0.0001630922693266833, "loss": 0.02, "step": 327 }, { "epoch": 0.24587706146926536, "grad_norm": 0.25513609548593424, "learning_rate": 0.0001635910224438903, "loss": 0.0482, "step": 328 }, { "epoch": 0.24662668665667167, "grad_norm": 0.11260345413307601, "learning_rate": 0.00016408977556109727, "loss": 0.0212, "step": 329 }, { "epoch": 0.24737631184407796, "grad_norm": 0.49079259160521377, "learning_rate": 0.00016458852867830425, "loss": 0.053, "step": 330 }, { "epoch": 0.24812593703148425, "grad_norm": 0.11154822451105055, "learning_rate": 0.00016508728179551123, "loss": 0.0241, "step": 331 }, { "epoch": 0.24887556221889057, "grad_norm": 0.09571826846514553, "learning_rate": 0.0001655860349127182, "loss": 0.0252, "step": 332 }, { "epoch": 0.24962518740629686, "grad_norm": 0.0777734898245894, "learning_rate": 0.00016608478802992519, "loss": 0.027, "step": 333 }, { "epoch": 0.25037481259370314, "grad_norm": 0.14307189815218685, "learning_rate": 0.00016658354114713216, "loss": 0.0383, "step": 334 }, { "epoch": 0.25112443778110943, "grad_norm": 0.13735355901692434, "learning_rate": 0.00016708229426433917, "loss": 0.0303, "step": 335 }, { "epoch": 0.2518740629685157, "grad_norm": 0.16726996989241977, "learning_rate": 0.00016758104738154615, "loss": 0.03, "step": 336 }, { "epoch": 0.25262368815592207, "grad_norm": 0.16945012137988621, "learning_rate": 0.00016807980049875313, "loss": 0.0406, "step": 337 }, { "epoch": 0.25337331334332835, "grad_norm": 0.11530437802632285, "learning_rate": 0.0001685785536159601, "loss": 0.0225, "step": 338 }, { "epoch": 0.25412293853073464, "grad_norm": 0.09960125210392978, "learning_rate": 0.00016907730673316709, "loss": 0.0209, "step": 339 }, { "epoch": 0.25487256371814093, "grad_norm": 0.13443406892444226, "learning_rate": 0.0001695760598503741, "loss": 0.0492, "step": 340 }, { "epoch": 0.2556221889055472, "grad_norm": 0.10886582012917131, "learning_rate": 0.00017007481296758104, "loss": 0.0249, "step": 341 }, { "epoch": 0.2563718140929535, "grad_norm": 0.06084801968020566, "learning_rate": 0.00017057356608478802, "loss": 0.0144, "step": 342 }, { "epoch": 0.2571214392803598, "grad_norm": 0.4190379355567448, "learning_rate": 0.00017107231920199503, "loss": 0.0749, "step": 343 }, { "epoch": 0.25787106446776614, "grad_norm": 0.11112704571061172, "learning_rate": 0.000171571072319202, "loss": 0.0206, "step": 344 }, { "epoch": 0.25862068965517243, "grad_norm": 0.07059400346190974, "learning_rate": 0.00017206982543640898, "loss": 0.0186, "step": 345 }, { "epoch": 0.2593703148425787, "grad_norm": 0.32755669501610246, "learning_rate": 0.00017256857855361596, "loss": 0.0417, "step": 346 }, { "epoch": 0.260119940029985, "grad_norm": 0.1983041340897065, "learning_rate": 0.00017306733167082294, "loss": 0.0405, "step": 347 }, { "epoch": 0.2608695652173913, "grad_norm": 0.26450010843425603, "learning_rate": 0.00017356608478802995, "loss": 0.0254, "step": 348 }, { "epoch": 0.2616191904047976, "grad_norm": 0.05678132030515059, "learning_rate": 0.0001740648379052369, "loss": 0.0103, "step": 349 }, { "epoch": 0.2623688155922039, "grad_norm": 0.07668079412848387, "learning_rate": 0.0001745635910224439, "loss": 0.0162, "step": 350 }, { "epoch": 0.2631184407796102, "grad_norm": 0.22616036460660832, "learning_rate": 0.00017506234413965088, "loss": 0.0361, "step": 351 }, { "epoch": 0.2638680659670165, "grad_norm": 0.11606802219772738, "learning_rate": 0.00017556109725685786, "loss": 0.0118, "step": 352 }, { "epoch": 0.2646176911544228, "grad_norm": 0.44709462961570234, "learning_rate": 0.00017605985037406484, "loss": 0.067, "step": 353 }, { "epoch": 0.2653673163418291, "grad_norm": 0.20489244859924258, "learning_rate": 0.00017655860349127182, "loss": 0.0472, "step": 354 }, { "epoch": 0.2661169415292354, "grad_norm": 0.19112165467765932, "learning_rate": 0.00017705735660847883, "loss": 0.0503, "step": 355 }, { "epoch": 0.26686656671664166, "grad_norm": 0.09843918120704351, "learning_rate": 0.0001775561097256858, "loss": 0.0241, "step": 356 }, { "epoch": 0.26761619190404795, "grad_norm": 0.23798628300883096, "learning_rate": 0.00017805486284289276, "loss": 0.0433, "step": 357 }, { "epoch": 0.2683658170914543, "grad_norm": 0.09665589303199588, "learning_rate": 0.00017855361596009976, "loss": 0.0174, "step": 358 }, { "epoch": 0.2691154422788606, "grad_norm": 0.17403163590794157, "learning_rate": 0.00017905236907730674, "loss": 0.0327, "step": 359 }, { "epoch": 0.2698650674662669, "grad_norm": 0.13658076391511467, "learning_rate": 0.00017955112219451372, "loss": 0.0341, "step": 360 }, { "epoch": 0.27061469265367316, "grad_norm": 0.23004996818420348, "learning_rate": 0.0001800498753117207, "loss": 0.0385, "step": 361 }, { "epoch": 0.27136431784107945, "grad_norm": 0.2235475448370521, "learning_rate": 0.00018054862842892768, "loss": 0.0449, "step": 362 }, { "epoch": 0.27211394302848574, "grad_norm": 0.14802680170837254, "learning_rate": 0.00018104738154613468, "loss": 0.032, "step": 363 }, { "epoch": 0.272863568215892, "grad_norm": 0.2726885316166056, "learning_rate": 0.00018154613466334166, "loss": 0.0186, "step": 364 }, { "epoch": 0.27361319340329837, "grad_norm": 0.18956540775235434, "learning_rate": 0.00018204488778054864, "loss": 0.0293, "step": 365 }, { "epoch": 0.27436281859070466, "grad_norm": 0.15917936571093227, "learning_rate": 0.00018254364089775562, "loss": 0.0363, "step": 366 }, { "epoch": 0.27511244377811095, "grad_norm": 0.14578322218681494, "learning_rate": 0.0001830423940149626, "loss": 0.0314, "step": 367 }, { "epoch": 0.27586206896551724, "grad_norm": 0.10269728307226299, "learning_rate": 0.00018354114713216958, "loss": 0.0129, "step": 368 }, { "epoch": 0.2766116941529235, "grad_norm": 0.21141699577344752, "learning_rate": 0.00018403990024937656, "loss": 0.0537, "step": 369 }, { "epoch": 0.2773613193403298, "grad_norm": 0.15206079985424664, "learning_rate": 0.00018453865336658356, "loss": 0.031, "step": 370 }, { "epoch": 0.27811094452773616, "grad_norm": 0.13011557776643995, "learning_rate": 0.00018503740648379054, "loss": 0.0224, "step": 371 }, { "epoch": 0.27886056971514245, "grad_norm": 0.269149451025218, "learning_rate": 0.00018553615960099752, "loss": 0.0335, "step": 372 }, { "epoch": 0.27961019490254874, "grad_norm": 0.16896565857953327, "learning_rate": 0.0001860349127182045, "loss": 0.0304, "step": 373 }, { "epoch": 0.280359820089955, "grad_norm": 0.07219260932887692, "learning_rate": 0.00018653366583541148, "loss": 0.012, "step": 374 }, { "epoch": 0.2811094452773613, "grad_norm": 0.09981374137995903, "learning_rate": 0.00018703241895261848, "loss": 0.0201, "step": 375 }, { "epoch": 0.2818590704647676, "grad_norm": 0.11898845440731402, "learning_rate": 0.00018753117206982543, "loss": 0.0337, "step": 376 }, { "epoch": 0.2826086956521739, "grad_norm": 0.09794209512689632, "learning_rate": 0.0001880299251870324, "loss": 0.0152, "step": 377 }, { "epoch": 0.28335832083958024, "grad_norm": 0.16926227090380522, "learning_rate": 0.00018852867830423942, "loss": 0.0365, "step": 378 }, { "epoch": 0.2841079460269865, "grad_norm": 0.1336141976814021, "learning_rate": 0.0001890274314214464, "loss": 0.0183, "step": 379 }, { "epoch": 0.2848575712143928, "grad_norm": 0.17357395030789294, "learning_rate": 0.00018952618453865338, "loss": 0.0413, "step": 380 }, { "epoch": 0.2856071964017991, "grad_norm": 0.1171694991646048, "learning_rate": 0.00019002493765586035, "loss": 0.0282, "step": 381 }, { "epoch": 0.2863568215892054, "grad_norm": 0.11853021340241342, "learning_rate": 0.00019052369077306733, "loss": 0.026, "step": 382 }, { "epoch": 0.2871064467766117, "grad_norm": 0.12803104542393728, "learning_rate": 0.00019102244389027434, "loss": 0.0399, "step": 383 }, { "epoch": 0.28785607196401797, "grad_norm": 0.19010760869232313, "learning_rate": 0.0001915211970074813, "loss": 0.0583, "step": 384 }, { "epoch": 0.2886056971514243, "grad_norm": 0.1918972133083329, "learning_rate": 0.0001920199501246883, "loss": 0.0257, "step": 385 }, { "epoch": 0.2893553223388306, "grad_norm": 0.38132752615347837, "learning_rate": 0.00019251870324189527, "loss": 0.0459, "step": 386 }, { "epoch": 0.2901049475262369, "grad_norm": 0.21823951622302395, "learning_rate": 0.00019301745635910225, "loss": 0.0396, "step": 387 }, { "epoch": 0.2908545727136432, "grad_norm": 0.0934811566055218, "learning_rate": 0.00019351620947630923, "loss": 0.0213, "step": 388 }, { "epoch": 0.29160419790104947, "grad_norm": 0.09815902295103762, "learning_rate": 0.0001940149625935162, "loss": 0.0191, "step": 389 }, { "epoch": 0.29235382308845576, "grad_norm": 0.14907156908477912, "learning_rate": 0.00019451371571072322, "loss": 0.0263, "step": 390 }, { "epoch": 0.29310344827586204, "grad_norm": 0.19453601336927534, "learning_rate": 0.0001950124688279302, "loss": 0.0427, "step": 391 }, { "epoch": 0.2938530734632684, "grad_norm": 0.13879553197564173, "learning_rate": 0.00019551122194513715, "loss": 0.0448, "step": 392 }, { "epoch": 0.2946026986506747, "grad_norm": 0.16975459035194854, "learning_rate": 0.00019600997506234415, "loss": 0.0359, "step": 393 }, { "epoch": 0.29535232383808097, "grad_norm": 0.177277140657558, "learning_rate": 0.00019650872817955113, "loss": 0.0395, "step": 394 }, { "epoch": 0.29610194902548725, "grad_norm": 0.11373173330367291, "learning_rate": 0.0001970074812967581, "loss": 0.0356, "step": 395 }, { "epoch": 0.29685157421289354, "grad_norm": 0.15222992950517256, "learning_rate": 0.0001975062344139651, "loss": 0.0428, "step": 396 }, { "epoch": 0.29760119940029983, "grad_norm": 0.09436496677890617, "learning_rate": 0.00019800498753117207, "loss": 0.0159, "step": 397 }, { "epoch": 0.2983508245877061, "grad_norm": 0.1507478045803151, "learning_rate": 0.00019850374064837907, "loss": 0.0267, "step": 398 }, { "epoch": 0.29910044977511246, "grad_norm": 0.051785827436256274, "learning_rate": 0.00019900249376558605, "loss": 0.0088, "step": 399 }, { "epoch": 0.29985007496251875, "grad_norm": 0.18717721855259775, "learning_rate": 0.00019950124688279303, "loss": 0.0559, "step": 400 }, { "epoch": 0.30059970014992504, "grad_norm": 0.14425330741006745, "learning_rate": 0.0002, "loss": 0.0311, "step": 401 }, { "epoch": 0.30134932533733133, "grad_norm": 0.13207723504220212, "learning_rate": 0.00019999996194397014, "loss": 0.0215, "step": 402 }, { "epoch": 0.3020989505247376, "grad_norm": 0.059915434172776075, "learning_rate": 0.00019999984777590944, "loss": 0.0119, "step": 403 }, { "epoch": 0.3028485757121439, "grad_norm": 0.3938617437192051, "learning_rate": 0.00019999965749590486, "loss": 0.0303, "step": 404 }, { "epoch": 0.30359820089955025, "grad_norm": 0.17694572717829843, "learning_rate": 0.0001999993911041012, "loss": 0.0436, "step": 405 }, { "epoch": 0.30434782608695654, "grad_norm": 0.18667444081506102, "learning_rate": 0.0001999990486007012, "loss": 0.034, "step": 406 }, { "epoch": 0.30509745127436283, "grad_norm": 0.1792430745481953, "learning_rate": 0.00019999862998596558, "loss": 0.0574, "step": 407 }, { "epoch": 0.3058470764617691, "grad_norm": 0.15142125119749755, "learning_rate": 0.00019999813526021295, "loss": 0.0416, "step": 408 }, { "epoch": 0.3065967016491754, "grad_norm": 0.15352552956144938, "learning_rate": 0.00019999756442381982, "loss": 0.0289, "step": 409 }, { "epoch": 0.3073463268365817, "grad_norm": 0.1459033018919069, "learning_rate": 0.00019999691747722066, "loss": 0.0188, "step": 410 }, { "epoch": 0.308095952023988, "grad_norm": 0.16054037586365844, "learning_rate": 0.00019999619442090798, "loss": 0.039, "step": 411 }, { "epoch": 0.30884557721139433, "grad_norm": 0.28904351396420497, "learning_rate": 0.000199995395255432, "loss": 0.0631, "step": 412 }, { "epoch": 0.3095952023988006, "grad_norm": 0.368232762601163, "learning_rate": 0.000199994519981401, "loss": 0.0592, "step": 413 }, { "epoch": 0.3103448275862069, "grad_norm": 0.11645861006540552, "learning_rate": 0.00019999356859948123, "loss": 0.0216, "step": 414 }, { "epoch": 0.3110944527736132, "grad_norm": 0.13914821020028115, "learning_rate": 0.00019999254111039677, "loss": 0.0329, "step": 415 }, { "epoch": 0.3118440779610195, "grad_norm": 0.1054919349362133, "learning_rate": 0.00019999143751492962, "loss": 0.0185, "step": 416 }, { "epoch": 0.31259370314842577, "grad_norm": 0.12235988385680502, "learning_rate": 0.0001999902578139198, "loss": 0.0253, "step": 417 }, { "epoch": 0.31334332833583206, "grad_norm": 0.23220583217832894, "learning_rate": 0.0001999890020082652, "loss": 0.0966, "step": 418 }, { "epoch": 0.3140929535232384, "grad_norm": 0.15835605674777165, "learning_rate": 0.00019998767009892163, "loss": 0.0265, "step": 419 }, { "epoch": 0.3148425787106447, "grad_norm": 0.18460817393247528, "learning_rate": 0.00019998626208690283, "loss": 0.0553, "step": 420 }, { "epoch": 0.315592203898051, "grad_norm": 0.15905910975137832, "learning_rate": 0.00019998477797328047, "loss": 0.0336, "step": 421 }, { "epoch": 0.31634182908545727, "grad_norm": 0.10939651490515165, "learning_rate": 0.00019998321775918415, "loss": 0.018, "step": 422 }, { "epoch": 0.31709145427286356, "grad_norm": 0.12177572598606647, "learning_rate": 0.0001999815814458014, "loss": 0.0185, "step": 423 }, { "epoch": 0.31784107946026985, "grad_norm": 0.11831550376041258, "learning_rate": 0.00019997986903437759, "loss": 0.032, "step": 424 }, { "epoch": 0.31859070464767614, "grad_norm": 0.06514033143501995, "learning_rate": 0.00019997808052621608, "loss": 0.0128, "step": 425 }, { "epoch": 0.3193403298350825, "grad_norm": 0.19399927400695385, "learning_rate": 0.00019997621592267824, "loss": 0.043, "step": 426 }, { "epoch": 0.32008995502248877, "grad_norm": 0.11925039389623487, "learning_rate": 0.00019997427522518315, "loss": 0.0157, "step": 427 }, { "epoch": 0.32083958020989506, "grad_norm": 0.07784698086044195, "learning_rate": 0.00019997225843520794, "loss": 0.0133, "step": 428 }, { "epoch": 0.32158920539730135, "grad_norm": 0.10863162782804062, "learning_rate": 0.00019997016555428764, "loss": 0.0147, "step": 429 }, { "epoch": 0.32233883058470764, "grad_norm": 0.12068391287985351, "learning_rate": 0.00019996799658401519, "loss": 0.0197, "step": 430 }, { "epoch": 0.3230884557721139, "grad_norm": 0.23114556727231364, "learning_rate": 0.00019996575152604146, "loss": 0.0529, "step": 431 }, { "epoch": 0.3238380809595202, "grad_norm": 0.32097628810594436, "learning_rate": 0.00019996343038207514, "loss": 0.0669, "step": 432 }, { "epoch": 0.32458770614692656, "grad_norm": 0.12267343492743275, "learning_rate": 0.00019996103315388295, "loss": 0.0338, "step": 433 }, { "epoch": 0.32533733133433285, "grad_norm": 0.19841665026418323, "learning_rate": 0.0001999585598432895, "loss": 0.0261, "step": 434 }, { "epoch": 0.32608695652173914, "grad_norm": 0.19526396807296756, "learning_rate": 0.0001999560104521772, "loss": 0.0395, "step": 435 }, { "epoch": 0.3268365817091454, "grad_norm": 0.19094290559218405, "learning_rate": 0.00019995338498248654, "loss": 0.0263, "step": 436 }, { "epoch": 0.3275862068965517, "grad_norm": 0.10315023911899296, "learning_rate": 0.0001999506834362157, "loss": 0.0286, "step": 437 }, { "epoch": 0.328335832083958, "grad_norm": 0.1513368969371964, "learning_rate": 0.00019994790581542095, "loss": 0.0276, "step": 438 }, { "epoch": 0.32908545727136435, "grad_norm": 0.3896406875829373, "learning_rate": 0.0001999450521222164, "loss": 0.0438, "step": 439 }, { "epoch": 0.32983508245877063, "grad_norm": 0.18728955706645808, "learning_rate": 0.00019994212235877406, "loss": 0.0313, "step": 440 }, { "epoch": 0.3305847076461769, "grad_norm": 0.1763808296207911, "learning_rate": 0.00019993911652732384, "loss": 0.042, "step": 441 }, { "epoch": 0.3313343328335832, "grad_norm": 0.3486237934185716, "learning_rate": 0.00019993603463015349, "loss": 0.0344, "step": 442 }, { "epoch": 0.3320839580209895, "grad_norm": 0.11358739359187257, "learning_rate": 0.00019993287666960876, "loss": 0.0287, "step": 443 }, { "epoch": 0.3328335832083958, "grad_norm": 0.2771651376286589, "learning_rate": 0.00019992964264809316, "loss": 0.0539, "step": 444 }, { "epoch": 0.3335832083958021, "grad_norm": 0.15951465547214178, "learning_rate": 0.0001999263325680683, "loss": 0.0351, "step": 445 }, { "epoch": 0.3343328335832084, "grad_norm": 0.22717670366337234, "learning_rate": 0.00019992294643205342, "loss": 0.0559, "step": 446 }, { "epoch": 0.3350824587706147, "grad_norm": 0.11730729763880723, "learning_rate": 0.00019991948424262582, "loss": 0.025, "step": 447 }, { "epoch": 0.335832083958021, "grad_norm": 0.1488401866535149, "learning_rate": 0.00019991594600242069, "loss": 0.0554, "step": 448 }, { "epoch": 0.3365817091454273, "grad_norm": 0.22698278428915133, "learning_rate": 0.00019991233171413102, "loss": 0.0268, "step": 449 }, { "epoch": 0.3373313343328336, "grad_norm": 0.256587121294311, "learning_rate": 0.0001999086413805077, "loss": 0.0545, "step": 450 }, { "epoch": 0.33808095952023987, "grad_norm": 0.190566506128757, "learning_rate": 0.0001999048750043596, "loss": 0.0383, "step": 451 }, { "epoch": 0.33883058470764615, "grad_norm": 0.20383160433961658, "learning_rate": 0.00019990103258855327, "loss": 0.0373, "step": 452 }, { "epoch": 0.3395802098950525, "grad_norm": 0.05812210389585167, "learning_rate": 0.00019989711413601332, "loss": 0.0127, "step": 453 }, { "epoch": 0.3403298350824588, "grad_norm": 0.15721480837414703, "learning_rate": 0.00019989311964972216, "loss": 0.0327, "step": 454 }, { "epoch": 0.3410794602698651, "grad_norm": 0.11121198405053008, "learning_rate": 0.00019988904913272005, "loss": 0.0308, "step": 455 }, { "epoch": 0.34182908545727136, "grad_norm": 0.2792903863779934, "learning_rate": 0.0001998849025881052, "loss": 0.0208, "step": 456 }, { "epoch": 0.34257871064467765, "grad_norm": 0.32716321508579893, "learning_rate": 0.00019988068001903356, "loss": 0.0435, "step": 457 }, { "epoch": 0.34332833583208394, "grad_norm": 0.20203904519890029, "learning_rate": 0.00019987638142871908, "loss": 0.0235, "step": 458 }, { "epoch": 0.34407796101949023, "grad_norm": 0.11059483818522871, "learning_rate": 0.00019987200682043346, "loss": 0.0187, "step": 459 }, { "epoch": 0.3448275862068966, "grad_norm": 0.09751448421919101, "learning_rate": 0.0001998675561975063, "loss": 0.0103, "step": 460 }, { "epoch": 0.34557721139430286, "grad_norm": 0.1532154010841786, "learning_rate": 0.00019986302956332512, "loss": 0.0112, "step": 461 }, { "epoch": 0.34632683658170915, "grad_norm": 0.2078059436395786, "learning_rate": 0.00019985842692133517, "loss": 0.0379, "step": 462 }, { "epoch": 0.34707646176911544, "grad_norm": 0.24068443221087116, "learning_rate": 0.00019985374827503964, "loss": 0.044, "step": 463 }, { "epoch": 0.34782608695652173, "grad_norm": 0.34075693933551066, "learning_rate": 0.00019984899362799957, "loss": 0.0638, "step": 464 }, { "epoch": 0.348575712143928, "grad_norm": 0.10385908067891564, "learning_rate": 0.00019984416298383378, "loss": 0.0358, "step": 465 }, { "epoch": 0.3493253373313343, "grad_norm": 0.14788340069318578, "learning_rate": 0.00019983925634621893, "loss": 0.0203, "step": 466 }, { "epoch": 0.35007496251874065, "grad_norm": 0.5654716055817488, "learning_rate": 0.0001998342737188897, "loss": 0.0884, "step": 467 }, { "epoch": 0.35082458770614694, "grad_norm": 0.2093358211842703, "learning_rate": 0.00019982921510563838, "loss": 0.0574, "step": 468 }, { "epoch": 0.35157421289355323, "grad_norm": 0.19003237135335357, "learning_rate": 0.00019982408051031513, "loss": 0.0291, "step": 469 }, { "epoch": 0.3523238380809595, "grad_norm": 0.17192643284625606, "learning_rate": 0.00019981886993682813, "loss": 0.023, "step": 470 }, { "epoch": 0.3530734632683658, "grad_norm": 0.14062304972448308, "learning_rate": 0.00019981358338914317, "loss": 0.0367, "step": 471 }, { "epoch": 0.3538230884557721, "grad_norm": 0.10842984435009215, "learning_rate": 0.00019980822087128396, "loss": 0.0228, "step": 472 }, { "epoch": 0.3545727136431784, "grad_norm": 0.18970674109601535, "learning_rate": 0.00019980278238733204, "loss": 0.0421, "step": 473 }, { "epoch": 0.3553223388305847, "grad_norm": 0.14506946604862325, "learning_rate": 0.0001997972679414267, "loss": 0.0172, "step": 474 }, { "epoch": 0.356071964017991, "grad_norm": 0.21685436100064492, "learning_rate": 0.00019979167753776518, "loss": 0.041, "step": 475 }, { "epoch": 0.3568215892053973, "grad_norm": 0.14857894972860353, "learning_rate": 0.00019978601118060237, "loss": 0.0249, "step": 476 }, { "epoch": 0.3575712143928036, "grad_norm": 0.1216382248920481, "learning_rate": 0.00019978026887425117, "loss": 0.0229, "step": 477 }, { "epoch": 0.3583208395802099, "grad_norm": 0.2818825703680443, "learning_rate": 0.000199774450623082, "loss": 0.0415, "step": 478 }, { "epoch": 0.35907046476761617, "grad_norm": 0.23751907682089213, "learning_rate": 0.00019976855643152338, "loss": 0.053, "step": 479 }, { "epoch": 0.3598200899550225, "grad_norm": 0.1191015992983086, "learning_rate": 0.00019976258630406148, "loss": 0.0146, "step": 480 }, { "epoch": 0.3605697151424288, "grad_norm": 0.12423667768269722, "learning_rate": 0.00019975654024524024, "loss": 0.0231, "step": 481 }, { "epoch": 0.3613193403298351, "grad_norm": 0.1294080326047856, "learning_rate": 0.0001997504182596615, "loss": 0.0349, "step": 482 }, { "epoch": 0.3620689655172414, "grad_norm": 0.1642500818195986, "learning_rate": 0.00019974422035198478, "loss": 0.0448, "step": 483 }, { "epoch": 0.36281859070464767, "grad_norm": 0.13000096887910373, "learning_rate": 0.00019973794652692744, "loss": 0.0235, "step": 484 }, { "epoch": 0.36356821589205396, "grad_norm": 0.21703135831908477, "learning_rate": 0.00019973159678926466, "loss": 0.0381, "step": 485 }, { "epoch": 0.36431784107946025, "grad_norm": 0.18005988456596617, "learning_rate": 0.0001997251711438293, "loss": 0.0318, "step": 486 }, { "epoch": 0.3650674662668666, "grad_norm": 0.20888270057043598, "learning_rate": 0.00019971866959551208, "loss": 0.0473, "step": 487 }, { "epoch": 0.3658170914542729, "grad_norm": 0.4512020978781191, "learning_rate": 0.00019971209214926145, "loss": 0.064, "step": 488 }, { "epoch": 0.36656671664167917, "grad_norm": 0.14926340432556415, "learning_rate": 0.00019970543881008367, "loss": 0.0185, "step": 489 }, { "epoch": 0.36731634182908546, "grad_norm": 0.08818125993431587, "learning_rate": 0.0001996987095830427, "loss": 0.0233, "step": 490 }, { "epoch": 0.36806596701649175, "grad_norm": 0.08969986834486512, "learning_rate": 0.0001996919044732603, "loss": 0.019, "step": 491 }, { "epoch": 0.36881559220389803, "grad_norm": 0.41899818938779476, "learning_rate": 0.000199685023485916, "loss": 0.0997, "step": 492 }, { "epoch": 0.3695652173913043, "grad_norm": 0.33768765033376147, "learning_rate": 0.00019967806662624706, "loss": 0.0584, "step": 493 }, { "epoch": 0.37031484257871067, "grad_norm": 0.1901313204957896, "learning_rate": 0.00019967103389954843, "loss": 0.0335, "step": 494 }, { "epoch": 0.37106446776611696, "grad_norm": 0.19759284135729685, "learning_rate": 0.00019966392531117297, "loss": 0.0414, "step": 495 }, { "epoch": 0.37181409295352325, "grad_norm": 0.29640486472757016, "learning_rate": 0.0001996567408665311, "loss": 0.04, "step": 496 }, { "epoch": 0.37256371814092953, "grad_norm": 0.25841113695795564, "learning_rate": 0.00019964948057109102, "loss": 0.0558, "step": 497 }, { "epoch": 0.3733133433283358, "grad_norm": 0.33965462242202665, "learning_rate": 0.00019964214443037875, "loss": 0.0306, "step": 498 }, { "epoch": 0.3740629685157421, "grad_norm": 0.26841847008722564, "learning_rate": 0.00019963473244997797, "loss": 0.069, "step": 499 }, { "epoch": 0.3748125937031484, "grad_norm": 0.30309480215998835, "learning_rate": 0.00019962724463553006, "loss": 0.0459, "step": 500 }, { "epoch": 0.37556221889055474, "grad_norm": 0.14318386120653975, "learning_rate": 0.0001996196809927342, "loss": 0.0381, "step": 501 }, { "epoch": 0.37631184407796103, "grad_norm": 0.19393154852463967, "learning_rate": 0.00019961204152734715, "loss": 0.0416, "step": 502 }, { "epoch": 0.3770614692653673, "grad_norm": 0.1669666962274524, "learning_rate": 0.00019960432624518357, "loss": 0.0271, "step": 503 }, { "epoch": 0.3778110944527736, "grad_norm": 0.19198896931984977, "learning_rate": 0.00019959653515211567, "loss": 0.0189, "step": 504 }, { "epoch": 0.3785607196401799, "grad_norm": 0.27264366841448173, "learning_rate": 0.0001995886682540734, "loss": 0.0364, "step": 505 }, { "epoch": 0.3793103448275862, "grad_norm": 0.16942388949482184, "learning_rate": 0.00019958072555704441, "loss": 0.0346, "step": 506 }, { "epoch": 0.3800599700149925, "grad_norm": 0.413830565527178, "learning_rate": 0.0001995727070670741, "loss": 0.1878, "step": 507 }, { "epoch": 0.3808095952023988, "grad_norm": 0.16726770681064876, "learning_rate": 0.00019956461279026545, "loss": 0.0185, "step": 508 }, { "epoch": 0.3815592203898051, "grad_norm": 0.48839983148694394, "learning_rate": 0.00019955644273277921, "loss": 0.0689, "step": 509 }, { "epoch": 0.3823088455772114, "grad_norm": 0.07894912430939359, "learning_rate": 0.00019954819690083377, "loss": 0.0155, "step": 510 }, { "epoch": 0.3830584707646177, "grad_norm": 0.21924729131386728, "learning_rate": 0.00019953987530070522, "loss": 0.041, "step": 511 }, { "epoch": 0.383808095952024, "grad_norm": 0.30235520265779503, "learning_rate": 0.0001995314779387273, "loss": 0.0746, "step": 512 }, { "epoch": 0.38455772113943026, "grad_norm": 0.09074048325041456, "learning_rate": 0.0001995230048212914, "loss": 0.0219, "step": 513 }, { "epoch": 0.3853073463268366, "grad_norm": 0.1113928831644256, "learning_rate": 0.00019951445595484658, "loss": 0.0336, "step": 514 }, { "epoch": 0.3860569715142429, "grad_norm": 0.12625784415150787, "learning_rate": 0.00019950583134589956, "loss": 0.0169, "step": 515 }, { "epoch": 0.3868065967016492, "grad_norm": 0.19223761966230765, "learning_rate": 0.00019949713100101475, "loss": 0.033, "step": 516 }, { "epoch": 0.3875562218890555, "grad_norm": 0.17631590192078078, "learning_rate": 0.0001994883549268141, "loss": 0.0272, "step": 517 }, { "epoch": 0.38830584707646176, "grad_norm": 0.22486637349137442, "learning_rate": 0.0001994795031299773, "loss": 0.0198, "step": 518 }, { "epoch": 0.38905547226386805, "grad_norm": 0.12301541424429753, "learning_rate": 0.0001994705756172416, "loss": 0.016, "step": 519 }, { "epoch": 0.38980509745127434, "grad_norm": 0.19443551896854133, "learning_rate": 0.00019946157239540194, "loss": 0.0432, "step": 520 }, { "epoch": 0.3905547226386807, "grad_norm": 0.20182425703756351, "learning_rate": 0.00019945249347131088, "loss": 0.0229, "step": 521 }, { "epoch": 0.391304347826087, "grad_norm": 0.2858383123865256, "learning_rate": 0.00019944333885187854, "loss": 0.0579, "step": 522 }, { "epoch": 0.39205397301349326, "grad_norm": 0.2172640794478875, "learning_rate": 0.0001994341085440727, "loss": 0.038, "step": 523 }, { "epoch": 0.39280359820089955, "grad_norm": 0.08883145436518529, "learning_rate": 0.0001994248025549187, "loss": 0.0177, "step": 524 }, { "epoch": 0.39355322338830584, "grad_norm": 0.14015443846926087, "learning_rate": 0.0001994154208914996, "loss": 0.0231, "step": 525 }, { "epoch": 0.39430284857571213, "grad_norm": 0.10302600943071646, "learning_rate": 0.00019940596356095592, "loss": 0.0151, "step": 526 }, { "epoch": 0.3950524737631184, "grad_norm": 0.9708649007460824, "learning_rate": 0.00019939643057048588, "loss": 0.0883, "step": 527 }, { "epoch": 0.39580209895052476, "grad_norm": 0.3148212650301697, "learning_rate": 0.00019938682192734516, "loss": 0.0349, "step": 528 }, { "epoch": 0.39655172413793105, "grad_norm": 0.539851691055398, "learning_rate": 0.00019937713763884714, "loss": 0.1123, "step": 529 }, { "epoch": 0.39730134932533734, "grad_norm": 0.1175820770588898, "learning_rate": 0.00019936737771236273, "loss": 0.0178, "step": 530 }, { "epoch": 0.3980509745127436, "grad_norm": 0.8469963459052572, "learning_rate": 0.00019935754215532042, "loss": 0.0395, "step": 531 }, { "epoch": 0.3988005997001499, "grad_norm": 0.11032966949683876, "learning_rate": 0.00019934763097520622, "loss": 0.0263, "step": 532 }, { "epoch": 0.3995502248875562, "grad_norm": 0.13050708875089792, "learning_rate": 0.00019933764417956376, "loss": 0.0254, "step": 533 }, { "epoch": 0.4002998500749625, "grad_norm": 0.3062589113054434, "learning_rate": 0.00019932758177599423, "loss": 0.084, "step": 534 }, { "epoch": 0.40104947526236884, "grad_norm": 0.26243574980633094, "learning_rate": 0.00019931744377215625, "loss": 0.0536, "step": 535 }, { "epoch": 0.4017991004497751, "grad_norm": 0.33765459498988715, "learning_rate": 0.0001993072301757661, "loss": 0.0591, "step": 536 }, { "epoch": 0.4025487256371814, "grad_norm": 0.25884008527812624, "learning_rate": 0.0001992969409945976, "loss": 0.0359, "step": 537 }, { "epoch": 0.4032983508245877, "grad_norm": 0.17956799020329997, "learning_rate": 0.00019928657623648202, "loss": 0.0504, "step": 538 }, { "epoch": 0.404047976011994, "grad_norm": 0.3472847655807921, "learning_rate": 0.0001992761359093082, "loss": 0.1299, "step": 539 }, { "epoch": 0.4047976011994003, "grad_norm": 0.17649807509299384, "learning_rate": 0.00019926562002102244, "loss": 0.0271, "step": 540 }, { "epoch": 0.40554722638680657, "grad_norm": 0.23706785586528362, "learning_rate": 0.00019925502857962867, "loss": 0.0577, "step": 541 }, { "epoch": 0.4062968515742129, "grad_norm": 0.33871120855449127, "learning_rate": 0.00019924436159318823, "loss": 0.0572, "step": 542 }, { "epoch": 0.4070464767616192, "grad_norm": 0.3197595611177245, "learning_rate": 0.00019923361906981998, "loss": 0.0848, "step": 543 }, { "epoch": 0.4077961019490255, "grad_norm": 0.16797474123430559, "learning_rate": 0.0001992228010177003, "loss": 0.0343, "step": 544 }, { "epoch": 0.4085457271364318, "grad_norm": 0.2859867447131316, "learning_rate": 0.00019921190744506297, "loss": 0.0887, "step": 545 }, { "epoch": 0.40929535232383807, "grad_norm": 0.286424751553018, "learning_rate": 0.00019920093836019937, "loss": 0.046, "step": 546 }, { "epoch": 0.41004497751124436, "grad_norm": 0.26871363149433986, "learning_rate": 0.00019918989377145827, "loss": 0.0338, "step": 547 }, { "epoch": 0.4107946026986507, "grad_norm": 0.22113528855076964, "learning_rate": 0.00019917877368724597, "loss": 0.0393, "step": 548 }, { "epoch": 0.411544227886057, "grad_norm": 0.5080759113354069, "learning_rate": 0.00019916757811602612, "loss": 0.0426, "step": 549 }, { "epoch": 0.4122938530734633, "grad_norm": 0.17224155989086465, "learning_rate": 0.00019915630706631996, "loss": 0.0332, "step": 550 }, { "epoch": 0.41304347826086957, "grad_norm": 0.1413381734356381, "learning_rate": 0.00019914496054670612, "loss": 0.0207, "step": 551 }, { "epoch": 0.41379310344827586, "grad_norm": 0.40497374740103204, "learning_rate": 0.00019913353856582062, "loss": 0.0429, "step": 552 }, { "epoch": 0.41454272863568215, "grad_norm": 0.1827489824911213, "learning_rate": 0.00019912204113235702, "loss": 0.0433, "step": 553 }, { "epoch": 0.41529235382308843, "grad_norm": 0.13058740270083716, "learning_rate": 0.00019911046825506624, "loss": 0.0295, "step": 554 }, { "epoch": 0.4160419790104948, "grad_norm": 0.3724498056481174, "learning_rate": 0.00019909881994275663, "loss": 0.0453, "step": 555 }, { "epoch": 0.41679160419790107, "grad_norm": 0.14013096187220492, "learning_rate": 0.00019908709620429393, "loss": 0.0164, "step": 556 }, { "epoch": 0.41754122938530736, "grad_norm": 0.15261000750805073, "learning_rate": 0.00019907529704860137, "loss": 0.0398, "step": 557 }, { "epoch": 0.41829085457271364, "grad_norm": 0.19468792500213766, "learning_rate": 0.0001990634224846595, "loss": 0.028, "step": 558 }, { "epoch": 0.41904047976011993, "grad_norm": 0.20642163177246278, "learning_rate": 0.0001990514725215063, "loss": 0.0482, "step": 559 }, { "epoch": 0.4197901049475262, "grad_norm": 0.2640924448933564, "learning_rate": 0.00019903944716823712, "loss": 0.0429, "step": 560 }, { "epoch": 0.4205397301349325, "grad_norm": 0.17380599186496173, "learning_rate": 0.00019902734643400475, "loss": 0.0298, "step": 561 }, { "epoch": 0.42128935532233885, "grad_norm": 0.10058745366405412, "learning_rate": 0.00019901517032801927, "loss": 0.0234, "step": 562 }, { "epoch": 0.42203898050974514, "grad_norm": 0.09094624368245648, "learning_rate": 0.00019900291885954817, "loss": 0.0273, "step": 563 }, { "epoch": 0.42278860569715143, "grad_norm": 0.5223505243850018, "learning_rate": 0.0001989905920379163, "loss": 0.057, "step": 564 }, { "epoch": 0.4235382308845577, "grad_norm": 0.1006554752205893, "learning_rate": 0.00019897818987250584, "loss": 0.0242, "step": 565 }, { "epoch": 0.424287856071964, "grad_norm": 0.19684931467674244, "learning_rate": 0.00019896571237275635, "loss": 0.0534, "step": 566 }, { "epoch": 0.4250374812593703, "grad_norm": 0.2875002734473597, "learning_rate": 0.00019895315954816475, "loss": 0.0546, "step": 567 }, { "epoch": 0.4257871064467766, "grad_norm": 0.2722797709611978, "learning_rate": 0.0001989405314082852, "loss": 0.0451, "step": 568 }, { "epoch": 0.42653673163418293, "grad_norm": 0.33219794421491367, "learning_rate": 0.00019892782796272922, "loss": 0.0463, "step": 569 }, { "epoch": 0.4272863568215892, "grad_norm": 0.38257048354477796, "learning_rate": 0.0001989150492211657, "loss": 0.0267, "step": 570 }, { "epoch": 0.4280359820089955, "grad_norm": 0.2016531621539919, "learning_rate": 0.0001989021951933208, "loss": 0.0475, "step": 571 }, { "epoch": 0.4287856071964018, "grad_norm": 0.2735832711729896, "learning_rate": 0.00019888926588897802, "loss": 0.0208, "step": 572 }, { "epoch": 0.4295352323838081, "grad_norm": 0.2390819482301444, "learning_rate": 0.00019887626131797806, "loss": 0.0548, "step": 573 }, { "epoch": 0.4302848575712144, "grad_norm": 0.36787937181380886, "learning_rate": 0.000198863181490219, "loss": 0.0746, "step": 574 }, { "epoch": 0.43103448275862066, "grad_norm": 0.1553408786452615, "learning_rate": 0.00019885002641565613, "loss": 0.0461, "step": 575 }, { "epoch": 0.431784107946027, "grad_norm": 0.11860907854546467, "learning_rate": 0.00019883679610430212, "loss": 0.0285, "step": 576 }, { "epoch": 0.4325337331334333, "grad_norm": 0.48368138764405016, "learning_rate": 0.00019882349056622676, "loss": 0.071, "step": 577 }, { "epoch": 0.4332833583208396, "grad_norm": 0.34040734982154025, "learning_rate": 0.0001988101098115572, "loss": 0.0407, "step": 578 }, { "epoch": 0.4340329835082459, "grad_norm": 0.3794863668368257, "learning_rate": 0.00019879665385047777, "loss": 0.0867, "step": 579 }, { "epoch": 0.43478260869565216, "grad_norm": 0.1897465577471384, "learning_rate": 0.00019878312269323018, "loss": 0.024, "step": 580 }, { "epoch": 0.43553223388305845, "grad_norm": 0.42172080155699915, "learning_rate": 0.00019876951635011317, "loss": 0.0354, "step": 581 }, { "epoch": 0.4362818590704648, "grad_norm": 0.14200473008966963, "learning_rate": 0.00019875583483148286, "loss": 0.0301, "step": 582 }, { "epoch": 0.4370314842578711, "grad_norm": 0.30603866886887476, "learning_rate": 0.00019874207814775253, "loss": 0.07, "step": 583 }, { "epoch": 0.43778110944527737, "grad_norm": 0.29839798216202484, "learning_rate": 0.00019872824630939265, "loss": 0.0695, "step": 584 }, { "epoch": 0.43853073463268366, "grad_norm": 0.16530441756570893, "learning_rate": 0.00019871433932693093, "loss": 0.0309, "step": 585 }, { "epoch": 0.43928035982008995, "grad_norm": 0.11795224999185168, "learning_rate": 0.00019870035721095226, "loss": 0.0174, "step": 586 }, { "epoch": 0.44002998500749624, "grad_norm": 0.1263111693836232, "learning_rate": 0.00019868629997209873, "loss": 0.0147, "step": 587 }, { "epoch": 0.4407796101949025, "grad_norm": 0.885677670738033, "learning_rate": 0.00019867216762106958, "loss": 0.064, "step": 588 }, { "epoch": 0.44152923538230887, "grad_norm": 0.21259265942058897, "learning_rate": 0.00019865796016862124, "loss": 0.029, "step": 589 }, { "epoch": 0.44227886056971516, "grad_norm": 0.43037037278365686, "learning_rate": 0.00019864367762556731, "loss": 0.1131, "step": 590 }, { "epoch": 0.44302848575712145, "grad_norm": 2.717368196121025, "learning_rate": 0.0001986293200027785, "loss": 0.1806, "step": 591 }, { "epoch": 0.44377811094452774, "grad_norm": 0.1869451347832849, "learning_rate": 0.0001986148873111827, "loss": 0.0464, "step": 592 }, { "epoch": 0.444527736131934, "grad_norm": 0.11771674570347297, "learning_rate": 0.00019860037956176491, "loss": 0.0231, "step": 593 }, { "epoch": 0.4452773613193403, "grad_norm": 1.304481347752616, "learning_rate": 0.00019858579676556733, "loss": 0.1225, "step": 594 }, { "epoch": 0.4460269865067466, "grad_norm": 0.14663993783317505, "learning_rate": 0.00019857113893368916, "loss": 0.029, "step": 595 }, { "epoch": 0.44677661169415295, "grad_norm": 0.19789026692683717, "learning_rate": 0.00019855640607728683, "loss": 0.0311, "step": 596 }, { "epoch": 0.44752623688155924, "grad_norm": 0.17345157071354902, "learning_rate": 0.00019854159820757383, "loss": 0.0452, "step": 597 }, { "epoch": 0.4482758620689655, "grad_norm": 0.22394789265532525, "learning_rate": 0.0001985267153358207, "loss": 0.0705, "step": 598 }, { "epoch": 0.4490254872563718, "grad_norm": 0.23461119444057724, "learning_rate": 0.00019851175747335507, "loss": 0.0545, "step": 599 }, { "epoch": 0.4497751124437781, "grad_norm": 0.4304063727827009, "learning_rate": 0.00019849672463156175, "loss": 0.0616, "step": 600 }, { "epoch": 0.4505247376311844, "grad_norm": 0.16955416376624108, "learning_rate": 0.0001984816168218825, "loss": 0.0356, "step": 601 }, { "epoch": 0.4512743628185907, "grad_norm": 0.14119902116354704, "learning_rate": 0.00019846643405581617, "loss": 0.0332, "step": 602 }, { "epoch": 0.452023988005997, "grad_norm": 0.10374004297285797, "learning_rate": 0.00019845117634491874, "loss": 0.0251, "step": 603 }, { "epoch": 0.4527736131934033, "grad_norm": 0.4211702606487173, "learning_rate": 0.0001984358437008031, "loss": 0.064, "step": 604 }, { "epoch": 0.4535232383808096, "grad_norm": 0.3669747220639361, "learning_rate": 0.0001984204361351393, "loss": 0.0455, "step": 605 }, { "epoch": 0.4542728635682159, "grad_norm": 0.1945083024191305, "learning_rate": 0.00019840495365965432, "loss": 0.0279, "step": 606 }, { "epoch": 0.4550224887556222, "grad_norm": 0.21941701311044354, "learning_rate": 0.0001983893962861322, "loss": 0.03, "step": 607 }, { "epoch": 0.45577211394302847, "grad_norm": 0.29686743953828243, "learning_rate": 0.00019837376402641397, "loss": 0.0549, "step": 608 }, { "epoch": 0.45652173913043476, "grad_norm": 0.30788805888526277, "learning_rate": 0.00019835805689239768, "loss": 0.0381, "step": 609 }, { "epoch": 0.4572713643178411, "grad_norm": 0.15721381527401596, "learning_rate": 0.0001983422748960383, "loss": 0.0423, "step": 610 }, { "epoch": 0.4580209895052474, "grad_norm": 0.3045330866637826, "learning_rate": 0.00019832641804934793, "loss": 0.0356, "step": 611 }, { "epoch": 0.4587706146926537, "grad_norm": 0.5176832724837477, "learning_rate": 0.00019831048636439547, "loss": 0.0467, "step": 612 }, { "epoch": 0.45952023988005997, "grad_norm": 0.3645973754690773, "learning_rate": 0.00019829447985330687, "loss": 0.0343, "step": 613 }, { "epoch": 0.46026986506746626, "grad_norm": 0.1612943182389018, "learning_rate": 0.00019827839852826502, "loss": 0.0415, "step": 614 }, { "epoch": 0.46101949025487254, "grad_norm": 0.1792253469087533, "learning_rate": 0.00019826224240150975, "loss": 0.0186, "step": 615 }, { "epoch": 0.4617691154422789, "grad_norm": 0.7884751065266186, "learning_rate": 0.00019824601148533779, "loss": 0.0596, "step": 616 }, { "epoch": 0.4625187406296852, "grad_norm": 0.42798297314799133, "learning_rate": 0.00019822970579210286, "loss": 0.0656, "step": 617 }, { "epoch": 0.46326836581709147, "grad_norm": 0.22456024180897916, "learning_rate": 0.00019821332533421558, "loss": 0.0359, "step": 618 }, { "epoch": 0.46401799100449775, "grad_norm": 0.14179957969217744, "learning_rate": 0.00019819687012414338, "loss": 0.0194, "step": 619 }, { "epoch": 0.46476761619190404, "grad_norm": 0.19534431900414814, "learning_rate": 0.00019818034017441075, "loss": 0.0421, "step": 620 }, { "epoch": 0.46551724137931033, "grad_norm": 0.31092361297673426, "learning_rate": 0.00019816373549759887, "loss": 0.0851, "step": 621 }, { "epoch": 0.4662668665667166, "grad_norm": 0.21818167223637625, "learning_rate": 0.000198147056106346, "loss": 0.0414, "step": 622 }, { "epoch": 0.46701649175412296, "grad_norm": 0.3361005606854733, "learning_rate": 0.0001981303020133471, "loss": 0.0376, "step": 623 }, { "epoch": 0.46776611694152925, "grad_norm": 0.27917569510632445, "learning_rate": 0.00019811347323135415, "loss": 0.0328, "step": 624 }, { "epoch": 0.46851574212893554, "grad_norm": 0.2421974362279143, "learning_rate": 0.00019809656977317578, "loss": 0.0377, "step": 625 }, { "epoch": 0.46926536731634183, "grad_norm": 0.37153373285545455, "learning_rate": 0.00019807959165167756, "loss": 0.0614, "step": 626 }, { "epoch": 0.4700149925037481, "grad_norm": 0.4354760899723439, "learning_rate": 0.00019806253887978193, "loss": 0.0617, "step": 627 }, { "epoch": 0.4707646176911544, "grad_norm": 0.3409376339007273, "learning_rate": 0.00019804541147046812, "loss": 0.072, "step": 628 }, { "epoch": 0.4715142428785607, "grad_norm": 0.20518415022662134, "learning_rate": 0.00019802820943677217, "loss": 0.0353, "step": 629 }, { "epoch": 0.47226386806596704, "grad_norm": 0.48302199757179837, "learning_rate": 0.0001980109327917868, "loss": 0.0949, "step": 630 }, { "epoch": 0.47301349325337333, "grad_norm": 0.3359911080571668, "learning_rate": 0.00019799358154866172, "loss": 0.06, "step": 631 }, { "epoch": 0.4737631184407796, "grad_norm": 0.20737705335990284, "learning_rate": 0.00019797615572060329, "loss": 0.0472, "step": 632 }, { "epoch": 0.4745127436281859, "grad_norm": 0.18157374159375403, "learning_rate": 0.00019795865532087464, "loss": 0.0241, "step": 633 }, { "epoch": 0.4752623688155922, "grad_norm": 0.17515650486105908, "learning_rate": 0.00019794108036279572, "loss": 0.0382, "step": 634 }, { "epoch": 0.4760119940029985, "grad_norm": 0.17927234002552866, "learning_rate": 0.00019792343085974317, "loss": 0.0438, "step": 635 }, { "epoch": 0.4767616191904048, "grad_norm": 0.1490232907487753, "learning_rate": 0.00019790570682515042, "loss": 0.0304, "step": 636 }, { "epoch": 0.4775112443778111, "grad_norm": 0.11924600179670634, "learning_rate": 0.00019788790827250755, "loss": 0.0301, "step": 637 }, { "epoch": 0.4782608695652174, "grad_norm": 0.23746193370080762, "learning_rate": 0.00019787003521536142, "loss": 0.0605, "step": 638 }, { "epoch": 0.4790104947526237, "grad_norm": 0.17930143257936546, "learning_rate": 0.00019785208766731564, "loss": 0.0314, "step": 639 }, { "epoch": 0.47976011994003, "grad_norm": 0.2676999155407892, "learning_rate": 0.00019783406564203036, "loss": 0.0331, "step": 640 }, { "epoch": 0.48050974512743627, "grad_norm": 0.4707846057350767, "learning_rate": 0.00019781596915322263, "loss": 0.0633, "step": 641 }, { "epoch": 0.48125937031484256, "grad_norm": 0.2747406037837668, "learning_rate": 0.00019779779821466599, "loss": 0.0714, "step": 642 }, { "epoch": 0.48200899550224885, "grad_norm": 0.2882785148580262, "learning_rate": 0.00019777955284019068, "loss": 0.0729, "step": 643 }, { "epoch": 0.4827586206896552, "grad_norm": 0.24869504191527775, "learning_rate": 0.00019776123304368368, "loss": 0.047, "step": 644 }, { "epoch": 0.4835082458770615, "grad_norm": 0.29998707906477473, "learning_rate": 0.00019774283883908854, "loss": 0.0871, "step": 645 }, { "epoch": 0.48425787106446777, "grad_norm": 0.24329771856894528, "learning_rate": 0.00019772437024040556, "loss": 0.0399, "step": 646 }, { "epoch": 0.48500749625187406, "grad_norm": 0.14822184490418175, "learning_rate": 0.00019770582726169145, "loss": 0.0278, "step": 647 }, { "epoch": 0.48575712143928035, "grad_norm": 0.15941464998123417, "learning_rate": 0.0001976872099170597, "loss": 0.0217, "step": 648 }, { "epoch": 0.48650674662668664, "grad_norm": 0.1703461578982572, "learning_rate": 0.00019766851822068034, "loss": 0.0281, "step": 649 }, { "epoch": 0.487256371814093, "grad_norm": 0.2009908790648143, "learning_rate": 0.00019764975218678003, "loss": 0.044, "step": 650 }, { "epoch": 0.48800599700149927, "grad_norm": 0.41720382557820945, "learning_rate": 0.000197630911829642, "loss": 0.0615, "step": 651 }, { "epoch": 0.48875562218890556, "grad_norm": 0.38062572986525894, "learning_rate": 0.000197611997163606, "loss": 0.0652, "step": 652 }, { "epoch": 0.48950524737631185, "grad_norm": 0.23796291211665221, "learning_rate": 0.00019759300820306833, "loss": 0.0344, "step": 653 }, { "epoch": 0.49025487256371814, "grad_norm": 0.2940292683898967, "learning_rate": 0.00019757394496248195, "loss": 0.0278, "step": 654 }, { "epoch": 0.4910044977511244, "grad_norm": 0.12805376294248555, "learning_rate": 0.0001975548074563563, "loss": 0.0379, "step": 655 }, { "epoch": 0.4917541229385307, "grad_norm": 0.6717296161523604, "learning_rate": 0.00019753559569925727, "loss": 0.0311, "step": 656 }, { "epoch": 0.49250374812593706, "grad_norm": 0.21375850359128662, "learning_rate": 0.00019751630970580737, "loss": 0.04, "step": 657 }, { "epoch": 0.49325337331334335, "grad_norm": 1.1011544057656006, "learning_rate": 0.00019749694949068553, "loss": 0.0447, "step": 658 }, { "epoch": 0.49400299850074963, "grad_norm": 0.2344054804046405, "learning_rate": 0.00019747751506862723, "loss": 0.0398, "step": 659 }, { "epoch": 0.4947526236881559, "grad_norm": 0.2688401179053218, "learning_rate": 0.0001974580064544244, "loss": 0.0491, "step": 660 }, { "epoch": 0.4955022488755622, "grad_norm": 0.3609463716415746, "learning_rate": 0.0001974384236629254, "loss": 0.0283, "step": 661 }, { "epoch": 0.4962518740629685, "grad_norm": 0.1924430257206513, "learning_rate": 0.0001974187667090352, "loss": 0.0387, "step": 662 }, { "epoch": 0.4970014992503748, "grad_norm": 2.4858317118535243, "learning_rate": 0.00019739903560771506, "loss": 0.1203, "step": 663 }, { "epoch": 0.49775112443778113, "grad_norm": 0.15149651631112154, "learning_rate": 0.00019737923037398272, "loss": 0.0347, "step": 664 }, { "epoch": 0.4985007496251874, "grad_norm": 0.19402979633130193, "learning_rate": 0.00019735935102291238, "loss": 0.0269, "step": 665 }, { "epoch": 0.4992503748125937, "grad_norm": 0.3861280819653178, "learning_rate": 0.00019733939756963456, "loss": 0.0525, "step": 666 }, { "epoch": 0.5, "grad_norm": 0.6142151870943602, "learning_rate": 0.00019731937002933633, "loss": 0.0987, "step": 667 }, { "epoch": 0.5007496251874063, "grad_norm": 0.31598454375716306, "learning_rate": 0.00019729926841726098, "loss": 0.0529, "step": 668 }, { "epoch": 0.5014992503748126, "grad_norm": 0.5469064911673248, "learning_rate": 0.0001972790927487083, "loss": 0.0813, "step": 669 }, { "epoch": 0.5022488755622189, "grad_norm": 0.34228656676192837, "learning_rate": 0.00019725884303903443, "loss": 0.0515, "step": 670 }, { "epoch": 0.5029985007496252, "grad_norm": 0.09512824108899938, "learning_rate": 0.0001972385193036518, "loss": 0.023, "step": 671 }, { "epoch": 0.5037481259370314, "grad_norm": 1.0188850279916533, "learning_rate": 0.00019721812155802922, "loss": 0.0571, "step": 672 }, { "epoch": 0.5044977511244377, "grad_norm": 0.18675863579698465, "learning_rate": 0.0001971976498176919, "loss": 0.0377, "step": 673 }, { "epoch": 0.5052473763118441, "grad_norm": 0.3293723258020241, "learning_rate": 0.00019717710409822127, "loss": 0.0615, "step": 674 }, { "epoch": 0.5059970014992504, "grad_norm": 0.1875167046612201, "learning_rate": 0.00019715648441525504, "loss": 0.0482, "step": 675 }, { "epoch": 0.5067466266866567, "grad_norm": 0.2144866629023266, "learning_rate": 0.00019713579078448737, "loss": 0.0508, "step": 676 }, { "epoch": 0.507496251874063, "grad_norm": 0.254014517882649, "learning_rate": 0.00019711502322166855, "loss": 0.0465, "step": 677 }, { "epoch": 0.5082458770614693, "grad_norm": 0.3144605160912174, "learning_rate": 0.0001970941817426052, "loss": 0.0404, "step": 678 }, { "epoch": 0.5089955022488756, "grad_norm": 0.17268374165707537, "learning_rate": 0.00019707326636316024, "loss": 0.0403, "step": 679 }, { "epoch": 0.5097451274362819, "grad_norm": 0.3780163974603985, "learning_rate": 0.00019705227709925275, "loss": 0.1052, "step": 680 }, { "epoch": 0.5104947526236882, "grad_norm": 0.2701403020134673, "learning_rate": 0.0001970312139668581, "loss": 0.0391, "step": 681 }, { "epoch": 0.5112443778110944, "grad_norm": 0.40230538667716226, "learning_rate": 0.0001970100769820079, "loss": 0.0508, "step": 682 }, { "epoch": 0.5119940029985007, "grad_norm": 0.18935347804559302, "learning_rate": 0.00019698886616078992, "loss": 0.03, "step": 683 }, { "epoch": 0.512743628185907, "grad_norm": 0.284015406678792, "learning_rate": 0.00019696758151934818, "loss": 0.0467, "step": 684 }, { "epoch": 0.5134932533733133, "grad_norm": 0.2654446934199373, "learning_rate": 0.0001969462230738828, "loss": 0.0272, "step": 685 }, { "epoch": 0.5142428785607196, "grad_norm": 0.31418359639514515, "learning_rate": 0.00019692479084065022, "loss": 0.0559, "step": 686 }, { "epoch": 0.5149925037481259, "grad_norm": 0.17910995632397456, "learning_rate": 0.00019690328483596288, "loss": 0.023, "step": 687 }, { "epoch": 0.5157421289355323, "grad_norm": 0.3000429801531833, "learning_rate": 0.00019688170507618948, "loss": 0.0394, "step": 688 }, { "epoch": 0.5164917541229386, "grad_norm": 0.22024717592881649, "learning_rate": 0.00019686005157775483, "loss": 0.0276, "step": 689 }, { "epoch": 0.5172413793103449, "grad_norm": 0.19465821016826632, "learning_rate": 0.0001968383243571398, "loss": 0.0443, "step": 690 }, { "epoch": 0.5179910044977512, "grad_norm": 0.2892151529682172, "learning_rate": 0.00019681652343088145, "loss": 0.043, "step": 691 }, { "epoch": 0.5187406296851574, "grad_norm": 0.7128456619867778, "learning_rate": 0.00019679464881557296, "loss": 0.0591, "step": 692 }, { "epoch": 0.5194902548725637, "grad_norm": 0.15695747122101597, "learning_rate": 0.0001967727005278635, "loss": 0.0271, "step": 693 }, { "epoch": 0.52023988005997, "grad_norm": 0.16536616371095036, "learning_rate": 0.00019675067858445836, "loss": 0.0348, "step": 694 }, { "epoch": 0.5209895052473763, "grad_norm": 0.3521117447848854, "learning_rate": 0.0001967285830021189, "loss": 0.0587, "step": 695 }, { "epoch": 0.5217391304347826, "grad_norm": 0.11617823896876604, "learning_rate": 0.00019670641379766258, "loss": 0.0214, "step": 696 }, { "epoch": 0.5224887556221889, "grad_norm": 0.8403253233628167, "learning_rate": 0.00019668417098796278, "loss": 0.0485, "step": 697 }, { "epoch": 0.5232383808095952, "grad_norm": 0.3771163400001756, "learning_rate": 0.00019666185458994896, "loss": 0.0714, "step": 698 }, { "epoch": 0.5239880059970015, "grad_norm": 0.26485705204934235, "learning_rate": 0.00019663946462060664, "loss": 0.0538, "step": 699 }, { "epoch": 0.5247376311844077, "grad_norm": 0.2035383135631764, "learning_rate": 0.0001966170010969772, "loss": 0.0541, "step": 700 }, { "epoch": 0.525487256371814, "grad_norm": 0.28178178783706, "learning_rate": 0.00019659446403615815, "loss": 0.0473, "step": 701 }, { "epoch": 0.5262368815592204, "grad_norm": 0.23503639805567167, "learning_rate": 0.00019657185345530294, "loss": 0.0587, "step": 702 }, { "epoch": 0.5269865067466267, "grad_norm": 0.160425614898426, "learning_rate": 0.00019654916937162086, "loss": 0.038, "step": 703 }, { "epoch": 0.527736131934033, "grad_norm": 0.189609585963746, "learning_rate": 0.0001965264118023773, "loss": 0.0354, "step": 704 }, { "epoch": 0.5284857571214393, "grad_norm": 0.22997929354771204, "learning_rate": 0.00019650358076489353, "loss": 0.0337, "step": 705 }, { "epoch": 0.5292353823088456, "grad_norm": 0.2570696653356216, "learning_rate": 0.00019648067627654666, "loss": 0.0676, "step": 706 }, { "epoch": 0.5299850074962519, "grad_norm": 0.145032416897268, "learning_rate": 0.00019645769835476983, "loss": 0.0268, "step": 707 }, { "epoch": 0.5307346326836582, "grad_norm": 0.1375934632321734, "learning_rate": 0.00019643464701705194, "loss": 0.0342, "step": 708 }, { "epoch": 0.5314842578710645, "grad_norm": 0.1593212524417168, "learning_rate": 0.0001964115222809379, "loss": 0.0243, "step": 709 }, { "epoch": 0.5322338830584707, "grad_norm": 0.11195453186432093, "learning_rate": 0.0001963883241640284, "loss": 0.0209, "step": 710 }, { "epoch": 0.532983508245877, "grad_norm": 0.11361465549252572, "learning_rate": 0.00019636505268397998, "loss": 0.0267, "step": 711 }, { "epoch": 0.5337331334332833, "grad_norm": 0.12507931145914533, "learning_rate": 0.0001963417078585051, "loss": 0.0224, "step": 712 }, { "epoch": 0.5344827586206896, "grad_norm": 0.14794369943130184, "learning_rate": 0.00019631828970537196, "loss": 0.0417, "step": 713 }, { "epoch": 0.5352323838080959, "grad_norm": 0.31812402854885846, "learning_rate": 0.00019629479824240456, "loss": 0.0498, "step": 714 }, { "epoch": 0.5359820089955023, "grad_norm": 0.13942207901890347, "learning_rate": 0.00019627123348748277, "loss": 0.0229, "step": 715 }, { "epoch": 0.5367316341829086, "grad_norm": 0.26849290243133633, "learning_rate": 0.00019624759545854226, "loss": 0.0343, "step": 716 }, { "epoch": 0.5374812593703149, "grad_norm": 0.2824175224592763, "learning_rate": 0.00019622388417357434, "loss": 0.0497, "step": 717 }, { "epoch": 0.5382308845577212, "grad_norm": 0.26827286082010765, "learning_rate": 0.00019620009965062618, "loss": 0.056, "step": 718 }, { "epoch": 0.5389805097451275, "grad_norm": 0.2507495350520352, "learning_rate": 0.00019617624190780068, "loss": 0.0525, "step": 719 }, { "epoch": 0.5397301349325337, "grad_norm": 0.15206872543556993, "learning_rate": 0.00019615231096325645, "loss": 0.0382, "step": 720 }, { "epoch": 0.54047976011994, "grad_norm": 0.2534444723340265, "learning_rate": 0.00019612830683520785, "loss": 0.0484, "step": 721 }, { "epoch": 0.5412293853073463, "grad_norm": 0.17449911174599694, "learning_rate": 0.00019610422954192491, "loss": 0.0433, "step": 722 }, { "epoch": 0.5419790104947526, "grad_norm": 0.23763217664971512, "learning_rate": 0.00019608007910173334, "loss": 0.0243, "step": 723 }, { "epoch": 0.5427286356821589, "grad_norm": 0.39657496104464174, "learning_rate": 0.00019605585553301454, "loss": 0.0627, "step": 724 }, { "epoch": 0.5434782608695652, "grad_norm": 0.2629287185998369, "learning_rate": 0.00019603155885420555, "loss": 0.0388, "step": 725 }, { "epoch": 0.5442278860569715, "grad_norm": 0.2925833008523378, "learning_rate": 0.00019600718908379912, "loss": 0.0477, "step": 726 }, { "epoch": 0.5449775112443778, "grad_norm": 0.1835903425623574, "learning_rate": 0.00019598274624034353, "loss": 0.0488, "step": 727 }, { "epoch": 0.545727136431784, "grad_norm": 0.1309786695888962, "learning_rate": 0.00019595823034244275, "loss": 0.0283, "step": 728 }, { "epoch": 0.5464767616191905, "grad_norm": 0.21239724844772065, "learning_rate": 0.00019593364140875637, "loss": 0.0372, "step": 729 }, { "epoch": 0.5472263868065967, "grad_norm": 0.1282745709770564, "learning_rate": 0.0001959089794579995, "loss": 0.0284, "step": 730 }, { "epoch": 0.547976011994003, "grad_norm": 0.13062589463989482, "learning_rate": 0.00019588424450894283, "loss": 0.0412, "step": 731 }, { "epoch": 0.5487256371814093, "grad_norm": 0.1671330845222855, "learning_rate": 0.00019585943658041271, "loss": 0.0329, "step": 732 }, { "epoch": 0.5494752623688156, "grad_norm": 0.11756985208400414, "learning_rate": 0.00019583455569129092, "loss": 0.0184, "step": 733 }, { "epoch": 0.5502248875562219, "grad_norm": 0.18075813633689752, "learning_rate": 0.0001958096018605148, "loss": 0.0329, "step": 734 }, { "epoch": 0.5509745127436282, "grad_norm": 0.12221795958442933, "learning_rate": 0.00019578457510707728, "loss": 0.0178, "step": 735 }, { "epoch": 0.5517241379310345, "grad_norm": 0.16343977992570569, "learning_rate": 0.0001957594754500267, "loss": 0.0188, "step": 736 }, { "epoch": 0.5524737631184408, "grad_norm": 0.11873410698691594, "learning_rate": 0.00019573430290846692, "loss": 0.0215, "step": 737 }, { "epoch": 0.553223388305847, "grad_norm": 0.5353970283675913, "learning_rate": 0.00019570905750155733, "loss": 0.1006, "step": 738 }, { "epoch": 0.5539730134932533, "grad_norm": 0.11628544332223312, "learning_rate": 0.00019568373924851265, "loss": 0.0281, "step": 739 }, { "epoch": 0.5547226386806596, "grad_norm": 0.14481274986660742, "learning_rate": 0.0001956583481686032, "loss": 0.0285, "step": 740 }, { "epoch": 0.5554722638680659, "grad_norm": 0.10228835567792285, "learning_rate": 0.0001956328842811546, "loss": 0.0182, "step": 741 }, { "epoch": 0.5562218890554723, "grad_norm": 0.14598036221788213, "learning_rate": 0.000195607347605548, "loss": 0.0164, "step": 742 }, { "epoch": 0.5569715142428786, "grad_norm": 0.1337465267184873, "learning_rate": 0.0001955817381612198, "loss": 0.0226, "step": 743 }, { "epoch": 0.5577211394302849, "grad_norm": 0.17512979637507264, "learning_rate": 0.00019555605596766195, "loss": 0.0257, "step": 744 }, { "epoch": 0.5584707646176912, "grad_norm": 0.26359447087528937, "learning_rate": 0.00019553030104442164, "loss": 0.0784, "step": 745 }, { "epoch": 0.5592203898050975, "grad_norm": 0.24337056758185213, "learning_rate": 0.00019550447341110157, "loss": 0.0119, "step": 746 }, { "epoch": 0.5599700149925038, "grad_norm": 0.2291020714386086, "learning_rate": 0.00019547857308735954, "loss": 0.0386, "step": 747 }, { "epoch": 0.56071964017991, "grad_norm": 0.6099594043333565, "learning_rate": 0.00019545260009290894, "loss": 0.0686, "step": 748 }, { "epoch": 0.5614692653673163, "grad_norm": 0.23313273301357787, "learning_rate": 0.00019542655444751832, "loss": 0.0358, "step": 749 }, { "epoch": 0.5622188905547226, "grad_norm": 0.30721566706948694, "learning_rate": 0.0001954004361710115, "loss": 0.056, "step": 750 }, { "epoch": 0.5629685157421289, "grad_norm": 0.14868748862150133, "learning_rate": 0.00019537424528326774, "loss": 0.0347, "step": 751 }, { "epoch": 0.5637181409295352, "grad_norm": 0.35283868429312015, "learning_rate": 0.00019534798180422138, "loss": 0.055, "step": 752 }, { "epoch": 0.5644677661169415, "grad_norm": 0.1916124066278781, "learning_rate": 0.00019532164575386215, "loss": 0.0361, "step": 753 }, { "epoch": 0.5652173913043478, "grad_norm": 0.21612561370379293, "learning_rate": 0.0001952952371522349, "loss": 0.0249, "step": 754 }, { "epoch": 0.5659670164917541, "grad_norm": 0.2924146234215674, "learning_rate": 0.0001952687560194398, "loss": 0.0565, "step": 755 }, { "epoch": 0.5667166416791605, "grad_norm": 0.2852650400539102, "learning_rate": 0.0001952422023756322, "loss": 0.0667, "step": 756 }, { "epoch": 0.5674662668665668, "grad_norm": 0.11941722208051125, "learning_rate": 0.0001952155762410226, "loss": 0.0287, "step": 757 }, { "epoch": 0.568215892053973, "grad_norm": 0.4540888173579844, "learning_rate": 0.0001951888776358767, "loss": 0.0204, "step": 758 }, { "epoch": 0.5689655172413793, "grad_norm": 0.3373617845244113, "learning_rate": 0.00019516210658051535, "loss": 0.0682, "step": 759 }, { "epoch": 0.5697151424287856, "grad_norm": 0.22739701260998646, "learning_rate": 0.00019513526309531459, "loss": 0.0407, "step": 760 }, { "epoch": 0.5704647676161919, "grad_norm": 0.1672076747143311, "learning_rate": 0.0001951083472007055, "loss": 0.0318, "step": 761 }, { "epoch": 0.5712143928035982, "grad_norm": 0.1389020412850082, "learning_rate": 0.00019508135891717435, "loss": 0.0211, "step": 762 }, { "epoch": 0.5719640179910045, "grad_norm": 0.17191468308215127, "learning_rate": 0.00019505429826526246, "loss": 0.0453, "step": 763 }, { "epoch": 0.5727136431784108, "grad_norm": 0.17000750173740772, "learning_rate": 0.0001950271652655663, "loss": 0.0225, "step": 764 }, { "epoch": 0.5734632683658171, "grad_norm": 0.11216968441486351, "learning_rate": 0.0001949999599387373, "loss": 0.018, "step": 765 }, { "epoch": 0.5742128935532234, "grad_norm": 0.25006865233631487, "learning_rate": 0.00019497268230548201, "loss": 0.022, "step": 766 }, { "epoch": 0.5749625187406296, "grad_norm": 0.49925242688607274, "learning_rate": 0.00019494533238656204, "loss": 0.0656, "step": 767 }, { "epoch": 0.5757121439280359, "grad_norm": 0.21186420607643336, "learning_rate": 0.0001949179102027939, "loss": 0.0478, "step": 768 }, { "epoch": 0.5764617691154422, "grad_norm": 0.6140154868296298, "learning_rate": 0.0001948904157750492, "loss": 0.0392, "step": 769 }, { "epoch": 0.5772113943028486, "grad_norm": 0.09252928326152278, "learning_rate": 0.00019486284912425457, "loss": 0.0204, "step": 770 }, { "epoch": 0.5779610194902549, "grad_norm": 0.19345629880652473, "learning_rate": 0.0001948352102713915, "loss": 0.0251, "step": 771 }, { "epoch": 0.5787106446776612, "grad_norm": 0.31202632688570753, "learning_rate": 0.00019480749923749655, "loss": 0.0904, "step": 772 }, { "epoch": 0.5794602698650675, "grad_norm": 0.38726486933860926, "learning_rate": 0.00019477971604366106, "loss": 0.0521, "step": 773 }, { "epoch": 0.5802098950524738, "grad_norm": 0.2030051748098563, "learning_rate": 0.00019475186071103148, "loss": 0.0431, "step": 774 }, { "epoch": 0.5809595202398801, "grad_norm": 0.15348623359559546, "learning_rate": 0.00019472393326080905, "loss": 0.0253, "step": 775 }, { "epoch": 0.5817091454272864, "grad_norm": 0.2288982438224531, "learning_rate": 0.0001946959337142499, "loss": 0.0524, "step": 776 }, { "epoch": 0.5824587706146926, "grad_norm": 0.11109856913043137, "learning_rate": 0.0001946678620926651, "loss": 0.0269, "step": 777 }, { "epoch": 0.5832083958020989, "grad_norm": 2.07579081751696, "learning_rate": 0.00019463971841742056, "loss": 0.0681, "step": 778 }, { "epoch": 0.5839580209895052, "grad_norm": 0.31296298255106864, "learning_rate": 0.00019461150270993693, "loss": 0.045, "step": 779 }, { "epoch": 0.5847076461769115, "grad_norm": 0.22119938706955986, "learning_rate": 0.0001945832149916898, "loss": 0.0251, "step": 780 }, { "epoch": 0.5854572713643178, "grad_norm": 0.18401368936862708, "learning_rate": 0.00019455485528420958, "loss": 0.0447, "step": 781 }, { "epoch": 0.5862068965517241, "grad_norm": 0.34006952655746775, "learning_rate": 0.0001945264236090814, "loss": 0.0328, "step": 782 }, { "epoch": 0.5869565217391305, "grad_norm": 0.5719459819450913, "learning_rate": 0.00019449791998794516, "loss": 0.0232, "step": 783 }, { "epoch": 0.5877061469265368, "grad_norm": 0.1491340544778081, "learning_rate": 0.0001944693444424956, "loss": 0.0277, "step": 784 }, { "epoch": 0.5884557721139431, "grad_norm": 0.6464031380286603, "learning_rate": 0.00019444069699448214, "loss": 0.0444, "step": 785 }, { "epoch": 0.5892053973013494, "grad_norm": 0.23038490458838246, "learning_rate": 0.0001944119776657089, "loss": 0.0376, "step": 786 }, { "epoch": 0.5899550224887556, "grad_norm": 0.24375368417956617, "learning_rate": 0.00019438318647803482, "loss": 0.0359, "step": 787 }, { "epoch": 0.5907046476761619, "grad_norm": 0.22794673576648067, "learning_rate": 0.00019435432345337345, "loss": 0.054, "step": 788 }, { "epoch": 0.5914542728635682, "grad_norm": 1.5666537621230843, "learning_rate": 0.000194325388613693, "loss": 0.0565, "step": 789 }, { "epoch": 0.5922038980509745, "grad_norm": 0.23939494460941754, "learning_rate": 0.00019429638198101642, "loss": 0.0262, "step": 790 }, { "epoch": 0.5929535232383808, "grad_norm": 0.11512448557106804, "learning_rate": 0.00019426730357742123, "loss": 0.0333, "step": 791 }, { "epoch": 0.5937031484257871, "grad_norm": 0.08278414269358454, "learning_rate": 0.00019423815342503958, "loss": 0.0192, "step": 792 }, { "epoch": 0.5944527736131934, "grad_norm": 0.8234039282439967, "learning_rate": 0.00019420893154605828, "loss": 0.0202, "step": 793 }, { "epoch": 0.5952023988005997, "grad_norm": 0.1844774144077661, "learning_rate": 0.00019417963796271873, "loss": 0.0293, "step": 794 }, { "epoch": 0.595952023988006, "grad_norm": 0.5361123111117241, "learning_rate": 0.00019415027269731677, "loss": 0.0624, "step": 795 }, { "epoch": 0.5967016491754122, "grad_norm": 0.1558817603943623, "learning_rate": 0.00019412083577220304, "loss": 0.0299, "step": 796 }, { "epoch": 0.5974512743628186, "grad_norm": 0.25135916091871174, "learning_rate": 0.00019409132720978254, "loss": 0.0446, "step": 797 }, { "epoch": 0.5982008995502249, "grad_norm": 0.4137620063339877, "learning_rate": 0.00019406174703251482, "loss": 0.0617, "step": 798 }, { "epoch": 0.5989505247376312, "grad_norm": 0.13801847124843378, "learning_rate": 0.000194032095262914, "loss": 0.035, "step": 799 }, { "epoch": 0.5997001499250375, "grad_norm": 0.09371617366687199, "learning_rate": 0.0001940023719235486, "loss": 0.0136, "step": 800 }, { "epoch": 0.6004497751124438, "grad_norm": 0.1190313546094842, "learning_rate": 0.00019397257703704173, "loss": 0.0211, "step": 801 }, { "epoch": 0.6011994002998501, "grad_norm": 0.30906713673794106, "learning_rate": 0.00019394271062607085, "loss": 0.0345, "step": 802 }, { "epoch": 0.6019490254872564, "grad_norm": 0.25312589879783953, "learning_rate": 0.00019391277271336792, "loss": 0.0226, "step": 803 }, { "epoch": 0.6026986506746627, "grad_norm": 0.37710261116216454, "learning_rate": 0.0001938827633217193, "loss": 0.0386, "step": 804 }, { "epoch": 0.603448275862069, "grad_norm": 0.1889732863444513, "learning_rate": 0.00019385268247396571, "loss": 0.04, "step": 805 }, { "epoch": 0.6041979010494752, "grad_norm": 0.22033441208719085, "learning_rate": 0.0001938225301930024, "loss": 0.0512, "step": 806 }, { "epoch": 0.6049475262368815, "grad_norm": 0.2836356477878693, "learning_rate": 0.00019379230650177877, "loss": 0.0434, "step": 807 }, { "epoch": 0.6056971514242878, "grad_norm": 0.23341456899879873, "learning_rate": 0.00019376201142329882, "loss": 0.0452, "step": 808 }, { "epoch": 0.6064467766116941, "grad_norm": 0.25296621661074753, "learning_rate": 0.00019373164498062066, "loss": 0.0308, "step": 809 }, { "epoch": 0.6071964017991005, "grad_norm": 0.23863639164852932, "learning_rate": 0.00019370120719685687, "loss": 0.0271, "step": 810 }, { "epoch": 0.6079460269865068, "grad_norm": 0.20291122107848233, "learning_rate": 0.00019367069809517426, "loss": 0.0359, "step": 811 }, { "epoch": 0.6086956521739131, "grad_norm": 0.20115751012517755, "learning_rate": 0.00019364011769879393, "loss": 0.0331, "step": 812 }, { "epoch": 0.6094452773613194, "grad_norm": 0.725662505864241, "learning_rate": 0.00019360946603099127, "loss": 0.0808, "step": 813 }, { "epoch": 0.6101949025487257, "grad_norm": 0.3707632032177578, "learning_rate": 0.00019357874311509585, "loss": 0.0406, "step": 814 }, { "epoch": 0.610944527736132, "grad_norm": 0.2492238320478391, "learning_rate": 0.00019354794897449154, "loss": 0.0273, "step": 815 }, { "epoch": 0.6116941529235382, "grad_norm": 0.28949212299924676, "learning_rate": 0.00019351708363261641, "loss": 0.0741, "step": 816 }, { "epoch": 0.6124437781109445, "grad_norm": 0.16433768096408977, "learning_rate": 0.0001934861471129627, "loss": 0.0333, "step": 817 }, { "epoch": 0.6131934032983508, "grad_norm": 0.261884934270732, "learning_rate": 0.0001934551394390768, "loss": 0.0476, "step": 818 }, { "epoch": 0.6139430284857571, "grad_norm": 0.17727981078341598, "learning_rate": 0.00019342406063455934, "loss": 0.0332, "step": 819 }, { "epoch": 0.6146926536731634, "grad_norm": 0.8903617488607476, "learning_rate": 0.00019339291072306502, "loss": 0.1448, "step": 820 }, { "epoch": 0.6154422788605697, "grad_norm": 0.29030138456788634, "learning_rate": 0.00019336168972830265, "loss": 0.0523, "step": 821 }, { "epoch": 0.616191904047976, "grad_norm": 0.29500920261796953, "learning_rate": 0.0001933303976740352, "loss": 0.0559, "step": 822 }, { "epoch": 0.6169415292353823, "grad_norm": 0.19708221752070373, "learning_rate": 0.0001932990345840797, "loss": 0.0387, "step": 823 }, { "epoch": 0.6176911544227887, "grad_norm": 0.1453239404111134, "learning_rate": 0.00019326760048230725, "loss": 0.021, "step": 824 }, { "epoch": 0.618440779610195, "grad_norm": 0.16559600117050013, "learning_rate": 0.00019323609539264295, "loss": 0.0328, "step": 825 }, { "epoch": 0.6191904047976012, "grad_norm": 0.2434062543518236, "learning_rate": 0.00019320451933906602, "loss": 0.0458, "step": 826 }, { "epoch": 0.6199400299850075, "grad_norm": 0.2920576187948728, "learning_rate": 0.00019317287234560962, "loss": 0.0282, "step": 827 }, { "epoch": 0.6206896551724138, "grad_norm": 0.18444689764461758, "learning_rate": 0.00019314115443636094, "loss": 0.0505, "step": 828 }, { "epoch": 0.6214392803598201, "grad_norm": 0.33347763003860115, "learning_rate": 0.00019310936563546113, "loss": 0.0375, "step": 829 }, { "epoch": 0.6221889055472264, "grad_norm": 0.3151793343232866, "learning_rate": 0.00019307750596710528, "loss": 0.0688, "step": 830 }, { "epoch": 0.6229385307346327, "grad_norm": 0.17223079808853164, "learning_rate": 0.00019304557545554246, "loss": 0.0149, "step": 831 }, { "epoch": 0.623688155922039, "grad_norm": 0.16131606757257835, "learning_rate": 0.00019301357412507568, "loss": 0.0358, "step": 832 }, { "epoch": 0.6244377811094453, "grad_norm": 0.15451180093753095, "learning_rate": 0.00019298150200006172, "loss": 0.0132, "step": 833 }, { "epoch": 0.6251874062968515, "grad_norm": 0.18246287463532035, "learning_rate": 0.00019294935910491143, "loss": 0.0299, "step": 834 }, { "epoch": 0.6259370314842578, "grad_norm": 0.23038969159958683, "learning_rate": 0.00019291714546408936, "loss": 0.0299, "step": 835 }, { "epoch": 0.6266866566716641, "grad_norm": 0.15241068451072204, "learning_rate": 0.000192884861102114, "loss": 0.03, "step": 836 }, { "epoch": 0.6274362818590704, "grad_norm": 0.1772524433219351, "learning_rate": 0.00019285250604355767, "loss": 0.0291, "step": 837 }, { "epoch": 0.6281859070464768, "grad_norm": 0.2908627086925324, "learning_rate": 0.00019282008031304643, "loss": 0.0447, "step": 838 }, { "epoch": 0.6289355322338831, "grad_norm": 0.26052288609262275, "learning_rate": 0.0001927875839352602, "loss": 0.0375, "step": 839 }, { "epoch": 0.6296851574212894, "grad_norm": 0.1569085422616155, "learning_rate": 0.0001927550169349326, "loss": 0.0415, "step": 840 }, { "epoch": 0.6304347826086957, "grad_norm": 0.21810162792732946, "learning_rate": 0.0001927223793368511, "loss": 0.0292, "step": 841 }, { "epoch": 0.631184407796102, "grad_norm": 0.43588945410407454, "learning_rate": 0.0001926896711658568, "loss": 0.0402, "step": 842 }, { "epoch": 0.6319340329835083, "grad_norm": 0.18607701495773632, "learning_rate": 0.0001926568924468446, "loss": 0.0299, "step": 843 }, { "epoch": 0.6326836581709145, "grad_norm": 0.38921493550716046, "learning_rate": 0.00019262404320476303, "loss": 0.0333, "step": 844 }, { "epoch": 0.6334332833583208, "grad_norm": 0.37969401105335127, "learning_rate": 0.00019259112346461436, "loss": 0.0496, "step": 845 }, { "epoch": 0.6341829085457271, "grad_norm": 0.3216879681669849, "learning_rate": 0.00019255813325145448, "loss": 0.023, "step": 846 }, { "epoch": 0.6349325337331334, "grad_norm": 0.27089341793735805, "learning_rate": 0.00019252507259039285, "loss": 0.0459, "step": 847 }, { "epoch": 0.6356821589205397, "grad_norm": 0.19188607086964637, "learning_rate": 0.0001924919415065927, "loss": 0.0285, "step": 848 }, { "epoch": 0.636431784107946, "grad_norm": 0.4712514156925914, "learning_rate": 0.00019245874002527072, "loss": 0.0382, "step": 849 }, { "epoch": 0.6371814092953523, "grad_norm": 0.4385017620850573, "learning_rate": 0.00019242546817169732, "loss": 0.0481, "step": 850 }, { "epoch": 0.6379310344827587, "grad_norm": 0.1242568847686868, "learning_rate": 0.0001923921259711963, "loss": 0.0297, "step": 851 }, { "epoch": 0.638680659670165, "grad_norm": 0.28540644515657065, "learning_rate": 0.00019235871344914517, "loss": 0.0394, "step": 852 }, { "epoch": 0.6394302848575713, "grad_norm": 0.5744981724187342, "learning_rate": 0.00019232523063097487, "loss": 0.0742, "step": 853 }, { "epoch": 0.6401799100449775, "grad_norm": 0.24642627559782443, "learning_rate": 0.00019229167754216984, "loss": 0.0348, "step": 854 }, { "epoch": 0.6409295352323838, "grad_norm": 0.44105899266315524, "learning_rate": 0.00019225805420826806, "loss": 0.0621, "step": 855 }, { "epoch": 0.6416791604197901, "grad_norm": 0.24535853410831998, "learning_rate": 0.00019222436065486088, "loss": 0.0355, "step": 856 }, { "epoch": 0.6424287856071964, "grad_norm": 0.3648266082854025, "learning_rate": 0.00019219059690759322, "loss": 0.0378, "step": 857 }, { "epoch": 0.6431784107946027, "grad_norm": 0.23410467927646403, "learning_rate": 0.00019215676299216337, "loss": 0.0453, "step": 858 }, { "epoch": 0.643928035982009, "grad_norm": 0.5771722806268218, "learning_rate": 0.00019212285893432295, "loss": 0.0605, "step": 859 }, { "epoch": 0.6446776611694153, "grad_norm": 0.42837179995622626, "learning_rate": 0.0001920888847598771, "loss": 0.0757, "step": 860 }, { "epoch": 0.6454272863568216, "grad_norm": 0.30782392032850886, "learning_rate": 0.00019205484049468426, "loss": 0.0559, "step": 861 }, { "epoch": 0.6461769115442278, "grad_norm": 0.32187116161181983, "learning_rate": 0.00019202072616465618, "loss": 0.0325, "step": 862 }, { "epoch": 0.6469265367316341, "grad_norm": 0.3170202880484331, "learning_rate": 0.00019198654179575803, "loss": 0.0616, "step": 863 }, { "epoch": 0.6476761619190404, "grad_norm": 0.15415415536695157, "learning_rate": 0.00019195228741400818, "loss": 0.0279, "step": 864 }, { "epoch": 0.6484257871064468, "grad_norm": 0.468023792818238, "learning_rate": 0.00019191796304547838, "loss": 0.072, "step": 865 }, { "epoch": 0.6491754122938531, "grad_norm": 0.17747724891036817, "learning_rate": 0.00019188356871629363, "loss": 0.0269, "step": 866 }, { "epoch": 0.6499250374812594, "grad_norm": 0.11896332361586975, "learning_rate": 0.00019184910445263213, "loss": 0.0231, "step": 867 }, { "epoch": 0.6506746626686657, "grad_norm": 0.12177188142655841, "learning_rate": 0.00019181457028072537, "loss": 0.0324, "step": 868 }, { "epoch": 0.651424287856072, "grad_norm": 0.21226493913526695, "learning_rate": 0.00019177996622685799, "loss": 0.0303, "step": 869 }, { "epoch": 0.6521739130434783, "grad_norm": 0.35288822757454136, "learning_rate": 0.00019174529231736788, "loss": 0.0528, "step": 870 }, { "epoch": 0.6529235382308846, "grad_norm": 0.18467843757358945, "learning_rate": 0.00019171054857864604, "loss": 0.0467, "step": 871 }, { "epoch": 0.6536731634182908, "grad_norm": 0.11919292144770044, "learning_rate": 0.00019167573503713664, "loss": 0.028, "step": 872 }, { "epoch": 0.6544227886056971, "grad_norm": 0.14188580015944366, "learning_rate": 0.00019164085171933701, "loss": 0.0224, "step": 873 }, { "epoch": 0.6551724137931034, "grad_norm": 0.17981285086992055, "learning_rate": 0.00019160589865179755, "loss": 0.0297, "step": 874 }, { "epoch": 0.6559220389805097, "grad_norm": 0.15306582904565683, "learning_rate": 0.00019157087586112176, "loss": 0.0213, "step": 875 }, { "epoch": 0.656671664167916, "grad_norm": 0.28590338086654155, "learning_rate": 0.00019153578337396617, "loss": 0.0405, "step": 876 }, { "epoch": 0.6574212893553223, "grad_norm": 0.08011629532591379, "learning_rate": 0.00019150062121704047, "loss": 0.0122, "step": 877 }, { "epoch": 0.6581709145427287, "grad_norm": 0.09441159438236417, "learning_rate": 0.00019146538941710727, "loss": 0.018, "step": 878 }, { "epoch": 0.658920539730135, "grad_norm": 0.13011973729869275, "learning_rate": 0.00019143008800098218, "loss": 0.0201, "step": 879 }, { "epoch": 0.6596701649175413, "grad_norm": 0.9504515435787874, "learning_rate": 0.0001913947169955339, "loss": 0.1022, "step": 880 }, { "epoch": 0.6604197901049476, "grad_norm": 0.11513124353776294, "learning_rate": 0.00019135927642768396, "loss": 0.0182, "step": 881 }, { "epoch": 0.6611694152923538, "grad_norm": 0.18336727207182985, "learning_rate": 0.00019132376632440695, "loss": 0.0348, "step": 882 }, { "epoch": 0.6619190404797601, "grad_norm": 0.2576296296304189, "learning_rate": 0.0001912881867127304, "loss": 0.052, "step": 883 }, { "epoch": 0.6626686656671664, "grad_norm": 0.17875011992570874, "learning_rate": 0.00019125253761973453, "loss": 0.0382, "step": 884 }, { "epoch": 0.6634182908545727, "grad_norm": 0.3514697728255144, "learning_rate": 0.00019121681907255272, "loss": 0.065, "step": 885 }, { "epoch": 0.664167916041979, "grad_norm": 0.49314124227628053, "learning_rate": 0.0001911810310983711, "loss": 0.0901, "step": 886 }, { "epoch": 0.6649175412293853, "grad_norm": 0.30243395137434004, "learning_rate": 0.00019114517372442857, "loss": 0.06, "step": 887 }, { "epoch": 0.6656671664167916, "grad_norm": 0.2127563337244439, "learning_rate": 0.00019110924697801694, "loss": 0.0319, "step": 888 }, { "epoch": 0.6664167916041979, "grad_norm": 0.14128878156969202, "learning_rate": 0.00019107325088648077, "loss": 0.0269, "step": 889 }, { "epoch": 0.6671664167916042, "grad_norm": 0.14278769406678316, "learning_rate": 0.00019103718547721748, "loss": 0.0515, "step": 890 }, { "epoch": 0.6679160419790104, "grad_norm": 0.09835609392274643, "learning_rate": 0.00019100105077767717, "loss": 0.0209, "step": 891 }, { "epoch": 0.6686656671664168, "grad_norm": 0.4225039625143984, "learning_rate": 0.00019096484681536274, "loss": 0.0626, "step": 892 }, { "epoch": 0.6694152923538231, "grad_norm": 0.1979603681699738, "learning_rate": 0.00019092857361782968, "loss": 0.048, "step": 893 }, { "epoch": 0.6701649175412294, "grad_norm": 0.22017581022730506, "learning_rate": 0.00019089223121268633, "loss": 0.0433, "step": 894 }, { "epoch": 0.6709145427286357, "grad_norm": 0.23746282110411276, "learning_rate": 0.00019085581962759366, "loss": 0.0394, "step": 895 }, { "epoch": 0.671664167916042, "grad_norm": 0.22877488071584384, "learning_rate": 0.00019081933889026522, "loss": 0.0689, "step": 896 }, { "epoch": 0.6724137931034483, "grad_norm": 0.17291962523254184, "learning_rate": 0.00019078278902846734, "loss": 0.0386, "step": 897 }, { "epoch": 0.6731634182908546, "grad_norm": 0.11386075637226863, "learning_rate": 0.00019074617007001877, "loss": 0.0241, "step": 898 }, { "epoch": 0.6739130434782609, "grad_norm": 0.36286240601431663, "learning_rate": 0.00019070948204279103, "loss": 0.0587, "step": 899 }, { "epoch": 0.6746626686656672, "grad_norm": 0.15708001038991823, "learning_rate": 0.00019067272497470808, "loss": 0.0412, "step": 900 }, { "epoch": 0.6754122938530734, "grad_norm": 0.2063576324808673, "learning_rate": 0.0001906358988937465, "loss": 0.0307, "step": 901 }, { "epoch": 0.6761619190404797, "grad_norm": 0.5738564722564278, "learning_rate": 0.0001905990038279354, "loss": 0.0964, "step": 902 }, { "epoch": 0.676911544227886, "grad_norm": 0.39568723739967954, "learning_rate": 0.00019056203980535638, "loss": 0.0671, "step": 903 }, { "epoch": 0.6776611694152923, "grad_norm": 0.23745034772582727, "learning_rate": 0.00019052500685414348, "loss": 0.0448, "step": 904 }, { "epoch": 0.6784107946026986, "grad_norm": 0.2186670327549479, "learning_rate": 0.00019048790500248325, "loss": 0.0382, "step": 905 }, { "epoch": 0.679160419790105, "grad_norm": 0.23083494649569086, "learning_rate": 0.00019045073427861469, "loss": 0.0534, "step": 906 }, { "epoch": 0.6799100449775113, "grad_norm": 0.20107179690681276, "learning_rate": 0.00019041349471082922, "loss": 0.036, "step": 907 }, { "epoch": 0.6806596701649176, "grad_norm": 0.14110579856132485, "learning_rate": 0.0001903761863274706, "loss": 0.0267, "step": 908 }, { "epoch": 0.6814092953523239, "grad_norm": 0.11737441028479874, "learning_rate": 0.000190338809156935, "loss": 0.013, "step": 909 }, { "epoch": 0.6821589205397302, "grad_norm": 0.3354065360433217, "learning_rate": 0.00019030136322767104, "loss": 0.0503, "step": 910 }, { "epoch": 0.6829085457271364, "grad_norm": 0.2933379028903616, "learning_rate": 0.0001902638485681795, "loss": 0.0484, "step": 911 }, { "epoch": 0.6836581709145427, "grad_norm": 0.2772432648087348, "learning_rate": 0.00019022626520701357, "loss": 0.0501, "step": 912 }, { "epoch": 0.684407796101949, "grad_norm": 0.06103394009376496, "learning_rate": 0.00019018861317277876, "loss": 0.014, "step": 913 }, { "epoch": 0.6851574212893553, "grad_norm": 0.30436820029229084, "learning_rate": 0.00019015089249413278, "loss": 0.0227, "step": 914 }, { "epoch": 0.6859070464767616, "grad_norm": 0.21316900529347577, "learning_rate": 0.00019011310319978566, "loss": 0.0301, "step": 915 }, { "epoch": 0.6866566716641679, "grad_norm": 0.15624463828972068, "learning_rate": 0.00019007524531849954, "loss": 0.0175, "step": 916 }, { "epoch": 0.6874062968515742, "grad_norm": 0.21724546668213623, "learning_rate": 0.00019003731887908888, "loss": 0.0419, "step": 917 }, { "epoch": 0.6881559220389805, "grad_norm": 0.17497708434454778, "learning_rate": 0.00018999932391042026, "loss": 0.0494, "step": 918 }, { "epoch": 0.6889055472263869, "grad_norm": 0.28296123614292606, "learning_rate": 0.00018996126044141243, "loss": 0.0542, "step": 919 }, { "epoch": 0.6896551724137931, "grad_norm": 0.3138450154926571, "learning_rate": 0.00018992312850103633, "loss": 0.0451, "step": 920 }, { "epoch": 0.6904047976011994, "grad_norm": 0.23985575635803402, "learning_rate": 0.00018988492811831487, "loss": 0.0425, "step": 921 }, { "epoch": 0.6911544227886057, "grad_norm": 0.28213195649994594, "learning_rate": 0.00018984665932232324, "loss": 0.0432, "step": 922 }, { "epoch": 0.691904047976012, "grad_norm": 0.17556584000730519, "learning_rate": 0.00018980832214218854, "loss": 0.0417, "step": 923 }, { "epoch": 0.6926536731634183, "grad_norm": 0.2153080576912875, "learning_rate": 0.00018976991660709, "loss": 0.0248, "step": 924 }, { "epoch": 0.6934032983508246, "grad_norm": 0.21085999350466364, "learning_rate": 0.0001897314427462589, "loss": 0.039, "step": 925 }, { "epoch": 0.6941529235382309, "grad_norm": 0.23685975191705796, "learning_rate": 0.00018969290058897849, "loss": 0.0416, "step": 926 }, { "epoch": 0.6949025487256372, "grad_norm": 0.2349320329159903, "learning_rate": 0.00018965429016458397, "loss": 0.0485, "step": 927 }, { "epoch": 0.6956521739130435, "grad_norm": 0.2003843750715276, "learning_rate": 0.00018961561150246254, "loss": 0.0248, "step": 928 }, { "epoch": 0.6964017991004497, "grad_norm": 0.17207195243686713, "learning_rate": 0.00018957686463205332, "loss": 0.0393, "step": 929 }, { "epoch": 0.697151424287856, "grad_norm": 0.10409802702518689, "learning_rate": 0.00018953804958284732, "loss": 0.0143, "step": 930 }, { "epoch": 0.6979010494752623, "grad_norm": 0.2834877094860702, "learning_rate": 0.00018949916638438753, "loss": 0.0423, "step": 931 }, { "epoch": 0.6986506746626686, "grad_norm": 0.16706698433341532, "learning_rate": 0.00018946021506626874, "loss": 0.0311, "step": 932 }, { "epoch": 0.699400299850075, "grad_norm": 0.27550981150523285, "learning_rate": 0.0001894211956581376, "loss": 0.0485, "step": 933 }, { "epoch": 0.7001499250374813, "grad_norm": 0.5822939921119175, "learning_rate": 0.00018938210818969254, "loss": 0.0735, "step": 934 }, { "epoch": 0.7008995502248876, "grad_norm": 0.15172545323797987, "learning_rate": 0.00018934295269068391, "loss": 0.0313, "step": 935 }, { "epoch": 0.7016491754122939, "grad_norm": 0.19155910351663208, "learning_rate": 0.00018930372919091372, "loss": 0.036, "step": 936 }, { "epoch": 0.7023988005997002, "grad_norm": 0.1920009030641335, "learning_rate": 0.0001892644377202358, "loss": 0.0303, "step": 937 }, { "epoch": 0.7031484257871065, "grad_norm": 0.2315620580933706, "learning_rate": 0.0001892250783085557, "loss": 0.0564, "step": 938 }, { "epoch": 0.7038980509745127, "grad_norm": 0.2277782259514648, "learning_rate": 0.00018918565098583064, "loss": 0.0482, "step": 939 }, { "epoch": 0.704647676161919, "grad_norm": 0.1290013927843416, "learning_rate": 0.00018914615578206965, "loss": 0.0374, "step": 940 }, { "epoch": 0.7053973013493253, "grad_norm": 0.3096012036471806, "learning_rate": 0.00018910659272733325, "loss": 0.0425, "step": 941 }, { "epoch": 0.7061469265367316, "grad_norm": 0.0962987698303328, "learning_rate": 0.00018906696185173373, "loss": 0.0122, "step": 942 }, { "epoch": 0.7068965517241379, "grad_norm": 0.6081649862961295, "learning_rate": 0.000189027263185435, "loss": 0.0342, "step": 943 }, { "epoch": 0.7076461769115442, "grad_norm": 0.34556925423240414, "learning_rate": 0.0001889874967586525, "loss": 0.035, "step": 944 }, { "epoch": 0.7083958020989505, "grad_norm": 0.25383191175507674, "learning_rate": 0.00018894766260165328, "loss": 0.0301, "step": 945 }, { "epoch": 0.7091454272863568, "grad_norm": 0.20690331301143616, "learning_rate": 0.00018890776074475594, "loss": 0.0309, "step": 946 }, { "epoch": 0.7098950524737632, "grad_norm": 0.11773090179399598, "learning_rate": 0.00018886779121833063, "loss": 0.0306, "step": 947 }, { "epoch": 0.7106446776611695, "grad_norm": 0.5402748502999926, "learning_rate": 0.0001888277540527989, "loss": 0.0838, "step": 948 }, { "epoch": 0.7113943028485757, "grad_norm": 0.13353170834904762, "learning_rate": 0.00018878764927863397, "loss": 0.0243, "step": 949 }, { "epoch": 0.712143928035982, "grad_norm": 0.30817237528562064, "learning_rate": 0.00018874747692636032, "loss": 0.0443, "step": 950 }, { "epoch": 0.7128935532233883, "grad_norm": 0.31287844891395794, "learning_rate": 0.00018870723702655402, "loss": 0.0427, "step": 951 }, { "epoch": 0.7136431784107946, "grad_norm": 0.13038883844419452, "learning_rate": 0.00018866692960984246, "loss": 0.0267, "step": 952 }, { "epoch": 0.7143928035982009, "grad_norm": 0.1564690576279143, "learning_rate": 0.0001886265547069044, "loss": 0.0321, "step": 953 }, { "epoch": 0.7151424287856072, "grad_norm": 0.18995784247308192, "learning_rate": 0.0001885861123484701, "loss": 0.0403, "step": 954 }, { "epoch": 0.7158920539730135, "grad_norm": 0.1130067211033811, "learning_rate": 0.000188545602565321, "loss": 0.0195, "step": 955 }, { "epoch": 0.7166416791604198, "grad_norm": 0.18772400417992988, "learning_rate": 0.00018850502538828997, "loss": 0.0264, "step": 956 }, { "epoch": 0.717391304347826, "grad_norm": 0.3597706845751724, "learning_rate": 0.0001884643808482611, "loss": 0.0346, "step": 957 }, { "epoch": 0.7181409295352323, "grad_norm": 0.15381224969583301, "learning_rate": 0.0001884236689761698, "loss": 0.0204, "step": 958 }, { "epoch": 0.7188905547226386, "grad_norm": 0.16678496756982067, "learning_rate": 0.00018838288980300275, "loss": 0.0238, "step": 959 }, { "epoch": 0.719640179910045, "grad_norm": 0.11629607248863856, "learning_rate": 0.00018834204335979777, "loss": 0.0164, "step": 960 }, { "epoch": 0.7203898050974513, "grad_norm": 0.12667186231323446, "learning_rate": 0.00018830112967764396, "loss": 0.0238, "step": 961 }, { "epoch": 0.7211394302848576, "grad_norm": 0.22995623152007172, "learning_rate": 0.00018826014878768157, "loss": 0.0164, "step": 962 }, { "epoch": 0.7218890554722639, "grad_norm": 0.2358255161333237, "learning_rate": 0.00018821910072110197, "loss": 0.0331, "step": 963 }, { "epoch": 0.7226386806596702, "grad_norm": 0.3438825305304082, "learning_rate": 0.00018817798550914772, "loss": 0.0322, "step": 964 }, { "epoch": 0.7233883058470765, "grad_norm": 0.20848681012820786, "learning_rate": 0.00018813680318311243, "loss": 0.0454, "step": 965 }, { "epoch": 0.7241379310344828, "grad_norm": 0.15272896327846594, "learning_rate": 0.00018809555377434078, "loss": 0.0335, "step": 966 }, { "epoch": 0.724887556221889, "grad_norm": 0.23046040770462634, "learning_rate": 0.00018805423731422868, "loss": 0.029, "step": 967 }, { "epoch": 0.7256371814092953, "grad_norm": 0.29532647572282444, "learning_rate": 0.00018801285383422277, "loss": 0.0653, "step": 968 }, { "epoch": 0.7263868065967016, "grad_norm": 0.25954608236551174, "learning_rate": 0.00018797140336582096, "loss": 0.0299, "step": 969 }, { "epoch": 0.7271364317841079, "grad_norm": 0.15951338875544732, "learning_rate": 0.00018792988594057208, "loss": 0.0158, "step": 970 }, { "epoch": 0.7278860569715142, "grad_norm": 0.19570079875891358, "learning_rate": 0.00018788830159007582, "loss": 0.0209, "step": 971 }, { "epoch": 0.7286356821589205, "grad_norm": 0.27173924703759006, "learning_rate": 0.00018784665034598294, "loss": 0.0526, "step": 972 }, { "epoch": 0.7293853073463268, "grad_norm": 0.20541123116423304, "learning_rate": 0.00018780493223999507, "loss": 0.0541, "step": 973 }, { "epoch": 0.7301349325337332, "grad_norm": 0.08279968415297809, "learning_rate": 0.0001877631473038647, "loss": 0.014, "step": 974 }, { "epoch": 0.7308845577211395, "grad_norm": 0.261041791157805, "learning_rate": 0.00018772129556939517, "loss": 0.0339, "step": 975 }, { "epoch": 0.7316341829085458, "grad_norm": 0.20121706293886343, "learning_rate": 0.0001876793770684407, "loss": 0.0383, "step": 976 }, { "epoch": 0.732383808095952, "grad_norm": 0.22576321461521023, "learning_rate": 0.00018763739183290642, "loss": 0.0471, "step": 977 }, { "epoch": 0.7331334332833583, "grad_norm": 0.20509942958433353, "learning_rate": 0.00018759533989474807, "loss": 0.0559, "step": 978 }, { "epoch": 0.7338830584707646, "grad_norm": 0.13069410558257796, "learning_rate": 0.00018755322128597226, "loss": 0.0237, "step": 979 }, { "epoch": 0.7346326836581709, "grad_norm": 0.4352376492807258, "learning_rate": 0.00018751103603863634, "loss": 0.0483, "step": 980 }, { "epoch": 0.7353823088455772, "grad_norm": 0.4586041127351121, "learning_rate": 0.00018746878418484837, "loss": 0.0367, "step": 981 }, { "epoch": 0.7361319340329835, "grad_norm": 0.20081207301221854, "learning_rate": 0.0001874264657567671, "loss": 0.0373, "step": 982 }, { "epoch": 0.7368815592203898, "grad_norm": 0.17972854685564316, "learning_rate": 0.00018738408078660196, "loss": 0.0499, "step": 983 }, { "epoch": 0.7376311844077961, "grad_norm": 0.5123799767559905, "learning_rate": 0.00018734162930661303, "loss": 0.044, "step": 984 }, { "epoch": 0.7383808095952024, "grad_norm": 0.15387818818143484, "learning_rate": 0.00018729911134911099, "loss": 0.0247, "step": 985 }, { "epoch": 0.7391304347826086, "grad_norm": 0.15234012077579065, "learning_rate": 0.00018725652694645716, "loss": 0.0369, "step": 986 }, { "epoch": 0.739880059970015, "grad_norm": 0.0840260959966149, "learning_rate": 0.00018721387613106338, "loss": 0.0158, "step": 987 }, { "epoch": 0.7406296851574213, "grad_norm": 0.28193234623834884, "learning_rate": 0.00018717115893539208, "loss": 0.0595, "step": 988 }, { "epoch": 0.7413793103448276, "grad_norm": 0.2901103028757637, "learning_rate": 0.0001871283753919562, "loss": 0.0271, "step": 989 }, { "epoch": 0.7421289355322339, "grad_norm": 0.20023279045247946, "learning_rate": 0.00018708552553331918, "loss": 0.0284, "step": 990 }, { "epoch": 0.7428785607196402, "grad_norm": 0.16813357710639712, "learning_rate": 0.0001870426093920949, "loss": 0.0282, "step": 991 }, { "epoch": 0.7436281859070465, "grad_norm": 0.5817170450684567, "learning_rate": 0.00018699962700094772, "loss": 0.0557, "step": 992 }, { "epoch": 0.7443778110944528, "grad_norm": 0.23910035514534853, "learning_rate": 0.00018695657839259245, "loss": 0.0361, "step": 993 }, { "epoch": 0.7451274362818591, "grad_norm": 0.10757725591523912, "learning_rate": 0.00018691346359979427, "loss": 0.0148, "step": 994 }, { "epoch": 0.7458770614692654, "grad_norm": 0.2389873118034019, "learning_rate": 0.00018687028265536873, "loss": 0.0269, "step": 995 }, { "epoch": 0.7466266866566716, "grad_norm": 0.09890578148764179, "learning_rate": 0.00018682703559218174, "loss": 0.028, "step": 996 }, { "epoch": 0.7473763118440779, "grad_norm": 0.423335856348218, "learning_rate": 0.00018678372244314954, "loss": 0.0511, "step": 997 }, { "epoch": 0.7481259370314842, "grad_norm": 0.1664160505747008, "learning_rate": 0.00018674034324123865, "loss": 0.0178, "step": 998 }, { "epoch": 0.7488755622188905, "grad_norm": 0.30443104002016347, "learning_rate": 0.00018669689801946584, "loss": 0.0306, "step": 999 }, { "epoch": 0.7496251874062968, "grad_norm": 0.16915223787278677, "learning_rate": 0.0001866533868108982, "loss": 0.0202, "step": 1000 }, { "epoch": 0.7503748125937032, "grad_norm": 0.21806633940083398, "learning_rate": 0.000186609809648653, "loss": 0.0391, "step": 1001 }, { "epoch": 0.7511244377811095, "grad_norm": 0.18073732919950314, "learning_rate": 0.00018656616656589775, "loss": 0.0341, "step": 1002 }, { "epoch": 0.7518740629685158, "grad_norm": 0.32621454820715445, "learning_rate": 0.00018652245759585002, "loss": 0.0354, "step": 1003 }, { "epoch": 0.7526236881559221, "grad_norm": 0.17220315147771398, "learning_rate": 0.0001864786827717777, "loss": 0.0388, "step": 1004 }, { "epoch": 0.7533733133433284, "grad_norm": 0.19048101156044403, "learning_rate": 0.0001864348421269986, "loss": 0.0243, "step": 1005 }, { "epoch": 0.7541229385307346, "grad_norm": 0.1467422816576331, "learning_rate": 0.00018639093569488088, "loss": 0.0219, "step": 1006 }, { "epoch": 0.7548725637181409, "grad_norm": 0.17892458855046647, "learning_rate": 0.0001863469635088425, "loss": 0.0234, "step": 1007 }, { "epoch": 0.7556221889055472, "grad_norm": 1.2435338434659133, "learning_rate": 0.00018630292560235166, "loss": 0.109, "step": 1008 }, { "epoch": 0.7563718140929535, "grad_norm": 0.08933758885347165, "learning_rate": 0.00018625882200892652, "loss": 0.01, "step": 1009 }, { "epoch": 0.7571214392803598, "grad_norm": 0.15900071145370545, "learning_rate": 0.0001862146527621352, "loss": 0.0249, "step": 1010 }, { "epoch": 0.7578710644677661, "grad_norm": 0.22387635530307326, "learning_rate": 0.00018617041789559586, "loss": 0.0503, "step": 1011 }, { "epoch": 0.7586206896551724, "grad_norm": 0.2773550375385691, "learning_rate": 0.0001861261174429765, "loss": 0.0689, "step": 1012 }, { "epoch": 0.7593703148425787, "grad_norm": 0.13482706212842185, "learning_rate": 0.00018608175143799524, "loss": 0.0199, "step": 1013 }, { "epoch": 0.760119940029985, "grad_norm": 0.14016837389353834, "learning_rate": 0.0001860373199144198, "loss": 0.025, "step": 1014 }, { "epoch": 0.7608695652173914, "grad_norm": 0.15921551130271686, "learning_rate": 0.00018599282290606808, "loss": 0.0411, "step": 1015 }, { "epoch": 0.7616191904047976, "grad_norm": 0.18076247696963452, "learning_rate": 0.00018594826044680756, "loss": 0.042, "step": 1016 }, { "epoch": 0.7623688155922039, "grad_norm": 0.3393069856609448, "learning_rate": 0.00018590363257055565, "loss": 0.0518, "step": 1017 }, { "epoch": 0.7631184407796102, "grad_norm": 0.8923903150439438, "learning_rate": 0.0001858589393112796, "loss": 0.0684, "step": 1018 }, { "epoch": 0.7638680659670165, "grad_norm": 0.3761190414620271, "learning_rate": 0.00018581418070299637, "loss": 0.048, "step": 1019 }, { "epoch": 0.7646176911544228, "grad_norm": 0.18620916906321824, "learning_rate": 0.0001857693567797726, "loss": 0.057, "step": 1020 }, { "epoch": 0.7653673163418291, "grad_norm": 0.14050003177244677, "learning_rate": 0.00018572446757572476, "loss": 0.0281, "step": 1021 }, { "epoch": 0.7661169415292354, "grad_norm": 0.15336984527633807, "learning_rate": 0.0001856795131250189, "loss": 0.0468, "step": 1022 }, { "epoch": 0.7668665667166417, "grad_norm": 0.19002269472919547, "learning_rate": 0.00018563449346187084, "loss": 0.0209, "step": 1023 }, { "epoch": 0.767616191904048, "grad_norm": 0.3336582335944263, "learning_rate": 0.00018558940862054593, "loss": 0.0559, "step": 1024 }, { "epoch": 0.7683658170914542, "grad_norm": 0.19568831389842506, "learning_rate": 0.00018554425863535917, "loss": 0.0555, "step": 1025 }, { "epoch": 0.7691154422788605, "grad_norm": 0.3240431823925636, "learning_rate": 0.00018549904354067516, "loss": 0.0585, "step": 1026 }, { "epoch": 0.7698650674662668, "grad_norm": 0.10923976048976274, "learning_rate": 0.00018545376337090802, "loss": 0.017, "step": 1027 }, { "epoch": 0.7706146926536732, "grad_norm": 0.23005253382946222, "learning_rate": 0.0001854084181605214, "loss": 0.0421, "step": 1028 }, { "epoch": 0.7713643178410795, "grad_norm": 0.1420842462842462, "learning_rate": 0.00018536300794402854, "loss": 0.027, "step": 1029 }, { "epoch": 0.7721139430284858, "grad_norm": 0.2659412690343279, "learning_rate": 0.00018531753275599205, "loss": 0.0691, "step": 1030 }, { "epoch": 0.7728635682158921, "grad_norm": 0.19542245079333642, "learning_rate": 0.00018527199263102402, "loss": 0.0294, "step": 1031 }, { "epoch": 0.7736131934032984, "grad_norm": 0.2289918980003493, "learning_rate": 0.00018522638760378604, "loss": 0.0212, "step": 1032 }, { "epoch": 0.7743628185907047, "grad_norm": 0.2237723071198558, "learning_rate": 0.00018518071770898896, "loss": 0.0282, "step": 1033 }, { "epoch": 0.775112443778111, "grad_norm": 0.16713847253625994, "learning_rate": 0.00018513498298139308, "loss": 0.0431, "step": 1034 }, { "epoch": 0.7758620689655172, "grad_norm": 0.4190531497604175, "learning_rate": 0.00018508918345580812, "loss": 0.0977, "step": 1035 }, { "epoch": 0.7766116941529235, "grad_norm": 0.11647871866815543, "learning_rate": 0.00018504331916709295, "loss": 0.0288, "step": 1036 }, { "epoch": 0.7773613193403298, "grad_norm": 0.12692899870309066, "learning_rate": 0.00018499739015015592, "loss": 0.018, "step": 1037 }, { "epoch": 0.7781109445277361, "grad_norm": 0.25096730088442054, "learning_rate": 0.00018495139643995447, "loss": 0.0411, "step": 1038 }, { "epoch": 0.7788605697151424, "grad_norm": 0.12678759482102697, "learning_rate": 0.0001849053380714954, "loss": 0.0268, "step": 1039 }, { "epoch": 0.7796101949025487, "grad_norm": 0.21993853166525845, "learning_rate": 0.00018485921507983464, "loss": 0.0331, "step": 1040 }, { "epoch": 0.780359820089955, "grad_norm": 0.13735175115597095, "learning_rate": 0.00018481302750007742, "loss": 0.0208, "step": 1041 }, { "epoch": 0.7811094452773614, "grad_norm": 0.27636509366101925, "learning_rate": 0.000184766775367378, "loss": 0.0401, "step": 1042 }, { "epoch": 0.7818590704647677, "grad_norm": 0.08300196710365276, "learning_rate": 0.00018472045871693985, "loss": 0.0155, "step": 1043 }, { "epoch": 0.782608695652174, "grad_norm": 0.10884016679085827, "learning_rate": 0.00018467407758401553, "loss": 0.022, "step": 1044 }, { "epoch": 0.7833583208395802, "grad_norm": 0.1414092080989368, "learning_rate": 0.00018462763200390668, "loss": 0.0176, "step": 1045 }, { "epoch": 0.7841079460269865, "grad_norm": 0.1592806396396988, "learning_rate": 0.00018458112201196397, "loss": 0.029, "step": 1046 }, { "epoch": 0.7848575712143928, "grad_norm": 0.3496710914903041, "learning_rate": 0.00018453454764358714, "loss": 0.0515, "step": 1047 }, { "epoch": 0.7856071964017991, "grad_norm": 0.19305827437982326, "learning_rate": 0.0001844879089342249, "loss": 0.0236, "step": 1048 }, { "epoch": 0.7863568215892054, "grad_norm": 0.2784510112379799, "learning_rate": 0.00018444120591937487, "loss": 0.0524, "step": 1049 }, { "epoch": 0.7871064467766117, "grad_norm": 0.16340562242653447, "learning_rate": 0.00018439443863458376, "loss": 0.036, "step": 1050 }, { "epoch": 0.787856071964018, "grad_norm": 0.09076231538746275, "learning_rate": 0.00018434760711544707, "loss": 0.0205, "step": 1051 }, { "epoch": 0.7886056971514243, "grad_norm": 0.10274725254355516, "learning_rate": 0.00018430071139760923, "loss": 0.019, "step": 1052 }, { "epoch": 0.7893553223388305, "grad_norm": 0.07424607061274292, "learning_rate": 0.00018425375151676356, "loss": 0.0124, "step": 1053 }, { "epoch": 0.7901049475262368, "grad_norm": 0.1478021118244442, "learning_rate": 0.0001842067275086522, "loss": 0.0315, "step": 1054 }, { "epoch": 0.7908545727136432, "grad_norm": 0.34576822503473104, "learning_rate": 0.00018415963940906601, "loss": 0.0386, "step": 1055 }, { "epoch": 0.7916041979010495, "grad_norm": 0.32702451129931603, "learning_rate": 0.0001841124872538448, "loss": 0.0823, "step": 1056 }, { "epoch": 0.7923538230884558, "grad_norm": 0.14429418533170335, "learning_rate": 0.00018406527107887706, "loss": 0.0228, "step": 1057 }, { "epoch": 0.7931034482758621, "grad_norm": 0.24367175933694912, "learning_rate": 0.0001840179909200999, "loss": 0.0299, "step": 1058 }, { "epoch": 0.7938530734632684, "grad_norm": 0.6099095348127705, "learning_rate": 0.00018397064681349932, "loss": 0.0532, "step": 1059 }, { "epoch": 0.7946026986506747, "grad_norm": 0.1197860468696163, "learning_rate": 0.00018392323879510983, "loss": 0.0232, "step": 1060 }, { "epoch": 0.795352323838081, "grad_norm": 0.12722737968329859, "learning_rate": 0.00018387576690101465, "loss": 0.0201, "step": 1061 }, { "epoch": 0.7961019490254873, "grad_norm": 0.14544820117937343, "learning_rate": 0.00018382823116734568, "loss": 0.0506, "step": 1062 }, { "epoch": 0.7968515742128935, "grad_norm": 0.20201164576960434, "learning_rate": 0.00018378063163028327, "loss": 0.0333, "step": 1063 }, { "epoch": 0.7976011994002998, "grad_norm": 0.19983512293761604, "learning_rate": 0.00018373296832605647, "loss": 0.0316, "step": 1064 }, { "epoch": 0.7983508245877061, "grad_norm": 0.2025587075303726, "learning_rate": 0.00018368524129094277, "loss": 0.0404, "step": 1065 }, { "epoch": 0.7991004497751124, "grad_norm": 0.21875036758596228, "learning_rate": 0.00018363745056126818, "loss": 0.0409, "step": 1066 }, { "epoch": 0.7998500749625187, "grad_norm": 0.08751800766519974, "learning_rate": 0.00018358959617340722, "loss": 0.0134, "step": 1067 }, { "epoch": 0.800599700149925, "grad_norm": 0.17751840607101965, "learning_rate": 0.00018354167816378293, "loss": 0.0298, "step": 1068 }, { "epoch": 0.8013493253373314, "grad_norm": 0.13844585273110852, "learning_rate": 0.00018349369656886654, "loss": 0.0225, "step": 1069 }, { "epoch": 0.8020989505247377, "grad_norm": 0.14779825985182682, "learning_rate": 0.00018344565142517798, "loss": 0.0172, "step": 1070 }, { "epoch": 0.802848575712144, "grad_norm": 0.23329649714840747, "learning_rate": 0.00018339754276928532, "loss": 0.0166, "step": 1071 }, { "epoch": 0.8035982008995503, "grad_norm": 0.27924723804645607, "learning_rate": 0.00018334937063780506, "loss": 0.0455, "step": 1072 }, { "epoch": 0.8043478260869565, "grad_norm": 0.3767242325154973, "learning_rate": 0.000183301135067402, "loss": 0.0504, "step": 1073 }, { "epoch": 0.8050974512743628, "grad_norm": 0.23456829764371648, "learning_rate": 0.00018325283609478925, "loss": 0.0295, "step": 1074 }, { "epoch": 0.8058470764617691, "grad_norm": 0.0953486660630768, "learning_rate": 0.0001832044737567281, "loss": 0.0142, "step": 1075 }, { "epoch": 0.8065967016491754, "grad_norm": 0.328669019767791, "learning_rate": 0.0001831560480900282, "loss": 0.046, "step": 1076 }, { "epoch": 0.8073463268365817, "grad_norm": 0.13689582912930398, "learning_rate": 0.00018310755913154725, "loss": 0.0246, "step": 1077 }, { "epoch": 0.808095952023988, "grad_norm": 0.1414467884002127, "learning_rate": 0.0001830590069181912, "loss": 0.0223, "step": 1078 }, { "epoch": 0.8088455772113943, "grad_norm": 0.4175305732858486, "learning_rate": 0.0001830103914869142, "loss": 0.038, "step": 1079 }, { "epoch": 0.8095952023988006, "grad_norm": 0.31226288028632215, "learning_rate": 0.00018296171287471844, "loss": 0.0302, "step": 1080 }, { "epoch": 0.8103448275862069, "grad_norm": 0.326138105636252, "learning_rate": 0.00018291297111865416, "loss": 0.0307, "step": 1081 }, { "epoch": 0.8110944527736131, "grad_norm": 0.19855722071274357, "learning_rate": 0.00018286416625581974, "loss": 0.0333, "step": 1082 }, { "epoch": 0.8118440779610195, "grad_norm": 0.19808218582591067, "learning_rate": 0.00018281529832336158, "loss": 0.0128, "step": 1083 }, { "epoch": 0.8125937031484258, "grad_norm": 0.18000093916311433, "learning_rate": 0.0001827663673584741, "loss": 0.0375, "step": 1084 }, { "epoch": 0.8133433283358321, "grad_norm": 0.34396375971888243, "learning_rate": 0.0001827173733983996, "loss": 0.0665, "step": 1085 }, { "epoch": 0.8140929535232384, "grad_norm": 0.2262246608766294, "learning_rate": 0.0001826683164804284, "loss": 0.0308, "step": 1086 }, { "epoch": 0.8148425787106447, "grad_norm": 0.2962731621576649, "learning_rate": 0.0001826191966418988, "loss": 0.0696, "step": 1087 }, { "epoch": 0.815592203898051, "grad_norm": 0.20833559391810397, "learning_rate": 0.00018257001392019685, "loss": 0.0313, "step": 1088 }, { "epoch": 0.8163418290854573, "grad_norm": 0.15919323978485042, "learning_rate": 0.00018252076835275656, "loss": 0.0308, "step": 1089 }, { "epoch": 0.8170914542728636, "grad_norm": 0.16911598179212334, "learning_rate": 0.00018247145997705975, "loss": 0.0176, "step": 1090 }, { "epoch": 0.8178410794602698, "grad_norm": 0.19861291408359866, "learning_rate": 0.000182422088830636, "loss": 0.0255, "step": 1091 }, { "epoch": 0.8185907046476761, "grad_norm": 0.17452275737517106, "learning_rate": 0.0001823726549510628, "loss": 0.0324, "step": 1092 }, { "epoch": 0.8193403298350824, "grad_norm": 0.09352058354683734, "learning_rate": 0.00018232315837596517, "loss": 0.0103, "step": 1093 }, { "epoch": 0.8200899550224887, "grad_norm": 0.32286404722292333, "learning_rate": 0.00018227359914301606, "loss": 0.0449, "step": 1094 }, { "epoch": 0.820839580209895, "grad_norm": 0.1281920640041683, "learning_rate": 0.00018222397728993604, "loss": 0.0223, "step": 1095 }, { "epoch": 0.8215892053973014, "grad_norm": 0.11089662157992478, "learning_rate": 0.0001821742928544932, "loss": 0.0187, "step": 1096 }, { "epoch": 0.8223388305847077, "grad_norm": 0.287036679017573, "learning_rate": 0.0001821245458745035, "loss": 0.042, "step": 1097 }, { "epoch": 0.823088455772114, "grad_norm": 0.19024568124342148, "learning_rate": 0.00018207473638783038, "loss": 0.0227, "step": 1098 }, { "epoch": 0.8238380809595203, "grad_norm": 0.1254493943051864, "learning_rate": 0.00018202486443238486, "loss": 0.035, "step": 1099 }, { "epoch": 0.8245877061469266, "grad_norm": 0.2299310002082326, "learning_rate": 0.00018197493004612548, "loss": 0.0409, "step": 1100 }, { "epoch": 0.8253373313343328, "grad_norm": 0.20942758491518929, "learning_rate": 0.00018192493326705836, "loss": 0.0719, "step": 1101 }, { "epoch": 0.8260869565217391, "grad_norm": 0.2169193782357245, "learning_rate": 0.00018187487413323707, "loss": 0.0522, "step": 1102 }, { "epoch": 0.8268365817091454, "grad_norm": 0.10248145734203475, "learning_rate": 0.00018182475268276263, "loss": 0.0252, "step": 1103 }, { "epoch": 0.8275862068965517, "grad_norm": 0.14669484702742797, "learning_rate": 0.00018177456895378353, "loss": 0.0326, "step": 1104 }, { "epoch": 0.828335832083958, "grad_norm": 0.29384935141714214, "learning_rate": 0.00018172432298449565, "loss": 0.0334, "step": 1105 }, { "epoch": 0.8290854572713643, "grad_norm": 0.3435307574051933, "learning_rate": 0.00018167401481314222, "loss": 0.0447, "step": 1106 }, { "epoch": 0.8298350824587706, "grad_norm": 0.10926290483056297, "learning_rate": 0.0001816236444780138, "loss": 0.0254, "step": 1107 }, { "epoch": 0.8305847076461769, "grad_norm": 0.3592195940240025, "learning_rate": 0.0001815732120174483, "loss": 0.0432, "step": 1108 }, { "epoch": 0.8313343328335832, "grad_norm": 0.2745019580357286, "learning_rate": 0.00018152271746983093, "loss": 0.0317, "step": 1109 }, { "epoch": 0.8320839580209896, "grad_norm": 0.3719022292991417, "learning_rate": 0.00018147216087359414, "loss": 0.0554, "step": 1110 }, { "epoch": 0.8328335832083958, "grad_norm": 0.14829998786574425, "learning_rate": 0.00018142154226721754, "loss": 0.0267, "step": 1111 }, { "epoch": 0.8335832083958021, "grad_norm": 0.3984907269507194, "learning_rate": 0.00018137086168922806, "loss": 0.0392, "step": 1112 }, { "epoch": 0.8343328335832084, "grad_norm": 0.2696505935588497, "learning_rate": 0.00018132011917819968, "loss": 0.029, "step": 1113 }, { "epoch": 0.8350824587706147, "grad_norm": 0.16595550885495156, "learning_rate": 0.0001812693147727536, "loss": 0.0366, "step": 1114 }, { "epoch": 0.835832083958021, "grad_norm": 0.11780495839903224, "learning_rate": 0.00018121844851155805, "loss": 0.0326, "step": 1115 }, { "epoch": 0.8365817091454273, "grad_norm": 0.30894718763924667, "learning_rate": 0.00018116752043332847, "loss": 0.0225, "step": 1116 }, { "epoch": 0.8373313343328336, "grad_norm": 0.2601722543581295, "learning_rate": 0.0001811165305768272, "loss": 0.0298, "step": 1117 }, { "epoch": 0.8380809595202399, "grad_norm": 0.1612494703238606, "learning_rate": 0.00018106547898086374, "loss": 0.0213, "step": 1118 }, { "epoch": 0.8388305847076462, "grad_norm": 0.18569002752088673, "learning_rate": 0.0001810143656842944, "loss": 0.0218, "step": 1119 }, { "epoch": 0.8395802098950524, "grad_norm": 0.19615867709287715, "learning_rate": 0.00018096319072602267, "loss": 0.0239, "step": 1120 }, { "epoch": 0.8403298350824587, "grad_norm": 0.11600381929740078, "learning_rate": 0.00018091195414499884, "loss": 0.0125, "step": 1121 }, { "epoch": 0.841079460269865, "grad_norm": 0.2012290615387593, "learning_rate": 0.0001808606559802201, "loss": 0.0356, "step": 1122 }, { "epoch": 0.8418290854572713, "grad_norm": 0.5241588035079642, "learning_rate": 0.0001808092962707306, "loss": 0.0557, "step": 1123 }, { "epoch": 0.8425787106446777, "grad_norm": 0.16468248800615976, "learning_rate": 0.00018075787505562115, "loss": 0.0174, "step": 1124 }, { "epoch": 0.843328335832084, "grad_norm": 0.17673252568720604, "learning_rate": 0.00018070639237402958, "loss": 0.0185, "step": 1125 }, { "epoch": 0.8440779610194903, "grad_norm": 0.15194993456361727, "learning_rate": 0.00018065484826514046, "loss": 0.0267, "step": 1126 }, { "epoch": 0.8448275862068966, "grad_norm": 0.44198164798151507, "learning_rate": 0.00018060324276818497, "loss": 0.0432, "step": 1127 }, { "epoch": 0.8455772113943029, "grad_norm": 0.11663777926382558, "learning_rate": 0.0001805515759224412, "loss": 0.0142, "step": 1128 }, { "epoch": 0.8463268365817092, "grad_norm": 0.21518598232120772, "learning_rate": 0.00018049984776723384, "loss": 0.0492, "step": 1129 }, { "epoch": 0.8470764617691154, "grad_norm": 0.4390773125412656, "learning_rate": 0.00018044805834193422, "loss": 0.0214, "step": 1130 }, { "epoch": 0.8478260869565217, "grad_norm": 0.2342293522852248, "learning_rate": 0.00018039620768596035, "loss": 0.0327, "step": 1131 }, { "epoch": 0.848575712143928, "grad_norm": 0.28434849206147805, "learning_rate": 0.00018034429583877682, "loss": 0.0468, "step": 1132 }, { "epoch": 0.8493253373313343, "grad_norm": 0.20845198697998343, "learning_rate": 0.00018029232283989485, "loss": 0.0512, "step": 1133 }, { "epoch": 0.8500749625187406, "grad_norm": 0.13380972738039215, "learning_rate": 0.0001802402887288721, "loss": 0.0347, "step": 1134 }, { "epoch": 0.8508245877061469, "grad_norm": 0.21020059893001336, "learning_rate": 0.00018018819354531288, "loss": 0.0324, "step": 1135 }, { "epoch": 0.8515742128935532, "grad_norm": 0.10604539154052423, "learning_rate": 0.00018013603732886782, "loss": 0.0229, "step": 1136 }, { "epoch": 0.8523238380809596, "grad_norm": 0.1316986405730497, "learning_rate": 0.00018008382011923418, "loss": 0.0289, "step": 1137 }, { "epoch": 0.8530734632683659, "grad_norm": 0.24477833124890808, "learning_rate": 0.00018003154195615547, "loss": 0.0377, "step": 1138 }, { "epoch": 0.8538230884557722, "grad_norm": 0.5055205643247289, "learning_rate": 0.00017997920287942174, "loss": 0.0709, "step": 1139 }, { "epoch": 0.8545727136431784, "grad_norm": 0.10917773052009153, "learning_rate": 0.0001799268029288693, "loss": 0.0134, "step": 1140 }, { "epoch": 0.8553223388305847, "grad_norm": 0.11921356496141748, "learning_rate": 0.00017987434214438084, "loss": 0.0171, "step": 1141 }, { "epoch": 0.856071964017991, "grad_norm": 0.09046442975402154, "learning_rate": 0.00017982182056588535, "loss": 0.0164, "step": 1142 }, { "epoch": 0.8568215892053973, "grad_norm": 0.7470385217785291, "learning_rate": 0.0001797692382333581, "loss": 0.0853, "step": 1143 }, { "epoch": 0.8575712143928036, "grad_norm": 0.3587711861829344, "learning_rate": 0.00017971659518682055, "loss": 0.069, "step": 1144 }, { "epoch": 0.8583208395802099, "grad_norm": 0.13478931125708365, "learning_rate": 0.00017966389146634043, "loss": 0.0267, "step": 1145 }, { "epoch": 0.8590704647676162, "grad_norm": 0.31830295239537176, "learning_rate": 0.00017961112711203166, "loss": 0.0338, "step": 1146 }, { "epoch": 0.8598200899550225, "grad_norm": 0.3826627534808154, "learning_rate": 0.0001795583021640542, "loss": 0.0386, "step": 1147 }, { "epoch": 0.8605697151424287, "grad_norm": 0.34627413661240874, "learning_rate": 0.00017950541666261427, "loss": 0.026, "step": 1148 }, { "epoch": 0.861319340329835, "grad_norm": 0.16387687839960238, "learning_rate": 0.0001794524706479641, "loss": 0.0315, "step": 1149 }, { "epoch": 0.8620689655172413, "grad_norm": 0.1641245742855337, "learning_rate": 0.000179399464160402, "loss": 0.0311, "step": 1150 }, { "epoch": 0.8628185907046477, "grad_norm": 0.07336979330054261, "learning_rate": 0.00017934639724027224, "loss": 0.0084, "step": 1151 }, { "epoch": 0.863568215892054, "grad_norm": 0.12978420808613153, "learning_rate": 0.0001792932699279652, "loss": 0.0331, "step": 1152 }, { "epoch": 0.8643178410794603, "grad_norm": 0.14286036892050807, "learning_rate": 0.00017924008226391717, "loss": 0.0317, "step": 1153 }, { "epoch": 0.8650674662668666, "grad_norm": 0.32962370915972483, "learning_rate": 0.00017918683428861035, "loss": 0.0382, "step": 1154 }, { "epoch": 0.8658170914542729, "grad_norm": 0.13110929982565675, "learning_rate": 0.0001791335260425729, "loss": 0.0223, "step": 1155 }, { "epoch": 0.8665667166416792, "grad_norm": 0.2966167078785812, "learning_rate": 0.00017908015756637884, "loss": 0.0402, "step": 1156 }, { "epoch": 0.8673163418290855, "grad_norm": 0.19862644988985317, "learning_rate": 0.00017902672890064797, "loss": 0.026, "step": 1157 }, { "epoch": 0.8680659670164917, "grad_norm": 0.19852351054120773, "learning_rate": 0.00017897324008604598, "loss": 0.0343, "step": 1158 }, { "epoch": 0.868815592203898, "grad_norm": 0.1875769862756106, "learning_rate": 0.0001789196911632843, "loss": 0.0231, "step": 1159 }, { "epoch": 0.8695652173913043, "grad_norm": 0.1897377931741348, "learning_rate": 0.0001788660821731201, "loss": 0.0286, "step": 1160 }, { "epoch": 0.8703148425787106, "grad_norm": 0.14919402045830019, "learning_rate": 0.0001788124131563563, "loss": 0.0234, "step": 1161 }, { "epoch": 0.8710644677661169, "grad_norm": 0.42801423004173667, "learning_rate": 0.0001787586841538415, "loss": 0.0477, "step": 1162 }, { "epoch": 0.8718140929535232, "grad_norm": 0.17922410832990773, "learning_rate": 0.00017870489520646992, "loss": 0.0339, "step": 1163 }, { "epoch": 0.8725637181409296, "grad_norm": 0.22900265727699864, "learning_rate": 0.00017865104635518151, "loss": 0.0542, "step": 1164 }, { "epoch": 0.8733133433283359, "grad_norm": 0.1694529110436179, "learning_rate": 0.00017859713764096166, "loss": 0.024, "step": 1165 }, { "epoch": 0.8740629685157422, "grad_norm": 0.3762369755923995, "learning_rate": 0.00017854316910484146, "loss": 0.034, "step": 1166 }, { "epoch": 0.8748125937031485, "grad_norm": 0.31175400181963114, "learning_rate": 0.00017848914078789742, "loss": 0.038, "step": 1167 }, { "epoch": 0.8755622188905547, "grad_norm": 0.3913855756756641, "learning_rate": 0.00017843505273125164, "loss": 0.0691, "step": 1168 }, { "epoch": 0.876311844077961, "grad_norm": 0.14525941244746013, "learning_rate": 0.00017838090497607166, "loss": 0.0239, "step": 1169 }, { "epoch": 0.8770614692653673, "grad_norm": 0.34454983943420076, "learning_rate": 0.00017832669756357044, "loss": 0.0374, "step": 1170 }, { "epoch": 0.8778110944527736, "grad_norm": 0.15201977953343096, "learning_rate": 0.00017827243053500632, "loss": 0.0215, "step": 1171 }, { "epoch": 0.8785607196401799, "grad_norm": 0.11462754614358202, "learning_rate": 0.00017821810393168312, "loss": 0.0186, "step": 1172 }, { "epoch": 0.8793103448275862, "grad_norm": 0.48064716216895115, "learning_rate": 0.00017816371779494992, "loss": 0.0497, "step": 1173 }, { "epoch": 0.8800599700149925, "grad_norm": 0.1236272833087385, "learning_rate": 0.00017810927216620111, "loss": 0.0196, "step": 1174 }, { "epoch": 0.8808095952023988, "grad_norm": 0.1709420115474228, "learning_rate": 0.00017805476708687639, "loss": 0.0222, "step": 1175 }, { "epoch": 0.881559220389805, "grad_norm": 0.2602585822615709, "learning_rate": 0.00017800020259846068, "loss": 0.0397, "step": 1176 }, { "epoch": 0.8823088455772113, "grad_norm": 0.19903973260321675, "learning_rate": 0.00017794557874248413, "loss": 0.0259, "step": 1177 }, { "epoch": 0.8830584707646177, "grad_norm": 0.40046335200360744, "learning_rate": 0.00017789089556052214, "loss": 0.0891, "step": 1178 }, { "epoch": 0.883808095952024, "grad_norm": 0.7922737046828083, "learning_rate": 0.00017783615309419518, "loss": 0.1839, "step": 1179 }, { "epoch": 0.8845577211394303, "grad_norm": 0.4024668107918972, "learning_rate": 0.00017778135138516883, "loss": 0.0494, "step": 1180 }, { "epoch": 0.8853073463268366, "grad_norm": 0.19193099715331405, "learning_rate": 0.00017772649047515383, "loss": 0.0535, "step": 1181 }, { "epoch": 0.8860569715142429, "grad_norm": 0.22104073938037833, "learning_rate": 0.00017767157040590595, "loss": 0.0285, "step": 1182 }, { "epoch": 0.8868065967016492, "grad_norm": 0.22765987584867606, "learning_rate": 0.000177616591219226, "loss": 0.0277, "step": 1183 }, { "epoch": 0.8875562218890555, "grad_norm": 0.2934152221276665, "learning_rate": 0.0001775615529569597, "loss": 0.0361, "step": 1184 }, { "epoch": 0.8883058470764618, "grad_norm": 0.28250721417463565, "learning_rate": 0.0001775064556609979, "loss": 0.0306, "step": 1185 }, { "epoch": 0.889055472263868, "grad_norm": 0.49569143870479365, "learning_rate": 0.00017745129937327622, "loss": 0.066, "step": 1186 }, { "epoch": 0.8898050974512743, "grad_norm": 0.24149842890659284, "learning_rate": 0.0001773960841357753, "loss": 0.0533, "step": 1187 }, { "epoch": 0.8905547226386806, "grad_norm": 0.23085852188961126, "learning_rate": 0.00017734080999052054, "loss": 0.0299, "step": 1188 }, { "epoch": 0.8913043478260869, "grad_norm": 0.21446160288818875, "learning_rate": 0.00017728547697958228, "loss": 0.0398, "step": 1189 }, { "epoch": 0.8920539730134932, "grad_norm": 0.16362181582236918, "learning_rate": 0.00017723008514507557, "loss": 0.0336, "step": 1190 }, { "epoch": 0.8928035982008995, "grad_norm": 0.22547577091360518, "learning_rate": 0.00017717463452916033, "loss": 0.0168, "step": 1191 }, { "epoch": 0.8935532233883059, "grad_norm": 0.41468130480243326, "learning_rate": 0.00017711912517404108, "loss": 0.033, "step": 1192 }, { "epoch": 0.8943028485757122, "grad_norm": 0.1367013504712274, "learning_rate": 0.00017706355712196722, "loss": 0.0227, "step": 1193 }, { "epoch": 0.8950524737631185, "grad_norm": 0.30838009069288647, "learning_rate": 0.00017700793041523273, "loss": 0.0256, "step": 1194 }, { "epoch": 0.8958020989505248, "grad_norm": 0.3643600538498598, "learning_rate": 0.00017695224509617615, "loss": 0.0264, "step": 1195 }, { "epoch": 0.896551724137931, "grad_norm": 0.1773585712190774, "learning_rate": 0.00017689650120718083, "loss": 0.0206, "step": 1196 }, { "epoch": 0.8973013493253373, "grad_norm": 0.09466427230844304, "learning_rate": 0.00017684069879067457, "loss": 0.0164, "step": 1197 }, { "epoch": 0.8980509745127436, "grad_norm": 0.2478374046281075, "learning_rate": 0.00017678483788912972, "loss": 0.0345, "step": 1198 }, { "epoch": 0.8988005997001499, "grad_norm": 0.1835506836680278, "learning_rate": 0.00017672891854506314, "loss": 0.0314, "step": 1199 }, { "epoch": 0.8995502248875562, "grad_norm": 0.7432904340156298, "learning_rate": 0.00017667294080103623, "loss": 0.1117, "step": 1200 }, { "epoch": 0.9002998500749625, "grad_norm": 0.19874364295361727, "learning_rate": 0.00017661690469965478, "loss": 0.0422, "step": 1201 }, { "epoch": 0.9010494752623688, "grad_norm": 0.2525657860864409, "learning_rate": 0.00017656081028356906, "loss": 0.0285, "step": 1202 }, { "epoch": 0.9017991004497751, "grad_norm": 0.15560729781092428, "learning_rate": 0.00017650465759547364, "loss": 0.0271, "step": 1203 }, { "epoch": 0.9025487256371814, "grad_norm": 0.20738627856114195, "learning_rate": 0.0001764484466781075, "loss": 0.0411, "step": 1204 }, { "epoch": 0.9032983508245878, "grad_norm": 0.25912924094622075, "learning_rate": 0.00017639217757425398, "loss": 0.0519, "step": 1205 }, { "epoch": 0.904047976011994, "grad_norm": 0.20049698324538573, "learning_rate": 0.0001763358503267406, "loss": 0.0285, "step": 1206 }, { "epoch": 0.9047976011994003, "grad_norm": 0.07636996700298185, "learning_rate": 0.00017627946497843916, "loss": 0.0129, "step": 1207 }, { "epoch": 0.9055472263868066, "grad_norm": 0.5409845972277662, "learning_rate": 0.00017622302157226574, "loss": 0.0385, "step": 1208 }, { "epoch": 0.9062968515742129, "grad_norm": 0.1377468501829297, "learning_rate": 0.0001761665201511806, "loss": 0.0305, "step": 1209 }, { "epoch": 0.9070464767616192, "grad_norm": 0.14571322925380692, "learning_rate": 0.00017610996075818813, "loss": 0.0291, "step": 1210 }, { "epoch": 0.9077961019490255, "grad_norm": 0.17281825498288494, "learning_rate": 0.00017605334343633684, "loss": 0.0288, "step": 1211 }, { "epoch": 0.9085457271364318, "grad_norm": 0.15628256876301608, "learning_rate": 0.00017599666822871933, "loss": 0.02, "step": 1212 }, { "epoch": 0.9092953523238381, "grad_norm": 0.1626084608026224, "learning_rate": 0.0001759399351784723, "loss": 0.0203, "step": 1213 }, { "epoch": 0.9100449775112444, "grad_norm": 0.37674788969646644, "learning_rate": 0.00017588314432877638, "loss": 0.0397, "step": 1214 }, { "epoch": 0.9107946026986506, "grad_norm": 0.1760643627933756, "learning_rate": 0.00017582629572285633, "loss": 0.0284, "step": 1215 }, { "epoch": 0.9115442278860569, "grad_norm": 0.26890020727090713, "learning_rate": 0.00017576938940398076, "loss": 0.0317, "step": 1216 }, { "epoch": 0.9122938530734632, "grad_norm": 0.1377644628031859, "learning_rate": 0.00017571242541546223, "loss": 0.0073, "step": 1217 }, { "epoch": 0.9130434782608695, "grad_norm": 0.1248429742984794, "learning_rate": 0.0001756554038006572, "loss": 0.0175, "step": 1218 }, { "epoch": 0.9137931034482759, "grad_norm": 0.12377973795358645, "learning_rate": 0.00017559832460296604, "loss": 0.0073, "step": 1219 }, { "epoch": 0.9145427286356822, "grad_norm": 0.16832600619943616, "learning_rate": 0.0001755411878658329, "loss": 0.0355, "step": 1220 }, { "epoch": 0.9152923538230885, "grad_norm": 0.05515583642707854, "learning_rate": 0.00017548399363274565, "loss": 0.0049, "step": 1221 }, { "epoch": 0.9160419790104948, "grad_norm": 0.17631658341381273, "learning_rate": 0.00017542674194723608, "loss": 0.0353, "step": 1222 }, { "epoch": 0.9167916041979011, "grad_norm": 0.3291715350286628, "learning_rate": 0.0001753694328528796, "loss": 0.0463, "step": 1223 }, { "epoch": 0.9175412293853074, "grad_norm": 0.38547473671666316, "learning_rate": 0.00017531206639329533, "loss": 0.0348, "step": 1224 }, { "epoch": 0.9182908545727136, "grad_norm": 0.14435028574913475, "learning_rate": 0.0001752546426121461, "loss": 0.0158, "step": 1225 }, { "epoch": 0.9190404797601199, "grad_norm": 0.47189193230826404, "learning_rate": 0.00017519716155313828, "loss": 0.0408, "step": 1226 }, { "epoch": 0.9197901049475262, "grad_norm": 0.12512750733662412, "learning_rate": 0.00017513962326002193, "loss": 0.0164, "step": 1227 }, { "epoch": 0.9205397301349325, "grad_norm": 0.2270342675936685, "learning_rate": 0.0001750820277765906, "loss": 0.0451, "step": 1228 }, { "epoch": 0.9212893553223388, "grad_norm": 0.13846789875210425, "learning_rate": 0.00017502437514668143, "loss": 0.0241, "step": 1229 }, { "epoch": 0.9220389805097451, "grad_norm": 0.475479072807183, "learning_rate": 0.00017496666541417502, "loss": 0.0733, "step": 1230 }, { "epoch": 0.9227886056971514, "grad_norm": 0.18052319968526903, "learning_rate": 0.00017490889862299542, "loss": 0.0255, "step": 1231 }, { "epoch": 0.9235382308845578, "grad_norm": 0.19929368687111212, "learning_rate": 0.00017485107481711012, "loss": 0.0577, "step": 1232 }, { "epoch": 0.9242878560719641, "grad_norm": 0.42547844329682477, "learning_rate": 0.00017479319404053003, "loss": 0.0437, "step": 1233 }, { "epoch": 0.9250374812593704, "grad_norm": 0.18707097210297646, "learning_rate": 0.00017473525633730937, "loss": 0.0315, "step": 1234 }, { "epoch": 0.9257871064467766, "grad_norm": 0.5106290605728168, "learning_rate": 0.00017467726175154574, "loss": 0.0551, "step": 1235 }, { "epoch": 0.9265367316341829, "grad_norm": 0.5377982295452615, "learning_rate": 0.00017461921032738004, "loss": 0.0546, "step": 1236 }, { "epoch": 0.9272863568215892, "grad_norm": 0.18458832162076752, "learning_rate": 0.00017456110210899639, "loss": 0.0275, "step": 1237 }, { "epoch": 0.9280359820089955, "grad_norm": 0.29982500992143923, "learning_rate": 0.00017450293714062212, "loss": 0.041, "step": 1238 }, { "epoch": 0.9287856071964018, "grad_norm": 0.14029784308266852, "learning_rate": 0.00017444471546652782, "loss": 0.03, "step": 1239 }, { "epoch": 0.9295352323838081, "grad_norm": 0.1792643339212138, "learning_rate": 0.00017438643713102716, "loss": 0.043, "step": 1240 }, { "epoch": 0.9302848575712144, "grad_norm": 0.1536906615959804, "learning_rate": 0.000174328102178477, "loss": 0.0302, "step": 1241 }, { "epoch": 0.9310344827586207, "grad_norm": 0.11341392647383206, "learning_rate": 0.00017426971065327736, "loss": 0.0152, "step": 1242 }, { "epoch": 0.931784107946027, "grad_norm": 0.13502410688401909, "learning_rate": 0.0001742112625998711, "loss": 0.025, "step": 1243 }, { "epoch": 0.9325337331334332, "grad_norm": 0.1615541948769028, "learning_rate": 0.00017415275806274431, "loss": 0.0237, "step": 1244 }, { "epoch": 0.9332833583208395, "grad_norm": 0.17981864857011048, "learning_rate": 0.000174094197086426, "loss": 0.0506, "step": 1245 }, { "epoch": 0.9340329835082459, "grad_norm": 0.24992613967096078, "learning_rate": 0.0001740355797154881, "loss": 0.0409, "step": 1246 }, { "epoch": 0.9347826086956522, "grad_norm": 0.1892497454856021, "learning_rate": 0.00017397690599454553, "loss": 0.0415, "step": 1247 }, { "epoch": 0.9355322338830585, "grad_norm": 0.12789737367676784, "learning_rate": 0.00017391817596825604, "loss": 0.015, "step": 1248 }, { "epoch": 0.9362818590704648, "grad_norm": 0.2617865941401239, "learning_rate": 0.0001738593896813203, "loss": 0.044, "step": 1249 }, { "epoch": 0.9370314842578711, "grad_norm": 0.21519189097963973, "learning_rate": 0.00017380054717848172, "loss": 0.0262, "step": 1250 }, { "epoch": 0.9377811094452774, "grad_norm": 0.34850546167695395, "learning_rate": 0.0001737416485045266, "loss": 0.0831, "step": 1251 }, { "epoch": 0.9385307346326837, "grad_norm": 0.20079517269559174, "learning_rate": 0.00017368269370428388, "loss": 0.0394, "step": 1252 }, { "epoch": 0.93928035982009, "grad_norm": 0.11554891447368328, "learning_rate": 0.0001736236828226253, "loss": 0.0246, "step": 1253 }, { "epoch": 0.9400299850074962, "grad_norm": 0.22652384701168027, "learning_rate": 0.00017356461590446523, "loss": 0.0339, "step": 1254 }, { "epoch": 0.9407796101949025, "grad_norm": 0.231228100996135, "learning_rate": 0.00017350549299476072, "loss": 0.0541, "step": 1255 }, { "epoch": 0.9415292353823088, "grad_norm": 0.1731235239961964, "learning_rate": 0.0001734463141385115, "loss": 0.0231, "step": 1256 }, { "epoch": 0.9422788605697151, "grad_norm": 0.15592401116422774, "learning_rate": 0.00017338707938075972, "loss": 0.0329, "step": 1257 }, { "epoch": 0.9430284857571214, "grad_norm": 0.24034812804004377, "learning_rate": 0.00017332778876659027, "loss": 0.0316, "step": 1258 }, { "epoch": 0.9437781109445277, "grad_norm": 0.13463299904907405, "learning_rate": 0.00017326844234113038, "loss": 0.019, "step": 1259 }, { "epoch": 0.9445277361319341, "grad_norm": 0.11207675048093935, "learning_rate": 0.00017320904014954985, "loss": 0.0187, "step": 1260 }, { "epoch": 0.9452773613193404, "grad_norm": 0.323716163461882, "learning_rate": 0.00017314958223706094, "loss": 0.0279, "step": 1261 }, { "epoch": 0.9460269865067467, "grad_norm": 0.13799052831708086, "learning_rate": 0.00017309006864891827, "loss": 0.0194, "step": 1262 }, { "epoch": 0.946776611694153, "grad_norm": 0.21693844938529427, "learning_rate": 0.00017303049943041888, "loss": 0.0251, "step": 1263 }, { "epoch": 0.9475262368815592, "grad_norm": 0.135271477910859, "learning_rate": 0.00017297087462690215, "loss": 0.0183, "step": 1264 }, { "epoch": 0.9482758620689655, "grad_norm": 0.14842531923446786, "learning_rate": 0.00017291119428374966, "loss": 0.0149, "step": 1265 }, { "epoch": 0.9490254872563718, "grad_norm": 0.2968947241483329, "learning_rate": 0.00017285145844638542, "loss": 0.0402, "step": 1266 }, { "epoch": 0.9497751124437781, "grad_norm": 0.20604469545460485, "learning_rate": 0.00017279166716027559, "loss": 0.0287, "step": 1267 }, { "epoch": 0.9505247376311844, "grad_norm": 0.20275501855488534, "learning_rate": 0.00017273182047092855, "loss": 0.0118, "step": 1268 }, { "epoch": 0.9512743628185907, "grad_norm": 0.3031556099834936, "learning_rate": 0.00017267191842389482, "loss": 0.0535, "step": 1269 }, { "epoch": 0.952023988005997, "grad_norm": 0.1446261371898781, "learning_rate": 0.00017261196106476713, "loss": 0.0241, "step": 1270 }, { "epoch": 0.9527736131934033, "grad_norm": 0.4708395724862073, "learning_rate": 0.0001725519484391802, "loss": 0.0488, "step": 1271 }, { "epoch": 0.9535232383808095, "grad_norm": 0.2111295614057366, "learning_rate": 0.00017249188059281098, "loss": 0.0246, "step": 1272 }, { "epoch": 0.954272863568216, "grad_norm": 0.15133666839742155, "learning_rate": 0.0001724317575713782, "loss": 0.0279, "step": 1273 }, { "epoch": 0.9550224887556222, "grad_norm": 0.2522194496469408, "learning_rate": 0.0001723715794206429, "loss": 0.0621, "step": 1274 }, { "epoch": 0.9557721139430285, "grad_norm": 0.2112780955153208, "learning_rate": 0.00017231134618640774, "loss": 0.0475, "step": 1275 }, { "epoch": 0.9565217391304348, "grad_norm": 0.2880812439470075, "learning_rate": 0.00017225105791451756, "loss": 0.0386, "step": 1276 }, { "epoch": 0.9572713643178411, "grad_norm": 0.2982729221462802, "learning_rate": 0.00017219071465085904, "loss": 0.0377, "step": 1277 }, { "epoch": 0.9580209895052474, "grad_norm": 0.2542609303536709, "learning_rate": 0.00017213031644136063, "loss": 0.0449, "step": 1278 }, { "epoch": 0.9587706146926537, "grad_norm": 0.09945597804821016, "learning_rate": 0.00017206986333199266, "loss": 0.018, "step": 1279 }, { "epoch": 0.95952023988006, "grad_norm": 0.19852384643166057, "learning_rate": 0.00017200935536876722, "loss": 0.0308, "step": 1280 }, { "epoch": 0.9602698650674663, "grad_norm": 0.31496987120458725, "learning_rate": 0.00017194879259773815, "loss": 0.0279, "step": 1281 }, { "epoch": 0.9610194902548725, "grad_norm": 0.2644898077162191, "learning_rate": 0.00017188817506500113, "loss": 0.0227, "step": 1282 }, { "epoch": 0.9617691154422788, "grad_norm": 0.11459251760842831, "learning_rate": 0.00017182750281669328, "loss": 0.0245, "step": 1283 }, { "epoch": 0.9625187406296851, "grad_norm": 0.2852291865642535, "learning_rate": 0.00017176677589899358, "loss": 0.0434, "step": 1284 }, { "epoch": 0.9632683658170914, "grad_norm": 0.12222861445621339, "learning_rate": 0.00017170599435812253, "loss": 0.0188, "step": 1285 }, { "epoch": 0.9640179910044977, "grad_norm": 0.22805218335960845, "learning_rate": 0.0001716451582403422, "loss": 0.0394, "step": 1286 }, { "epoch": 0.9647676161919041, "grad_norm": 0.21051893238370895, "learning_rate": 0.0001715842675919562, "loss": 0.0372, "step": 1287 }, { "epoch": 0.9655172413793104, "grad_norm": 0.1634771437962494, "learning_rate": 0.00017152332245930967, "loss": 0.0259, "step": 1288 }, { "epoch": 0.9662668665667167, "grad_norm": 0.39264183246076684, "learning_rate": 0.00017146232288878918, "loss": 0.0498, "step": 1289 }, { "epoch": 0.967016491754123, "grad_norm": 0.07351653247369035, "learning_rate": 0.00017140126892682282, "loss": 0.0115, "step": 1290 }, { "epoch": 0.9677661169415293, "grad_norm": 0.1389315827033655, "learning_rate": 0.00017134016061987994, "loss": 0.0333, "step": 1291 }, { "epoch": 0.9685157421289355, "grad_norm": 0.07247032170621293, "learning_rate": 0.00017127899801447143, "loss": 0.008, "step": 1292 }, { "epoch": 0.9692653673163418, "grad_norm": 0.375106496168661, "learning_rate": 0.00017121778115714928, "loss": 0.0364, "step": 1293 }, { "epoch": 0.9700149925037481, "grad_norm": 0.11378529206241661, "learning_rate": 0.00017115651009450702, "loss": 0.0172, "step": 1294 }, { "epoch": 0.9707646176911544, "grad_norm": 0.14501111657727805, "learning_rate": 0.00017109518487317925, "loss": 0.0168, "step": 1295 }, { "epoch": 0.9715142428785607, "grad_norm": 0.14111436249266363, "learning_rate": 0.0001710338055398419, "loss": 0.0138, "step": 1296 }, { "epoch": 0.972263868065967, "grad_norm": 0.15700514015924621, "learning_rate": 0.000170972372141212, "loss": 0.0368, "step": 1297 }, { "epoch": 0.9730134932533733, "grad_norm": 0.09296493562932354, "learning_rate": 0.0001709108847240478, "loss": 0.0185, "step": 1298 }, { "epoch": 0.9737631184407796, "grad_norm": 0.1852179762312589, "learning_rate": 0.00017084934333514866, "loss": 0.0309, "step": 1299 }, { "epoch": 0.974512743628186, "grad_norm": 0.1591504875482633, "learning_rate": 0.00017078774802135494, "loss": 0.0428, "step": 1300 }, { "epoch": 0.9752623688155923, "grad_norm": 0.22413860967262877, "learning_rate": 0.00017072609882954817, "loss": 0.0583, "step": 1301 }, { "epoch": 0.9760119940029985, "grad_norm": 0.10757982979112977, "learning_rate": 0.00017066439580665078, "loss": 0.0171, "step": 1302 }, { "epoch": 0.9767616191904048, "grad_norm": 0.04943454155924863, "learning_rate": 0.00017060263899962622, "loss": 0.0053, "step": 1303 }, { "epoch": 0.9775112443778111, "grad_norm": 0.22475189797552722, "learning_rate": 0.00017054082845547886, "loss": 0.0248, "step": 1304 }, { "epoch": 0.9782608695652174, "grad_norm": 0.07570818993749279, "learning_rate": 0.000170478964221254, "loss": 0.0129, "step": 1305 }, { "epoch": 0.9790104947526237, "grad_norm": 0.10998906191991285, "learning_rate": 0.00017041704634403776, "loss": 0.0182, "step": 1306 }, { "epoch": 0.97976011994003, "grad_norm": 0.16522886925498625, "learning_rate": 0.00017035507487095714, "loss": 0.0357, "step": 1307 }, { "epoch": 0.9805097451274363, "grad_norm": 0.1231331490493064, "learning_rate": 0.0001702930498491799, "loss": 0.017, "step": 1308 }, { "epoch": 0.9812593703148426, "grad_norm": 0.17243252970067333, "learning_rate": 0.0001702309713259145, "loss": 0.0368, "step": 1309 }, { "epoch": 0.9820089955022488, "grad_norm": 0.1995148535367195, "learning_rate": 0.00017016883934841025, "loss": 0.0167, "step": 1310 }, { "epoch": 0.9827586206896551, "grad_norm": 0.22039155231203994, "learning_rate": 0.00017010665396395704, "loss": 0.0394, "step": 1311 }, { "epoch": 0.9835082458770614, "grad_norm": 0.20335893313531456, "learning_rate": 0.0001700444152198855, "loss": 0.0574, "step": 1312 }, { "epoch": 0.9842578710644677, "grad_norm": 0.40000935163353163, "learning_rate": 0.00016998212316356677, "loss": 0.0873, "step": 1313 }, { "epoch": 0.9850074962518741, "grad_norm": 0.2006157626929846, "learning_rate": 0.00016991977784241262, "loss": 0.0294, "step": 1314 }, { "epoch": 0.9857571214392804, "grad_norm": 0.3823245678319009, "learning_rate": 0.00016985737930387537, "loss": 0.0553, "step": 1315 }, { "epoch": 0.9865067466266867, "grad_norm": 0.16980459817111984, "learning_rate": 0.00016979492759544785, "loss": 0.0329, "step": 1316 }, { "epoch": 0.987256371814093, "grad_norm": 0.19056633968766426, "learning_rate": 0.00016973242276466333, "loss": 0.0375, "step": 1317 }, { "epoch": 0.9880059970014993, "grad_norm": 0.09073012938358295, "learning_rate": 0.00016966986485909548, "loss": 0.0188, "step": 1318 }, { "epoch": 0.9887556221889056, "grad_norm": 0.32659909705802087, "learning_rate": 0.0001696072539263585, "loss": 0.0364, "step": 1319 }, { "epoch": 0.9895052473763118, "grad_norm": 0.284602610836338, "learning_rate": 0.0001695445900141068, "loss": 0.0635, "step": 1320 }, { "epoch": 0.9902548725637181, "grad_norm": 0.20725592747179133, "learning_rate": 0.00016948187317003516, "loss": 0.0542, "step": 1321 }, { "epoch": 0.9910044977511244, "grad_norm": 0.20036234880761852, "learning_rate": 0.0001694191034418787, "loss": 0.0351, "step": 1322 }, { "epoch": 0.9917541229385307, "grad_norm": 0.25074475113235223, "learning_rate": 0.00016935628087741273, "loss": 0.0259, "step": 1323 }, { "epoch": 0.992503748125937, "grad_norm": 0.19747618705563696, "learning_rate": 0.00016929340552445282, "loss": 0.0343, "step": 1324 }, { "epoch": 0.9932533733133433, "grad_norm": 0.14414285780381583, "learning_rate": 0.00016923047743085467, "loss": 0.0223, "step": 1325 }, { "epoch": 0.9940029985007496, "grad_norm": 0.2723679181126613, "learning_rate": 0.00016916749664451416, "loss": 0.0294, "step": 1326 }, { "epoch": 0.9947526236881559, "grad_norm": 0.21451817983901414, "learning_rate": 0.0001691044632133673, "loss": 0.0484, "step": 1327 }, { "epoch": 0.9955022488755623, "grad_norm": 0.15711266739216587, "learning_rate": 0.00016904137718539005, "loss": 0.0298, "step": 1328 }, { "epoch": 0.9962518740629686, "grad_norm": 0.1379956124539861, "learning_rate": 0.00016897823860859856, "loss": 0.0144, "step": 1329 }, { "epoch": 0.9970014992503748, "grad_norm": 0.21097619659308514, "learning_rate": 0.00016891504753104887, "loss": 0.0264, "step": 1330 }, { "epoch": 0.9977511244377811, "grad_norm": 0.10797556881915736, "learning_rate": 0.00016885180400083702, "loss": 0.0168, "step": 1331 }, { "epoch": 0.9985007496251874, "grad_norm": 0.15790769385517053, "learning_rate": 0.00016878850806609897, "loss": 0.0274, "step": 1332 }, { "epoch": 0.9992503748125937, "grad_norm": 0.14995259627727942, "learning_rate": 0.00016872515977501055, "loss": 0.0338, "step": 1333 }, { "epoch": 1.0, "grad_norm": 0.23881055263244724, "learning_rate": 0.0001686617591757874, "loss": 0.0331, "step": 1334 }, { "epoch": 1.0, "eval_loss": 0.03587876632809639, "eval_runtime": 1844.658, "eval_samples_per_second": 5.625, "eval_steps_per_second": 0.703, "step": 1334 }, { "epoch": 1.0007496251874064, "grad_norm": 0.08185628595380386, "learning_rate": 0.00016859830631668513, "loss": 0.0095, "step": 1335 }, { "epoch": 1.0014992503748126, "grad_norm": 0.17881441136974335, "learning_rate": 0.0001685348012459989, "loss": 0.0363, "step": 1336 }, { "epoch": 1.002248875562219, "grad_norm": 0.13731889201420747, "learning_rate": 0.00016847124401206384, "loss": 0.0159, "step": 1337 }, { "epoch": 1.0029985007496252, "grad_norm": 0.17747397404715173, "learning_rate": 0.00016840763466325457, "loss": 0.029, "step": 1338 }, { "epoch": 1.0037481259370316, "grad_norm": 0.2158747869092809, "learning_rate": 0.00016834397324798554, "loss": 0.0296, "step": 1339 }, { "epoch": 1.0044977511244377, "grad_norm": 0.1762430900702256, "learning_rate": 0.00016828025981471074, "loss": 0.0246, "step": 1340 }, { "epoch": 1.0052473763118441, "grad_norm": 0.2791849833350819, "learning_rate": 0.00016821649441192379, "loss": 0.022, "step": 1341 }, { "epoch": 1.0059970014992503, "grad_norm": 0.21450121217367965, "learning_rate": 0.00016815267708815784, "loss": 0.038, "step": 1342 }, { "epoch": 1.0067466266866567, "grad_norm": 0.10815207403591505, "learning_rate": 0.0001680888078919856, "loss": 0.0172, "step": 1343 }, { "epoch": 1.0074962518740629, "grad_norm": 0.07090930239731541, "learning_rate": 0.00016802488687201917, "loss": 0.0103, "step": 1344 }, { "epoch": 1.0082458770614693, "grad_norm": 0.2098963791136399, "learning_rate": 0.0001679609140769102, "loss": 0.0361, "step": 1345 }, { "epoch": 1.0089955022488755, "grad_norm": 0.09496829420002569, "learning_rate": 0.00016789688955534966, "loss": 0.014, "step": 1346 }, { "epoch": 1.0097451274362819, "grad_norm": 0.5919358618067819, "learning_rate": 0.000167832813356068, "loss": 0.0555, "step": 1347 }, { "epoch": 1.0104947526236883, "grad_norm": 0.28060088970632135, "learning_rate": 0.00016776868552783488, "loss": 0.0309, "step": 1348 }, { "epoch": 1.0112443778110944, "grad_norm": 0.09028108389815097, "learning_rate": 0.00016770450611945935, "loss": 0.0158, "step": 1349 }, { "epoch": 1.0119940029985008, "grad_norm": 0.12178089448441314, "learning_rate": 0.00016764027517978961, "loss": 0.0207, "step": 1350 }, { "epoch": 1.012743628185907, "grad_norm": 0.3504490855417358, "learning_rate": 0.00016757599275771324, "loss": 0.0585, "step": 1351 }, { "epoch": 1.0134932533733134, "grad_norm": 0.15135554030374876, "learning_rate": 0.00016751165890215686, "loss": 0.0265, "step": 1352 }, { "epoch": 1.0142428785607196, "grad_norm": 0.1888391656646971, "learning_rate": 0.00016744727366208633, "loss": 0.0173, "step": 1353 }, { "epoch": 1.014992503748126, "grad_norm": 0.24059619446312613, "learning_rate": 0.00016738283708650652, "loss": 0.0111, "step": 1354 }, { "epoch": 1.0157421289355322, "grad_norm": 0.15515112646263388, "learning_rate": 0.0001673183492244615, "loss": 0.0183, "step": 1355 }, { "epoch": 1.0164917541229386, "grad_norm": 0.05867960140949975, "learning_rate": 0.00016725381012503427, "loss": 0.0084, "step": 1356 }, { "epoch": 1.0172413793103448, "grad_norm": 0.15773163802954165, "learning_rate": 0.0001671892198373469, "loss": 0.0253, "step": 1357 }, { "epoch": 1.0179910044977512, "grad_norm": 0.21478798582104225, "learning_rate": 0.00016712457841056038, "loss": 0.0236, "step": 1358 }, { "epoch": 1.0187406296851573, "grad_norm": 0.22090328678852036, "learning_rate": 0.0001670598858938746, "loss": 0.0173, "step": 1359 }, { "epoch": 1.0194902548725637, "grad_norm": 0.1499121573611819, "learning_rate": 0.0001669951423365284, "loss": 0.0255, "step": 1360 }, { "epoch": 1.02023988005997, "grad_norm": 0.1831800231984105, "learning_rate": 0.00016693034778779943, "loss": 0.0172, "step": 1361 }, { "epoch": 1.0209895052473763, "grad_norm": 0.1492805212650543, "learning_rate": 0.00016686550229700413, "loss": 0.013, "step": 1362 }, { "epoch": 1.0217391304347827, "grad_norm": 0.34305168405576464, "learning_rate": 0.00016680060591349775, "loss": 0.0373, "step": 1363 }, { "epoch": 1.0224887556221889, "grad_norm": 0.13975032211968622, "learning_rate": 0.00016673565868667432, "loss": 0.0161, "step": 1364 }, { "epoch": 1.0232383808095953, "grad_norm": 0.3064630006414311, "learning_rate": 0.00016667066066596643, "loss": 0.0496, "step": 1365 }, { "epoch": 1.0239880059970015, "grad_norm": 0.0826424133342666, "learning_rate": 0.00016660561190084546, "loss": 0.0125, "step": 1366 }, { "epoch": 1.0247376311844079, "grad_norm": 0.1382847007841654, "learning_rate": 0.00016654051244082137, "loss": 0.0135, "step": 1367 }, { "epoch": 1.025487256371814, "grad_norm": 0.14488556357441107, "learning_rate": 0.00016647536233544265, "loss": 0.0159, "step": 1368 }, { "epoch": 1.0262368815592204, "grad_norm": 0.2429422011210165, "learning_rate": 0.00016641016163429644, "loss": 0.0261, "step": 1369 }, { "epoch": 1.0269865067466266, "grad_norm": 0.09274489305019605, "learning_rate": 0.0001663449103870083, "loss": 0.0127, "step": 1370 }, { "epoch": 1.027736131934033, "grad_norm": 0.13706007234801748, "learning_rate": 0.00016627960864324233, "loss": 0.0186, "step": 1371 }, { "epoch": 1.0284857571214392, "grad_norm": 0.1667184859979947, "learning_rate": 0.00016621425645270099, "loss": 0.0469, "step": 1372 }, { "epoch": 1.0292353823088456, "grad_norm": 0.13711825824543697, "learning_rate": 0.00016614885386512525, "loss": 0.017, "step": 1373 }, { "epoch": 1.0299850074962518, "grad_norm": 0.15682613258083772, "learning_rate": 0.00016608340093029428, "loss": 0.0153, "step": 1374 }, { "epoch": 1.0307346326836582, "grad_norm": 0.09438959849052585, "learning_rate": 0.0001660178976980257, "loss": 0.0099, "step": 1375 }, { "epoch": 1.0314842578710646, "grad_norm": 0.15132109568049978, "learning_rate": 0.0001659523442181754, "loss": 0.0197, "step": 1376 }, { "epoch": 1.0322338830584707, "grad_norm": 0.08767583362202569, "learning_rate": 0.00016588674054063743, "loss": 0.011, "step": 1377 }, { "epoch": 1.0329835082458771, "grad_norm": 0.21411442763454988, "learning_rate": 0.0001658210867153441, "loss": 0.0137, "step": 1378 }, { "epoch": 1.0337331334332833, "grad_norm": 0.1454566222783013, "learning_rate": 0.00016575538279226594, "loss": 0.0146, "step": 1379 }, { "epoch": 1.0344827586206897, "grad_norm": 0.13381699143135503, "learning_rate": 0.00016568962882141156, "loss": 0.0219, "step": 1380 }, { "epoch": 1.035232383808096, "grad_norm": 0.2426147235827075, "learning_rate": 0.00016562382485282758, "loss": 0.0184, "step": 1381 }, { "epoch": 1.0359820089955023, "grad_norm": 0.6098398132822853, "learning_rate": 0.00016555797093659882, "loss": 0.0134, "step": 1382 }, { "epoch": 1.0367316341829085, "grad_norm": 0.18837425350082296, "learning_rate": 0.00016549206712284807, "loss": 0.0155, "step": 1383 }, { "epoch": 1.0374812593703149, "grad_norm": 0.16473229178818735, "learning_rate": 0.000165426113461736, "loss": 0.0257, "step": 1384 }, { "epoch": 1.038230884557721, "grad_norm": 0.23267668027297075, "learning_rate": 0.00016536011000346137, "loss": 0.0318, "step": 1385 }, { "epoch": 1.0389805097451275, "grad_norm": 0.16538388110134583, "learning_rate": 0.00016529405679826075, "loss": 0.029, "step": 1386 }, { "epoch": 1.0397301349325336, "grad_norm": 0.2169175282271481, "learning_rate": 0.00016522795389640858, "loss": 0.0146, "step": 1387 }, { "epoch": 1.04047976011994, "grad_norm": 0.1584369960804152, "learning_rate": 0.00016516180134821718, "loss": 0.0293, "step": 1388 }, { "epoch": 1.0412293853073464, "grad_norm": 0.22755520337638765, "learning_rate": 0.0001650955992040366, "loss": 0.0219, "step": 1389 }, { "epoch": 1.0419790104947526, "grad_norm": 0.13611778505409552, "learning_rate": 0.00016502934751425467, "loss": 0.0307, "step": 1390 }, { "epoch": 1.042728635682159, "grad_norm": 0.3217127284596519, "learning_rate": 0.00016496304632929686, "loss": 0.014, "step": 1391 }, { "epoch": 1.0434782608695652, "grad_norm": 0.16275994278824357, "learning_rate": 0.00016489669569962637, "loss": 0.0177, "step": 1392 }, { "epoch": 1.0442278860569716, "grad_norm": 0.36007736364563725, "learning_rate": 0.00016483029567574412, "loss": 0.0148, "step": 1393 }, { "epoch": 1.0449775112443778, "grad_norm": 0.31673298912243997, "learning_rate": 0.00016476384630818847, "loss": 0.0311, "step": 1394 }, { "epoch": 1.0457271364317842, "grad_norm": 0.24465021912040835, "learning_rate": 0.0001646973476475354, "loss": 0.0322, "step": 1395 }, { "epoch": 1.0464767616191903, "grad_norm": 0.13745202157374345, "learning_rate": 0.00016463079974439842, "loss": 0.0138, "step": 1396 }, { "epoch": 1.0472263868065967, "grad_norm": 0.22563440257592166, "learning_rate": 0.0001645642026494285, "loss": 0.0246, "step": 1397 }, { "epoch": 1.047976011994003, "grad_norm": 0.17875194448102819, "learning_rate": 0.00016449755641331407, "loss": 0.022, "step": 1398 }, { "epoch": 1.0487256371814093, "grad_norm": 0.2036285436358285, "learning_rate": 0.00016443086108678098, "loss": 0.021, "step": 1399 }, { "epoch": 1.0494752623688155, "grad_norm": 0.20156890156581853, "learning_rate": 0.00016436411672059238, "loss": 0.0109, "step": 1400 }, { "epoch": 1.050224887556222, "grad_norm": 0.395941922751957, "learning_rate": 0.0001642973233655488, "loss": 0.03, "step": 1401 }, { "epoch": 1.050974512743628, "grad_norm": 0.061274010065021596, "learning_rate": 0.00016423048107248802, "loss": 0.0055, "step": 1402 }, { "epoch": 1.0517241379310345, "grad_norm": 0.19642591911550913, "learning_rate": 0.00016416358989228508, "loss": 0.0243, "step": 1403 }, { "epoch": 1.0524737631184409, "grad_norm": 0.17905794054557944, "learning_rate": 0.00016409664987585232, "loss": 0.0256, "step": 1404 }, { "epoch": 1.053223388305847, "grad_norm": 0.08570887447141709, "learning_rate": 0.00016402966107413903, "loss": 0.0068, "step": 1405 }, { "epoch": 1.0539730134932535, "grad_norm": 0.25819303796441384, "learning_rate": 0.00016396262353813188, "loss": 0.0452, "step": 1406 }, { "epoch": 1.0547226386806596, "grad_norm": 0.14319320724859413, "learning_rate": 0.00016389553731885445, "loss": 0.0307, "step": 1407 }, { "epoch": 1.055472263868066, "grad_norm": 0.08861014982546456, "learning_rate": 0.0001638284024673675, "loss": 0.02, "step": 1408 }, { "epoch": 1.0562218890554722, "grad_norm": 0.42806895391787697, "learning_rate": 0.00016376121903476865, "loss": 0.0354, "step": 1409 }, { "epoch": 1.0569715142428786, "grad_norm": 0.05845392704546912, "learning_rate": 0.0001636939870721927, "loss": 0.0047, "step": 1410 }, { "epoch": 1.0577211394302848, "grad_norm": 0.22523256553737497, "learning_rate": 0.00016362670663081126, "loss": 0.0256, "step": 1411 }, { "epoch": 1.0584707646176912, "grad_norm": 0.2390347688368189, "learning_rate": 0.0001635593777618328, "loss": 0.0269, "step": 1412 }, { "epoch": 1.0592203898050974, "grad_norm": 0.23149230743504765, "learning_rate": 0.0001634920005165028, "loss": 0.0291, "step": 1413 }, { "epoch": 1.0599700149925038, "grad_norm": 0.13249901049153695, "learning_rate": 0.00016342457494610338, "loss": 0.0228, "step": 1414 }, { "epoch": 1.06071964017991, "grad_norm": 0.18082496024917116, "learning_rate": 0.0001633571011019536, "loss": 0.0366, "step": 1415 }, { "epoch": 1.0614692653673163, "grad_norm": 0.2076705932655202, "learning_rate": 0.00016328957903540917, "loss": 0.1393, "step": 1416 }, { "epoch": 1.0622188905547227, "grad_norm": 0.08378805485028017, "learning_rate": 0.00016322200879786248, "loss": 0.0143, "step": 1417 }, { "epoch": 1.062968515742129, "grad_norm": 0.2566888867792056, "learning_rate": 0.0001631543904407427, "loss": 0.0427, "step": 1418 }, { "epoch": 1.0637181409295353, "grad_norm": 0.18958504372246548, "learning_rate": 0.0001630867240155155, "loss": 0.0197, "step": 1419 }, { "epoch": 1.0644677661169415, "grad_norm": 0.134466245166497, "learning_rate": 0.00016301900957368321, "loss": 0.0197, "step": 1420 }, { "epoch": 1.065217391304348, "grad_norm": 0.23636192887764287, "learning_rate": 0.00016295124716678473, "loss": 0.0307, "step": 1421 }, { "epoch": 1.065967016491754, "grad_norm": 0.3062339725051187, "learning_rate": 0.00016288343684639535, "loss": 0.018, "step": 1422 }, { "epoch": 1.0667166416791605, "grad_norm": 0.22431620780731476, "learning_rate": 0.00016281557866412692, "loss": 0.0234, "step": 1423 }, { "epoch": 1.0674662668665666, "grad_norm": 0.4321605036607824, "learning_rate": 0.00016274767267162777, "loss": 0.0139, "step": 1424 }, { "epoch": 1.068215892053973, "grad_norm": 0.23333187413650924, "learning_rate": 0.00016267971892058243, "loss": 0.0429, "step": 1425 }, { "epoch": 1.0689655172413792, "grad_norm": 0.19561907405199422, "learning_rate": 0.000162611717462712, "loss": 0.0249, "step": 1426 }, { "epoch": 1.0697151424287856, "grad_norm": 0.11527450520671127, "learning_rate": 0.00016254366834977374, "loss": 0.0068, "step": 1427 }, { "epoch": 1.0704647676161918, "grad_norm": 0.17629341766686163, "learning_rate": 0.00016247557163356127, "loss": 0.0128, "step": 1428 }, { "epoch": 1.0712143928035982, "grad_norm": 0.3968345991829092, "learning_rate": 0.00016240742736590438, "loss": 0.0218, "step": 1429 }, { "epoch": 1.0719640179910046, "grad_norm": 0.16395160947758386, "learning_rate": 0.00016233923559866902, "loss": 0.0215, "step": 1430 }, { "epoch": 1.0727136431784108, "grad_norm": 0.14224442019132602, "learning_rate": 0.00016227099638375746, "loss": 0.009, "step": 1431 }, { "epoch": 1.0734632683658172, "grad_norm": 0.26354074290357404, "learning_rate": 0.0001622027097731079, "loss": 0.041, "step": 1432 }, { "epoch": 1.0742128935532234, "grad_norm": 0.20767258252642307, "learning_rate": 0.00016213437581869472, "loss": 0.0202, "step": 1433 }, { "epoch": 1.0749625187406298, "grad_norm": 0.09750442000754214, "learning_rate": 0.00016206599457252826, "loss": 0.0127, "step": 1434 }, { "epoch": 1.075712143928036, "grad_norm": 0.1912404856886841, "learning_rate": 0.0001619975660866549, "loss": 0.0155, "step": 1435 }, { "epoch": 1.0764617691154423, "grad_norm": 0.18752152354183707, "learning_rate": 0.000161929090413157, "loss": 0.0214, "step": 1436 }, { "epoch": 1.0772113943028485, "grad_norm": 0.13332532955034693, "learning_rate": 0.00016186056760415278, "loss": 0.0221, "step": 1437 }, { "epoch": 1.077961019490255, "grad_norm": 0.1908859896689113, "learning_rate": 0.0001617919977117964, "loss": 0.0195, "step": 1438 }, { "epoch": 1.078710644677661, "grad_norm": 0.35561503622368495, "learning_rate": 0.0001617233807882778, "loss": 0.0903, "step": 1439 }, { "epoch": 1.0794602698650675, "grad_norm": 0.1191563810615422, "learning_rate": 0.00016165471688582269, "loss": 0.0144, "step": 1440 }, { "epoch": 1.0802098950524737, "grad_norm": 0.10310384566107852, "learning_rate": 0.00016158600605669263, "loss": 0.0155, "step": 1441 }, { "epoch": 1.08095952023988, "grad_norm": 0.16808318985944135, "learning_rate": 0.00016151724835318482, "loss": 0.0298, "step": 1442 }, { "epoch": 1.0817091454272862, "grad_norm": 0.2928806936676149, "learning_rate": 0.0001614484438276322, "loss": 0.0274, "step": 1443 }, { "epoch": 1.0824587706146926, "grad_norm": 0.3014557599632698, "learning_rate": 0.00016137959253240328, "loss": 0.0612, "step": 1444 }, { "epoch": 1.083208395802099, "grad_norm": 0.2780565413334579, "learning_rate": 0.0001613106945199022, "loss": 0.009, "step": 1445 }, { "epoch": 1.0839580209895052, "grad_norm": 0.18495490207774667, "learning_rate": 0.0001612417498425687, "loss": 0.0177, "step": 1446 }, { "epoch": 1.0847076461769116, "grad_norm": 0.24486514912871213, "learning_rate": 0.0001611727585528779, "loss": 0.0478, "step": 1447 }, { "epoch": 1.0854572713643178, "grad_norm": 0.11918873624620073, "learning_rate": 0.0001611037207033406, "loss": 0.0175, "step": 1448 }, { "epoch": 1.0862068965517242, "grad_norm": 0.1785435197671633, "learning_rate": 0.00016103463634650284, "loss": 0.0135, "step": 1449 }, { "epoch": 1.0869565217391304, "grad_norm": 0.12194464031268934, "learning_rate": 0.0001609655055349462, "loss": 0.0181, "step": 1450 }, { "epoch": 1.0877061469265368, "grad_norm": 0.09997546100461988, "learning_rate": 0.00016089632832128756, "loss": 0.0169, "step": 1451 }, { "epoch": 1.088455772113943, "grad_norm": 0.14404140334250182, "learning_rate": 0.00016082710475817912, "loss": 0.0117, "step": 1452 }, { "epoch": 1.0892053973013494, "grad_norm": 0.22062567051623222, "learning_rate": 0.00016075783489830834, "loss": 0.034, "step": 1453 }, { "epoch": 1.0899550224887555, "grad_norm": 0.13431333548606741, "learning_rate": 0.00016068851879439793, "loss": 0.0172, "step": 1454 }, { "epoch": 1.090704647676162, "grad_norm": 0.10841985313441686, "learning_rate": 0.00016061915649920585, "loss": 0.0136, "step": 1455 }, { "epoch": 1.0914542728635683, "grad_norm": 0.15014258139931097, "learning_rate": 0.00016054974806552514, "loss": 0.018, "step": 1456 }, { "epoch": 1.0922038980509745, "grad_norm": 0.14319312568552878, "learning_rate": 0.00016048029354618398, "loss": 0.0166, "step": 1457 }, { "epoch": 1.092953523238381, "grad_norm": 0.25933770441264264, "learning_rate": 0.0001604107929940457, "loss": 0.043, "step": 1458 }, { "epoch": 1.093703148425787, "grad_norm": 0.08961664023952531, "learning_rate": 0.00016034124646200848, "loss": 0.013, "step": 1459 }, { "epoch": 1.0944527736131935, "grad_norm": 0.29012788304698733, "learning_rate": 0.00016027165400300572, "loss": 0.028, "step": 1460 }, { "epoch": 1.0952023988005997, "grad_norm": 0.15912351000061195, "learning_rate": 0.00016020201567000565, "loss": 0.014, "step": 1461 }, { "epoch": 1.095952023988006, "grad_norm": 0.16601612022448584, "learning_rate": 0.00016013233151601142, "loss": 0.0198, "step": 1462 }, { "epoch": 1.0967016491754122, "grad_norm": 0.12808278580064933, "learning_rate": 0.00016006260159406112, "loss": 0.0136, "step": 1463 }, { "epoch": 1.0974512743628186, "grad_norm": 0.6327112833083882, "learning_rate": 0.00015999282595722758, "loss": 0.0392, "step": 1464 }, { "epoch": 1.0982008995502248, "grad_norm": 0.08472043991318816, "learning_rate": 0.00015992300465861848, "loss": 0.0114, "step": 1465 }, { "epoch": 1.0989505247376312, "grad_norm": 0.1909500285972242, "learning_rate": 0.00015985313775137628, "loss": 0.0168, "step": 1466 }, { "epoch": 1.0997001499250374, "grad_norm": 0.19173488400546776, "learning_rate": 0.00015978322528867808, "loss": 0.033, "step": 1467 }, { "epoch": 1.1004497751124438, "grad_norm": 0.07865480318728033, "learning_rate": 0.00015971326732373573, "loss": 0.0106, "step": 1468 }, { "epoch": 1.10119940029985, "grad_norm": 0.3808152008530821, "learning_rate": 0.00015964326390979566, "loss": 0.0431, "step": 1469 }, { "epoch": 1.1019490254872564, "grad_norm": 0.12153952973021516, "learning_rate": 0.00015957321510013894, "loss": 0.0157, "step": 1470 }, { "epoch": 1.1026986506746628, "grad_norm": 0.18323525625666828, "learning_rate": 0.0001595031209480811, "loss": 0.0288, "step": 1471 }, { "epoch": 1.103448275862069, "grad_norm": 0.21675767974438637, "learning_rate": 0.0001594329815069723, "loss": 0.0203, "step": 1472 }, { "epoch": 1.1041979010494753, "grad_norm": 0.14831560448490197, "learning_rate": 0.0001593627968301971, "loss": 0.0257, "step": 1473 }, { "epoch": 1.1049475262368815, "grad_norm": 0.11944549610257972, "learning_rate": 0.0001592925669711745, "loss": 0.0089, "step": 1474 }, { "epoch": 1.105697151424288, "grad_norm": 0.1388075886641613, "learning_rate": 0.00015922229198335786, "loss": 0.0156, "step": 1475 }, { "epoch": 1.106446776611694, "grad_norm": 0.1396408694801958, "learning_rate": 0.000159151971920235, "loss": 0.0216, "step": 1476 }, { "epoch": 1.1071964017991005, "grad_norm": 0.20531653159393498, "learning_rate": 0.00015908160683532788, "loss": 0.0282, "step": 1477 }, { "epoch": 1.1079460269865067, "grad_norm": 0.07127518191582545, "learning_rate": 0.00015901119678219286, "loss": 0.0103, "step": 1478 }, { "epoch": 1.108695652173913, "grad_norm": 0.25309598215724977, "learning_rate": 0.0001589407418144205, "loss": 0.0265, "step": 1479 }, { "epoch": 1.1094452773613193, "grad_norm": 0.08942874508347794, "learning_rate": 0.00015887024198563552, "loss": 0.01, "step": 1480 }, { "epoch": 1.1101949025487257, "grad_norm": 0.23687972616077854, "learning_rate": 0.00015879969734949675, "loss": 0.011, "step": 1481 }, { "epoch": 1.1109445277361318, "grad_norm": 0.4708176787292712, "learning_rate": 0.00015872910795969718, "loss": 0.0453, "step": 1482 }, { "epoch": 1.1116941529235382, "grad_norm": 0.1814159239414209, "learning_rate": 0.00015865847386996386, "loss": 0.0213, "step": 1483 }, { "epoch": 1.1124437781109444, "grad_norm": 0.2884094234411925, "learning_rate": 0.00015858779513405784, "loss": 0.085, "step": 1484 }, { "epoch": 1.1131934032983508, "grad_norm": 0.1495251782404274, "learning_rate": 0.0001585170718057742, "loss": 0.0362, "step": 1485 }, { "epoch": 1.1139430284857572, "grad_norm": 0.22512374680856442, "learning_rate": 0.00015844630393894188, "loss": 0.0314, "step": 1486 }, { "epoch": 1.1146926536731634, "grad_norm": 0.09988941306056143, "learning_rate": 0.00015837549158742378, "loss": 0.0147, "step": 1487 }, { "epoch": 1.1154422788605698, "grad_norm": 0.0901363000558977, "learning_rate": 0.00015830463480511663, "loss": 0.0131, "step": 1488 }, { "epoch": 1.116191904047976, "grad_norm": 0.16402213224909407, "learning_rate": 0.000158233733645951, "loss": 0.0293, "step": 1489 }, { "epoch": 1.1169415292353824, "grad_norm": 0.13463191653600406, "learning_rate": 0.0001581627881638912, "loss": 0.0081, "step": 1490 }, { "epoch": 1.1176911544227885, "grad_norm": 0.15770222986745397, "learning_rate": 0.0001580917984129353, "loss": 0.0224, "step": 1491 }, { "epoch": 1.118440779610195, "grad_norm": 0.16417954231468038, "learning_rate": 0.0001580207644471151, "loss": 0.0274, "step": 1492 }, { "epoch": 1.1191904047976011, "grad_norm": 0.07265795493103328, "learning_rate": 0.000157949686320496, "loss": 0.0089, "step": 1493 }, { "epoch": 1.1199400299850075, "grad_norm": 0.13153007530375596, "learning_rate": 0.00015787856408717697, "loss": 0.0151, "step": 1494 }, { "epoch": 1.1206896551724137, "grad_norm": 0.212497179793201, "learning_rate": 0.00015780739780129068, "loss": 0.0182, "step": 1495 }, { "epoch": 1.12143928035982, "grad_norm": 0.18185328948368945, "learning_rate": 0.0001577361875170032, "loss": 0.0334, "step": 1496 }, { "epoch": 1.1221889055472265, "grad_norm": 0.1384683028189125, "learning_rate": 0.0001576649332885142, "loss": 0.0122, "step": 1497 }, { "epoch": 1.1229385307346327, "grad_norm": 0.13502693122215248, "learning_rate": 0.0001575936351700567, "loss": 0.0146, "step": 1498 }, { "epoch": 1.123688155922039, "grad_norm": 0.2647766754878005, "learning_rate": 0.00015752229321589717, "loss": 0.0338, "step": 1499 }, { "epoch": 1.1244377811094453, "grad_norm": 0.16050841014726616, "learning_rate": 0.00015745090748033546, "loss": 0.0197, "step": 1500 }, { "epoch": 1.1251874062968517, "grad_norm": 0.21556781551128099, "learning_rate": 0.0001573794780177047, "loss": 0.0196, "step": 1501 }, { "epoch": 1.1259370314842578, "grad_norm": 0.10147478324129008, "learning_rate": 0.00015730800488237135, "loss": 0.013, "step": 1502 }, { "epoch": 1.1266866566716642, "grad_norm": 0.17908072704859868, "learning_rate": 0.00015723648812873507, "loss": 0.0208, "step": 1503 }, { "epoch": 1.1274362818590704, "grad_norm": 0.1427235790172978, "learning_rate": 0.00015716492781122875, "loss": 0.0334, "step": 1504 }, { "epoch": 1.1281859070464768, "grad_norm": 0.09601915395576731, "learning_rate": 0.00015709332398431843, "loss": 0.0139, "step": 1505 }, { "epoch": 1.128935532233883, "grad_norm": 0.2653400088799801, "learning_rate": 0.0001570216767025032, "loss": 0.0296, "step": 1506 }, { "epoch": 1.1296851574212894, "grad_norm": 0.22056456709614147, "learning_rate": 0.0001569499860203153, "loss": 0.0343, "step": 1507 }, { "epoch": 1.1304347826086956, "grad_norm": 0.16808986480694899, "learning_rate": 0.00015687825199232, "loss": 0.0204, "step": 1508 }, { "epoch": 1.131184407796102, "grad_norm": 0.17447894186209695, "learning_rate": 0.00015680647467311557, "loss": 0.0234, "step": 1509 }, { "epoch": 1.1319340329835081, "grad_norm": 0.12815018624473096, "learning_rate": 0.00015673465411733322, "loss": 0.0227, "step": 1510 }, { "epoch": 1.1326836581709145, "grad_norm": 0.09165413310209258, "learning_rate": 0.00015666279037963697, "loss": 0.0111, "step": 1511 }, { "epoch": 1.133433283358321, "grad_norm": 0.07284436146532805, "learning_rate": 0.00015659088351472388, "loss": 0.0153, "step": 1512 }, { "epoch": 1.1341829085457271, "grad_norm": 0.1039038555643137, "learning_rate": 0.00015651893357732368, "loss": 0.0192, "step": 1513 }, { "epoch": 1.1349325337331335, "grad_norm": 0.12641273866798364, "learning_rate": 0.00015644694062219897, "loss": 0.0212, "step": 1514 }, { "epoch": 1.1356821589205397, "grad_norm": 0.1303955647396933, "learning_rate": 0.00015637490470414513, "loss": 0.0271, "step": 1515 }, { "epoch": 1.136431784107946, "grad_norm": 0.14201348367234287, "learning_rate": 0.00015630282587799008, "loss": 0.0315, "step": 1516 }, { "epoch": 1.1371814092953523, "grad_norm": 0.11012632959351705, "learning_rate": 0.00015623070419859455, "loss": 0.0103, "step": 1517 }, { "epoch": 1.1379310344827587, "grad_norm": 0.12460339857679491, "learning_rate": 0.00015615853972085185, "loss": 0.0172, "step": 1518 }, { "epoch": 1.1386806596701649, "grad_norm": 0.22870490943904723, "learning_rate": 0.00015608633249968783, "loss": 0.0318, "step": 1519 }, { "epoch": 1.1394302848575713, "grad_norm": 0.22798883978071888, "learning_rate": 0.0001560140825900609, "loss": 0.0598, "step": 1520 }, { "epoch": 1.1401799100449774, "grad_norm": 0.07875878857169033, "learning_rate": 0.00015594179004696192, "loss": 0.0087, "step": 1521 }, { "epoch": 1.1409295352323838, "grad_norm": 0.11520020041983646, "learning_rate": 0.0001558694549254143, "loss": 0.0096, "step": 1522 }, { "epoch": 1.1416791604197902, "grad_norm": 0.4275082260936782, "learning_rate": 0.00015579707728047377, "loss": 0.0275, "step": 1523 }, { "epoch": 1.1424287856071964, "grad_norm": 0.13461389601112744, "learning_rate": 0.0001557246571672284, "loss": 0.0128, "step": 1524 }, { "epoch": 1.1431784107946026, "grad_norm": 0.2116201093302502, "learning_rate": 0.00015565219464079867, "loss": 0.016, "step": 1525 }, { "epoch": 1.143928035982009, "grad_norm": 0.12566130628845476, "learning_rate": 0.0001555796897563373, "loss": 0.0196, "step": 1526 }, { "epoch": 1.1446776611694154, "grad_norm": 0.19729834701794738, "learning_rate": 0.00015550714256902924, "loss": 0.0229, "step": 1527 }, { "epoch": 1.1454272863568216, "grad_norm": 0.1943109518312495, "learning_rate": 0.00015543455313409167, "loss": 0.0286, "step": 1528 }, { "epoch": 1.146176911544228, "grad_norm": 0.15684836904901786, "learning_rate": 0.00015536192150677387, "loss": 0.0266, "step": 1529 }, { "epoch": 1.1469265367316341, "grad_norm": 0.19744182244166947, "learning_rate": 0.00015528924774235728, "loss": 0.0388, "step": 1530 }, { "epoch": 1.1476761619190405, "grad_norm": 0.1428512558705438, "learning_rate": 0.00015521653189615542, "loss": 0.0109, "step": 1531 }, { "epoch": 1.1484257871064467, "grad_norm": 0.09805806018139068, "learning_rate": 0.00015514377402351377, "loss": 0.0078, "step": 1532 }, { "epoch": 1.1491754122938531, "grad_norm": 0.1727329923961472, "learning_rate": 0.00015507097417980992, "loss": 0.0259, "step": 1533 }, { "epoch": 1.1499250374812593, "grad_norm": 0.0781432848720514, "learning_rate": 0.00015499813242045326, "loss": 0.0088, "step": 1534 }, { "epoch": 1.1506746626686657, "grad_norm": 0.1793998560755001, "learning_rate": 0.0001549252488008852, "loss": 0.0187, "step": 1535 }, { "epoch": 1.1514242878560719, "grad_norm": 0.20464042558859094, "learning_rate": 0.00015485232337657893, "loss": 0.0252, "step": 1536 }, { "epoch": 1.1521739130434783, "grad_norm": 0.15466608218391978, "learning_rate": 0.00015477935620303952, "loss": 0.0119, "step": 1537 }, { "epoch": 1.1529235382308847, "grad_norm": 0.14546232820077265, "learning_rate": 0.0001547063473358038, "loss": 0.0221, "step": 1538 }, { "epoch": 1.1536731634182908, "grad_norm": 0.20254604263054207, "learning_rate": 0.00015463329683044027, "loss": 0.0203, "step": 1539 }, { "epoch": 1.1544227886056972, "grad_norm": 0.11962821907793976, "learning_rate": 0.0001545602047425492, "loss": 0.0171, "step": 1540 }, { "epoch": 1.1551724137931034, "grad_norm": 0.17742009738804226, "learning_rate": 0.0001544870711277625, "loss": 0.0388, "step": 1541 }, { "epoch": 1.1559220389805098, "grad_norm": 0.13547950643645995, "learning_rate": 0.00015441389604174365, "loss": 0.0224, "step": 1542 }, { "epoch": 1.156671664167916, "grad_norm": 0.13957811910369117, "learning_rate": 0.00015434067954018773, "loss": 0.0181, "step": 1543 }, { "epoch": 1.1574212893553224, "grad_norm": 0.10593824435236912, "learning_rate": 0.00015426742167882131, "loss": 0.0192, "step": 1544 }, { "epoch": 1.1581709145427286, "grad_norm": 0.1923422077623307, "learning_rate": 0.0001541941225134025, "loss": 0.0223, "step": 1545 }, { "epoch": 1.158920539730135, "grad_norm": 0.09584418842068963, "learning_rate": 0.00015412078209972076, "loss": 0.01, "step": 1546 }, { "epoch": 1.1596701649175412, "grad_norm": 0.1928952002851109, "learning_rate": 0.00015404740049359696, "loss": 0.0392, "step": 1547 }, { "epoch": 1.1604197901049476, "grad_norm": 0.07336743595400493, "learning_rate": 0.00015397397775088347, "loss": 0.0151, "step": 1548 }, { "epoch": 1.1611694152923537, "grad_norm": 0.08828692920585819, "learning_rate": 0.0001539005139274637, "loss": 0.0124, "step": 1549 }, { "epoch": 1.1619190404797601, "grad_norm": 0.13783225187379142, "learning_rate": 0.0001538270090792526, "loss": 0.0238, "step": 1550 }, { "epoch": 1.1626686656671663, "grad_norm": 0.10249729690468168, "learning_rate": 0.00015375346326219617, "loss": 0.0081, "step": 1551 }, { "epoch": 1.1634182908545727, "grad_norm": 0.20748713541551198, "learning_rate": 0.00015367987653227164, "loss": 0.0179, "step": 1552 }, { "epoch": 1.1641679160419791, "grad_norm": 0.10779303756805979, "learning_rate": 0.00015360624894548744, "loss": 0.0193, "step": 1553 }, { "epoch": 1.1649175412293853, "grad_norm": 0.11649926171215857, "learning_rate": 0.00015353258055788297, "loss": 0.0119, "step": 1554 }, { "epoch": 1.1656671664167917, "grad_norm": 0.08919536536879394, "learning_rate": 0.0001534588714255288, "loss": 0.0113, "step": 1555 }, { "epoch": 1.1664167916041979, "grad_norm": 0.16624570100412386, "learning_rate": 0.00015338512160452647, "loss": 0.0192, "step": 1556 }, { "epoch": 1.1671664167916043, "grad_norm": 0.16784360601652096, "learning_rate": 0.00015331133115100847, "loss": 0.0172, "step": 1557 }, { "epoch": 1.1679160419790104, "grad_norm": 0.07319159417719012, "learning_rate": 0.00015323750012113827, "loss": 0.0169, "step": 1558 }, { "epoch": 1.1686656671664168, "grad_norm": 0.18373874199472398, "learning_rate": 0.00015316362857111012, "loss": 0.0216, "step": 1559 }, { "epoch": 1.169415292353823, "grad_norm": 0.06673105844962858, "learning_rate": 0.00015308971655714925, "loss": 0.0066, "step": 1560 }, { "epoch": 1.1701649175412294, "grad_norm": 0.21392038301444416, "learning_rate": 0.00015301576413551154, "loss": 0.0242, "step": 1561 }, { "epoch": 1.1709145427286356, "grad_norm": 0.18719939485446224, "learning_rate": 0.00015294177136248379, "loss": 0.0205, "step": 1562 }, { "epoch": 1.171664167916042, "grad_norm": 0.14217515743768686, "learning_rate": 0.00015286773829438334, "loss": 0.0176, "step": 1563 }, { "epoch": 1.1724137931034484, "grad_norm": 0.2329284232310269, "learning_rate": 0.00015279366498755837, "loss": 0.0314, "step": 1564 }, { "epoch": 1.1731634182908546, "grad_norm": 0.1225692888740097, "learning_rate": 0.0001527195514983875, "loss": 0.0139, "step": 1565 }, { "epoch": 1.1739130434782608, "grad_norm": 0.3449262128498415, "learning_rate": 0.00015264539788328012, "loss": 0.0237, "step": 1566 }, { "epoch": 1.1746626686656672, "grad_norm": 0.13106031341373675, "learning_rate": 0.00015257120419867603, "loss": 0.0076, "step": 1567 }, { "epoch": 1.1754122938530736, "grad_norm": 0.09537967707527366, "learning_rate": 0.0001524969705010456, "loss": 0.0089, "step": 1568 }, { "epoch": 1.1761619190404797, "grad_norm": 0.3558565450829918, "learning_rate": 0.00015242269684688956, "loss": 0.0376, "step": 1569 }, { "epoch": 1.1769115442278861, "grad_norm": 0.3121634965388302, "learning_rate": 0.00015234838329273922, "loss": 0.0205, "step": 1570 }, { "epoch": 1.1776611694152923, "grad_norm": 0.1452208720901486, "learning_rate": 0.00015227402989515608, "loss": 0.0185, "step": 1571 }, { "epoch": 1.1784107946026987, "grad_norm": 0.14467715052341956, "learning_rate": 0.00015219963671073204, "loss": 0.0212, "step": 1572 }, { "epoch": 1.1791604197901049, "grad_norm": 0.12022759396311145, "learning_rate": 0.00015212520379608932, "loss": 0.0211, "step": 1573 }, { "epoch": 1.1799100449775113, "grad_norm": 0.1459844281580083, "learning_rate": 0.00015205073120788036, "loss": 0.0128, "step": 1574 }, { "epoch": 1.1806596701649175, "grad_norm": 0.22936341003304037, "learning_rate": 0.0001519762190027877, "loss": 0.027, "step": 1575 }, { "epoch": 1.1814092953523239, "grad_norm": 0.19513599358000275, "learning_rate": 0.00015190166723752423, "loss": 0.0267, "step": 1576 }, { "epoch": 1.18215892053973, "grad_norm": 0.1167567522117949, "learning_rate": 0.00015182707596883275, "loss": 0.0206, "step": 1577 }, { "epoch": 1.1829085457271364, "grad_norm": 0.22445548930184916, "learning_rate": 0.00015175244525348624, "loss": 0.0355, "step": 1578 }, { "epoch": 1.1836581709145428, "grad_norm": 0.08670964454479742, "learning_rate": 0.00015167777514828767, "loss": 0.0129, "step": 1579 }, { "epoch": 1.184407796101949, "grad_norm": 0.14385266850244458, "learning_rate": 0.00015160306571006995, "loss": 0.0177, "step": 1580 }, { "epoch": 1.1851574212893554, "grad_norm": 0.08759140642886128, "learning_rate": 0.00015152831699569605, "loss": 0.0081, "step": 1581 }, { "epoch": 1.1859070464767616, "grad_norm": 0.1316505612026678, "learning_rate": 0.00015145352906205872, "loss": 0.0161, "step": 1582 }, { "epoch": 1.186656671664168, "grad_norm": 0.11305442190492103, "learning_rate": 0.0001513787019660806, "loss": 0.0168, "step": 1583 }, { "epoch": 1.1874062968515742, "grad_norm": 0.1521461905461006, "learning_rate": 0.00015130383576471415, "loss": 0.0136, "step": 1584 }, { "epoch": 1.1881559220389806, "grad_norm": 0.18712535443586656, "learning_rate": 0.00015122893051494152, "loss": 0.0273, "step": 1585 }, { "epoch": 1.1889055472263867, "grad_norm": 0.21720392213142015, "learning_rate": 0.0001511539862737747, "loss": 0.0252, "step": 1586 }, { "epoch": 1.1896551724137931, "grad_norm": 0.09435652810077971, "learning_rate": 0.00015107900309825528, "loss": 0.0124, "step": 1587 }, { "epoch": 1.1904047976011993, "grad_norm": 0.8196844795700977, "learning_rate": 0.0001510039810454545, "loss": 0.06, "step": 1588 }, { "epoch": 1.1911544227886057, "grad_norm": 0.13157054198911564, "learning_rate": 0.00015092892017247317, "loss": 0.0139, "step": 1589 }, { "epoch": 1.191904047976012, "grad_norm": 0.09674622860826092, "learning_rate": 0.0001508538205364417, "loss": 0.0119, "step": 1590 }, { "epoch": 1.1926536731634183, "grad_norm": 0.22671538004519848, "learning_rate": 0.00015077868219451993, "loss": 0.0248, "step": 1591 }, { "epoch": 1.1934032983508245, "grad_norm": 0.6288248427499471, "learning_rate": 0.0001507035052038972, "loss": 0.0329, "step": 1592 }, { "epoch": 1.1941529235382309, "grad_norm": 0.10526177711938337, "learning_rate": 0.00015062828962179232, "loss": 0.0149, "step": 1593 }, { "epoch": 1.1949025487256373, "grad_norm": 0.11834890522369187, "learning_rate": 0.00015055303550545336, "loss": 0.0138, "step": 1594 }, { "epoch": 1.1956521739130435, "grad_norm": 0.144173189523043, "learning_rate": 0.0001504777429121578, "loss": 0.0105, "step": 1595 }, { "epoch": 1.1964017991004499, "grad_norm": 0.13345656446876325, "learning_rate": 0.0001504024118992124, "loss": 0.0118, "step": 1596 }, { "epoch": 1.197151424287856, "grad_norm": 0.16313411751578458, "learning_rate": 0.00015032704252395315, "loss": 0.021, "step": 1597 }, { "epoch": 1.1979010494752624, "grad_norm": 0.17781430400215803, "learning_rate": 0.0001502516348437452, "loss": 0.0107, "step": 1598 }, { "epoch": 1.1986506746626686, "grad_norm": 0.30242153760052526, "learning_rate": 0.0001501761889159829, "loss": 0.0211, "step": 1599 }, { "epoch": 1.199400299850075, "grad_norm": 0.1716985732561441, "learning_rate": 0.0001501007047980897, "loss": 0.0193, "step": 1600 }, { "epoch": 1.2001499250374812, "grad_norm": 0.16363815472205612, "learning_rate": 0.00015002518254751817, "loss": 0.0158, "step": 1601 }, { "epoch": 1.2008995502248876, "grad_norm": 0.18657535077120233, "learning_rate": 0.00014994962222174976, "loss": 0.0137, "step": 1602 }, { "epoch": 1.2016491754122938, "grad_norm": 0.14295222527249, "learning_rate": 0.00014987402387829502, "loss": 0.0094, "step": 1603 }, { "epoch": 1.2023988005997002, "grad_norm": 0.13886556519199952, "learning_rate": 0.00014979838757469343, "loss": 0.014, "step": 1604 }, { "epoch": 1.2031484257871066, "grad_norm": 0.2803664879457345, "learning_rate": 0.00014972271336851332, "loss": 0.0216, "step": 1605 }, { "epoch": 1.2038980509745127, "grad_norm": 0.2360500791249835, "learning_rate": 0.0001496470013173519, "loss": 0.0193, "step": 1606 }, { "epoch": 1.204647676161919, "grad_norm": 0.1433081534684736, "learning_rate": 0.00014957125147883516, "loss": 0.02, "step": 1607 }, { "epoch": 1.2053973013493253, "grad_norm": 0.07375451718256198, "learning_rate": 0.00014949546391061785, "loss": 0.0215, "step": 1608 }, { "epoch": 1.2061469265367317, "grad_norm": 0.22419918033335512, "learning_rate": 0.0001494196386703835, "loss": 0.0308, "step": 1609 }, { "epoch": 1.206896551724138, "grad_norm": 0.17684058082971194, "learning_rate": 0.00014934377581584424, "loss": 0.0168, "step": 1610 }, { "epoch": 1.2076461769115443, "grad_norm": 0.3677656468075877, "learning_rate": 0.00014926787540474082, "loss": 0.0384, "step": 1611 }, { "epoch": 1.2083958020989505, "grad_norm": 0.18620787001438915, "learning_rate": 0.00014919193749484265, "loss": 0.0289, "step": 1612 }, { "epoch": 1.2091454272863569, "grad_norm": 0.16725832597887177, "learning_rate": 0.00014911596214394756, "loss": 0.0212, "step": 1613 }, { "epoch": 1.209895052473763, "grad_norm": 0.24480319746883517, "learning_rate": 0.00014903994940988207, "loss": 0.0285, "step": 1614 }, { "epoch": 1.2106446776611695, "grad_norm": 0.17387780125153765, "learning_rate": 0.00014896389935050092, "loss": 0.0202, "step": 1615 }, { "epoch": 1.2113943028485756, "grad_norm": 0.06439508057734938, "learning_rate": 0.00014888781202368744, "loss": 0.0103, "step": 1616 }, { "epoch": 1.212143928035982, "grad_norm": 0.635346044877298, "learning_rate": 0.00014881168748735328, "loss": 0.0779, "step": 1617 }, { "epoch": 1.2128935532233882, "grad_norm": 0.23203401133628196, "learning_rate": 0.00014873552579943838, "loss": 0.027, "step": 1618 }, { "epoch": 1.2136431784107946, "grad_norm": 0.21435827140817057, "learning_rate": 0.00014865932701791092, "loss": 0.0124, "step": 1619 }, { "epoch": 1.214392803598201, "grad_norm": 0.15654195261076978, "learning_rate": 0.00014858309120076738, "loss": 0.0253, "step": 1620 }, { "epoch": 1.2151424287856072, "grad_norm": 0.10613248035654502, "learning_rate": 0.0001485068184060325, "loss": 0.0161, "step": 1621 }, { "epoch": 1.2158920539730136, "grad_norm": 0.17812431129324593, "learning_rate": 0.00014843050869175895, "loss": 0.0228, "step": 1622 }, { "epoch": 1.2166416791604198, "grad_norm": 0.13830600870492107, "learning_rate": 0.0001483541621160277, "loss": 0.0115, "step": 1623 }, { "epoch": 1.2173913043478262, "grad_norm": 0.25717512053655084, "learning_rate": 0.0001482777787369477, "loss": 0.0286, "step": 1624 }, { "epoch": 1.2181409295352323, "grad_norm": 0.19518494420243698, "learning_rate": 0.00014820135861265586, "loss": 0.0224, "step": 1625 }, { "epoch": 1.2188905547226387, "grad_norm": 0.19758206936074088, "learning_rate": 0.00014812490180131715, "loss": 0.0319, "step": 1626 }, { "epoch": 1.219640179910045, "grad_norm": 0.4488302974160454, "learning_rate": 0.00014804840836112445, "loss": 0.0288, "step": 1627 }, { "epoch": 1.2203898050974513, "grad_norm": 0.1997144958727106, "learning_rate": 0.0001479718783502984, "loss": 0.0165, "step": 1628 }, { "epoch": 1.2211394302848575, "grad_norm": 0.2055089164972565, "learning_rate": 0.00014789531182708766, "loss": 0.0244, "step": 1629 }, { "epoch": 1.221889055472264, "grad_norm": 0.2692431065482803, "learning_rate": 0.0001478187088497686, "loss": 0.0228, "step": 1630 }, { "epoch": 1.22263868065967, "grad_norm": 0.20209950356916023, "learning_rate": 0.0001477420694766452, "loss": 0.0351, "step": 1631 }, { "epoch": 1.2233883058470765, "grad_norm": 0.19805813440782882, "learning_rate": 0.0001476653937660494, "loss": 0.0181, "step": 1632 }, { "epoch": 1.2241379310344827, "grad_norm": 0.15315490197525022, "learning_rate": 0.0001475886817763406, "loss": 0.0213, "step": 1633 }, { "epoch": 1.224887556221889, "grad_norm": 0.407888110454625, "learning_rate": 0.0001475119335659059, "loss": 0.0372, "step": 1634 }, { "epoch": 1.2256371814092955, "grad_norm": 0.1625357469940467, "learning_rate": 0.0001474351491931599, "loss": 0.0258, "step": 1635 }, { "epoch": 1.2263868065967016, "grad_norm": 0.15337239039044595, "learning_rate": 0.0001473583287165448, "loss": 0.0188, "step": 1636 }, { "epoch": 1.227136431784108, "grad_norm": 0.18287395919285426, "learning_rate": 0.00014728147219453025, "loss": 0.0246, "step": 1637 }, { "epoch": 1.2278860569715142, "grad_norm": 0.322448187413205, "learning_rate": 0.00014720457968561335, "loss": 0.033, "step": 1638 }, { "epoch": 1.2286356821589206, "grad_norm": 0.3009810363920835, "learning_rate": 0.0001471276512483185, "loss": 0.0282, "step": 1639 }, { "epoch": 1.2293853073463268, "grad_norm": 0.07444066072515641, "learning_rate": 0.00014705068694119758, "loss": 0.01, "step": 1640 }, { "epoch": 1.2301349325337332, "grad_norm": 0.1344181162415075, "learning_rate": 0.0001469736868228297, "loss": 0.0219, "step": 1641 }, { "epoch": 1.2308845577211394, "grad_norm": 0.09155453389791111, "learning_rate": 0.00014689665095182127, "loss": 0.0103, "step": 1642 }, { "epoch": 1.2316341829085458, "grad_norm": 0.26088856461980525, "learning_rate": 0.00014681957938680578, "loss": 0.0247, "step": 1643 }, { "epoch": 1.232383808095952, "grad_norm": 0.19509461351710658, "learning_rate": 0.00014674247218644405, "loss": 0.0141, "step": 1644 }, { "epoch": 1.2331334332833583, "grad_norm": 0.4607207890686461, "learning_rate": 0.00014666532940942398, "loss": 0.0246, "step": 1645 }, { "epoch": 1.2338830584707647, "grad_norm": 0.2869915343589999, "learning_rate": 0.0001465881511144605, "loss": 0.0348, "step": 1646 }, { "epoch": 1.234632683658171, "grad_norm": 0.14409772847201524, "learning_rate": 0.0001465109373602956, "loss": 0.0253, "step": 1647 }, { "epoch": 1.235382308845577, "grad_norm": 0.11869113344240399, "learning_rate": 0.00014643368820569825, "loss": 0.0149, "step": 1648 }, { "epoch": 1.2361319340329835, "grad_norm": 0.1286931104098736, "learning_rate": 0.0001463564037094644, "loss": 0.0108, "step": 1649 }, { "epoch": 1.23688155922039, "grad_norm": 0.11931274021012966, "learning_rate": 0.00014627908393041682, "loss": 0.0141, "step": 1650 }, { "epoch": 1.237631184407796, "grad_norm": 0.08123893228448557, "learning_rate": 0.00014620172892740524, "loss": 0.0083, "step": 1651 }, { "epoch": 1.2383808095952025, "grad_norm": 0.3291940283809328, "learning_rate": 0.00014612433875930611, "loss": 0.0284, "step": 1652 }, { "epoch": 1.2391304347826086, "grad_norm": 0.08529523042895133, "learning_rate": 0.0001460469134850227, "loss": 0.0125, "step": 1653 }, { "epoch": 1.239880059970015, "grad_norm": 0.16550478244779057, "learning_rate": 0.000145969453163485, "loss": 0.0157, "step": 1654 }, { "epoch": 1.2406296851574212, "grad_norm": 0.09919390179136478, "learning_rate": 0.0001458919578536496, "loss": 0.0164, "step": 1655 }, { "epoch": 1.2413793103448276, "grad_norm": 0.36606333145203, "learning_rate": 0.00014581442761449985, "loss": 0.0295, "step": 1656 }, { "epoch": 1.2421289355322338, "grad_norm": 0.28225111197774455, "learning_rate": 0.00014573686250504554, "loss": 0.0353, "step": 1657 }, { "epoch": 1.2428785607196402, "grad_norm": 0.13363727729556232, "learning_rate": 0.00014565926258432312, "loss": 0.0134, "step": 1658 }, { "epoch": 1.2436281859070464, "grad_norm": 0.09140998955860476, "learning_rate": 0.00014558162791139547, "loss": 0.013, "step": 1659 }, { "epoch": 1.2443778110944528, "grad_norm": 0.3050733321128361, "learning_rate": 0.00014550395854535196, "loss": 0.0606, "step": 1660 }, { "epoch": 1.2451274362818592, "grad_norm": 0.17081104749370116, "learning_rate": 0.00014542625454530832, "loss": 0.0332, "step": 1661 }, { "epoch": 1.2458770614692654, "grad_norm": 0.2857070156163614, "learning_rate": 0.00014534851597040665, "loss": 0.0389, "step": 1662 }, { "epoch": 1.2466266866566718, "grad_norm": 0.1122991717428358, "learning_rate": 0.0001452707428798154, "loss": 0.0129, "step": 1663 }, { "epoch": 1.247376311844078, "grad_norm": 0.1440697543098693, "learning_rate": 0.00014519293533272928, "loss": 0.0205, "step": 1664 }, { "epoch": 1.2481259370314843, "grad_norm": 0.16650823684741128, "learning_rate": 0.00014511509338836922, "loss": 0.0243, "step": 1665 }, { "epoch": 1.2488755622188905, "grad_norm": 0.08828955807548386, "learning_rate": 0.0001450372171059823, "loss": 0.019, "step": 1666 }, { "epoch": 1.249625187406297, "grad_norm": 0.14752096738070544, "learning_rate": 0.0001449593065448418, "loss": 0.0233, "step": 1667 }, { "epoch": 1.250374812593703, "grad_norm": 0.34921232162258087, "learning_rate": 0.00014488136176424703, "loss": 0.0379, "step": 1668 }, { "epoch": 1.2511244377811095, "grad_norm": 0.11129278503195328, "learning_rate": 0.00014480338282352336, "loss": 0.0267, "step": 1669 }, { "epoch": 1.2518740629685157, "grad_norm": 0.14544728814047606, "learning_rate": 0.00014472536978202218, "loss": 0.0165, "step": 1670 }, { "epoch": 1.252623688155922, "grad_norm": 0.22625628725911506, "learning_rate": 0.00014464732269912082, "loss": 0.0221, "step": 1671 }, { "epoch": 1.2533733133433285, "grad_norm": 0.13860636776343901, "learning_rate": 0.00014456924163422255, "loss": 0.0243, "step": 1672 }, { "epoch": 1.2541229385307346, "grad_norm": 0.1101369473658894, "learning_rate": 0.0001444911266467564, "loss": 0.0096, "step": 1673 }, { "epoch": 1.2548725637181408, "grad_norm": 0.2876119675897652, "learning_rate": 0.0001444129777961774, "loss": 0.0202, "step": 1674 }, { "epoch": 1.2556221889055472, "grad_norm": 0.29583502202052947, "learning_rate": 0.00014433479514196616, "loss": 0.0822, "step": 1675 }, { "epoch": 1.2563718140929536, "grad_norm": 0.21487573028514387, "learning_rate": 0.00014425657874362913, "loss": 0.0348, "step": 1676 }, { "epoch": 1.2571214392803598, "grad_norm": 0.11451811504867611, "learning_rate": 0.00014417832866069847, "loss": 0.0225, "step": 1677 }, { "epoch": 1.2578710644677662, "grad_norm": 0.2173650139609283, "learning_rate": 0.00014410004495273186, "loss": 0.0299, "step": 1678 }, { "epoch": 1.2586206896551724, "grad_norm": 0.3899048511072257, "learning_rate": 0.0001440217276793127, "loss": 0.0206, "step": 1679 }, { "epoch": 1.2593703148425788, "grad_norm": 0.3059178365171424, "learning_rate": 0.00014394337690004985, "loss": 0.0387, "step": 1680 }, { "epoch": 1.260119940029985, "grad_norm": 0.09967613478175827, "learning_rate": 0.00014386499267457774, "loss": 0.0157, "step": 1681 }, { "epoch": 1.2608695652173914, "grad_norm": 0.1172357014637228, "learning_rate": 0.00014378657506255616, "loss": 0.0136, "step": 1682 }, { "epoch": 1.2616191904047975, "grad_norm": 0.18697067489098376, "learning_rate": 0.00014370812412367042, "loss": 0.0378, "step": 1683 }, { "epoch": 1.262368815592204, "grad_norm": 0.19464374859211744, "learning_rate": 0.00014362963991763114, "loss": 0.0323, "step": 1684 }, { "epoch": 1.26311844077961, "grad_norm": 0.10611168216251465, "learning_rate": 0.0001435511225041742, "loss": 0.0149, "step": 1685 }, { "epoch": 1.2638680659670165, "grad_norm": 0.2880623504224047, "learning_rate": 0.00014347257194306093, "loss": 0.0464, "step": 1686 }, { "epoch": 1.264617691154423, "grad_norm": 0.15171315173537525, "learning_rate": 0.00014339398829407767, "loss": 0.0175, "step": 1687 }, { "epoch": 1.265367316341829, "grad_norm": 0.2888464808325791, "learning_rate": 0.0001433153716170361, "loss": 0.0406, "step": 1688 }, { "epoch": 1.2661169415292353, "grad_norm": 0.10076514229124206, "learning_rate": 0.00014323672197177304, "loss": 0.0251, "step": 1689 }, { "epoch": 1.2668665667166417, "grad_norm": 0.20768788736228558, "learning_rate": 0.00014315803941815028, "loss": 0.0293, "step": 1690 }, { "epoch": 1.267616191904048, "grad_norm": 0.20206611512917563, "learning_rate": 0.00014307932401605477, "loss": 0.0286, "step": 1691 }, { "epoch": 1.2683658170914542, "grad_norm": 0.15658400789301777, "learning_rate": 0.00014300057582539844, "loss": 0.0214, "step": 1692 }, { "epoch": 1.2691154422788606, "grad_norm": 0.14329880535204859, "learning_rate": 0.00014292179490611812, "loss": 0.0147, "step": 1693 }, { "epoch": 1.2698650674662668, "grad_norm": 0.12107833682851883, "learning_rate": 0.00014284298131817563, "loss": 0.02, "step": 1694 }, { "epoch": 1.2706146926536732, "grad_norm": 0.17914505564164096, "learning_rate": 0.00014276413512155756, "loss": 0.02, "step": 1695 }, { "epoch": 1.2713643178410794, "grad_norm": 0.21049302813792745, "learning_rate": 0.00014268525637627541, "loss": 0.0409, "step": 1696 }, { "epoch": 1.2721139430284858, "grad_norm": 0.08102354032890279, "learning_rate": 0.00014260634514236544, "loss": 0.0101, "step": 1697 }, { "epoch": 1.272863568215892, "grad_norm": 0.1820823122811385, "learning_rate": 0.00014252740147988857, "loss": 0.018, "step": 1698 }, { "epoch": 1.2736131934032984, "grad_norm": 0.12064252019808996, "learning_rate": 0.00014244842544893047, "loss": 0.0158, "step": 1699 }, { "epoch": 1.2743628185907045, "grad_norm": 0.18327142948153743, "learning_rate": 0.00014236941710960143, "loss": 0.0133, "step": 1700 }, { "epoch": 1.275112443778111, "grad_norm": 0.12017554093437577, "learning_rate": 0.00014229037652203628, "loss": 0.0142, "step": 1701 }, { "epoch": 1.2758620689655173, "grad_norm": 0.2064898698033883, "learning_rate": 0.00014221130374639454, "loss": 0.02, "step": 1702 }, { "epoch": 1.2766116941529235, "grad_norm": 0.11760686399607434, "learning_rate": 0.00014213219884286004, "loss": 0.007, "step": 1703 }, { "epoch": 1.2773613193403297, "grad_norm": 0.16875461561682745, "learning_rate": 0.00014205306187164117, "loss": 0.0222, "step": 1704 }, { "epoch": 1.278110944527736, "grad_norm": 0.21819466446133043, "learning_rate": 0.0001419738928929707, "loss": 0.0132, "step": 1705 }, { "epoch": 1.2788605697151425, "grad_norm": 0.07003267858663324, "learning_rate": 0.00014189469196710584, "loss": 0.0079, "step": 1706 }, { "epoch": 1.2796101949025487, "grad_norm": 0.23700881089716225, "learning_rate": 0.00014181545915432795, "loss": 0.0392, "step": 1707 }, { "epoch": 1.280359820089955, "grad_norm": 0.29435932277262994, "learning_rate": 0.0001417361945149428, "loss": 0.021, "step": 1708 }, { "epoch": 1.2811094452773613, "grad_norm": 0.14827007937512016, "learning_rate": 0.00014165689810928035, "loss": 0.0189, "step": 1709 }, { "epoch": 1.2818590704647677, "grad_norm": 0.11574931571363065, "learning_rate": 0.0001415775699976947, "loss": 0.0118, "step": 1710 }, { "epoch": 1.2826086956521738, "grad_norm": 0.2424262114701182, "learning_rate": 0.0001414982102405641, "loss": 0.0214, "step": 1711 }, { "epoch": 1.2833583208395802, "grad_norm": 0.1395124838749881, "learning_rate": 0.00014141881889829095, "loss": 0.0189, "step": 1712 }, { "epoch": 1.2841079460269866, "grad_norm": 0.2711886619263315, "learning_rate": 0.0001413393960313016, "loss": 0.0215, "step": 1713 }, { "epoch": 1.2848575712143928, "grad_norm": 0.4334505201331919, "learning_rate": 0.00014125994170004644, "loss": 0.0278, "step": 1714 }, { "epoch": 1.285607196401799, "grad_norm": 0.1254629293847402, "learning_rate": 0.0001411804559649998, "loss": 0.0117, "step": 1715 }, { "epoch": 1.2863568215892054, "grad_norm": 0.12718887367154608, "learning_rate": 0.0001411009388866599, "loss": 0.0171, "step": 1716 }, { "epoch": 1.2871064467766118, "grad_norm": 0.19073545908028275, "learning_rate": 0.00014102139052554882, "loss": 0.0176, "step": 1717 }, { "epoch": 1.287856071964018, "grad_norm": 0.13287567380141152, "learning_rate": 0.00014094181094221247, "loss": 0.0158, "step": 1718 }, { "epoch": 1.2886056971514244, "grad_norm": 0.0876281346085455, "learning_rate": 0.00014086220019722052, "loss": 0.0092, "step": 1719 }, { "epoch": 1.2893553223388305, "grad_norm": 0.1728777781891654, "learning_rate": 0.00014078255835116635, "loss": 0.0443, "step": 1720 }, { "epoch": 1.290104947526237, "grad_norm": 0.10517570102792818, "learning_rate": 0.00014070288546466696, "loss": 0.0135, "step": 1721 }, { "epoch": 1.2908545727136431, "grad_norm": 0.2599468156608288, "learning_rate": 0.00014062318159836304, "loss": 0.0437, "step": 1722 }, { "epoch": 1.2916041979010495, "grad_norm": 0.30636437280383555, "learning_rate": 0.00014054344681291888, "loss": 0.0445, "step": 1723 }, { "epoch": 1.2923538230884557, "grad_norm": 0.2589336307064375, "learning_rate": 0.00014046368116902227, "loss": 0.0236, "step": 1724 }, { "epoch": 1.293103448275862, "grad_norm": 0.18574128506800772, "learning_rate": 0.00014038388472738445, "loss": 0.0168, "step": 1725 }, { "epoch": 1.2938530734632683, "grad_norm": 0.07816126620288197, "learning_rate": 0.00014030405754874014, "loss": 0.0118, "step": 1726 }, { "epoch": 1.2946026986506747, "grad_norm": 0.10180035843931197, "learning_rate": 0.00014022419969384747, "loss": 0.0089, "step": 1727 }, { "epoch": 1.295352323838081, "grad_norm": 0.0925964533291629, "learning_rate": 0.0001401443112234879, "loss": 0.0086, "step": 1728 }, { "epoch": 1.2961019490254873, "grad_norm": 0.20432854716273183, "learning_rate": 0.0001400643921984662, "loss": 0.0226, "step": 1729 }, { "epoch": 1.2968515742128934, "grad_norm": 0.19414344538018496, "learning_rate": 0.00013998444267961031, "loss": 0.0201, "step": 1730 }, { "epoch": 1.2976011994002998, "grad_norm": 0.4773435279947028, "learning_rate": 0.00013990446272777157, "loss": 0.0371, "step": 1731 }, { "epoch": 1.2983508245877062, "grad_norm": 0.19809194521611023, "learning_rate": 0.00013982445240382426, "loss": 0.0243, "step": 1732 }, { "epoch": 1.2991004497751124, "grad_norm": 0.30608293614416343, "learning_rate": 0.00013974441176866598, "loss": 0.024, "step": 1733 }, { "epoch": 1.2998500749625188, "grad_norm": 0.16507027944955746, "learning_rate": 0.0001396643408832172, "loss": 0.0288, "step": 1734 }, { "epoch": 1.300599700149925, "grad_norm": 0.0654535038808333, "learning_rate": 0.00013958423980842163, "loss": 0.0055, "step": 1735 }, { "epoch": 1.3013493253373314, "grad_norm": 0.21694312244015637, "learning_rate": 0.00013950410860524577, "loss": 0.0329, "step": 1736 }, { "epoch": 1.3020989505247376, "grad_norm": 0.3783138781837035, "learning_rate": 0.00013942394733467916, "loss": 0.0341, "step": 1737 }, { "epoch": 1.302848575712144, "grad_norm": 0.14809260941669292, "learning_rate": 0.0001393437560577342, "loss": 0.0325, "step": 1738 }, { "epoch": 1.3035982008995504, "grad_norm": 0.1703374135823408, "learning_rate": 0.00013926353483544615, "loss": 0.0245, "step": 1739 }, { "epoch": 1.3043478260869565, "grad_norm": 0.1380282229997763, "learning_rate": 0.00013918328372887294, "loss": 0.0133, "step": 1740 }, { "epoch": 1.3050974512743627, "grad_norm": 0.06564048171598381, "learning_rate": 0.00013910300279909544, "loss": 0.0099, "step": 1741 }, { "epoch": 1.3058470764617691, "grad_norm": 0.11204069567707248, "learning_rate": 0.0001390226921072171, "loss": 0.0084, "step": 1742 }, { "epoch": 1.3065967016491755, "grad_norm": 0.10595645979145439, "learning_rate": 0.00013894235171436399, "loss": 0.0088, "step": 1743 }, { "epoch": 1.3073463268365817, "grad_norm": 0.2804525087464718, "learning_rate": 0.00013886198168168486, "loss": 0.0143, "step": 1744 }, { "epoch": 1.3080959520239879, "grad_norm": 0.15181289771519554, "learning_rate": 0.00013878158207035102, "loss": 0.0163, "step": 1745 }, { "epoch": 1.3088455772113943, "grad_norm": 0.10709342533170021, "learning_rate": 0.00013870115294155623, "loss": 0.0145, "step": 1746 }, { "epoch": 1.3095952023988007, "grad_norm": 0.16300325070420085, "learning_rate": 0.0001386206943565168, "loss": 0.0254, "step": 1747 }, { "epoch": 1.3103448275862069, "grad_norm": 0.07927816829120087, "learning_rate": 0.0001385402063764714, "loss": 0.0127, "step": 1748 }, { "epoch": 1.3110944527736133, "grad_norm": 0.14735840705021575, "learning_rate": 0.0001384596890626811, "loss": 0.0203, "step": 1749 }, { "epoch": 1.3118440779610194, "grad_norm": 0.2586943019227501, "learning_rate": 0.00013837914247642928, "loss": 0.0518, "step": 1750 }, { "epoch": 1.3125937031484258, "grad_norm": 0.12290073980900268, "learning_rate": 0.00013829856667902157, "loss": 0.014, "step": 1751 }, { "epoch": 1.313343328335832, "grad_norm": 0.1621805016410538, "learning_rate": 0.00013821796173178595, "loss": 0.0307, "step": 1752 }, { "epoch": 1.3140929535232384, "grad_norm": 0.21411790431804426, "learning_rate": 0.00013813732769607238, "loss": 0.0263, "step": 1753 }, { "epoch": 1.3148425787106448, "grad_norm": 0.11902217132590387, "learning_rate": 0.00013805666463325325, "loss": 0.0133, "step": 1754 }, { "epoch": 1.315592203898051, "grad_norm": 0.1449363778880504, "learning_rate": 0.00013797597260472273, "loss": 0.0291, "step": 1755 }, { "epoch": 1.3163418290854572, "grad_norm": 0.09905236252703252, "learning_rate": 0.00013789525167189726, "loss": 0.0139, "step": 1756 }, { "epoch": 1.3170914542728636, "grad_norm": 0.3075729201212794, "learning_rate": 0.00013781450189621517, "loss": 0.0285, "step": 1757 }, { "epoch": 1.31784107946027, "grad_norm": 0.35471879427394654, "learning_rate": 0.00013773372333913682, "loss": 0.0549, "step": 1758 }, { "epoch": 1.3185907046476761, "grad_norm": 0.12116073666563233, "learning_rate": 0.0001376529160621444, "loss": 0.0228, "step": 1759 }, { "epoch": 1.3193403298350825, "grad_norm": 0.21253060476515195, "learning_rate": 0.000137572080126742, "loss": 0.018, "step": 1760 }, { "epoch": 1.3200899550224887, "grad_norm": 0.09814476061104814, "learning_rate": 0.0001374912155944555, "loss": 0.0208, "step": 1761 }, { "epoch": 1.3208395802098951, "grad_norm": 0.1093144771005278, "learning_rate": 0.00013741032252683263, "loss": 0.015, "step": 1762 }, { "epoch": 1.3215892053973013, "grad_norm": 0.14958978266296255, "learning_rate": 0.00013732940098544268, "loss": 0.0166, "step": 1763 }, { "epoch": 1.3223388305847077, "grad_norm": 0.16548122115072086, "learning_rate": 0.00013724845103187678, "loss": 0.0161, "step": 1764 }, { "epoch": 1.3230884557721139, "grad_norm": 0.11831041233749966, "learning_rate": 0.00013716747272774754, "loss": 0.0199, "step": 1765 }, { "epoch": 1.3238380809595203, "grad_norm": 0.11784371645916646, "learning_rate": 0.00013708646613468925, "loss": 0.0233, "step": 1766 }, { "epoch": 1.3245877061469264, "grad_norm": 0.18173569247378074, "learning_rate": 0.0001370054313143577, "loss": 0.018, "step": 1767 }, { "epoch": 1.3253373313343328, "grad_norm": 0.7794282666624248, "learning_rate": 0.00013692436832843014, "loss": 0.0424, "step": 1768 }, { "epoch": 1.3260869565217392, "grad_norm": 0.18519517663898552, "learning_rate": 0.00013684327723860527, "loss": 0.0296, "step": 1769 }, { "epoch": 1.3268365817091454, "grad_norm": 0.18195460378044268, "learning_rate": 0.00013676215810660324, "loss": 0.0203, "step": 1770 }, { "epoch": 1.3275862068965516, "grad_norm": 0.19484493296286087, "learning_rate": 0.00013668101099416547, "loss": 0.0137, "step": 1771 }, { "epoch": 1.328335832083958, "grad_norm": 0.3283993250061566, "learning_rate": 0.00013659983596305466, "loss": 0.0288, "step": 1772 }, { "epoch": 1.3290854572713644, "grad_norm": 0.4043023801075964, "learning_rate": 0.0001365186330750548, "loss": 0.0611, "step": 1773 }, { "epoch": 1.3298350824587706, "grad_norm": 0.15411743278139997, "learning_rate": 0.0001364374023919712, "loss": 0.0119, "step": 1774 }, { "epoch": 1.330584707646177, "grad_norm": 0.19170968954122777, "learning_rate": 0.00013635614397563002, "loss": 0.0232, "step": 1775 }, { "epoch": 1.3313343328335832, "grad_norm": 0.17839369834152638, "learning_rate": 0.00013627485788787885, "loss": 0.0106, "step": 1776 }, { "epoch": 1.3320839580209896, "grad_norm": 0.25787303687719876, "learning_rate": 0.00013619354419058618, "loss": 0.0241, "step": 1777 }, { "epoch": 1.3328335832083957, "grad_norm": 0.47991060517567485, "learning_rate": 0.00013611220294564152, "loss": 0.0625, "step": 1778 }, { "epoch": 1.3335832083958021, "grad_norm": 0.2458517936158923, "learning_rate": 0.00013603083421495534, "loss": 0.0399, "step": 1779 }, { "epoch": 1.3343328335832085, "grad_norm": 0.05758846150700338, "learning_rate": 0.0001359494380604591, "loss": 0.0078, "step": 1780 }, { "epoch": 1.3350824587706147, "grad_norm": 0.12083746471620882, "learning_rate": 0.0001358680145441051, "loss": 0.0195, "step": 1781 }, { "epoch": 1.3358320839580209, "grad_norm": 0.36133090387357214, "learning_rate": 0.0001357865637278664, "loss": 0.0393, "step": 1782 }, { "epoch": 1.3365817091454273, "grad_norm": 0.18830531508318982, "learning_rate": 0.00013570508567373697, "loss": 0.021, "step": 1783 }, { "epoch": 1.3373313343328337, "grad_norm": 0.18900867761081533, "learning_rate": 0.00013562358044373135, "loss": 0.0195, "step": 1784 }, { "epoch": 1.3380809595202399, "grad_norm": 0.16864151254310344, "learning_rate": 0.00013554204809988489, "loss": 0.0148, "step": 1785 }, { "epoch": 1.338830584707646, "grad_norm": 0.1821482393097731, "learning_rate": 0.00013546048870425356, "loss": 0.0315, "step": 1786 }, { "epoch": 1.3395802098950524, "grad_norm": 0.22791765279336143, "learning_rate": 0.00013537890231891389, "loss": 0.0264, "step": 1787 }, { "epoch": 1.3403298350824588, "grad_norm": 0.11951864033785457, "learning_rate": 0.00013529728900596292, "loss": 0.0186, "step": 1788 }, { "epoch": 1.341079460269865, "grad_norm": 0.2607319327329998, "learning_rate": 0.00013521564882751824, "loss": 0.0306, "step": 1789 }, { "epoch": 1.3418290854572714, "grad_norm": 0.13017071114069304, "learning_rate": 0.0001351339818457179, "loss": 0.047, "step": 1790 }, { "epoch": 1.3425787106446776, "grad_norm": 0.1450659367892797, "learning_rate": 0.00013505228812272027, "loss": 0.028, "step": 1791 }, { "epoch": 1.343328335832084, "grad_norm": 0.14477929492194744, "learning_rate": 0.00013497056772070418, "loss": 0.0228, "step": 1792 }, { "epoch": 1.3440779610194902, "grad_norm": 0.22976772294738085, "learning_rate": 0.0001348888207018687, "loss": 0.0261, "step": 1793 }, { "epoch": 1.3448275862068966, "grad_norm": 0.18993895374099645, "learning_rate": 0.0001348070471284331, "loss": 0.0272, "step": 1794 }, { "epoch": 1.345577211394303, "grad_norm": 0.18607634443900709, "learning_rate": 0.00013472524706263704, "loss": 0.0164, "step": 1795 }, { "epoch": 1.3463268365817092, "grad_norm": 0.21608895921140736, "learning_rate": 0.00013464342056674014, "loss": 0.0266, "step": 1796 }, { "epoch": 1.3470764617691153, "grad_norm": 0.13419548434843545, "learning_rate": 0.00013456156770302232, "loss": 0.0182, "step": 1797 }, { "epoch": 1.3478260869565217, "grad_norm": 0.09113490437408855, "learning_rate": 0.00013447968853378339, "loss": 0.0164, "step": 1798 }, { "epoch": 1.3485757121439281, "grad_norm": 0.1880407479342471, "learning_rate": 0.00013439778312134335, "loss": 0.0304, "step": 1799 }, { "epoch": 1.3493253373313343, "grad_norm": 0.20552205705509324, "learning_rate": 0.00013431585152804204, "loss": 0.0174, "step": 1800 }, { "epoch": 1.3500749625187407, "grad_norm": 0.30216264696825906, "learning_rate": 0.00013423389381623933, "loss": 0.0389, "step": 1801 }, { "epoch": 1.3508245877061469, "grad_norm": 0.1466753916995075, "learning_rate": 0.00013415191004831486, "loss": 0.0214, "step": 1802 }, { "epoch": 1.3515742128935533, "grad_norm": 0.13263747900098585, "learning_rate": 0.00013406990028666825, "loss": 0.0249, "step": 1803 }, { "epoch": 1.3523238380809595, "grad_norm": 0.3684032218976006, "learning_rate": 0.0001339878645937187, "loss": 0.0269, "step": 1804 }, { "epoch": 1.3530734632683659, "grad_norm": 0.07399337775573261, "learning_rate": 0.0001339058030319054, "loss": 0.0119, "step": 1805 }, { "epoch": 1.353823088455772, "grad_norm": 0.2539273759373133, "learning_rate": 0.000133823715663687, "loss": 0.0347, "step": 1806 }, { "epoch": 1.3545727136431784, "grad_norm": 0.10545031650173455, "learning_rate": 0.00013374160255154197, "loss": 0.0188, "step": 1807 }, { "epoch": 1.3553223388305846, "grad_norm": 0.058772091983560384, "learning_rate": 0.00013365946375796817, "loss": 0.0065, "step": 1808 }, { "epoch": 1.356071964017991, "grad_norm": 0.11314109561850787, "learning_rate": 0.00013357729934548325, "loss": 0.0119, "step": 1809 }, { "epoch": 1.3568215892053974, "grad_norm": 0.06928897914135622, "learning_rate": 0.00013349510937662415, "loss": 0.0062, "step": 1810 }, { "epoch": 1.3575712143928036, "grad_norm": 0.11684449276906832, "learning_rate": 0.0001334128939139474, "loss": 0.0134, "step": 1811 }, { "epoch": 1.3583208395802098, "grad_norm": 0.23702047079708088, "learning_rate": 0.00013333065302002887, "loss": 0.0206, "step": 1812 }, { "epoch": 1.3590704647676162, "grad_norm": 0.2643609076687036, "learning_rate": 0.0001332483867574638, "loss": 0.0116, "step": 1813 }, { "epoch": 1.3598200899550226, "grad_norm": 0.13595700377100592, "learning_rate": 0.0001331660951888667, "loss": 0.0142, "step": 1814 }, { "epoch": 1.3605697151424287, "grad_norm": 0.13346544162743765, "learning_rate": 0.00013308377837687143, "loss": 0.0254, "step": 1815 }, { "epoch": 1.3613193403298351, "grad_norm": 0.1695190423319145, "learning_rate": 0.00013300143638413098, "loss": 0.0213, "step": 1816 }, { "epoch": 1.3620689655172413, "grad_norm": 0.10765569160056693, "learning_rate": 0.00013291906927331753, "loss": 0.01, "step": 1817 }, { "epoch": 1.3628185907046477, "grad_norm": 0.1354854839518553, "learning_rate": 0.00013283667710712243, "loss": 0.0066, "step": 1818 }, { "epoch": 1.363568215892054, "grad_norm": 0.22292484082791938, "learning_rate": 0.00013275425994825605, "loss": 0.0151, "step": 1819 }, { "epoch": 1.3643178410794603, "grad_norm": 0.14596732401139165, "learning_rate": 0.0001326718178594477, "loss": 0.0204, "step": 1820 }, { "epoch": 1.3650674662668667, "grad_norm": 0.17866809132707354, "learning_rate": 0.00013258935090344585, "loss": 0.0363, "step": 1821 }, { "epoch": 1.3658170914542729, "grad_norm": 0.17175637013243722, "learning_rate": 0.00013250685914301775, "loss": 0.0153, "step": 1822 }, { "epoch": 1.366566716641679, "grad_norm": 0.160782474163614, "learning_rate": 0.0001324243426409496, "loss": 0.0265, "step": 1823 }, { "epoch": 1.3673163418290855, "grad_norm": 0.16591399321605485, "learning_rate": 0.00013234180146004637, "loss": 0.0201, "step": 1824 }, { "epoch": 1.3680659670164919, "grad_norm": 0.30566822867483184, "learning_rate": 0.00013225923566313193, "loss": 0.0361, "step": 1825 }, { "epoch": 1.368815592203898, "grad_norm": 0.10134118807960453, "learning_rate": 0.00013217664531304875, "loss": 0.0153, "step": 1826 }, { "epoch": 1.3695652173913042, "grad_norm": 0.2673027205910499, "learning_rate": 0.00013209403047265803, "loss": 0.0313, "step": 1827 }, { "epoch": 1.3703148425787106, "grad_norm": 0.18874733815150369, "learning_rate": 0.00013201139120483968, "loss": 0.0241, "step": 1828 }, { "epoch": 1.371064467766117, "grad_norm": 0.11031481892631355, "learning_rate": 0.00013192872757249212, "loss": 0.0146, "step": 1829 }, { "epoch": 1.3718140929535232, "grad_norm": 0.18598840361478705, "learning_rate": 0.00013184603963853234, "loss": 0.0237, "step": 1830 }, { "epoch": 1.3725637181409296, "grad_norm": 0.04790301796130572, "learning_rate": 0.00013176332746589586, "loss": 0.008, "step": 1831 }, { "epoch": 1.3733133433283358, "grad_norm": 0.14298108928099823, "learning_rate": 0.00013168059111753658, "loss": 0.0205, "step": 1832 }, { "epoch": 1.3740629685157422, "grad_norm": 0.06819818800878674, "learning_rate": 0.00013159783065642684, "loss": 0.0051, "step": 1833 }, { "epoch": 1.3748125937031483, "grad_norm": 0.17992706431322583, "learning_rate": 0.00013151504614555734, "loss": 0.0183, "step": 1834 }, { "epoch": 1.3755622188905547, "grad_norm": 0.04716179303045061, "learning_rate": 0.0001314322376479371, "loss": 0.0056, "step": 1835 }, { "epoch": 1.3763118440779611, "grad_norm": 0.06108436289594824, "learning_rate": 0.0001313494052265934, "loss": 0.0075, "step": 1836 }, { "epoch": 1.3770614692653673, "grad_norm": 0.1114869532383892, "learning_rate": 0.0001312665489445716, "loss": 0.0163, "step": 1837 }, { "epoch": 1.3778110944527735, "grad_norm": 0.14439215967136948, "learning_rate": 0.0001311836688649354, "loss": 0.0242, "step": 1838 }, { "epoch": 1.37856071964018, "grad_norm": 0.13410528402626273, "learning_rate": 0.0001311007650507665, "loss": 0.0219, "step": 1839 }, { "epoch": 1.3793103448275863, "grad_norm": 0.10440850471031544, "learning_rate": 0.00013101783756516472, "loss": 0.014, "step": 1840 }, { "epoch": 1.3800599700149925, "grad_norm": 0.09242749157688925, "learning_rate": 0.0001309348864712479, "loss": 0.0126, "step": 1841 }, { "epoch": 1.3808095952023989, "grad_norm": 0.0887009346031432, "learning_rate": 0.00013085191183215177, "loss": 0.0209, "step": 1842 }, { "epoch": 1.381559220389805, "grad_norm": 0.2960216409085203, "learning_rate": 0.0001307689137110301, "loss": 0.0266, "step": 1843 }, { "epoch": 1.3823088455772115, "grad_norm": 0.068323210842311, "learning_rate": 0.00013068589217105441, "loss": 0.0055, "step": 1844 }, { "epoch": 1.3830584707646176, "grad_norm": 0.08638662473406213, "learning_rate": 0.00013060284727541414, "loss": 0.007, "step": 1845 }, { "epoch": 1.383808095952024, "grad_norm": 0.1702916586839961, "learning_rate": 0.00013051977908731644, "loss": 0.0218, "step": 1846 }, { "epoch": 1.3845577211394302, "grad_norm": 0.18436823215505085, "learning_rate": 0.00013043668766998627, "loss": 0.0125, "step": 1847 }, { "epoch": 1.3853073463268366, "grad_norm": 0.44702716210172605, "learning_rate": 0.0001303535730866662, "loss": 0.0215, "step": 1848 }, { "epoch": 1.3860569715142428, "grad_norm": 0.10089046147113483, "learning_rate": 0.0001302704354006164, "loss": 0.0197, "step": 1849 }, { "epoch": 1.3868065967016492, "grad_norm": 0.26863373480847147, "learning_rate": 0.00013018727467511474, "loss": 0.031, "step": 1850 }, { "epoch": 1.3875562218890556, "grad_norm": 0.22774278816662305, "learning_rate": 0.00013010409097345655, "loss": 0.0602, "step": 1851 }, { "epoch": 1.3883058470764618, "grad_norm": 0.14560522069571033, "learning_rate": 0.00013002088435895464, "loss": 0.0178, "step": 1852 }, { "epoch": 1.389055472263868, "grad_norm": 0.12305291457680514, "learning_rate": 0.00012993765489493928, "loss": 0.0216, "step": 1853 }, { "epoch": 1.3898050974512743, "grad_norm": 0.15409826230237533, "learning_rate": 0.00012985440264475813, "loss": 0.019, "step": 1854 }, { "epoch": 1.3905547226386807, "grad_norm": 0.21164672474158286, "learning_rate": 0.0001297711276717762, "loss": 0.0117, "step": 1855 }, { "epoch": 1.391304347826087, "grad_norm": 0.1355214980132497, "learning_rate": 0.00012968783003937577, "loss": 0.0219, "step": 1856 }, { "epoch": 1.3920539730134933, "grad_norm": 0.12625414203875335, "learning_rate": 0.00012960450981095643, "loss": 0.0169, "step": 1857 }, { "epoch": 1.3928035982008995, "grad_norm": 0.3024182672541223, "learning_rate": 0.00012952116704993482, "loss": 0.0168, "step": 1858 }, { "epoch": 1.393553223388306, "grad_norm": 0.11275880823677788, "learning_rate": 0.00012943780181974497, "loss": 0.0139, "step": 1859 }, { "epoch": 1.394302848575712, "grad_norm": 0.20070703000689094, "learning_rate": 0.0001293544141838378, "loss": 0.0212, "step": 1860 }, { "epoch": 1.3950524737631185, "grad_norm": 0.09003839234478617, "learning_rate": 0.00012927100420568132, "loss": 0.0151, "step": 1861 }, { "epoch": 1.3958020989505249, "grad_norm": 0.12825471757454604, "learning_rate": 0.00012918757194876066, "loss": 0.0154, "step": 1862 }, { "epoch": 1.396551724137931, "grad_norm": 0.08699576227833801, "learning_rate": 0.00012910411747657778, "loss": 0.0113, "step": 1863 }, { "epoch": 1.3973013493253372, "grad_norm": 0.21761113805906224, "learning_rate": 0.0001290206408526516, "loss": 0.0301, "step": 1864 }, { "epoch": 1.3980509745127436, "grad_norm": 0.1548273942024453, "learning_rate": 0.00012893714214051792, "loss": 0.014, "step": 1865 }, { "epoch": 1.39880059970015, "grad_norm": 0.09941037024754006, "learning_rate": 0.00012885362140372934, "loss": 0.0136, "step": 1866 }, { "epoch": 1.3995502248875562, "grad_norm": 0.18751823083887997, "learning_rate": 0.0001287700787058552, "loss": 0.0204, "step": 1867 }, { "epoch": 1.4002998500749624, "grad_norm": 0.15499241538245642, "learning_rate": 0.00012868651411048154, "loss": 0.0141, "step": 1868 }, { "epoch": 1.4010494752623688, "grad_norm": 0.12843498545392468, "learning_rate": 0.0001286029276812111, "loss": 0.0179, "step": 1869 }, { "epoch": 1.4017991004497752, "grad_norm": 0.4731933208232578, "learning_rate": 0.00012851931948166327, "loss": 0.0661, "step": 1870 }, { "epoch": 1.4025487256371814, "grad_norm": 0.14952964839476857, "learning_rate": 0.00012843568957547394, "loss": 0.0211, "step": 1871 }, { "epoch": 1.4032983508245878, "grad_norm": 0.17276714217465589, "learning_rate": 0.00012835203802629555, "loss": 0.0135, "step": 1872 }, { "epoch": 1.404047976011994, "grad_norm": 0.46872245156442344, "learning_rate": 0.00012826836489779708, "loss": 0.0497, "step": 1873 }, { "epoch": 1.4047976011994003, "grad_norm": 0.08363564829601658, "learning_rate": 0.00012818467025366376, "loss": 0.016, "step": 1874 }, { "epoch": 1.4055472263868065, "grad_norm": 0.10232915132284348, "learning_rate": 0.00012810095415759744, "loss": 0.0128, "step": 1875 }, { "epoch": 1.406296851574213, "grad_norm": 0.061288942568132866, "learning_rate": 0.00012801721667331605, "loss": 0.0089, "step": 1876 }, { "epoch": 1.4070464767616193, "grad_norm": 0.13128930961987118, "learning_rate": 0.000127933457864554, "loss": 0.0215, "step": 1877 }, { "epoch": 1.4077961019490255, "grad_norm": 0.11242897724452197, "learning_rate": 0.00012784967779506178, "loss": 0.0204, "step": 1878 }, { "epoch": 1.4085457271364317, "grad_norm": 0.10983058504415504, "learning_rate": 0.00012776587652860613, "loss": 0.0302, "step": 1879 }, { "epoch": 1.409295352323838, "grad_norm": 0.045545185026683276, "learning_rate": 0.00012768205412897, "loss": 0.0059, "step": 1880 }, { "epoch": 1.4100449775112445, "grad_norm": 0.15299570276557586, "learning_rate": 0.00012759821065995224, "loss": 0.0323, "step": 1881 }, { "epoch": 1.4107946026986506, "grad_norm": 0.08608560440679149, "learning_rate": 0.00012751434618536787, "loss": 0.0108, "step": 1882 }, { "epoch": 1.411544227886057, "grad_norm": 0.27803913808993475, "learning_rate": 0.00012743046076904793, "loss": 0.0201, "step": 1883 }, { "epoch": 1.4122938530734632, "grad_norm": 0.10612971680946036, "learning_rate": 0.0001273465544748393, "loss": 0.0099, "step": 1884 }, { "epoch": 1.4130434782608696, "grad_norm": 0.12436357529377723, "learning_rate": 0.00012726262736660474, "loss": 0.0284, "step": 1885 }, { "epoch": 1.4137931034482758, "grad_norm": 0.11607890022114893, "learning_rate": 0.00012717867950822297, "loss": 0.0101, "step": 1886 }, { "epoch": 1.4145427286356822, "grad_norm": 0.14770329705497223, "learning_rate": 0.00012709471096358842, "loss": 0.0315, "step": 1887 }, { "epoch": 1.4152923538230884, "grad_norm": 0.18604699056734458, "learning_rate": 0.00012701072179661127, "loss": 0.0293, "step": 1888 }, { "epoch": 1.4160419790104948, "grad_norm": 0.18470673903506774, "learning_rate": 0.0001269267120712174, "loss": 0.024, "step": 1889 }, { "epoch": 1.416791604197901, "grad_norm": 0.060328972475384775, "learning_rate": 0.00012684268185134834, "loss": 0.0059, "step": 1890 }, { "epoch": 1.4175412293853074, "grad_norm": 0.173380371633488, "learning_rate": 0.00012675863120096122, "loss": 0.0339, "step": 1891 }, { "epoch": 1.4182908545727138, "grad_norm": 0.11015965133252213, "learning_rate": 0.00012667456018402874, "loss": 0.0131, "step": 1892 }, { "epoch": 1.41904047976012, "grad_norm": 0.09097324818575385, "learning_rate": 0.00012659046886453904, "loss": 0.0116, "step": 1893 }, { "epoch": 1.419790104947526, "grad_norm": 0.1388229356745322, "learning_rate": 0.0001265063573064958, "loss": 0.013, "step": 1894 }, { "epoch": 1.4205397301349325, "grad_norm": 0.12052532519735364, "learning_rate": 0.00012642222557391808, "loss": 0.0134, "step": 1895 }, { "epoch": 1.421289355322339, "grad_norm": 0.15500408517715691, "learning_rate": 0.0001263380737308402, "loss": 0.0338, "step": 1896 }, { "epoch": 1.422038980509745, "grad_norm": 0.11227641668568701, "learning_rate": 0.0001262539018413119, "loss": 0.0159, "step": 1897 }, { "epoch": 1.4227886056971515, "grad_norm": 0.1628839753085024, "learning_rate": 0.00012616970996939814, "loss": 0.0173, "step": 1898 }, { "epoch": 1.4235382308845577, "grad_norm": 0.315608669835069, "learning_rate": 0.00012608549817917913, "loss": 0.0568, "step": 1899 }, { "epoch": 1.424287856071964, "grad_norm": 0.09860125372264825, "learning_rate": 0.00012600126653475012, "loss": 0.009, "step": 1900 }, { "epoch": 1.4250374812593702, "grad_norm": 0.13144368707140067, "learning_rate": 0.0001259170151002216, "loss": 0.0077, "step": 1901 }, { "epoch": 1.4257871064467766, "grad_norm": 0.0982665549847179, "learning_rate": 0.00012583274393971906, "loss": 0.0089, "step": 1902 }, { "epoch": 1.426536731634183, "grad_norm": 0.1386778146043919, "learning_rate": 0.000125748453117383, "loss": 0.011, "step": 1903 }, { "epoch": 1.4272863568215892, "grad_norm": 0.07388630267828614, "learning_rate": 0.0001256641426973689, "loss": 0.0117, "step": 1904 }, { "epoch": 1.4280359820089954, "grad_norm": 0.18284401715096338, "learning_rate": 0.0001255798127438472, "loss": 0.0084, "step": 1905 }, { "epoch": 1.4287856071964018, "grad_norm": 0.10679066875328382, "learning_rate": 0.0001254954633210031, "loss": 0.0132, "step": 1906 }, { "epoch": 1.4295352323838082, "grad_norm": 0.19195698414866189, "learning_rate": 0.00012541109449303676, "loss": 0.0216, "step": 1907 }, { "epoch": 1.4302848575712144, "grad_norm": 0.3666573212530311, "learning_rate": 0.00012532670632416293, "loss": 0.0326, "step": 1908 }, { "epoch": 1.4310344827586206, "grad_norm": 0.11101667561738876, "learning_rate": 0.00012524229887861133, "loss": 0.0171, "step": 1909 }, { "epoch": 1.431784107946027, "grad_norm": 0.08791025320937725, "learning_rate": 0.00012515787222062608, "loss": 0.0095, "step": 1910 }, { "epoch": 1.4325337331334334, "grad_norm": 0.1342867251099514, "learning_rate": 0.0001250734264144661, "loss": 0.0188, "step": 1911 }, { "epoch": 1.4332833583208395, "grad_norm": 0.09608017121352272, "learning_rate": 0.00012498896152440483, "loss": 0.0075, "step": 1912 }, { "epoch": 1.434032983508246, "grad_norm": 0.13020892593954997, "learning_rate": 0.00012490447761473022, "loss": 0.0083, "step": 1913 }, { "epoch": 1.434782608695652, "grad_norm": 0.09629403477170517, "learning_rate": 0.0001248199747497447, "loss": 0.0103, "step": 1914 }, { "epoch": 1.4355322338830585, "grad_norm": 0.13425262075259645, "learning_rate": 0.0001247354529937652, "loss": 0.0199, "step": 1915 }, { "epoch": 1.4362818590704647, "grad_norm": 0.08860400439856671, "learning_rate": 0.0001246509124111229, "loss": 0.0134, "step": 1916 }, { "epoch": 1.437031484257871, "grad_norm": 0.3389225893719704, "learning_rate": 0.00012456635306616344, "loss": 0.0534, "step": 1917 }, { "epoch": 1.4377811094452775, "grad_norm": 0.1685711891815974, "learning_rate": 0.00012448177502324663, "loss": 0.0216, "step": 1918 }, { "epoch": 1.4385307346326837, "grad_norm": 0.2572067238711098, "learning_rate": 0.0001243971783467466, "loss": 0.0159, "step": 1919 }, { "epoch": 1.4392803598200898, "grad_norm": 0.13640220342782158, "learning_rate": 0.00012431256310105158, "loss": 0.0183, "step": 1920 }, { "epoch": 1.4400299850074962, "grad_norm": 0.09959171259587224, "learning_rate": 0.000124227929350564, "loss": 0.0136, "step": 1921 }, { "epoch": 1.4407796101949026, "grad_norm": 0.22152896373552985, "learning_rate": 0.0001241432771597004, "loss": 0.0262, "step": 1922 }, { "epoch": 1.4415292353823088, "grad_norm": 0.15349376103343082, "learning_rate": 0.00012405860659289123, "loss": 0.0092, "step": 1923 }, { "epoch": 1.4422788605697152, "grad_norm": 0.14915283299222099, "learning_rate": 0.00012397391771458105, "loss": 0.0077, "step": 1924 }, { "epoch": 1.4430284857571214, "grad_norm": 0.12541301370117197, "learning_rate": 0.00012388921058922829, "loss": 0.0181, "step": 1925 }, { "epoch": 1.4437781109445278, "grad_norm": 0.1414621996215833, "learning_rate": 0.00012380448528130525, "loss": 0.0251, "step": 1926 }, { "epoch": 1.444527736131934, "grad_norm": 0.09985535961077165, "learning_rate": 0.00012371974185529817, "loss": 0.0087, "step": 1927 }, { "epoch": 1.4452773613193404, "grad_norm": 0.20252240412543931, "learning_rate": 0.000123634980375707, "loss": 0.0314, "step": 1928 }, { "epoch": 1.4460269865067465, "grad_norm": 0.07083834632533098, "learning_rate": 0.00012355020090704544, "loss": 0.0094, "step": 1929 }, { "epoch": 1.446776611694153, "grad_norm": 0.0988355632047036, "learning_rate": 0.0001234654035138409, "loss": 0.0153, "step": 1930 }, { "epoch": 1.4475262368815591, "grad_norm": 0.07249163434224347, "learning_rate": 0.00012338058826063438, "loss": 0.0104, "step": 1931 }, { "epoch": 1.4482758620689655, "grad_norm": 0.09806488951161967, "learning_rate": 0.00012329575521198054, "loss": 0.0138, "step": 1932 }, { "epoch": 1.449025487256372, "grad_norm": 0.13624110709087367, "learning_rate": 0.00012321090443244762, "loss": 0.0246, "step": 1933 }, { "epoch": 1.449775112443778, "grad_norm": 0.1179493925747529, "learning_rate": 0.0001231260359866172, "loss": 0.0054, "step": 1934 }, { "epoch": 1.4505247376311843, "grad_norm": 0.09174066897570911, "learning_rate": 0.00012304114993908448, "loss": 0.0189, "step": 1935 }, { "epoch": 1.4512743628185907, "grad_norm": 0.14148182215863198, "learning_rate": 0.00012295624635445796, "loss": 0.0266, "step": 1936 }, { "epoch": 1.452023988005997, "grad_norm": 0.2045256699403224, "learning_rate": 0.00012287132529735945, "loss": 0.0195, "step": 1937 }, { "epoch": 1.4527736131934033, "grad_norm": 0.25985288542875007, "learning_rate": 0.0001227863868324242, "loss": 0.0242, "step": 1938 }, { "epoch": 1.4535232383808097, "grad_norm": 0.21860590313539296, "learning_rate": 0.00012270143102430055, "loss": 0.0396, "step": 1939 }, { "epoch": 1.4542728635682158, "grad_norm": 0.07925976743388066, "learning_rate": 0.0001226164579376502, "loss": 0.0083, "step": 1940 }, { "epoch": 1.4550224887556222, "grad_norm": 0.24388208826017888, "learning_rate": 0.00012253146763714783, "loss": 0.0148, "step": 1941 }, { "epoch": 1.4557721139430284, "grad_norm": 0.12812885050817915, "learning_rate": 0.00012244646018748134, "loss": 0.016, "step": 1942 }, { "epoch": 1.4565217391304348, "grad_norm": 0.12975353474198595, "learning_rate": 0.00012236143565335164, "loss": 0.0158, "step": 1943 }, { "epoch": 1.4572713643178412, "grad_norm": 0.21293324073380665, "learning_rate": 0.00012227639409947272, "loss": 0.0349, "step": 1944 }, { "epoch": 1.4580209895052474, "grad_norm": 0.223606496350551, "learning_rate": 0.0001221913355905714, "loss": 0.021, "step": 1945 }, { "epoch": 1.4587706146926536, "grad_norm": 0.40452905855669036, "learning_rate": 0.00012210626019138748, "loss": 0.0524, "step": 1946 }, { "epoch": 1.45952023988006, "grad_norm": 0.20036469604349294, "learning_rate": 0.00012202116796667356, "loss": 0.0191, "step": 1947 }, { "epoch": 1.4602698650674664, "grad_norm": 0.20366839982219276, "learning_rate": 0.00012193605898119514, "loss": 0.0296, "step": 1948 }, { "epoch": 1.4610194902548725, "grad_norm": 0.18687874497395973, "learning_rate": 0.00012185093329973038, "loss": 0.0203, "step": 1949 }, { "epoch": 1.461769115442279, "grad_norm": 0.09124165683037777, "learning_rate": 0.00012176579098707023, "loss": 0.0118, "step": 1950 }, { "epoch": 1.4625187406296851, "grad_norm": 0.2532954888220123, "learning_rate": 0.0001216806321080182, "loss": 0.0285, "step": 1951 }, { "epoch": 1.4632683658170915, "grad_norm": 0.1361058594029782, "learning_rate": 0.00012159545672739054, "loss": 0.0146, "step": 1952 }, { "epoch": 1.4640179910044977, "grad_norm": 0.10342119836800451, "learning_rate": 0.00012151026491001591, "loss": 0.0073, "step": 1953 }, { "epoch": 1.464767616191904, "grad_norm": 0.16486238138863205, "learning_rate": 0.00012142505672073564, "loss": 0.0193, "step": 1954 }, { "epoch": 1.4655172413793103, "grad_norm": 0.13536447693810422, "learning_rate": 0.00012133983222440335, "loss": 0.0491, "step": 1955 }, { "epoch": 1.4662668665667167, "grad_norm": 0.23330516206788135, "learning_rate": 0.00012125459148588523, "loss": 0.0301, "step": 1956 }, { "epoch": 1.4670164917541229, "grad_norm": 0.15292897747369386, "learning_rate": 0.00012116933457005972, "loss": 0.0167, "step": 1957 }, { "epoch": 1.4677661169415293, "grad_norm": 0.1901558613459153, "learning_rate": 0.00012108406154181766, "loss": 0.0093, "step": 1958 }, { "epoch": 1.4685157421289357, "grad_norm": 0.2138954820034095, "learning_rate": 0.00012099877246606206, "loss": 0.0241, "step": 1959 }, { "epoch": 1.4692653673163418, "grad_norm": 0.07976981594385227, "learning_rate": 0.00012091346740770821, "loss": 0.0156, "step": 1960 }, { "epoch": 1.470014992503748, "grad_norm": 0.08969290312785852, "learning_rate": 0.00012082814643168357, "loss": 0.0141, "step": 1961 }, { "epoch": 1.4707646176911544, "grad_norm": 0.08385247469572182, "learning_rate": 0.00012074280960292763, "loss": 0.0082, "step": 1962 }, { "epoch": 1.4715142428785608, "grad_norm": 0.11224097369553376, "learning_rate": 0.00012065745698639208, "loss": 0.0077, "step": 1963 }, { "epoch": 1.472263868065967, "grad_norm": 0.6110527097464731, "learning_rate": 0.00012057208864704051, "loss": 0.0154, "step": 1964 }, { "epoch": 1.4730134932533734, "grad_norm": 0.20503801470472588, "learning_rate": 0.0001204867046498485, "loss": 0.0238, "step": 1965 }, { "epoch": 1.4737631184407796, "grad_norm": 0.058020772911770324, "learning_rate": 0.00012040130505980365, "loss": 0.0052, "step": 1966 }, { "epoch": 1.474512743628186, "grad_norm": 0.2052620451143929, "learning_rate": 0.00012031588994190526, "loss": 0.0168, "step": 1967 }, { "epoch": 1.4752623688155921, "grad_norm": 0.09603284968761991, "learning_rate": 0.00012023045936116459, "loss": 0.0098, "step": 1968 }, { "epoch": 1.4760119940029985, "grad_norm": 0.1805164431825691, "learning_rate": 0.00012014501338260462, "loss": 0.007, "step": 1969 }, { "epoch": 1.4767616191904047, "grad_norm": 0.18701411878470012, "learning_rate": 0.00012005955207126, "loss": 0.0204, "step": 1970 }, { "epoch": 1.4775112443778111, "grad_norm": 0.19871909726227946, "learning_rate": 0.00011997407549217717, "loss": 0.0299, "step": 1971 }, { "epoch": 1.4782608695652173, "grad_norm": 0.09369152034670397, "learning_rate": 0.000119888583710414, "loss": 0.0134, "step": 1972 }, { "epoch": 1.4790104947526237, "grad_norm": 0.2156334124260664, "learning_rate": 0.00011980307679104015, "loss": 0.0197, "step": 1973 }, { "epoch": 1.47976011994003, "grad_norm": 0.09856555218171896, "learning_rate": 0.00011971755479913665, "loss": 0.0096, "step": 1974 }, { "epoch": 1.4805097451274363, "grad_norm": 0.320903279814656, "learning_rate": 0.00011963201779979604, "loss": 0.02, "step": 1975 }, { "epoch": 1.4812593703148424, "grad_norm": 0.07879126478261443, "learning_rate": 0.00011954646585812233, "loss": 0.0081, "step": 1976 }, { "epoch": 1.4820089955022488, "grad_norm": 0.17444491435700865, "learning_rate": 0.00011946089903923081, "loss": 0.0274, "step": 1977 }, { "epoch": 1.4827586206896552, "grad_norm": 0.12417897909896483, "learning_rate": 0.00011937531740824819, "loss": 0.017, "step": 1978 }, { "epoch": 1.4835082458770614, "grad_norm": 0.30630006591297376, "learning_rate": 0.00011928972103031242, "loss": 0.0366, "step": 1979 }, { "epoch": 1.4842578710644678, "grad_norm": 0.2002393018537466, "learning_rate": 0.00011920410997057265, "loss": 0.0079, "step": 1980 }, { "epoch": 1.485007496251874, "grad_norm": 0.26995257634941094, "learning_rate": 0.00011911848429418923, "loss": 0.018, "step": 1981 }, { "epoch": 1.4857571214392804, "grad_norm": 0.23129376115177372, "learning_rate": 0.00011903284406633361, "loss": 0.0159, "step": 1982 }, { "epoch": 1.4865067466266866, "grad_norm": 0.21531353753419102, "learning_rate": 0.00011894718935218836, "loss": 0.021, "step": 1983 }, { "epoch": 1.487256371814093, "grad_norm": 0.1347198207422198, "learning_rate": 0.000118861520216947, "loss": 0.0263, "step": 1984 }, { "epoch": 1.4880059970014994, "grad_norm": 0.10419003906180573, "learning_rate": 0.00011877583672581411, "loss": 0.0132, "step": 1985 }, { "epoch": 1.4887556221889056, "grad_norm": 0.1425071654540163, "learning_rate": 0.00011869013894400516, "loss": 0.0105, "step": 1986 }, { "epoch": 1.4895052473763117, "grad_norm": 0.09482127929336295, "learning_rate": 0.00011860442693674647, "loss": 0.0074, "step": 1987 }, { "epoch": 1.4902548725637181, "grad_norm": 0.1545877408511147, "learning_rate": 0.0001185187007692752, "loss": 0.0198, "step": 1988 }, { "epoch": 1.4910044977511245, "grad_norm": 0.1406951822773715, "learning_rate": 0.00011843296050683937, "loss": 0.0091, "step": 1989 }, { "epoch": 1.4917541229385307, "grad_norm": 0.1092127925059156, "learning_rate": 0.00011834720621469762, "loss": 0.0187, "step": 1990 }, { "epoch": 1.4925037481259371, "grad_norm": 0.08263137479052818, "learning_rate": 0.00011826143795811929, "loss": 0.0128, "step": 1991 }, { "epoch": 1.4932533733133433, "grad_norm": 0.18058529469223594, "learning_rate": 0.00011817565580238442, "loss": 0.0174, "step": 1992 }, { "epoch": 1.4940029985007497, "grad_norm": 0.2926184202629552, "learning_rate": 0.00011808985981278353, "loss": 0.0417, "step": 1993 }, { "epoch": 1.4947526236881559, "grad_norm": 0.15063107867199743, "learning_rate": 0.00011800405005461772, "loss": 0.025, "step": 1994 }, { "epoch": 1.4955022488755623, "grad_norm": 0.09316966552874384, "learning_rate": 0.00011791822659319858, "loss": 0.0184, "step": 1995 }, { "epoch": 1.4962518740629684, "grad_norm": 0.3583120976917404, "learning_rate": 0.0001178323894938481, "loss": 0.0192, "step": 1996 }, { "epoch": 1.4970014992503748, "grad_norm": 0.06682455310812929, "learning_rate": 0.00011774653882189867, "loss": 0.0106, "step": 1997 }, { "epoch": 1.497751124437781, "grad_norm": 0.11924451335046508, "learning_rate": 0.00011766067464269301, "loss": 0.0184, "step": 1998 }, { "epoch": 1.4985007496251874, "grad_norm": 0.21903498673009728, "learning_rate": 0.00011757479702158412, "loss": 0.0362, "step": 1999 }, { "epoch": 1.4992503748125938, "grad_norm": 0.14552346714109463, "learning_rate": 0.00011748890602393521, "loss": 0.0131, "step": 2000 }, { "epoch": 1.5, "grad_norm": 0.38878473530349916, "learning_rate": 0.00011740300171511969, "loss": 0.0343, "step": 2001 }, { "epoch": 1.5007496251874062, "grad_norm": 0.1819908689214082, "learning_rate": 0.0001173170841605211, "loss": 0.0098, "step": 2002 }, { "epoch": 1.5014992503748126, "grad_norm": 0.22899646815252692, "learning_rate": 0.00011723115342553306, "loss": 0.0215, "step": 2003 }, { "epoch": 1.502248875562219, "grad_norm": 0.11740484135767436, "learning_rate": 0.00011714520957555924, "loss": 0.0113, "step": 2004 }, { "epoch": 1.5029985007496252, "grad_norm": 0.3885868302596495, "learning_rate": 0.00011705925267601326, "loss": 0.043, "step": 2005 }, { "epoch": 1.5037481259370313, "grad_norm": 0.29570525506634887, "learning_rate": 0.00011697328279231868, "loss": 0.0335, "step": 2006 }, { "epoch": 1.5044977511244377, "grad_norm": 0.17289926376387163, "learning_rate": 0.00011688729998990897, "loss": 0.0305, "step": 2007 }, { "epoch": 1.5052473763118441, "grad_norm": 0.5384751241597092, "learning_rate": 0.0001168013043342274, "loss": 0.0403, "step": 2008 }, { "epoch": 1.5059970014992503, "grad_norm": 0.35965339130185314, "learning_rate": 0.00011671529589072702, "loss": 0.0202, "step": 2009 }, { "epoch": 1.5067466266866567, "grad_norm": 0.3715537347174127, "learning_rate": 0.00011662927472487064, "loss": 0.0589, "step": 2010 }, { "epoch": 1.507496251874063, "grad_norm": 0.1289465699723788, "learning_rate": 0.00011654324090213073, "loss": 0.0247, "step": 2011 }, { "epoch": 1.5082458770614693, "grad_norm": 0.2841186121236052, "learning_rate": 0.00011645719448798943, "loss": 0.0191, "step": 2012 }, { "epoch": 1.5089955022488755, "grad_norm": 0.08209758171226106, "learning_rate": 0.00011637113554793846, "loss": 0.0065, "step": 2013 }, { "epoch": 1.5097451274362819, "grad_norm": 0.13132648526713056, "learning_rate": 0.000116285064147479, "loss": 0.0125, "step": 2014 }, { "epoch": 1.5104947526236883, "grad_norm": 0.1583770195328219, "learning_rate": 0.00011619898035212174, "loss": 0.0265, "step": 2015 }, { "epoch": 1.5112443778110944, "grad_norm": 0.35884442000176264, "learning_rate": 0.00011611288422738691, "loss": 0.0298, "step": 2016 }, { "epoch": 1.5119940029985006, "grad_norm": 0.11422568057412678, "learning_rate": 0.00011602677583880401, "loss": 0.017, "step": 2017 }, { "epoch": 1.512743628185907, "grad_norm": 0.2073809987153342, "learning_rate": 0.00011594065525191188, "loss": 0.0237, "step": 2018 }, { "epoch": 1.5134932533733134, "grad_norm": 0.18272649079364311, "learning_rate": 0.00011585452253225872, "loss": 0.0252, "step": 2019 }, { "epoch": 1.5142428785607196, "grad_norm": 0.26491460784372994, "learning_rate": 0.0001157683777454019, "loss": 0.0433, "step": 2020 }, { "epoch": 1.5149925037481258, "grad_norm": 0.1556835860082349, "learning_rate": 0.00011568222095690797, "loss": 0.0176, "step": 2021 }, { "epoch": 1.5157421289355324, "grad_norm": 0.08848124098762954, "learning_rate": 0.00011559605223235268, "loss": 0.0124, "step": 2022 }, { "epoch": 1.5164917541229386, "grad_norm": 0.12709504996224588, "learning_rate": 0.00011550987163732075, "loss": 0.0195, "step": 2023 }, { "epoch": 1.5172413793103448, "grad_norm": 0.1158807293852502, "learning_rate": 0.00011542367923740606, "loss": 0.0206, "step": 2024 }, { "epoch": 1.5179910044977512, "grad_norm": 0.2924531296368726, "learning_rate": 0.00011533747509821145, "loss": 0.0287, "step": 2025 }, { "epoch": 1.5187406296851576, "grad_norm": 0.10536500142368622, "learning_rate": 0.00011525125928534859, "loss": 0.007, "step": 2026 }, { "epoch": 1.5194902548725637, "grad_norm": 0.12511524507356026, "learning_rate": 0.00011516503186443814, "loss": 0.0173, "step": 2027 }, { "epoch": 1.52023988005997, "grad_norm": 0.2148503368956103, "learning_rate": 0.0001150787929011096, "loss": 0.0236, "step": 2028 }, { "epoch": 1.5209895052473763, "grad_norm": 0.15375775401892824, "learning_rate": 0.00011499254246100118, "loss": 0.0152, "step": 2029 }, { "epoch": 1.5217391304347827, "grad_norm": 0.23824273067912238, "learning_rate": 0.0001149062806097599, "loss": 0.0492, "step": 2030 }, { "epoch": 1.5224887556221889, "grad_norm": 0.23191671523223392, "learning_rate": 0.0001148200074130414, "loss": 0.0339, "step": 2031 }, { "epoch": 1.523238380809595, "grad_norm": 0.13047509312978461, "learning_rate": 0.00011473372293650999, "loss": 0.0169, "step": 2032 }, { "epoch": 1.5239880059970015, "grad_norm": 0.14189321877652575, "learning_rate": 0.00011464742724583859, "loss": 0.012, "step": 2033 }, { "epoch": 1.5247376311844079, "grad_norm": 0.07837067512463858, "learning_rate": 0.00011456112040670859, "loss": 0.0104, "step": 2034 }, { "epoch": 1.525487256371814, "grad_norm": 0.10328103959029392, "learning_rate": 0.00011447480248480996, "loss": 0.007, "step": 2035 }, { "epoch": 1.5262368815592204, "grad_norm": 0.172214593174865, "learning_rate": 0.00011438847354584097, "loss": 0.0121, "step": 2036 }, { "epoch": 1.5269865067466268, "grad_norm": 0.08022427081753603, "learning_rate": 0.00011430213365550842, "loss": 0.0109, "step": 2037 }, { "epoch": 1.527736131934033, "grad_norm": 0.14878089897566166, "learning_rate": 0.00011421578287952735, "loss": 0.0059, "step": 2038 }, { "epoch": 1.5284857571214392, "grad_norm": 0.11671209020576799, "learning_rate": 0.0001141294212836211, "loss": 0.0148, "step": 2039 }, { "epoch": 1.5292353823088456, "grad_norm": 0.08369220048037276, "learning_rate": 0.00011404304893352132, "loss": 0.01, "step": 2040 }, { "epoch": 1.529985007496252, "grad_norm": 0.15577552287134389, "learning_rate": 0.0001139566658949677, "loss": 0.0144, "step": 2041 }, { "epoch": 1.5307346326836582, "grad_norm": 0.12315101821922537, "learning_rate": 0.00011387027223370821, "loss": 0.0072, "step": 2042 }, { "epoch": 1.5314842578710643, "grad_norm": 0.29388303557281226, "learning_rate": 0.00011378386801549883, "loss": 0.0131, "step": 2043 }, { "epoch": 1.5322338830584707, "grad_norm": 0.13980800324774958, "learning_rate": 0.0001136974533061036, "loss": 0.0174, "step": 2044 }, { "epoch": 1.5329835082458771, "grad_norm": 0.06204640754946424, "learning_rate": 0.00011361102817129451, "loss": 0.0076, "step": 2045 }, { "epoch": 1.5337331334332833, "grad_norm": 0.12680010891087348, "learning_rate": 0.00011352459267685152, "loss": 0.0158, "step": 2046 }, { "epoch": 1.5344827586206895, "grad_norm": 0.18932679871696698, "learning_rate": 0.00011343814688856246, "loss": 0.0263, "step": 2047 }, { "epoch": 1.535232383808096, "grad_norm": 0.11530877706564625, "learning_rate": 0.00011335169087222304, "loss": 0.0185, "step": 2048 }, { "epoch": 1.5359820089955023, "grad_norm": 0.2879999540228137, "learning_rate": 0.00011326522469363666, "loss": 0.0399, "step": 2049 }, { "epoch": 1.5367316341829085, "grad_norm": 0.10544309045823727, "learning_rate": 0.00011317874841861454, "loss": 0.009, "step": 2050 }, { "epoch": 1.5374812593703149, "grad_norm": 0.1720851235854106, "learning_rate": 0.00011309226211297552, "loss": 0.0136, "step": 2051 }, { "epoch": 1.5382308845577213, "grad_norm": 0.061021940481680546, "learning_rate": 0.00011300576584254617, "loss": 0.0091, "step": 2052 }, { "epoch": 1.5389805097451275, "grad_norm": 0.24495019966265658, "learning_rate": 0.00011291925967316055, "loss": 0.029, "step": 2053 }, { "epoch": 1.5397301349325336, "grad_norm": 0.19048454116953006, "learning_rate": 0.00011283274367066028, "loss": 0.0176, "step": 2054 }, { "epoch": 1.54047976011994, "grad_norm": 0.1966686629949861, "learning_rate": 0.00011274621790089445, "loss": 0.0203, "step": 2055 }, { "epoch": 1.5412293853073464, "grad_norm": 0.10997093120263526, "learning_rate": 0.00011265968242971967, "loss": 0.0097, "step": 2056 }, { "epoch": 1.5419790104947526, "grad_norm": 0.17625970195223611, "learning_rate": 0.00011257313732299984, "loss": 0.0122, "step": 2057 }, { "epoch": 1.5427286356821588, "grad_norm": 0.1733777878808708, "learning_rate": 0.00011248658264660617, "loss": 0.022, "step": 2058 }, { "epoch": 1.5434782608695652, "grad_norm": 0.12541753676579082, "learning_rate": 0.00011240001846641727, "loss": 0.016, "step": 2059 }, { "epoch": 1.5442278860569716, "grad_norm": 0.13712378929986008, "learning_rate": 0.00011231344484831894, "loss": 0.0157, "step": 2060 }, { "epoch": 1.5449775112443778, "grad_norm": 0.13682059079314143, "learning_rate": 0.00011222686185820409, "loss": 0.0215, "step": 2061 }, { "epoch": 1.545727136431784, "grad_norm": 0.13012752394248844, "learning_rate": 0.00011214026956197281, "loss": 0.0261, "step": 2062 }, { "epoch": 1.5464767616191906, "grad_norm": 0.11619923535577313, "learning_rate": 0.0001120536680255323, "loss": 0.0186, "step": 2063 }, { "epoch": 1.5472263868065967, "grad_norm": 0.10045403290997133, "learning_rate": 0.00011196705731479682, "loss": 0.0108, "step": 2064 }, { "epoch": 1.547976011994003, "grad_norm": 0.1341996957759823, "learning_rate": 0.00011188043749568751, "loss": 0.0125, "step": 2065 }, { "epoch": 1.5487256371814093, "grad_norm": 0.3351069282470784, "learning_rate": 0.00011179380863413249, "loss": 0.0262, "step": 2066 }, { "epoch": 1.5494752623688157, "grad_norm": 0.09092680298844832, "learning_rate": 0.00011170717079606683, "loss": 0.0151, "step": 2067 }, { "epoch": 1.550224887556222, "grad_norm": 0.07557937002901241, "learning_rate": 0.00011162052404743231, "loss": 0.0059, "step": 2068 }, { "epoch": 1.550974512743628, "grad_norm": 0.09023632236008271, "learning_rate": 0.00011153386845417757, "loss": 0.0105, "step": 2069 }, { "epoch": 1.5517241379310345, "grad_norm": 0.0890917938114594, "learning_rate": 0.00011144720408225799, "loss": 0.0124, "step": 2070 }, { "epoch": 1.5524737631184409, "grad_norm": 0.23400085026304457, "learning_rate": 0.00011136053099763561, "loss": 0.0213, "step": 2071 }, { "epoch": 1.553223388305847, "grad_norm": 0.21856781003492168, "learning_rate": 0.0001112738492662791, "loss": 0.0206, "step": 2072 }, { "epoch": 1.5539730134932532, "grad_norm": 0.08859642750860837, "learning_rate": 0.00011118715895416369, "loss": 0.0134, "step": 2073 }, { "epoch": 1.5547226386806596, "grad_norm": 0.22023807470902312, "learning_rate": 0.00011110046012727115, "loss": 0.0167, "step": 2074 }, { "epoch": 1.555472263868066, "grad_norm": 0.15376324747434672, "learning_rate": 0.00011101375285158977, "loss": 0.0185, "step": 2075 }, { "epoch": 1.5562218890554722, "grad_norm": 0.14143650908884217, "learning_rate": 0.00011092703719311423, "loss": 0.0247, "step": 2076 }, { "epoch": 1.5569715142428786, "grad_norm": 0.14443931346792963, "learning_rate": 0.00011084031321784565, "loss": 0.0078, "step": 2077 }, { "epoch": 1.557721139430285, "grad_norm": 0.21577033946330054, "learning_rate": 0.00011075358099179137, "loss": 0.0296, "step": 2078 }, { "epoch": 1.5584707646176912, "grad_norm": 0.14822292735668258, "learning_rate": 0.0001106668405809651, "loss": 0.0103, "step": 2079 }, { "epoch": 1.5592203898050974, "grad_norm": 0.280728591961263, "learning_rate": 0.00011058009205138675, "loss": 0.0325, "step": 2080 }, { "epoch": 1.5599700149925038, "grad_norm": 0.13964380316407649, "learning_rate": 0.00011049333546908238, "loss": 0.0168, "step": 2081 }, { "epoch": 1.5607196401799102, "grad_norm": 0.2665676112900603, "learning_rate": 0.00011040657090008429, "loss": 0.0342, "step": 2082 }, { "epoch": 1.5614692653673163, "grad_norm": 0.18879808411881546, "learning_rate": 0.00011031979841043074, "loss": 0.0191, "step": 2083 }, { "epoch": 1.5622188905547225, "grad_norm": 0.17893200215311023, "learning_rate": 0.00011023301806616604, "loss": 0.0593, "step": 2084 }, { "epoch": 1.562968515742129, "grad_norm": 0.10130024416265714, "learning_rate": 0.00011014622993334051, "loss": 0.0173, "step": 2085 }, { "epoch": 1.5637181409295353, "grad_norm": 0.11583261119153629, "learning_rate": 0.0001100594340780104, "loss": 0.0051, "step": 2086 }, { "epoch": 1.5644677661169415, "grad_norm": 0.20382548872544762, "learning_rate": 0.00010997263056623781, "loss": 0.0207, "step": 2087 }, { "epoch": 1.5652173913043477, "grad_norm": 0.26119103145261624, "learning_rate": 0.00010988581946409067, "loss": 0.0259, "step": 2088 }, { "epoch": 1.565967016491754, "grad_norm": 0.26387441495902386, "learning_rate": 0.00010979900083764274, "loss": 0.0295, "step": 2089 }, { "epoch": 1.5667166416791605, "grad_norm": 0.12167374287075801, "learning_rate": 0.00010971217475297341, "loss": 0.0135, "step": 2090 }, { "epoch": 1.5674662668665666, "grad_norm": 0.09404259679501854, "learning_rate": 0.00010962534127616784, "loss": 0.0143, "step": 2091 }, { "epoch": 1.568215892053973, "grad_norm": 0.11618029483945659, "learning_rate": 0.00010953850047331677, "loss": 0.0241, "step": 2092 }, { "epoch": 1.5689655172413794, "grad_norm": 0.12694899413054597, "learning_rate": 0.00010945165241051649, "loss": 0.0066, "step": 2093 }, { "epoch": 1.5697151424287856, "grad_norm": 0.13936338138363769, "learning_rate": 0.0001093647971538689, "loss": 0.0138, "step": 2094 }, { "epoch": 1.5704647676161918, "grad_norm": 0.1873410269184778, "learning_rate": 0.00010927793476948132, "loss": 0.0101, "step": 2095 }, { "epoch": 1.5712143928035982, "grad_norm": 0.3206581121869378, "learning_rate": 0.00010919106532346646, "loss": 0.0808, "step": 2096 }, { "epoch": 1.5719640179910046, "grad_norm": 0.10184226493788878, "learning_rate": 0.00010910418888194247, "loss": 0.0088, "step": 2097 }, { "epoch": 1.5727136431784108, "grad_norm": 0.144140262326729, "learning_rate": 0.0001090173055110328, "loss": 0.0183, "step": 2098 }, { "epoch": 1.573463268365817, "grad_norm": 0.16734098531032404, "learning_rate": 0.00010893041527686616, "loss": 0.0114, "step": 2099 }, { "epoch": 1.5742128935532234, "grad_norm": 0.20313748731878348, "learning_rate": 0.00010884351824557653, "loss": 0.0201, "step": 2100 }, { "epoch": 1.5749625187406298, "grad_norm": 0.18552738843936079, "learning_rate": 0.00010875661448330299, "loss": 0.0189, "step": 2101 }, { "epoch": 1.575712143928036, "grad_norm": 0.2181205184419006, "learning_rate": 0.00010866970405618979, "loss": 0.0146, "step": 2102 }, { "epoch": 1.5764617691154421, "grad_norm": 0.197999347547449, "learning_rate": 0.0001085827870303863, "loss": 0.0131, "step": 2103 }, { "epoch": 1.5772113943028487, "grad_norm": 0.18070888463567594, "learning_rate": 0.00010849586347204676, "loss": 0.0239, "step": 2104 }, { "epoch": 1.577961019490255, "grad_norm": 0.11994394448856549, "learning_rate": 0.00010840893344733056, "loss": 0.0149, "step": 2105 }, { "epoch": 1.578710644677661, "grad_norm": 0.22206888863262822, "learning_rate": 0.00010832199702240193, "loss": 0.0216, "step": 2106 }, { "epoch": 1.5794602698650675, "grad_norm": 0.09966607612949638, "learning_rate": 0.00010823505426342993, "loss": 0.0154, "step": 2107 }, { "epoch": 1.580209895052474, "grad_norm": 0.20815952559053078, "learning_rate": 0.00010814810523658851, "loss": 0.0334, "step": 2108 }, { "epoch": 1.58095952023988, "grad_norm": 0.13817033872523685, "learning_rate": 0.00010806115000805639, "loss": 0.0179, "step": 2109 }, { "epoch": 1.5817091454272862, "grad_norm": 0.20216668988737746, "learning_rate": 0.00010797418864401693, "loss": 0.0255, "step": 2110 }, { "epoch": 1.5824587706146926, "grad_norm": 0.11483519803723446, "learning_rate": 0.00010788722121065822, "loss": 0.0143, "step": 2111 }, { "epoch": 1.583208395802099, "grad_norm": 0.16669136560946193, "learning_rate": 0.00010780024777417304, "loss": 0.0224, "step": 2112 }, { "epoch": 1.5839580209895052, "grad_norm": 0.20514344820145913, "learning_rate": 0.00010771326840075857, "loss": 0.0237, "step": 2113 }, { "epoch": 1.5847076461769114, "grad_norm": 0.22128861461740099, "learning_rate": 0.00010762628315661667, "loss": 0.0208, "step": 2114 }, { "epoch": 1.5854572713643178, "grad_norm": 0.22105747964468944, "learning_rate": 0.00010753929210795358, "loss": 0.0126, "step": 2115 }, { "epoch": 1.5862068965517242, "grad_norm": 0.2316847935712145, "learning_rate": 0.00010745229532097995, "loss": 0.0133, "step": 2116 }, { "epoch": 1.5869565217391304, "grad_norm": 0.17387233501627947, "learning_rate": 0.00010736529286191086, "loss": 0.0149, "step": 2117 }, { "epoch": 1.5877061469265368, "grad_norm": 0.1357298657618581, "learning_rate": 0.00010727828479696569, "loss": 0.0167, "step": 2118 }, { "epoch": 1.5884557721139432, "grad_norm": 0.1105206812339558, "learning_rate": 0.00010719127119236803, "loss": 0.0127, "step": 2119 }, { "epoch": 1.5892053973013494, "grad_norm": 0.15783026288917643, "learning_rate": 0.00010710425211434574, "loss": 0.0259, "step": 2120 }, { "epoch": 1.5899550224887555, "grad_norm": 0.11771204673005133, "learning_rate": 0.00010701722762913082, "loss": 0.0112, "step": 2121 }, { "epoch": 1.590704647676162, "grad_norm": 0.32453207336541906, "learning_rate": 0.00010693019780295943, "loss": 0.0277, "step": 2122 }, { "epoch": 1.5914542728635683, "grad_norm": 0.1966072178788373, "learning_rate": 0.00010684316270207173, "loss": 0.037, "step": 2123 }, { "epoch": 1.5922038980509745, "grad_norm": 0.09304717367685852, "learning_rate": 0.00010675612239271195, "loss": 0.0118, "step": 2124 }, { "epoch": 1.5929535232383807, "grad_norm": 0.20236023984052592, "learning_rate": 0.00010666907694112826, "loss": 0.0219, "step": 2125 }, { "epoch": 1.593703148425787, "grad_norm": 0.11436132069365781, "learning_rate": 0.00010658202641357275, "loss": 0.0096, "step": 2126 }, { "epoch": 1.5944527736131935, "grad_norm": 0.08452129352834276, "learning_rate": 0.00010649497087630137, "loss": 0.0084, "step": 2127 }, { "epoch": 1.5952023988005997, "grad_norm": 0.22035349865028075, "learning_rate": 0.00010640791039557386, "loss": 0.048, "step": 2128 }, { "epoch": 1.5959520239880058, "grad_norm": 0.11498711500927268, "learning_rate": 0.00010632084503765376, "loss": 0.02, "step": 2129 }, { "epoch": 1.5967016491754122, "grad_norm": 0.0839174086862166, "learning_rate": 0.0001062337748688083, "loss": 0.0066, "step": 2130 }, { "epoch": 1.5974512743628186, "grad_norm": 0.12497149674304994, "learning_rate": 0.00010614669995530842, "loss": 0.0058, "step": 2131 }, { "epoch": 1.5982008995502248, "grad_norm": 0.10287884248033055, "learning_rate": 0.00010605962036342856, "loss": 0.0119, "step": 2132 }, { "epoch": 1.5989505247376312, "grad_norm": 0.1796680641405242, "learning_rate": 0.00010597253615944684, "loss": 0.0113, "step": 2133 }, { "epoch": 1.5997001499250376, "grad_norm": 0.19508523661443805, "learning_rate": 0.00010588544740964483, "loss": 0.0237, "step": 2134 }, { "epoch": 1.6004497751124438, "grad_norm": 0.19838721596349307, "learning_rate": 0.00010579835418030759, "loss": 0.0318, "step": 2135 }, { "epoch": 1.60119940029985, "grad_norm": 0.2255229237940063, "learning_rate": 0.00010571125653772351, "loss": 0.0278, "step": 2136 }, { "epoch": 1.6019490254872564, "grad_norm": 0.1940287143164911, "learning_rate": 0.00010562415454818446, "loss": 0.039, "step": 2137 }, { "epoch": 1.6026986506746628, "grad_norm": 0.19664254017654553, "learning_rate": 0.00010553704827798553, "loss": 0.0238, "step": 2138 }, { "epoch": 1.603448275862069, "grad_norm": 0.3490322734081589, "learning_rate": 0.00010544993779342511, "loss": 0.0129, "step": 2139 }, { "epoch": 1.6041979010494751, "grad_norm": 0.11696677496354138, "learning_rate": 0.00010536282316080479, "loss": 0.0138, "step": 2140 }, { "epoch": 1.6049475262368815, "grad_norm": 0.11630342379823932, "learning_rate": 0.00010527570444642929, "loss": 0.0073, "step": 2141 }, { "epoch": 1.605697151424288, "grad_norm": 0.16193962839259002, "learning_rate": 0.00010518858171660648, "loss": 0.0243, "step": 2142 }, { "epoch": 1.606446776611694, "grad_norm": 0.14777930178574433, "learning_rate": 0.00010510145503764726, "loss": 0.0151, "step": 2143 }, { "epoch": 1.6071964017991005, "grad_norm": 0.08578678039922254, "learning_rate": 0.00010501432447586552, "loss": 0.008, "step": 2144 }, { "epoch": 1.607946026986507, "grad_norm": 0.12769657648998697, "learning_rate": 0.00010492719009757815, "loss": 0.015, "step": 2145 }, { "epoch": 1.608695652173913, "grad_norm": 0.1485501355032563, "learning_rate": 0.00010484005196910489, "loss": 0.0248, "step": 2146 }, { "epoch": 1.6094452773613193, "grad_norm": 0.19054280205114993, "learning_rate": 0.00010475291015676838, "loss": 0.02, "step": 2147 }, { "epoch": 1.6101949025487257, "grad_norm": 0.22448232935165505, "learning_rate": 0.00010466576472689405, "loss": 0.0339, "step": 2148 }, { "epoch": 1.610944527736132, "grad_norm": 0.17100326139227798, "learning_rate": 0.0001045786157458101, "loss": 0.0211, "step": 2149 }, { "epoch": 1.6116941529235382, "grad_norm": 0.3515785672403247, "learning_rate": 0.00010449146327984737, "loss": 0.0297, "step": 2150 }, { "epoch": 1.6124437781109444, "grad_norm": 0.1375851914687004, "learning_rate": 0.00010440430739533945, "loss": 0.012, "step": 2151 }, { "epoch": 1.6131934032983508, "grad_norm": 0.17245457771112946, "learning_rate": 0.00010431714815862244, "loss": 0.0129, "step": 2152 }, { "epoch": 1.6139430284857572, "grad_norm": 0.1973816822681929, "learning_rate": 0.00010422998563603505, "loss": 0.0322, "step": 2153 }, { "epoch": 1.6146926536731634, "grad_norm": 0.06788410784408126, "learning_rate": 0.00010414281989391846, "loss": 0.0051, "step": 2154 }, { "epoch": 1.6154422788605696, "grad_norm": 0.09242706243543868, "learning_rate": 0.00010405565099861632, "loss": 0.0164, "step": 2155 }, { "epoch": 1.616191904047976, "grad_norm": 0.19222715958385686, "learning_rate": 0.00010396847901647468, "loss": 0.0152, "step": 2156 }, { "epoch": 1.6169415292353824, "grad_norm": 0.14608118270870263, "learning_rate": 0.00010388130401384191, "loss": 0.021, "step": 2157 }, { "epoch": 1.6176911544227885, "grad_norm": 0.10021220905764734, "learning_rate": 0.0001037941260570687, "loss": 0.0083, "step": 2158 }, { "epoch": 1.618440779610195, "grad_norm": 0.3437202444059941, "learning_rate": 0.00010370694521250801, "loss": 0.0263, "step": 2159 }, { "epoch": 1.6191904047976013, "grad_norm": 0.10922401081992254, "learning_rate": 0.00010361976154651493, "loss": 0.0135, "step": 2160 }, { "epoch": 1.6199400299850075, "grad_norm": 0.1275763182671347, "learning_rate": 0.00010353257512544683, "loss": 0.0192, "step": 2161 }, { "epoch": 1.6206896551724137, "grad_norm": 0.2856664323507451, "learning_rate": 0.00010344538601566301, "loss": 0.02, "step": 2162 }, { "epoch": 1.62143928035982, "grad_norm": 0.3372086035734109, "learning_rate": 0.0001033581942835249, "loss": 0.0556, "step": 2163 }, { "epoch": 1.6221889055472265, "grad_norm": 0.10753985912490954, "learning_rate": 0.000103270999995396, "loss": 0.0148, "step": 2164 }, { "epoch": 1.6229385307346327, "grad_norm": 0.05309486665403327, "learning_rate": 0.00010318380321764163, "loss": 0.0073, "step": 2165 }, { "epoch": 1.6236881559220389, "grad_norm": 0.17602802641704984, "learning_rate": 0.00010309660401662902, "loss": 0.0227, "step": 2166 }, { "epoch": 1.6244377811094453, "grad_norm": 0.1569913385165896, "learning_rate": 0.00010300940245872734, "loss": 0.0166, "step": 2167 }, { "epoch": 1.6251874062968517, "grad_norm": 0.12704934685718333, "learning_rate": 0.00010292219861030746, "loss": 0.0097, "step": 2168 }, { "epoch": 1.6259370314842578, "grad_norm": 0.09420389848758143, "learning_rate": 0.000102834992537742, "loss": 0.0085, "step": 2169 }, { "epoch": 1.626686656671664, "grad_norm": 0.14393386909154185, "learning_rate": 0.00010274778430740534, "loss": 0.0196, "step": 2170 }, { "epoch": 1.6274362818590704, "grad_norm": 0.09500661896692436, "learning_rate": 0.00010266057398567345, "loss": 0.0155, "step": 2171 }, { "epoch": 1.6281859070464768, "grad_norm": 0.08039664476375386, "learning_rate": 0.00010257336163892388, "loss": 0.0058, "step": 2172 }, { "epoch": 1.628935532233883, "grad_norm": 0.16527855875803454, "learning_rate": 0.00010248614733353576, "loss": 0.0109, "step": 2173 }, { "epoch": 1.6296851574212894, "grad_norm": 0.15900030028556839, "learning_rate": 0.0001023989311358897, "loss": 0.0184, "step": 2174 }, { "epoch": 1.6304347826086958, "grad_norm": 0.18865051802381824, "learning_rate": 0.00010231171311236776, "loss": 0.0162, "step": 2175 }, { "epoch": 1.631184407796102, "grad_norm": 0.22266468034711098, "learning_rate": 0.00010222449332935334, "loss": 0.0211, "step": 2176 }, { "epoch": 1.6319340329835081, "grad_norm": 0.12644029888344444, "learning_rate": 0.00010213727185323123, "loss": 0.0146, "step": 2177 }, { "epoch": 1.6326836581709145, "grad_norm": 0.08269922841842478, "learning_rate": 0.00010205004875038749, "loss": 0.0034, "step": 2178 }, { "epoch": 1.633433283358321, "grad_norm": 0.13467295910773985, "learning_rate": 0.00010196282408720943, "loss": 0.0164, "step": 2179 }, { "epoch": 1.6341829085457271, "grad_norm": 0.09751608794364057, "learning_rate": 0.00010187559793008553, "loss": 0.0089, "step": 2180 }, { "epoch": 1.6349325337331333, "grad_norm": 0.10433575276139269, "learning_rate": 0.0001017883703454054, "loss": 0.0065, "step": 2181 }, { "epoch": 1.6356821589205397, "grad_norm": 0.15904032420506176, "learning_rate": 0.00010170114139955975, "loss": 0.0158, "step": 2182 }, { "epoch": 1.636431784107946, "grad_norm": 0.3355373475253385, "learning_rate": 0.00010161391115894036, "loss": 0.0492, "step": 2183 }, { "epoch": 1.6371814092953523, "grad_norm": 0.07955988970300694, "learning_rate": 0.00010152667968993994, "loss": 0.0081, "step": 2184 }, { "epoch": 1.6379310344827587, "grad_norm": 0.10382669378097661, "learning_rate": 0.00010143944705895217, "loss": 0.0133, "step": 2185 }, { "epoch": 1.638680659670165, "grad_norm": 0.13676157868630912, "learning_rate": 0.00010135221333237154, "loss": 0.0119, "step": 2186 }, { "epoch": 1.6394302848575713, "grad_norm": 0.09265941037356405, "learning_rate": 0.00010126497857659356, "loss": 0.0107, "step": 2187 }, { "epoch": 1.6401799100449774, "grad_norm": 0.19844386592103025, "learning_rate": 0.00010117774285801432, "loss": 0.0189, "step": 2188 }, { "epoch": 1.6409295352323838, "grad_norm": 0.09822194199584616, "learning_rate": 0.00010109050624303071, "loss": 0.0153, "step": 2189 }, { "epoch": 1.6416791604197902, "grad_norm": 0.1049195490202721, "learning_rate": 0.00010100326879804034, "loss": 0.0057, "step": 2190 }, { "epoch": 1.6424287856071964, "grad_norm": 0.10430346317698513, "learning_rate": 0.00010091603058944143, "loss": 0.0116, "step": 2191 }, { "epoch": 1.6431784107946026, "grad_norm": 0.23977722911803956, "learning_rate": 0.00010082879168363277, "loss": 0.0253, "step": 2192 }, { "epoch": 1.643928035982009, "grad_norm": 0.19249755711105562, "learning_rate": 0.00010074155214701369, "loss": 0.0241, "step": 2193 }, { "epoch": 1.6446776611694154, "grad_norm": 0.13018140823584648, "learning_rate": 0.00010065431204598398, "loss": 0.0181, "step": 2194 }, { "epoch": 1.6454272863568216, "grad_norm": 0.1914619455012303, "learning_rate": 0.0001005670714469439, "loss": 0.0165, "step": 2195 }, { "epoch": 1.6461769115442277, "grad_norm": 0.1382501986835231, "learning_rate": 0.00010047983041629408, "loss": 0.0127, "step": 2196 }, { "epoch": 1.6469265367316341, "grad_norm": 0.19493616103935438, "learning_rate": 0.00010039258902043542, "loss": 0.0189, "step": 2197 }, { "epoch": 1.6476761619190405, "grad_norm": 0.21639137368174913, "learning_rate": 0.00010030534732576919, "loss": 0.0326, "step": 2198 }, { "epoch": 1.6484257871064467, "grad_norm": 0.08163997103958853, "learning_rate": 0.0001002181053986968, "loss": 0.0091, "step": 2199 }, { "epoch": 1.6491754122938531, "grad_norm": 0.10984390492478067, "learning_rate": 0.00010013086330561994, "loss": 0.0173, "step": 2200 }, { "epoch": 1.6499250374812595, "grad_norm": 0.11543924109680062, "learning_rate": 0.0001000436211129403, "loss": 0.0199, "step": 2201 }, { "epoch": 1.6506746626686657, "grad_norm": 0.12631791559353478, "learning_rate": 9.995637888705971e-05, "loss": 0.0069, "step": 2202 }, { "epoch": 1.6514242878560719, "grad_norm": 0.1760866474337531, "learning_rate": 9.986913669438006e-05, "loss": 0.0124, "step": 2203 }, { "epoch": 1.6521739130434783, "grad_norm": 0.1275355517822058, "learning_rate": 9.978189460130318e-05, "loss": 0.0287, "step": 2204 }, { "epoch": 1.6529235382308847, "grad_norm": 0.07944793817543527, "learning_rate": 9.969465267423082e-05, "loss": 0.01, "step": 2205 }, { "epoch": 1.6536731634182908, "grad_norm": 0.11128776003101262, "learning_rate": 9.960741097956459e-05, "loss": 0.0089, "step": 2206 }, { "epoch": 1.654422788605697, "grad_norm": 0.13148517577463179, "learning_rate": 9.952016958370595e-05, "loss": 0.0151, "step": 2207 }, { "epoch": 1.6551724137931034, "grad_norm": 0.19695789121035406, "learning_rate": 9.943292855305612e-05, "loss": 0.022, "step": 2208 }, { "epoch": 1.6559220389805098, "grad_norm": 0.08508472187420428, "learning_rate": 9.934568795401603e-05, "loss": 0.0105, "step": 2209 }, { "epoch": 1.656671664167916, "grad_norm": 0.15133628187895673, "learning_rate": 9.925844785298632e-05, "loss": 0.0341, "step": 2210 }, { "epoch": 1.6574212893553222, "grad_norm": 0.1922398060665263, "learning_rate": 9.917120831636725e-05, "loss": 0.0102, "step": 2211 }, { "epoch": 1.6581709145427288, "grad_norm": 0.0938351190848418, "learning_rate": 9.908396941055858e-05, "loss": 0.0126, "step": 2212 }, { "epoch": 1.658920539730135, "grad_norm": 0.15477205535520777, "learning_rate": 9.899673120195968e-05, "loss": 0.0166, "step": 2213 }, { "epoch": 1.6596701649175412, "grad_norm": 0.09216439861864109, "learning_rate": 9.890949375696932e-05, "loss": 0.0075, "step": 2214 }, { "epoch": 1.6604197901049476, "grad_norm": 0.16799827639624468, "learning_rate": 9.882225714198572e-05, "loss": 0.0104, "step": 2215 }, { "epoch": 1.661169415292354, "grad_norm": 0.08444261604782366, "learning_rate": 9.873502142340647e-05, "loss": 0.0054, "step": 2216 }, { "epoch": 1.6619190404797601, "grad_norm": 0.0954091400877419, "learning_rate": 9.864778666762848e-05, "loss": 0.0131, "step": 2217 }, { "epoch": 1.6626686656671663, "grad_norm": 0.14137959599399608, "learning_rate": 9.856055294104787e-05, "loss": 0.0136, "step": 2218 }, { "epoch": 1.6634182908545727, "grad_norm": 0.19674432327170557, "learning_rate": 9.84733203100601e-05, "loss": 0.022, "step": 2219 }, { "epoch": 1.6641679160419791, "grad_norm": 0.19241725608754048, "learning_rate": 9.838608884105967e-05, "loss": 0.0196, "step": 2220 }, { "epoch": 1.6649175412293853, "grad_norm": 0.0741158166637822, "learning_rate": 9.829885860044028e-05, "loss": 0.0099, "step": 2221 }, { "epoch": 1.6656671664167915, "grad_norm": 0.08107170814118782, "learning_rate": 9.821162965459464e-05, "loss": 0.0082, "step": 2222 }, { "epoch": 1.6664167916041979, "grad_norm": 0.45306028725278275, "learning_rate": 9.812440206991452e-05, "loss": 0.0748, "step": 2223 }, { "epoch": 1.6671664167916043, "grad_norm": 0.2149625837226016, "learning_rate": 9.803717591279061e-05, "loss": 0.0167, "step": 2224 }, { "epoch": 1.6679160419790104, "grad_norm": 0.17153411211542954, "learning_rate": 9.794995124961255e-05, "loss": 0.0256, "step": 2225 }, { "epoch": 1.6686656671664168, "grad_norm": 0.0827609854771755, "learning_rate": 9.786272814676877e-05, "loss": 0.0116, "step": 2226 }, { "epoch": 1.6694152923538232, "grad_norm": 0.1885871305873978, "learning_rate": 9.777550667064666e-05, "loss": 0.0264, "step": 2227 }, { "epoch": 1.6701649175412294, "grad_norm": 0.17937268516669583, "learning_rate": 9.768828688763225e-05, "loss": 0.0283, "step": 2228 }, { "epoch": 1.6709145427286356, "grad_norm": 0.060729515568319854, "learning_rate": 9.76010688641103e-05, "loss": 0.0041, "step": 2229 }, { "epoch": 1.671664167916042, "grad_norm": 0.13836789615313236, "learning_rate": 9.751385266646425e-05, "loss": 0.0191, "step": 2230 }, { "epoch": 1.6724137931034484, "grad_norm": 0.13593051507269474, "learning_rate": 9.742663836107616e-05, "loss": 0.015, "step": 2231 }, { "epoch": 1.6731634182908546, "grad_norm": 0.1999216010464126, "learning_rate": 9.733942601432657e-05, "loss": 0.0174, "step": 2232 }, { "epoch": 1.6739130434782608, "grad_norm": 0.06516165629180859, "learning_rate": 9.725221569259468e-05, "loss": 0.0115, "step": 2233 }, { "epoch": 1.6746626686656672, "grad_norm": 0.11151068671306183, "learning_rate": 9.716500746225802e-05, "loss": 0.0112, "step": 2234 }, { "epoch": 1.6754122938530736, "grad_norm": 0.09629089815406607, "learning_rate": 9.707780138969258e-05, "loss": 0.0093, "step": 2235 }, { "epoch": 1.6761619190404797, "grad_norm": 0.14162644878631606, "learning_rate": 9.699059754127269e-05, "loss": 0.0073, "step": 2236 }, { "epoch": 1.676911544227886, "grad_norm": 0.11520040224766206, "learning_rate": 9.690339598337099e-05, "loss": 0.0139, "step": 2237 }, { "epoch": 1.6776611694152923, "grad_norm": 0.1335076882114969, "learning_rate": 9.681619678235841e-05, "loss": 0.0179, "step": 2238 }, { "epoch": 1.6784107946026987, "grad_norm": 0.07575372094517098, "learning_rate": 9.672900000460403e-05, "loss": 0.0062, "step": 2239 }, { "epoch": 1.6791604197901049, "grad_norm": 0.13863362510064225, "learning_rate": 9.66418057164751e-05, "loss": 0.0101, "step": 2240 }, { "epoch": 1.6799100449775113, "grad_norm": 0.17980979209392906, "learning_rate": 9.655461398433703e-05, "loss": 0.0194, "step": 2241 }, { "epoch": 1.6806596701649177, "grad_norm": 0.0735726093429126, "learning_rate": 9.646742487455319e-05, "loss": 0.006, "step": 2242 }, { "epoch": 1.6814092953523239, "grad_norm": 0.10716385387797484, "learning_rate": 9.638023845348508e-05, "loss": 0.007, "step": 2243 }, { "epoch": 1.68215892053973, "grad_norm": 0.106194164771032, "learning_rate": 9.629305478749203e-05, "loss": 0.0093, "step": 2244 }, { "epoch": 1.6829085457271364, "grad_norm": 0.07602179907137255, "learning_rate": 9.620587394293133e-05, "loss": 0.0167, "step": 2245 }, { "epoch": 1.6836581709145428, "grad_norm": 0.10184174507913761, "learning_rate": 9.611869598615814e-05, "loss": 0.0259, "step": 2246 }, { "epoch": 1.684407796101949, "grad_norm": 0.10190431935445256, "learning_rate": 9.603152098352537e-05, "loss": 0.0122, "step": 2247 }, { "epoch": 1.6851574212893552, "grad_norm": 0.14810433391255484, "learning_rate": 9.594434900138372e-05, "loss": 0.0131, "step": 2248 }, { "epoch": 1.6859070464767616, "grad_norm": 0.1372986326124049, "learning_rate": 9.585718010608158e-05, "loss": 0.0242, "step": 2249 }, { "epoch": 1.686656671664168, "grad_norm": 0.22024790492225385, "learning_rate": 9.577001436396496e-05, "loss": 0.02, "step": 2250 }, { "epoch": 1.6874062968515742, "grad_norm": 0.23818210366138526, "learning_rate": 9.568285184137756e-05, "loss": 0.0339, "step": 2251 }, { "epoch": 1.6881559220389803, "grad_norm": 0.24880745595783435, "learning_rate": 9.559569260466056e-05, "loss": 0.0203, "step": 2252 }, { "epoch": 1.688905547226387, "grad_norm": 0.08934150225370609, "learning_rate": 9.550853672015264e-05, "loss": 0.0057, "step": 2253 }, { "epoch": 1.6896551724137931, "grad_norm": 0.14124142644205057, "learning_rate": 9.542138425418994e-05, "loss": 0.0243, "step": 2254 }, { "epoch": 1.6904047976011993, "grad_norm": 0.128834478994259, "learning_rate": 9.533423527310595e-05, "loss": 0.0231, "step": 2255 }, { "epoch": 1.6911544227886057, "grad_norm": 0.10252762235808675, "learning_rate": 9.524708984323164e-05, "loss": 0.0094, "step": 2256 }, { "epoch": 1.6919040479760121, "grad_norm": 0.11919608393359798, "learning_rate": 9.515994803089514e-05, "loss": 0.011, "step": 2257 }, { "epoch": 1.6926536731634183, "grad_norm": 0.10028302535446303, "learning_rate": 9.507280990242189e-05, "loss": 0.0065, "step": 2258 }, { "epoch": 1.6934032983508245, "grad_norm": 0.15029212251390084, "learning_rate": 9.498567552413451e-05, "loss": 0.0202, "step": 2259 }, { "epoch": 1.6941529235382309, "grad_norm": 0.14739855354036463, "learning_rate": 9.489854496235278e-05, "loss": 0.0137, "step": 2260 }, { "epoch": 1.6949025487256373, "grad_norm": 0.13250478800352208, "learning_rate": 9.481141828339354e-05, "loss": 0.0166, "step": 2261 }, { "epoch": 1.6956521739130435, "grad_norm": 0.19895184445701702, "learning_rate": 9.472429555357074e-05, "loss": 0.0253, "step": 2262 }, { "epoch": 1.6964017991004496, "grad_norm": 0.20379249682360126, "learning_rate": 9.463717683919525e-05, "loss": 0.0142, "step": 2263 }, { "epoch": 1.697151424287856, "grad_norm": 0.22588524145582545, "learning_rate": 9.455006220657491e-05, "loss": 0.0142, "step": 2264 }, { "epoch": 1.6979010494752624, "grad_norm": 0.2183364170199202, "learning_rate": 9.446295172201447e-05, "loss": 0.0294, "step": 2265 }, { "epoch": 1.6986506746626686, "grad_norm": 0.10529773015330877, "learning_rate": 9.437584545181556e-05, "loss": 0.0117, "step": 2266 }, { "epoch": 1.699400299850075, "grad_norm": 0.13228476642079293, "learning_rate": 9.428874346227651e-05, "loss": 0.0105, "step": 2267 }, { "epoch": 1.7001499250374814, "grad_norm": 0.11116500642428082, "learning_rate": 9.420164581969246e-05, "loss": 0.0192, "step": 2268 }, { "epoch": 1.7008995502248876, "grad_norm": 0.20191971260760297, "learning_rate": 9.41145525903552e-05, "loss": 0.0306, "step": 2269 }, { "epoch": 1.7016491754122938, "grad_norm": 0.1733740741546028, "learning_rate": 9.402746384055319e-05, "loss": 0.0148, "step": 2270 }, { "epoch": 1.7023988005997002, "grad_norm": 0.1985233647138693, "learning_rate": 9.394037963657147e-05, "loss": 0.027, "step": 2271 }, { "epoch": 1.7031484257871066, "grad_norm": 0.15488629938664647, "learning_rate": 9.385330004469165e-05, "loss": 0.0137, "step": 2272 }, { "epoch": 1.7038980509745127, "grad_norm": 0.12550295707673448, "learning_rate": 9.376622513119173e-05, "loss": 0.0213, "step": 2273 }, { "epoch": 1.704647676161919, "grad_norm": 0.0970176046056629, "learning_rate": 9.36791549623463e-05, "loss": 0.0112, "step": 2274 }, { "epoch": 1.7053973013493253, "grad_norm": 0.09282452066698203, "learning_rate": 9.359208960442614e-05, "loss": 0.0136, "step": 2275 }, { "epoch": 1.7061469265367317, "grad_norm": 0.16370503644010123, "learning_rate": 9.350502912369864e-05, "loss": 0.0135, "step": 2276 }, { "epoch": 1.706896551724138, "grad_norm": 0.3022753723913286, "learning_rate": 9.341797358642726e-05, "loss": 0.039, "step": 2277 }, { "epoch": 1.707646176911544, "grad_norm": 0.07035860921333878, "learning_rate": 9.333092305887175e-05, "loss": 0.0058, "step": 2278 }, { "epoch": 1.7083958020989505, "grad_norm": 0.1348057402631347, "learning_rate": 9.324387760728804e-05, "loss": 0.0137, "step": 2279 }, { "epoch": 1.7091454272863569, "grad_norm": 0.20890376229362348, "learning_rate": 9.315683729792827e-05, "loss": 0.0211, "step": 2280 }, { "epoch": 1.709895052473763, "grad_norm": 0.11310850996722045, "learning_rate": 9.306980219704059e-05, "loss": 0.0199, "step": 2281 }, { "epoch": 1.7106446776611695, "grad_norm": 0.24161359492379683, "learning_rate": 9.298277237086919e-05, "loss": 0.0112, "step": 2282 }, { "epoch": 1.7113943028485759, "grad_norm": 0.07882883973262984, "learning_rate": 9.28957478856543e-05, "loss": 0.0077, "step": 2283 }, { "epoch": 1.712143928035982, "grad_norm": 0.08496704552979792, "learning_rate": 9.280872880763201e-05, "loss": 0.0061, "step": 2284 }, { "epoch": 1.7128935532233882, "grad_norm": 0.14163766893656093, "learning_rate": 9.272171520303434e-05, "loss": 0.0125, "step": 2285 }, { "epoch": 1.7136431784107946, "grad_norm": 0.32724263022641503, "learning_rate": 9.263470713808915e-05, "loss": 0.0188, "step": 2286 }, { "epoch": 1.714392803598201, "grad_norm": 0.10439175468241432, "learning_rate": 9.254770467902007e-05, "loss": 0.0078, "step": 2287 }, { "epoch": 1.7151424287856072, "grad_norm": 0.14755758211112238, "learning_rate": 9.246070789204643e-05, "loss": 0.0357, "step": 2288 }, { "epoch": 1.7158920539730134, "grad_norm": 0.16791677288964893, "learning_rate": 9.237371684338334e-05, "loss": 0.0216, "step": 2289 }, { "epoch": 1.7166416791604198, "grad_norm": 0.1295912107491882, "learning_rate": 9.228673159924144e-05, "loss": 0.0174, "step": 2290 }, { "epoch": 1.7173913043478262, "grad_norm": 0.2166942636210779, "learning_rate": 9.2199752225827e-05, "loss": 0.0219, "step": 2291 }, { "epoch": 1.7181409295352323, "grad_norm": 0.19356004105232413, "learning_rate": 9.211277878934179e-05, "loss": 0.0161, "step": 2292 }, { "epoch": 1.7188905547226385, "grad_norm": 0.1116573751433848, "learning_rate": 9.202581135598312e-05, "loss": 0.011, "step": 2293 }, { "epoch": 1.7196401799100451, "grad_norm": 0.09744720227055026, "learning_rate": 9.193884999194366e-05, "loss": 0.0151, "step": 2294 }, { "epoch": 1.7203898050974513, "grad_norm": 0.10492655393579968, "learning_rate": 9.185189476341151e-05, "loss": 0.0149, "step": 2295 }, { "epoch": 1.7211394302848575, "grad_norm": 0.1727812476879939, "learning_rate": 9.176494573657011e-05, "loss": 0.017, "step": 2296 }, { "epoch": 1.721889055472264, "grad_norm": 0.17214475886272773, "learning_rate": 9.167800297759812e-05, "loss": 0.014, "step": 2297 }, { "epoch": 1.7226386806596703, "grad_norm": 0.10015290651528999, "learning_rate": 9.159106655266946e-05, "loss": 0.0094, "step": 2298 }, { "epoch": 1.7233883058470765, "grad_norm": 0.12022968701022108, "learning_rate": 9.150413652795325e-05, "loss": 0.0107, "step": 2299 }, { "epoch": 1.7241379310344827, "grad_norm": 0.20433992444486745, "learning_rate": 9.141721296961373e-05, "loss": 0.0282, "step": 2300 }, { "epoch": 1.724887556221889, "grad_norm": 0.1251335600249641, "learning_rate": 9.133029594381022e-05, "loss": 0.0055, "step": 2301 }, { "epoch": 1.7256371814092955, "grad_norm": 0.12307454879878728, "learning_rate": 9.124338551669702e-05, "loss": 0.0072, "step": 2302 }, { "epoch": 1.7263868065967016, "grad_norm": 0.10974319222209682, "learning_rate": 9.115648175442348e-05, "loss": 0.0069, "step": 2303 }, { "epoch": 1.7271364317841078, "grad_norm": 0.06147589682688045, "learning_rate": 9.106958472313385e-05, "loss": 0.0074, "step": 2304 }, { "epoch": 1.7278860569715142, "grad_norm": 0.1692104250013026, "learning_rate": 9.098269448896722e-05, "loss": 0.0212, "step": 2305 }, { "epoch": 1.7286356821589206, "grad_norm": 0.08275924919315929, "learning_rate": 9.089581111805756e-05, "loss": 0.0091, "step": 2306 }, { "epoch": 1.7293853073463268, "grad_norm": 0.3002478715846399, "learning_rate": 9.080893467653357e-05, "loss": 0.0253, "step": 2307 }, { "epoch": 1.7301349325337332, "grad_norm": 0.04807159341900259, "learning_rate": 9.072206523051872e-05, "loss": 0.0015, "step": 2308 }, { "epoch": 1.7308845577211396, "grad_norm": 0.1257871319990705, "learning_rate": 9.063520284613112e-05, "loss": 0.0092, "step": 2309 }, { "epoch": 1.7316341829085458, "grad_norm": 0.24012971821535473, "learning_rate": 9.054834758948354e-05, "loss": 0.0121, "step": 2310 }, { "epoch": 1.732383808095952, "grad_norm": 0.06868813800563243, "learning_rate": 9.046149952668326e-05, "loss": 0.006, "step": 2311 }, { "epoch": 1.7331334332833583, "grad_norm": 0.2466917223603374, "learning_rate": 9.037465872383218e-05, "loss": 0.0254, "step": 2312 }, { "epoch": 1.7338830584707647, "grad_norm": 0.09160274133796535, "learning_rate": 9.028782524702661e-05, "loss": 0.0119, "step": 2313 }, { "epoch": 1.734632683658171, "grad_norm": 0.1231184165811145, "learning_rate": 9.020099916235729e-05, "loss": 0.016, "step": 2314 }, { "epoch": 1.735382308845577, "grad_norm": 0.16531511126818504, "learning_rate": 9.011418053590934e-05, "loss": 0.007, "step": 2315 }, { "epoch": 1.7361319340329835, "grad_norm": 0.06487807881747268, "learning_rate": 9.002736943376222e-05, "loss": 0.0111, "step": 2316 }, { "epoch": 1.73688155922039, "grad_norm": 0.1349339609143589, "learning_rate": 8.994056592198963e-05, "loss": 0.0111, "step": 2317 }, { "epoch": 1.737631184407796, "grad_norm": 0.052501414741449384, "learning_rate": 8.985377006665952e-05, "loss": 0.0048, "step": 2318 }, { "epoch": 1.7383808095952022, "grad_norm": 0.1993518878703383, "learning_rate": 8.9766981933834e-05, "loss": 0.0224, "step": 2319 }, { "epoch": 1.7391304347826086, "grad_norm": 0.1527207076809983, "learning_rate": 8.968020158956932e-05, "loss": 0.009, "step": 2320 }, { "epoch": 1.739880059970015, "grad_norm": 0.31428872722225654, "learning_rate": 8.959342909991572e-05, "loss": 0.0907, "step": 2321 }, { "epoch": 1.7406296851574212, "grad_norm": 0.042976893923091876, "learning_rate": 8.950666453091763e-05, "loss": 0.0038, "step": 2322 }, { "epoch": 1.7413793103448276, "grad_norm": 0.2108541705440652, "learning_rate": 8.941990794861329e-05, "loss": 0.0221, "step": 2323 }, { "epoch": 1.742128935532234, "grad_norm": 0.09913233384673842, "learning_rate": 8.933315941903493e-05, "loss": 0.0177, "step": 2324 }, { "epoch": 1.7428785607196402, "grad_norm": 0.1689140975435586, "learning_rate": 8.924641900820864e-05, "loss": 0.0121, "step": 2325 }, { "epoch": 1.7436281859070464, "grad_norm": 0.11147509449054176, "learning_rate": 8.915968678215436e-05, "loss": 0.015, "step": 2326 }, { "epoch": 1.7443778110944528, "grad_norm": 0.13181839721952637, "learning_rate": 8.907296280688575e-05, "loss": 0.0137, "step": 2327 }, { "epoch": 1.7451274362818592, "grad_norm": 0.08804738058468466, "learning_rate": 8.898624714841024e-05, "loss": 0.005, "step": 2328 }, { "epoch": 1.7458770614692654, "grad_norm": 0.07071846345027209, "learning_rate": 8.889953987272887e-05, "loss": 0.0043, "step": 2329 }, { "epoch": 1.7466266866566715, "grad_norm": 0.07836457908704735, "learning_rate": 8.881284104583633e-05, "loss": 0.0064, "step": 2330 }, { "epoch": 1.747376311844078, "grad_norm": 0.24495018065737204, "learning_rate": 8.872615073372092e-05, "loss": 0.0145, "step": 2331 }, { "epoch": 1.7481259370314843, "grad_norm": 0.13956385729137918, "learning_rate": 8.86394690023644e-05, "loss": 0.01, "step": 2332 }, { "epoch": 1.7488755622188905, "grad_norm": 0.24149279564904225, "learning_rate": 8.855279591774202e-05, "loss": 0.0265, "step": 2333 }, { "epoch": 1.7496251874062967, "grad_norm": 0.0974355064338702, "learning_rate": 8.846613154582246e-05, "loss": 0.0106, "step": 2334 }, { "epoch": 1.7503748125937033, "grad_norm": 0.19091651414304264, "learning_rate": 8.837947595256771e-05, "loss": 0.0162, "step": 2335 }, { "epoch": 1.7511244377811095, "grad_norm": 0.13646927135675968, "learning_rate": 8.82928292039332e-05, "loss": 0.0133, "step": 2336 }, { "epoch": 1.7518740629685157, "grad_norm": 0.10580595673508876, "learning_rate": 8.820619136586752e-05, "loss": 0.015, "step": 2337 }, { "epoch": 1.752623688155922, "grad_norm": 0.16799968327604245, "learning_rate": 8.811956250431253e-05, "loss": 0.0113, "step": 2338 }, { "epoch": 1.7533733133433285, "grad_norm": 0.10343603191332432, "learning_rate": 8.803294268520321e-05, "loss": 0.0035, "step": 2339 }, { "epoch": 1.7541229385307346, "grad_norm": 0.12820836792714202, "learning_rate": 8.79463319744677e-05, "loss": 0.0199, "step": 2340 }, { "epoch": 1.7548725637181408, "grad_norm": 0.12156490486890247, "learning_rate": 8.785973043802723e-05, "loss": 0.0122, "step": 2341 }, { "epoch": 1.7556221889055472, "grad_norm": 0.08880129633365962, "learning_rate": 8.777313814179598e-05, "loss": 0.0103, "step": 2342 }, { "epoch": 1.7563718140929536, "grad_norm": 0.1361385886586821, "learning_rate": 8.768655515168111e-05, "loss": 0.0073, "step": 2343 }, { "epoch": 1.7571214392803598, "grad_norm": 0.08717390078439331, "learning_rate": 8.759998153358274e-05, "loss": 0.011, "step": 2344 }, { "epoch": 1.757871064467766, "grad_norm": 0.13739874298614746, "learning_rate": 8.751341735339386e-05, "loss": 0.026, "step": 2345 }, { "epoch": 1.7586206896551724, "grad_norm": 0.1362043887687259, "learning_rate": 8.742686267700021e-05, "loss": 0.0146, "step": 2346 }, { "epoch": 1.7593703148425788, "grad_norm": 0.16907979495658385, "learning_rate": 8.734031757028034e-05, "loss": 0.0168, "step": 2347 }, { "epoch": 1.760119940029985, "grad_norm": 0.12248994769638612, "learning_rate": 8.725378209910553e-05, "loss": 0.0182, "step": 2348 }, { "epoch": 1.7608695652173914, "grad_norm": 0.1228893815046215, "learning_rate": 8.716725632933973e-05, "loss": 0.0143, "step": 2349 }, { "epoch": 1.7616191904047978, "grad_norm": 0.19568888006797103, "learning_rate": 8.708074032683946e-05, "loss": 0.0199, "step": 2350 }, { "epoch": 1.762368815592204, "grad_norm": 0.40680375508994865, "learning_rate": 8.699423415745383e-05, "loss": 0.0362, "step": 2351 }, { "epoch": 1.76311844077961, "grad_norm": 0.10472777978935661, "learning_rate": 8.690773788702447e-05, "loss": 0.0107, "step": 2352 }, { "epoch": 1.7638680659670165, "grad_norm": 0.13838529500657282, "learning_rate": 8.682125158138547e-05, "loss": 0.0143, "step": 2353 }, { "epoch": 1.764617691154423, "grad_norm": 0.05052992508658696, "learning_rate": 8.673477530636335e-05, "loss": 0.0025, "step": 2354 }, { "epoch": 1.765367316341829, "grad_norm": 0.21295167297071438, "learning_rate": 8.6648309127777e-05, "loss": 0.0132, "step": 2355 }, { "epoch": 1.7661169415292353, "grad_norm": 0.08171822983671556, "learning_rate": 8.656185311143756e-05, "loss": 0.0088, "step": 2356 }, { "epoch": 1.7668665667166417, "grad_norm": 0.10411541551669401, "learning_rate": 8.647540732314852e-05, "loss": 0.0168, "step": 2357 }, { "epoch": 1.767616191904048, "grad_norm": 0.08343662762648035, "learning_rate": 8.638897182870551e-05, "loss": 0.0095, "step": 2358 }, { "epoch": 1.7683658170914542, "grad_norm": 0.4010883192599926, "learning_rate": 8.630254669389643e-05, "loss": 0.0507, "step": 2359 }, { "epoch": 1.7691154422788604, "grad_norm": 0.144194524741157, "learning_rate": 8.621613198450118e-05, "loss": 0.0248, "step": 2360 }, { "epoch": 1.7698650674662668, "grad_norm": 0.12433647392529536, "learning_rate": 8.61297277662918e-05, "loss": 0.0278, "step": 2361 }, { "epoch": 1.7706146926536732, "grad_norm": 0.1929365072777267, "learning_rate": 8.604333410503231e-05, "loss": 0.0393, "step": 2362 }, { "epoch": 1.7713643178410794, "grad_norm": 0.1684109117453673, "learning_rate": 8.595695106647872e-05, "loss": 0.0185, "step": 2363 }, { "epoch": 1.7721139430284858, "grad_norm": 0.07411413587451561, "learning_rate": 8.587057871637891e-05, "loss": 0.0089, "step": 2364 }, { "epoch": 1.7728635682158922, "grad_norm": 0.10625695114970526, "learning_rate": 8.578421712047269e-05, "loss": 0.0092, "step": 2365 }, { "epoch": 1.7736131934032984, "grad_norm": 0.12012490456270324, "learning_rate": 8.569786634449162e-05, "loss": 0.0156, "step": 2366 }, { "epoch": 1.7743628185907045, "grad_norm": 0.12171823516493953, "learning_rate": 8.561152645415907e-05, "loss": 0.0109, "step": 2367 }, { "epoch": 1.775112443778111, "grad_norm": 0.1400595657322927, "learning_rate": 8.552519751519008e-05, "loss": 0.0229, "step": 2368 }, { "epoch": 1.7758620689655173, "grad_norm": 0.16177276746735106, "learning_rate": 8.543887959329144e-05, "loss": 0.0219, "step": 2369 }, { "epoch": 1.7766116941529235, "grad_norm": 0.15222405621160293, "learning_rate": 8.535257275416145e-05, "loss": 0.0158, "step": 2370 }, { "epoch": 1.7773613193403297, "grad_norm": 0.05362026788424701, "learning_rate": 8.526627706349002e-05, "loss": 0.0056, "step": 2371 }, { "epoch": 1.778110944527736, "grad_norm": 0.17846732790097433, "learning_rate": 8.517999258695861e-05, "loss": 0.0109, "step": 2372 }, { "epoch": 1.7788605697151425, "grad_norm": 0.09478484403879284, "learning_rate": 8.509371939024011e-05, "loss": 0.0171, "step": 2373 }, { "epoch": 1.7796101949025487, "grad_norm": 0.09896839439022605, "learning_rate": 8.500745753899883e-05, "loss": 0.0093, "step": 2374 }, { "epoch": 1.7803598200899549, "grad_norm": 0.14864187174565247, "learning_rate": 8.492120709889041e-05, "loss": 0.0159, "step": 2375 }, { "epoch": 1.7811094452773615, "grad_norm": 0.20883578340056738, "learning_rate": 8.483496813556187e-05, "loss": 0.02, "step": 2376 }, { "epoch": 1.7818590704647677, "grad_norm": 0.06932679928893044, "learning_rate": 8.474874071465144e-05, "loss": 0.008, "step": 2377 }, { "epoch": 1.7826086956521738, "grad_norm": 0.113622304438288, "learning_rate": 8.466252490178859e-05, "loss": 0.0097, "step": 2378 }, { "epoch": 1.7833583208395802, "grad_norm": 0.10037254661684161, "learning_rate": 8.457632076259395e-05, "loss": 0.0092, "step": 2379 }, { "epoch": 1.7841079460269866, "grad_norm": 0.0823820071462789, "learning_rate": 8.449012836267928e-05, "loss": 0.0131, "step": 2380 }, { "epoch": 1.7848575712143928, "grad_norm": 0.13449999499783072, "learning_rate": 8.440394776764735e-05, "loss": 0.0114, "step": 2381 }, { "epoch": 1.785607196401799, "grad_norm": 0.2303829733454292, "learning_rate": 8.431777904309204e-05, "loss": 0.0263, "step": 2382 }, { "epoch": 1.7863568215892054, "grad_norm": 0.10160743650504148, "learning_rate": 8.423162225459812e-05, "loss": 0.0088, "step": 2383 }, { "epoch": 1.7871064467766118, "grad_norm": 0.09810569209049344, "learning_rate": 8.414547746774129e-05, "loss": 0.0076, "step": 2384 }, { "epoch": 1.787856071964018, "grad_norm": 0.10763901521364005, "learning_rate": 8.405934474808813e-05, "loss": 0.0048, "step": 2385 }, { "epoch": 1.7886056971514241, "grad_norm": 0.08881457612672823, "learning_rate": 8.397322416119602e-05, "loss": 0.009, "step": 2386 }, { "epoch": 1.7893553223388305, "grad_norm": 0.1489624361101976, "learning_rate": 8.388711577261311e-05, "loss": 0.0202, "step": 2387 }, { "epoch": 1.790104947526237, "grad_norm": 0.10090392870455397, "learning_rate": 8.380101964787827e-05, "loss": 0.0112, "step": 2388 }, { "epoch": 1.7908545727136431, "grad_norm": 0.19129451114334656, "learning_rate": 8.371493585252105e-05, "loss": 0.0196, "step": 2389 }, { "epoch": 1.7916041979010495, "grad_norm": 0.19691572762188736, "learning_rate": 8.362886445206159e-05, "loss": 0.0199, "step": 2390 }, { "epoch": 1.792353823088456, "grad_norm": 0.15452745623652986, "learning_rate": 8.354280551201058e-05, "loss": 0.0157, "step": 2391 }, { "epoch": 1.793103448275862, "grad_norm": 0.27046459835526604, "learning_rate": 8.345675909786927e-05, "loss": 0.0201, "step": 2392 }, { "epoch": 1.7938530734632683, "grad_norm": 0.13222942919639982, "learning_rate": 8.337072527512939e-05, "loss": 0.031, "step": 2393 }, { "epoch": 1.7946026986506747, "grad_norm": 0.1387837427042574, "learning_rate": 8.328470410927303e-05, "loss": 0.0271, "step": 2394 }, { "epoch": 1.795352323838081, "grad_norm": 0.09030559122487354, "learning_rate": 8.319869566577261e-05, "loss": 0.0118, "step": 2395 }, { "epoch": 1.7961019490254873, "grad_norm": 0.11471102933534097, "learning_rate": 8.311270001009103e-05, "loss": 0.0082, "step": 2396 }, { "epoch": 1.7968515742128934, "grad_norm": 0.24614017032481977, "learning_rate": 8.302671720768132e-05, "loss": 0.022, "step": 2397 }, { "epoch": 1.7976011994002998, "grad_norm": 0.07106249804454486, "learning_rate": 8.294074732398674e-05, "loss": 0.0084, "step": 2398 }, { "epoch": 1.7983508245877062, "grad_norm": 0.27858692463728674, "learning_rate": 8.285479042444076e-05, "loss": 0.028, "step": 2399 }, { "epoch": 1.7991004497751124, "grad_norm": 0.08144062567952201, "learning_rate": 8.276884657446695e-05, "loss": 0.0123, "step": 2400 }, { "epoch": 1.7998500749625186, "grad_norm": 0.1507172031836174, "learning_rate": 8.268291583947891e-05, "loss": 0.0153, "step": 2401 }, { "epoch": 1.800599700149925, "grad_norm": 0.10025116654911155, "learning_rate": 8.259699828488033e-05, "loss": 0.0087, "step": 2402 }, { "epoch": 1.8013493253373314, "grad_norm": 0.2954824278349512, "learning_rate": 8.251109397606483e-05, "loss": 0.0285, "step": 2403 }, { "epoch": 1.8020989505247376, "grad_norm": 0.11698150619357166, "learning_rate": 8.242520297841592e-05, "loss": 0.0131, "step": 2404 }, { "epoch": 1.802848575712144, "grad_norm": 0.16459252010828768, "learning_rate": 8.2339325357307e-05, "loss": 0.0197, "step": 2405 }, { "epoch": 1.8035982008995504, "grad_norm": 0.15037083494560705, "learning_rate": 8.225346117810134e-05, "loss": 0.017, "step": 2406 }, { "epoch": 1.8043478260869565, "grad_norm": 0.1285506317397058, "learning_rate": 8.216761050615191e-05, "loss": 0.0063, "step": 2407 }, { "epoch": 1.8050974512743627, "grad_norm": 0.11691442119619251, "learning_rate": 8.208177340680144e-05, "loss": 0.0162, "step": 2408 }, { "epoch": 1.8058470764617691, "grad_norm": 0.28019089269121017, "learning_rate": 8.19959499453823e-05, "loss": 0.0437, "step": 2409 }, { "epoch": 1.8065967016491755, "grad_norm": 0.156066382708508, "learning_rate": 8.19101401872165e-05, "loss": 0.0116, "step": 2410 }, { "epoch": 1.8073463268365817, "grad_norm": 0.07752708295458294, "learning_rate": 8.182434419761561e-05, "loss": 0.0068, "step": 2411 }, { "epoch": 1.8080959520239879, "grad_norm": 0.11088280544682265, "learning_rate": 8.173856204188072e-05, "loss": 0.0112, "step": 2412 }, { "epoch": 1.8088455772113943, "grad_norm": 0.06605750093101989, "learning_rate": 8.165279378530242e-05, "loss": 0.0068, "step": 2413 }, { "epoch": 1.8095952023988007, "grad_norm": 0.22384537369273771, "learning_rate": 8.156703949316064e-05, "loss": 0.0183, "step": 2414 }, { "epoch": 1.8103448275862069, "grad_norm": 0.17784538402035052, "learning_rate": 8.148129923072482e-05, "loss": 0.0116, "step": 2415 }, { "epoch": 1.811094452773613, "grad_norm": 0.24431438066392663, "learning_rate": 8.139557306325358e-05, "loss": 0.0218, "step": 2416 }, { "epoch": 1.8118440779610197, "grad_norm": 0.11453677059618239, "learning_rate": 8.13098610559949e-05, "loss": 0.0238, "step": 2417 }, { "epoch": 1.8125937031484258, "grad_norm": 0.1020464200148881, "learning_rate": 8.122416327418594e-05, "loss": 0.0086, "step": 2418 }, { "epoch": 1.813343328335832, "grad_norm": 0.2358627914517762, "learning_rate": 8.113847978305304e-05, "loss": 0.0299, "step": 2419 }, { "epoch": 1.8140929535232384, "grad_norm": 0.14323494877777182, "learning_rate": 8.105281064781165e-05, "loss": 0.0148, "step": 2420 }, { "epoch": 1.8148425787106448, "grad_norm": 0.18538828017344644, "learning_rate": 8.096715593366637e-05, "loss": 0.0243, "step": 2421 }, { "epoch": 1.815592203898051, "grad_norm": 0.26309294444457143, "learning_rate": 8.088151570581076e-05, "loss": 0.0344, "step": 2422 }, { "epoch": 1.8163418290854572, "grad_norm": 0.14817054423968734, "learning_rate": 8.079589002942733e-05, "loss": 0.0146, "step": 2423 }, { "epoch": 1.8170914542728636, "grad_norm": 0.06615204111385492, "learning_rate": 8.071027896968757e-05, "loss": 0.007, "step": 2424 }, { "epoch": 1.81784107946027, "grad_norm": 0.2143277633690874, "learning_rate": 8.06246825917518e-05, "loss": 0.0333, "step": 2425 }, { "epoch": 1.8185907046476761, "grad_norm": 0.20622070273292714, "learning_rate": 8.05391009607692e-05, "loss": 0.0245, "step": 2426 }, { "epoch": 1.8193403298350823, "grad_norm": 0.36423970885157425, "learning_rate": 8.045353414187771e-05, "loss": 0.0443, "step": 2427 }, { "epoch": 1.8200899550224887, "grad_norm": 0.09663320469898391, "learning_rate": 8.036798220020397e-05, "loss": 0.0198, "step": 2428 }, { "epoch": 1.8208395802098951, "grad_norm": 0.11993671368104146, "learning_rate": 8.028244520086337e-05, "loss": 0.0139, "step": 2429 }, { "epoch": 1.8215892053973013, "grad_norm": 0.23012223953056263, "learning_rate": 8.019692320895986e-05, "loss": 0.0212, "step": 2430 }, { "epoch": 1.8223388305847077, "grad_norm": 0.3594473594904879, "learning_rate": 8.0111416289586e-05, "loss": 0.0235, "step": 2431 }, { "epoch": 1.823088455772114, "grad_norm": 0.16619943656526007, "learning_rate": 8.002592450782287e-05, "loss": 0.0169, "step": 2432 }, { "epoch": 1.8238380809595203, "grad_norm": 0.10821109610505597, "learning_rate": 7.994044792874001e-05, "loss": 0.0109, "step": 2433 }, { "epoch": 1.8245877061469264, "grad_norm": 0.10944259492215419, "learning_rate": 7.98549866173954e-05, "loss": 0.0115, "step": 2434 }, { "epoch": 1.8253373313343328, "grad_norm": 0.16858836245980474, "learning_rate": 7.976954063883542e-05, "loss": 0.0105, "step": 2435 }, { "epoch": 1.8260869565217392, "grad_norm": 0.12882911211010897, "learning_rate": 7.968411005809476e-05, "loss": 0.01, "step": 2436 }, { "epoch": 1.8268365817091454, "grad_norm": 0.1869655169698048, "learning_rate": 7.959869494019638e-05, "loss": 0.0165, "step": 2437 }, { "epoch": 1.8275862068965516, "grad_norm": 0.15709456935311567, "learning_rate": 7.951329535015152e-05, "loss": 0.0202, "step": 2438 }, { "epoch": 1.828335832083958, "grad_norm": 0.26854060632952176, "learning_rate": 7.942791135295954e-05, "loss": 0.0137, "step": 2439 }, { "epoch": 1.8290854572713644, "grad_norm": 0.16090283940108951, "learning_rate": 7.934254301360797e-05, "loss": 0.0097, "step": 2440 }, { "epoch": 1.8298350824587706, "grad_norm": 0.0699283924536712, "learning_rate": 7.92571903970724e-05, "loss": 0.0099, "step": 2441 }, { "epoch": 1.8305847076461768, "grad_norm": 0.13273853251021484, "learning_rate": 7.917185356831648e-05, "loss": 0.0194, "step": 2442 }, { "epoch": 1.8313343328335832, "grad_norm": 0.08065492868490784, "learning_rate": 7.908653259229182e-05, "loss": 0.0092, "step": 2443 }, { "epoch": 1.8320839580209896, "grad_norm": 0.22776619342338636, "learning_rate": 7.900122753393794e-05, "loss": 0.0284, "step": 2444 }, { "epoch": 1.8328335832083957, "grad_norm": 0.07395140232369679, "learning_rate": 7.891593845818235e-05, "loss": 0.0136, "step": 2445 }, { "epoch": 1.8335832083958021, "grad_norm": 0.1562336933540751, "learning_rate": 7.883066542994026e-05, "loss": 0.0221, "step": 2446 }, { "epoch": 1.8343328335832085, "grad_norm": 0.2278781011842574, "learning_rate": 7.874540851411477e-05, "loss": 0.0341, "step": 2447 }, { "epoch": 1.8350824587706147, "grad_norm": 0.3267357831045852, "learning_rate": 7.866016777559665e-05, "loss": 0.0387, "step": 2448 }, { "epoch": 1.8358320839580209, "grad_norm": 0.09800556584697193, "learning_rate": 7.857494327926439e-05, "loss": 0.0121, "step": 2449 }, { "epoch": 1.8365817091454273, "grad_norm": 0.2104509100294957, "learning_rate": 7.84897350899841e-05, "loss": 0.0164, "step": 2450 }, { "epoch": 1.8373313343328337, "grad_norm": 0.14104718753315004, "learning_rate": 7.840454327260948e-05, "loss": 0.0171, "step": 2451 }, { "epoch": 1.8380809595202399, "grad_norm": 0.15792235266223165, "learning_rate": 7.83193678919818e-05, "loss": 0.0164, "step": 2452 }, { "epoch": 1.838830584707646, "grad_norm": 0.1687305523880618, "learning_rate": 7.823420901292979e-05, "loss": 0.0173, "step": 2453 }, { "epoch": 1.8395802098950524, "grad_norm": 0.16484720609451164, "learning_rate": 7.814906670026963e-05, "loss": 0.0136, "step": 2454 }, { "epoch": 1.8403298350824588, "grad_norm": 0.15187479733924325, "learning_rate": 7.806394101880489e-05, "loss": 0.0132, "step": 2455 }, { "epoch": 1.841079460269865, "grad_norm": 0.10826429549268864, "learning_rate": 7.797883203332645e-05, "loss": 0.0074, "step": 2456 }, { "epoch": 1.8418290854572712, "grad_norm": 0.08732079300226159, "learning_rate": 7.789373980861256e-05, "loss": 0.0107, "step": 2457 }, { "epoch": 1.8425787106446778, "grad_norm": 0.11564816944872554, "learning_rate": 7.780866440942862e-05, "loss": 0.0133, "step": 2458 }, { "epoch": 1.843328335832084, "grad_norm": 0.1478322613189044, "learning_rate": 7.77236059005273e-05, "loss": 0.0126, "step": 2459 }, { "epoch": 1.8440779610194902, "grad_norm": 0.12555558944479056, "learning_rate": 7.763856434664837e-05, "loss": 0.0106, "step": 2460 }, { "epoch": 1.8448275862068966, "grad_norm": 0.16809980334871072, "learning_rate": 7.755353981251867e-05, "loss": 0.0201, "step": 2461 }, { "epoch": 1.845577211394303, "grad_norm": 0.15373833992286495, "learning_rate": 7.74685323628522e-05, "loss": 0.0194, "step": 2462 }, { "epoch": 1.8463268365817092, "grad_norm": 0.105119977223022, "learning_rate": 7.738354206234984e-05, "loss": 0.0101, "step": 2463 }, { "epoch": 1.8470764617691153, "grad_norm": 0.19700610621960898, "learning_rate": 7.729856897569947e-05, "loss": 0.0187, "step": 2464 }, { "epoch": 1.8478260869565217, "grad_norm": 0.1627518862213529, "learning_rate": 7.721361316757584e-05, "loss": 0.0209, "step": 2465 }, { "epoch": 1.8485757121439281, "grad_norm": 0.07826135196893862, "learning_rate": 7.712867470264058e-05, "loss": 0.0082, "step": 2466 }, { "epoch": 1.8493253373313343, "grad_norm": 0.11640801846060447, "learning_rate": 7.704375364554209e-05, "loss": 0.0092, "step": 2467 }, { "epoch": 1.8500749625187405, "grad_norm": 0.1629061980346892, "learning_rate": 7.695885006091552e-05, "loss": 0.0226, "step": 2468 }, { "epoch": 1.8508245877061469, "grad_norm": 0.07211534313473594, "learning_rate": 7.68739640133828e-05, "loss": 0.006, "step": 2469 }, { "epoch": 1.8515742128935533, "grad_norm": 0.0693554167568505, "learning_rate": 7.678909556755239e-05, "loss": 0.0092, "step": 2470 }, { "epoch": 1.8523238380809595, "grad_norm": 0.11865432701123758, "learning_rate": 7.670424478801945e-05, "loss": 0.0169, "step": 2471 }, { "epoch": 1.8530734632683659, "grad_norm": 0.08334899698876731, "learning_rate": 7.661941173936564e-05, "loss": 0.0073, "step": 2472 }, { "epoch": 1.8538230884557723, "grad_norm": 0.24857862856866222, "learning_rate": 7.653459648615915e-05, "loss": 0.0199, "step": 2473 }, { "epoch": 1.8545727136431784, "grad_norm": 0.2482687204364232, "learning_rate": 7.644979909295458e-05, "loss": 0.0208, "step": 2474 }, { "epoch": 1.8553223388305846, "grad_norm": 0.19686906475653315, "learning_rate": 7.636501962429301e-05, "loss": 0.0159, "step": 2475 }, { "epoch": 1.856071964017991, "grad_norm": 0.10028826224834073, "learning_rate": 7.628025814470183e-05, "loss": 0.0139, "step": 2476 }, { "epoch": 1.8568215892053974, "grad_norm": 0.07742702749673577, "learning_rate": 7.619551471869476e-05, "loss": 0.0053, "step": 2477 }, { "epoch": 1.8575712143928036, "grad_norm": 0.14772903844749294, "learning_rate": 7.611078941077174e-05, "loss": 0.0192, "step": 2478 }, { "epoch": 1.8583208395802098, "grad_norm": 1.4667137683554423, "learning_rate": 7.602608228541898e-05, "loss": 0.0407, "step": 2479 }, { "epoch": 1.8590704647676162, "grad_norm": 0.12049533052266952, "learning_rate": 7.594139340710878e-05, "loss": 0.0098, "step": 2480 }, { "epoch": 1.8598200899550226, "grad_norm": 0.10478542644073217, "learning_rate": 7.585672284029962e-05, "loss": 0.0209, "step": 2481 }, { "epoch": 1.8605697151424287, "grad_norm": 0.16305850275521108, "learning_rate": 7.577207064943599e-05, "loss": 0.0201, "step": 2482 }, { "epoch": 1.861319340329835, "grad_norm": 0.12384962164874974, "learning_rate": 7.568743689894844e-05, "loss": 0.0101, "step": 2483 }, { "epoch": 1.8620689655172413, "grad_norm": 0.16888590528757494, "learning_rate": 7.560282165325342e-05, "loss": 0.0139, "step": 2484 }, { "epoch": 1.8628185907046477, "grad_norm": 0.09058292457085314, "learning_rate": 7.551822497675339e-05, "loss": 0.008, "step": 2485 }, { "epoch": 1.863568215892054, "grad_norm": 0.1359901168221532, "learning_rate": 7.54336469338366e-05, "loss": 0.0132, "step": 2486 }, { "epoch": 1.8643178410794603, "grad_norm": 0.0636307909796474, "learning_rate": 7.534908758887714e-05, "loss": 0.0045, "step": 2487 }, { "epoch": 1.8650674662668667, "grad_norm": 0.11762048285293739, "learning_rate": 7.526454700623484e-05, "loss": 0.0132, "step": 2488 }, { "epoch": 1.8658170914542729, "grad_norm": 0.14584312270419342, "learning_rate": 7.518002525025533e-05, "loss": 0.0111, "step": 2489 }, { "epoch": 1.866566716641679, "grad_norm": 0.057156129657237674, "learning_rate": 7.509552238526983e-05, "loss": 0.0052, "step": 2490 }, { "epoch": 1.8673163418290855, "grad_norm": 0.12061801169326321, "learning_rate": 7.501103847559523e-05, "loss": 0.0088, "step": 2491 }, { "epoch": 1.8680659670164919, "grad_norm": 0.3208573344470076, "learning_rate": 7.49265735855339e-05, "loss": 0.0237, "step": 2492 }, { "epoch": 1.868815592203898, "grad_norm": 0.08171237944606213, "learning_rate": 7.484212777937391e-05, "loss": 0.005, "step": 2493 }, { "epoch": 1.8695652173913042, "grad_norm": 0.10375078006941929, "learning_rate": 7.475770112138866e-05, "loss": 0.0075, "step": 2494 }, { "epoch": 1.8703148425787106, "grad_norm": 0.09544820985988885, "learning_rate": 7.467329367583705e-05, "loss": 0.01, "step": 2495 }, { "epoch": 1.871064467766117, "grad_norm": 0.05263185304535703, "learning_rate": 7.458890550696327e-05, "loss": 0.0067, "step": 2496 }, { "epoch": 1.8718140929535232, "grad_norm": 0.20191098821286693, "learning_rate": 7.45045366789969e-05, "loss": 0.0273, "step": 2497 }, { "epoch": 1.8725637181409296, "grad_norm": 0.1398162193094364, "learning_rate": 7.442018725615281e-05, "loss": 0.0097, "step": 2498 }, { "epoch": 1.873313343328336, "grad_norm": 0.07068993934881566, "learning_rate": 7.43358573026311e-05, "loss": 0.0063, "step": 2499 }, { "epoch": 1.8740629685157422, "grad_norm": 0.22625443784283683, "learning_rate": 7.425154688261702e-05, "loss": 0.0266, "step": 2500 }, { "epoch": 1.8748125937031483, "grad_norm": 0.12500420397821987, "learning_rate": 7.416725606028098e-05, "loss": 0.0075, "step": 2501 }, { "epoch": 1.8755622188905547, "grad_norm": 0.23427172980645034, "learning_rate": 7.408298489977843e-05, "loss": 0.0249, "step": 2502 }, { "epoch": 1.8763118440779611, "grad_norm": 0.11898534192386219, "learning_rate": 7.39987334652499e-05, "loss": 0.0076, "step": 2503 }, { "epoch": 1.8770614692653673, "grad_norm": 0.08373734719913158, "learning_rate": 7.39145018208209e-05, "loss": 0.0074, "step": 2504 }, { "epoch": 1.8778110944527735, "grad_norm": 0.09218030773502157, "learning_rate": 7.383029003060187e-05, "loss": 0.0055, "step": 2505 }, { "epoch": 1.87856071964018, "grad_norm": 0.16710726389458688, "learning_rate": 7.374609815868811e-05, "loss": 0.0113, "step": 2506 }, { "epoch": 1.8793103448275863, "grad_norm": 0.5572105217760861, "learning_rate": 7.366192626915981e-05, "loss": 0.0298, "step": 2507 }, { "epoch": 1.8800599700149925, "grad_norm": 0.14773496446583861, "learning_rate": 7.357777442608195e-05, "loss": 0.0178, "step": 2508 }, { "epoch": 1.8808095952023987, "grad_norm": 0.10878118667541402, "learning_rate": 7.34936426935042e-05, "loss": 0.011, "step": 2509 }, { "epoch": 1.881559220389805, "grad_norm": 0.3147812075243767, "learning_rate": 7.340953113546099e-05, "loss": 0.0275, "step": 2510 }, { "epoch": 1.8823088455772115, "grad_norm": 0.0634699160517354, "learning_rate": 7.332543981597131e-05, "loss": 0.0057, "step": 2511 }, { "epoch": 1.8830584707646176, "grad_norm": 0.1037282284127577, "learning_rate": 7.324136879903882e-05, "loss": 0.0059, "step": 2512 }, { "epoch": 1.883808095952024, "grad_norm": 0.28102259056009204, "learning_rate": 7.315731814865172e-05, "loss": 0.0418, "step": 2513 }, { "epoch": 1.8845577211394304, "grad_norm": 0.18082904773371083, "learning_rate": 7.307328792878265e-05, "loss": 0.0109, "step": 2514 }, { "epoch": 1.8853073463268366, "grad_norm": 0.27733764852475407, "learning_rate": 7.298927820338878e-05, "loss": 0.0129, "step": 2515 }, { "epoch": 1.8860569715142428, "grad_norm": 0.051804527098103306, "learning_rate": 7.290528903641158e-05, "loss": 0.0055, "step": 2516 }, { "epoch": 1.8868065967016492, "grad_norm": 0.1645664449774772, "learning_rate": 7.282132049177702e-05, "loss": 0.0125, "step": 2517 }, { "epoch": 1.8875562218890556, "grad_norm": 0.11093437917334256, "learning_rate": 7.273737263339526e-05, "loss": 0.0128, "step": 2518 }, { "epoch": 1.8883058470764618, "grad_norm": 0.1369863753526021, "learning_rate": 7.265344552516073e-05, "loss": 0.0171, "step": 2519 }, { "epoch": 1.889055472263868, "grad_norm": 0.1396122038028002, "learning_rate": 7.256953923095209e-05, "loss": 0.0133, "step": 2520 }, { "epoch": 1.8898050974512743, "grad_norm": 0.16666444163694127, "learning_rate": 7.248565381463211e-05, "loss": 0.0241, "step": 2521 }, { "epoch": 1.8905547226386807, "grad_norm": 0.09667613590644915, "learning_rate": 7.240178934004778e-05, "loss": 0.0084, "step": 2522 }, { "epoch": 1.891304347826087, "grad_norm": 0.28358846469437893, "learning_rate": 7.231794587103004e-05, "loss": 0.0534, "step": 2523 }, { "epoch": 1.892053973013493, "grad_norm": 0.2908858163773263, "learning_rate": 7.223412347139386e-05, "loss": 0.0159, "step": 2524 }, { "epoch": 1.8928035982008995, "grad_norm": 0.24064697503110768, "learning_rate": 7.215032220493825e-05, "loss": 0.014, "step": 2525 }, { "epoch": 1.893553223388306, "grad_norm": 0.3016866461196789, "learning_rate": 7.206654213544603e-05, "loss": 0.0119, "step": 2526 }, { "epoch": 1.894302848575712, "grad_norm": 0.2621412679543309, "learning_rate": 7.198278332668397e-05, "loss": 0.0357, "step": 2527 }, { "epoch": 1.8950524737631185, "grad_norm": 0.14120466768101003, "learning_rate": 7.189904584240258e-05, "loss": 0.0234, "step": 2528 }, { "epoch": 1.8958020989505249, "grad_norm": 0.09072777647343415, "learning_rate": 7.181532974633625e-05, "loss": 0.0103, "step": 2529 }, { "epoch": 1.896551724137931, "grad_norm": 0.14239900493075416, "learning_rate": 7.173163510220295e-05, "loss": 0.032, "step": 2530 }, { "epoch": 1.8973013493253372, "grad_norm": 0.19534998045802685, "learning_rate": 7.164796197370444e-05, "loss": 0.0221, "step": 2531 }, { "epoch": 1.8980509745127436, "grad_norm": 0.11225916161140852, "learning_rate": 7.156431042452609e-05, "loss": 0.0153, "step": 2532 }, { "epoch": 1.89880059970015, "grad_norm": 0.18199528832938758, "learning_rate": 7.148068051833676e-05, "loss": 0.034, "step": 2533 }, { "epoch": 1.8995502248875562, "grad_norm": 0.13537685599687774, "learning_rate": 7.139707231878893e-05, "loss": 0.0115, "step": 2534 }, { "epoch": 1.9002998500749624, "grad_norm": 0.07626680894580906, "learning_rate": 7.131348588951851e-05, "loss": 0.0101, "step": 2535 }, { "epoch": 1.9010494752623688, "grad_norm": 0.11597308993208941, "learning_rate": 7.122992129414485e-05, "loss": 0.0151, "step": 2536 }, { "epoch": 1.9017991004497752, "grad_norm": 0.09926554517236055, "learning_rate": 7.11463785962707e-05, "loss": 0.0091, "step": 2537 }, { "epoch": 1.9025487256371814, "grad_norm": 0.5994144403129555, "learning_rate": 7.10628578594821e-05, "loss": 0.0256, "step": 2538 }, { "epoch": 1.9032983508245878, "grad_norm": 0.21192680149026147, "learning_rate": 7.097935914734843e-05, "loss": 0.0247, "step": 2539 }, { "epoch": 1.9040479760119942, "grad_norm": 0.13979130816746754, "learning_rate": 7.089588252342222e-05, "loss": 0.0131, "step": 2540 }, { "epoch": 1.9047976011994003, "grad_norm": 0.1522540153218576, "learning_rate": 7.081242805123935e-05, "loss": 0.0138, "step": 2541 }, { "epoch": 1.9055472263868065, "grad_norm": 0.3475730065259247, "learning_rate": 7.072899579431868e-05, "loss": 0.0238, "step": 2542 }, { "epoch": 1.906296851574213, "grad_norm": 0.1584546299657851, "learning_rate": 7.064558581616223e-05, "loss": 0.0176, "step": 2543 }, { "epoch": 1.9070464767616193, "grad_norm": 0.12628040865730267, "learning_rate": 7.056219818025504e-05, "loss": 0.0204, "step": 2544 }, { "epoch": 1.9077961019490255, "grad_norm": 0.158931488712341, "learning_rate": 7.047883295006516e-05, "loss": 0.0138, "step": 2545 }, { "epoch": 1.9085457271364317, "grad_norm": 0.140558609819867, "learning_rate": 7.03954901890436e-05, "loss": 0.0122, "step": 2546 }, { "epoch": 1.909295352323838, "grad_norm": 0.07932710076810945, "learning_rate": 7.031216996062426e-05, "loss": 0.0134, "step": 2547 }, { "epoch": 1.9100449775112445, "grad_norm": 0.08371999708812815, "learning_rate": 7.022887232822384e-05, "loss": 0.0104, "step": 2548 }, { "epoch": 1.9107946026986506, "grad_norm": 0.13582280816137274, "learning_rate": 7.01455973552419e-05, "loss": 0.0083, "step": 2549 }, { "epoch": 1.9115442278860568, "grad_norm": 0.10624807259301532, "learning_rate": 7.006234510506076e-05, "loss": 0.0119, "step": 2550 }, { "epoch": 1.9122938530734632, "grad_norm": 0.15543251325160595, "learning_rate": 6.997911564104541e-05, "loss": 0.0152, "step": 2551 }, { "epoch": 1.9130434782608696, "grad_norm": 0.0921239950798916, "learning_rate": 6.989590902654349e-05, "loss": 0.0088, "step": 2552 }, { "epoch": 1.9137931034482758, "grad_norm": 0.41388948225918165, "learning_rate": 6.981272532488529e-05, "loss": 0.0568, "step": 2553 }, { "epoch": 1.9145427286356822, "grad_norm": 0.100566670663976, "learning_rate": 6.97295645993836e-05, "loss": 0.0108, "step": 2554 }, { "epoch": 1.9152923538230886, "grad_norm": 0.09049484776525325, "learning_rate": 6.964642691333383e-05, "loss": 0.014, "step": 2555 }, { "epoch": 1.9160419790104948, "grad_norm": 0.07059096508827815, "learning_rate": 6.956331233001374e-05, "loss": 0.0056, "step": 2556 }, { "epoch": 1.916791604197901, "grad_norm": 0.3242025573619945, "learning_rate": 6.948022091268356e-05, "loss": 0.0338, "step": 2557 }, { "epoch": 1.9175412293853074, "grad_norm": 0.1967536374678093, "learning_rate": 6.93971527245859e-05, "loss": 0.0181, "step": 2558 }, { "epoch": 1.9182908545727138, "grad_norm": 0.09964807966099669, "learning_rate": 6.931410782894562e-05, "loss": 0.0055, "step": 2559 }, { "epoch": 1.91904047976012, "grad_norm": 0.1056229679621337, "learning_rate": 6.923108628896995e-05, "loss": 0.0109, "step": 2560 }, { "epoch": 1.919790104947526, "grad_norm": 0.2675761383101984, "learning_rate": 6.914808816784826e-05, "loss": 0.0175, "step": 2561 }, { "epoch": 1.9205397301349325, "grad_norm": 0.07748755906739309, "learning_rate": 6.906511352875216e-05, "loss": 0.0079, "step": 2562 }, { "epoch": 1.921289355322339, "grad_norm": 0.07310728412254099, "learning_rate": 6.89821624348353e-05, "loss": 0.011, "step": 2563 }, { "epoch": 1.922038980509745, "grad_norm": 0.0957840682527892, "learning_rate": 6.889923494923352e-05, "loss": 0.0084, "step": 2564 }, { "epoch": 1.9227886056971513, "grad_norm": 0.36935961501338727, "learning_rate": 6.881633113506463e-05, "loss": 0.0422, "step": 2565 }, { "epoch": 1.9235382308845579, "grad_norm": 0.16293408389417657, "learning_rate": 6.873345105542842e-05, "loss": 0.0141, "step": 2566 }, { "epoch": 1.924287856071964, "grad_norm": 0.13888112998811752, "learning_rate": 6.865059477340662e-05, "loss": 0.0138, "step": 2567 }, { "epoch": 1.9250374812593702, "grad_norm": 0.289847503646193, "learning_rate": 6.856776235206288e-05, "loss": 0.0334, "step": 2568 }, { "epoch": 1.9257871064467766, "grad_norm": 0.12933008451268538, "learning_rate": 6.848495385444265e-05, "loss": 0.01, "step": 2569 }, { "epoch": 1.926536731634183, "grad_norm": 0.16191506189682173, "learning_rate": 6.840216934357318e-05, "loss": 0.018, "step": 2570 }, { "epoch": 1.9272863568215892, "grad_norm": 0.18943025035089628, "learning_rate": 6.831940888246344e-05, "loss": 0.0138, "step": 2571 }, { "epoch": 1.9280359820089954, "grad_norm": 0.1061248577111503, "learning_rate": 6.823667253410417e-05, "loss": 0.0143, "step": 2572 }, { "epoch": 1.9287856071964018, "grad_norm": 0.12445565669107847, "learning_rate": 6.815396036146767e-05, "loss": 0.0078, "step": 2573 }, { "epoch": 1.9295352323838082, "grad_norm": 0.110562071079399, "learning_rate": 6.80712724275079e-05, "loss": 0.0091, "step": 2574 }, { "epoch": 1.9302848575712144, "grad_norm": 0.11468291069499537, "learning_rate": 6.798860879516035e-05, "loss": 0.0141, "step": 2575 }, { "epoch": 1.9310344827586206, "grad_norm": 0.0934202521942271, "learning_rate": 6.790596952734199e-05, "loss": 0.0099, "step": 2576 }, { "epoch": 1.931784107946027, "grad_norm": 0.0929281480526714, "learning_rate": 6.782335468695127e-05, "loss": 0.0117, "step": 2577 }, { "epoch": 1.9325337331334334, "grad_norm": 0.10176325279678822, "learning_rate": 6.774076433686809e-05, "loss": 0.0103, "step": 2578 }, { "epoch": 1.9332833583208395, "grad_norm": 0.21709643686481206, "learning_rate": 6.765819853995364e-05, "loss": 0.0403, "step": 2579 }, { "epoch": 1.934032983508246, "grad_norm": 0.17589843106110198, "learning_rate": 6.757565735905043e-05, "loss": 0.0217, "step": 2580 }, { "epoch": 1.9347826086956523, "grad_norm": 0.11434728905622538, "learning_rate": 6.749314085698229e-05, "loss": 0.0223, "step": 2581 }, { "epoch": 1.9355322338830585, "grad_norm": 0.14487603652395825, "learning_rate": 6.74106490965542e-05, "loss": 0.0083, "step": 2582 }, { "epoch": 1.9362818590704647, "grad_norm": 0.1289344714288651, "learning_rate": 6.732818214055233e-05, "loss": 0.0103, "step": 2583 }, { "epoch": 1.937031484257871, "grad_norm": 0.11890488272970408, "learning_rate": 6.7245740051744e-05, "loss": 0.0125, "step": 2584 }, { "epoch": 1.9377811094452775, "grad_norm": 0.09133697176455517, "learning_rate": 6.716332289287759e-05, "loss": 0.0145, "step": 2585 }, { "epoch": 1.9385307346326837, "grad_norm": 0.109331379422392, "learning_rate": 6.70809307266825e-05, "loss": 0.0168, "step": 2586 }, { "epoch": 1.9392803598200898, "grad_norm": 0.20016189865941686, "learning_rate": 6.699856361586905e-05, "loss": 0.0332, "step": 2587 }, { "epoch": 1.9400299850074962, "grad_norm": 0.18856090528988828, "learning_rate": 6.69162216231286e-05, "loss": 0.0121, "step": 2588 }, { "epoch": 1.9407796101949026, "grad_norm": 0.09007171352149351, "learning_rate": 6.68339048111333e-05, "loss": 0.0126, "step": 2589 }, { "epoch": 1.9415292353823088, "grad_norm": 0.10575179313755036, "learning_rate": 6.675161324253623e-05, "loss": 0.0096, "step": 2590 }, { "epoch": 1.942278860569715, "grad_norm": 0.11672959400915874, "learning_rate": 6.666934697997113e-05, "loss": 0.016, "step": 2591 }, { "epoch": 1.9430284857571214, "grad_norm": 0.1559846824352955, "learning_rate": 6.658710608605259e-05, "loss": 0.0165, "step": 2592 }, { "epoch": 1.9437781109445278, "grad_norm": 0.16962939613420358, "learning_rate": 6.650489062337584e-05, "loss": 0.029, "step": 2593 }, { "epoch": 1.944527736131934, "grad_norm": 0.12100652827555615, "learning_rate": 6.642270065451677e-05, "loss": 0.0035, "step": 2594 }, { "epoch": 1.9452773613193404, "grad_norm": 0.22173551476179107, "learning_rate": 6.634053624203182e-05, "loss": 0.02, "step": 2595 }, { "epoch": 1.9460269865067468, "grad_norm": 0.16418019953890273, "learning_rate": 6.625839744845807e-05, "loss": 0.0092, "step": 2596 }, { "epoch": 1.946776611694153, "grad_norm": 0.08129102733379626, "learning_rate": 6.6176284336313e-05, "loss": 0.0058, "step": 2597 }, { "epoch": 1.9475262368815591, "grad_norm": 0.1808712429174813, "learning_rate": 6.609419696809462e-05, "loss": 0.0113, "step": 2598 }, { "epoch": 1.9482758620689655, "grad_norm": 0.11860297035413558, "learning_rate": 6.601213540628131e-05, "loss": 0.0092, "step": 2599 }, { "epoch": 1.949025487256372, "grad_norm": 0.13540127438080782, "learning_rate": 6.593009971333179e-05, "loss": 0.0101, "step": 2600 }, { "epoch": 1.949775112443778, "grad_norm": 0.10447994790470146, "learning_rate": 6.584808995168515e-05, "loss": 0.0084, "step": 2601 }, { "epoch": 1.9505247376311843, "grad_norm": 0.15497645159707168, "learning_rate": 6.576610618376071e-05, "loss": 0.0117, "step": 2602 }, { "epoch": 1.9512743628185907, "grad_norm": 0.09902340572842126, "learning_rate": 6.568414847195798e-05, "loss": 0.0057, "step": 2603 }, { "epoch": 1.952023988005997, "grad_norm": 0.1239254369290511, "learning_rate": 6.560221687865669e-05, "loss": 0.0131, "step": 2604 }, { "epoch": 1.9527736131934033, "grad_norm": 0.19577055242023142, "learning_rate": 6.552031146621664e-05, "loss": 0.0149, "step": 2605 }, { "epoch": 1.9535232383808094, "grad_norm": 0.14953347555603855, "learning_rate": 6.543843229697772e-05, "loss": 0.0175, "step": 2606 }, { "epoch": 1.954272863568216, "grad_norm": 0.16228867246023873, "learning_rate": 6.535657943325988e-05, "loss": 0.0149, "step": 2607 }, { "epoch": 1.9550224887556222, "grad_norm": 0.18842441467737261, "learning_rate": 6.527475293736301e-05, "loss": 0.0153, "step": 2608 }, { "epoch": 1.9557721139430284, "grad_norm": 0.11570043501747544, "learning_rate": 6.519295287156692e-05, "loss": 0.0073, "step": 2609 }, { "epoch": 1.9565217391304348, "grad_norm": 0.10125016943360128, "learning_rate": 6.511117929813135e-05, "loss": 0.0113, "step": 2610 }, { "epoch": 1.9572713643178412, "grad_norm": 0.27885175542005797, "learning_rate": 6.502943227929586e-05, "loss": 0.024, "step": 2611 }, { "epoch": 1.9580209895052474, "grad_norm": 0.12116488059211396, "learning_rate": 6.494771187727975e-05, "loss": 0.0265, "step": 2612 }, { "epoch": 1.9587706146926536, "grad_norm": 0.23356339007737725, "learning_rate": 6.486601815428213e-05, "loss": 0.0214, "step": 2613 }, { "epoch": 1.95952023988006, "grad_norm": 0.16281117081626795, "learning_rate": 6.478435117248175e-05, "loss": 0.0282, "step": 2614 }, { "epoch": 1.9602698650674664, "grad_norm": 0.14826397127572882, "learning_rate": 6.470271099403709e-05, "loss": 0.0172, "step": 2615 }, { "epoch": 1.9610194902548725, "grad_norm": 0.16639910228693838, "learning_rate": 6.462109768108612e-05, "loss": 0.0155, "step": 2616 }, { "epoch": 1.9617691154422787, "grad_norm": 0.12878213613918643, "learning_rate": 6.453951129574644e-05, "loss": 0.0047, "step": 2617 }, { "epoch": 1.9625187406296851, "grad_norm": 0.23620891755437637, "learning_rate": 6.44579519001151e-05, "loss": 0.0191, "step": 2618 }, { "epoch": 1.9632683658170915, "grad_norm": 0.47003799940530394, "learning_rate": 6.437641955626867e-05, "loss": 0.0346, "step": 2619 }, { "epoch": 1.9640179910044977, "grad_norm": 0.10834439872357306, "learning_rate": 6.429491432626306e-05, "loss": 0.0074, "step": 2620 }, { "epoch": 1.964767616191904, "grad_norm": 0.30263593519837156, "learning_rate": 6.42134362721336e-05, "loss": 0.0345, "step": 2621 }, { "epoch": 1.9655172413793105, "grad_norm": 0.09886349728154153, "learning_rate": 6.413198545589493e-05, "loss": 0.0086, "step": 2622 }, { "epoch": 1.9662668665667167, "grad_norm": 0.11214795676054878, "learning_rate": 6.405056193954092e-05, "loss": 0.0159, "step": 2623 }, { "epoch": 1.9670164917541229, "grad_norm": 0.08743795758245536, "learning_rate": 6.396916578504467e-05, "loss": 0.0087, "step": 2624 }, { "epoch": 1.9677661169415293, "grad_norm": 0.1203291476144883, "learning_rate": 6.388779705435852e-05, "loss": 0.0124, "step": 2625 }, { "epoch": 1.9685157421289357, "grad_norm": 0.13694478120673487, "learning_rate": 6.380645580941385e-05, "loss": 0.0144, "step": 2626 }, { "epoch": 1.9692653673163418, "grad_norm": 0.20963426631790033, "learning_rate": 6.372514211212116e-05, "loss": 0.0204, "step": 2627 }, { "epoch": 1.970014992503748, "grad_norm": 0.10474781082841776, "learning_rate": 6.364385602437e-05, "loss": 0.0113, "step": 2628 }, { "epoch": 1.9707646176911544, "grad_norm": 0.12605094387952967, "learning_rate": 6.356259760802886e-05, "loss": 0.0206, "step": 2629 }, { "epoch": 1.9715142428785608, "grad_norm": 0.10242238045281239, "learning_rate": 6.348136692494519e-05, "loss": 0.0084, "step": 2630 }, { "epoch": 1.972263868065967, "grad_norm": 0.4949245664636189, "learning_rate": 6.340016403694537e-05, "loss": 0.0146, "step": 2631 }, { "epoch": 1.9730134932533732, "grad_norm": 0.24511036686641371, "learning_rate": 6.331898900583458e-05, "loss": 0.022, "step": 2632 }, { "epoch": 1.9737631184407796, "grad_norm": 0.07186690641381413, "learning_rate": 6.323784189339677e-05, "loss": 0.0064, "step": 2633 }, { "epoch": 1.974512743628186, "grad_norm": 0.19864117764803166, "learning_rate": 6.315672276139474e-05, "loss": 0.0375, "step": 2634 }, { "epoch": 1.9752623688155921, "grad_norm": 0.27687594522339515, "learning_rate": 6.307563167156988e-05, "loss": 0.0272, "step": 2635 }, { "epoch": 1.9760119940029985, "grad_norm": 0.1775523484598218, "learning_rate": 6.299456868564235e-05, "loss": 0.0188, "step": 2636 }, { "epoch": 1.976761619190405, "grad_norm": 0.12004662288287847, "learning_rate": 6.291353386531074e-05, "loss": 0.0099, "step": 2637 }, { "epoch": 1.9775112443778111, "grad_norm": 0.3073542949957989, "learning_rate": 6.283252727225245e-05, "loss": 0.0338, "step": 2638 }, { "epoch": 1.9782608695652173, "grad_norm": 0.21760402679314944, "learning_rate": 6.275154896812323e-05, "loss": 0.0111, "step": 2639 }, { "epoch": 1.9790104947526237, "grad_norm": 0.04176693977010684, "learning_rate": 6.26705990145573e-05, "loss": 0.0038, "step": 2640 }, { "epoch": 1.97976011994003, "grad_norm": 0.08343985178592077, "learning_rate": 6.258967747316738e-05, "loss": 0.0042, "step": 2641 }, { "epoch": 1.9805097451274363, "grad_norm": 0.09999060085646638, "learning_rate": 6.250878440554448e-05, "loss": 0.0141, "step": 2642 }, { "epoch": 1.9812593703148424, "grad_norm": 0.096873305567595, "learning_rate": 6.2427919873258e-05, "loss": 0.0073, "step": 2643 }, { "epoch": 1.9820089955022488, "grad_norm": 0.11376993253947391, "learning_rate": 6.234708393785563e-05, "loss": 0.0085, "step": 2644 }, { "epoch": 1.9827586206896552, "grad_norm": 0.13578217581490346, "learning_rate": 6.22662766608632e-05, "loss": 0.0277, "step": 2645 }, { "epoch": 1.9835082458770614, "grad_norm": 0.10521727242358125, "learning_rate": 6.218549810378485e-05, "loss": 0.0215, "step": 2646 }, { "epoch": 1.9842578710644676, "grad_norm": 0.13965904247520539, "learning_rate": 6.210474832810276e-05, "loss": 0.012, "step": 2647 }, { "epoch": 1.9850074962518742, "grad_norm": 0.11152054461686513, "learning_rate": 6.202402739527729e-05, "loss": 0.0198, "step": 2648 }, { "epoch": 1.9857571214392804, "grad_norm": 0.1477238847811376, "learning_rate": 6.19433353667468e-05, "loss": 0.0209, "step": 2649 }, { "epoch": 1.9865067466266866, "grad_norm": 0.19783507828736474, "learning_rate": 6.186267230392762e-05, "loss": 0.0241, "step": 2650 }, { "epoch": 1.987256371814093, "grad_norm": 0.13984137791498613, "learning_rate": 6.178203826821409e-05, "loss": 0.0167, "step": 2651 }, { "epoch": 1.9880059970014994, "grad_norm": 0.1739092071743353, "learning_rate": 6.170143332097843e-05, "loss": 0.0264, "step": 2652 }, { "epoch": 1.9887556221889056, "grad_norm": 0.080112857244923, "learning_rate": 6.162085752357076e-05, "loss": 0.0105, "step": 2653 }, { "epoch": 1.9895052473763117, "grad_norm": 0.1773606487440933, "learning_rate": 6.154031093731894e-05, "loss": 0.0189, "step": 2654 }, { "epoch": 1.9902548725637181, "grad_norm": 0.08855210818751634, "learning_rate": 6.145979362352862e-05, "loss": 0.0138, "step": 2655 }, { "epoch": 1.9910044977511245, "grad_norm": 0.09639909177775023, "learning_rate": 6.137930564348322e-05, "loss": 0.0125, "step": 2656 }, { "epoch": 1.9917541229385307, "grad_norm": 0.08980465027172353, "learning_rate": 6.12988470584438e-05, "loss": 0.0117, "step": 2657 }, { "epoch": 1.992503748125937, "grad_norm": 0.1926546990612494, "learning_rate": 6.121841792964901e-05, "loss": 0.0122, "step": 2658 }, { "epoch": 1.9932533733133433, "grad_norm": 0.15440555314144272, "learning_rate": 6.113801831831518e-05, "loss": 0.0156, "step": 2659 }, { "epoch": 1.9940029985007497, "grad_norm": 0.09339135907497718, "learning_rate": 6.105764828563607e-05, "loss": 0.0069, "step": 2660 }, { "epoch": 1.9947526236881559, "grad_norm": 0.2918423018977411, "learning_rate": 6.097730789278292e-05, "loss": 0.0532, "step": 2661 }, { "epoch": 1.9955022488755623, "grad_norm": 0.10195907150794488, "learning_rate": 6.089699720090455e-05, "loss": 0.0067, "step": 2662 }, { "epoch": 1.9962518740629687, "grad_norm": 0.1235237078063373, "learning_rate": 6.081671627112704e-05, "loss": 0.0094, "step": 2663 }, { "epoch": 1.9970014992503748, "grad_norm": 0.07032145735999436, "learning_rate": 6.073646516455387e-05, "loss": 0.006, "step": 2664 }, { "epoch": 1.997751124437781, "grad_norm": 0.05541573305685681, "learning_rate": 6.065624394226579e-05, "loss": 0.0059, "step": 2665 }, { "epoch": 1.9985007496251874, "grad_norm": 0.10269356709036573, "learning_rate": 6.057605266532084e-05, "loss": 0.018, "step": 2666 }, { "epoch": 1.9992503748125938, "grad_norm": 0.21964472646986308, "learning_rate": 6.049589139475424e-05, "loss": 0.0181, "step": 2667 }, { "epoch": 2.0, "grad_norm": 0.11976827179422106, "learning_rate": 6.04157601915784e-05, "loss": 0.028, "step": 2668 }, { "epoch": 2.0, "eval_loss": 0.029895687475800514, "eval_runtime": 1872.329, "eval_samples_per_second": 5.542, "eval_steps_per_second": 0.693, "step": 2668 }, { "epoch": 2.000749625187406, "grad_norm": 0.12920673369246413, "learning_rate": 6.0335659116782825e-05, "loss": 0.0149, "step": 2669 }, { "epoch": 2.001499250374813, "grad_norm": 0.0920728899931142, "learning_rate": 6.0255588231334056e-05, "loss": 0.0103, "step": 2670 }, { "epoch": 2.002248875562219, "grad_norm": 0.08511625814268858, "learning_rate": 6.017554759617575e-05, "loss": 0.0081, "step": 2671 }, { "epoch": 2.002998500749625, "grad_norm": 0.05220608860817644, "learning_rate": 6.009553727222846e-05, "loss": 0.0049, "step": 2672 }, { "epoch": 2.0037481259370313, "grad_norm": 0.4497006749186715, "learning_rate": 6.0015557320389695e-05, "loss": 0.0193, "step": 2673 }, { "epoch": 2.004497751124438, "grad_norm": 0.04101615202066704, "learning_rate": 5.993560780153384e-05, "loss": 0.0024, "step": 2674 }, { "epoch": 2.005247376311844, "grad_norm": 0.08383419790926522, "learning_rate": 5.985568877651211e-05, "loss": 0.0089, "step": 2675 }, { "epoch": 2.0059970014992503, "grad_norm": 0.1879415339577029, "learning_rate": 5.977580030615254e-05, "loss": 0.0078, "step": 2676 }, { "epoch": 2.0067466266866565, "grad_norm": 0.031140431853141123, "learning_rate": 5.969594245125988e-05, "loss": 0.0024, "step": 2677 }, { "epoch": 2.007496251874063, "grad_norm": 0.08823839659496005, "learning_rate": 5.961611527261559e-05, "loss": 0.0118, "step": 2678 }, { "epoch": 2.0082458770614693, "grad_norm": 0.07251362090044315, "learning_rate": 5.953631883097777e-05, "loss": 0.0063, "step": 2679 }, { "epoch": 2.0089955022488755, "grad_norm": 0.06889138777796779, "learning_rate": 5.9456553187081146e-05, "loss": 0.0063, "step": 2680 }, { "epoch": 2.0097451274362816, "grad_norm": 0.17612485680220627, "learning_rate": 5.937681840163698e-05, "loss": 0.0164, "step": 2681 }, { "epoch": 2.0104947526236883, "grad_norm": 0.18739782304747712, "learning_rate": 5.92971145353331e-05, "loss": 0.0118, "step": 2682 }, { "epoch": 2.0112443778110944, "grad_norm": 0.14708308312536258, "learning_rate": 5.9217441648833714e-05, "loss": 0.0115, "step": 2683 }, { "epoch": 2.0119940029985006, "grad_norm": 0.15935890992827711, "learning_rate": 5.913779980277951e-05, "loss": 0.0159, "step": 2684 }, { "epoch": 2.0127436281859072, "grad_norm": 0.02942228231752664, "learning_rate": 5.905818905778753e-05, "loss": 0.0019, "step": 2685 }, { "epoch": 2.0134932533733134, "grad_norm": 0.11449594933651004, "learning_rate": 5.8978609474451176e-05, "loss": 0.0137, "step": 2686 }, { "epoch": 2.0142428785607196, "grad_norm": 0.16924864902098877, "learning_rate": 5.8899061113340117e-05, "loss": 0.0257, "step": 2687 }, { "epoch": 2.0149925037481258, "grad_norm": 0.11744959118368047, "learning_rate": 5.881954403500021e-05, "loss": 0.0245, "step": 2688 }, { "epoch": 2.0157421289355324, "grad_norm": 0.08773876902030525, "learning_rate": 5.874005829995357e-05, "loss": 0.002, "step": 2689 }, { "epoch": 2.0164917541229386, "grad_norm": 0.08362034791556955, "learning_rate": 5.8660603968698416e-05, "loss": 0.0069, "step": 2690 }, { "epoch": 2.0172413793103448, "grad_norm": 0.1398009462913696, "learning_rate": 5.858118110170906e-05, "loss": 0.0143, "step": 2691 }, { "epoch": 2.017991004497751, "grad_norm": 0.15455916642541287, "learning_rate": 5.850178975943591e-05, "loss": 0.0062, "step": 2692 }, { "epoch": 2.0187406296851576, "grad_norm": 0.06891922749265855, "learning_rate": 5.842243000230532e-05, "loss": 0.0045, "step": 2693 }, { "epoch": 2.0194902548725637, "grad_norm": 0.17892536666544281, "learning_rate": 5.83431018907197e-05, "loss": 0.0194, "step": 2694 }, { "epoch": 2.02023988005997, "grad_norm": 0.11063179206160058, "learning_rate": 5.8263805485057235e-05, "loss": 0.0083, "step": 2695 }, { "epoch": 2.0209895052473765, "grad_norm": 0.09425654621063746, "learning_rate": 5.8184540845672085e-05, "loss": 0.0043, "step": 2696 }, { "epoch": 2.0217391304347827, "grad_norm": 0.09750334625327865, "learning_rate": 5.8105308032894216e-05, "loss": 0.0079, "step": 2697 }, { "epoch": 2.022488755622189, "grad_norm": 0.0794308388984354, "learning_rate": 5.802610710702927e-05, "loss": 0.0046, "step": 2698 }, { "epoch": 2.023238380809595, "grad_norm": 0.09362722963585723, "learning_rate": 5.794693812835883e-05, "loss": 0.0049, "step": 2699 }, { "epoch": 2.0239880059970017, "grad_norm": 0.06944043013400002, "learning_rate": 5.7867801157139965e-05, "loss": 0.0044, "step": 2700 }, { "epoch": 2.024737631184408, "grad_norm": 0.049746400026874464, "learning_rate": 5.778869625360546e-05, "loss": 0.0052, "step": 2701 }, { "epoch": 2.025487256371814, "grad_norm": 0.3734118178269151, "learning_rate": 5.7709623477963694e-05, "loss": 0.0091, "step": 2702 }, { "epoch": 2.02623688155922, "grad_norm": 0.4872267575568091, "learning_rate": 5.763058289039859e-05, "loss": 0.0376, "step": 2703 }, { "epoch": 2.026986506746627, "grad_norm": 0.057540021071794865, "learning_rate": 5.7551574551069554e-05, "loss": 0.0033, "step": 2704 }, { "epoch": 2.027736131934033, "grad_norm": 0.15688083528482955, "learning_rate": 5.7472598520111464e-05, "loss": 0.0102, "step": 2705 }, { "epoch": 2.028485757121439, "grad_norm": 0.129700793724637, "learning_rate": 5.73936548576346e-05, "loss": 0.0118, "step": 2706 }, { "epoch": 2.0292353823088454, "grad_norm": 0.09155531612100912, "learning_rate": 5.731474362372462e-05, "loss": 0.0088, "step": 2707 }, { "epoch": 2.029985007496252, "grad_norm": 0.0830634850988361, "learning_rate": 5.723586487844248e-05, "loss": 0.005, "step": 2708 }, { "epoch": 2.030734632683658, "grad_norm": 0.18270164608031197, "learning_rate": 5.7157018681824434e-05, "loss": 0.0041, "step": 2709 }, { "epoch": 2.0314842578710643, "grad_norm": 0.1053575328747016, "learning_rate": 5.7078205093881885e-05, "loss": 0.0087, "step": 2710 }, { "epoch": 2.032233883058471, "grad_norm": 0.04528738014190908, "learning_rate": 5.699942417460158e-05, "loss": 0.0055, "step": 2711 }, { "epoch": 2.032983508245877, "grad_norm": 0.16142423316077267, "learning_rate": 5.692067598394523e-05, "loss": 0.0065, "step": 2712 }, { "epoch": 2.0337331334332833, "grad_norm": 0.1840512166065806, "learning_rate": 5.684196058184972e-05, "loss": 0.0058, "step": 2713 }, { "epoch": 2.0344827586206895, "grad_norm": 0.09386316725998878, "learning_rate": 5.6763278028226986e-05, "loss": 0.0041, "step": 2714 }, { "epoch": 2.035232383808096, "grad_norm": 0.2704026073642783, "learning_rate": 5.66846283829639e-05, "loss": 0.0142, "step": 2715 }, { "epoch": 2.0359820089955023, "grad_norm": 0.053751714799146394, "learning_rate": 5.660601170592236e-05, "loss": 0.0027, "step": 2716 }, { "epoch": 2.0367316341829085, "grad_norm": 0.1665132193521529, "learning_rate": 5.6527428056939113e-05, "loss": 0.0058, "step": 2717 }, { "epoch": 2.0374812593703147, "grad_norm": 0.0837882288789223, "learning_rate": 5.644887749582581e-05, "loss": 0.0056, "step": 2718 }, { "epoch": 2.0382308845577213, "grad_norm": 0.17344687620216664, "learning_rate": 5.6370360082368915e-05, "loss": 0.012, "step": 2719 }, { "epoch": 2.0389805097451275, "grad_norm": 0.08545712755787346, "learning_rate": 5.62918758763296e-05, "loss": 0.0033, "step": 2720 }, { "epoch": 2.0397301349325336, "grad_norm": 0.15846203084186525, "learning_rate": 5.621342493744381e-05, "loss": 0.009, "step": 2721 }, { "epoch": 2.04047976011994, "grad_norm": 0.13787943275064699, "learning_rate": 5.613500732542225e-05, "loss": 0.0058, "step": 2722 }, { "epoch": 2.0412293853073464, "grad_norm": 0.1739206890447071, "learning_rate": 5.605662309995012e-05, "loss": 0.0102, "step": 2723 }, { "epoch": 2.0419790104947526, "grad_norm": 0.14476344648170317, "learning_rate": 5.5978272320687286e-05, "loss": 0.0146, "step": 2724 }, { "epoch": 2.042728635682159, "grad_norm": 0.08895660251404246, "learning_rate": 5.589995504726814e-05, "loss": 0.0059, "step": 2725 }, { "epoch": 2.0434782608695654, "grad_norm": 0.034946758693803905, "learning_rate": 5.582167133930156e-05, "loss": 0.0016, "step": 2726 }, { "epoch": 2.0442278860569716, "grad_norm": 0.05293482370990212, "learning_rate": 5.574342125637089e-05, "loss": 0.0027, "step": 2727 }, { "epoch": 2.0449775112443778, "grad_norm": 0.1539671338465271, "learning_rate": 5.566520485803388e-05, "loss": 0.021, "step": 2728 }, { "epoch": 2.045727136431784, "grad_norm": 0.259356096848114, "learning_rate": 5.558702220382266e-05, "loss": 0.0131, "step": 2729 }, { "epoch": 2.0464767616191906, "grad_norm": 0.047567614571502104, "learning_rate": 5.550887335324363e-05, "loss": 0.0026, "step": 2730 }, { "epoch": 2.0472263868065967, "grad_norm": 0.14037541136100798, "learning_rate": 5.5430758365777514e-05, "loss": 0.0079, "step": 2731 }, { "epoch": 2.047976011994003, "grad_norm": 0.11024060952702228, "learning_rate": 5.535267730087922e-05, "loss": 0.0079, "step": 2732 }, { "epoch": 2.048725637181409, "grad_norm": 0.1868952649017366, "learning_rate": 5.527463021797786e-05, "loss": 0.0151, "step": 2733 }, { "epoch": 2.0494752623688157, "grad_norm": 0.0685871767217177, "learning_rate": 5.519661717647664e-05, "loss": 0.004, "step": 2734 }, { "epoch": 2.050224887556222, "grad_norm": 0.36893532451261046, "learning_rate": 5.511863823575298e-05, "loss": 0.03, "step": 2735 }, { "epoch": 2.050974512743628, "grad_norm": 0.1024390603249343, "learning_rate": 5.5040693455158206e-05, "loss": 0.0064, "step": 2736 }, { "epoch": 2.0517241379310347, "grad_norm": 0.06003674834006714, "learning_rate": 5.496278289401771e-05, "loss": 0.0048, "step": 2737 }, { "epoch": 2.052473763118441, "grad_norm": 0.0676091350613358, "learning_rate": 5.488490661163079e-05, "loss": 0.0039, "step": 2738 }, { "epoch": 2.053223388305847, "grad_norm": 0.06811684148720669, "learning_rate": 5.480706466727074e-05, "loss": 0.004, "step": 2739 }, { "epoch": 2.0539730134932532, "grad_norm": 0.22457808229890616, "learning_rate": 5.472925712018462e-05, "loss": 0.023, "step": 2740 }, { "epoch": 2.05472263868066, "grad_norm": 0.14011710230532487, "learning_rate": 5.465148402959339e-05, "loss": 0.0165, "step": 2741 }, { "epoch": 2.055472263868066, "grad_norm": 0.09396544964614165, "learning_rate": 5.457374545469174e-05, "loss": 0.0068, "step": 2742 }, { "epoch": 2.056221889055472, "grad_norm": 0.20625852590985544, "learning_rate": 5.4496041454648086e-05, "loss": 0.0085, "step": 2743 }, { "epoch": 2.0569715142428784, "grad_norm": 0.20411992021066738, "learning_rate": 5.441837208860456e-05, "loss": 0.0088, "step": 2744 }, { "epoch": 2.057721139430285, "grad_norm": 0.051337110665052886, "learning_rate": 5.4340737415676866e-05, "loss": 0.0042, "step": 2745 }, { "epoch": 2.058470764617691, "grad_norm": 0.24525192857059822, "learning_rate": 5.426313749495447e-05, "loss": 0.0079, "step": 2746 }, { "epoch": 2.0592203898050974, "grad_norm": 0.15175784875891044, "learning_rate": 5.418557238550016e-05, "loss": 0.0051, "step": 2747 }, { "epoch": 2.0599700149925035, "grad_norm": 0.04180425435431888, "learning_rate": 5.410804214635039e-05, "loss": 0.0022, "step": 2748 }, { "epoch": 2.06071964017991, "grad_norm": 0.15208975614765013, "learning_rate": 5.403054683651502e-05, "loss": 0.0077, "step": 2749 }, { "epoch": 2.0614692653673163, "grad_norm": 0.0745353317998331, "learning_rate": 5.39530865149773e-05, "loss": 0.0045, "step": 2750 }, { "epoch": 2.0622188905547225, "grad_norm": 0.0857172018999341, "learning_rate": 5.387566124069391e-05, "loss": 0.005, "step": 2751 }, { "epoch": 2.062968515742129, "grad_norm": 0.07316934882364763, "learning_rate": 5.3798271072594786e-05, "loss": 0.0047, "step": 2752 }, { "epoch": 2.0637181409295353, "grad_norm": 0.141839675930256, "learning_rate": 5.372091606958321e-05, "loss": 0.0148, "step": 2753 }, { "epoch": 2.0644677661169415, "grad_norm": 0.1318488328244515, "learning_rate": 5.364359629053566e-05, "loss": 0.0162, "step": 2754 }, { "epoch": 2.0652173913043477, "grad_norm": 0.2125988213178984, "learning_rate": 5.3566311794301784e-05, "loss": 0.0229, "step": 2755 }, { "epoch": 2.0659670164917543, "grad_norm": 0.11935852353236064, "learning_rate": 5.3489062639704434e-05, "loss": 0.0033, "step": 2756 }, { "epoch": 2.0667166416791605, "grad_norm": 0.045018097123316676, "learning_rate": 5.341184888553954e-05, "loss": 0.002, "step": 2757 }, { "epoch": 2.0674662668665666, "grad_norm": 0.17893936598833707, "learning_rate": 5.333467059057602e-05, "loss": 0.0158, "step": 2758 }, { "epoch": 2.068215892053973, "grad_norm": 0.1267157060412366, "learning_rate": 5.325752781355593e-05, "loss": 0.0067, "step": 2759 }, { "epoch": 2.0689655172413794, "grad_norm": 0.11327073400568807, "learning_rate": 5.3180420613194226e-05, "loss": 0.0086, "step": 2760 }, { "epoch": 2.0697151424287856, "grad_norm": 0.026122915286548982, "learning_rate": 5.310334904817878e-05, "loss": 0.002, "step": 2761 }, { "epoch": 2.070464767616192, "grad_norm": 0.0624022067515344, "learning_rate": 5.3026313177170303e-05, "loss": 0.0019, "step": 2762 }, { "epoch": 2.071214392803598, "grad_norm": 0.07979267478998299, "learning_rate": 5.294931305880243e-05, "loss": 0.0075, "step": 2763 }, { "epoch": 2.0719640179910046, "grad_norm": 0.34235288461531843, "learning_rate": 5.287234875168152e-05, "loss": 0.0096, "step": 2764 }, { "epoch": 2.072713643178411, "grad_norm": 0.053751697993312225, "learning_rate": 5.27954203143867e-05, "loss": 0.0036, "step": 2765 }, { "epoch": 2.073463268365817, "grad_norm": 0.09879108309949992, "learning_rate": 5.271852780546977e-05, "loss": 0.006, "step": 2766 }, { "epoch": 2.0742128935532236, "grad_norm": 0.11817567253208243, "learning_rate": 5.264167128345523e-05, "loss": 0.0175, "step": 2767 }, { "epoch": 2.0749625187406298, "grad_norm": 0.07336750448769226, "learning_rate": 5.256485080684011e-05, "loss": 0.005, "step": 2768 }, { "epoch": 2.075712143928036, "grad_norm": 0.07420326577156641, "learning_rate": 5.2488066434094116e-05, "loss": 0.0062, "step": 2769 }, { "epoch": 2.076461769115442, "grad_norm": 0.11458244964011671, "learning_rate": 5.241131822365941e-05, "loss": 0.0123, "step": 2770 }, { "epoch": 2.0772113943028487, "grad_norm": 0.08138043870500519, "learning_rate": 5.233460623395061e-05, "loss": 0.0051, "step": 2771 }, { "epoch": 2.077961019490255, "grad_norm": 0.06648218497103654, "learning_rate": 5.225793052335479e-05, "loss": 0.0056, "step": 2772 }, { "epoch": 2.078710644677661, "grad_norm": 0.10901113040322437, "learning_rate": 5.218129115023145e-05, "loss": 0.0049, "step": 2773 }, { "epoch": 2.0794602698650673, "grad_norm": 0.15590821022125223, "learning_rate": 5.210468817291234e-05, "loss": 0.0135, "step": 2774 }, { "epoch": 2.080209895052474, "grad_norm": 0.15411055638884816, "learning_rate": 5.2028121649701614e-05, "loss": 0.0066, "step": 2775 }, { "epoch": 2.08095952023988, "grad_norm": 0.1844008895299288, "learning_rate": 5.1951591638875605e-05, "loss": 0.0211, "step": 2776 }, { "epoch": 2.0817091454272862, "grad_norm": 0.17972940945558835, "learning_rate": 5.187509819868288e-05, "loss": 0.0118, "step": 2777 }, { "epoch": 2.082458770614693, "grad_norm": 0.1818504662636259, "learning_rate": 5.1798641387344184e-05, "loss": 0.013, "step": 2778 }, { "epoch": 2.083208395802099, "grad_norm": 0.13366106028225347, "learning_rate": 5.172222126305235e-05, "loss": 0.0139, "step": 2779 }, { "epoch": 2.0839580209895052, "grad_norm": 0.047564458389697646, "learning_rate": 5.164583788397234e-05, "loss": 0.0023, "step": 2780 }, { "epoch": 2.0847076461769114, "grad_norm": 0.08583461517386021, "learning_rate": 5.1569491308241094e-05, "loss": 0.0077, "step": 2781 }, { "epoch": 2.085457271364318, "grad_norm": 0.08232377535332387, "learning_rate": 5.149318159396752e-05, "loss": 0.0052, "step": 2782 }, { "epoch": 2.086206896551724, "grad_norm": 0.12920830793795776, "learning_rate": 5.14169087992326e-05, "loss": 0.0116, "step": 2783 }, { "epoch": 2.0869565217391304, "grad_norm": 0.28023458386351713, "learning_rate": 5.134067298208909e-05, "loss": 0.0283, "step": 2784 }, { "epoch": 2.0877061469265366, "grad_norm": 0.11795585858524446, "learning_rate": 5.126447420056165e-05, "loss": 0.0048, "step": 2785 }, { "epoch": 2.088455772113943, "grad_norm": 0.19569779614562738, "learning_rate": 5.118831251264673e-05, "loss": 0.0164, "step": 2786 }, { "epoch": 2.0892053973013494, "grad_norm": 0.1391438714541468, "learning_rate": 5.111218797631256e-05, "loss": 0.0037, "step": 2787 }, { "epoch": 2.0899550224887555, "grad_norm": 0.12083354676952428, "learning_rate": 5.103610064949911e-05, "loss": 0.0238, "step": 2788 }, { "epoch": 2.0907046476761617, "grad_norm": 0.11756022743007565, "learning_rate": 5.096005059011799e-05, "loss": 0.0073, "step": 2789 }, { "epoch": 2.0914542728635683, "grad_norm": 0.05654885803442007, "learning_rate": 5.088403785605247e-05, "loss": 0.0028, "step": 2790 }, { "epoch": 2.0922038980509745, "grad_norm": 0.07698581805694782, "learning_rate": 5.080806250515737e-05, "loss": 0.007, "step": 2791 }, { "epoch": 2.0929535232383807, "grad_norm": 0.2584924571449496, "learning_rate": 5.073212459525918e-05, "loss": 0.019, "step": 2792 }, { "epoch": 2.0937031484257873, "grad_norm": 0.19263855328707216, "learning_rate": 5.065622418415577e-05, "loss": 0.0177, "step": 2793 }, { "epoch": 2.0944527736131935, "grad_norm": 0.19491448696329708, "learning_rate": 5.058036132961649e-05, "loss": 0.0228, "step": 2794 }, { "epoch": 2.0952023988005997, "grad_norm": 0.07619742031197467, "learning_rate": 5.0504536089382124e-05, "loss": 0.0056, "step": 2795 }, { "epoch": 2.095952023988006, "grad_norm": 0.22193470194375173, "learning_rate": 5.042874852116485e-05, "loss": 0.0124, "step": 2796 }, { "epoch": 2.0967016491754125, "grad_norm": 0.14331149167371612, "learning_rate": 5.035299868264811e-05, "loss": 0.0065, "step": 2797 }, { "epoch": 2.0974512743628186, "grad_norm": 0.172637392773366, "learning_rate": 5.02772866314867e-05, "loss": 0.0142, "step": 2798 }, { "epoch": 2.098200899550225, "grad_norm": 0.12579988048753785, "learning_rate": 5.0201612425306597e-05, "loss": 0.0058, "step": 2799 }, { "epoch": 2.098950524737631, "grad_norm": 0.10993273901263476, "learning_rate": 5.012597612170502e-05, "loss": 0.0023, "step": 2800 }, { "epoch": 2.0997001499250376, "grad_norm": 0.065006704120964, "learning_rate": 5.005037777825029e-05, "loss": 0.0054, "step": 2801 }, { "epoch": 2.100449775112444, "grad_norm": 0.08488518929085496, "learning_rate": 4.9974817452481885e-05, "loss": 0.0094, "step": 2802 }, { "epoch": 2.10119940029985, "grad_norm": 0.06996280682513732, "learning_rate": 4.989929520191031e-05, "loss": 0.0044, "step": 2803 }, { "epoch": 2.101949025487256, "grad_norm": 0.10614255374552094, "learning_rate": 4.982381108401712e-05, "loss": 0.0051, "step": 2804 }, { "epoch": 2.1026986506746628, "grad_norm": 0.08931334364433949, "learning_rate": 4.974836515625484e-05, "loss": 0.0097, "step": 2805 }, { "epoch": 2.103448275862069, "grad_norm": 0.09602657010448278, "learning_rate": 4.967295747604685e-05, "loss": 0.0045, "step": 2806 }, { "epoch": 2.104197901049475, "grad_norm": 0.07385611806369675, "learning_rate": 4.9597588100787585e-05, "loss": 0.0031, "step": 2807 }, { "epoch": 2.1049475262368817, "grad_norm": 0.03093310014099065, "learning_rate": 4.9522257087842196e-05, "loss": 0.0011, "step": 2808 }, { "epoch": 2.105697151424288, "grad_norm": 0.11316962936701583, "learning_rate": 4.9446964494546655e-05, "loss": 0.0081, "step": 2809 }, { "epoch": 2.106446776611694, "grad_norm": 0.05960739161768324, "learning_rate": 4.93717103782077e-05, "loss": 0.002, "step": 2810 }, { "epoch": 2.1071964017991003, "grad_norm": 0.13704128945073163, "learning_rate": 4.929649479610282e-05, "loss": 0.0098, "step": 2811 }, { "epoch": 2.107946026986507, "grad_norm": 0.17973157057976616, "learning_rate": 4.9221317805480115e-05, "loss": 0.0141, "step": 2812 }, { "epoch": 2.108695652173913, "grad_norm": 0.05729033581567592, "learning_rate": 4.914617946355835e-05, "loss": 0.0036, "step": 2813 }, { "epoch": 2.1094452773613193, "grad_norm": 0.04930374438074895, "learning_rate": 4.907107982752684e-05, "loss": 0.0021, "step": 2814 }, { "epoch": 2.1101949025487254, "grad_norm": 0.10265639378615501, "learning_rate": 4.899601895454551e-05, "loss": 0.0116, "step": 2815 }, { "epoch": 2.110944527736132, "grad_norm": 0.17308963975557312, "learning_rate": 4.892099690174472e-05, "loss": 0.0151, "step": 2816 }, { "epoch": 2.1116941529235382, "grad_norm": 0.14137961791206421, "learning_rate": 4.8846013726225295e-05, "loss": 0.0126, "step": 2817 }, { "epoch": 2.1124437781109444, "grad_norm": 0.19629272020786975, "learning_rate": 4.8771069485058486e-05, "loss": 0.0043, "step": 2818 }, { "epoch": 2.113193403298351, "grad_norm": 0.1248330816683146, "learning_rate": 4.869616423528588e-05, "loss": 0.0061, "step": 2819 }, { "epoch": 2.113943028485757, "grad_norm": 0.07729755271577264, "learning_rate": 4.86212980339194e-05, "loss": 0.0042, "step": 2820 }, { "epoch": 2.1146926536731634, "grad_norm": 0.1215731772373695, "learning_rate": 4.854647093794129e-05, "loss": 0.016, "step": 2821 }, { "epoch": 2.1154422788605696, "grad_norm": 0.3168985126036469, "learning_rate": 4.847168300430397e-05, "loss": 0.0081, "step": 2822 }, { "epoch": 2.116191904047976, "grad_norm": 0.18347302630220838, "learning_rate": 4.8396934289930064e-05, "loss": 0.0163, "step": 2823 }, { "epoch": 2.1169415292353824, "grad_norm": 0.09802171455989667, "learning_rate": 4.8322224851712386e-05, "loss": 0.0037, "step": 2824 }, { "epoch": 2.1176911544227885, "grad_norm": 0.11466768689466496, "learning_rate": 4.82475547465138e-05, "loss": 0.0107, "step": 2825 }, { "epoch": 2.1184407796101947, "grad_norm": 0.15766024712357615, "learning_rate": 4.817292403116729e-05, "loss": 0.0073, "step": 2826 }, { "epoch": 2.1191904047976013, "grad_norm": 0.08553525297480681, "learning_rate": 4.8098332762475804e-05, "loss": 0.0057, "step": 2827 }, { "epoch": 2.1199400299850075, "grad_norm": 0.08917342986018453, "learning_rate": 4.802378099721232e-05, "loss": 0.0054, "step": 2828 }, { "epoch": 2.1206896551724137, "grad_norm": 0.017895313173530512, "learning_rate": 4.79492687921197e-05, "loss": 0.0005, "step": 2829 }, { "epoch": 2.12143928035982, "grad_norm": 0.15167899697433201, "learning_rate": 4.787479620391068e-05, "loss": 0.0068, "step": 2830 }, { "epoch": 2.1221889055472265, "grad_norm": 0.14085737808607107, "learning_rate": 4.780036328926797e-05, "loss": 0.0081, "step": 2831 }, { "epoch": 2.1229385307346327, "grad_norm": 0.2396864168220026, "learning_rate": 4.772597010484395e-05, "loss": 0.0185, "step": 2832 }, { "epoch": 2.123688155922039, "grad_norm": 0.08151691227139982, "learning_rate": 4.7651616707260815e-05, "loss": 0.005, "step": 2833 }, { "epoch": 2.1244377811094455, "grad_norm": 0.07783032987855823, "learning_rate": 4.757730315311045e-05, "loss": 0.0023, "step": 2834 }, { "epoch": 2.1251874062968517, "grad_norm": 0.07764939687149099, "learning_rate": 4.750302949895444e-05, "loss": 0.0033, "step": 2835 }, { "epoch": 2.125937031484258, "grad_norm": 0.12788204639640396, "learning_rate": 4.7428795801324e-05, "loss": 0.0073, "step": 2836 }, { "epoch": 2.126686656671664, "grad_norm": 0.14298556685133668, "learning_rate": 4.73546021167199e-05, "loss": 0.0091, "step": 2837 }, { "epoch": 2.1274362818590706, "grad_norm": 0.2196406994123255, "learning_rate": 4.7280448501612485e-05, "loss": 0.0046, "step": 2838 }, { "epoch": 2.128185907046477, "grad_norm": 0.07390369972391603, "learning_rate": 4.720633501244165e-05, "loss": 0.0024, "step": 2839 }, { "epoch": 2.128935532233883, "grad_norm": 0.21273204391802142, "learning_rate": 4.713226170561665e-05, "loss": 0.0181, "step": 2840 }, { "epoch": 2.129685157421289, "grad_norm": 0.07017065388286854, "learning_rate": 4.705822863751622e-05, "loss": 0.0032, "step": 2841 }, { "epoch": 2.130434782608696, "grad_norm": 0.08423142817528334, "learning_rate": 4.698423586448846e-05, "loss": 0.0039, "step": 2842 }, { "epoch": 2.131184407796102, "grad_norm": 0.07577123486070463, "learning_rate": 4.691028344285078e-05, "loss": 0.0062, "step": 2843 }, { "epoch": 2.131934032983508, "grad_norm": 0.10405964473337807, "learning_rate": 4.68363714288899e-05, "loss": 0.0057, "step": 2844 }, { "epoch": 2.1326836581709143, "grad_norm": 0.06725575601961063, "learning_rate": 4.6762499878861764e-05, "loss": 0.002, "step": 2845 }, { "epoch": 2.133433283358321, "grad_norm": 0.11374990049739897, "learning_rate": 4.668866884899154e-05, "loss": 0.0121, "step": 2846 }, { "epoch": 2.134182908545727, "grad_norm": 0.028375524722939163, "learning_rate": 4.661487839547355e-05, "loss": 0.0022, "step": 2847 }, { "epoch": 2.1349325337331333, "grad_norm": 0.1701170626738846, "learning_rate": 4.654112857447121e-05, "loss": 0.0086, "step": 2848 }, { "epoch": 2.13568215892054, "grad_norm": 0.04823068839835657, "learning_rate": 4.6467419442117046e-05, "loss": 0.0023, "step": 2849 }, { "epoch": 2.136431784107946, "grad_norm": 0.24689421196932598, "learning_rate": 4.639375105451259e-05, "loss": 0.0219, "step": 2850 }, { "epoch": 2.1371814092953523, "grad_norm": 0.054759916362402856, "learning_rate": 4.632012346772837e-05, "loss": 0.0017, "step": 2851 }, { "epoch": 2.1379310344827585, "grad_norm": 0.043825477313645465, "learning_rate": 4.624653673780386e-05, "loss": 0.0016, "step": 2852 }, { "epoch": 2.138680659670165, "grad_norm": 0.10507495229072013, "learning_rate": 4.617299092074744e-05, "loss": 0.0077, "step": 2853 }, { "epoch": 2.1394302848575713, "grad_norm": 0.14357076370983768, "learning_rate": 4.6099486072536334e-05, "loss": 0.0073, "step": 2854 }, { "epoch": 2.1401799100449774, "grad_norm": 0.1592531766218126, "learning_rate": 4.602602224911656e-05, "loss": 0.0254, "step": 2855 }, { "epoch": 2.1409295352323836, "grad_norm": 0.02049681399116887, "learning_rate": 4.595259950640304e-05, "loss": 0.0012, "step": 2856 }, { "epoch": 2.1416791604197902, "grad_norm": 0.07794412146800708, "learning_rate": 4.587921790027927e-05, "loss": 0.006, "step": 2857 }, { "epoch": 2.1424287856071964, "grad_norm": 0.09195761790497897, "learning_rate": 4.580587748659752e-05, "loss": 0.0111, "step": 2858 }, { "epoch": 2.1431784107946026, "grad_norm": 0.16849596417897122, "learning_rate": 4.57325783211787e-05, "loss": 0.0044, "step": 2859 }, { "epoch": 2.143928035982009, "grad_norm": 0.054270439033802784, "learning_rate": 4.5659320459812296e-05, "loss": 0.0026, "step": 2860 }, { "epoch": 2.1446776611694154, "grad_norm": 0.13695792493236258, "learning_rate": 4.5586103958256344e-05, "loss": 0.0118, "step": 2861 }, { "epoch": 2.1454272863568216, "grad_norm": 0.058343514676726244, "learning_rate": 4.55129288722375e-05, "loss": 0.0027, "step": 2862 }, { "epoch": 2.1461769115442277, "grad_norm": 0.06574267821808014, "learning_rate": 4.5439795257450804e-05, "loss": 0.0039, "step": 2863 }, { "epoch": 2.1469265367316344, "grad_norm": 0.10332596227148909, "learning_rate": 4.536670316955974e-05, "loss": 0.0033, "step": 2864 }, { "epoch": 2.1476761619190405, "grad_norm": 0.16550320030944354, "learning_rate": 4.5293652664196215e-05, "loss": 0.0083, "step": 2865 }, { "epoch": 2.1484257871064467, "grad_norm": 0.1822057557002658, "learning_rate": 4.522064379696048e-05, "loss": 0.0071, "step": 2866 }, { "epoch": 2.149175412293853, "grad_norm": 0.17039104462095303, "learning_rate": 4.5147676623421076e-05, "loss": 0.0081, "step": 2867 }, { "epoch": 2.1499250374812595, "grad_norm": 0.1184742262621423, "learning_rate": 4.507475119911482e-05, "loss": 0.0045, "step": 2868 }, { "epoch": 2.1506746626686657, "grad_norm": 0.06221806454687913, "learning_rate": 4.500186757954675e-05, "loss": 0.0021, "step": 2869 }, { "epoch": 2.151424287856072, "grad_norm": 0.08515968083332011, "learning_rate": 4.4929025820190105e-05, "loss": 0.0063, "step": 2870 }, { "epoch": 2.1521739130434785, "grad_norm": 0.3366899734636794, "learning_rate": 4.485622597648624e-05, "loss": 0.0339, "step": 2871 }, { "epoch": 2.1529235382308847, "grad_norm": 0.2936218445138808, "learning_rate": 4.4783468103844615e-05, "loss": 0.0214, "step": 2872 }, { "epoch": 2.153673163418291, "grad_norm": 0.026388492126442124, "learning_rate": 4.471075225764275e-05, "loss": 0.0013, "step": 2873 }, { "epoch": 2.154422788605697, "grad_norm": 0.12018222361567683, "learning_rate": 4.4638078493226176e-05, "loss": 0.0086, "step": 2874 }, { "epoch": 2.1551724137931036, "grad_norm": 0.08220028929729363, "learning_rate": 4.4565446865908376e-05, "loss": 0.0053, "step": 2875 }, { "epoch": 2.15592203898051, "grad_norm": 0.08399578333992191, "learning_rate": 4.44928574309708e-05, "loss": 0.0059, "step": 2876 }, { "epoch": 2.156671664167916, "grad_norm": 0.1404142689687109, "learning_rate": 4.442031024366274e-05, "loss": 0.0115, "step": 2877 }, { "epoch": 2.157421289355322, "grad_norm": 0.054640401967214404, "learning_rate": 4.434780535920138e-05, "loss": 0.0026, "step": 2878 }, { "epoch": 2.158170914542729, "grad_norm": 0.10806349724783954, "learning_rate": 4.427534283277162e-05, "loss": 0.0052, "step": 2879 }, { "epoch": 2.158920539730135, "grad_norm": 0.043779809246776935, "learning_rate": 4.420292271952626e-05, "loss": 0.0033, "step": 2880 }, { "epoch": 2.159670164917541, "grad_norm": 0.07964633918529888, "learning_rate": 4.413054507458571e-05, "loss": 0.0047, "step": 2881 }, { "epoch": 2.1604197901049473, "grad_norm": 0.18538411473729013, "learning_rate": 4.4058209953038086e-05, "loss": 0.0158, "step": 2882 }, { "epoch": 2.161169415292354, "grad_norm": 0.4188366014235997, "learning_rate": 4.398591740993914e-05, "loss": 0.0078, "step": 2883 }, { "epoch": 2.16191904047976, "grad_norm": 0.10735652365189656, "learning_rate": 4.391366750031217e-05, "loss": 0.006, "step": 2884 }, { "epoch": 2.1626686656671663, "grad_norm": 0.07968789225277143, "learning_rate": 4.384146027914815e-05, "loss": 0.0049, "step": 2885 }, { "epoch": 2.1634182908545725, "grad_norm": 0.04463803516349911, "learning_rate": 4.376929580140544e-05, "loss": 0.0024, "step": 2886 }, { "epoch": 2.164167916041979, "grad_norm": 0.07875465351103227, "learning_rate": 4.3697174122009934e-05, "loss": 0.0038, "step": 2887 }, { "epoch": 2.1649175412293853, "grad_norm": 0.06776221559374107, "learning_rate": 4.362509529585489e-05, "loss": 0.003, "step": 2888 }, { "epoch": 2.1656671664167915, "grad_norm": 0.08488959892000786, "learning_rate": 4.355305937780102e-05, "loss": 0.0052, "step": 2889 }, { "epoch": 2.166416791604198, "grad_norm": 0.08370426819698125, "learning_rate": 4.348106642267633e-05, "loss": 0.0037, "step": 2890 }, { "epoch": 2.1671664167916043, "grad_norm": 0.07325818139028772, "learning_rate": 4.3409116485276134e-05, "loss": 0.0062, "step": 2891 }, { "epoch": 2.1679160419790104, "grad_norm": 0.12924771341879, "learning_rate": 4.333720962036304e-05, "loss": 0.0072, "step": 2892 }, { "epoch": 2.1686656671664166, "grad_norm": 0.12552100078424228, "learning_rate": 4.326534588266681e-05, "loss": 0.0102, "step": 2893 }, { "epoch": 2.1694152923538232, "grad_norm": 0.13763354141825093, "learning_rate": 4.3193525326884435e-05, "loss": 0.0066, "step": 2894 }, { "epoch": 2.1701649175412294, "grad_norm": 0.05131955834253436, "learning_rate": 4.312174800768001e-05, "loss": 0.002, "step": 2895 }, { "epoch": 2.1709145427286356, "grad_norm": 0.18141305745303182, "learning_rate": 4.305001397968474e-05, "loss": 0.0045, "step": 2896 }, { "epoch": 2.1716641679160418, "grad_norm": 0.09518754190642088, "learning_rate": 4.297832329749687e-05, "loss": 0.0078, "step": 2897 }, { "epoch": 2.1724137931034484, "grad_norm": 0.1094915327192126, "learning_rate": 4.2906676015681644e-05, "loss": 0.0062, "step": 2898 }, { "epoch": 2.1731634182908546, "grad_norm": 0.14326768073583626, "learning_rate": 4.283507218877128e-05, "loss": 0.0095, "step": 2899 }, { "epoch": 2.1739130434782608, "grad_norm": 0.12477399089755312, "learning_rate": 4.276351187126496e-05, "loss": 0.0089, "step": 2900 }, { "epoch": 2.1746626686656674, "grad_norm": 0.11015881276147756, "learning_rate": 4.269199511762868e-05, "loss": 0.0036, "step": 2901 }, { "epoch": 2.1754122938530736, "grad_norm": 0.14941368767564808, "learning_rate": 4.2620521982295334e-05, "loss": 0.0118, "step": 2902 }, { "epoch": 2.1761619190404797, "grad_norm": 0.09648693740069292, "learning_rate": 4.2549092519664546e-05, "loss": 0.0041, "step": 2903 }, { "epoch": 2.176911544227886, "grad_norm": 0.11454174987068287, "learning_rate": 4.247770678410284e-05, "loss": 0.0043, "step": 2904 }, { "epoch": 2.1776611694152925, "grad_norm": 0.25554248116246986, "learning_rate": 4.240636482994332e-05, "loss": 0.0111, "step": 2905 }, { "epoch": 2.1784107946026987, "grad_norm": 0.20926240916107086, "learning_rate": 4.233506671148583e-05, "loss": 0.0059, "step": 2906 }, { "epoch": 2.179160419790105, "grad_norm": 0.04413399814127862, "learning_rate": 4.2263812482996814e-05, "loss": 0.0028, "step": 2907 }, { "epoch": 2.179910044977511, "grad_norm": 0.09739179400173857, "learning_rate": 4.219260219870932e-05, "loss": 0.0095, "step": 2908 }, { "epoch": 2.1806596701649177, "grad_norm": 0.1956946447085361, "learning_rate": 4.212143591282303e-05, "loss": 0.0194, "step": 2909 }, { "epoch": 2.181409295352324, "grad_norm": 0.09475992658654644, "learning_rate": 4.2050313679504015e-05, "loss": 0.0054, "step": 2910 }, { "epoch": 2.18215892053973, "grad_norm": 0.03654156738299712, "learning_rate": 4.19792355528849e-05, "loss": 0.0025, "step": 2911 }, { "epoch": 2.1829085457271367, "grad_norm": 0.13210150734611542, "learning_rate": 4.190820158706469e-05, "loss": 0.0047, "step": 2912 }, { "epoch": 2.183658170914543, "grad_norm": 0.1602288514619366, "learning_rate": 4.1837211836108814e-05, "loss": 0.0104, "step": 2913 }, { "epoch": 2.184407796101949, "grad_norm": 0.10919867016817089, "learning_rate": 4.176626635404902e-05, "loss": 0.0285, "step": 2914 }, { "epoch": 2.185157421289355, "grad_norm": 0.09016451941471065, "learning_rate": 4.169536519488339e-05, "loss": 0.0097, "step": 2915 }, { "epoch": 2.185907046476762, "grad_norm": 0.20616458903558743, "learning_rate": 4.162450841257623e-05, "loss": 0.0049, "step": 2916 }, { "epoch": 2.186656671664168, "grad_norm": 0.12993953040956022, "learning_rate": 4.1553696061058144e-05, "loss": 0.0074, "step": 2917 }, { "epoch": 2.187406296851574, "grad_norm": 0.06387145155896923, "learning_rate": 4.1482928194225826e-05, "loss": 0.0034, "step": 2918 }, { "epoch": 2.1881559220389803, "grad_norm": 0.1412799170073803, "learning_rate": 4.141220486594217e-05, "loss": 0.0109, "step": 2919 }, { "epoch": 2.188905547226387, "grad_norm": 0.08496139202783914, "learning_rate": 4.134152613003618e-05, "loss": 0.0063, "step": 2920 }, { "epoch": 2.189655172413793, "grad_norm": 0.07256124080205822, "learning_rate": 4.127089204030287e-05, "loss": 0.0061, "step": 2921 }, { "epoch": 2.1904047976011993, "grad_norm": 0.05522989278346765, "learning_rate": 4.120030265050331e-05, "loss": 0.0037, "step": 2922 }, { "epoch": 2.1911544227886055, "grad_norm": 0.1584495856412796, "learning_rate": 4.112975801436454e-05, "loss": 0.0083, "step": 2923 }, { "epoch": 2.191904047976012, "grad_norm": 0.19621265325676193, "learning_rate": 4.105925818557953e-05, "loss": 0.0306, "step": 2924 }, { "epoch": 2.1926536731634183, "grad_norm": 0.1490287113856066, "learning_rate": 4.098880321780716e-05, "loss": 0.01, "step": 2925 }, { "epoch": 2.1934032983508245, "grad_norm": 0.16054903709881607, "learning_rate": 4.0918393164672155e-05, "loss": 0.0262, "step": 2926 }, { "epoch": 2.1941529235382307, "grad_norm": 0.06522683759255349, "learning_rate": 4.0848028079765014e-05, "loss": 0.0022, "step": 2927 }, { "epoch": 2.1949025487256373, "grad_norm": 0.07793807891903581, "learning_rate": 4.0777708016642133e-05, "loss": 0.0025, "step": 2928 }, { "epoch": 2.1956521739130435, "grad_norm": 0.0671421190639738, "learning_rate": 4.070743302882551e-05, "loss": 0.0047, "step": 2929 }, { "epoch": 2.1964017991004496, "grad_norm": 0.09863003282295409, "learning_rate": 4.063720316980292e-05, "loss": 0.0112, "step": 2930 }, { "epoch": 2.1971514242878563, "grad_norm": 0.033368571987164586, "learning_rate": 4.0567018493027686e-05, "loss": 0.0016, "step": 2931 }, { "epoch": 2.1979010494752624, "grad_norm": 0.11283284783196637, "learning_rate": 4.049687905191889e-05, "loss": 0.0034, "step": 2932 }, { "epoch": 2.1986506746626686, "grad_norm": 0.15030626032340028, "learning_rate": 4.042678489986107e-05, "loss": 0.0073, "step": 2933 }, { "epoch": 2.199400299850075, "grad_norm": 0.08288939135255141, "learning_rate": 4.035673609020434e-05, "loss": 0.0065, "step": 2934 }, { "epoch": 2.2001499250374814, "grad_norm": 0.06272651785014262, "learning_rate": 4.028673267626427e-05, "loss": 0.0038, "step": 2935 }, { "epoch": 2.2008995502248876, "grad_norm": 0.04537491332847573, "learning_rate": 4.0216774711321925e-05, "loss": 0.0032, "step": 2936 }, { "epoch": 2.2016491754122938, "grad_norm": 0.1259908642818158, "learning_rate": 4.014686224862374e-05, "loss": 0.0023, "step": 2937 }, { "epoch": 2.2023988005997, "grad_norm": 0.1471177203320294, "learning_rate": 4.007699534138153e-05, "loss": 0.0113, "step": 2938 }, { "epoch": 2.2031484257871066, "grad_norm": 0.10461123602031132, "learning_rate": 4.000717404277244e-05, "loss": 0.0041, "step": 2939 }, { "epoch": 2.2038980509745127, "grad_norm": 0.0730772652802554, "learning_rate": 3.9937398405938896e-05, "loss": 0.002, "step": 2940 }, { "epoch": 2.204647676161919, "grad_norm": 0.053078479449533464, "learning_rate": 3.9867668483988585e-05, "loss": 0.0022, "step": 2941 }, { "epoch": 2.2053973013493255, "grad_norm": 0.0834540587187214, "learning_rate": 3.9797984329994363e-05, "loss": 0.0062, "step": 2942 }, { "epoch": 2.2061469265367317, "grad_norm": 0.07044363379314998, "learning_rate": 3.9728345996994296e-05, "loss": 0.0052, "step": 2943 }, { "epoch": 2.206896551724138, "grad_norm": 0.12451614573261646, "learning_rate": 3.9658753537991545e-05, "loss": 0.0104, "step": 2944 }, { "epoch": 2.207646176911544, "grad_norm": 0.06448779690377877, "learning_rate": 3.958920700595435e-05, "loss": 0.004, "step": 2945 }, { "epoch": 2.2083958020989507, "grad_norm": 0.03599955797498942, "learning_rate": 3.951970645381604e-05, "loss": 0.0025, "step": 2946 }, { "epoch": 2.209145427286357, "grad_norm": 0.06325898149816306, "learning_rate": 3.94502519344749e-05, "loss": 0.0051, "step": 2947 }, { "epoch": 2.209895052473763, "grad_norm": 0.14544048825696881, "learning_rate": 3.9380843500794195e-05, "loss": 0.0065, "step": 2948 }, { "epoch": 2.2106446776611692, "grad_norm": 0.06951395805907995, "learning_rate": 3.931148120560211e-05, "loss": 0.0043, "step": 2949 }, { "epoch": 2.211394302848576, "grad_norm": 0.04920109718547303, "learning_rate": 3.9242165101691685e-05, "loss": 0.0014, "step": 2950 }, { "epoch": 2.212143928035982, "grad_norm": 0.19249354074046146, "learning_rate": 3.9172895241820905e-05, "loss": 0.0105, "step": 2951 }, { "epoch": 2.212893553223388, "grad_norm": 0.06689161024484175, "learning_rate": 3.910367167871246e-05, "loss": 0.0061, "step": 2952 }, { "epoch": 2.213643178410795, "grad_norm": 0.1350737337552095, "learning_rate": 3.9034494465053815e-05, "loss": 0.0057, "step": 2953 }, { "epoch": 2.214392803598201, "grad_norm": 0.24292345106490407, "learning_rate": 3.8965363653497144e-05, "loss": 0.0042, "step": 2954 }, { "epoch": 2.215142428785607, "grad_norm": 0.026553507259357522, "learning_rate": 3.889627929665941e-05, "loss": 0.0007, "step": 2955 }, { "epoch": 2.2158920539730134, "grad_norm": 0.07556591664485746, "learning_rate": 3.882724144712209e-05, "loss": 0.0052, "step": 2956 }, { "epoch": 2.21664167916042, "grad_norm": 0.12006927039685981, "learning_rate": 3.875825015743132e-05, "loss": 0.0069, "step": 2957 }, { "epoch": 2.217391304347826, "grad_norm": 0.04691662091207386, "learning_rate": 3.868930548009779e-05, "loss": 0.0026, "step": 2958 }, { "epoch": 2.2181409295352323, "grad_norm": 0.13750868464230268, "learning_rate": 3.862040746759672e-05, "loss": 0.0042, "step": 2959 }, { "epoch": 2.2188905547226385, "grad_norm": 0.09332905233243183, "learning_rate": 3.855155617236781e-05, "loss": 0.0051, "step": 2960 }, { "epoch": 2.219640179910045, "grad_norm": 0.0354143689169886, "learning_rate": 3.8482751646815185e-05, "loss": 0.0021, "step": 2961 }, { "epoch": 2.2203898050974513, "grad_norm": 0.06158795473163416, "learning_rate": 3.841399394330739e-05, "loss": 0.0039, "step": 2962 }, { "epoch": 2.2211394302848575, "grad_norm": 0.1746590159335278, "learning_rate": 3.834528311417733e-05, "loss": 0.015, "step": 2963 }, { "epoch": 2.2218890554722637, "grad_norm": 0.1431609012747353, "learning_rate": 3.8276619211722245e-05, "loss": 0.0049, "step": 2964 }, { "epoch": 2.2226386806596703, "grad_norm": 0.04710948736106486, "learning_rate": 3.820800228820363e-05, "loss": 0.0033, "step": 2965 }, { "epoch": 2.2233883058470765, "grad_norm": 0.10769067240238436, "learning_rate": 3.813943239584723e-05, "loss": 0.0086, "step": 2966 }, { "epoch": 2.2241379310344827, "grad_norm": 0.1112212265090816, "learning_rate": 3.8070909586843026e-05, "loss": 0.0024, "step": 2967 }, { "epoch": 2.224887556221889, "grad_norm": 0.06666088121431606, "learning_rate": 3.8002433913345135e-05, "loss": 0.0042, "step": 2968 }, { "epoch": 2.2256371814092955, "grad_norm": 0.14479957975949176, "learning_rate": 3.79340054274718e-05, "loss": 0.006, "step": 2969 }, { "epoch": 2.2263868065967016, "grad_norm": 0.09796350711148064, "learning_rate": 3.786562418130535e-05, "loss": 0.0058, "step": 2970 }, { "epoch": 2.227136431784108, "grad_norm": 0.10417840290808125, "learning_rate": 3.7797290226892145e-05, "loss": 0.0046, "step": 2971 }, { "epoch": 2.2278860569715144, "grad_norm": 0.14637983584801909, "learning_rate": 3.7729003616242584e-05, "loss": 0.0069, "step": 2972 }, { "epoch": 2.2286356821589206, "grad_norm": 0.0596059585180575, "learning_rate": 3.766076440133097e-05, "loss": 0.0046, "step": 2973 }, { "epoch": 2.229385307346327, "grad_norm": 0.1298320643372351, "learning_rate": 3.7592572634095654e-05, "loss": 0.012, "step": 2974 }, { "epoch": 2.230134932533733, "grad_norm": 0.06279833117448319, "learning_rate": 3.752442836643876e-05, "loss": 0.0025, "step": 2975 }, { "epoch": 2.2308845577211396, "grad_norm": 0.053545212737444454, "learning_rate": 3.745633165022627e-05, "loss": 0.0041, "step": 2976 }, { "epoch": 2.2316341829085458, "grad_norm": 0.03920380410713475, "learning_rate": 3.738828253728799e-05, "loss": 0.0039, "step": 2977 }, { "epoch": 2.232383808095952, "grad_norm": 0.12025123048502838, "learning_rate": 3.732028107941755e-05, "loss": 0.0105, "step": 2978 }, { "epoch": 2.233133433283358, "grad_norm": 0.07880294740994104, "learning_rate": 3.725232732837225e-05, "loss": 0.0041, "step": 2979 }, { "epoch": 2.2338830584707647, "grad_norm": 0.10338096755245214, "learning_rate": 3.7184421335873067e-05, "loss": 0.0069, "step": 2980 }, { "epoch": 2.234632683658171, "grad_norm": 0.11514938715289685, "learning_rate": 3.7116563153604656e-05, "loss": 0.006, "step": 2981 }, { "epoch": 2.235382308845577, "grad_norm": 0.17877610807644287, "learning_rate": 3.70487528332153e-05, "loss": 0.0079, "step": 2982 }, { "epoch": 2.2361319340329837, "grad_norm": 0.06067208500268504, "learning_rate": 3.698099042631679e-05, "loss": 0.0018, "step": 2983 }, { "epoch": 2.23688155922039, "grad_norm": 0.3582779971925068, "learning_rate": 3.691327598448451e-05, "loss": 0.018, "step": 2984 }, { "epoch": 2.237631184407796, "grad_norm": 0.08483678159039827, "learning_rate": 3.684560955925732e-05, "loss": 0.0035, "step": 2985 }, { "epoch": 2.2383808095952022, "grad_norm": 0.13213318981736838, "learning_rate": 3.677799120213754e-05, "loss": 0.006, "step": 2986 }, { "epoch": 2.239130434782609, "grad_norm": 0.205558251722483, "learning_rate": 3.6710420964590876e-05, "loss": 0.0137, "step": 2987 }, { "epoch": 2.239880059970015, "grad_norm": 0.06881382245027329, "learning_rate": 3.664289889804643e-05, "loss": 0.0075, "step": 2988 }, { "epoch": 2.2406296851574212, "grad_norm": 0.04868954415719796, "learning_rate": 3.6575425053896636e-05, "loss": 0.0029, "step": 2989 }, { "epoch": 2.2413793103448274, "grad_norm": 0.10665721197763921, "learning_rate": 3.6507999483497235e-05, "loss": 0.0097, "step": 2990 }, { "epoch": 2.242128935532234, "grad_norm": 0.06542037858851574, "learning_rate": 3.644062223816721e-05, "loss": 0.003, "step": 2991 }, { "epoch": 2.24287856071964, "grad_norm": 0.07231973807330334, "learning_rate": 3.637329336918878e-05, "loss": 0.0036, "step": 2992 }, { "epoch": 2.2436281859070464, "grad_norm": 0.16776365912815602, "learning_rate": 3.630601292780733e-05, "loss": 0.0084, "step": 2993 }, { "epoch": 2.244377811094453, "grad_norm": 0.23586581493040168, "learning_rate": 3.6238780965231375e-05, "loss": 0.0069, "step": 2994 }, { "epoch": 2.245127436281859, "grad_norm": 0.1033982312441333, "learning_rate": 3.6171597532632575e-05, "loss": 0.0126, "step": 2995 }, { "epoch": 2.2458770614692654, "grad_norm": 0.171232132167371, "learning_rate": 3.61044626811456e-05, "loss": 0.0102, "step": 2996 }, { "epoch": 2.2466266866566715, "grad_norm": 0.09715820745250381, "learning_rate": 3.603737646186814e-05, "loss": 0.0035, "step": 2997 }, { "epoch": 2.247376311844078, "grad_norm": 0.10968137200157604, "learning_rate": 3.597033892586098e-05, "loss": 0.003, "step": 2998 }, { "epoch": 2.2481259370314843, "grad_norm": 0.05217799438191155, "learning_rate": 3.590335012414772e-05, "loss": 0.0059, "step": 2999 }, { "epoch": 2.2488755622188905, "grad_norm": 0.05733050677905598, "learning_rate": 3.5836410107714925e-05, "loss": 0.0032, "step": 3000 }, { "epoch": 2.2496251874062967, "grad_norm": 0.1021056862324412, "learning_rate": 3.5769518927511966e-05, "loss": 0.0032, "step": 3001 }, { "epoch": 2.2503748125937033, "grad_norm": 0.07310938180823594, "learning_rate": 3.5702676634451205e-05, "loss": 0.0053, "step": 3002 }, { "epoch": 2.2511244377811095, "grad_norm": 0.0818866352771629, "learning_rate": 3.563588327940762e-05, "loss": 0.0055, "step": 3003 }, { "epoch": 2.2518740629685157, "grad_norm": 0.01913471780865569, "learning_rate": 3.5569138913219026e-05, "loss": 0.0008, "step": 3004 }, { "epoch": 2.2526236881559223, "grad_norm": 0.055861302921822106, "learning_rate": 3.550244358668592e-05, "loss": 0.0028, "step": 3005 }, { "epoch": 2.2533733133433285, "grad_norm": 0.08024208404821104, "learning_rate": 3.543579735057151e-05, "loss": 0.0018, "step": 3006 }, { "epoch": 2.2541229385307346, "grad_norm": 0.13793260963967538, "learning_rate": 3.53692002556016e-05, "loss": 0.0049, "step": 3007 }, { "epoch": 2.254872563718141, "grad_norm": 0.12180580380490126, "learning_rate": 3.5302652352464625e-05, "loss": 0.0092, "step": 3008 }, { "epoch": 2.255622188905547, "grad_norm": 0.24832447046922468, "learning_rate": 3.5236153691811556e-05, "loss": 0.011, "step": 3009 }, { "epoch": 2.2563718140929536, "grad_norm": 0.10710552637315611, "learning_rate": 3.5169704324255895e-05, "loss": 0.0041, "step": 3010 }, { "epoch": 2.25712143928036, "grad_norm": 0.023998445286078023, "learning_rate": 3.510330430037363e-05, "loss": 0.0005, "step": 3011 }, { "epoch": 2.257871064467766, "grad_norm": 0.08477613453589629, "learning_rate": 3.503695367070318e-05, "loss": 0.0036, "step": 3012 }, { "epoch": 2.2586206896551726, "grad_norm": 0.11309357409035045, "learning_rate": 3.497065248574537e-05, "loss": 0.0074, "step": 3013 }, { "epoch": 2.2593703148425788, "grad_norm": 0.2413432662111237, "learning_rate": 3.490440079596341e-05, "loss": 0.011, "step": 3014 }, { "epoch": 2.260119940029985, "grad_norm": 0.15438899388396532, "learning_rate": 3.483819865178283e-05, "loss": 0.0051, "step": 3015 }, { "epoch": 2.260869565217391, "grad_norm": 0.11248704273274097, "learning_rate": 3.477204610359143e-05, "loss": 0.0097, "step": 3016 }, { "epoch": 2.2616191904047978, "grad_norm": 0.10270473751697841, "learning_rate": 3.4705943201739274e-05, "loss": 0.0037, "step": 3017 }, { "epoch": 2.262368815592204, "grad_norm": 0.06663120310158174, "learning_rate": 3.4639889996538664e-05, "loss": 0.0044, "step": 3018 }, { "epoch": 2.26311844077961, "grad_norm": 0.04856854864541843, "learning_rate": 3.457388653826405e-05, "loss": 0.0022, "step": 3019 }, { "epoch": 2.2638680659670163, "grad_norm": 0.048466365705767143, "learning_rate": 3.450793287715196e-05, "loss": 0.0048, "step": 3020 }, { "epoch": 2.264617691154423, "grad_norm": 0.09503234848236053, "learning_rate": 3.4442029063401195e-05, "loss": 0.0073, "step": 3021 }, { "epoch": 2.265367316341829, "grad_norm": 0.16817200927825984, "learning_rate": 3.437617514717244e-05, "loss": 0.015, "step": 3022 }, { "epoch": 2.2661169415292353, "grad_norm": 0.02779505203286547, "learning_rate": 3.431037117858848e-05, "loss": 0.0013, "step": 3023 }, { "epoch": 2.266866566716642, "grad_norm": 0.10431680116886312, "learning_rate": 3.424461720773403e-05, "loss": 0.0037, "step": 3024 }, { "epoch": 2.267616191904048, "grad_norm": 0.15647341956223487, "learning_rate": 3.4178913284655865e-05, "loss": 0.0132, "step": 3025 }, { "epoch": 2.2683658170914542, "grad_norm": 0.4506494810308986, "learning_rate": 3.411325945936257e-05, "loss": 0.0342, "step": 3026 }, { "epoch": 2.2691154422788604, "grad_norm": 0.05291685737207425, "learning_rate": 3.40476557818246e-05, "loss": 0.0039, "step": 3027 }, { "epoch": 2.269865067466267, "grad_norm": 0.0529791034376625, "learning_rate": 3.398210230197429e-05, "loss": 0.0066, "step": 3028 }, { "epoch": 2.270614692653673, "grad_norm": 0.266638755312817, "learning_rate": 3.391659906970573e-05, "loss": 0.0235, "step": 3029 }, { "epoch": 2.2713643178410794, "grad_norm": 0.09890484403926711, "learning_rate": 3.3851146134874775e-05, "loss": 0.0066, "step": 3030 }, { "epoch": 2.2721139430284856, "grad_norm": 0.0848537069501797, "learning_rate": 3.378574354729901e-05, "loss": 0.0088, "step": 3031 }, { "epoch": 2.272863568215892, "grad_norm": 0.05974783788622155, "learning_rate": 3.37203913567577e-05, "loss": 0.003, "step": 3032 }, { "epoch": 2.2736131934032984, "grad_norm": 0.24716829048847536, "learning_rate": 3.365508961299172e-05, "loss": 0.0097, "step": 3033 }, { "epoch": 2.2743628185907045, "grad_norm": 0.148063506124021, "learning_rate": 3.3589838365703594e-05, "loss": 0.0087, "step": 3034 }, { "epoch": 2.275112443778111, "grad_norm": 0.08608244559058832, "learning_rate": 3.352463766455738e-05, "loss": 0.0153, "step": 3035 }, { "epoch": 2.2758620689655173, "grad_norm": 0.08896344091548676, "learning_rate": 3.345948755917867e-05, "loss": 0.0033, "step": 3036 }, { "epoch": 2.2766116941529235, "grad_norm": 0.1066313799481121, "learning_rate": 3.339438809915456e-05, "loss": 0.0053, "step": 3037 }, { "epoch": 2.2773613193403297, "grad_norm": 0.06683865574093746, "learning_rate": 3.3329339334033595e-05, "loss": 0.0029, "step": 3038 }, { "epoch": 2.2781109445277363, "grad_norm": 0.08087808202357481, "learning_rate": 3.326434131332571e-05, "loss": 0.0034, "step": 3039 }, { "epoch": 2.2788605697151425, "grad_norm": 0.01588263533994879, "learning_rate": 3.319939408650225e-05, "loss": 0.0011, "step": 3040 }, { "epoch": 2.2796101949025487, "grad_norm": 0.12522455341806843, "learning_rate": 3.31344977029959e-05, "loss": 0.004, "step": 3041 }, { "epoch": 2.280359820089955, "grad_norm": 0.1123280419849201, "learning_rate": 3.306965221220062e-05, "loss": 0.0074, "step": 3042 }, { "epoch": 2.2811094452773615, "grad_norm": 0.2228488028010439, "learning_rate": 3.300485766347161e-05, "loss": 0.0068, "step": 3043 }, { "epoch": 2.2818590704647677, "grad_norm": 0.20103985308072542, "learning_rate": 3.294011410612541e-05, "loss": 0.0126, "step": 3044 }, { "epoch": 2.282608695652174, "grad_norm": 0.13115501711622973, "learning_rate": 3.2875421589439634e-05, "loss": 0.0042, "step": 3045 }, { "epoch": 2.2833583208395805, "grad_norm": 0.07938313674178223, "learning_rate": 3.281078016265311e-05, "loss": 0.0058, "step": 3046 }, { "epoch": 2.2841079460269866, "grad_norm": 0.22928488253082546, "learning_rate": 3.274618987496574e-05, "loss": 0.0142, "step": 3047 }, { "epoch": 2.284857571214393, "grad_norm": 0.040375034631480115, "learning_rate": 3.268165077553849e-05, "loss": 0.0022, "step": 3048 }, { "epoch": 2.285607196401799, "grad_norm": 0.07016817070990827, "learning_rate": 3.261716291349347e-05, "loss": 0.0032, "step": 3049 }, { "epoch": 2.286356821589205, "grad_norm": 0.055459174216538384, "learning_rate": 3.2552726337913686e-05, "loss": 0.003, "step": 3050 }, { "epoch": 2.287106446776612, "grad_norm": 0.2100936606920281, "learning_rate": 3.248834109784313e-05, "loss": 0.007, "step": 3051 }, { "epoch": 2.287856071964018, "grad_norm": 0.23237588883682203, "learning_rate": 3.242400724228677e-05, "loss": 0.027, "step": 3052 }, { "epoch": 2.288605697151424, "grad_norm": 0.09144615215224357, "learning_rate": 3.235972482021039e-05, "loss": 0.006, "step": 3053 }, { "epoch": 2.2893553223388308, "grad_norm": 0.08436516257363523, "learning_rate": 3.2295493880540687e-05, "loss": 0.0049, "step": 3054 }, { "epoch": 2.290104947526237, "grad_norm": 0.09340848215971319, "learning_rate": 3.223131447216514e-05, "loss": 0.0018, "step": 3055 }, { "epoch": 2.290854572713643, "grad_norm": 0.14138316361171396, "learning_rate": 3.2167186643932025e-05, "loss": 0.0101, "step": 3056 }, { "epoch": 2.2916041979010493, "grad_norm": 0.0780038451097566, "learning_rate": 3.2103110444650354e-05, "loss": 0.0041, "step": 3057 }, { "epoch": 2.292353823088456, "grad_norm": 0.13771768007587726, "learning_rate": 3.203908592308984e-05, "loss": 0.0052, "step": 3058 }, { "epoch": 2.293103448275862, "grad_norm": 0.11314310653938886, "learning_rate": 3.197511312798087e-05, "loss": 0.0031, "step": 3059 }, { "epoch": 2.2938530734632683, "grad_norm": 0.06740799151396357, "learning_rate": 3.191119210801444e-05, "loss": 0.0019, "step": 3060 }, { "epoch": 2.2946026986506745, "grad_norm": 0.08458018097254301, "learning_rate": 3.184732291184218e-05, "loss": 0.0044, "step": 3061 }, { "epoch": 2.295352323838081, "grad_norm": 0.08296125916158942, "learning_rate": 3.178350558807624e-05, "loss": 0.0054, "step": 3062 }, { "epoch": 2.2961019490254873, "grad_norm": 0.08461209266922365, "learning_rate": 3.171974018528928e-05, "loss": 0.0044, "step": 3063 }, { "epoch": 2.2968515742128934, "grad_norm": 0.31138688440724316, "learning_rate": 3.1656026752014486e-05, "loss": 0.0161, "step": 3064 }, { "epoch": 2.2976011994003, "grad_norm": 0.1700687652529863, "learning_rate": 3.159236533674547e-05, "loss": 0.0112, "step": 3065 }, { "epoch": 2.2983508245877062, "grad_norm": 0.12293576424028, "learning_rate": 3.1528755987936186e-05, "loss": 0.0078, "step": 3066 }, { "epoch": 2.2991004497751124, "grad_norm": 0.25224580670874097, "learning_rate": 3.14651987540011e-05, "loss": 0.0231, "step": 3067 }, { "epoch": 2.2998500749625186, "grad_norm": 0.12752668587526034, "learning_rate": 3.1401693683314904e-05, "loss": 0.0089, "step": 3068 }, { "epoch": 2.300599700149925, "grad_norm": 0.07308291934037676, "learning_rate": 3.1338240824212606e-05, "loss": 0.004, "step": 3069 }, { "epoch": 2.3013493253373314, "grad_norm": 0.1442663761239963, "learning_rate": 3.1274840224989485e-05, "loss": 0.008, "step": 3070 }, { "epoch": 2.3020989505247376, "grad_norm": 0.097064895836265, "learning_rate": 3.121149193390106e-05, "loss": 0.0031, "step": 3071 }, { "epoch": 2.3028485757121437, "grad_norm": 0.10366709486171628, "learning_rate": 3.1148195999162966e-05, "loss": 0.0038, "step": 3072 }, { "epoch": 2.3035982008995504, "grad_norm": 0.10820343477738625, "learning_rate": 3.108495246895111e-05, "loss": 0.0099, "step": 3073 }, { "epoch": 2.3043478260869565, "grad_norm": 0.052683725617420046, "learning_rate": 3.1021761391401424e-05, "loss": 0.0022, "step": 3074 }, { "epoch": 2.3050974512743627, "grad_norm": 0.04856462072758565, "learning_rate": 3.095862281460994e-05, "loss": 0.004, "step": 3075 }, { "epoch": 2.3058470764617693, "grad_norm": 0.10917084487952686, "learning_rate": 3.0895536786632716e-05, "loss": 0.0053, "step": 3076 }, { "epoch": 2.3065967016491755, "grad_norm": 0.09271317437410774, "learning_rate": 3.083250335548583e-05, "loss": 0.0063, "step": 3077 }, { "epoch": 2.3073463268365817, "grad_norm": 0.1014603016537938, "learning_rate": 3.0769522569145335e-05, "loss": 0.0051, "step": 3078 }, { "epoch": 2.308095952023988, "grad_norm": 0.182073711613753, "learning_rate": 3.070659447554719e-05, "loss": 0.0134, "step": 3079 }, { "epoch": 2.3088455772113945, "grad_norm": 0.051493830991612595, "learning_rate": 3.0643719122587275e-05, "loss": 0.0031, "step": 3080 }, { "epoch": 2.3095952023988007, "grad_norm": 0.06813819702008223, "learning_rate": 3.058089655812132e-05, "loss": 0.0022, "step": 3081 }, { "epoch": 2.310344827586207, "grad_norm": 0.1497102741665549, "learning_rate": 3.051812682996487e-05, "loss": 0.0113, "step": 3082 }, { "epoch": 2.311094452773613, "grad_norm": 0.13331343034849766, "learning_rate": 3.0455409985893236e-05, "loss": 0.0074, "step": 3083 }, { "epoch": 2.3118440779610197, "grad_norm": 0.11990281948287221, "learning_rate": 3.0392746073641533e-05, "loss": 0.0034, "step": 3084 }, { "epoch": 2.312593703148426, "grad_norm": 0.24197839414958924, "learning_rate": 3.0330135140904526e-05, "loss": 0.0166, "step": 3085 }, { "epoch": 2.313343328335832, "grad_norm": 0.054404535737172195, "learning_rate": 3.026757723533671e-05, "loss": 0.0033, "step": 3086 }, { "epoch": 2.3140929535232386, "grad_norm": 0.08049491433420755, "learning_rate": 3.0205072404552182e-05, "loss": 0.0032, "step": 3087 }, { "epoch": 2.314842578710645, "grad_norm": 0.16292162964705675, "learning_rate": 3.0142620696124656e-05, "loss": 0.0055, "step": 3088 }, { "epoch": 2.315592203898051, "grad_norm": 0.07344569034686321, "learning_rate": 3.0080222157587424e-05, "loss": 0.0023, "step": 3089 }, { "epoch": 2.316341829085457, "grad_norm": 0.1092571190290509, "learning_rate": 3.0017876836433245e-05, "loss": 0.0025, "step": 3090 }, { "epoch": 2.3170914542728633, "grad_norm": 0.04795213189239423, "learning_rate": 2.995558478011452e-05, "loss": 0.0014, "step": 3091 }, { "epoch": 2.31784107946027, "grad_norm": 0.0501643876826235, "learning_rate": 2.989334603604297e-05, "loss": 0.0025, "step": 3092 }, { "epoch": 2.318590704647676, "grad_norm": 0.04523174534856307, "learning_rate": 2.9831160651589775e-05, "loss": 0.0023, "step": 3093 }, { "epoch": 2.3193403298350823, "grad_norm": 0.02758127641822319, "learning_rate": 2.9769028674085532e-05, "loss": 0.0009, "step": 3094 }, { "epoch": 2.320089955022489, "grad_norm": 0.1421280114887876, "learning_rate": 2.9706950150820144e-05, "loss": 0.0074, "step": 3095 }, { "epoch": 2.320839580209895, "grad_norm": 0.1258121786875046, "learning_rate": 2.9644925129042857e-05, "loss": 0.0078, "step": 3096 }, { "epoch": 2.3215892053973013, "grad_norm": 0.09869709212905185, "learning_rate": 2.958295365596222e-05, "loss": 0.0063, "step": 3097 }, { "epoch": 2.3223388305847075, "grad_norm": 0.044815860770597356, "learning_rate": 2.9521035778745988e-05, "loss": 0.0039, "step": 3098 }, { "epoch": 2.323088455772114, "grad_norm": 0.07516192279941192, "learning_rate": 2.9459171544521125e-05, "loss": 0.0058, "step": 3099 }, { "epoch": 2.3238380809595203, "grad_norm": 0.11244585004434644, "learning_rate": 2.939736100037378e-05, "loss": 0.0101, "step": 3100 }, { "epoch": 2.3245877061469264, "grad_norm": 0.18701938835082377, "learning_rate": 2.933560419334922e-05, "loss": 0.0122, "step": 3101 }, { "epoch": 2.3253373313343326, "grad_norm": 0.10305684565093204, "learning_rate": 2.9273901170451835e-05, "loss": 0.0024, "step": 3102 }, { "epoch": 2.3260869565217392, "grad_norm": 0.0993002558158889, "learning_rate": 2.9212251978645056e-05, "loss": 0.0043, "step": 3103 }, { "epoch": 2.3268365817091454, "grad_norm": 0.10134589099502897, "learning_rate": 2.915065666485136e-05, "loss": 0.0046, "step": 3104 }, { "epoch": 2.3275862068965516, "grad_norm": 0.09878875420161998, "learning_rate": 2.9089115275952218e-05, "loss": 0.002, "step": 3105 }, { "epoch": 2.3283358320839582, "grad_norm": 0.11019541110942332, "learning_rate": 2.902762785878802e-05, "loss": 0.0057, "step": 3106 }, { "epoch": 2.3290854572713644, "grad_norm": 0.11157700778282026, "learning_rate": 2.8966194460158137e-05, "loss": 0.004, "step": 3107 }, { "epoch": 2.3298350824587706, "grad_norm": 0.40933062048626645, "learning_rate": 2.8904815126820774e-05, "loss": 0.0129, "step": 3108 }, { "epoch": 2.3305847076461768, "grad_norm": 0.07305006067035855, "learning_rate": 2.8843489905493016e-05, "loss": 0.0052, "step": 3109 }, { "epoch": 2.3313343328335834, "grad_norm": 0.0680123486769411, "learning_rate": 2.878221884285074e-05, "loss": 0.0079, "step": 3110 }, { "epoch": 2.3320839580209896, "grad_norm": 0.06019169149018181, "learning_rate": 2.872100198552863e-05, "loss": 0.0034, "step": 3111 }, { "epoch": 2.3328335832083957, "grad_norm": 0.056783640717217754, "learning_rate": 2.8659839380120092e-05, "loss": 0.0032, "step": 3112 }, { "epoch": 2.333583208395802, "grad_norm": 0.11551220701800663, "learning_rate": 2.8598731073177198e-05, "loss": 0.0086, "step": 3113 }, { "epoch": 2.3343328335832085, "grad_norm": 0.06999471091101042, "learning_rate": 2.8537677111210825e-05, "loss": 0.0036, "step": 3114 }, { "epoch": 2.3350824587706147, "grad_norm": 0.04765342441440993, "learning_rate": 2.8476677540690355e-05, "loss": 0.0015, "step": 3115 }, { "epoch": 2.335832083958021, "grad_norm": 0.05319190193486766, "learning_rate": 2.841573240804383e-05, "loss": 0.0017, "step": 3116 }, { "epoch": 2.3365817091454275, "grad_norm": 0.1813902628820559, "learning_rate": 2.8354841759657823e-05, "loss": 0.0181, "step": 3117 }, { "epoch": 2.3373313343328337, "grad_norm": 0.12581938917539412, "learning_rate": 2.8294005641877486e-05, "loss": 0.0019, "step": 3118 }, { "epoch": 2.33808095952024, "grad_norm": 0.14448099642639844, "learning_rate": 2.8233224101006428e-05, "loss": 0.0129, "step": 3119 }, { "epoch": 2.338830584707646, "grad_norm": 0.13954587924358886, "learning_rate": 2.8172497183306712e-05, "loss": 0.0058, "step": 3120 }, { "epoch": 2.3395802098950527, "grad_norm": 0.14411637591598964, "learning_rate": 2.8111824934998887e-05, "loss": 0.0043, "step": 3121 }, { "epoch": 2.340329835082459, "grad_norm": 0.06198186086028406, "learning_rate": 2.8051207402261838e-05, "loss": 0.007, "step": 3122 }, { "epoch": 2.341079460269865, "grad_norm": 0.14413877090723234, "learning_rate": 2.799064463123281e-05, "loss": 0.008, "step": 3123 }, { "epoch": 2.341829085457271, "grad_norm": 0.09347129089982951, "learning_rate": 2.7930136668007378e-05, "loss": 0.0043, "step": 3124 }, { "epoch": 2.342578710644678, "grad_norm": 0.13541759433461203, "learning_rate": 2.7869683558639393e-05, "loss": 0.0061, "step": 3125 }, { "epoch": 2.343328335832084, "grad_norm": 0.19373063801056187, "learning_rate": 2.7809285349140978e-05, "loss": 0.02, "step": 3126 }, { "epoch": 2.34407796101949, "grad_norm": 0.0766996044013267, "learning_rate": 2.7748942085482432e-05, "loss": 0.0045, "step": 3127 }, { "epoch": 2.344827586206897, "grad_norm": 0.21287550164640445, "learning_rate": 2.768865381359228e-05, "loss": 0.0146, "step": 3128 }, { "epoch": 2.345577211394303, "grad_norm": 0.1467023997059412, "learning_rate": 2.762842057935714e-05, "loss": 0.011, "step": 3129 }, { "epoch": 2.346326836581709, "grad_norm": 0.07213302935373363, "learning_rate": 2.7568242428621795e-05, "loss": 0.0076, "step": 3130 }, { "epoch": 2.3470764617691153, "grad_norm": 0.11244197680293319, "learning_rate": 2.750811940718906e-05, "loss": 0.0078, "step": 3131 }, { "epoch": 2.3478260869565215, "grad_norm": 0.13515852196537875, "learning_rate": 2.7448051560819798e-05, "loss": 0.0113, "step": 3132 }, { "epoch": 2.348575712143928, "grad_norm": 0.08773623719969813, "learning_rate": 2.7388038935232906e-05, "loss": 0.005, "step": 3133 }, { "epoch": 2.3493253373313343, "grad_norm": 0.12461713634230395, "learning_rate": 2.7328081576105212e-05, "loss": 0.0061, "step": 3134 }, { "epoch": 2.3500749625187405, "grad_norm": 0.5424887360890198, "learning_rate": 2.7268179529071504e-05, "loss": 0.0212, "step": 3135 }, { "epoch": 2.350824587706147, "grad_norm": 0.0903500759016297, "learning_rate": 2.7208332839724428e-05, "loss": 0.0042, "step": 3136 }, { "epoch": 2.3515742128935533, "grad_norm": 0.18532535793380175, "learning_rate": 2.7148541553614593e-05, "loss": 0.008, "step": 3137 }, { "epoch": 2.3523238380809595, "grad_norm": 0.07390377641173043, "learning_rate": 2.7088805716250353e-05, "loss": 0.0047, "step": 3138 }, { "epoch": 2.3530734632683656, "grad_norm": 0.14290629394801793, "learning_rate": 2.7029125373097885e-05, "loss": 0.0034, "step": 3139 }, { "epoch": 2.3538230884557723, "grad_norm": 0.154780779006307, "learning_rate": 2.696950056958112e-05, "loss": 0.013, "step": 3140 }, { "epoch": 2.3545727136431784, "grad_norm": 0.06244049624897275, "learning_rate": 2.690993135108173e-05, "loss": 0.0102, "step": 3141 }, { "epoch": 2.3553223388305846, "grad_norm": 0.06907429271018437, "learning_rate": 2.6850417762939085e-05, "loss": 0.0022, "step": 3142 }, { "epoch": 2.356071964017991, "grad_norm": 0.04446260531409627, "learning_rate": 2.6790959850450182e-05, "loss": 0.0026, "step": 3143 }, { "epoch": 2.3568215892053974, "grad_norm": 0.07036016666397808, "learning_rate": 2.6731557658869666e-05, "loss": 0.0051, "step": 3144 }, { "epoch": 2.3575712143928036, "grad_norm": 0.06282767585097133, "learning_rate": 2.667221123340975e-05, "loss": 0.0027, "step": 3145 }, { "epoch": 2.3583208395802098, "grad_norm": 0.043149230951043435, "learning_rate": 2.6612920619240267e-05, "loss": 0.0006, "step": 3146 }, { "epoch": 2.3590704647676164, "grad_norm": 0.07576831760682239, "learning_rate": 2.655368586148852e-05, "loss": 0.0024, "step": 3147 }, { "epoch": 2.3598200899550226, "grad_norm": 0.08108836245646409, "learning_rate": 2.6494507005239266e-05, "loss": 0.0062, "step": 3148 }, { "epoch": 2.3605697151424287, "grad_norm": 0.11713349844058288, "learning_rate": 2.6435384095534787e-05, "loss": 0.0057, "step": 3149 }, { "epoch": 2.361319340329835, "grad_norm": 0.08772970147787831, "learning_rate": 2.6376317177374733e-05, "loss": 0.008, "step": 3150 }, { "epoch": 2.3620689655172415, "grad_norm": 0.14803691236224617, "learning_rate": 2.631730629571614e-05, "loss": 0.0069, "step": 3151 }, { "epoch": 2.3628185907046477, "grad_norm": 0.1144611596997879, "learning_rate": 2.6258351495473422e-05, "loss": 0.0083, "step": 3152 }, { "epoch": 2.363568215892054, "grad_norm": 0.06841431105601573, "learning_rate": 2.6199452821518276e-05, "loss": 0.0053, "step": 3153 }, { "epoch": 2.36431784107946, "grad_norm": 0.16048168050678102, "learning_rate": 2.6140610318679713e-05, "loss": 0.0079, "step": 3154 }, { "epoch": 2.3650674662668667, "grad_norm": 0.1513435464155828, "learning_rate": 2.6081824031743973e-05, "loss": 0.0072, "step": 3155 }, { "epoch": 2.365817091454273, "grad_norm": 0.20449265633671435, "learning_rate": 2.6023094005454508e-05, "loss": 0.0075, "step": 3156 }, { "epoch": 2.366566716641679, "grad_norm": 0.18888809594549003, "learning_rate": 2.5964420284511936e-05, "loss": 0.0078, "step": 3157 }, { "epoch": 2.3673163418290857, "grad_norm": 0.08862512881682469, "learning_rate": 2.590580291357404e-05, "loss": 0.0038, "step": 3158 }, { "epoch": 2.368065967016492, "grad_norm": 0.1560274818938899, "learning_rate": 2.5847241937255696e-05, "loss": 0.0037, "step": 3159 }, { "epoch": 2.368815592203898, "grad_norm": 0.09133329862797442, "learning_rate": 2.5788737400128915e-05, "loss": 0.0043, "step": 3160 }, { "epoch": 2.369565217391304, "grad_norm": 0.03709155544184032, "learning_rate": 2.573028934672267e-05, "loss": 0.0034, "step": 3161 }, { "epoch": 2.370314842578711, "grad_norm": 0.05432304607523067, "learning_rate": 2.5671897821522994e-05, "loss": 0.0022, "step": 3162 }, { "epoch": 2.371064467766117, "grad_norm": 0.19186007553771886, "learning_rate": 2.5613562868972863e-05, "loss": 0.0032, "step": 3163 }, { "epoch": 2.371814092953523, "grad_norm": 0.10272902414474801, "learning_rate": 2.5555284533472225e-05, "loss": 0.0052, "step": 3164 }, { "epoch": 2.3725637181409294, "grad_norm": 0.33886922406643966, "learning_rate": 2.5497062859377907e-05, "loss": 0.0214, "step": 3165 }, { "epoch": 2.373313343328336, "grad_norm": 0.09329404194299042, "learning_rate": 2.5438897891003644e-05, "loss": 0.0058, "step": 3166 }, { "epoch": 2.374062968515742, "grad_norm": 0.027590342633528796, "learning_rate": 2.5380789672619976e-05, "loss": 0.0005, "step": 3167 }, { "epoch": 2.3748125937031483, "grad_norm": 0.10410756246755437, "learning_rate": 2.532273824845426e-05, "loss": 0.0028, "step": 3168 }, { "epoch": 2.375562218890555, "grad_norm": 0.05486602305277263, "learning_rate": 2.5264743662690626e-05, "loss": 0.0033, "step": 3169 }, { "epoch": 2.376311844077961, "grad_norm": 0.22503256238636352, "learning_rate": 2.5206805959469982e-05, "loss": 0.0051, "step": 3170 }, { "epoch": 2.3770614692653673, "grad_norm": 0.06505037277743252, "learning_rate": 2.514892518288988e-05, "loss": 0.0032, "step": 3171 }, { "epoch": 2.3778110944527735, "grad_norm": 0.11335554596112911, "learning_rate": 2.509110137700459e-05, "loss": 0.0045, "step": 3172 }, { "epoch": 2.3785607196401797, "grad_norm": 0.09207603652791845, "learning_rate": 2.5033334585824987e-05, "loss": 0.0068, "step": 3173 }, { "epoch": 2.3793103448275863, "grad_norm": 0.08269320542812943, "learning_rate": 2.497562485331857e-05, "loss": 0.0024, "step": 3174 }, { "epoch": 2.3800599700149925, "grad_norm": 0.0921472923368709, "learning_rate": 2.4917972223409403e-05, "loss": 0.003, "step": 3175 }, { "epoch": 2.3808095952023987, "grad_norm": 0.04366359037083029, "learning_rate": 2.4860376739978087e-05, "loss": 0.0019, "step": 3176 }, { "epoch": 2.3815592203898053, "grad_norm": 0.03400033401219638, "learning_rate": 2.480283844686174e-05, "loss": 0.0015, "step": 3177 }, { "epoch": 2.3823088455772115, "grad_norm": 0.09706802506791543, "learning_rate": 2.474535738785394e-05, "loss": 0.0071, "step": 3178 }, { "epoch": 2.3830584707646176, "grad_norm": 0.05951984507419919, "learning_rate": 2.4687933606704694e-05, "loss": 0.0023, "step": 3179 }, { "epoch": 2.383808095952024, "grad_norm": 0.043996798867113274, "learning_rate": 2.463056714712043e-05, "loss": 0.0018, "step": 3180 }, { "epoch": 2.3845577211394304, "grad_norm": 0.03593451814711463, "learning_rate": 2.4573258052763948e-05, "loss": 0.001, "step": 3181 }, { "epoch": 2.3853073463268366, "grad_norm": 0.14919848701160657, "learning_rate": 2.4516006367254384e-05, "loss": 0.0068, "step": 3182 }, { "epoch": 2.386056971514243, "grad_norm": 0.19455939856494417, "learning_rate": 2.4458812134167132e-05, "loss": 0.0132, "step": 3183 }, { "epoch": 2.386806596701649, "grad_norm": 0.07319318949194768, "learning_rate": 2.4401675397033964e-05, "loss": 0.005, "step": 3184 }, { "epoch": 2.3875562218890556, "grad_norm": 0.04140646732606931, "learning_rate": 2.4344596199342793e-05, "loss": 0.0011, "step": 3185 }, { "epoch": 2.3883058470764618, "grad_norm": 0.20088390351180652, "learning_rate": 2.4287574584537786e-05, "loss": 0.0094, "step": 3186 }, { "epoch": 2.389055472263868, "grad_norm": 0.15192548332497946, "learning_rate": 2.4230610596019265e-05, "loss": 0.0107, "step": 3187 }, { "epoch": 2.3898050974512746, "grad_norm": 0.11208152247157709, "learning_rate": 2.4173704277143695e-05, "loss": 0.0063, "step": 3188 }, { "epoch": 2.3905547226386807, "grad_norm": 0.08848833128247399, "learning_rate": 2.4116855671223625e-05, "loss": 0.0051, "step": 3189 }, { "epoch": 2.391304347826087, "grad_norm": 0.05282982327077367, "learning_rate": 2.406006482152774e-05, "loss": 0.003, "step": 3190 }, { "epoch": 2.392053973013493, "grad_norm": 0.08195828669539026, "learning_rate": 2.400333177128069e-05, "loss": 0.008, "step": 3191 }, { "epoch": 2.3928035982008997, "grad_norm": 0.04534374907943794, "learning_rate": 2.3946656563663196e-05, "loss": 0.002, "step": 3192 }, { "epoch": 2.393553223388306, "grad_norm": 0.08263871987216773, "learning_rate": 2.3890039241811867e-05, "loss": 0.0032, "step": 3193 }, { "epoch": 2.394302848575712, "grad_norm": 0.08472564331894322, "learning_rate": 2.3833479848819395e-05, "loss": 0.002, "step": 3194 }, { "epoch": 2.3950524737631183, "grad_norm": 0.08397816672253118, "learning_rate": 2.3776978427734263e-05, "loss": 0.0023, "step": 3195 }, { "epoch": 2.395802098950525, "grad_norm": 0.08482986650966184, "learning_rate": 2.3720535021560865e-05, "loss": 0.0061, "step": 3196 }, { "epoch": 2.396551724137931, "grad_norm": 0.12808246072318563, "learning_rate": 2.366414967325944e-05, "loss": 0.012, "step": 3197 }, { "epoch": 2.3973013493253372, "grad_norm": 0.024415326203775196, "learning_rate": 2.360782242574604e-05, "loss": 0.0007, "step": 3198 }, { "epoch": 2.398050974512744, "grad_norm": 0.06455213192453027, "learning_rate": 2.3551553321892494e-05, "loss": 0.0089, "step": 3199 }, { "epoch": 2.39880059970015, "grad_norm": 0.039121680575436184, "learning_rate": 2.3495342404526364e-05, "loss": 0.0009, "step": 3200 }, { "epoch": 2.399550224887556, "grad_norm": 0.11826139251554775, "learning_rate": 2.3439189716430954e-05, "loss": 0.0071, "step": 3201 }, { "epoch": 2.4002998500749624, "grad_norm": 0.20739755707080024, "learning_rate": 2.3383095300345224e-05, "loss": 0.0072, "step": 3202 }, { "epoch": 2.401049475262369, "grad_norm": 0.06532052332726593, "learning_rate": 2.3327059198963798e-05, "loss": 0.002, "step": 3203 }, { "epoch": 2.401799100449775, "grad_norm": 0.22750847274796626, "learning_rate": 2.3271081454936883e-05, "loss": 0.0104, "step": 3204 }, { "epoch": 2.4025487256371814, "grad_norm": 0.017377477297441345, "learning_rate": 2.3215162110870315e-05, "loss": 0.0004, "step": 3205 }, { "epoch": 2.4032983508245875, "grad_norm": 0.1778878319909849, "learning_rate": 2.3159301209325422e-05, "loss": 0.0047, "step": 3206 }, { "epoch": 2.404047976011994, "grad_norm": 0.100210020891019, "learning_rate": 2.3103498792819157e-05, "loss": 0.0041, "step": 3207 }, { "epoch": 2.4047976011994003, "grad_norm": 0.15348021525655878, "learning_rate": 2.3047754903823837e-05, "loss": 0.007, "step": 3208 }, { "epoch": 2.4055472263868065, "grad_norm": 0.09173669806021993, "learning_rate": 2.2992069584767306e-05, "loss": 0.0044, "step": 3209 }, { "epoch": 2.406296851574213, "grad_norm": 0.1449428363068518, "learning_rate": 2.2936442878032782e-05, "loss": 0.0103, "step": 3210 }, { "epoch": 2.4070464767616193, "grad_norm": 0.1931471716433352, "learning_rate": 2.288087482595892e-05, "loss": 0.0056, "step": 3211 }, { "epoch": 2.4077961019490255, "grad_norm": 0.18221181616530532, "learning_rate": 2.282536547083971e-05, "loss": 0.0088, "step": 3212 }, { "epoch": 2.4085457271364317, "grad_norm": 0.07535885371074759, "learning_rate": 2.2769914854924447e-05, "loss": 0.004, "step": 3213 }, { "epoch": 2.409295352323838, "grad_norm": 0.2867869116874561, "learning_rate": 2.2714523020417743e-05, "loss": 0.0045, "step": 3214 }, { "epoch": 2.4100449775112445, "grad_norm": 0.10463185396513008, "learning_rate": 2.2659190009479482e-05, "loss": 0.0144, "step": 3215 }, { "epoch": 2.4107946026986506, "grad_norm": 0.08266492494464286, "learning_rate": 2.2603915864224734e-05, "loss": 0.0048, "step": 3216 }, { "epoch": 2.411544227886057, "grad_norm": 0.13151923674905358, "learning_rate": 2.2548700626723774e-05, "loss": 0.0066, "step": 3217 }, { "epoch": 2.4122938530734634, "grad_norm": 0.1498302704791928, "learning_rate": 2.2493544339002116e-05, "loss": 0.0102, "step": 3218 }, { "epoch": 2.4130434782608696, "grad_norm": 0.0545047426124523, "learning_rate": 2.24384470430403e-05, "loss": 0.0033, "step": 3219 }, { "epoch": 2.413793103448276, "grad_norm": 0.13852701257857522, "learning_rate": 2.238340878077404e-05, "loss": 0.0091, "step": 3220 }, { "epoch": 2.414542728635682, "grad_norm": 0.21838529938682086, "learning_rate": 2.232842959409407e-05, "loss": 0.0092, "step": 3221 }, { "epoch": 2.4152923538230886, "grad_norm": 0.14462270899799032, "learning_rate": 2.2273509524846192e-05, "loss": 0.0077, "step": 3222 }, { "epoch": 2.4160419790104948, "grad_norm": 0.09223350982849962, "learning_rate": 2.2218648614831193e-05, "loss": 0.0088, "step": 3223 }, { "epoch": 2.416791604197901, "grad_norm": 0.11345239267831092, "learning_rate": 2.2163846905804852e-05, "loss": 0.0031, "step": 3224 }, { "epoch": 2.417541229385307, "grad_norm": 0.06562607715456643, "learning_rate": 2.2109104439477867e-05, "loss": 0.0033, "step": 3225 }, { "epoch": 2.4182908545727138, "grad_norm": 0.1258976567530301, "learning_rate": 2.205442125751588e-05, "loss": 0.0039, "step": 3226 }, { "epoch": 2.41904047976012, "grad_norm": 0.0592420034040759, "learning_rate": 2.1999797401539358e-05, "loss": 0.0036, "step": 3227 }, { "epoch": 2.419790104947526, "grad_norm": 0.062410574467906405, "learning_rate": 2.1945232913123647e-05, "loss": 0.0026, "step": 3228 }, { "epoch": 2.4205397301349327, "grad_norm": 0.16072667320626324, "learning_rate": 2.1890727833798895e-05, "loss": 0.0051, "step": 3229 }, { "epoch": 2.421289355322339, "grad_norm": 0.06435646077977134, "learning_rate": 2.1836282205050085e-05, "loss": 0.0044, "step": 3230 }, { "epoch": 2.422038980509745, "grad_norm": 0.10547962395023756, "learning_rate": 2.1781896068316875e-05, "loss": 0.0043, "step": 3231 }, { "epoch": 2.4227886056971513, "grad_norm": 0.34184567189195214, "learning_rate": 2.172756946499368e-05, "loss": 0.0205, "step": 3232 }, { "epoch": 2.423538230884558, "grad_norm": 0.01943317922835859, "learning_rate": 2.167330243642959e-05, "loss": 0.0006, "step": 3233 }, { "epoch": 2.424287856071964, "grad_norm": 0.20086372463230104, "learning_rate": 2.161909502392836e-05, "loss": 0.011, "step": 3234 }, { "epoch": 2.4250374812593702, "grad_norm": 0.07101192234715667, "learning_rate": 2.156494726874838e-05, "loss": 0.0083, "step": 3235 }, { "epoch": 2.4257871064467764, "grad_norm": 0.04049036261930732, "learning_rate": 2.151085921210261e-05, "loss": 0.0047, "step": 3236 }, { "epoch": 2.426536731634183, "grad_norm": 0.16307343023420023, "learning_rate": 2.1456830895158574e-05, "loss": 0.0059, "step": 3237 }, { "epoch": 2.427286356821589, "grad_norm": 0.12506840491379043, "learning_rate": 2.1402862359038368e-05, "loss": 0.004, "step": 3238 }, { "epoch": 2.4280359820089954, "grad_norm": 0.0885584667949349, "learning_rate": 2.1348953644818525e-05, "loss": 0.0073, "step": 3239 }, { "epoch": 2.428785607196402, "grad_norm": 0.14648244588484946, "learning_rate": 2.1295104793530097e-05, "loss": 0.0062, "step": 3240 }, { "epoch": 2.429535232383808, "grad_norm": 0.07240228865975842, "learning_rate": 2.1241315846158505e-05, "loss": 0.0055, "step": 3241 }, { "epoch": 2.4302848575712144, "grad_norm": 0.03639532646793308, "learning_rate": 2.1187586843643713e-05, "loss": 0.0017, "step": 3242 }, { "epoch": 2.4310344827586206, "grad_norm": 0.10077748270325737, "learning_rate": 2.1133917826879913e-05, "loss": 0.0131, "step": 3243 }, { "epoch": 2.431784107946027, "grad_norm": 0.26074454882474535, "learning_rate": 2.1080308836715724e-05, "loss": 0.0147, "step": 3244 }, { "epoch": 2.4325337331334334, "grad_norm": 0.092146972275415, "learning_rate": 2.102675991395403e-05, "loss": 0.0027, "step": 3245 }, { "epoch": 2.4332833583208395, "grad_norm": 0.13457985066514191, "learning_rate": 2.0973271099352034e-05, "loss": 0.0055, "step": 3246 }, { "epoch": 2.4340329835082457, "grad_norm": 0.09271671929768435, "learning_rate": 2.0919842433621174e-05, "loss": 0.0088, "step": 3247 }, { "epoch": 2.4347826086956523, "grad_norm": 0.0579896275974002, "learning_rate": 2.086647395742709e-05, "loss": 0.0025, "step": 3248 }, { "epoch": 2.4355322338830585, "grad_norm": 0.03481173805902309, "learning_rate": 2.0813165711389647e-05, "loss": 0.002, "step": 3249 }, { "epoch": 2.4362818590704647, "grad_norm": 0.11311610359304168, "learning_rate": 2.0759917736082857e-05, "loss": 0.0038, "step": 3250 }, { "epoch": 2.4370314842578713, "grad_norm": 0.13988931045307879, "learning_rate": 2.070673007203483e-05, "loss": 0.0028, "step": 3251 }, { "epoch": 2.4377811094452775, "grad_norm": 0.14957193004353075, "learning_rate": 2.0653602759727774e-05, "loss": 0.0061, "step": 3252 }, { "epoch": 2.4385307346326837, "grad_norm": 0.04143554274829354, "learning_rate": 2.0600535839598034e-05, "loss": 0.0036, "step": 3253 }, { "epoch": 2.43928035982009, "grad_norm": 0.06687076357394921, "learning_rate": 2.054752935203591e-05, "loss": 0.0017, "step": 3254 }, { "epoch": 2.440029985007496, "grad_norm": 0.05813849507678315, "learning_rate": 2.049458333738573e-05, "loss": 0.0053, "step": 3255 }, { "epoch": 2.4407796101949026, "grad_norm": 0.161437410408533, "learning_rate": 2.0441697835945794e-05, "loss": 0.0106, "step": 3256 }, { "epoch": 2.441529235382309, "grad_norm": 0.07793383157440188, "learning_rate": 2.0388872887968356e-05, "loss": 0.0033, "step": 3257 }, { "epoch": 2.442278860569715, "grad_norm": 0.3149123134517686, "learning_rate": 2.033610853365957e-05, "loss": 0.0522, "step": 3258 }, { "epoch": 2.4430284857571216, "grad_norm": 0.01664652143123375, "learning_rate": 2.0283404813179463e-05, "loss": 0.0006, "step": 3259 }, { "epoch": 2.443778110944528, "grad_norm": 0.08198414275140468, "learning_rate": 2.0230761766641927e-05, "loss": 0.0073, "step": 3260 }, { "epoch": 2.444527736131934, "grad_norm": 0.0754390799513741, "learning_rate": 2.0178179434114674e-05, "loss": 0.014, "step": 3261 }, { "epoch": 2.44527736131934, "grad_norm": 0.10862792424697457, "learning_rate": 2.0125657855619197e-05, "loss": 0.0055, "step": 3262 }, { "epoch": 2.4460269865067468, "grad_norm": 0.21754292192596295, "learning_rate": 2.0073197071130745e-05, "loss": 0.0088, "step": 3263 }, { "epoch": 2.446776611694153, "grad_norm": 0.1918952617560053, "learning_rate": 2.0020797120578306e-05, "loss": 0.0283, "step": 3264 }, { "epoch": 2.447526236881559, "grad_norm": 0.043382904168564676, "learning_rate": 1.996845804384454e-05, "loss": 0.0026, "step": 3265 }, { "epoch": 2.4482758620689653, "grad_norm": 0.19778157747190214, "learning_rate": 1.9916179880765838e-05, "loss": 0.0177, "step": 3266 }, { "epoch": 2.449025487256372, "grad_norm": 0.18805089046965645, "learning_rate": 1.9863962671132184e-05, "loss": 0.0072, "step": 3267 }, { "epoch": 2.449775112443778, "grad_norm": 0.18755316996292778, "learning_rate": 1.9811806454687142e-05, "loss": 0.018, "step": 3268 }, { "epoch": 2.4505247376311843, "grad_norm": 0.06614250670744387, "learning_rate": 1.97597112711279e-05, "loss": 0.0033, "step": 3269 }, { "epoch": 2.451274362818591, "grad_norm": 0.11396886184625207, "learning_rate": 1.9707677160105176e-05, "loss": 0.0091, "step": 3270 }, { "epoch": 2.452023988005997, "grad_norm": 0.1191653191852702, "learning_rate": 1.9655704161223197e-05, "loss": 0.0091, "step": 3271 }, { "epoch": 2.4527736131934033, "grad_norm": 0.04978826362799082, "learning_rate": 1.9603792314039693e-05, "loss": 0.002, "step": 3272 }, { "epoch": 2.4535232383808094, "grad_norm": 0.06209037623530401, "learning_rate": 1.955194165806582e-05, "loss": 0.0026, "step": 3273 }, { "epoch": 2.454272863568216, "grad_norm": 0.15311403649462496, "learning_rate": 1.95001522327662e-05, "loss": 0.0138, "step": 3274 }, { "epoch": 2.4550224887556222, "grad_norm": 0.08934123417692379, "learning_rate": 1.9448424077558823e-05, "loss": 0.0036, "step": 3275 }, { "epoch": 2.4557721139430284, "grad_norm": 0.06125059469568494, "learning_rate": 1.939675723181502e-05, "loss": 0.004, "step": 3276 }, { "epoch": 2.4565217391304346, "grad_norm": 0.08209081530940565, "learning_rate": 1.934515173485957e-05, "loss": 0.0054, "step": 3277 }, { "epoch": 2.457271364317841, "grad_norm": 0.11537540780526452, "learning_rate": 1.9293607625970423e-05, "loss": 0.0107, "step": 3278 }, { "epoch": 2.4580209895052474, "grad_norm": 0.13870867629619593, "learning_rate": 1.924212494437887e-05, "loss": 0.0098, "step": 3279 }, { "epoch": 2.4587706146926536, "grad_norm": 0.02878266136949601, "learning_rate": 1.9190703729269456e-05, "loss": 0.0009, "step": 3280 }, { "epoch": 2.45952023988006, "grad_norm": 0.1279207625734458, "learning_rate": 1.9139344019779915e-05, "loss": 0.0077, "step": 3281 }, { "epoch": 2.4602698650674664, "grad_norm": 0.07684664298238444, "learning_rate": 1.908804585500117e-05, "loss": 0.003, "step": 3282 }, { "epoch": 2.4610194902548725, "grad_norm": 0.0543302880388553, "learning_rate": 1.9036809273977328e-05, "loss": 0.0018, "step": 3283 }, { "epoch": 2.4617691154422787, "grad_norm": 0.10888747269891666, "learning_rate": 1.89856343157056e-05, "loss": 0.0047, "step": 3284 }, { "epoch": 2.4625187406296853, "grad_norm": 0.2646476257190446, "learning_rate": 1.8934521019136308e-05, "loss": 0.012, "step": 3285 }, { "epoch": 2.4632683658170915, "grad_norm": 0.17184286978028573, "learning_rate": 1.8883469423172817e-05, "loss": 0.0125, "step": 3286 }, { "epoch": 2.4640179910044977, "grad_norm": 0.1390931631406034, "learning_rate": 1.883247956667157e-05, "loss": 0.0049, "step": 3287 }, { "epoch": 2.464767616191904, "grad_norm": 0.0714921482628974, "learning_rate": 1.8781551488441972e-05, "loss": 0.003, "step": 3288 }, { "epoch": 2.4655172413793105, "grad_norm": 0.133046984565039, "learning_rate": 1.8730685227246457e-05, "loss": 0.006, "step": 3289 }, { "epoch": 2.4662668665667167, "grad_norm": 0.11209588746179831, "learning_rate": 1.8679880821800343e-05, "loss": 0.0073, "step": 3290 }, { "epoch": 2.467016491754123, "grad_norm": 0.06819768532382646, "learning_rate": 1.8629138310771965e-05, "loss": 0.0022, "step": 3291 }, { "epoch": 2.4677661169415295, "grad_norm": 0.054661698085546696, "learning_rate": 1.8578457732782474e-05, "loss": 0.0013, "step": 3292 }, { "epoch": 2.4685157421289357, "grad_norm": 0.1085360087024495, "learning_rate": 1.8527839126405887e-05, "loss": 0.0034, "step": 3293 }, { "epoch": 2.469265367316342, "grad_norm": 0.049756364784295085, "learning_rate": 1.8477282530169083e-05, "loss": 0.0024, "step": 3294 }, { "epoch": 2.470014992503748, "grad_norm": 0.0366405441766717, "learning_rate": 1.842678798255172e-05, "loss": 0.0024, "step": 3295 }, { "epoch": 2.470764617691154, "grad_norm": 0.09309741368973933, "learning_rate": 1.837635552198623e-05, "loss": 0.0095, "step": 3296 }, { "epoch": 2.471514242878561, "grad_norm": 0.15714420420691394, "learning_rate": 1.832598518685782e-05, "loss": 0.0118, "step": 3297 }, { "epoch": 2.472263868065967, "grad_norm": 0.2070285577468285, "learning_rate": 1.8275677015504378e-05, "loss": 0.0086, "step": 3298 }, { "epoch": 2.473013493253373, "grad_norm": 0.14826770507822962, "learning_rate": 1.8225431046216467e-05, "loss": 0.0123, "step": 3299 }, { "epoch": 2.47376311844078, "grad_norm": 0.1449443151390573, "learning_rate": 1.8175247317237366e-05, "loss": 0.0071, "step": 3300 }, { "epoch": 2.474512743628186, "grad_norm": 0.08054687812669925, "learning_rate": 1.812512586676294e-05, "loss": 0.0009, "step": 3301 }, { "epoch": 2.475262368815592, "grad_norm": 0.019756970121557204, "learning_rate": 1.8075066732941647e-05, "loss": 0.0008, "step": 3302 }, { "epoch": 2.4760119940029983, "grad_norm": 0.19447683391193743, "learning_rate": 1.802506995387452e-05, "loss": 0.0058, "step": 3303 }, { "epoch": 2.476761619190405, "grad_norm": 0.04533386155965311, "learning_rate": 1.797513556761514e-05, "loss": 0.002, "step": 3304 }, { "epoch": 2.477511244377811, "grad_norm": 0.05840662803628417, "learning_rate": 1.7925263612169608e-05, "loss": 0.0019, "step": 3305 }, { "epoch": 2.4782608695652173, "grad_norm": 0.07904173424928156, "learning_rate": 1.7875454125496482e-05, "loss": 0.0034, "step": 3306 }, { "epoch": 2.4790104947526235, "grad_norm": 0.07591255699380649, "learning_rate": 1.78257071455068e-05, "loss": 0.0049, "step": 3307 }, { "epoch": 2.47976011994003, "grad_norm": 0.15415615413389439, "learning_rate": 1.777602271006401e-05, "loss": 0.0042, "step": 3308 }, { "epoch": 2.4805097451274363, "grad_norm": 0.10602648104833119, "learning_rate": 1.7726400856983948e-05, "loss": 0.0039, "step": 3309 }, { "epoch": 2.4812593703148424, "grad_norm": 0.1428032240200061, "learning_rate": 1.7676841624034846e-05, "loss": 0.0023, "step": 3310 }, { "epoch": 2.482008995502249, "grad_norm": 0.1057352560848211, "learning_rate": 1.7627345048937237e-05, "loss": 0.0106, "step": 3311 }, { "epoch": 2.4827586206896552, "grad_norm": 0.21025082098498768, "learning_rate": 1.7577911169364004e-05, "loss": 0.012, "step": 3312 }, { "epoch": 2.4835082458770614, "grad_norm": 0.2070421759988733, "learning_rate": 1.7528540022940288e-05, "loss": 0.0108, "step": 3313 }, { "epoch": 2.4842578710644676, "grad_norm": 0.3783571009031663, "learning_rate": 1.7479231647243444e-05, "loss": 0.0092, "step": 3314 }, { "epoch": 2.4850074962518742, "grad_norm": 0.10719782576136593, "learning_rate": 1.742998607980315e-05, "loss": 0.0041, "step": 3315 }, { "epoch": 2.4857571214392804, "grad_norm": 0.06422424759686289, "learning_rate": 1.7380803358101206e-05, "loss": 0.0031, "step": 3316 }, { "epoch": 2.4865067466266866, "grad_norm": 0.07860804952533651, "learning_rate": 1.7331683519571594e-05, "loss": 0.0078, "step": 3317 }, { "epoch": 2.4872563718140928, "grad_norm": 0.035314739290592416, "learning_rate": 1.7282626601600415e-05, "loss": 0.0021, "step": 3318 }, { "epoch": 2.4880059970014994, "grad_norm": 0.15164882963503368, "learning_rate": 1.7233632641525934e-05, "loss": 0.0046, "step": 3319 }, { "epoch": 2.4887556221889056, "grad_norm": 0.11015905813111565, "learning_rate": 1.7184701676638427e-05, "loss": 0.0065, "step": 3320 }, { "epoch": 2.4895052473763117, "grad_norm": 0.12187436113335419, "learning_rate": 1.7135833744180274e-05, "loss": 0.0097, "step": 3321 }, { "epoch": 2.4902548725637184, "grad_norm": 0.06096731943620705, "learning_rate": 1.7087028881345846e-05, "loss": 0.0058, "step": 3322 }, { "epoch": 2.4910044977511245, "grad_norm": 0.11027908003154516, "learning_rate": 1.7038287125281584e-05, "loss": 0.0171, "step": 3323 }, { "epoch": 2.4917541229385307, "grad_norm": 0.1564353026772974, "learning_rate": 1.6989608513085786e-05, "loss": 0.0073, "step": 3324 }, { "epoch": 2.492503748125937, "grad_norm": 0.10037155018203812, "learning_rate": 1.6940993081808787e-05, "loss": 0.0104, "step": 3325 }, { "epoch": 2.4932533733133435, "grad_norm": 0.14119281576355544, "learning_rate": 1.6892440868452764e-05, "loss": 0.006, "step": 3326 }, { "epoch": 2.4940029985007497, "grad_norm": 0.20832412941132497, "learning_rate": 1.684395190997182e-05, "loss": 0.0125, "step": 3327 }, { "epoch": 2.494752623688156, "grad_norm": 0.13363735419048947, "learning_rate": 1.6795526243271908e-05, "loss": 0.0037, "step": 3328 }, { "epoch": 2.495502248875562, "grad_norm": 0.23693648411466714, "learning_rate": 1.6747163905210782e-05, "loss": 0.0259, "step": 3329 }, { "epoch": 2.4962518740629687, "grad_norm": 0.05846092967251272, "learning_rate": 1.6698864932598012e-05, "loss": 0.0049, "step": 3330 }, { "epoch": 2.497001499250375, "grad_norm": 0.0954056777302459, "learning_rate": 1.665062936219497e-05, "loss": 0.0054, "step": 3331 }, { "epoch": 2.497751124437781, "grad_norm": 0.13874395577918525, "learning_rate": 1.6602457230714707e-05, "loss": 0.0054, "step": 3332 }, { "epoch": 2.4985007496251876, "grad_norm": 0.09704989563203428, "learning_rate": 1.6554348574822043e-05, "loss": 0.0074, "step": 3333 }, { "epoch": 2.499250374812594, "grad_norm": 0.10633359164911542, "learning_rate": 1.6506303431133465e-05, "loss": 0.0048, "step": 3334 }, { "epoch": 2.5, "grad_norm": 0.0803813036357924, "learning_rate": 1.6458321836217118e-05, "loss": 0.0033, "step": 3335 }, { "epoch": 2.500749625187406, "grad_norm": 0.0678889800093358, "learning_rate": 1.641040382659279e-05, "loss": 0.0063, "step": 3336 }, { "epoch": 2.5014992503748124, "grad_norm": 0.11564895659247448, "learning_rate": 1.6362549438731857e-05, "loss": 0.011, "step": 3337 }, { "epoch": 2.502248875562219, "grad_norm": 0.10691469240997556, "learning_rate": 1.6314758709057254e-05, "loss": 0.0022, "step": 3338 }, { "epoch": 2.502998500749625, "grad_norm": 0.04258097389584946, "learning_rate": 1.6267031673943543e-05, "loss": 0.0029, "step": 3339 }, { "epoch": 2.5037481259370313, "grad_norm": 0.2022447545861962, "learning_rate": 1.6219368369716737e-05, "loss": 0.01, "step": 3340 }, { "epoch": 2.504497751124438, "grad_norm": 0.08254344073917239, "learning_rate": 1.617176883265433e-05, "loss": 0.013, "step": 3341 }, { "epoch": 2.505247376311844, "grad_norm": 0.24658516758530183, "learning_rate": 1.6124233098985352e-05, "loss": 0.0183, "step": 3342 }, { "epoch": 2.5059970014992503, "grad_norm": 0.08495882689774024, "learning_rate": 1.60767612048902e-05, "loss": 0.0053, "step": 3343 }, { "epoch": 2.506746626686657, "grad_norm": 0.07942922256090312, "learning_rate": 1.6029353186500708e-05, "loss": 0.0042, "step": 3344 }, { "epoch": 2.507496251874063, "grad_norm": 0.07118141791709931, "learning_rate": 1.5982009079900118e-05, "loss": 0.0066, "step": 3345 }, { "epoch": 2.5082458770614693, "grad_norm": 0.12139061932172453, "learning_rate": 1.5934728921122955e-05, "loss": 0.0036, "step": 3346 }, { "epoch": 2.5089955022488755, "grad_norm": 0.06628093070288814, "learning_rate": 1.5887512746155185e-05, "loss": 0.0031, "step": 3347 }, { "epoch": 2.5097451274362816, "grad_norm": 0.10456872395325517, "learning_rate": 1.5840360590933988e-05, "loss": 0.0059, "step": 3348 }, { "epoch": 2.5104947526236883, "grad_norm": 0.04529821859284247, "learning_rate": 1.579327249134783e-05, "loss": 0.0025, "step": 3349 }, { "epoch": 2.5112443778110944, "grad_norm": 0.08226942305606083, "learning_rate": 1.574624848323646e-05, "loss": 0.0065, "step": 3350 }, { "epoch": 2.5119940029985006, "grad_norm": 0.004788370896557784, "learning_rate": 1.5699288602390782e-05, "loss": 0.0001, "step": 3351 }, { "epoch": 2.5127436281859072, "grad_norm": 0.04635220007278972, "learning_rate": 1.5652392884552947e-05, "loss": 0.0029, "step": 3352 }, { "epoch": 2.5134932533733134, "grad_norm": 0.10648525699498171, "learning_rate": 1.5605561365416256e-05, "loss": 0.0077, "step": 3353 }, { "epoch": 2.5142428785607196, "grad_norm": 0.12715298764016186, "learning_rate": 1.5558794080625127e-05, "loss": 0.0082, "step": 3354 }, { "epoch": 2.5149925037481258, "grad_norm": 0.06798855324869771, "learning_rate": 1.5512091065775126e-05, "loss": 0.0076, "step": 3355 }, { "epoch": 2.5157421289355324, "grad_norm": 0.07077620336304406, "learning_rate": 1.546545235641287e-05, "loss": 0.0033, "step": 3356 }, { "epoch": 2.5164917541229386, "grad_norm": 0.06352248995169558, "learning_rate": 1.5418877988036028e-05, "loss": 0.0046, "step": 3357 }, { "epoch": 2.5172413793103448, "grad_norm": 0.13447931451764636, "learning_rate": 1.537236799609333e-05, "loss": 0.0048, "step": 3358 }, { "epoch": 2.517991004497751, "grad_norm": 0.23300269027514214, "learning_rate": 1.5325922415984482e-05, "loss": 0.0192, "step": 3359 }, { "epoch": 2.5187406296851576, "grad_norm": 0.10911871613118115, "learning_rate": 1.5279541283060163e-05, "loss": 0.0046, "step": 3360 }, { "epoch": 2.5194902548725637, "grad_norm": 0.07059905883441622, "learning_rate": 1.5233224632622023e-05, "loss": 0.0059, "step": 3361 }, { "epoch": 2.52023988005997, "grad_norm": 0.12077716301718372, "learning_rate": 1.5186972499922592e-05, "loss": 0.0054, "step": 3362 }, { "epoch": 2.5209895052473765, "grad_norm": 0.12174224416791476, "learning_rate": 1.5140784920165362e-05, "loss": 0.0061, "step": 3363 }, { "epoch": 2.5217391304347827, "grad_norm": 0.07569846123809262, "learning_rate": 1.5094661928504628e-05, "loss": 0.0049, "step": 3364 }, { "epoch": 2.522488755622189, "grad_norm": 0.11639308956168937, "learning_rate": 1.5048603560045549e-05, "loss": 0.0039, "step": 3365 }, { "epoch": 2.523238380809595, "grad_norm": 0.10689395952382459, "learning_rate": 1.5002609849844106e-05, "loss": 0.0132, "step": 3366 }, { "epoch": 2.5239880059970012, "grad_norm": 0.045761003984327045, "learning_rate": 1.4956680832907056e-05, "loss": 0.0013, "step": 3367 }, { "epoch": 2.524737631184408, "grad_norm": 0.11936305725887401, "learning_rate": 1.4910816544191908e-05, "loss": 0.0048, "step": 3368 }, { "epoch": 2.525487256371814, "grad_norm": 0.041702788660834045, "learning_rate": 1.4865017018606909e-05, "loss": 0.0006, "step": 3369 }, { "epoch": 2.52623688155922, "grad_norm": 0.20843594068598312, "learning_rate": 1.481928229101106e-05, "loss": 0.0093, "step": 3370 }, { "epoch": 2.526986506746627, "grad_norm": 0.06508216640159331, "learning_rate": 1.4773612396213988e-05, "loss": 0.003, "step": 3371 }, { "epoch": 2.527736131934033, "grad_norm": 0.10586528511790065, "learning_rate": 1.4728007368975983e-05, "loss": 0.004, "step": 3372 }, { "epoch": 2.528485757121439, "grad_norm": 0.05561954050455667, "learning_rate": 1.468246724400797e-05, "loss": 0.0024, "step": 3373 }, { "epoch": 2.529235382308846, "grad_norm": 0.18058892377561644, "learning_rate": 1.4636992055971477e-05, "loss": 0.0041, "step": 3374 }, { "epoch": 2.529985007496252, "grad_norm": 0.09340388762629254, "learning_rate": 1.4591581839478607e-05, "loss": 0.0038, "step": 3375 }, { "epoch": 2.530734632683658, "grad_norm": 0.09641022352823704, "learning_rate": 1.4546236629092025e-05, "loss": 0.0112, "step": 3376 }, { "epoch": 2.5314842578710643, "grad_norm": 0.017482766975639216, "learning_rate": 1.4500956459324876e-05, "loss": 0.0009, "step": 3377 }, { "epoch": 2.5322338830584705, "grad_norm": 0.006491921022898667, "learning_rate": 1.4455741364640862e-05, "loss": 0.0001, "step": 3378 }, { "epoch": 2.532983508245877, "grad_norm": 0.4206000354963242, "learning_rate": 1.4410591379454097e-05, "loss": 0.0258, "step": 3379 }, { "epoch": 2.5337331334332833, "grad_norm": 0.04486177965619197, "learning_rate": 1.436550653812918e-05, "loss": 0.0017, "step": 3380 }, { "epoch": 2.5344827586206895, "grad_norm": 0.0518983797022331, "learning_rate": 1.4320486874981099e-05, "loss": 0.0042, "step": 3381 }, { "epoch": 2.535232383808096, "grad_norm": 0.14914624735948265, "learning_rate": 1.4275532424275263e-05, "loss": 0.0115, "step": 3382 }, { "epoch": 2.5359820089955023, "grad_norm": 0.06403302361142668, "learning_rate": 1.4230643220227413e-05, "loss": 0.0014, "step": 3383 }, { "epoch": 2.5367316341829085, "grad_norm": 0.05932830774052173, "learning_rate": 1.4185819297003666e-05, "loss": 0.0059, "step": 3384 }, { "epoch": 2.537481259370315, "grad_norm": 0.08205924145282562, "learning_rate": 1.414106068872042e-05, "loss": 0.0055, "step": 3385 }, { "epoch": 2.5382308845577213, "grad_norm": 0.24183435329526498, "learning_rate": 1.4096367429444346e-05, "loss": 0.0072, "step": 3386 }, { "epoch": 2.5389805097451275, "grad_norm": 0.16890306225531435, "learning_rate": 1.4051739553192467e-05, "loss": 0.0042, "step": 3387 }, { "epoch": 2.5397301349325336, "grad_norm": 0.17121384666776437, "learning_rate": 1.4007177093931945e-05, "loss": 0.0065, "step": 3388 }, { "epoch": 2.54047976011994, "grad_norm": 0.06053968873643824, "learning_rate": 1.3962680085580194e-05, "loss": 0.0037, "step": 3389 }, { "epoch": 2.5412293853073464, "grad_norm": 0.07343884227867872, "learning_rate": 1.3918248562004788e-05, "loss": 0.0038, "step": 3390 }, { "epoch": 2.5419790104947526, "grad_norm": 0.10825304746636069, "learning_rate": 1.3873882557023488e-05, "loss": 0.0036, "step": 3391 }, { "epoch": 2.542728635682159, "grad_norm": 0.10489755580343259, "learning_rate": 1.3829582104404149e-05, "loss": 0.0047, "step": 3392 }, { "epoch": 2.5434782608695654, "grad_norm": 0.07540453424961847, "learning_rate": 1.3785347237864799e-05, "loss": 0.0029, "step": 3393 }, { "epoch": 2.5442278860569716, "grad_norm": 0.07260094751569643, "learning_rate": 1.3741177991073484e-05, "loss": 0.0035, "step": 3394 }, { "epoch": 2.5449775112443778, "grad_norm": 0.03176271883763127, "learning_rate": 1.3697074397648335e-05, "loss": 0.0014, "step": 3395 }, { "epoch": 2.545727136431784, "grad_norm": 0.08957438106312125, "learning_rate": 1.36530364911575e-05, "loss": 0.0101, "step": 3396 }, { "epoch": 2.5464767616191906, "grad_norm": 0.06852095264691668, "learning_rate": 1.3609064305119135e-05, "loss": 0.0051, "step": 3397 }, { "epoch": 2.5472263868065967, "grad_norm": 0.18697086935588128, "learning_rate": 1.3565157873001377e-05, "loss": 0.012, "step": 3398 }, { "epoch": 2.547976011994003, "grad_norm": 0.019448086688255033, "learning_rate": 1.3521317228222318e-05, "loss": 0.0009, "step": 3399 }, { "epoch": 2.548725637181409, "grad_norm": 0.16603021849626445, "learning_rate": 1.347754240414998e-05, "loss": 0.0091, "step": 3400 }, { "epoch": 2.5494752623688157, "grad_norm": 0.06125424646624753, "learning_rate": 1.3433833434102272e-05, "loss": 0.0028, "step": 3401 }, { "epoch": 2.550224887556222, "grad_norm": 0.03927286803929153, "learning_rate": 1.339019035134701e-05, "loss": 0.0016, "step": 3402 }, { "epoch": 2.550974512743628, "grad_norm": 0.08078465371000328, "learning_rate": 1.3346613189101819e-05, "loss": 0.0023, "step": 3403 }, { "epoch": 2.5517241379310347, "grad_norm": 0.18222397947614588, "learning_rate": 1.3303101980534184e-05, "loss": 0.0075, "step": 3404 }, { "epoch": 2.552473763118441, "grad_norm": 0.09719479831833734, "learning_rate": 1.3259656758761397e-05, "loss": 0.0076, "step": 3405 }, { "epoch": 2.553223388305847, "grad_norm": 0.11960845259668604, "learning_rate": 1.3216277556850487e-05, "loss": 0.0062, "step": 3406 }, { "epoch": 2.5539730134932532, "grad_norm": 0.04845046202394285, "learning_rate": 1.3172964407818277e-05, "loss": 0.0021, "step": 3407 }, { "epoch": 2.5547226386806594, "grad_norm": 0.08315262578770383, "learning_rate": 1.3129717344631288e-05, "loss": 0.0049, "step": 3408 }, { "epoch": 2.555472263868066, "grad_norm": 0.14214322041507318, "learning_rate": 1.3086536400205752e-05, "loss": 0.0186, "step": 3409 }, { "epoch": 2.556221889055472, "grad_norm": 0.09052640614593274, "learning_rate": 1.3043421607407557e-05, "loss": 0.0045, "step": 3410 }, { "epoch": 2.5569715142428784, "grad_norm": 0.02934896328124305, "learning_rate": 1.300037299905229e-05, "loss": 0.0008, "step": 3411 }, { "epoch": 2.557721139430285, "grad_norm": 0.04848408509157833, "learning_rate": 1.295739060790514e-05, "loss": 0.0039, "step": 3412 }, { "epoch": 2.558470764617691, "grad_norm": 0.02128701888303665, "learning_rate": 1.291447446668086e-05, "loss": 0.0006, "step": 3413 }, { "epoch": 2.5592203898050974, "grad_norm": 0.09413095292963528, "learning_rate": 1.287162460804382e-05, "loss": 0.0044, "step": 3414 }, { "epoch": 2.559970014992504, "grad_norm": 0.17174537693955969, "learning_rate": 1.2828841064607911e-05, "loss": 0.0081, "step": 3415 }, { "epoch": 2.56071964017991, "grad_norm": 0.07794231023141801, "learning_rate": 1.2786123868936617e-05, "loss": 0.003, "step": 3416 }, { "epoch": 2.5614692653673163, "grad_norm": 0.030253277823394865, "learning_rate": 1.2743473053542842e-05, "loss": 0.0021, "step": 3417 }, { "epoch": 2.5622188905547225, "grad_norm": 0.07176729695739768, "learning_rate": 1.2700888650889008e-05, "loss": 0.005, "step": 3418 }, { "epoch": 2.5629685157421287, "grad_norm": 0.07416769918242676, "learning_rate": 1.265837069338699e-05, "loss": 0.0019, "step": 3419 }, { "epoch": 2.5637181409295353, "grad_norm": 0.06427223920828888, "learning_rate": 1.2615919213398053e-05, "loss": 0.0026, "step": 3420 }, { "epoch": 2.5644677661169415, "grad_norm": 0.19369364107159953, "learning_rate": 1.2573534243232921e-05, "loss": 0.0036, "step": 3421 }, { "epoch": 2.5652173913043477, "grad_norm": 0.04976782879154124, "learning_rate": 1.2531215815151653e-05, "loss": 0.0017, "step": 3422 }, { "epoch": 2.5659670164917543, "grad_norm": 0.06578714098838653, "learning_rate": 1.2488963961363676e-05, "loss": 0.0059, "step": 3423 }, { "epoch": 2.5667166416791605, "grad_norm": 0.23465623124533136, "learning_rate": 1.2446778714027762e-05, "loss": 0.0102, "step": 3424 }, { "epoch": 2.5674662668665666, "grad_norm": 0.05855814761088096, "learning_rate": 1.2404660105251952e-05, "loss": 0.0027, "step": 3425 }, { "epoch": 2.5682158920539733, "grad_norm": 0.1571853358091456, "learning_rate": 1.2362608167093593e-05, "loss": 0.0027, "step": 3426 }, { "epoch": 2.5689655172413794, "grad_norm": 0.08157672990923209, "learning_rate": 1.2320622931559289e-05, "loss": 0.0035, "step": 3427 }, { "epoch": 2.5697151424287856, "grad_norm": 0.20591070104559273, "learning_rate": 1.227870443060486e-05, "loss": 0.016, "step": 3428 }, { "epoch": 2.570464767616192, "grad_norm": 0.22646797574654348, "learning_rate": 1.2236852696135347e-05, "loss": 0.0148, "step": 3429 }, { "epoch": 2.571214392803598, "grad_norm": 0.12966875286555696, "learning_rate": 1.2195067760004953e-05, "loss": 0.0111, "step": 3430 }, { "epoch": 2.5719640179910046, "grad_norm": 0.15851610153186296, "learning_rate": 1.2153349654017066e-05, "loss": 0.0149, "step": 3431 }, { "epoch": 2.572713643178411, "grad_norm": 0.04538796725577889, "learning_rate": 1.21116984099242e-05, "loss": 0.001, "step": 3432 }, { "epoch": 2.573463268365817, "grad_norm": 0.2040980296949387, "learning_rate": 1.2070114059427962e-05, "loss": 0.0174, "step": 3433 }, { "epoch": 2.5742128935532236, "grad_norm": 0.2319504801025156, "learning_rate": 1.2028596634179035e-05, "loss": 0.0101, "step": 3434 }, { "epoch": 2.5749625187406298, "grad_norm": 0.07389525246705174, "learning_rate": 1.1987146165777251e-05, "loss": 0.0034, "step": 3435 }, { "epoch": 2.575712143928036, "grad_norm": 0.04947547613037489, "learning_rate": 1.1945762685771367e-05, "loss": 0.0023, "step": 3436 }, { "epoch": 2.576461769115442, "grad_norm": 0.041088136619714506, "learning_rate": 1.1904446225659216e-05, "loss": 0.0038, "step": 3437 }, { "epoch": 2.5772113943028487, "grad_norm": 0.06362798204591215, "learning_rate": 1.1863196816887611e-05, "loss": 0.0057, "step": 3438 }, { "epoch": 2.577961019490255, "grad_norm": 0.0712690589017783, "learning_rate": 1.1822014490852296e-05, "loss": 0.0033, "step": 3439 }, { "epoch": 2.578710644677661, "grad_norm": 0.06630960067042307, "learning_rate": 1.1780899278898027e-05, "loss": 0.0007, "step": 3440 }, { "epoch": 2.5794602698650673, "grad_norm": 0.032333683634671456, "learning_rate": 1.173985121231843e-05, "loss": 0.0016, "step": 3441 }, { "epoch": 2.580209895052474, "grad_norm": 0.06634983990847639, "learning_rate": 1.1698870322356025e-05, "loss": 0.0031, "step": 3442 }, { "epoch": 2.58095952023988, "grad_norm": 0.08736825173573612, "learning_rate": 1.1657956640202217e-05, "loss": 0.0062, "step": 3443 }, { "epoch": 2.5817091454272862, "grad_norm": 0.0852109705429149, "learning_rate": 1.161711019699725e-05, "loss": 0.0074, "step": 3444 }, { "epoch": 2.582458770614693, "grad_norm": 0.06128883036691688, "learning_rate": 1.1576331023830189e-05, "loss": 0.0015, "step": 3445 }, { "epoch": 2.583208395802099, "grad_norm": 0.08392962163284858, "learning_rate": 1.1535619151738907e-05, "loss": 0.0042, "step": 3446 }, { "epoch": 2.5839580209895052, "grad_norm": 0.12943974098761474, "learning_rate": 1.1494974611710052e-05, "loss": 0.0065, "step": 3447 }, { "epoch": 2.5847076461769114, "grad_norm": 0.11254286919409974, "learning_rate": 1.1454397434679021e-05, "loss": 0.0058, "step": 3448 }, { "epoch": 2.5854572713643176, "grad_norm": 0.11485343113759286, "learning_rate": 1.141388765152992e-05, "loss": 0.0029, "step": 3449 }, { "epoch": 2.586206896551724, "grad_norm": 0.13626055755279318, "learning_rate": 1.1373445293095609e-05, "loss": 0.0181, "step": 3450 }, { "epoch": 2.5869565217391304, "grad_norm": 0.0631198928363645, "learning_rate": 1.1333070390157575e-05, "loss": 0.0037, "step": 3451 }, { "epoch": 2.5877061469265366, "grad_norm": 0.10345333813257934, "learning_rate": 1.1292762973446003e-05, "loss": 0.0067, "step": 3452 }, { "epoch": 2.588455772113943, "grad_norm": 0.04474885996453497, "learning_rate": 1.1252523073639686e-05, "loss": 0.0026, "step": 3453 }, { "epoch": 2.5892053973013494, "grad_norm": 0.2113317755158727, "learning_rate": 1.1212350721366061e-05, "loss": 0.0035, "step": 3454 }, { "epoch": 2.5899550224887555, "grad_norm": 0.4604011176462181, "learning_rate": 1.117224594720111e-05, "loss": 0.0249, "step": 3455 }, { "epoch": 2.590704647676162, "grad_norm": 0.08213204327434842, "learning_rate": 1.1132208781669417e-05, "loss": 0.002, "step": 3456 }, { "epoch": 2.5914542728635683, "grad_norm": 0.07210620882306176, "learning_rate": 1.1092239255244085e-05, "loss": 0.0047, "step": 3457 }, { "epoch": 2.5922038980509745, "grad_norm": 0.11264038850928215, "learning_rate": 1.1052337398346724e-05, "loss": 0.007, "step": 3458 }, { "epoch": 2.5929535232383807, "grad_norm": 0.03105684997056953, "learning_rate": 1.1012503241347505e-05, "loss": 0.001, "step": 3459 }, { "epoch": 2.593703148425787, "grad_norm": 0.12112129065732546, "learning_rate": 1.097273681456501e-05, "loss": 0.0074, "step": 3460 }, { "epoch": 2.5944527736131935, "grad_norm": 0.11672808096353475, "learning_rate": 1.0933038148266273e-05, "loss": 0.006, "step": 3461 }, { "epoch": 2.5952023988005997, "grad_norm": 0.04661222628633944, "learning_rate": 1.0893407272666755e-05, "loss": 0.0023, "step": 3462 }, { "epoch": 2.595952023988006, "grad_norm": 0.04318285411314425, "learning_rate": 1.0853844217930364e-05, "loss": 0.0022, "step": 3463 }, { "epoch": 2.5967016491754125, "grad_norm": 0.15642717583329105, "learning_rate": 1.0814349014169345e-05, "loss": 0.0083, "step": 3464 }, { "epoch": 2.5974512743628186, "grad_norm": 0.1978976408685472, "learning_rate": 1.0774921691444318e-05, "loss": 0.0174, "step": 3465 }, { "epoch": 2.598200899550225, "grad_norm": 0.0659862221373999, "learning_rate": 1.07355622797642e-05, "loss": 0.003, "step": 3466 }, { "epoch": 2.5989505247376314, "grad_norm": 0.09980610197149213, "learning_rate": 1.0696270809086284e-05, "loss": 0.0091, "step": 3467 }, { "epoch": 2.5997001499250376, "grad_norm": 0.07549789682372308, "learning_rate": 1.0657047309316093e-05, "loss": 0.0043, "step": 3468 }, { "epoch": 2.600449775112444, "grad_norm": 0.09400871513513802, "learning_rate": 1.0617891810307456e-05, "loss": 0.0016, "step": 3469 }, { "epoch": 2.60119940029985, "grad_norm": 0.07409113774820665, "learning_rate": 1.0578804341862425e-05, "loss": 0.003, "step": 3470 }, { "epoch": 2.601949025487256, "grad_norm": 0.11373640219809003, "learning_rate": 1.0539784933731267e-05, "loss": 0.0039, "step": 3471 }, { "epoch": 2.6026986506746628, "grad_norm": 0.09964998752157113, "learning_rate": 1.0500833615612482e-05, "loss": 0.0072, "step": 3472 }, { "epoch": 2.603448275862069, "grad_norm": 0.030168470320950023, "learning_rate": 1.0461950417152699e-05, "loss": 0.001, "step": 3473 }, { "epoch": 2.604197901049475, "grad_norm": 0.06410917881047125, "learning_rate": 1.0423135367946724e-05, "loss": 0.0017, "step": 3474 }, { "epoch": 2.6049475262368817, "grad_norm": 0.10069157005767379, "learning_rate": 1.0384388497537501e-05, "loss": 0.0076, "step": 3475 }, { "epoch": 2.605697151424288, "grad_norm": 0.10216851017070903, "learning_rate": 1.034570983541605e-05, "loss": 0.0052, "step": 3476 }, { "epoch": 2.606446776611694, "grad_norm": 0.05589678656898697, "learning_rate": 1.0307099411021525e-05, "loss": 0.0031, "step": 3477 }, { "epoch": 2.6071964017991007, "grad_norm": 0.03938018053569718, "learning_rate": 1.0268557253741107e-05, "loss": 0.0027, "step": 3478 }, { "epoch": 2.607946026986507, "grad_norm": 0.09995308721093468, "learning_rate": 1.0230083392910006e-05, "loss": 0.0114, "step": 3479 }, { "epoch": 2.608695652173913, "grad_norm": 0.09674423576429927, "learning_rate": 1.0191677857811499e-05, "loss": 0.0048, "step": 3480 }, { "epoch": 2.6094452773613193, "grad_norm": 0.06263119023620396, "learning_rate": 1.0153340677676781e-05, "loss": 0.0055, "step": 3481 }, { "epoch": 2.6101949025487254, "grad_norm": 0.09036050664429081, "learning_rate": 1.0115071881685134e-05, "loss": 0.0047, "step": 3482 }, { "epoch": 2.610944527736132, "grad_norm": 0.11983979358718444, "learning_rate": 1.0076871498963691e-05, "loss": 0.0043, "step": 3483 }, { "epoch": 2.6116941529235382, "grad_norm": 0.045515194585739716, "learning_rate": 1.0038739558587562e-05, "loss": 0.0036, "step": 3484 }, { "epoch": 2.6124437781109444, "grad_norm": 0.08646768960077883, "learning_rate": 1.000067608957973e-05, "loss": 0.0035, "step": 3485 }, { "epoch": 2.613193403298351, "grad_norm": 0.33876013907706143, "learning_rate": 9.96268112091111e-06, "loss": 0.013, "step": 3486 }, { "epoch": 2.613943028485757, "grad_norm": 0.04606817630370628, "learning_rate": 9.924754681500459e-06, "loss": 0.0031, "step": 3487 }, { "epoch": 2.6146926536731634, "grad_norm": 0.1435363255689042, "learning_rate": 9.88689680021435e-06, "loss": 0.0035, "step": 3488 }, { "epoch": 2.6154422788605696, "grad_norm": 0.09134904091137633, "learning_rate": 9.849107505867205e-06, "loss": 0.0032, "step": 3489 }, { "epoch": 2.6161919040479757, "grad_norm": 0.056743738916346556, "learning_rate": 9.811386827221247e-06, "loss": 0.002, "step": 3490 }, { "epoch": 2.6169415292353824, "grad_norm": 0.04951286923208861, "learning_rate": 9.773734792986434e-06, "loss": 0.0023, "step": 3491 }, { "epoch": 2.6176911544227885, "grad_norm": 0.09584100347231342, "learning_rate": 9.736151431820528e-06, "loss": 0.0028, "step": 3492 }, { "epoch": 2.6184407796101947, "grad_norm": 0.12609103636803198, "learning_rate": 9.698636772328996e-06, "loss": 0.005, "step": 3493 }, { "epoch": 2.6191904047976013, "grad_norm": 0.2714084765268302, "learning_rate": 9.661190843065004e-06, "loss": 0.1229, "step": 3494 }, { "epoch": 2.6199400299850075, "grad_norm": 0.06185219904845644, "learning_rate": 9.623813672529435e-06, "loss": 0.0019, "step": 3495 }, { "epoch": 2.6206896551724137, "grad_norm": 0.2980290546335867, "learning_rate": 9.586505289170811e-06, "loss": 0.0156, "step": 3496 }, { "epoch": 2.6214392803598203, "grad_norm": 0.09600426776503927, "learning_rate": 9.549265721385326e-06, "loss": 0.0047, "step": 3497 }, { "epoch": 2.6221889055472265, "grad_norm": 0.07846840703663327, "learning_rate": 9.512094997516763e-06, "loss": 0.0033, "step": 3498 }, { "epoch": 2.6229385307346327, "grad_norm": 0.09452717454419779, "learning_rate": 9.474993145856548e-06, "loss": 0.0078, "step": 3499 }, { "epoch": 2.623688155922039, "grad_norm": 0.021453022700013084, "learning_rate": 9.437960194643647e-06, "loss": 0.0009, "step": 3500 }, { "epoch": 2.624437781109445, "grad_norm": 0.08020110491333328, "learning_rate": 9.400996172064602e-06, "loss": 0.0036, "step": 3501 }, { "epoch": 2.6251874062968517, "grad_norm": 0.07715720284674087, "learning_rate": 9.364101106253509e-06, "loss": 0.0027, "step": 3502 }, { "epoch": 2.625937031484258, "grad_norm": 0.06303706939805435, "learning_rate": 9.327275025291948e-06, "loss": 0.0054, "step": 3503 }, { "epoch": 2.626686656671664, "grad_norm": 0.06986054148862804, "learning_rate": 9.290517957208988e-06, "loss": 0.0025, "step": 3504 }, { "epoch": 2.6274362818590706, "grad_norm": 0.2573435063825212, "learning_rate": 9.25382992998124e-06, "loss": 0.0096, "step": 3505 }, { "epoch": 2.628185907046477, "grad_norm": 0.07567213887903118, "learning_rate": 9.217210971532685e-06, "loss": 0.0058, "step": 3506 }, { "epoch": 2.628935532233883, "grad_norm": 0.22383937647866028, "learning_rate": 9.180661109734779e-06, "loss": 0.0134, "step": 3507 }, { "epoch": 2.6296851574212896, "grad_norm": 0.06378477883975421, "learning_rate": 9.144180372406341e-06, "loss": 0.0035, "step": 3508 }, { "epoch": 2.630434782608696, "grad_norm": 0.06176310498690345, "learning_rate": 9.107768787313664e-06, "loss": 0.0052, "step": 3509 }, { "epoch": 2.631184407796102, "grad_norm": 0.06780408252735382, "learning_rate": 9.071426382170334e-06, "loss": 0.0052, "step": 3510 }, { "epoch": 2.631934032983508, "grad_norm": 0.0754928588054794, "learning_rate": 9.03515318463729e-06, "loss": 0.007, "step": 3511 }, { "epoch": 2.6326836581709143, "grad_norm": 0.2771991765187472, "learning_rate": 8.998949222322828e-06, "loss": 0.0067, "step": 3512 }, { "epoch": 2.633433283358321, "grad_norm": 0.16520195859466377, "learning_rate": 8.962814522782514e-06, "loss": 0.0138, "step": 3513 }, { "epoch": 2.634182908545727, "grad_norm": 0.18707832243954112, "learning_rate": 8.926749113519228e-06, "loss": 0.0142, "step": 3514 }, { "epoch": 2.6349325337331333, "grad_norm": 0.12112654939224787, "learning_rate": 8.890753021983078e-06, "loss": 0.0062, "step": 3515 }, { "epoch": 2.63568215892054, "grad_norm": 0.09972947468133485, "learning_rate": 8.854826275571449e-06, "loss": 0.0047, "step": 3516 }, { "epoch": 2.636431784107946, "grad_norm": 0.05652703877587957, "learning_rate": 8.81896890162892e-06, "loss": 0.0032, "step": 3517 }, { "epoch": 2.6371814092953523, "grad_norm": 0.09734359585560508, "learning_rate": 8.783180927447265e-06, "loss": 0.0042, "step": 3518 }, { "epoch": 2.637931034482759, "grad_norm": 0.19651972601960688, "learning_rate": 8.74746238026547e-06, "loss": 0.0075, "step": 3519 }, { "epoch": 2.638680659670165, "grad_norm": 0.11666043079140781, "learning_rate": 8.711813287269644e-06, "loss": 0.0045, "step": 3520 }, { "epoch": 2.6394302848575713, "grad_norm": 0.10923156443553575, "learning_rate": 8.676233675593037e-06, "loss": 0.0056, "step": 3521 }, { "epoch": 2.6401799100449774, "grad_norm": 0.04615933797824494, "learning_rate": 8.640723572316045e-06, "loss": 0.0018, "step": 3522 }, { "epoch": 2.6409295352323836, "grad_norm": 0.12348983813140332, "learning_rate": 8.605283004466136e-06, "loss": 0.011, "step": 3523 }, { "epoch": 2.6416791604197902, "grad_norm": 0.1323678653865999, "learning_rate": 8.569911999017832e-06, "loss": 0.0048, "step": 3524 }, { "epoch": 2.6424287856071964, "grad_norm": 0.08003696554324499, "learning_rate": 8.534610582892765e-06, "loss": 0.0034, "step": 3525 }, { "epoch": 2.6431784107946026, "grad_norm": 0.1904035214537228, "learning_rate": 8.499378782959555e-06, "loss": 0.0103, "step": 3526 }, { "epoch": 2.643928035982009, "grad_norm": 0.15545593523005102, "learning_rate": 8.46421662603385e-06, "loss": 0.0193, "step": 3527 }, { "epoch": 2.6446776611694154, "grad_norm": 0.07256795232832776, "learning_rate": 8.429124138878274e-06, "loss": 0.004, "step": 3528 }, { "epoch": 2.6454272863568216, "grad_norm": 0.04997265242737797, "learning_rate": 8.394101348202477e-06, "loss": 0.0026, "step": 3529 }, { "epoch": 2.6461769115442277, "grad_norm": 0.09839523717548251, "learning_rate": 8.359148280663021e-06, "loss": 0.0021, "step": 3530 }, { "epoch": 2.646926536731634, "grad_norm": 0.2272177074499261, "learning_rate": 8.324264962863392e-06, "loss": 0.0375, "step": 3531 }, { "epoch": 2.6476761619190405, "grad_norm": 0.05316402736053975, "learning_rate": 8.289451421353978e-06, "loss": 0.0031, "step": 3532 }, { "epoch": 2.6484257871064467, "grad_norm": 0.09666760826592079, "learning_rate": 8.254707682632134e-06, "loss": 0.0044, "step": 3533 }, { "epoch": 2.649175412293853, "grad_norm": 0.08548886953032514, "learning_rate": 8.220033773142023e-06, "loss": 0.0031, "step": 3534 }, { "epoch": 2.6499250374812595, "grad_norm": 0.055433019245836296, "learning_rate": 8.18542971927464e-06, "loss": 0.0037, "step": 3535 }, { "epoch": 2.6506746626686657, "grad_norm": 0.06702626082211281, "learning_rate": 8.150895547367876e-06, "loss": 0.0028, "step": 3536 }, { "epoch": 2.651424287856072, "grad_norm": 0.08834920550221038, "learning_rate": 8.116431283706383e-06, "loss": 0.0056, "step": 3537 }, { "epoch": 2.6521739130434785, "grad_norm": 0.07939187196088908, "learning_rate": 8.082036954521621e-06, "loss": 0.0082, "step": 3538 }, { "epoch": 2.6529235382308847, "grad_norm": 0.0693663151265394, "learning_rate": 8.04771258599184e-06, "loss": 0.0036, "step": 3539 }, { "epoch": 2.653673163418291, "grad_norm": 0.0489002340147583, "learning_rate": 8.013458204242007e-06, "loss": 0.0027, "step": 3540 }, { "epoch": 2.654422788605697, "grad_norm": 0.06117963549083394, "learning_rate": 7.979273835343836e-06, "loss": 0.0029, "step": 3541 }, { "epoch": 2.655172413793103, "grad_norm": 0.17285850209634904, "learning_rate": 7.94515950531577e-06, "loss": 0.0111, "step": 3542 }, { "epoch": 2.65592203898051, "grad_norm": 0.07674916468104553, "learning_rate": 7.911115240122913e-06, "loss": 0.0084, "step": 3543 }, { "epoch": 2.656671664167916, "grad_norm": 0.06939015226317331, "learning_rate": 7.877141065677063e-06, "loss": 0.003, "step": 3544 }, { "epoch": 2.657421289355322, "grad_norm": 0.03697970239963438, "learning_rate": 7.843237007836657e-06, "loss": 0.0019, "step": 3545 }, { "epoch": 2.658170914542729, "grad_norm": 0.10161008931335178, "learning_rate": 7.809403092406787e-06, "loss": 0.0039, "step": 3546 }, { "epoch": 2.658920539730135, "grad_norm": 0.05432572136277282, "learning_rate": 7.77563934513913e-06, "loss": 0.0054, "step": 3547 }, { "epoch": 2.659670164917541, "grad_norm": 0.09434216477553212, "learning_rate": 7.741945791731975e-06, "loss": 0.0043, "step": 3548 }, { "epoch": 2.660419790104948, "grad_norm": 0.13438109952231186, "learning_rate": 7.708322457830175e-06, "loss": 0.0149, "step": 3549 }, { "epoch": 2.661169415292354, "grad_norm": 0.044515473296177166, "learning_rate": 7.674769369025148e-06, "loss": 0.0017, "step": 3550 }, { "epoch": 2.66191904047976, "grad_norm": 0.07980519638723763, "learning_rate": 7.64128655085482e-06, "loss": 0.0027, "step": 3551 }, { "epoch": 2.6626686656671663, "grad_norm": 0.09115039728168225, "learning_rate": 7.607874028803685e-06, "loss": 0.0049, "step": 3552 }, { "epoch": 2.6634182908545725, "grad_norm": 0.11713526622844393, "learning_rate": 7.5745318283027e-06, "loss": 0.0041, "step": 3553 }, { "epoch": 2.664167916041979, "grad_norm": 0.10597277800023548, "learning_rate": 7.541259974729276e-06, "loss": 0.0083, "step": 3554 }, { "epoch": 2.6649175412293853, "grad_norm": 0.09684578884196773, "learning_rate": 7.5080584934073105e-06, "loss": 0.0058, "step": 3555 }, { "epoch": 2.6656671664167915, "grad_norm": 0.04135982192021682, "learning_rate": 7.4749274096071575e-06, "loss": 0.002, "step": 3556 }, { "epoch": 2.666416791604198, "grad_norm": 0.06499780333835134, "learning_rate": 7.441866748545545e-06, "loss": 0.0034, "step": 3557 }, { "epoch": 2.6671664167916043, "grad_norm": 0.06255583464347211, "learning_rate": 7.408876535385645e-06, "loss": 0.0078, "step": 3558 }, { "epoch": 2.6679160419790104, "grad_norm": 0.11011116172772083, "learning_rate": 7.37595679523696e-06, "loss": 0.0042, "step": 3559 }, { "epoch": 2.668665667166417, "grad_norm": 0.0722927183692007, "learning_rate": 7.343107553155404e-06, "loss": 0.0065, "step": 3560 }, { "epoch": 2.6694152923538232, "grad_norm": 0.0678410468157105, "learning_rate": 7.31032883414321e-06, "loss": 0.0066, "step": 3561 }, { "epoch": 2.6701649175412294, "grad_norm": 0.058365571482256254, "learning_rate": 7.277620663148921e-06, "loss": 0.0025, "step": 3562 }, { "epoch": 2.6709145427286356, "grad_norm": 0.08197814793200882, "learning_rate": 7.244983065067412e-06, "loss": 0.0029, "step": 3563 }, { "epoch": 2.6716641679160418, "grad_norm": 0.18926405575286565, "learning_rate": 7.2124160647398354e-06, "loss": 0.0097, "step": 3564 }, { "epoch": 2.6724137931034484, "grad_norm": 0.06395203764121683, "learning_rate": 7.179919686953585e-06, "loss": 0.0042, "step": 3565 }, { "epoch": 2.6731634182908546, "grad_norm": 0.07331177755241726, "learning_rate": 7.147493956442353e-06, "loss": 0.0032, "step": 3566 }, { "epoch": 2.6739130434782608, "grad_norm": 0.4873608362749581, "learning_rate": 7.115138897886009e-06, "loss": 0.0245, "step": 3567 }, { "epoch": 2.6746626686656674, "grad_norm": 0.17374373823700442, "learning_rate": 7.082854535910655e-06, "loss": 0.0061, "step": 3568 }, { "epoch": 2.6754122938530736, "grad_norm": 0.08369814478199004, "learning_rate": 7.050640895088589e-06, "loss": 0.0067, "step": 3569 }, { "epoch": 2.6761619190404797, "grad_norm": 0.1308839941432331, "learning_rate": 7.01849799993829e-06, "loss": 0.0051, "step": 3570 }, { "epoch": 2.676911544227886, "grad_norm": 0.12482607476795445, "learning_rate": 6.986425874924352e-06, "loss": 0.0076, "step": 3571 }, { "epoch": 2.677661169415292, "grad_norm": 0.0895789380819289, "learning_rate": 6.954424544457549e-06, "loss": 0.0037, "step": 3572 }, { "epoch": 2.6784107946026987, "grad_norm": 0.23901158055522503, "learning_rate": 6.922494032894744e-06, "loss": 0.0092, "step": 3573 }, { "epoch": 2.679160419790105, "grad_norm": 0.05298578704840303, "learning_rate": 6.8906343645388945e-06, "loss": 0.0008, "step": 3574 }, { "epoch": 2.679910044977511, "grad_norm": 0.09438117470957384, "learning_rate": 6.858845563639083e-06, "loss": 0.0053, "step": 3575 }, { "epoch": 2.6806596701649177, "grad_norm": 0.08681760628142808, "learning_rate": 6.8271276543903975e-06, "loss": 0.0022, "step": 3576 }, { "epoch": 2.681409295352324, "grad_norm": 0.06474070177166225, "learning_rate": 6.7954806609339946e-06, "loss": 0.006, "step": 3577 }, { "epoch": 2.68215892053973, "grad_norm": 0.07631249932565169, "learning_rate": 6.763904607357063e-06, "loss": 0.0034, "step": 3578 }, { "epoch": 2.6829085457271367, "grad_norm": 0.15131390870827074, "learning_rate": 6.732399517692778e-06, "loss": 0.0086, "step": 3579 }, { "epoch": 2.683658170914543, "grad_norm": 0.12343656877439753, "learning_rate": 6.700965415920302e-06, "loss": 0.0072, "step": 3580 }, { "epoch": 2.684407796101949, "grad_norm": 0.02283778423621663, "learning_rate": 6.669602325964796e-06, "loss": 0.0015, "step": 3581 }, { "epoch": 2.685157421289355, "grad_norm": 0.09042680601212649, "learning_rate": 6.6383102716973564e-06, "loss": 0.0058, "step": 3582 }, { "epoch": 2.6859070464767614, "grad_norm": 0.10654143512759034, "learning_rate": 6.607089276934997e-06, "loss": 0.0034, "step": 3583 }, { "epoch": 2.686656671664168, "grad_norm": 0.008937713892268067, "learning_rate": 6.575939365440664e-06, "loss": 0.0003, "step": 3584 }, { "epoch": 2.687406296851574, "grad_norm": 0.08699528259742419, "learning_rate": 6.544860560923205e-06, "loss": 0.0117, "step": 3585 }, { "epoch": 2.6881559220389803, "grad_norm": 0.04413936298947285, "learning_rate": 6.51385288703732e-06, "loss": 0.0044, "step": 3586 }, { "epoch": 2.688905547226387, "grad_norm": 0.40494077939008205, "learning_rate": 6.482916367383596e-06, "loss": 0.0185, "step": 3587 }, { "epoch": 2.689655172413793, "grad_norm": 0.025317057448168645, "learning_rate": 6.4520510255084655e-06, "loss": 0.001, "step": 3588 }, { "epoch": 2.6904047976011993, "grad_norm": 0.1281661479079944, "learning_rate": 6.42125688490417e-06, "loss": 0.0047, "step": 3589 }, { "epoch": 2.691154422788606, "grad_norm": 0.14593951083085305, "learning_rate": 6.390533969008761e-06, "loss": 0.01, "step": 3590 }, { "epoch": 2.691904047976012, "grad_norm": 0.1504086206960067, "learning_rate": 6.359882301206077e-06, "loss": 0.0062, "step": 3591 }, { "epoch": 2.6926536731634183, "grad_norm": 0.03789441124058746, "learning_rate": 6.329301904825746e-06, "loss": 0.0013, "step": 3592 }, { "epoch": 2.6934032983508245, "grad_norm": 0.12908249338770855, "learning_rate": 6.298792803143138e-06, "loss": 0.0108, "step": 3593 }, { "epoch": 2.6941529235382307, "grad_norm": 0.2053287462164705, "learning_rate": 6.268355019379346e-06, "loss": 0.0196, "step": 3594 }, { "epoch": 2.6949025487256373, "grad_norm": 0.20678108268250486, "learning_rate": 6.237988576701204e-06, "loss": 0.0054, "step": 3595 }, { "epoch": 2.6956521739130435, "grad_norm": 0.0540984851276832, "learning_rate": 6.207693498221235e-06, "loss": 0.0038, "step": 3596 }, { "epoch": 2.6964017991004496, "grad_norm": 0.09401306330490736, "learning_rate": 6.177469806997627e-06, "loss": 0.0025, "step": 3597 }, { "epoch": 2.6971514242878563, "grad_norm": 0.04383015446253963, "learning_rate": 6.147317526034291e-06, "loss": 0.001, "step": 3598 }, { "epoch": 2.6979010494752624, "grad_norm": 0.024997535985464992, "learning_rate": 6.1172366782807354e-06, "loss": 0.0009, "step": 3599 }, { "epoch": 2.6986506746626686, "grad_norm": 0.0469201721239403, "learning_rate": 6.087227286632103e-06, "loss": 0.0014, "step": 3600 }, { "epoch": 2.6994002998500752, "grad_norm": 0.051395596885820925, "learning_rate": 6.0572893739291756e-06, "loss": 0.0025, "step": 3601 }, { "epoch": 2.7001499250374814, "grad_norm": 0.045749901122774626, "learning_rate": 6.027422962958296e-06, "loss": 0.0016, "step": 3602 }, { "epoch": 2.7008995502248876, "grad_norm": 0.12403820171571336, "learning_rate": 5.9976280764514116e-06, "loss": 0.0098, "step": 3603 }, { "epoch": 2.7016491754122938, "grad_norm": 0.07200283473771944, "learning_rate": 5.967904737086016e-06, "loss": 0.0049, "step": 3604 }, { "epoch": 2.7023988005997, "grad_norm": 0.07568877846311735, "learning_rate": 5.938252967485192e-06, "loss": 0.0035, "step": 3605 }, { "epoch": 2.7031484257871066, "grad_norm": 0.08252827999998712, "learning_rate": 5.908672790217474e-06, "loss": 0.0077, "step": 3606 }, { "epoch": 2.7038980509745127, "grad_norm": 0.17930247189997617, "learning_rate": 5.879164227796963e-06, "loss": 0.0153, "step": 3607 }, { "epoch": 2.704647676161919, "grad_norm": 0.045819957196655735, "learning_rate": 5.849727302683217e-06, "loss": 0.0028, "step": 3608 }, { "epoch": 2.7053973013493255, "grad_norm": 0.1472539901727524, "learning_rate": 5.820362037281302e-06, "loss": 0.004, "step": 3609 }, { "epoch": 2.7061469265367317, "grad_norm": 0.0571665163793095, "learning_rate": 5.791068453941728e-06, "loss": 0.0021, "step": 3610 }, { "epoch": 2.706896551724138, "grad_norm": 0.12266217583120254, "learning_rate": 5.761846574960428e-06, "loss": 0.0067, "step": 3611 }, { "epoch": 2.707646176911544, "grad_norm": 0.15635234506738496, "learning_rate": 5.732696422578787e-06, "loss": 0.0089, "step": 3612 }, { "epoch": 2.7083958020989503, "grad_norm": 0.24646222619048166, "learning_rate": 5.70361801898358e-06, "loss": 0.007, "step": 3613 }, { "epoch": 2.709145427286357, "grad_norm": 0.09446261697690682, "learning_rate": 5.674611386306994e-06, "loss": 0.0055, "step": 3614 }, { "epoch": 2.709895052473763, "grad_norm": 0.05438453674274307, "learning_rate": 5.645676546626555e-06, "loss": 0.0021, "step": 3615 }, { "epoch": 2.7106446776611692, "grad_norm": 0.08994894529395658, "learning_rate": 5.616813521965181e-06, "loss": 0.0055, "step": 3616 }, { "epoch": 2.711394302848576, "grad_norm": 0.17303568276679598, "learning_rate": 5.58802233429111e-06, "loss": 0.0095, "step": 3617 }, { "epoch": 2.712143928035982, "grad_norm": 0.17064533406947632, "learning_rate": 5.5593030055179014e-06, "loss": 0.0048, "step": 3618 }, { "epoch": 2.712893553223388, "grad_norm": 0.0578184933228903, "learning_rate": 5.5306555575044385e-06, "loss": 0.0041, "step": 3619 }, { "epoch": 2.713643178410795, "grad_norm": 0.07165755854478759, "learning_rate": 5.502080012054867e-06, "loss": 0.0026, "step": 3620 }, { "epoch": 2.714392803598201, "grad_norm": 0.08253336755974179, "learning_rate": 5.473576390918622e-06, "loss": 0.0042, "step": 3621 }, { "epoch": 2.715142428785607, "grad_norm": 0.38558400392740255, "learning_rate": 5.4451447157904286e-06, "loss": 0.0095, "step": 3622 }, { "epoch": 2.7158920539730134, "grad_norm": 0.11407609591189503, "learning_rate": 5.416785008310199e-06, "loss": 0.0084, "step": 3623 }, { "epoch": 2.7166416791604195, "grad_norm": 0.11103306738736951, "learning_rate": 5.388497290063099e-06, "loss": 0.0024, "step": 3624 }, { "epoch": 2.717391304347826, "grad_norm": 0.06833840067269992, "learning_rate": 5.3602815825794735e-06, "loss": 0.0036, "step": 3625 }, { "epoch": 2.7181409295352323, "grad_norm": 0.0714532486964811, "learning_rate": 5.3321379073349e-06, "loss": 0.004, "step": 3626 }, { "epoch": 2.7188905547226385, "grad_norm": 0.18719531064096853, "learning_rate": 5.304066285750109e-06, "loss": 0.0218, "step": 3627 }, { "epoch": 2.719640179910045, "grad_norm": 0.1206736823435074, "learning_rate": 5.276066739190955e-06, "loss": 0.0095, "step": 3628 }, { "epoch": 2.7203898050974513, "grad_norm": 0.13394479538875573, "learning_rate": 5.248139288968523e-06, "loss": 0.0065, "step": 3629 }, { "epoch": 2.7211394302848575, "grad_norm": 0.07118915867328775, "learning_rate": 5.220283956338934e-06, "loss": 0.004, "step": 3630 }, { "epoch": 2.721889055472264, "grad_norm": 0.07518397660658808, "learning_rate": 5.192500762503472e-06, "loss": 0.0021, "step": 3631 }, { "epoch": 2.7226386806596703, "grad_norm": 0.13623715597075606, "learning_rate": 5.164789728608488e-06, "loss": 0.0068, "step": 3632 }, { "epoch": 2.7233883058470765, "grad_norm": 0.08135641564886333, "learning_rate": 5.137150875745433e-06, "loss": 0.0047, "step": 3633 }, { "epoch": 2.7241379310344827, "grad_norm": 0.2905574517744519, "learning_rate": 5.109584224950792e-06, "loss": 0.0129, "step": 3634 }, { "epoch": 2.724887556221889, "grad_norm": 0.03198727740319656, "learning_rate": 5.082089797206124e-06, "loss": 0.0014, "step": 3635 }, { "epoch": 2.7256371814092955, "grad_norm": 0.0738532460460911, "learning_rate": 5.054667613437991e-06, "loss": 0.0105, "step": 3636 }, { "epoch": 2.7263868065967016, "grad_norm": 0.15003634894034606, "learning_rate": 5.027317694517996e-06, "loss": 0.005, "step": 3637 }, { "epoch": 2.727136431784108, "grad_norm": 0.2861318864965912, "learning_rate": 5.0000400612627114e-06, "loss": 0.022, "step": 3638 }, { "epoch": 2.7278860569715144, "grad_norm": 0.0830609886318904, "learning_rate": 4.972834734433718e-06, "loss": 0.01, "step": 3639 }, { "epoch": 2.7286356821589206, "grad_norm": 0.3801169987148443, "learning_rate": 4.94570173473754e-06, "loss": 0.0061, "step": 3640 }, { "epoch": 2.729385307346327, "grad_norm": 0.056034639807724315, "learning_rate": 4.91864108282567e-06, "loss": 0.0036, "step": 3641 }, { "epoch": 2.7301349325337334, "grad_norm": 0.08424484116620803, "learning_rate": 4.8916527992945325e-06, "loss": 0.0033, "step": 3642 }, { "epoch": 2.7308845577211396, "grad_norm": 0.09459805879195689, "learning_rate": 4.8647369046854515e-06, "loss": 0.003, "step": 3643 }, { "epoch": 2.7316341829085458, "grad_norm": 0.035342901239077684, "learning_rate": 4.837893419484663e-06, "loss": 0.001, "step": 3644 }, { "epoch": 2.732383808095952, "grad_norm": 0.024040876961602176, "learning_rate": 4.811122364123322e-06, "loss": 0.001, "step": 3645 }, { "epoch": 2.733133433283358, "grad_norm": 0.09592235037490139, "learning_rate": 4.78442375897743e-06, "loss": 0.0119, "step": 3646 }, { "epoch": 2.7338830584707647, "grad_norm": 0.07500277826770634, "learning_rate": 4.757797624367821e-06, "loss": 0.0027, "step": 3647 }, { "epoch": 2.734632683658171, "grad_norm": 0.022403652497716163, "learning_rate": 4.731243980560207e-06, "loss": 0.0006, "step": 3648 }, { "epoch": 2.735382308845577, "grad_norm": 0.0308911192677184, "learning_rate": 4.7047628477651095e-06, "loss": 0.0009, "step": 3649 }, { "epoch": 2.7361319340329837, "grad_norm": 0.04449662376820901, "learning_rate": 4.678354246137873e-06, "loss": 0.003, "step": 3650 }, { "epoch": 2.73688155922039, "grad_norm": 0.1476368913508825, "learning_rate": 4.652018195778629e-06, "loss": 0.007, "step": 3651 }, { "epoch": 2.737631184407796, "grad_norm": 0.11712744485200172, "learning_rate": 4.625754716732256e-06, "loss": 0.0074, "step": 3652 }, { "epoch": 2.7383808095952022, "grad_norm": 0.08772348977368045, "learning_rate": 4.5995638289884735e-06, "loss": 0.0064, "step": 3653 }, { "epoch": 2.7391304347826084, "grad_norm": 0.10833839381144214, "learning_rate": 4.573445552481692e-06, "loss": 0.0088, "step": 3654 }, { "epoch": 2.739880059970015, "grad_norm": 0.11699172964342061, "learning_rate": 4.547399907091055e-06, "loss": 0.005, "step": 3655 }, { "epoch": 2.7406296851574212, "grad_norm": 0.048607452332993795, "learning_rate": 4.521426912640458e-06, "loss": 0.0012, "step": 3656 }, { "epoch": 2.7413793103448274, "grad_norm": 0.1476414516790435, "learning_rate": 4.495526588898458e-06, "loss": 0.0135, "step": 3657 }, { "epoch": 2.742128935532234, "grad_norm": 0.08161599938984887, "learning_rate": 4.469698955578349e-06, "loss": 0.0031, "step": 3658 }, { "epoch": 2.74287856071964, "grad_norm": 0.08333476285401598, "learning_rate": 4.443944032338066e-06, "loss": 0.0052, "step": 3659 }, { "epoch": 2.7436281859070464, "grad_norm": 0.029269837281302242, "learning_rate": 4.4182618387802134e-06, "loss": 0.001, "step": 3660 }, { "epoch": 2.744377811094453, "grad_norm": 0.1003170085626985, "learning_rate": 4.392652394452035e-06, "loss": 0.0112, "step": 3661 }, { "epoch": 2.745127436281859, "grad_norm": 0.11014654771097936, "learning_rate": 4.367115718845405e-06, "loss": 0.0106, "step": 3662 }, { "epoch": 2.7458770614692654, "grad_norm": 0.1790138493348603, "learning_rate": 4.341651831396809e-06, "loss": 0.0093, "step": 3663 }, { "epoch": 2.7466266866566715, "grad_norm": 0.16014756329328614, "learning_rate": 4.316260751487356e-06, "loss": 0.0114, "step": 3664 }, { "epoch": 2.7473763118440777, "grad_norm": 0.08790151629116845, "learning_rate": 4.290942498442696e-06, "loss": 0.0065, "step": 3665 }, { "epoch": 2.7481259370314843, "grad_norm": 0.10020052895833542, "learning_rate": 4.265697091533083e-06, "loss": 0.007, "step": 3666 }, { "epoch": 2.7488755622188905, "grad_norm": 0.07804375760777828, "learning_rate": 4.240524549973301e-06, "loss": 0.0042, "step": 3667 }, { "epoch": 2.7496251874062967, "grad_norm": 0.11710400068698333, "learning_rate": 4.215424892922726e-06, "loss": 0.0033, "step": 3668 }, { "epoch": 2.7503748125937033, "grad_norm": 0.09659294495448248, "learning_rate": 4.190398139485196e-06, "loss": 0.0061, "step": 3669 }, { "epoch": 2.7511244377811095, "grad_norm": 0.10322465262354132, "learning_rate": 4.165444308709099e-06, "loss": 0.0025, "step": 3670 }, { "epoch": 2.7518740629685157, "grad_norm": 0.1150037801522404, "learning_rate": 4.1405634195873066e-06, "loss": 0.0058, "step": 3671 }, { "epoch": 2.7526236881559223, "grad_norm": 0.11439789106912057, "learning_rate": 4.115755491057171e-06, "loss": 0.0052, "step": 3672 }, { "epoch": 2.7533733133433285, "grad_norm": 0.09507224179197715, "learning_rate": 4.091020542000534e-06, "loss": 0.0114, "step": 3673 }, { "epoch": 2.7541229385307346, "grad_norm": 0.1418154647433098, "learning_rate": 4.0663585912436465e-06, "loss": 0.0091, "step": 3674 }, { "epoch": 2.754872563718141, "grad_norm": 0.0519004217806522, "learning_rate": 4.041769657557259e-06, "loss": 0.0043, "step": 3675 }, { "epoch": 2.755622188905547, "grad_norm": 0.1642216620545012, "learning_rate": 4.017253759656481e-06, "loss": 0.0113, "step": 3676 }, { "epoch": 2.7563718140929536, "grad_norm": 0.14499900898011348, "learning_rate": 3.992810916200895e-06, "loss": 0.0061, "step": 3677 }, { "epoch": 2.75712143928036, "grad_norm": 0.0981948755530292, "learning_rate": 3.968441145794455e-06, "loss": 0.0085, "step": 3678 }, { "epoch": 2.757871064467766, "grad_norm": 0.06275409857769353, "learning_rate": 3.944144466985477e-06, "loss": 0.0028, "step": 3679 }, { "epoch": 2.7586206896551726, "grad_norm": 0.06965249096923765, "learning_rate": 3.919920898266671e-06, "loss": 0.0042, "step": 3680 }, { "epoch": 2.7593703148425788, "grad_norm": 0.07969732276284872, "learning_rate": 3.895770458075087e-06, "loss": 0.0044, "step": 3681 }, { "epoch": 2.760119940029985, "grad_norm": 0.03777464572367483, "learning_rate": 3.871693164792145e-06, "loss": 0.0019, "step": 3682 }, { "epoch": 2.7608695652173916, "grad_norm": 0.1295711523465296, "learning_rate": 3.84768903674354e-06, "loss": 0.0078, "step": 3683 }, { "epoch": 2.7616191904047978, "grad_norm": 0.06514779876174903, "learning_rate": 3.823758092199325e-06, "loss": 0.0054, "step": 3684 }, { "epoch": 2.762368815592204, "grad_norm": 0.05547738547510124, "learning_rate": 3.7999003493738415e-06, "loss": 0.0031, "step": 3685 }, { "epoch": 2.76311844077961, "grad_norm": 0.059133497938647625, "learning_rate": 3.776115826425686e-06, "loss": 0.0018, "step": 3686 }, { "epoch": 2.7638680659670163, "grad_norm": 0.034869095838589936, "learning_rate": 3.752404541457766e-06, "loss": 0.0021, "step": 3687 }, { "epoch": 2.764617691154423, "grad_norm": 0.16037864663324225, "learning_rate": 3.728766512517223e-06, "loss": 0.0039, "step": 3688 }, { "epoch": 2.765367316341829, "grad_norm": 0.05774969577903951, "learning_rate": 3.7052017575954557e-06, "loss": 0.0028, "step": 3689 }, { "epoch": 2.7661169415292353, "grad_norm": 0.07922232786939425, "learning_rate": 3.6817102946280602e-06, "loss": 0.0018, "step": 3690 }, { "epoch": 2.766866566716642, "grad_norm": 0.16312725330182026, "learning_rate": 3.6582921414949124e-06, "loss": 0.0063, "step": 3691 }, { "epoch": 2.767616191904048, "grad_norm": 0.19983176773659292, "learning_rate": 3.6349473160200207e-06, "loss": 0.0115, "step": 3692 }, { "epoch": 2.7683658170914542, "grad_norm": 0.03632042805227702, "learning_rate": 3.6116758359716173e-06, "loss": 0.0009, "step": 3693 }, { "epoch": 2.7691154422788604, "grad_norm": 0.8993147264773352, "learning_rate": 3.588477719062111e-06, "loss": 0.0065, "step": 3694 }, { "epoch": 2.7698650674662666, "grad_norm": 0.10458974932419897, "learning_rate": 3.565352982948067e-06, "loss": 0.0079, "step": 3695 }, { "epoch": 2.770614692653673, "grad_norm": 0.05507296047559786, "learning_rate": 3.542301645230206e-06, "loss": 0.0021, "step": 3696 }, { "epoch": 2.7713643178410794, "grad_norm": 0.08673372948135946, "learning_rate": 3.5193237234533495e-06, "loss": 0.0073, "step": 3697 }, { "epoch": 2.7721139430284856, "grad_norm": 0.1187522679105704, "learning_rate": 3.4964192351064963e-06, "loss": 0.0067, "step": 3698 }, { "epoch": 2.772863568215892, "grad_norm": 0.09235170094818261, "learning_rate": 3.4735881976227013e-06, "loss": 0.0068, "step": 3699 }, { "epoch": 2.7736131934032984, "grad_norm": 0.104110530083035, "learning_rate": 3.4508306283791423e-06, "loss": 0.003, "step": 3700 }, { "epoch": 2.7743628185907045, "grad_norm": 0.10901153145720517, "learning_rate": 3.428146544697086e-06, "loss": 0.0058, "step": 3701 }, { "epoch": 2.775112443778111, "grad_norm": 0.15846183030533673, "learning_rate": 3.405535963841844e-06, "loss": 0.0067, "step": 3702 }, { "epoch": 2.7758620689655173, "grad_norm": 0.26506094090563465, "learning_rate": 3.382998903022816e-06, "loss": 0.0102, "step": 3703 }, { "epoch": 2.7766116941529235, "grad_norm": 0.0913707250463421, "learning_rate": 3.3605353793933926e-06, "loss": 0.0016, "step": 3704 }, { "epoch": 2.7773613193403297, "grad_norm": 0.1774842417966673, "learning_rate": 3.3381454100510414e-06, "loss": 0.0142, "step": 3705 }, { "epoch": 2.778110944527736, "grad_norm": 0.1706681027415529, "learning_rate": 3.3158290120372304e-06, "loss": 0.0091, "step": 3706 }, { "epoch": 2.7788605697151425, "grad_norm": 0.11213132655813458, "learning_rate": 3.2935862023374175e-06, "loss": 0.0041, "step": 3707 }, { "epoch": 2.7796101949025487, "grad_norm": 0.12547887711062505, "learning_rate": 3.2714169978810827e-06, "loss": 0.0022, "step": 3708 }, { "epoch": 2.780359820089955, "grad_norm": 0.025223296745068364, "learning_rate": 3.2493214155416507e-06, "loss": 0.0007, "step": 3709 }, { "epoch": 2.7811094452773615, "grad_norm": 0.055322515411058336, "learning_rate": 3.2272994721365245e-06, "loss": 0.003, "step": 3710 }, { "epoch": 2.7818590704647677, "grad_norm": 0.17480386434490955, "learning_rate": 3.205351184427063e-06, "loss": 0.0072, "step": 3711 }, { "epoch": 2.782608695652174, "grad_norm": 0.12545903845757225, "learning_rate": 3.1834765691185597e-06, "loss": 0.0029, "step": 3712 }, { "epoch": 2.7833583208395805, "grad_norm": 0.07361961469995401, "learning_rate": 3.161675642860229e-06, "loss": 0.003, "step": 3713 }, { "epoch": 2.7841079460269866, "grad_norm": 0.1633375249165271, "learning_rate": 3.139948422245198e-06, "loss": 0.0068, "step": 3714 }, { "epoch": 2.784857571214393, "grad_norm": 0.1566962819235762, "learning_rate": 3.1182949238105273e-06, "loss": 0.0093, "step": 3715 }, { "epoch": 2.785607196401799, "grad_norm": 0.23511158067147805, "learning_rate": 3.096715164037123e-06, "loss": 0.0119, "step": 3716 }, { "epoch": 2.786356821589205, "grad_norm": 0.09863644320338857, "learning_rate": 3.07520915934979e-06, "loss": 0.0037, "step": 3717 }, { "epoch": 2.787106446776612, "grad_norm": 0.07351010386178865, "learning_rate": 3.053776926117191e-06, "loss": 0.0026, "step": 3718 }, { "epoch": 2.787856071964018, "grad_norm": 0.036135799952357044, "learning_rate": 3.032418480651833e-06, "loss": 0.0008, "step": 3719 }, { "epoch": 2.788605697151424, "grad_norm": 0.11281415219568655, "learning_rate": 3.0111338392100785e-06, "loss": 0.0078, "step": 3720 }, { "epoch": 2.7893553223388308, "grad_norm": 0.07179961099252788, "learning_rate": 2.9899230179921135e-06, "loss": 0.0054, "step": 3721 }, { "epoch": 2.790104947526237, "grad_norm": 0.1429941007376049, "learning_rate": 2.9687860331419016e-06, "loss": 0.0011, "step": 3722 }, { "epoch": 2.790854572713643, "grad_norm": 0.09215340792878511, "learning_rate": 2.947722900747274e-06, "loss": 0.0059, "step": 3723 }, { "epoch": 2.7916041979010497, "grad_norm": 0.12098246407629774, "learning_rate": 2.926733636839785e-06, "loss": 0.0134, "step": 3724 }, { "epoch": 2.792353823088456, "grad_norm": 0.017250456202789255, "learning_rate": 2.905818257394799e-06, "loss": 0.0005, "step": 3725 }, { "epoch": 2.793103448275862, "grad_norm": 0.14333256620093657, "learning_rate": 2.8849767783314607e-06, "loss": 0.006, "step": 3726 }, { "epoch": 2.7938530734632683, "grad_norm": 0.08138190706949305, "learning_rate": 2.8642092155126367e-06, "loss": 0.0056, "step": 3727 }, { "epoch": 2.7946026986506745, "grad_norm": 0.1542464999221198, "learning_rate": 2.843515584744949e-06, "loss": 0.0101, "step": 3728 }, { "epoch": 2.795352323838081, "grad_norm": 0.2655814663047673, "learning_rate": 2.822895901778744e-06, "loss": 0.0156, "step": 3729 }, { "epoch": 2.7961019490254873, "grad_norm": 0.07464939867101461, "learning_rate": 2.8023501823080887e-06, "loss": 0.0015, "step": 3730 }, { "epoch": 2.7968515742128934, "grad_norm": 0.03688259668135321, "learning_rate": 2.7818784419707646e-06, "loss": 0.0019, "step": 3731 }, { "epoch": 2.7976011994003, "grad_norm": 0.05642397865060001, "learning_rate": 2.761480696348218e-06, "loss": 0.004, "step": 3732 }, { "epoch": 2.7983508245877062, "grad_norm": 0.12829403525319455, "learning_rate": 2.7411569609655962e-06, "loss": 0.0048, "step": 3733 }, { "epoch": 2.7991004497751124, "grad_norm": 0.020893617177272582, "learning_rate": 2.7209072512917154e-06, "loss": 0.0009, "step": 3734 }, { "epoch": 2.7998500749625186, "grad_norm": 0.05167962072763174, "learning_rate": 2.7007315827390467e-06, "loss": 0.0016, "step": 3735 }, { "epoch": 2.8005997001499248, "grad_norm": 0.11663704090659255, "learning_rate": 2.680629970663717e-06, "loss": 0.0047, "step": 3736 }, { "epoch": 2.8013493253373314, "grad_norm": 0.17861922868467878, "learning_rate": 2.660602430365444e-06, "loss": 0.0096, "step": 3737 }, { "epoch": 2.8020989505247376, "grad_norm": 0.09685293415007153, "learning_rate": 2.6406489770876453e-06, "loss": 0.006, "step": 3738 }, { "epoch": 2.8028485757121437, "grad_norm": 0.08273466032580473, "learning_rate": 2.620769626017283e-06, "loss": 0.004, "step": 3739 }, { "epoch": 2.8035982008995504, "grad_norm": 0.09701803775699593, "learning_rate": 2.6009643922849438e-06, "loss": 0.0035, "step": 3740 }, { "epoch": 2.8043478260869565, "grad_norm": 0.08971236519901336, "learning_rate": 2.5812332909647907e-06, "loss": 0.0084, "step": 3741 }, { "epoch": 2.8050974512743627, "grad_norm": 0.05662903915501467, "learning_rate": 2.5615763370745895e-06, "loss": 0.0042, "step": 3742 }, { "epoch": 2.8058470764617693, "grad_norm": 0.05845551070544907, "learning_rate": 2.5419935455756383e-06, "loss": 0.0015, "step": 3743 }, { "epoch": 2.8065967016491755, "grad_norm": 0.04267221242974165, "learning_rate": 2.522484931372804e-06, "loss": 0.0026, "step": 3744 }, { "epoch": 2.8073463268365817, "grad_norm": 0.2170617038162767, "learning_rate": 2.503050509314486e-06, "loss": 0.0096, "step": 3745 }, { "epoch": 2.808095952023988, "grad_norm": 0.10786811045959256, "learning_rate": 2.483690294192653e-06, "loss": 0.0069, "step": 3746 }, { "epoch": 2.808845577211394, "grad_norm": 0.10031047771448215, "learning_rate": 2.46440430074274e-06, "loss": 0.0065, "step": 3747 }, { "epoch": 2.8095952023988007, "grad_norm": 0.521257586527663, "learning_rate": 2.445192543643715e-06, "loss": 0.0243, "step": 3748 }, { "epoch": 2.810344827586207, "grad_norm": 0.04209398645898599, "learning_rate": 2.426055037518027e-06, "loss": 0.0023, "step": 3749 }, { "epoch": 2.811094452773613, "grad_norm": 0.11360610892491614, "learning_rate": 2.406991796931668e-06, "loss": 0.003, "step": 3750 }, { "epoch": 2.8118440779610197, "grad_norm": 0.06493169758031436, "learning_rate": 2.388002836394043e-06, "loss": 0.0031, "step": 3751 }, { "epoch": 2.812593703148426, "grad_norm": 0.06315294635620365, "learning_rate": 2.3690881703580247e-06, "loss": 0.002, "step": 3752 }, { "epoch": 2.813343328335832, "grad_norm": 0.08553548216051235, "learning_rate": 2.3502478132199745e-06, "loss": 0.0049, "step": 3753 }, { "epoch": 2.8140929535232386, "grad_norm": 0.018008874414034905, "learning_rate": 2.3314817793196665e-06, "loss": 0.0005, "step": 3754 }, { "epoch": 2.814842578710645, "grad_norm": 0.08817547281035387, "learning_rate": 2.3127900829403306e-06, "loss": 0.0052, "step": 3755 }, { "epoch": 2.815592203898051, "grad_norm": 0.10272056274155184, "learning_rate": 2.2941727383085754e-06, "loss": 0.0109, "step": 3756 }, { "epoch": 2.816341829085457, "grad_norm": 0.05751309501790076, "learning_rate": 2.2756297595944777e-06, "loss": 0.0051, "step": 3757 }, { "epoch": 2.8170914542728633, "grad_norm": 0.07757600986659895, "learning_rate": 2.257161160911447e-06, "loss": 0.0048, "step": 3758 }, { "epoch": 2.81784107946027, "grad_norm": 0.03507914686943822, "learning_rate": 2.2387669563163406e-06, "loss": 0.002, "step": 3759 }, { "epoch": 2.818590704647676, "grad_norm": 0.06233321008617979, "learning_rate": 2.2204471598093358e-06, "loss": 0.0055, "step": 3760 }, { "epoch": 2.8193403298350823, "grad_norm": 0.08369995481880844, "learning_rate": 2.202201785334046e-06, "loss": 0.0049, "step": 3761 }, { "epoch": 2.820089955022489, "grad_norm": 0.3486093717381726, "learning_rate": 2.1840308467773853e-06, "loss": 0.0208, "step": 3762 }, { "epoch": 2.820839580209895, "grad_norm": 0.18296843989128325, "learning_rate": 2.1659343579696235e-06, "loss": 0.0078, "step": 3763 }, { "epoch": 2.8215892053973013, "grad_norm": 0.04568970976539647, "learning_rate": 2.1479123326843764e-06, "loss": 0.0011, "step": 3764 }, { "epoch": 2.822338830584708, "grad_norm": 0.04835958607273264, "learning_rate": 2.129964784638572e-06, "loss": 0.004, "step": 3765 }, { "epoch": 2.823088455772114, "grad_norm": 0.0728104278943814, "learning_rate": 2.112091727492471e-06, "loss": 0.004, "step": 3766 }, { "epoch": 2.8238380809595203, "grad_norm": 0.13355061657936787, "learning_rate": 2.094293174849604e-06, "loss": 0.005, "step": 3767 }, { "epoch": 2.8245877061469264, "grad_norm": 0.04349484855573775, "learning_rate": 2.0765691402568454e-06, "loss": 0.002, "step": 3768 }, { "epoch": 2.8253373313343326, "grad_norm": 0.1122534941982581, "learning_rate": 2.058919637204304e-06, "loss": 0.0134, "step": 3769 }, { "epoch": 2.8260869565217392, "grad_norm": 0.1021653774283419, "learning_rate": 2.041344679125379e-06, "loss": 0.011, "step": 3770 }, { "epoch": 2.8268365817091454, "grad_norm": 0.08401879274385336, "learning_rate": 2.023844279396736e-06, "loss": 0.0066, "step": 3771 }, { "epoch": 2.8275862068965516, "grad_norm": 0.08403663673675378, "learning_rate": 2.0064184513382987e-06, "loss": 0.0056, "step": 3772 }, { "epoch": 2.8283358320839582, "grad_norm": 0.08761283834204707, "learning_rate": 1.989067208213202e-06, "loss": 0.0043, "step": 3773 }, { "epoch": 2.8290854572713644, "grad_norm": 0.048936710908496085, "learning_rate": 1.971790563227871e-06, "loss": 0.0022, "step": 3774 }, { "epoch": 2.8298350824587706, "grad_norm": 0.1903573056337051, "learning_rate": 1.9545885295318754e-06, "loss": 0.0083, "step": 3775 }, { "epoch": 2.8305847076461768, "grad_norm": 0.10385777442522533, "learning_rate": 1.937461120218065e-06, "loss": 0.0042, "step": 3776 }, { "epoch": 2.831334332833583, "grad_norm": 0.08232033379572473, "learning_rate": 1.9204083483224553e-06, "loss": 0.0045, "step": 3777 }, { "epoch": 2.8320839580209896, "grad_norm": 0.0866681413818064, "learning_rate": 1.9034302268242654e-06, "loss": 0.0089, "step": 3778 }, { "epoch": 2.8328335832083957, "grad_norm": 0.15012606367667924, "learning_rate": 1.88652676864588e-06, "loss": 0.0053, "step": 3779 }, { "epoch": 2.833583208395802, "grad_norm": 0.09515478494613794, "learning_rate": 1.8696979866528851e-06, "loss": 0.0055, "step": 3780 }, { "epoch": 2.8343328335832085, "grad_norm": 0.05510011211789895, "learning_rate": 1.8529438936540021e-06, "loss": 0.0025, "step": 3781 }, { "epoch": 2.8350824587706147, "grad_norm": 0.05029481154099812, "learning_rate": 1.8362645024011305e-06, "loss": 0.0045, "step": 3782 }, { "epoch": 2.835832083958021, "grad_norm": 0.06596027338712288, "learning_rate": 1.8196598255892816e-06, "loss": 0.0036, "step": 3783 }, { "epoch": 2.8365817091454275, "grad_norm": 0.07545314831200027, "learning_rate": 1.8031298758566129e-06, "loss": 0.0037, "step": 3784 }, { "epoch": 2.8373313343328337, "grad_norm": 0.21628185122520407, "learning_rate": 1.786674665784438e-06, "loss": 0.0166, "step": 3785 }, { "epoch": 2.83808095952024, "grad_norm": 0.11500979436017138, "learning_rate": 1.7702942078971384e-06, "loss": 0.0075, "step": 3786 }, { "epoch": 2.838830584707646, "grad_norm": 0.20084826385099694, "learning_rate": 1.7539885146622192e-06, "loss": 0.0054, "step": 3787 }, { "epoch": 2.839580209895052, "grad_norm": 0.08656276104804662, "learning_rate": 1.737757598490275e-06, "loss": 0.0036, "step": 3788 }, { "epoch": 2.840329835082459, "grad_norm": 0.20638457703316246, "learning_rate": 1.7216014717350015e-06, "loss": 0.0046, "step": 3789 }, { "epoch": 2.841079460269865, "grad_norm": 0.1817487754895775, "learning_rate": 1.7055201466931515e-06, "loss": 0.0033, "step": 3790 }, { "epoch": 2.841829085457271, "grad_norm": 0.06294821656981923, "learning_rate": 1.6895136356045449e-06, "loss": 0.0021, "step": 3791 }, { "epoch": 2.842578710644678, "grad_norm": 0.0595355858491971, "learning_rate": 1.673581950652081e-06, "loss": 0.0065, "step": 3792 }, { "epoch": 2.843328335832084, "grad_norm": 0.07798007068273854, "learning_rate": 1.6577251039616936e-06, "loss": 0.0011, "step": 3793 }, { "epoch": 2.84407796101949, "grad_norm": 0.057368905011597304, "learning_rate": 1.6419431076023506e-06, "loss": 0.0024, "step": 3794 }, { "epoch": 2.844827586206897, "grad_norm": 0.09375199404270616, "learning_rate": 1.6262359735860544e-06, "loss": 0.007, "step": 3795 }, { "epoch": 2.845577211394303, "grad_norm": 0.08821695567906807, "learning_rate": 1.6106037138678199e-06, "loss": 0.003, "step": 3796 }, { "epoch": 2.846326836581709, "grad_norm": 0.11360534842385173, "learning_rate": 1.595046340345685e-06, "loss": 0.0055, "step": 3797 }, { "epoch": 2.8470764617691153, "grad_norm": 0.12330877431872257, "learning_rate": 1.5795638648607003e-06, "loss": 0.0032, "step": 3798 }, { "epoch": 2.8478260869565215, "grad_norm": 0.06392015519757543, "learning_rate": 1.5641562991968949e-06, "loss": 0.0027, "step": 3799 }, { "epoch": 2.848575712143928, "grad_norm": 0.3197110206802998, "learning_rate": 1.5488236550812663e-06, "loss": 0.021, "step": 3800 }, { "epoch": 2.8493253373313343, "grad_norm": 0.06995177459050711, "learning_rate": 1.5335659441838347e-06, "loss": 0.0048, "step": 3801 }, { "epoch": 2.8500749625187405, "grad_norm": 0.0483401155538583, "learning_rate": 1.5183831781175217e-06, "loss": 0.0033, "step": 3802 }, { "epoch": 2.850824587706147, "grad_norm": 0.19698717564288212, "learning_rate": 1.5032753684382727e-06, "loss": 0.0051, "step": 3803 }, { "epoch": 2.8515742128935533, "grad_norm": 0.13355720806571342, "learning_rate": 1.4882425266449452e-06, "loss": 0.0048, "step": 3804 }, { "epoch": 2.8523238380809595, "grad_norm": 0.06441256905321165, "learning_rate": 1.4732846641793419e-06, "loss": 0.0032, "step": 3805 }, { "epoch": 2.853073463268366, "grad_norm": 0.0877528657305153, "learning_rate": 1.4584017924262005e-06, "loss": 0.0052, "step": 3806 }, { "epoch": 2.8538230884557723, "grad_norm": 0.05421868798784275, "learning_rate": 1.4435939227131713e-06, "loss": 0.0038, "step": 3807 }, { "epoch": 2.8545727136431784, "grad_norm": 0.0690574032640628, "learning_rate": 1.4288610663108382e-06, "loss": 0.0021, "step": 3808 }, { "epoch": 2.8553223388305846, "grad_norm": 0.12791567353295813, "learning_rate": 1.4142032344326983e-06, "loss": 0.0083, "step": 3809 }, { "epoch": 2.856071964017991, "grad_norm": 0.053996680372903, "learning_rate": 1.3996204382351051e-06, "loss": 0.0024, "step": 3810 }, { "epoch": 2.8568215892053974, "grad_norm": 0.12263371388253382, "learning_rate": 1.385112688817325e-06, "loss": 0.005, "step": 3811 }, { "epoch": 2.8575712143928036, "grad_norm": 0.10742409395354456, "learning_rate": 1.3706799972215246e-06, "loss": 0.0041, "step": 3812 }, { "epoch": 2.8583208395802098, "grad_norm": 0.11053414179643328, "learning_rate": 1.3563223744327058e-06, "loss": 0.0091, "step": 3813 }, { "epoch": 2.8590704647676164, "grad_norm": 0.23767852215258636, "learning_rate": 1.3420398313787607e-06, "loss": 0.0074, "step": 3814 }, { "epoch": 2.8598200899550226, "grad_norm": 0.23954200853942614, "learning_rate": 1.327832378930427e-06, "loss": 0.0084, "step": 3815 }, { "epoch": 2.8605697151424287, "grad_norm": 0.11676655596471934, "learning_rate": 1.3137000279012767e-06, "loss": 0.0076, "step": 3816 }, { "epoch": 2.861319340329835, "grad_norm": 0.08462846856681903, "learning_rate": 1.2996427890477503e-06, "loss": 0.0038, "step": 3817 }, { "epoch": 2.862068965517241, "grad_norm": 0.11947819513361728, "learning_rate": 1.285660673069089e-06, "loss": 0.0063, "step": 3818 }, { "epoch": 2.8628185907046477, "grad_norm": 0.06530871712869708, "learning_rate": 1.271753690607369e-06, "loss": 0.0027, "step": 3819 }, { "epoch": 2.863568215892054, "grad_norm": 0.2695348273709761, "learning_rate": 1.25792185224749e-06, "loss": 0.0082, "step": 3820 }, { "epoch": 2.86431784107946, "grad_norm": 0.03476298207593885, "learning_rate": 1.2441651685171417e-06, "loss": 0.0014, "step": 3821 }, { "epoch": 2.8650674662668667, "grad_norm": 0.3226581869020099, "learning_rate": 1.2304836498868265e-06, "loss": 0.0689, "step": 3822 }, { "epoch": 2.865817091454273, "grad_norm": 0.07761082439778805, "learning_rate": 1.2168773067698259e-06, "loss": 0.0045, "step": 3823 }, { "epoch": 2.866566716641679, "grad_norm": 0.3412718354202291, "learning_rate": 1.2033461495222108e-06, "loss": 0.0112, "step": 3824 }, { "epoch": 2.8673163418290857, "grad_norm": 0.16766941008019637, "learning_rate": 1.1898901884428104e-06, "loss": 0.0054, "step": 3825 }, { "epoch": 2.868065967016492, "grad_norm": 0.12532044546710538, "learning_rate": 1.1765094337732542e-06, "loss": 0.0123, "step": 3826 }, { "epoch": 2.868815592203898, "grad_norm": 0.10200359226472538, "learning_rate": 1.1632038956979064e-06, "loss": 0.0049, "step": 3827 }, { "epoch": 2.869565217391304, "grad_norm": 0.09724262866667624, "learning_rate": 1.1499735843438665e-06, "loss": 0.0024, "step": 3828 }, { "epoch": 2.8703148425787104, "grad_norm": 0.10813501029772407, "learning_rate": 1.1368185097810236e-06, "loss": 0.0059, "step": 3829 }, { "epoch": 2.871064467766117, "grad_norm": 0.06405598123253393, "learning_rate": 1.1237386820219465e-06, "loss": 0.0052, "step": 3830 }, { "epoch": 2.871814092953523, "grad_norm": 0.11886189362782622, "learning_rate": 1.1107341110219938e-06, "loss": 0.0061, "step": 3831 }, { "epoch": 2.8725637181409294, "grad_norm": 0.04310115337204529, "learning_rate": 1.0978048066791925e-06, "loss": 0.0017, "step": 3832 }, { "epoch": 2.873313343328336, "grad_norm": 0.21923193277890474, "learning_rate": 1.084950778834304e-06, "loss": 0.0186, "step": 3833 }, { "epoch": 2.874062968515742, "grad_norm": 0.15533190052223408, "learning_rate": 1.0721720372707911e-06, "loss": 0.0053, "step": 3834 }, { "epoch": 2.8748125937031483, "grad_norm": 0.16032577319751884, "learning_rate": 1.0594685917148294e-06, "loss": 0.0094, "step": 3835 }, { "epoch": 2.875562218890555, "grad_norm": 0.10724245151178886, "learning_rate": 1.0468404518352738e-06, "loss": 0.0084, "step": 3836 }, { "epoch": 2.876311844077961, "grad_norm": 0.06086983438719523, "learning_rate": 1.0342876272436464e-06, "loss": 0.0038, "step": 3837 }, { "epoch": 2.8770614692653673, "grad_norm": 0.09034829380911473, "learning_rate": 1.0218101274941604e-06, "loss": 0.0041, "step": 3838 }, { "epoch": 2.8778110944527735, "grad_norm": 0.06363614232706609, "learning_rate": 1.0094079620837194e-06, "loss": 0.0013, "step": 3839 }, { "epoch": 2.8785607196401797, "grad_norm": 0.04423131224523309, "learning_rate": 9.970811404518389e-07, "loss": 0.0029, "step": 3840 }, { "epoch": 2.8793103448275863, "grad_norm": 0.08304277803334834, "learning_rate": 9.848296719807359e-07, "loss": 0.0045, "step": 3841 }, { "epoch": 2.8800599700149925, "grad_norm": 0.028576192852339508, "learning_rate": 9.726535659952519e-07, "loss": 0.0007, "step": 3842 }, { "epoch": 2.8808095952023987, "grad_norm": 0.15636058161463368, "learning_rate": 9.60552831762873e-07, "loss": 0.0095, "step": 3843 }, { "epoch": 2.8815592203898053, "grad_norm": 0.03345494562713568, "learning_rate": 9.485274784937104e-07, "loss": 0.0019, "step": 3844 }, { "epoch": 2.8823088455772115, "grad_norm": 0.07728964927781995, "learning_rate": 9.365775153405088e-07, "loss": 0.007, "step": 3845 }, { "epoch": 2.8830584707646176, "grad_norm": 0.05631770312584182, "learning_rate": 9.247029513986482e-07, "loss": 0.0025, "step": 3846 }, { "epoch": 2.8838080959520243, "grad_norm": 0.06305904315543834, "learning_rate": 9.129037957060771e-07, "loss": 0.0009, "step": 3847 }, { "epoch": 2.8845577211394304, "grad_norm": 0.02943176703260356, "learning_rate": 9.011800572434003e-07, "loss": 0.0035, "step": 3848 }, { "epoch": 2.8853073463268366, "grad_norm": 0.12886854986809387, "learning_rate": 8.895317449337803e-07, "loss": 0.012, "step": 3849 }, { "epoch": 2.886056971514243, "grad_norm": 0.0639795246401034, "learning_rate": 8.779588676429917e-07, "loss": 0.0041, "step": 3850 }, { "epoch": 2.886806596701649, "grad_norm": 0.1330508162503493, "learning_rate": 8.664614341793886e-07, "loss": 0.0047, "step": 3851 }, { "epoch": 2.8875562218890556, "grad_norm": 0.0864366190069689, "learning_rate": 8.550394532939154e-07, "loss": 0.0018, "step": 3852 }, { "epoch": 2.8883058470764618, "grad_norm": 0.0501558109337864, "learning_rate": 8.436929336800515e-07, "loss": 0.0041, "step": 3853 }, { "epoch": 2.889055472263868, "grad_norm": 0.03194449960152962, "learning_rate": 8.324218839738996e-07, "loss": 0.0015, "step": 3854 }, { "epoch": 2.8898050974512746, "grad_norm": 0.04313150198455092, "learning_rate": 8.212263127540642e-07, "loss": 0.002, "step": 3855 }, { "epoch": 2.8905547226386807, "grad_norm": 0.15162586669753475, "learning_rate": 8.101062285417405e-07, "loss": 0.0135, "step": 3856 }, { "epoch": 2.891304347826087, "grad_norm": 0.059281772264859076, "learning_rate": 7.990616398006356e-07, "loss": 0.0034, "step": 3857 }, { "epoch": 2.892053973013493, "grad_norm": 0.07432819629295129, "learning_rate": 7.880925549370366e-07, "loss": 0.0041, "step": 3858 }, { "epoch": 2.8928035982008993, "grad_norm": 0.036534280314377575, "learning_rate": 7.771989822997206e-07, "loss": 0.0006, "step": 3859 }, { "epoch": 2.893553223388306, "grad_norm": 0.15373465674637074, "learning_rate": 7.663809301800106e-07, "loss": 0.0044, "step": 3860 }, { "epoch": 2.894302848575712, "grad_norm": 0.037159260867517294, "learning_rate": 7.55638406811765e-07, "loss": 0.0018, "step": 3861 }, { "epoch": 2.8950524737631183, "grad_norm": 0.05407922317555088, "learning_rate": 7.449714203713321e-07, "loss": 0.0034, "step": 3862 }, { "epoch": 2.895802098950525, "grad_norm": 0.13115050264454448, "learning_rate": 7.343799789775618e-07, "loss": 0.0107, "step": 3863 }, { "epoch": 2.896551724137931, "grad_norm": 0.07134410667091412, "learning_rate": 7.238640906918282e-07, "loss": 0.0073, "step": 3864 }, { "epoch": 2.8973013493253372, "grad_norm": 0.07090129947559054, "learning_rate": 7.134237635180063e-07, "loss": 0.0031, "step": 3865 }, { "epoch": 2.898050974512744, "grad_norm": 0.05134175014094741, "learning_rate": 7.030590054024178e-07, "loss": 0.0044, "step": 3866 }, { "epoch": 2.89880059970015, "grad_norm": 0.1938597397603204, "learning_rate": 6.927698242339076e-07, "loss": 0.0147, "step": 3867 }, { "epoch": 2.899550224887556, "grad_norm": 0.041853566186652764, "learning_rate": 6.825562278437781e-07, "loss": 0.0025, "step": 3868 }, { "epoch": 2.9002998500749624, "grad_norm": 0.064640348356761, "learning_rate": 6.724182240058108e-07, "loss": 0.0046, "step": 3869 }, { "epoch": 2.9010494752623686, "grad_norm": 0.21030206970660664, "learning_rate": 6.623558204362446e-07, "loss": 0.0092, "step": 3870 }, { "epoch": 2.901799100449775, "grad_norm": 0.048488401349587804, "learning_rate": 6.523690247937864e-07, "loss": 0.0023, "step": 3871 }, { "epoch": 2.9025487256371814, "grad_norm": 0.11700840312090334, "learning_rate": 6.424578446796003e-07, "loss": 0.0049, "step": 3872 }, { "epoch": 2.9032983508245875, "grad_norm": 0.09652391917869022, "learning_rate": 6.326222876372745e-07, "loss": 0.0044, "step": 3873 }, { "epoch": 2.904047976011994, "grad_norm": 0.06097657800986373, "learning_rate": 6.22862361152865e-07, "loss": 0.0067, "step": 3874 }, { "epoch": 2.9047976011994003, "grad_norm": 0.13290948988113532, "learning_rate": 6.13178072654852e-07, "loss": 0.0062, "step": 3875 }, { "epoch": 2.9055472263868065, "grad_norm": 0.03547007821767269, "learning_rate": 6.035694295141391e-07, "loss": 0.0014, "step": 3876 }, { "epoch": 2.906296851574213, "grad_norm": 0.1449129728244748, "learning_rate": 5.940364390440656e-07, "loss": 0.0081, "step": 3877 }, { "epoch": 2.9070464767616193, "grad_norm": 0.05884229221265448, "learning_rate": 5.845791085003937e-07, "loss": 0.0011, "step": 3878 }, { "epoch": 2.9077961019490255, "grad_norm": 0.048490012177309055, "learning_rate": 5.75197445081288e-07, "loss": 0.0026, "step": 3879 }, { "epoch": 2.9085457271364317, "grad_norm": 0.12293404610823899, "learning_rate": 5.658914559273143e-07, "loss": 0.0044, "step": 3880 }, { "epoch": 2.909295352323838, "grad_norm": 0.11175197743283243, "learning_rate": 5.566611481214734e-07, "loss": 0.0036, "step": 3881 }, { "epoch": 2.9100449775112445, "grad_norm": 0.027393754256709764, "learning_rate": 5.475065286891346e-07, "loss": 0.0004, "step": 3882 }, { "epoch": 2.9107946026986506, "grad_norm": 0.08192982198985065, "learning_rate": 5.384276045980574e-07, "loss": 0.0048, "step": 3883 }, { "epoch": 2.911544227886057, "grad_norm": 0.05348463886759356, "learning_rate": 5.294243827584145e-07, "loss": 0.003, "step": 3884 }, { "epoch": 2.9122938530734634, "grad_norm": 0.11884429591828721, "learning_rate": 5.204968700227242e-07, "loss": 0.0169, "step": 3885 }, { "epoch": 2.9130434782608696, "grad_norm": 0.1553683804200957, "learning_rate": 5.116450731859179e-07, "loss": 0.003, "step": 3886 }, { "epoch": 2.913793103448276, "grad_norm": 0.26089796779801405, "learning_rate": 5.02868998985273e-07, "loss": 0.019, "step": 3887 }, { "epoch": 2.9145427286356824, "grad_norm": 0.07710209310944306, "learning_rate": 4.941686541004464e-07, "loss": 0.0052, "step": 3888 }, { "epoch": 2.9152923538230886, "grad_norm": 0.09064721229356534, "learning_rate": 4.85544045153441e-07, "loss": 0.0055, "step": 3889 }, { "epoch": 2.9160419790104948, "grad_norm": 0.04485489321328558, "learning_rate": 4.76995178708628e-07, "loss": 0.0011, "step": 3890 }, { "epoch": 2.916791604197901, "grad_norm": 0.10881058038757435, "learning_rate": 4.685220612727248e-07, "loss": 0.0092, "step": 3891 }, { "epoch": 2.917541229385307, "grad_norm": 0.11097857641116497, "learning_rate": 4.6012469929479496e-07, "loss": 0.0015, "step": 3892 }, { "epoch": 2.9182908545727138, "grad_norm": 0.08700092245246938, "learning_rate": 4.5180309916623697e-07, "loss": 0.0028, "step": 3893 }, { "epoch": 2.91904047976012, "grad_norm": 0.08473276032542719, "learning_rate": 4.435572672208066e-07, "loss": 0.0078, "step": 3894 }, { "epoch": 2.919790104947526, "grad_norm": 0.10730176722413753, "learning_rate": 4.3538720973457235e-07, "loss": 0.0024, "step": 3895 }, { "epoch": 2.9205397301349327, "grad_norm": 0.03297186138157908, "learning_rate": 4.2729293292592677e-07, "loss": 0.0006, "step": 3896 }, { "epoch": 2.921289355322339, "grad_norm": 0.06178056449219258, "learning_rate": 4.1927444295559725e-07, "loss": 0.0037, "step": 3897 }, { "epoch": 2.922038980509745, "grad_norm": 0.08020392353543454, "learning_rate": 4.113317459266242e-07, "loss": 0.009, "step": 3898 }, { "epoch": 2.9227886056971513, "grad_norm": 0.20249707480901977, "learning_rate": 4.0346484788434945e-07, "loss": 0.0109, "step": 3899 }, { "epoch": 2.923538230884558, "grad_norm": 0.03934935977733298, "learning_rate": 3.95673754816428e-07, "loss": 0.0025, "step": 3900 }, { "epoch": 2.924287856071964, "grad_norm": 0.08764117867567876, "learning_rate": 3.8795847265282735e-07, "loss": 0.0058, "step": 3901 }, { "epoch": 2.9250374812593702, "grad_norm": 0.22565307935971812, "learning_rate": 3.8031900726581694e-07, "loss": 0.0107, "step": 3902 }, { "epoch": 2.9257871064467764, "grad_norm": 0.08387411944908658, "learning_rate": 3.727553644699455e-07, "loss": 0.0032, "step": 3903 }, { "epoch": 2.926536731634183, "grad_norm": 0.08318669900705351, "learning_rate": 3.652675500220415e-07, "loss": 0.0045, "step": 3904 }, { "epoch": 2.927286356821589, "grad_norm": 0.06422323549076467, "learning_rate": 3.578555696212571e-07, "loss": 0.0033, "step": 3905 }, { "epoch": 2.9280359820089954, "grad_norm": 0.04610187902934583, "learning_rate": 3.505194289089908e-07, "loss": 0.0036, "step": 3906 }, { "epoch": 2.928785607196402, "grad_norm": 0.07675639671383808, "learning_rate": 3.432591334689317e-07, "loss": 0.0073, "step": 3907 }, { "epoch": 2.929535232383808, "grad_norm": 0.05767654466937584, "learning_rate": 3.360746888270372e-07, "loss": 0.0012, "step": 3908 }, { "epoch": 2.9302848575712144, "grad_norm": 0.17132906497501388, "learning_rate": 3.289661004515554e-07, "loss": 0.0085, "step": 3909 }, { "epoch": 2.9310344827586206, "grad_norm": 0.027438192464950117, "learning_rate": 3.219333737529473e-07, "loss": 0.0008, "step": 3910 }, { "epoch": 2.9317841079460267, "grad_norm": 0.054228259408073395, "learning_rate": 3.1497651408399776e-07, "loss": 0.0016, "step": 3911 }, { "epoch": 2.9325337331334334, "grad_norm": 0.08815393033743125, "learning_rate": 3.080955267396934e-07, "loss": 0.0143, "step": 3912 }, { "epoch": 2.9332833583208395, "grad_norm": 0.20265750857265044, "learning_rate": 3.0129041695730054e-07, "loss": 0.0212, "step": 3913 }, { "epoch": 2.9340329835082457, "grad_norm": 0.12156823056329677, "learning_rate": 2.945611899163425e-07, "loss": 0.003, "step": 3914 }, { "epoch": 2.9347826086956523, "grad_norm": 0.018979792274225624, "learning_rate": 2.8790785073855574e-07, "loss": 0.0008, "step": 3915 }, { "epoch": 2.9355322338830585, "grad_norm": 0.1942467859990612, "learning_rate": 2.813304044879339e-07, "loss": 0.0114, "step": 3916 }, { "epoch": 2.9362818590704647, "grad_norm": 0.12520849384259206, "learning_rate": 2.7482885617071686e-07, "loss": 0.0049, "step": 3917 }, { "epoch": 2.9370314842578713, "grad_norm": 0.2781515226567302, "learning_rate": 2.6840321073536843e-07, "loss": 0.0124, "step": 3918 }, { "epoch": 2.9377811094452775, "grad_norm": 0.06083224080379495, "learning_rate": 2.6205347307256546e-07, "loss": 0.0027, "step": 3919 }, { "epoch": 2.9385307346326837, "grad_norm": 0.16590619218430297, "learning_rate": 2.557796480152308e-07, "loss": 0.0064, "step": 3920 }, { "epoch": 2.93928035982009, "grad_norm": 0.09392875545963016, "learning_rate": 2.495817403385114e-07, "loss": 0.0079, "step": 3921 }, { "epoch": 2.940029985007496, "grad_norm": 0.1475733909454278, "learning_rate": 2.4345975475975615e-07, "loss": 0.0091, "step": 3922 }, { "epoch": 2.9407796101949026, "grad_norm": 0.25972195245536994, "learning_rate": 2.3741369593852647e-07, "loss": 0.0126, "step": 3923 }, { "epoch": 2.941529235382309, "grad_norm": 0.06200435894219216, "learning_rate": 2.314435684766081e-07, "loss": 0.0082, "step": 3924 }, { "epoch": 2.942278860569715, "grad_norm": 0.1400935741912931, "learning_rate": 2.2554937691798839e-07, "loss": 0.0102, "step": 3925 }, { "epoch": 2.9430284857571216, "grad_norm": 0.07050883912900091, "learning_rate": 2.1973112574885656e-07, "loss": 0.0032, "step": 3926 }, { "epoch": 2.943778110944528, "grad_norm": 0.09879521943653216, "learning_rate": 2.1398881939761474e-07, "loss": 0.0047, "step": 3927 }, { "epoch": 2.944527736131934, "grad_norm": 0.09017841555169903, "learning_rate": 2.0832246223483344e-07, "loss": 0.0045, "step": 3928 }, { "epoch": 2.9452773613193406, "grad_norm": 0.0503549413885895, "learning_rate": 2.0273205857330723e-07, "loss": 0.003, "step": 3929 }, { "epoch": 2.9460269865067468, "grad_norm": 0.11413117402719496, "learning_rate": 1.972176126679881e-07, "loss": 0.0062, "step": 3930 }, { "epoch": 2.946776611694153, "grad_norm": 0.04264351749436684, "learning_rate": 1.9177912871606307e-07, "loss": 0.002, "step": 3931 }, { "epoch": 2.947526236881559, "grad_norm": 0.10185445906343903, "learning_rate": 1.8641661085685436e-07, "loss": 0.0061, "step": 3932 }, { "epoch": 2.9482758620689653, "grad_norm": 0.22741461874925956, "learning_rate": 1.8113006317188596e-07, "loss": 0.0182, "step": 3933 }, { "epoch": 2.949025487256372, "grad_norm": 0.02685321960312984, "learning_rate": 1.759194896848615e-07, "loss": 0.0015, "step": 3934 }, { "epoch": 2.949775112443778, "grad_norm": 0.04166166861207708, "learning_rate": 1.7078489436165301e-07, "loss": 0.001, "step": 3935 }, { "epoch": 2.9505247376311843, "grad_norm": 0.12658296999295252, "learning_rate": 1.657262811103122e-07, "loss": 0.0043, "step": 3936 }, { "epoch": 2.951274362818591, "grad_norm": 0.07800235121237264, "learning_rate": 1.6074365378105915e-07, "loss": 0.0028, "step": 3937 }, { "epoch": 2.952023988005997, "grad_norm": 0.1575521316690672, "learning_rate": 1.5583701616626034e-07, "loss": 0.0053, "step": 3938 }, { "epoch": 2.9527736131934033, "grad_norm": 0.049391341850405314, "learning_rate": 1.5100637200046176e-07, "loss": 0.0015, "step": 3939 }, { "epoch": 2.9535232383808094, "grad_norm": 0.08463145717344953, "learning_rate": 1.4625172496036676e-07, "loss": 0.0101, "step": 3940 }, { "epoch": 2.954272863568216, "grad_norm": 0.0913568967277725, "learning_rate": 1.4157307866484726e-07, "loss": 0.005, "step": 3941 }, { "epoch": 2.9550224887556222, "grad_norm": 0.16458575578486678, "learning_rate": 1.3697043667489918e-07, "loss": 0.0041, "step": 3942 }, { "epoch": 2.9557721139430284, "grad_norm": 0.03985907742532866, "learning_rate": 1.3244380249369804e-07, "loss": 0.001, "step": 3943 }, { "epoch": 2.9565217391304346, "grad_norm": 0.11386553888333875, "learning_rate": 1.2799317956655454e-07, "loss": 0.0065, "step": 3944 }, { "epoch": 2.957271364317841, "grad_norm": 0.09145340692489545, "learning_rate": 1.236185712809368e-07, "loss": 0.0078, "step": 3945 }, { "epoch": 2.9580209895052474, "grad_norm": 0.1222934590256073, "learning_rate": 1.1931998096644804e-07, "loss": 0.0055, "step": 3946 }, { "epoch": 2.9587706146926536, "grad_norm": 0.1625788169609916, "learning_rate": 1.150974118948156e-07, "loss": 0.0077, "step": 3947 }, { "epoch": 2.95952023988006, "grad_norm": 0.10754275067318649, "learning_rate": 1.1095086727994641e-07, "loss": 0.0054, "step": 3948 }, { "epoch": 2.9602698650674664, "grad_norm": 0.07812351741966694, "learning_rate": 1.0688035027786036e-07, "loss": 0.0046, "step": 3949 }, { "epoch": 2.9610194902548725, "grad_norm": 0.06017618913652728, "learning_rate": 1.0288586398670142e-07, "loss": 0.0019, "step": 3950 }, { "epoch": 2.9617691154422787, "grad_norm": 0.06594705186407322, "learning_rate": 9.89674114467487e-08, "loss": 0.0062, "step": 3951 }, { "epoch": 2.962518740629685, "grad_norm": 0.05508768796277397, "learning_rate": 9.512499564042764e-08, "loss": 0.0027, "step": 3952 }, { "epoch": 2.9632683658170915, "grad_norm": 0.07035356274760843, "learning_rate": 9.135861949228774e-08, "loss": 0.0029, "step": 3953 }, { "epoch": 2.9640179910044977, "grad_norm": 0.09665084820902284, "learning_rate": 8.766828586898035e-08, "loss": 0.0035, "step": 3954 }, { "epoch": 2.964767616191904, "grad_norm": 0.15054682362176386, "learning_rate": 8.40539975793031e-08, "loss": 0.006, "step": 3955 }, { "epoch": 2.9655172413793105, "grad_norm": 0.05185536960868925, "learning_rate": 8.051575737416661e-08, "loss": 0.0669, "step": 3956 }, { "epoch": 2.9662668665667167, "grad_norm": 0.04477626103777263, "learning_rate": 7.705356794659446e-08, "loss": 0.0017, "step": 3957 }, { "epoch": 2.967016491754123, "grad_norm": 0.21173344383560222, "learning_rate": 7.36674319317232e-08, "loss": 0.0099, "step": 3958 }, { "epoch": 2.9677661169415295, "grad_norm": 0.08864354866618206, "learning_rate": 7.035735190682458e-08, "loss": 0.0067, "step": 3959 }, { "epoch": 2.9685157421289357, "grad_norm": 0.08168212547347438, "learning_rate": 6.712333039126107e-08, "loss": 0.0033, "step": 3960 }, { "epoch": 2.969265367316342, "grad_norm": 0.1056606139859158, "learning_rate": 6.396536984650814e-08, "loss": 0.0167, "step": 3961 }, { "epoch": 2.970014992503748, "grad_norm": 0.23985455411275883, "learning_rate": 6.088347267616534e-08, "loss": 0.0083, "step": 3962 }, { "epoch": 2.970764617691154, "grad_norm": 0.0962979369431135, "learning_rate": 5.787764122592298e-08, "loss": 0.0018, "step": 3963 }, { "epoch": 2.971514242878561, "grad_norm": 0.07750924360925224, "learning_rate": 5.4947877783584344e-08, "loss": 0.0057, "step": 3964 }, { "epoch": 2.972263868065967, "grad_norm": 0.10843891872380376, "learning_rate": 5.209418457904347e-08, "loss": 0.0051, "step": 3965 }, { "epoch": 2.973013493253373, "grad_norm": 0.0920430540669819, "learning_rate": 4.931656378431848e-08, "loss": 0.0067, "step": 3966 }, { "epoch": 2.97376311844078, "grad_norm": 0.09960702743932079, "learning_rate": 4.661501751349606e-08, "loss": 0.0057, "step": 3967 }, { "epoch": 2.974512743628186, "grad_norm": 0.03518596057286467, "learning_rate": 4.3989547822798074e-08, "loss": 0.002, "step": 3968 }, { "epoch": 2.975262368815592, "grad_norm": 0.1287565629497624, "learning_rate": 4.144015671051493e-08, "loss": 0.0029, "step": 3969 }, { "epoch": 2.9760119940029988, "grad_norm": 0.07846377737079575, "learning_rate": 3.896684611705004e-08, "loss": 0.006, "step": 3970 }, { "epoch": 2.976761619190405, "grad_norm": 0.14591446042608083, "learning_rate": 3.656961792486424e-08, "loss": 0.003, "step": 3971 }, { "epoch": 2.977511244377811, "grad_norm": 0.17905020957307904, "learning_rate": 3.424847395856467e-08, "loss": 0.005, "step": 3972 }, { "epoch": 2.9782608695652173, "grad_norm": 0.01906684909524204, "learning_rate": 3.2003415984815934e-08, "loss": 0.0009, "step": 3973 }, { "epoch": 2.9790104947526235, "grad_norm": 0.11418258519719206, "learning_rate": 2.9834445712362266e-08, "loss": 0.0036, "step": 3974 }, { "epoch": 2.97976011994003, "grad_norm": 0.11876471382011124, "learning_rate": 2.7741564792072018e-08, "loss": 0.0068, "step": 3975 }, { "epoch": 2.9805097451274363, "grad_norm": 0.08256987244801461, "learning_rate": 2.5724774816870965e-08, "loss": 0.0052, "step": 3976 }, { "epoch": 2.9812593703148424, "grad_norm": 0.08847358453985199, "learning_rate": 2.3784077321775678e-08, "loss": 0.0094, "step": 3977 }, { "epoch": 2.982008995502249, "grad_norm": 0.0710505712846508, "learning_rate": 2.1919473783904575e-08, "loss": 0.0073, "step": 3978 }, { "epoch": 2.9827586206896552, "grad_norm": 0.05908748373326097, "learning_rate": 2.013096562242245e-08, "loss": 0.0013, "step": 3979 }, { "epoch": 2.9835082458770614, "grad_norm": 0.03487128992436554, "learning_rate": 1.8418554198629257e-08, "loss": 0.0011, "step": 3980 }, { "epoch": 2.9842578710644676, "grad_norm": 0.03919515081336705, "learning_rate": 1.6782240815849116e-08, "loss": 0.0018, "step": 3981 }, { "epoch": 2.9850074962518742, "grad_norm": 0.032663649258333316, "learning_rate": 1.5222026719530213e-08, "loss": 0.0017, "step": 3982 }, { "epoch": 2.9857571214392804, "grad_norm": 0.26096980807808, "learning_rate": 1.3737913097178201e-08, "loss": 0.0058, "step": 3983 }, { "epoch": 2.9865067466266866, "grad_norm": 0.12084288234059795, "learning_rate": 1.2329901078378392e-08, "loss": 0.0024, "step": 3984 }, { "epoch": 2.9872563718140928, "grad_norm": 0.06595470019370689, "learning_rate": 1.0997991734806868e-08, "loss": 0.0048, "step": 3985 }, { "epoch": 2.9880059970014994, "grad_norm": 0.1666299653323181, "learning_rate": 9.742186080208271e-09, "loss": 0.0096, "step": 3986 }, { "epoch": 2.9887556221889056, "grad_norm": 0.19747301065827289, "learning_rate": 8.56248507039581e-09, "loss": 0.0186, "step": 3987 }, { "epoch": 2.9895052473763117, "grad_norm": 0.076916580436097, "learning_rate": 7.458889603262353e-09, "loss": 0.006, "step": 3988 }, { "epoch": 2.9902548725637184, "grad_norm": 0.06781979930201935, "learning_rate": 6.4314005187804394e-09, "loss": 0.0021, "step": 3989 }, { "epoch": 2.9910044977511245, "grad_norm": 0.06140423539445794, "learning_rate": 5.480018598991166e-09, "loss": 0.0019, "step": 3990 }, { "epoch": 2.9917541229385307, "grad_norm": 0.1661028082846302, "learning_rate": 4.604744568015296e-09, "loss": 0.0034, "step": 3991 }, { "epoch": 2.992503748125937, "grad_norm": 0.04500281969970609, "learning_rate": 3.805579092042155e-09, "loss": 0.003, "step": 3992 }, { "epoch": 2.993253373313343, "grad_norm": 0.07088654187160519, "learning_rate": 3.0825227793296328e-09, "loss": 0.0037, "step": 3993 }, { "epoch": 2.9940029985007497, "grad_norm": 0.15553196796955818, "learning_rate": 2.43557618020418e-09, "loss": 0.0053, "step": 3994 }, { "epoch": 2.994752623688156, "grad_norm": 0.14671990188627287, "learning_rate": 1.864739787083014e-09, "loss": 0.0077, "step": 3995 }, { "epoch": 2.995502248875562, "grad_norm": 0.04418675692717229, "learning_rate": 1.3700140344408141e-09, "loss": 0.0022, "step": 3996 }, { "epoch": 2.9962518740629687, "grad_norm": 0.1705912775123118, "learning_rate": 9.513992988097188e-10, "loss": 0.016, "step": 3997 }, { "epoch": 2.997001499250375, "grad_norm": 0.1381779105328255, "learning_rate": 6.088958988237359e-10, "loss": 0.0071, "step": 3998 }, { "epoch": 2.997751124437781, "grad_norm": 0.04728810879882926, "learning_rate": 3.425040951521297e-10, "loss": 0.0027, "step": 3999 }, { "epoch": 2.9985007496251876, "grad_norm": 0.08131220348567887, "learning_rate": 1.5222409056603327e-10, "loss": 0.0013, "step": 4000 }, { "epoch": 2.999250374812594, "grad_norm": 0.08993400957198339, "learning_rate": 3.805602988293799e-11, "loss": 0.0045, "step": 4001 }, { "epoch": 3.0, "grad_norm": 0.04477219697998604, "learning_rate": 0.0, "loss": 0.0009, "step": 4002 }, { "epoch": 3.0, "eval_loss": 0.03329204395413399, "eval_runtime": 1896.306, "eval_samples_per_second": 5.472, "eval_steps_per_second": 0.684, "step": 4002 }, { "epoch": 3.0, "step": 4002, "total_flos": 3989157755650048.0, "train_loss": 0.020737408151027925, "train_runtime": 46333.6393, "train_samples_per_second": 1.382, "train_steps_per_second": 0.086 } ], "logging_steps": 1, "max_steps": 4002, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3989157755650048.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }