diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7496 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.757632398753894, + "eval_steps": 151, + "global_step": 1057, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016628559551028891, + "grad_norm": 6.876781701391548, + "learning_rate": 2e-07, + "loss": 1.2322, + "step": 1 + }, + { + "epoch": 0.0016628559551028891, + "eval_loss": 1.6686532497406006, + "eval_runtime": 4.7271, + "eval_samples_per_second": 2.327, + "eval_steps_per_second": 0.635, + "step": 1 + }, + { + "epoch": 0.0033257119102057782, + "grad_norm": 7.15147151737489, + "learning_rate": 4e-07, + "loss": 1.3856, + "step": 2 + }, + { + "epoch": 0.004988567865308668, + "grad_norm": 6.675277261967279, + "learning_rate": 6e-07, + "loss": 1.3114, + "step": 3 + }, + { + "epoch": 0.0066514238204115565, + "grad_norm": 6.276373071109286, + "learning_rate": 8e-07, + "loss": 1.2768, + "step": 4 + }, + { + "epoch": 0.008314279775514447, + "grad_norm": 6.004954010868464, + "learning_rate": 1e-06, + "loss": 1.3114, + "step": 5 + }, + { + "epoch": 0.009977135730617336, + "grad_norm": 6.210557996205687, + "learning_rate": 1.2e-06, + "loss": 1.3777, + "step": 6 + }, + { + "epoch": 0.011639991685720224, + "grad_norm": 6.22480460719047, + "learning_rate": 1.4e-06, + "loss": 1.1786, + "step": 7 + }, + { + "epoch": 0.013302847640823113, + "grad_norm": 4.853536830796908, + "learning_rate": 1.6e-06, + "loss": 1.432, + "step": 8 + }, + { + "epoch": 0.014965703595926003, + "grad_norm": 4.549188866460316, + "learning_rate": 1.8e-06, + "loss": 1.2102, + "step": 9 + }, + { + "epoch": 0.016628559551028894, + "grad_norm": 4.596321987039164, + "learning_rate": 2e-06, + "loss": 1.4431, + "step": 10 + }, + { + "epoch": 0.01829141550613178, + "grad_norm": 3.809686255936276, + "learning_rate": 1.9999984649975976e-06, + "loss": 1.1868, + "step": 11 + }, + { + "epoch": 0.01995427146123467, + "grad_norm": 3.831577778436731, + "learning_rate": 1.999993859995103e-06, + "loss": 1.2103, + "step": 12 + }, + { + "epoch": 0.021617127416337558, + "grad_norm": 3.462783579837251, + "learning_rate": 1.999986185006654e-06, + "loss": 1.2476, + "step": 13 + }, + { + "epoch": 0.02327998337144045, + "grad_norm": 3.3297536194169064, + "learning_rate": 1.999975440055812e-06, + "loss": 1.1819, + "step": 14 + }, + { + "epoch": 0.02494283932654334, + "grad_norm": 3.8443918916019637, + "learning_rate": 1.999961625175565e-06, + "loss": 1.1482, + "step": 15 + }, + { + "epoch": 0.026605695281646226, + "grad_norm": 4.209422853092109, + "learning_rate": 1.999944740408324e-06, + "loss": 1.2177, + "step": 16 + }, + { + "epoch": 0.028268551236749116, + "grad_norm": 3.6743224915375583, + "learning_rate": 1.9999247858059257e-06, + "loss": 1.3183, + "step": 17 + }, + { + "epoch": 0.029931407191852007, + "grad_norm": 3.387454350282322, + "learning_rate": 1.999901761429631e-06, + "loss": 1.1455, + "step": 18 + }, + { + "epoch": 0.0315942631469549, + "grad_norm": 2.9657179454835356, + "learning_rate": 1.9998756673501237e-06, + "loss": 1.1463, + "step": 19 + }, + { + "epoch": 0.03325711910205779, + "grad_norm": 3.455180779074787, + "learning_rate": 1.9998465036475145e-06, + "loss": 1.1571, + "step": 20 + }, + { + "epoch": 0.03491997505716067, + "grad_norm": 2.787618991649651, + "learning_rate": 1.9998142704113346e-06, + "loss": 1.2552, + "step": 21 + }, + { + 
"epoch": 0.03658283101226356, + "grad_norm": 2.765977452887127, + "learning_rate": 1.9997789677405414e-06, + "loss": 1.2326, + "step": 22 + }, + { + "epoch": 0.03824568696736645, + "grad_norm": 2.6518581076350554, + "learning_rate": 1.9997405957435133e-06, + "loss": 1.2298, + "step": 23 + }, + { + "epoch": 0.03990854292246934, + "grad_norm": 3.143557273823712, + "learning_rate": 1.999699154538053e-06, + "loss": 1.3306, + "step": 24 + }, + { + "epoch": 0.04157139887757223, + "grad_norm": 2.519680050881837, + "learning_rate": 1.999654644251385e-06, + "loss": 1.1641, + "step": 25 + }, + { + "epoch": 0.043234254832675116, + "grad_norm": 2.575750435555698, + "learning_rate": 1.9996070650201564e-06, + "loss": 1.0828, + "step": 26 + }, + { + "epoch": 0.044897110787778007, + "grad_norm": 2.5829804245352403, + "learning_rate": 1.9995564169904354e-06, + "loss": 1.1247, + "step": 27 + }, + { + "epoch": 0.0465599667428809, + "grad_norm": 2.4503279434629874, + "learning_rate": 1.9995027003177116e-06, + "loss": 1.1183, + "step": 28 + }, + { + "epoch": 0.04822282269798379, + "grad_norm": 2.2946133391864487, + "learning_rate": 1.9994459151668956e-06, + "loss": 1.2234, + "step": 29 + }, + { + "epoch": 0.04988567865308668, + "grad_norm": 2.403866063427151, + "learning_rate": 1.9993860617123183e-06, + "loss": 1.1327, + "step": 30 + }, + { + "epoch": 0.05154853460818957, + "grad_norm": 2.271628438557979, + "learning_rate": 1.99932314013773e-06, + "loss": 1.2817, + "step": 31 + }, + { + "epoch": 0.05321139056329245, + "grad_norm": 2.428340751451155, + "learning_rate": 1.9992571506362995e-06, + "loss": 1.1559, + "step": 32 + }, + { + "epoch": 0.05487424651839534, + "grad_norm": 2.2614420815326604, + "learning_rate": 1.999188093410616e-06, + "loss": 1.1504, + "step": 33 + }, + { + "epoch": 0.05653710247349823, + "grad_norm": 2.34799470197157, + "learning_rate": 1.9991159686726847e-06, + "loss": 1.2409, + "step": 34 + }, + { + "epoch": 0.05819995842860112, + "grad_norm": 2.2003715198087193, + "learning_rate": 1.9990407766439296e-06, + "loss": 1.1731, + "step": 35 + }, + { + "epoch": 0.05986281438370401, + "grad_norm": 2.0892124499601645, + "learning_rate": 1.99896251755519e-06, + "loss": 1.1715, + "step": 36 + }, + { + "epoch": 0.061525670338806904, + "grad_norm": 2.2608789864787218, + "learning_rate": 1.998881191646722e-06, + "loss": 1.1871, + "step": 37 + }, + { + "epoch": 0.0631885262939098, + "grad_norm": 2.4451085935787447, + "learning_rate": 1.9987967991681964e-06, + "loss": 1.31, + "step": 38 + }, + { + "epoch": 0.06485138224901268, + "grad_norm": 2.4506760337598776, + "learning_rate": 1.9987093403786983e-06, + "loss": 1.184, + "step": 39 + }, + { + "epoch": 0.06651423820411557, + "grad_norm": 2.5379516806093267, + "learning_rate": 1.9986188155467267e-06, + "loss": 1.1922, + "step": 40 + }, + { + "epoch": 0.06817709415921845, + "grad_norm": 2.35766565204526, + "learning_rate": 1.998525224950194e-06, + "loss": 1.1608, + "step": 41 + }, + { + "epoch": 0.06983995011432134, + "grad_norm": 2.2438854614295933, + "learning_rate": 1.9984285688764225e-06, + "loss": 1.1149, + "step": 42 + }, + { + "epoch": 0.07150280606942423, + "grad_norm": 2.5176050951814792, + "learning_rate": 1.998328847622148e-06, + "loss": 1.194, + "step": 43 + }, + { + "epoch": 0.07316566202452712, + "grad_norm": 3.5070711652473388, + "learning_rate": 1.998226061493514e-06, + "loss": 1.0549, + "step": 44 + }, + { + "epoch": 0.07482851797963001, + "grad_norm": 3.8844422731190065, + "learning_rate": 1.9981202108060757e-06, + "loss": 1.2055, + 
"step": 45 + }, + { + "epoch": 0.0764913739347329, + "grad_norm": 3.151088308194362, + "learning_rate": 1.9980112958847947e-06, + "loss": 1.3344, + "step": 46 + }, + { + "epoch": 0.0781542298898358, + "grad_norm": 2.8997950166217508, + "learning_rate": 1.9978993170640403e-06, + "loss": 1.2626, + "step": 47 + }, + { + "epoch": 0.07981708584493868, + "grad_norm": 2.7439550744638126, + "learning_rate": 1.9977842746875875e-06, + "loss": 1.2413, + "step": 48 + }, + { + "epoch": 0.08147994180004157, + "grad_norm": 2.252937617615512, + "learning_rate": 1.997666169108618e-06, + "loss": 1.1592, + "step": 49 + }, + { + "epoch": 0.08314279775514447, + "grad_norm": 2.171409977320052, + "learning_rate": 1.9975450006897158e-06, + "loss": 1.2077, + "step": 50 + }, + { + "epoch": 0.08480565371024736, + "grad_norm": 2.384700039420674, + "learning_rate": 1.9974207698028686e-06, + "loss": 1.2758, + "step": 51 + }, + { + "epoch": 0.08646850966535023, + "grad_norm": 2.2038886028156113, + "learning_rate": 1.9972934768294655e-06, + "loss": 1.0498, + "step": 52 + }, + { + "epoch": 0.08813136562045312, + "grad_norm": 2.104326907097657, + "learning_rate": 1.9971631221602976e-06, + "loss": 1.093, + "step": 53 + }, + { + "epoch": 0.08979422157555601, + "grad_norm": 2.174068419901149, + "learning_rate": 1.997029706195553e-06, + "loss": 1.2529, + "step": 54 + }, + { + "epoch": 0.0914570775306589, + "grad_norm": 2.232522077464942, + "learning_rate": 1.9968932293448205e-06, + "loss": 1.093, + "step": 55 + }, + { + "epoch": 0.0931199334857618, + "grad_norm": 3.0592613934800372, + "learning_rate": 1.996753692027084e-06, + "loss": 1.2318, + "step": 56 + }, + { + "epoch": 0.09478278944086468, + "grad_norm": 2.3919925670804103, + "learning_rate": 1.996611094670724e-06, + "loss": 1.1787, + "step": 57 + }, + { + "epoch": 0.09644564539596757, + "grad_norm": 2.839760642367617, + "learning_rate": 1.9964654377135153e-06, + "loss": 1.2441, + "step": 58 + }, + { + "epoch": 0.09810850135107047, + "grad_norm": 2.536098443986401, + "learning_rate": 1.996316721602625e-06, + "loss": 1.2302, + "step": 59 + }, + { + "epoch": 0.09977135730617336, + "grad_norm": 2.2895217740422447, + "learning_rate": 1.9961649467946124e-06, + "loss": 1.2022, + "step": 60 + }, + { + "epoch": 0.10143421326127625, + "grad_norm": 2.1513461743943414, + "learning_rate": 1.996010113755427e-06, + "loss": 1.1063, + "step": 61 + }, + { + "epoch": 0.10309706921637914, + "grad_norm": 2.1179800201775514, + "learning_rate": 1.995852222960407e-06, + "loss": 1.0463, + "step": 62 + }, + { + "epoch": 0.10475992517148203, + "grad_norm": 2.172434098998751, + "learning_rate": 1.9956912748942776e-06, + "loss": 1.1041, + "step": 63 + }, + { + "epoch": 0.1064227811265849, + "grad_norm": 2.254257836851709, + "learning_rate": 1.9955272700511504e-06, + "loss": 1.079, + "step": 64 + }, + { + "epoch": 0.1080856370816878, + "grad_norm": 2.3081819678701234, + "learning_rate": 1.9953602089345213e-06, + "loss": 1.2835, + "step": 65 + }, + { + "epoch": 0.10974849303679068, + "grad_norm": 2.4521650444864047, + "learning_rate": 1.9951900920572684e-06, + "loss": 1.0946, + "step": 66 + }, + { + "epoch": 0.11141134899189357, + "grad_norm": 2.301990359392983, + "learning_rate": 1.9950169199416512e-06, + "loss": 1.1454, + "step": 67 + }, + { + "epoch": 0.11307420494699646, + "grad_norm": 2.159086848434006, + "learning_rate": 1.994840693119309e-06, + "loss": 1.1554, + "step": 68 + }, + { + "epoch": 0.11473706090209936, + "grad_norm": 2.5488828016690834, + "learning_rate": 1.994661412131259e-06, + 
"loss": 1.1537, + "step": 69 + }, + { + "epoch": 0.11639991685720225, + "grad_norm": 2.171482670853498, + "learning_rate": 1.9944790775278954e-06, + "loss": 1.2015, + "step": 70 + }, + { + "epoch": 0.11806277281230514, + "grad_norm": 2.247147793101678, + "learning_rate": 1.994293689868985e-06, + "loss": 1.1658, + "step": 71 + }, + { + "epoch": 0.11972562876740803, + "grad_norm": 2.120880044321267, + "learning_rate": 1.99410524972367e-06, + "loss": 1.2776, + "step": 72 + }, + { + "epoch": 0.12138848472251092, + "grad_norm": 2.1241719868023434, + "learning_rate": 1.993913757670462e-06, + "loss": 1.1336, + "step": 73 + }, + { + "epoch": 0.12305134067761381, + "grad_norm": 2.326506113842894, + "learning_rate": 1.9937192142972426e-06, + "loss": 1.2217, + "step": 74 + }, + { + "epoch": 0.12471419663271668, + "grad_norm": 2.2329181207257536, + "learning_rate": 1.9935216202012607e-06, + "loss": 1.2696, + "step": 75 + }, + { + "epoch": 0.1263770525878196, + "grad_norm": 2.4936798260412947, + "learning_rate": 1.993320975989131e-06, + "loss": 1.1711, + "step": 76 + }, + { + "epoch": 0.12803990854292246, + "grad_norm": 1.9925320853249602, + "learning_rate": 1.993117282276833e-06, + "loss": 1.1406, + "step": 77 + }, + { + "epoch": 0.12970276449802537, + "grad_norm": 2.2228263048635073, + "learning_rate": 1.992910539689707e-06, + "loss": 1.1856, + "step": 78 + }, + { + "epoch": 0.13136562045312825, + "grad_norm": 2.316736137818131, + "learning_rate": 1.9927007488624534e-06, + "loss": 1.1939, + "step": 79 + }, + { + "epoch": 0.13302847640823115, + "grad_norm": 2.2236401181022227, + "learning_rate": 1.9924879104391306e-06, + "loss": 1.1508, + "step": 80 + }, + { + "epoch": 0.13469133236333403, + "grad_norm": 2.3513673044729053, + "learning_rate": 1.992272025073155e-06, + "loss": 1.1877, + "step": 81 + }, + { + "epoch": 0.1363541883184369, + "grad_norm": 2.1953606776371655, + "learning_rate": 1.9920530934272946e-06, + "loss": 1.0935, + "step": 82 + }, + { + "epoch": 0.1380170442735398, + "grad_norm": 2.1614518656924693, + "learning_rate": 1.9918311161736713e-06, + "loss": 1.0932, + "step": 83 + }, + { + "epoch": 0.13967990022864268, + "grad_norm": 2.2701167674984117, + "learning_rate": 1.9916060939937557e-06, + "loss": 1.2265, + "step": 84 + }, + { + "epoch": 0.1413427561837456, + "grad_norm": 2.186355070972481, + "learning_rate": 1.9913780275783674e-06, + "loss": 1.102, + "step": 85 + }, + { + "epoch": 0.14300561213884846, + "grad_norm": 2.1276524267117267, + "learning_rate": 1.991146917627671e-06, + "loss": 1.2096, + "step": 86 + }, + { + "epoch": 0.14466846809395137, + "grad_norm": 2.2049755692680653, + "learning_rate": 1.9909127648511754e-06, + "loss": 1.1638, + "step": 87 + }, + { + "epoch": 0.14633132404905425, + "grad_norm": 2.193976635251867, + "learning_rate": 1.990675569967731e-06, + "loss": 1.3051, + "step": 88 + }, + { + "epoch": 0.14799418000415715, + "grad_norm": 2.3179428851026205, + "learning_rate": 1.990435333705527e-06, + "loss": 1.1257, + "step": 89 + }, + { + "epoch": 0.14965703595926003, + "grad_norm": 2.9183136856468925, + "learning_rate": 1.9901920568020894e-06, + "loss": 1.0138, + "step": 90 + }, + { + "epoch": 0.15131989191436293, + "grad_norm": 2.0666844143324057, + "learning_rate": 1.9899457400042806e-06, + "loss": 1.0314, + "step": 91 + }, + { + "epoch": 0.1529827478694658, + "grad_norm": 2.146945135936375, + "learning_rate": 1.9896963840682935e-06, + "loss": 1.14, + "step": 92 + }, + { + "epoch": 0.15464560382456868, + "grad_norm": 2.0563546518727427, + "learning_rate": 
1.989443989759652e-06, + "loss": 1.1322, + "step": 93 + }, + { + "epoch": 0.1563084597796716, + "grad_norm": 2.0942173553840124, + "learning_rate": 1.989188557853208e-06, + "loss": 1.1109, + "step": 94 + }, + { + "epoch": 0.15797131573477446, + "grad_norm": 2.077080441568994, + "learning_rate": 1.9889300891331387e-06, + "loss": 1.0046, + "step": 95 + }, + { + "epoch": 0.15963417168987737, + "grad_norm": 2.3184144062168803, + "learning_rate": 1.9886685843929446e-06, + "loss": 1.18, + "step": 96 + }, + { + "epoch": 0.16129702764498025, + "grad_norm": 2.116450161143651, + "learning_rate": 1.988404044435446e-06, + "loss": 1.1773, + "step": 97 + }, + { + "epoch": 0.16295988360008315, + "grad_norm": 2.195134284878133, + "learning_rate": 1.988136470072782e-06, + "loss": 1.1737, + "step": 98 + }, + { + "epoch": 0.16462273955518603, + "grad_norm": 2.2738180312076977, + "learning_rate": 1.987865862126408e-06, + "loss": 1.2176, + "step": 99 + }, + { + "epoch": 0.16628559551028893, + "grad_norm": 2.31803637733792, + "learning_rate": 1.98759222142709e-06, + "loss": 1.0916, + "step": 100 + }, + { + "epoch": 0.1679484514653918, + "grad_norm": 2.1944116623564676, + "learning_rate": 1.9873155488149076e-06, + "loss": 1.1836, + "step": 101 + }, + { + "epoch": 0.1696113074204947, + "grad_norm": 2.220799107434225, + "learning_rate": 1.9870358451392464e-06, + "loss": 1.0624, + "step": 102 + }, + { + "epoch": 0.1712741633755976, + "grad_norm": 2.2574992766238373, + "learning_rate": 1.9867531112587987e-06, + "loss": 1.0148, + "step": 103 + }, + { + "epoch": 0.17293701933070046, + "grad_norm": 2.2367520120502524, + "learning_rate": 1.9864673480415585e-06, + "loss": 1.2407, + "step": 104 + }, + { + "epoch": 0.17459987528580337, + "grad_norm": 2.4288409651842477, + "learning_rate": 1.98617855636482e-06, + "loss": 1.0532, + "step": 105 + }, + { + "epoch": 0.17626273124090625, + "grad_norm": 2.422107031056536, + "learning_rate": 1.9858867371151753e-06, + "loss": 1.1696, + "step": 106 + }, + { + "epoch": 0.17792558719600915, + "grad_norm": 2.1987680710210693, + "learning_rate": 1.9855918911885106e-06, + "loss": 1.0976, + "step": 107 + }, + { + "epoch": 0.17958844315111203, + "grad_norm": 2.0312525346517307, + "learning_rate": 1.985294019490005e-06, + "loss": 1.052, + "step": 108 + }, + { + "epoch": 0.18125129910621493, + "grad_norm": 2.2392770140952116, + "learning_rate": 1.9849931229341256e-06, + "loss": 1.0335, + "step": 109 + }, + { + "epoch": 0.1829141550613178, + "grad_norm": 2.4061693671071227, + "learning_rate": 1.984689202444626e-06, + "loss": 1.2724, + "step": 110 + }, + { + "epoch": 0.1845770110164207, + "grad_norm": 2.381783600731093, + "learning_rate": 1.984382258954544e-06, + "loss": 1.0571, + "step": 111 + }, + { + "epoch": 0.1862398669715236, + "grad_norm": 2.3568823162290076, + "learning_rate": 1.984072293406197e-06, + "loss": 1.1028, + "step": 112 + }, + { + "epoch": 0.1879027229266265, + "grad_norm": 3.889820858228295, + "learning_rate": 1.983759306751182e-06, + "loss": 1.2139, + "step": 113 + }, + { + "epoch": 0.18956557888172937, + "grad_norm": 2.2124619744454415, + "learning_rate": 1.9834432999503684e-06, + "loss": 1.0924, + "step": 114 + }, + { + "epoch": 0.19122843483683227, + "grad_norm": 2.3492979702975023, + "learning_rate": 1.9831242739738986e-06, + "loss": 1.2233, + "step": 115 + }, + { + "epoch": 0.19289129079193515, + "grad_norm": 2.6172379375651347, + "learning_rate": 1.982802229801184e-06, + "loss": 1.1797, + "step": 116 + }, + { + "epoch": 0.19455414674703803, + "grad_norm": 
2.195119546109998, + "learning_rate": 1.9824771684209024e-06, + "loss": 1.0346, + "step": 117 + }, + { + "epoch": 0.19621700270214093, + "grad_norm": 2.1198330508663, + "learning_rate": 1.982149090830993e-06, + "loss": 1.2163, + "step": 118 + }, + { + "epoch": 0.1978798586572438, + "grad_norm": 2.23691299776538, + "learning_rate": 1.981817998038656e-06, + "loss": 1.1865, + "step": 119 + }, + { + "epoch": 0.1995427146123467, + "grad_norm": 2.088815523084208, + "learning_rate": 1.981483891060348e-06, + "loss": 1.1199, + "step": 120 + }, + { + "epoch": 0.2012055705674496, + "grad_norm": 2.273270712463945, + "learning_rate": 1.9811467709217785e-06, + "loss": 1.2325, + "step": 121 + }, + { + "epoch": 0.2028684265225525, + "grad_norm": 2.18183229638507, + "learning_rate": 1.980806638657908e-06, + "loss": 1.2246, + "step": 122 + }, + { + "epoch": 0.20453128247765537, + "grad_norm": 3.193814855632699, + "learning_rate": 1.980463495312945e-06, + "loss": 1.0395, + "step": 123 + }, + { + "epoch": 0.20619413843275827, + "grad_norm": 2.465032307682835, + "learning_rate": 1.9801173419403404e-06, + "loss": 1.171, + "step": 124 + }, + { + "epoch": 0.20785699438786115, + "grad_norm": 2.2673988159196665, + "learning_rate": 1.979768179602787e-06, + "loss": 1.0496, + "step": 125 + }, + { + "epoch": 0.20951985034296405, + "grad_norm": 2.1447719311727265, + "learning_rate": 1.9794160093722147e-06, + "loss": 1.0066, + "step": 126 + }, + { + "epoch": 0.21118270629806693, + "grad_norm": 2.760076601802052, + "learning_rate": 1.979060832329788e-06, + "loss": 1.1358, + "step": 127 + }, + { + "epoch": 0.2128455622531698, + "grad_norm": 2.2167665381661865, + "learning_rate": 1.978702649565902e-06, + "loss": 1.114, + "step": 128 + }, + { + "epoch": 0.2145084182082727, + "grad_norm": 2.2791559309596594, + "learning_rate": 1.9783414621801796e-06, + "loss": 1.0434, + "step": 129 + }, + { + "epoch": 0.2161712741633756, + "grad_norm": 2.432104553889974, + "learning_rate": 1.9779772712814675e-06, + "loss": 1.2542, + "step": 130 + }, + { + "epoch": 0.2178341301184785, + "grad_norm": 2.7306377502903985, + "learning_rate": 1.9776100779878343e-06, + "loss": 1.0987, + "step": 131 + }, + { + "epoch": 0.21949698607358137, + "grad_norm": 2.7879620153482114, + "learning_rate": 1.977239883426564e-06, + "loss": 1.1479, + "step": 132 + }, + { + "epoch": 0.22115984202868427, + "grad_norm": 2.1514985446900354, + "learning_rate": 1.9768666887341564e-06, + "loss": 1.2583, + "step": 133 + }, + { + "epoch": 0.22282269798378715, + "grad_norm": 2.0415497832604057, + "learning_rate": 1.9764904950563213e-06, + "loss": 1.1969, + "step": 134 + }, + { + "epoch": 0.22448555393889005, + "grad_norm": 2.2681683904958403, + "learning_rate": 1.9761113035479745e-06, + "loss": 1.2112, + "step": 135 + }, + { + "epoch": 0.22614840989399293, + "grad_norm": 2.260381990967556, + "learning_rate": 1.975729115373236e-06, + "loss": 1.1884, + "step": 136 + }, + { + "epoch": 0.22781126584909583, + "grad_norm": 2.384237988040654, + "learning_rate": 1.9753439317054253e-06, + "loss": 0.9922, + "step": 137 + }, + { + "epoch": 0.2294741218041987, + "grad_norm": 2.2694287379916007, + "learning_rate": 1.9749557537270584e-06, + "loss": 1.0884, + "step": 138 + }, + { + "epoch": 0.2311369777593016, + "grad_norm": 2.358144314592439, + "learning_rate": 1.974564582629843e-06, + "loss": 1.1049, + "step": 139 + }, + { + "epoch": 0.2327998337144045, + "grad_norm": 2.1123073465297306, + "learning_rate": 1.9741704196146766e-06, + "loss": 1.0306, + "step": 140 + }, + { + "epoch": 
0.23446268966950737, + "grad_norm": 2.1512707106124798, + "learning_rate": 1.973773265891641e-06, + "loss": 1.0528, + "step": 141 + }, + { + "epoch": 0.23612554562461027, + "grad_norm": 2.172534073331255, + "learning_rate": 1.973373122680001e-06, + "loss": 1.1115, + "step": 142 + }, + { + "epoch": 0.23778840157971315, + "grad_norm": 2.3733216420093473, + "learning_rate": 1.972969991208198e-06, + "loss": 1.1323, + "step": 143 + }, + { + "epoch": 0.23945125753481605, + "grad_norm": 2.4625937737389094, + "learning_rate": 1.9725638727138465e-06, + "loss": 1.0889, + "step": 144 + }, + { + "epoch": 0.24111411348991893, + "grad_norm": 2.2420911044484395, + "learning_rate": 1.9721547684437332e-06, + "loss": 1.0748, + "step": 145 + }, + { + "epoch": 0.24277696944502183, + "grad_norm": 2.274748664959236, + "learning_rate": 1.97174267965381e-06, + "loss": 1.2631, + "step": 146 + }, + { + "epoch": 0.2444398254001247, + "grad_norm": 2.251081234626043, + "learning_rate": 1.9713276076091916e-06, + "loss": 1.2066, + "step": 147 + }, + { + "epoch": 0.24610268135522761, + "grad_norm": 2.2539634056834137, + "learning_rate": 1.970909553584151e-06, + "loss": 1.0727, + "step": 148 + }, + { + "epoch": 0.2477655373103305, + "grad_norm": 2.1434067229672094, + "learning_rate": 1.9704885188621157e-06, + "loss": 1.1082, + "step": 149 + }, + { + "epoch": 0.24942839326543337, + "grad_norm": 2.236934300850519, + "learning_rate": 1.970064504735665e-06, + "loss": 1.1063, + "step": 150 + }, + { + "epoch": 0.2510912492205363, + "grad_norm": 2.760825374012749, + "learning_rate": 1.969637512506524e-06, + "loss": 1.2253, + "step": 151 + }, + { + "epoch": 0.2510912492205363, + "eval_loss": 1.4304277896881104, + "eval_runtime": 24.5872, + "eval_samples_per_second": 0.447, + "eval_steps_per_second": 0.122, + "step": 151 + }, + { + "epoch": 0.2527541051756392, + "grad_norm": 2.548532327058704, + "learning_rate": 1.9692075434855604e-06, + "loss": 1.1196, + "step": 152 + }, + { + "epoch": 0.254416961130742, + "grad_norm": 2.3149961972538464, + "learning_rate": 1.9687745989927824e-06, + "loss": 1.093, + "step": 153 + }, + { + "epoch": 0.25607981708584493, + "grad_norm": 2.2766910929781257, + "learning_rate": 1.96833868035733e-06, + "loss": 1.1225, + "step": 154 + }, + { + "epoch": 0.25774267304094783, + "grad_norm": 2.7395763625060674, + "learning_rate": 1.967899788917477e-06, + "loss": 1.1423, + "step": 155 + }, + { + "epoch": 0.25940552899605074, + "grad_norm": 2.389993560718361, + "learning_rate": 1.9674579260206213e-06, + "loss": 1.1823, + "step": 156 + }, + { + "epoch": 0.2610683849511536, + "grad_norm": 2.483941805273633, + "learning_rate": 1.967013093023285e-06, + "loss": 1.2279, + "step": 157 + }, + { + "epoch": 0.2627312409062565, + "grad_norm": 2.2477103547008914, + "learning_rate": 1.9665652912911065e-06, + "loss": 1.1962, + "step": 158 + }, + { + "epoch": 0.2643940968613594, + "grad_norm": 2.841612920075852, + "learning_rate": 1.9661145221988398e-06, + "loss": 1.1292, + "step": 159 + }, + { + "epoch": 0.2660569528164623, + "grad_norm": 2.5188774282226487, + "learning_rate": 1.9656607871303485e-06, + "loss": 1.0174, + "step": 160 + }, + { + "epoch": 0.26771980877156515, + "grad_norm": 2.601863007672941, + "learning_rate": 1.9652040874786007e-06, + "loss": 1.1803, + "step": 161 + }, + { + "epoch": 0.26938266472666805, + "grad_norm": 2.1729305988620027, + "learning_rate": 1.964744424645667e-06, + "loss": 1.0692, + "step": 162 + }, + { + "epoch": 0.27104552068177096, + "grad_norm": 2.4137099619779017, + "learning_rate": 
1.964281800042714e-06, + "loss": 1.2412, + "step": 163 + }, + { + "epoch": 0.2727083766368738, + "grad_norm": 2.4257004197876437, + "learning_rate": 1.9638162150900025e-06, + "loss": 1.1754, + "step": 164 + }, + { + "epoch": 0.2743712325919767, + "grad_norm": 2.127187241617996, + "learning_rate": 1.96334767121688e-06, + "loss": 1.1212, + "step": 165 + }, + { + "epoch": 0.2760340885470796, + "grad_norm": 2.5505445228517796, + "learning_rate": 1.962876169861778e-06, + "loss": 1.1873, + "step": 166 + }, + { + "epoch": 0.2776969445021825, + "grad_norm": 2.362761021929743, + "learning_rate": 1.9624017124722085e-06, + "loss": 1.1519, + "step": 167 + }, + { + "epoch": 0.27935980045728537, + "grad_norm": 2.3485374311573874, + "learning_rate": 1.9619243005047574e-06, + "loss": 1.2356, + "step": 168 + }, + { + "epoch": 0.28102265641238827, + "grad_norm": 2.1257685683782066, + "learning_rate": 1.9614439354250824e-06, + "loss": 1.1825, + "step": 169 + }, + { + "epoch": 0.2826855123674912, + "grad_norm": 2.395738763591683, + "learning_rate": 1.960960618707906e-06, + "loss": 1.1009, + "step": 170 + }, + { + "epoch": 0.2843483683225941, + "grad_norm": 2.2654812048452855, + "learning_rate": 1.9604743518370133e-06, + "loss": 1.1115, + "step": 171 + }, + { + "epoch": 0.28601122427769693, + "grad_norm": 2.377019893527625, + "learning_rate": 1.959985136305246e-06, + "loss": 1.1096, + "step": 172 + }, + { + "epoch": 0.28767408023279983, + "grad_norm": 2.2583188742925597, + "learning_rate": 1.9594929736144973e-06, + "loss": 1.0579, + "step": 173 + }, + { + "epoch": 0.28933693618790274, + "grad_norm": 2.2955420850835644, + "learning_rate": 1.9589978652757096e-06, + "loss": 1.0044, + "step": 174 + }, + { + "epoch": 0.2909997921430056, + "grad_norm": 2.135658412712305, + "learning_rate": 1.9584998128088683e-06, + "loss": 0.9674, + "step": 175 + }, + { + "epoch": 0.2926626480981085, + "grad_norm": 2.3814784805947435, + "learning_rate": 1.9579988177429965e-06, + "loss": 0.9844, + "step": 176 + }, + { + "epoch": 0.2943255040532114, + "grad_norm": 4.249522359005645, + "learning_rate": 1.957494881616151e-06, + "loss": 1.0733, + "step": 177 + }, + { + "epoch": 0.2959883600083143, + "grad_norm": 2.1491559170675005, + "learning_rate": 1.956988005975419e-06, + "loss": 1.1137, + "step": 178 + }, + { + "epoch": 0.29765121596341715, + "grad_norm": 2.237150018318634, + "learning_rate": 1.9564781923769105e-06, + "loss": 1.1954, + "step": 179 + }, + { + "epoch": 0.29931407191852005, + "grad_norm": 2.051176936877328, + "learning_rate": 1.955965442385756e-06, + "loss": 1.132, + "step": 180 + }, + { + "epoch": 0.30097692787362296, + "grad_norm": 3.5719215857546045, + "learning_rate": 1.9554497575761e-06, + "loss": 1.2088, + "step": 181 + }, + { + "epoch": 0.30263978382872586, + "grad_norm": 2.0583047844957343, + "learning_rate": 1.9549311395310982e-06, + "loss": 1.0338, + "step": 182 + }, + { + "epoch": 0.3043026397838287, + "grad_norm": 2.356690350916363, + "learning_rate": 1.9544095898429094e-06, + "loss": 1.1253, + "step": 183 + }, + { + "epoch": 0.3059654957389316, + "grad_norm": 2.31830959115815, + "learning_rate": 1.9538851101126944e-06, + "loss": 0.9722, + "step": 184 + }, + { + "epoch": 0.3076283516940345, + "grad_norm": 2.2973614356210503, + "learning_rate": 1.9533577019506085e-06, + "loss": 1.1471, + "step": 185 + }, + { + "epoch": 0.30929120764913737, + "grad_norm": 2.2790981590004233, + "learning_rate": 1.952827366975797e-06, + "loss": 1.0407, + "step": 186 + }, + { + "epoch": 0.31095406360424027, + "grad_norm": 
2.2618992906420714, + "learning_rate": 1.952294106816391e-06, + "loss": 1.2548, + "step": 187 + }, + { + "epoch": 0.3126169195593432, + "grad_norm": 2.644954828913663, + "learning_rate": 1.9517579231095015e-06, + "loss": 1.2529, + "step": 188 + }, + { + "epoch": 0.3142797755144461, + "grad_norm": 2.256019290547997, + "learning_rate": 1.9512188175012153e-06, + "loss": 1.1046, + "step": 189 + }, + { + "epoch": 0.31594263146954893, + "grad_norm": 2.5306772178751133, + "learning_rate": 1.950676791646589e-06, + "loss": 1.1444, + "step": 190 + }, + { + "epoch": 0.31760548742465183, + "grad_norm": 2.530764997510064, + "learning_rate": 1.9501318472096447e-06, + "loss": 1.1185, + "step": 191 + }, + { + "epoch": 0.31926834337975474, + "grad_norm": 2.308534558092756, + "learning_rate": 1.9495839858633648e-06, + "loss": 1.1669, + "step": 192 + }, + { + "epoch": 0.32093119933485764, + "grad_norm": 2.2116575569490924, + "learning_rate": 1.9490332092896857e-06, + "loss": 1.1689, + "step": 193 + }, + { + "epoch": 0.3225940552899605, + "grad_norm": 3.098403349489372, + "learning_rate": 1.9484795191794943e-06, + "loss": 1.0652, + "step": 194 + }, + { + "epoch": 0.3242569112450634, + "grad_norm": 2.2040383274861965, + "learning_rate": 1.947922917232622e-06, + "loss": 0.9792, + "step": 195 + }, + { + "epoch": 0.3259197672001663, + "grad_norm": 2.2311156638317557, + "learning_rate": 1.9473634051578394e-06, + "loss": 1.0935, + "step": 196 + }, + { + "epoch": 0.32758262315526915, + "grad_norm": 2.2800635132242997, + "learning_rate": 1.946800984672851e-06, + "loss": 1.2076, + "step": 197 + }, + { + "epoch": 0.32924547911037205, + "grad_norm": 3.654896049770491, + "learning_rate": 1.946235657504291e-06, + "loss": 1.1199, + "step": 198 + }, + { + "epoch": 0.33090833506547496, + "grad_norm": 3.21717248498429, + "learning_rate": 1.945667425387716e-06, + "loss": 1.0107, + "step": 199 + }, + { + "epoch": 0.33257119102057786, + "grad_norm": 2.401723766209091, + "learning_rate": 1.9450962900676014e-06, + "loss": 1.1726, + "step": 200 + }, + { + "epoch": 0.3342340469756807, + "grad_norm": 2.395261717837967, + "learning_rate": 1.9445222532973356e-06, + "loss": 1.1804, + "step": 201 + }, + { + "epoch": 0.3358969029307836, + "grad_norm": 2.1446239585083053, + "learning_rate": 1.943945316839214e-06, + "loss": 1.1057, + "step": 202 + }, + { + "epoch": 0.3375597588858865, + "grad_norm": 2.316344120227303, + "learning_rate": 1.9433654824644343e-06, + "loss": 1.2313, + "step": 203 + }, + { + "epoch": 0.3392226148409894, + "grad_norm": 2.260699038364008, + "learning_rate": 1.9427827519530917e-06, + "loss": 1.0259, + "step": 204 + }, + { + "epoch": 0.34088547079609227, + "grad_norm": 2.2098708691255027, + "learning_rate": 1.94219712709417e-06, + "loss": 1.162, + "step": 205 + }, + { + "epoch": 0.3425483267511952, + "grad_norm": 2.177000943321658, + "learning_rate": 1.9416086096855414e-06, + "loss": 1.1537, + "step": 206 + }, + { + "epoch": 0.3442111827062981, + "grad_norm": 2.1645641329995127, + "learning_rate": 1.9410172015339575e-06, + "loss": 1.0991, + "step": 207 + }, + { + "epoch": 0.34587403866140093, + "grad_norm": 2.352398281001825, + "learning_rate": 1.940422904455043e-06, + "loss": 1.1333, + "step": 208 + }, + { + "epoch": 0.34753689461650383, + "grad_norm": 2.3240858436164404, + "learning_rate": 1.939825720273294e-06, + "loss": 1.1263, + "step": 209 + }, + { + "epoch": 0.34919975057160674, + "grad_norm": 2.2225865374520404, + "learning_rate": 1.939225650822068e-06, + "loss": 1.1092, + "step": 210 + }, + { + "epoch": 
0.35086260652670964, + "grad_norm": 2.344261138949479, + "learning_rate": 1.938622697943581e-06, + "loss": 1.1262, + "step": 211 + }, + { + "epoch": 0.3525254624818125, + "grad_norm": 2.0594249160281817, + "learning_rate": 1.938016863488902e-06, + "loss": 1.03, + "step": 212 + }, + { + "epoch": 0.3541883184369154, + "grad_norm": 2.197981511279412, + "learning_rate": 1.9374081493179453e-06, + "loss": 1.2617, + "step": 213 + }, + { + "epoch": 0.3558511743920183, + "grad_norm": 2.3896742142995717, + "learning_rate": 1.9367965572994663e-06, + "loss": 1.1666, + "step": 214 + }, + { + "epoch": 0.3575140303471212, + "grad_norm": 2.2197343813094075, + "learning_rate": 1.9361820893110554e-06, + "loss": 1.1399, + "step": 215 + }, + { + "epoch": 0.35917688630222405, + "grad_norm": 2.0777320432807684, + "learning_rate": 1.9355647472391325e-06, + "loss": 0.997, + "step": 216 + }, + { + "epoch": 0.36083974225732696, + "grad_norm": 2.1390003170764262, + "learning_rate": 1.93494453297894e-06, + "loss": 1.0987, + "step": 217 + }, + { + "epoch": 0.36250259821242986, + "grad_norm": 2.2336666277286485, + "learning_rate": 1.93432144843454e-06, + "loss": 1.1072, + "step": 218 + }, + { + "epoch": 0.36416545416753276, + "grad_norm": 2.5151519435391565, + "learning_rate": 1.933695495518804e-06, + "loss": 1.187, + "step": 219 + }, + { + "epoch": 0.3658283101226356, + "grad_norm": 2.237546644093813, + "learning_rate": 1.9330666761534104e-06, + "loss": 1.1992, + "step": 220 + }, + { + "epoch": 0.3674911660777385, + "grad_norm": 2.20747601132904, + "learning_rate": 1.932434992268838e-06, + "loss": 1.25, + "step": 221 + }, + { + "epoch": 0.3691540220328414, + "grad_norm": 2.0703921340607954, + "learning_rate": 1.9318004458043595e-06, + "loss": 0.9968, + "step": 222 + }, + { + "epoch": 0.37081687798794427, + "grad_norm": 2.6998540968138203, + "learning_rate": 1.9311630387080355e-06, + "loss": 1.1253, + "step": 223 + }, + { + "epoch": 0.3724797339430472, + "grad_norm": 2.2382556329258083, + "learning_rate": 1.9305227729367088e-06, + "loss": 1.033, + "step": 224 + }, + { + "epoch": 0.3741425898981501, + "grad_norm": 2.0744866884439426, + "learning_rate": 1.929879650455998e-06, + "loss": 0.9939, + "step": 225 + }, + { + "epoch": 0.375805445853253, + "grad_norm": 2.1380855568440293, + "learning_rate": 1.929233673240293e-06, + "loss": 1.1454, + "step": 226 + }, + { + "epoch": 0.37746830180835583, + "grad_norm": 2.3748389575645783, + "learning_rate": 1.928584843272746e-06, + "loss": 1.0883, + "step": 227 + }, + { + "epoch": 0.37913115776345874, + "grad_norm": 1.9756985785164076, + "learning_rate": 1.9279331625452694e-06, + "loss": 1.0773, + "step": 228 + }, + { + "epoch": 0.38079401371856164, + "grad_norm": 2.8312536305491416, + "learning_rate": 1.927278633058525e-06, + "loss": 1.1389, + "step": 229 + }, + { + "epoch": 0.38245686967366455, + "grad_norm": 2.130686929883375, + "learning_rate": 1.926621256821922e-06, + "loss": 1.0132, + "step": 230 + }, + { + "epoch": 0.3841197256287674, + "grad_norm": 2.305764167292808, + "learning_rate": 1.9259610358536085e-06, + "loss": 1.1054, + "step": 231 + }, + { + "epoch": 0.3857825815838703, + "grad_norm": 2.3978186298387683, + "learning_rate": 1.9252979721804657e-06, + "loss": 1.0411, + "step": 232 + }, + { + "epoch": 0.3874454375389732, + "grad_norm": 2.195123698256728, + "learning_rate": 1.9246320678381032e-06, + "loss": 1.1931, + "step": 233 + }, + { + "epoch": 0.38910829349407605, + "grad_norm": 2.6254493837547996, + "learning_rate": 1.92396332487085e-06, + "loss": 1.1892, + 
"step": 234 + }, + { + "epoch": 0.39077114944917896, + "grad_norm": 2.419866448691614, + "learning_rate": 1.9232917453317492e-06, + "loss": 1.0236, + "step": 235 + }, + { + "epoch": 0.39243400540428186, + "grad_norm": 2.1862355398588287, + "learning_rate": 1.9226173312825553e-06, + "loss": 1.0528, + "step": 236 + }, + { + "epoch": 0.39409686135938476, + "grad_norm": 2.3764854199802707, + "learning_rate": 1.921940084793721e-06, + "loss": 1.0602, + "step": 237 + }, + { + "epoch": 0.3957597173144876, + "grad_norm": 2.4103818910002057, + "learning_rate": 1.921260007944397e-06, + "loss": 0.9984, + "step": 238 + }, + { + "epoch": 0.3974225732695905, + "grad_norm": 2.1867489158332396, + "learning_rate": 1.920577102822422e-06, + "loss": 1.1426, + "step": 239 + }, + { + "epoch": 0.3990854292246934, + "grad_norm": 2.5978893922687964, + "learning_rate": 1.9198913715243182e-06, + "loss": 1.1274, + "step": 240 + }, + { + "epoch": 0.4007482851797963, + "grad_norm": 2.339459827714988, + "learning_rate": 1.9192028161552843e-06, + "loss": 1.0478, + "step": 241 + }, + { + "epoch": 0.4024111411348992, + "grad_norm": 2.3678596214542162, + "learning_rate": 1.9185114388291886e-06, + "loss": 1.0155, + "step": 242 + }, + { + "epoch": 0.4040739970900021, + "grad_norm": 2.1203964226087106, + "learning_rate": 1.9178172416685625e-06, + "loss": 1.0326, + "step": 243 + }, + { + "epoch": 0.405736853045105, + "grad_norm": 2.333908782402527, + "learning_rate": 1.9171202268045946e-06, + "loss": 1.0474, + "step": 244 + }, + { + "epoch": 0.40739970900020783, + "grad_norm": 2.9030943393253454, + "learning_rate": 1.9164203963771243e-06, + "loss": 1.1753, + "step": 245 + }, + { + "epoch": 0.40906256495531074, + "grad_norm": 2.4513503490604087, + "learning_rate": 1.915717752534634e-06, + "loss": 1.0477, + "step": 246 + }, + { + "epoch": 0.41072542091041364, + "grad_norm": 2.0267508782326735, + "learning_rate": 1.915012297434243e-06, + "loss": 1.0991, + "step": 247 + }, + { + "epoch": 0.41238827686551655, + "grad_norm": 2.184393317130635, + "learning_rate": 1.9143040332417036e-06, + "loss": 1.2479, + "step": 248 + }, + { + "epoch": 0.4140511328206194, + "grad_norm": 2.026607524191998, + "learning_rate": 1.9135929621313886e-06, + "loss": 1.0745, + "step": 249 + }, + { + "epoch": 0.4157139887757223, + "grad_norm": 2.66566941286477, + "learning_rate": 1.912879086286291e-06, + "loss": 0.9754, + "step": 250 + }, + { + "epoch": 0.4173768447308252, + "grad_norm": 2.1367244864183625, + "learning_rate": 1.9121624078980122e-06, + "loss": 1.1809, + "step": 251 + }, + { + "epoch": 0.4190397006859281, + "grad_norm": 2.38106027954001, + "learning_rate": 1.911442929166758e-06, + "loss": 1.105, + "step": 252 + }, + { + "epoch": 0.42070255664103096, + "grad_norm": 2.157838169653484, + "learning_rate": 1.910720652301333e-06, + "loss": 1.1357, + "step": 253 + }, + { + "epoch": 0.42236541259613386, + "grad_norm": 2.4900405664879135, + "learning_rate": 1.9099955795191292e-06, + "loss": 1.1238, + "step": 254 + }, + { + "epoch": 0.42402826855123676, + "grad_norm": 2.2488268195059717, + "learning_rate": 1.9092677130461245e-06, + "loss": 1.0831, + "step": 255 + }, + { + "epoch": 0.4256911245063396, + "grad_norm": 2.266782137690799, + "learning_rate": 1.9085370551168718e-06, + "loss": 1.0905, + "step": 256 + }, + { + "epoch": 0.4273539804614425, + "grad_norm": 3.7403186028520934, + "learning_rate": 1.9078036079744947e-06, + "loss": 1.101, + "step": 257 + }, + { + "epoch": 0.4290168364165454, + "grad_norm": 2.455666608661609, + "learning_rate": 
1.9070673738706796e-06, + "loss": 1.1266, + "step": 258 + }, + { + "epoch": 0.4306796923716483, + "grad_norm": 2.515358176239364, + "learning_rate": 1.9063283550656687e-06, + "loss": 1.1149, + "step": 259 + }, + { + "epoch": 0.4323425483267512, + "grad_norm": 2.210842033312207, + "learning_rate": 1.905586553828253e-06, + "loss": 1.0168, + "step": 260 + }, + { + "epoch": 0.4340054042818541, + "grad_norm": 2.295182467603218, + "learning_rate": 1.9048419724357658e-06, + "loss": 1.173, + "step": 261 + }, + { + "epoch": 0.435668260236957, + "grad_norm": 2.1506193772075886, + "learning_rate": 1.9040946131740762e-06, + "loss": 1.084, + "step": 262 + }, + { + "epoch": 0.4373311161920599, + "grad_norm": 2.0996773232405403, + "learning_rate": 1.9033444783375804e-06, + "loss": 1.1712, + "step": 263 + }, + { + "epoch": 0.43899397214716274, + "grad_norm": 1.9545895155808832, + "learning_rate": 1.9025915702291954e-06, + "loss": 0.9843, + "step": 264 + }, + { + "epoch": 0.44065682810226564, + "grad_norm": 2.2269341512176224, + "learning_rate": 1.9018358911603535e-06, + "loss": 1.1599, + "step": 265 + }, + { + "epoch": 0.44231968405736855, + "grad_norm": 2.1685909801965297, + "learning_rate": 1.901077443450993e-06, + "loss": 1.0454, + "step": 266 + }, + { + "epoch": 0.4439825400124714, + "grad_norm": 2.377103399607552, + "learning_rate": 1.9003162294295513e-06, + "loss": 1.0963, + "step": 267 + }, + { + "epoch": 0.4456453959675743, + "grad_norm": 2.5151408653095904, + "learning_rate": 1.8995522514329601e-06, + "loss": 1.104, + "step": 268 + }, + { + "epoch": 0.4473082519226772, + "grad_norm": 2.094881644352632, + "learning_rate": 1.8987855118066348e-06, + "loss": 1.2278, + "step": 269 + }, + { + "epoch": 0.4489711078777801, + "grad_norm": 2.2344009413987376, + "learning_rate": 1.8980160129044695e-06, + "loss": 1.1049, + "step": 270 + }, + { + "epoch": 0.45063396383288296, + "grad_norm": 2.297839277269806, + "learning_rate": 1.8972437570888307e-06, + "loss": 1.1538, + "step": 271 + }, + { + "epoch": 0.45229681978798586, + "grad_norm": 2.238645887183212, + "learning_rate": 1.8964687467305463e-06, + "loss": 1.1412, + "step": 272 + }, + { + "epoch": 0.45395967574308876, + "grad_norm": 2.145666305518761, + "learning_rate": 1.895690984208902e-06, + "loss": 1.0315, + "step": 273 + }, + { + "epoch": 0.45562253169819167, + "grad_norm": 2.310288742114033, + "learning_rate": 1.894910471911633e-06, + "loss": 1.0277, + "step": 274 + }, + { + "epoch": 0.4572853876532945, + "grad_norm": 2.155602139621284, + "learning_rate": 1.894127212234916e-06, + "loss": 1.0503, + "step": 275 + }, + { + "epoch": 0.4589482436083974, + "grad_norm": 2.6510720645982944, + "learning_rate": 1.8933412075833607e-06, + "loss": 1.0964, + "step": 276 + }, + { + "epoch": 0.4606110995635003, + "grad_norm": 2.077628159647317, + "learning_rate": 1.8925524603700062e-06, + "loss": 1.0037, + "step": 277 + }, + { + "epoch": 0.4622739555186032, + "grad_norm": 2.117391684386515, + "learning_rate": 1.8917609730163103e-06, + "loss": 1.0695, + "step": 278 + }, + { + "epoch": 0.4639368114737061, + "grad_norm": 2.372444846141058, + "learning_rate": 1.8909667479521425e-06, + "loss": 1.1472, + "step": 279 + }, + { + "epoch": 0.465599667428809, + "grad_norm": 2.4078311689148064, + "learning_rate": 1.8901697876157776e-06, + "loss": 1.2753, + "step": 280 + }, + { + "epoch": 0.4672625233839119, + "grad_norm": 3.21629533072984, + "learning_rate": 1.8893700944538881e-06, + "loss": 1.1071, + "step": 281 + }, + { + "epoch": 0.46892537933901474, + "grad_norm": 
2.4295013417380287, + "learning_rate": 1.8885676709215353e-06, + "loss": 1.1355, + "step": 282 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 2.275602301735013, + "learning_rate": 1.8877625194821636e-06, + "loss": 1.2554, + "step": 283 + }, + { + "epoch": 0.47225109124922054, + "grad_norm": 2.3766077332166957, + "learning_rate": 1.8869546426075917e-06, + "loss": 1.0717, + "step": 284 + }, + { + "epoch": 0.47391394720432345, + "grad_norm": 2.132945677416556, + "learning_rate": 1.8861440427780058e-06, + "loss": 1.1736, + "step": 285 + }, + { + "epoch": 0.4755768031594263, + "grad_norm": 2.045707307029965, + "learning_rate": 1.8853307224819506e-06, + "loss": 0.9087, + "step": 286 + }, + { + "epoch": 0.4772396591145292, + "grad_norm": 2.163246456875578, + "learning_rate": 1.8845146842163238e-06, + "loss": 1.1869, + "step": 287 + }, + { + "epoch": 0.4789025150696321, + "grad_norm": 2.230799208427525, + "learning_rate": 1.8836959304863669e-06, + "loss": 1.0081, + "step": 288 + }, + { + "epoch": 0.48056537102473496, + "grad_norm": 2.430756467607878, + "learning_rate": 1.8828744638056573e-06, + "loss": 1.1908, + "step": 289 + }, + { + "epoch": 0.48222822697983786, + "grad_norm": 2.2702841588026117, + "learning_rate": 1.882050286696102e-06, + "loss": 1.1872, + "step": 290 + }, + { + "epoch": 0.48389108293494076, + "grad_norm": 2.214741995886969, + "learning_rate": 1.881223401687929e-06, + "loss": 1.101, + "step": 291 + }, + { + "epoch": 0.48555393889004367, + "grad_norm": 2.1908978532802, + "learning_rate": 1.8803938113196784e-06, + "loss": 1.0547, + "step": 292 + }, + { + "epoch": 0.4872167948451465, + "grad_norm": 2.1957825231056844, + "learning_rate": 1.8795615181381974e-06, + "loss": 1.0209, + "step": 293 + }, + { + "epoch": 0.4888796508002494, + "grad_norm": 2.4659476575942056, + "learning_rate": 1.8787265246986298e-06, + "loss": 1.0354, + "step": 294 + }, + { + "epoch": 0.4905425067553523, + "grad_norm": 2.499181314884686, + "learning_rate": 1.877888833564409e-06, + "loss": 1.1167, + "step": 295 + }, + { + "epoch": 0.49220536271045523, + "grad_norm": 2.297165687585788, + "learning_rate": 1.8770484473072517e-06, + "loss": 1.2812, + "step": 296 + }, + { + "epoch": 0.4938682186655581, + "grad_norm": 2.155148132032728, + "learning_rate": 1.8762053685071471e-06, + "loss": 0.9955, + "step": 297 + }, + { + "epoch": 0.495531074620661, + "grad_norm": 2.2105051332330183, + "learning_rate": 1.8753595997523513e-06, + "loss": 1.0817, + "step": 298 + }, + { + "epoch": 0.4971939305757639, + "grad_norm": 2.2489919353382937, + "learning_rate": 1.8745111436393785e-06, + "loss": 1.116, + "step": 299 + }, + { + "epoch": 0.49885678653086674, + "grad_norm": 2.245142666798669, + "learning_rate": 1.8736600027729933e-06, + "loss": 1.2594, + "step": 300 + }, + { + "epoch": 0.5005196424859697, + "grad_norm": 2.4738621609715263, + "learning_rate": 1.8728061797662016e-06, + "loss": 1.2273, + "step": 301 + }, + { + "epoch": 0.5021824984410725, + "grad_norm": 2.1040097093458745, + "learning_rate": 1.8719496772402447e-06, + "loss": 1.1395, + "step": 302 + }, + { + "epoch": 0.5021824984410725, + "eval_loss": 1.4084999561309814, + "eval_runtime": 24.7725, + "eval_samples_per_second": 0.444, + "eval_steps_per_second": 0.121, + "step": 302 + }, + { + "epoch": 0.5038453543961754, + "grad_norm": 2.4241733699272565, + "learning_rate": 1.8710904978245894e-06, + "loss": 1.0195, + "step": 303 + }, + { + "epoch": 0.5055082103512784, + "grad_norm": 2.0299543366481254, + "learning_rate": 1.8702286441569203e-06, + "loss": 
0.9248, + "step": 304 + }, + { + "epoch": 0.5071710663063812, + "grad_norm": 2.0464859041290984, + "learning_rate": 1.8693641188831328e-06, + "loss": 1.0278, + "step": 305 + }, + { + "epoch": 0.508833922261484, + "grad_norm": 2.2925599147135043, + "learning_rate": 1.8684969246573232e-06, + "loss": 1.1409, + "step": 306 + }, + { + "epoch": 0.510496778216587, + "grad_norm": 2.4406740096357598, + "learning_rate": 1.8676270641417821e-06, + "loss": 1.139, + "step": 307 + }, + { + "epoch": 0.5121596341716899, + "grad_norm": 2.2502338928571692, + "learning_rate": 1.8667545400069856e-06, + "loss": 1.2704, + "step": 308 + }, + { + "epoch": 0.5138224901267928, + "grad_norm": 2.6218475196698274, + "learning_rate": 1.8658793549315868e-06, + "loss": 1.1125, + "step": 309 + }, + { + "epoch": 0.5154853460818957, + "grad_norm": 2.129495598032236, + "learning_rate": 1.8650015116024077e-06, + "loss": 1.0833, + "step": 310 + }, + { + "epoch": 0.5171482020369985, + "grad_norm": 4.815386485319442, + "learning_rate": 1.8641210127144328e-06, + "loss": 1.0958, + "step": 311 + }, + { + "epoch": 0.5188110579921015, + "grad_norm": 2.1912355764465383, + "learning_rate": 1.8632378609707967e-06, + "loss": 1.1466, + "step": 312 + }, + { + "epoch": 0.5204739139472043, + "grad_norm": 2.6871843307281504, + "learning_rate": 1.8623520590827797e-06, + "loss": 1.0443, + "step": 313 + }, + { + "epoch": 0.5221367699023072, + "grad_norm": 2.229171454549482, + "learning_rate": 1.8614636097697983e-06, + "loss": 1.2176, + "step": 314 + }, + { + "epoch": 0.5237996258574101, + "grad_norm": 2.3499709800000343, + "learning_rate": 1.8605725157593957e-06, + "loss": 1.0718, + "step": 315 + }, + { + "epoch": 0.525462481812513, + "grad_norm": 2.128381081935293, + "learning_rate": 1.8596787797872353e-06, + "loss": 1.0168, + "step": 316 + }, + { + "epoch": 0.5271253377676158, + "grad_norm": 2.4014872942261847, + "learning_rate": 1.85878240459709e-06, + "loss": 1.1539, + "step": 317 + }, + { + "epoch": 0.5287881937227188, + "grad_norm": 2.265996337760467, + "learning_rate": 1.857883392940837e-06, + "loss": 1.1284, + "step": 318 + }, + { + "epoch": 0.5304510496778216, + "grad_norm": 2.1606204777164826, + "learning_rate": 1.8569817475784457e-06, + "loss": 1.1039, + "step": 319 + }, + { + "epoch": 0.5321139056329246, + "grad_norm": 2.314221527250599, + "learning_rate": 1.8560774712779719e-06, + "loss": 1.1662, + "step": 320 + }, + { + "epoch": 0.5337767615880274, + "grad_norm": 2.2036586985620064, + "learning_rate": 1.8551705668155479e-06, + "loss": 1.0452, + "step": 321 + }, + { + "epoch": 0.5354396175431303, + "grad_norm": 2.749989537581693, + "learning_rate": 1.8542610369753753e-06, + "loss": 1.1471, + "step": 322 + }, + { + "epoch": 0.5371024734982333, + "grad_norm": 2.368917726575034, + "learning_rate": 1.8533488845497146e-06, + "loss": 1.3107, + "step": 323 + }, + { + "epoch": 0.5387653294533361, + "grad_norm": 2.1977699456063298, + "learning_rate": 1.8524341123388787e-06, + "loss": 0.9482, + "step": 324 + }, + { + "epoch": 0.540428185408439, + "grad_norm": 2.3335155676558625, + "learning_rate": 1.851516723151222e-06, + "loss": 1.0011, + "step": 325 + }, + { + "epoch": 0.5420910413635419, + "grad_norm": 2.2010547018844857, + "learning_rate": 1.850596719803134e-06, + "loss": 1.2211, + "step": 326 + }, + { + "epoch": 0.5437538973186448, + "grad_norm": 2.077916314192133, + "learning_rate": 1.8496741051190298e-06, + "loss": 1.1015, + "step": 327 + }, + { + "epoch": 0.5454167532737476, + "grad_norm": 2.164449806731655, + "learning_rate": 
1.84874888193134e-06, + "loss": 0.9801, + "step": 328 + }, + { + "epoch": 0.5470796092288506, + "grad_norm": 2.098902846890324, + "learning_rate": 1.847821053080505e-06, + "loss": 0.9848, + "step": 329 + }, + { + "epoch": 0.5487424651839534, + "grad_norm": 2.3723967966499253, + "learning_rate": 1.8468906214149636e-06, + "loss": 1.1727, + "step": 330 + }, + { + "epoch": 0.5504053211390564, + "grad_norm": 2.28191518531219, + "learning_rate": 1.8459575897911453e-06, + "loss": 1.2235, + "step": 331 + }, + { + "epoch": 0.5520681770941592, + "grad_norm": 2.2820229639301406, + "learning_rate": 1.845021961073462e-06, + "loss": 1.2018, + "step": 332 + }, + { + "epoch": 0.5537310330492621, + "grad_norm": 2.138213918755252, + "learning_rate": 1.844083738134298e-06, + "loss": 1.2856, + "step": 333 + }, + { + "epoch": 0.555393889004365, + "grad_norm": 2.158628371398021, + "learning_rate": 1.8431429238540027e-06, + "loss": 1.0644, + "step": 334 + }, + { + "epoch": 0.5570567449594679, + "grad_norm": 2.3156623015320377, + "learning_rate": 1.84219952112088e-06, + "loss": 1.0024, + "step": 335 + }, + { + "epoch": 0.5587196009145707, + "grad_norm": 2.1704636280326923, + "learning_rate": 1.8412535328311812e-06, + "loss": 1.1603, + "step": 336 + }, + { + "epoch": 0.5603824568696737, + "grad_norm": 2.324581658968828, + "learning_rate": 1.8403049618890948e-06, + "loss": 1.2107, + "step": 337 + }, + { + "epoch": 0.5620453128247765, + "grad_norm": 2.1819124113613504, + "learning_rate": 1.8393538112067376e-06, + "loss": 1.0166, + "step": 338 + }, + { + "epoch": 0.5637081687798794, + "grad_norm": 2.14128534614559, + "learning_rate": 1.8384000837041476e-06, + "loss": 1.0348, + "step": 339 + }, + { + "epoch": 0.5653710247349824, + "grad_norm": 2.2165793831097034, + "learning_rate": 1.8374437823092722e-06, + "loss": 1.1544, + "step": 340 + }, + { + "epoch": 0.5670338806900852, + "grad_norm": 2.0867661916904456, + "learning_rate": 1.8364849099579618e-06, + "loss": 1.074, + "step": 341 + }, + { + "epoch": 0.5686967366451882, + "grad_norm": 2.278047999899843, + "learning_rate": 1.8355234695939585e-06, + "loss": 1.1037, + "step": 342 + }, + { + "epoch": 0.570359592600291, + "grad_norm": 2.15233238553855, + "learning_rate": 1.8345594641688894e-06, + "loss": 1.0763, + "step": 343 + }, + { + "epoch": 0.5720224485553939, + "grad_norm": 2.1861191898315067, + "learning_rate": 1.8335928966422553e-06, + "loss": 1.273, + "step": 344 + }, + { + "epoch": 0.5736853045104968, + "grad_norm": 2.191986681142867, + "learning_rate": 1.8326237699814238e-06, + "loss": 1.1571, + "step": 345 + }, + { + "epoch": 0.5753481604655997, + "grad_norm": 2.2910127446336737, + "learning_rate": 1.8316520871616178e-06, + "loss": 1.0243, + "step": 346 + }, + { + "epoch": 0.5770110164207025, + "grad_norm": 2.088235032433896, + "learning_rate": 1.8306778511659085e-06, + "loss": 1.0289, + "step": 347 + }, + { + "epoch": 0.5786738723758055, + "grad_norm": 2.213333845589627, + "learning_rate": 1.829701064985205e-06, + "loss": 1.1229, + "step": 348 + }, + { + "epoch": 0.5803367283309083, + "grad_norm": 2.3961880269672626, + "learning_rate": 1.8287217316182457e-06, + "loss": 0.9937, + "step": 349 + }, + { + "epoch": 0.5819995842860112, + "grad_norm": 2.052444834442602, + "learning_rate": 1.8277398540715887e-06, + "loss": 1.1965, + "step": 350 + }, + { + "epoch": 0.5836624402411141, + "grad_norm": 2.410438103487435, + "learning_rate": 1.8267554353596024e-06, + "loss": 1.1488, + "step": 351 + }, + { + "epoch": 0.585325296196217, + "grad_norm": 2.3043831544316835, + 
"learning_rate": 1.8257684785044576e-06, + "loss": 0.9968, + "step": 352 + }, + { + "epoch": 0.5869881521513199, + "grad_norm": 2.116368023415211, + "learning_rate": 1.8247789865361163e-06, + "loss": 1.0944, + "step": 353 + }, + { + "epoch": 0.5886510081064228, + "grad_norm": 2.0486087552163146, + "learning_rate": 1.8237869624923234e-06, + "loss": 1.127, + "step": 354 + }, + { + "epoch": 0.5903138640615256, + "grad_norm": 2.2869039816178036, + "learning_rate": 1.8227924094185978e-06, + "loss": 1.0497, + "step": 355 + }, + { + "epoch": 0.5919767200166286, + "grad_norm": 2.1915186885351448, + "learning_rate": 1.821795330368222e-06, + "loss": 1.1584, + "step": 356 + }, + { + "epoch": 0.5936395759717314, + "grad_norm": 2.4441273557487566, + "learning_rate": 1.8207957284022337e-06, + "loss": 1.1206, + "step": 357 + }, + { + "epoch": 0.5953024319268343, + "grad_norm": 2.3161936585249334, + "learning_rate": 1.8197936065894155e-06, + "loss": 1.119, + "step": 358 + }, + { + "epoch": 0.5969652878819373, + "grad_norm": 2.303219346593548, + "learning_rate": 1.8187889680062863e-06, + "loss": 1.2847, + "step": 359 + }, + { + "epoch": 0.5986281438370401, + "grad_norm": 2.1218629296829152, + "learning_rate": 1.8177818157370912e-06, + "loss": 1.0495, + "step": 360 + }, + { + "epoch": 0.600290999792143, + "grad_norm": 2.055562774852349, + "learning_rate": 1.816772152873793e-06, + "loss": 1.1212, + "step": 361 + }, + { + "epoch": 0.6019538557472459, + "grad_norm": 2.1036164422163695, + "learning_rate": 1.8157599825160607e-06, + "loss": 0.9293, + "step": 362 + }, + { + "epoch": 0.6036167117023488, + "grad_norm": 2.2429881312173596, + "learning_rate": 1.8147453077712634e-06, + "loss": 1.134, + "step": 363 + }, + { + "epoch": 0.6052795676574517, + "grad_norm": 2.147826876347206, + "learning_rate": 1.813728131754456e-06, + "loss": 1.0184, + "step": 364 + }, + { + "epoch": 0.6069424236125546, + "grad_norm": 2.221807143035665, + "learning_rate": 1.8127084575883748e-06, + "loss": 1.1232, + "step": 365 + }, + { + "epoch": 0.6086052795676574, + "grad_norm": 2.608531400881515, + "learning_rate": 1.811686288403424e-06, + "loss": 1.166, + "step": 366 + }, + { + "epoch": 0.6102681355227604, + "grad_norm": 2.3488837184353937, + "learning_rate": 1.8106616273376681e-06, + "loss": 1.0413, + "step": 367 + }, + { + "epoch": 0.6119309914778632, + "grad_norm": 2.2777707823295668, + "learning_rate": 1.8096344775368211e-06, + "loss": 1.0941, + "step": 368 + }, + { + "epoch": 0.6135938474329661, + "grad_norm": 2.2605130570731253, + "learning_rate": 1.808604842154238e-06, + "loss": 1.0537, + "step": 369 + }, + { + "epoch": 0.615256703388069, + "grad_norm": 2.292940209107732, + "learning_rate": 1.807572724350905e-06, + "loss": 1.0708, + "step": 370 + }, + { + "epoch": 0.6169195593431719, + "grad_norm": 2.10745264157216, + "learning_rate": 1.8065381272954276e-06, + "loss": 0.9477, + "step": 371 + }, + { + "epoch": 0.6185824152982747, + "grad_norm": 2.1624162140056176, + "learning_rate": 1.8055010541640243e-06, + "loss": 1.0009, + "step": 372 + }, + { + "epoch": 0.6202452712533777, + "grad_norm": 2.4110174672424347, + "learning_rate": 1.8044615081405151e-06, + "loss": 1.082, + "step": 373 + }, + { + "epoch": 0.6219081272084805, + "grad_norm": 2.3314352780690166, + "learning_rate": 1.8034194924163103e-06, + "loss": 1.2243, + "step": 374 + }, + { + "epoch": 0.6235709831635835, + "grad_norm": 2.2871677222090137, + "learning_rate": 1.802375010190404e-06, + "loss": 0.9866, + "step": 375 + }, + { + "epoch": 0.6252338391186864, + "grad_norm": 
2.3308466745130465, + "learning_rate": 1.8013280646693612e-06, + "loss": 1.0981, + "step": 376 + }, + { + "epoch": 0.6268966950737892, + "grad_norm": 2.211137881920983, + "learning_rate": 1.8002786590673096e-06, + "loss": 1.0953, + "step": 377 + }, + { + "epoch": 0.6285595510288922, + "grad_norm": 2.2653592375727727, + "learning_rate": 1.7992267966059298e-06, + "loss": 1.0843, + "step": 378 + }, + { + "epoch": 0.630222406983995, + "grad_norm": 2.0429297450824024, + "learning_rate": 1.7981724805144443e-06, + "loss": 1.0853, + "step": 379 + }, + { + "epoch": 0.6318852629390979, + "grad_norm": 2.2106979108018794, + "learning_rate": 1.7971157140296088e-06, + "loss": 1.0946, + "step": 380 + }, + { + "epoch": 0.6335481188942008, + "grad_norm": 2.2495780703298025, + "learning_rate": 1.7960565003957016e-06, + "loss": 1.1758, + "step": 381 + }, + { + "epoch": 0.6352109748493037, + "grad_norm": 2.442736100176031, + "learning_rate": 1.7949948428645133e-06, + "loss": 1.0848, + "step": 382 + }, + { + "epoch": 0.6368738308044065, + "grad_norm": 2.1142990741185286, + "learning_rate": 1.7939307446953378e-06, + "loss": 1.0653, + "step": 383 + }, + { + "epoch": 0.6385366867595095, + "grad_norm": 2.0910930966792782, + "learning_rate": 1.7928642091549612e-06, + "loss": 0.9725, + "step": 384 + }, + { + "epoch": 0.6401995427146123, + "grad_norm": 2.278583439400143, + "learning_rate": 1.7917952395176535e-06, + "loss": 1.0259, + "step": 385 + }, + { + "epoch": 0.6418623986697153, + "grad_norm": 2.545286223573369, + "learning_rate": 1.790723839065156e-06, + "loss": 1.1821, + "step": 386 + }, + { + "epoch": 0.6435252546248181, + "grad_norm": 2.261488639334733, + "learning_rate": 1.7896500110866737e-06, + "loss": 0.939, + "step": 387 + }, + { + "epoch": 0.645188110579921, + "grad_norm": 2.7678067419857597, + "learning_rate": 1.7885737588788632e-06, + "loss": 1.0408, + "step": 388 + }, + { + "epoch": 0.6468509665350239, + "grad_norm": 2.11635682407254, + "learning_rate": 1.787495085745824e-06, + "loss": 1.1696, + "step": 389 + }, + { + "epoch": 0.6485138224901268, + "grad_norm": 2.3192534757366703, + "learning_rate": 1.7864139949990882e-06, + "loss": 0.9589, + "step": 390 + }, + { + "epoch": 0.6501766784452296, + "grad_norm": 2.150754996425321, + "learning_rate": 1.7853304899576091e-06, + "loss": 1.1067, + "step": 391 + }, + { + "epoch": 0.6518395344003326, + "grad_norm": 2.2383513192031574, + "learning_rate": 1.784244573947753e-06, + "loss": 1.3417, + "step": 392 + }, + { + "epoch": 0.6535023903554354, + "grad_norm": 2.290710629399966, + "learning_rate": 1.7831562503032865e-06, + "loss": 1.1442, + "step": 393 + }, + { + "epoch": 0.6551652463105383, + "grad_norm": 3.3410043284621143, + "learning_rate": 1.7820655223653689e-06, + "loss": 1.2796, + "step": 394 + }, + { + "epoch": 0.6568281022656413, + "grad_norm": 2.1116250875932727, + "learning_rate": 1.7809723934825402e-06, + "loss": 1.0086, + "step": 395 + }, + { + "epoch": 0.6584909582207441, + "grad_norm": 2.242523763487976, + "learning_rate": 1.7798768670107113e-06, + "loss": 1.0907, + "step": 396 + }, + { + "epoch": 0.6601538141758471, + "grad_norm": 2.151880661955345, + "learning_rate": 1.7787789463131535e-06, + "loss": 0.9459, + "step": 397 + }, + { + "epoch": 0.6618166701309499, + "grad_norm": 2.1313600148722047, + "learning_rate": 1.777678634760489e-06, + "loss": 1.1144, + "step": 398 + }, + { + "epoch": 0.6634795260860528, + "grad_norm": 2.445945952206475, + "learning_rate": 1.7765759357306793e-06, + "loss": 1.1141, + "step": 399 + }, + { + "epoch": 
0.6651423820411557, + "grad_norm": 2.1049458345635417, + "learning_rate": 1.7754708526090155e-06, + "loss": 0.9842, + "step": 400 + }, + { + "epoch": 0.6668052379962586, + "grad_norm": 2.4326727008731135, + "learning_rate": 1.7743633887881088e-06, + "loss": 1.1459, + "step": 401 + }, + { + "epoch": 0.6684680939513614, + "grad_norm": 2.2415440353250022, + "learning_rate": 1.7732535476678776e-06, + "loss": 1.0153, + "step": 402 + }, + { + "epoch": 0.6701309499064644, + "grad_norm": 2.6676880307885034, + "learning_rate": 1.77214133265554e-06, + "loss": 1.216, + "step": 403 + }, + { + "epoch": 0.6717938058615672, + "grad_norm": 2.259873210911179, + "learning_rate": 1.7710267471656013e-06, + "loss": 1.1607, + "step": 404 + }, + { + "epoch": 0.6734566618166701, + "grad_norm": 2.1060724600194627, + "learning_rate": 1.7699097946198443e-06, + "loss": 1.0631, + "step": 405 + }, + { + "epoch": 0.675119517771773, + "grad_norm": 2.1821450546259067, + "learning_rate": 1.7687904784473186e-06, + "loss": 1.0213, + "step": 406 + }, + { + "epoch": 0.6767823737268759, + "grad_norm": 2.043314834472435, + "learning_rate": 1.7676688020843305e-06, + "loss": 1.1173, + "step": 407 + }, + { + "epoch": 0.6784452296819788, + "grad_norm": 2.342119646043276, + "learning_rate": 1.7665447689744317e-06, + "loss": 1.0894, + "step": 408 + }, + { + "epoch": 0.6801080856370817, + "grad_norm": 2.0782278008731687, + "learning_rate": 1.7654183825684091e-06, + "loss": 1.1245, + "step": 409 + }, + { + "epoch": 0.6817709415921845, + "grad_norm": 2.0556118504588388, + "learning_rate": 1.7642896463242744e-06, + "loss": 1.0274, + "step": 410 + }, + { + "epoch": 0.6834337975472875, + "grad_norm": 2.4517557799237957, + "learning_rate": 1.7631585637072535e-06, + "loss": 1.1762, + "step": 411 + }, + { + "epoch": 0.6850966535023904, + "grad_norm": 2.2629874321423635, + "learning_rate": 1.7620251381897751e-06, + "loss": 0.9806, + "step": 412 + }, + { + "epoch": 0.6867595094574932, + "grad_norm": 2.33240157213656, + "learning_rate": 1.7608893732514615e-06, + "loss": 1.156, + "step": 413 + }, + { + "epoch": 0.6884223654125962, + "grad_norm": 2.179297842815191, + "learning_rate": 1.7597512723791162e-06, + "loss": 1.0117, + "step": 414 + }, + { + "epoch": 0.690085221367699, + "grad_norm": 2.1949726299406045, + "learning_rate": 1.7586108390667142e-06, + "loss": 1.0476, + "step": 415 + }, + { + "epoch": 0.6917480773228019, + "grad_norm": 2.3336572022760884, + "learning_rate": 1.7574680768153915e-06, + "loss": 1.1267, + "step": 416 + }, + { + "epoch": 0.6934109332779048, + "grad_norm": 2.3983506966023542, + "learning_rate": 1.7563229891334336e-06, + "loss": 1.034, + "step": 417 + }, + { + "epoch": 0.6950737892330077, + "grad_norm": 2.0848820780573476, + "learning_rate": 1.7551755795362654e-06, + "loss": 0.9663, + "step": 418 + }, + { + "epoch": 0.6967366451881106, + "grad_norm": 2.2371795871187063, + "learning_rate": 1.7540258515464395e-06, + "loss": 1.1261, + "step": 419 + }, + { + "epoch": 0.6983995011432135, + "grad_norm": 2.1100352921599437, + "learning_rate": 1.7528738086936269e-06, + "loss": 1.0711, + "step": 420 + }, + { + "epoch": 0.7000623570983163, + "grad_norm": 2.147207122561456, + "learning_rate": 1.7517194545146036e-06, + "loss": 1.0874, + "step": 421 + }, + { + "epoch": 0.7017252130534193, + "grad_norm": 2.1810803049772645, + "learning_rate": 1.750562792553244e-06, + "loss": 1.0132, + "step": 422 + }, + { + "epoch": 0.7033880690085221, + "grad_norm": 2.155938991100877, + "learning_rate": 1.7494038263605049e-06, + "loss": 1.121, + 
"step": 423 + }, + { + "epoch": 0.705050924963625, + "grad_norm": 2.368347963131549, + "learning_rate": 1.7482425594944182e-06, + "loss": 1.1172, + "step": 424 + }, + { + "epoch": 0.7067137809187279, + "grad_norm": 2.2271099712267013, + "learning_rate": 1.7470789955200786e-06, + "loss": 1.0263, + "step": 425 + }, + { + "epoch": 0.7083766368738308, + "grad_norm": 2.229936898551502, + "learning_rate": 1.7459131380096336e-06, + "loss": 1.0274, + "step": 426 + }, + { + "epoch": 0.7100394928289336, + "grad_norm": 2.158588860368013, + "learning_rate": 1.744744990542271e-06, + "loss": 1.1544, + "step": 427 + }, + { + "epoch": 0.7117023487840366, + "grad_norm": 2.161831442303464, + "learning_rate": 1.7435745567042094e-06, + "loss": 1.0847, + "step": 428 + }, + { + "epoch": 0.7133652047391394, + "grad_norm": 2.169173816530374, + "learning_rate": 1.7424018400886858e-06, + "loss": 1.042, + "step": 429 + }, + { + "epoch": 0.7150280606942424, + "grad_norm": 2.2018091926987964, + "learning_rate": 1.7412268442959465e-06, + "loss": 1.0173, + "step": 430 + }, + { + "epoch": 0.7166909166493453, + "grad_norm": 2.2558519988159045, + "learning_rate": 1.7400495729332337e-06, + "loss": 1.2228, + "step": 431 + }, + { + "epoch": 0.7183537726044481, + "grad_norm": 2.286849860794016, + "learning_rate": 1.7388700296147763e-06, + "loss": 1.1118, + "step": 432 + }, + { + "epoch": 0.7200166285595511, + "grad_norm": 2.376710954298681, + "learning_rate": 1.737688217961778e-06, + "loss": 1.221, + "step": 433 + }, + { + "epoch": 0.7216794845146539, + "grad_norm": 2.165270182256038, + "learning_rate": 1.7365041416024063e-06, + "loss": 1.0928, + "step": 434 + }, + { + "epoch": 0.7233423404697568, + "grad_norm": 2.039272996802367, + "learning_rate": 1.7353178041717814e-06, + "loss": 1.0993, + "step": 435 + }, + { + "epoch": 0.7250051964248597, + "grad_norm": 3.179799411220361, + "learning_rate": 1.7341292093119648e-06, + "loss": 1.1437, + "step": 436 + }, + { + "epoch": 0.7266680523799626, + "grad_norm": 2.438107959779604, + "learning_rate": 1.732938360671948e-06, + "loss": 1.0467, + "step": 437 + }, + { + "epoch": 0.7283309083350655, + "grad_norm": 2.227901031363515, + "learning_rate": 1.7317452619076426e-06, + "loss": 1.2184, + "step": 438 + }, + { + "epoch": 0.7299937642901684, + "grad_norm": 2.484925764202615, + "learning_rate": 1.7305499166818679e-06, + "loss": 1.0869, + "step": 439 + }, + { + "epoch": 0.7316566202452712, + "grad_norm": 2.328525821533016, + "learning_rate": 1.7293523286643383e-06, + "loss": 1.1372, + "step": 440 + }, + { + "epoch": 0.7333194762003742, + "grad_norm": 2.3975391729646014, + "learning_rate": 1.7281525015316559e-06, + "loss": 1.164, + "step": 441 + }, + { + "epoch": 0.734982332155477, + "grad_norm": 2.045522380294782, + "learning_rate": 1.726950438967295e-06, + "loss": 0.9614, + "step": 442 + }, + { + "epoch": 0.7366451881105799, + "grad_norm": 2.233756657460333, + "learning_rate": 1.7257461446615939e-06, + "loss": 1.2181, + "step": 443 + }, + { + "epoch": 0.7383080440656828, + "grad_norm": 2.2137748173388694, + "learning_rate": 1.724539622311742e-06, + "loss": 1.143, + "step": 444 + }, + { + "epoch": 0.7399709000207857, + "grad_norm": 2.384585810876256, + "learning_rate": 1.723330875621768e-06, + "loss": 1.0785, + "step": 445 + }, + { + "epoch": 0.7416337559758885, + "grad_norm": 2.3185815551148585, + "learning_rate": 1.7221199083025305e-06, + "loss": 1.0606, + "step": 446 + }, + { + "epoch": 0.7432966119309915, + "grad_norm": 2.1918576903211053, + "learning_rate": 1.7209067240717055e-06, + 
"loss": 1.0824, + "step": 447 + }, + { + "epoch": 0.7449594678860944, + "grad_norm": 2.353863396233425, + "learning_rate": 1.7196913266537736e-06, + "loss": 1.1912, + "step": 448 + }, + { + "epoch": 0.7466223238411973, + "grad_norm": 2.0380687806247253, + "learning_rate": 1.7184737197800113e-06, + "loss": 1.0981, + "step": 449 + }, + { + "epoch": 0.7482851797963002, + "grad_norm": 2.5650954285686245, + "learning_rate": 1.717253907188477e-06, + "loss": 1.0268, + "step": 450 + }, + { + "epoch": 0.749948035751403, + "grad_norm": 2.3190693587689464, + "learning_rate": 1.7160318926240014e-06, + "loss": 1.1847, + "step": 451 + }, + { + "epoch": 0.751610891706506, + "grad_norm": 2.1059332821710783, + "learning_rate": 1.7148076798381754e-06, + "loss": 1.0832, + "step": 452 + }, + { + "epoch": 0.7532737476616088, + "grad_norm": 3.056086058439594, + "learning_rate": 1.713581272589338e-06, + "loss": 1.1343, + "step": 453 + }, + { + "epoch": 0.7532737476616088, + "eval_loss": 1.3934024572372437, + "eval_runtime": 24.6818, + "eval_samples_per_second": 0.446, + "eval_steps_per_second": 0.122, + "step": 453 + }, + { + "epoch": 0.7549366036167117, + "grad_norm": 2.4488905684680455, + "learning_rate": 1.7123526746425649e-06, + "loss": 1.0994, + "step": 454 + }, + { + "epoch": 0.7565994595718146, + "grad_norm": 2.6403911690075215, + "learning_rate": 1.7111218897696585e-06, + "loss": 1.0764, + "step": 455 + }, + { + "epoch": 0.7582623155269175, + "grad_norm": 2.1327294883754577, + "learning_rate": 1.7098889217491336e-06, + "loss": 1.1461, + "step": 456 + }, + { + "epoch": 0.7599251714820203, + "grad_norm": 2.0936769499266, + "learning_rate": 1.7086537743662084e-06, + "loss": 1.1827, + "step": 457 + }, + { + "epoch": 0.7615880274371233, + "grad_norm": 2.0368197792938814, + "learning_rate": 1.707416451412791e-06, + "loss": 1.0709, + "step": 458 + }, + { + "epoch": 0.7632508833922261, + "grad_norm": 2.1159264469009154, + "learning_rate": 1.7061769566874688e-06, + "loss": 1.1597, + "step": 459 + }, + { + "epoch": 0.7649137393473291, + "grad_norm": 2.2371503881264134, + "learning_rate": 1.7049352939954966e-06, + "loss": 1.1034, + "step": 460 + }, + { + "epoch": 0.7665765953024319, + "grad_norm": 2.167369770231308, + "learning_rate": 1.7036914671487849e-06, + "loss": 1.1398, + "step": 461 + }, + { + "epoch": 0.7682394512575348, + "grad_norm": 2.682964801749785, + "learning_rate": 1.7024454799658883e-06, + "loss": 1.1015, + "step": 462 + }, + { + "epoch": 0.7699023072126377, + "grad_norm": 2.3132284135399903, + "learning_rate": 1.7011973362719929e-06, + "loss": 0.9757, + "step": 463 + }, + { + "epoch": 0.7715651631677406, + "grad_norm": 2.226364690832632, + "learning_rate": 1.6999470398989066e-06, + "loss": 1.0315, + "step": 464 + }, + { + "epoch": 0.7732280191228434, + "grad_norm": 2.333679960095519, + "learning_rate": 1.6986945946850445e-06, + "loss": 1.1647, + "step": 465 + }, + { + "epoch": 0.7748908750779464, + "grad_norm": 2.1371349082975883, + "learning_rate": 1.6974400044754198e-06, + "loss": 1.0292, + "step": 466 + }, + { + "epoch": 0.7765537310330493, + "grad_norm": 2.044267477405619, + "learning_rate": 1.6961832731216305e-06, + "loss": 1.0265, + "step": 467 + }, + { + "epoch": 0.7782165869881521, + "grad_norm": 2.4049976847050987, + "learning_rate": 1.694924404481848e-06, + "loss": 1.0263, + "step": 468 + }, + { + "epoch": 0.7798794429432551, + "grad_norm": 2.1513258273801097, + "learning_rate": 1.6936634024208045e-06, + "loss": 1.0255, + "step": 469 + }, + { + "epoch": 0.7815422988983579, + "grad_norm": 
2.267086285542123, + "learning_rate": 1.692400270809783e-06, + "loss": 1.1037, + "step": 470 + }, + { + "epoch": 0.7832051548534609, + "grad_norm": 2.207206827718745, + "learning_rate": 1.6911350135266034e-06, + "loss": 1.2229, + "step": 471 + }, + { + "epoch": 0.7848680108085637, + "grad_norm": 2.3370445622010267, + "learning_rate": 1.6898676344556116e-06, + "loss": 1.0456, + "step": 472 + }, + { + "epoch": 0.7865308667636666, + "grad_norm": 2.694579739504598, + "learning_rate": 1.6885981374876675e-06, + "loss": 1.1304, + "step": 473 + }, + { + "epoch": 0.7881937227187695, + "grad_norm": 2.4654672927693024, + "learning_rate": 1.6873265265201329e-06, + "loss": 1.1218, + "step": 474 + }, + { + "epoch": 0.7898565786738724, + "grad_norm": 2.4497013813355504, + "learning_rate": 1.6860528054568596e-06, + "loss": 1.188, + "step": 475 + }, + { + "epoch": 0.7915194346289752, + "grad_norm": 2.2437903296852517, + "learning_rate": 1.684776978208177e-06, + "loss": 1.0714, + "step": 476 + }, + { + "epoch": 0.7931822905840782, + "grad_norm": 2.430879750782421, + "learning_rate": 1.6834990486908816e-06, + "loss": 1.1451, + "step": 477 + }, + { + "epoch": 0.794845146539181, + "grad_norm": 2.1676565673117265, + "learning_rate": 1.6822190208282226e-06, + "loss": 1.0458, + "step": 478 + }, + { + "epoch": 0.7965080024942839, + "grad_norm": 2.179125948082596, + "learning_rate": 1.6809368985498918e-06, + "loss": 1.125, + "step": 479 + }, + { + "epoch": 0.7981708584493868, + "grad_norm": 2.277708398927927, + "learning_rate": 1.679652685792011e-06, + "loss": 1.0762, + "step": 480 + }, + { + "epoch": 0.7998337144044897, + "grad_norm": 2.2097528761591834, + "learning_rate": 1.6783663864971191e-06, + "loss": 1.0587, + "step": 481 + }, + { + "epoch": 0.8014965703595927, + "grad_norm": 1.9951583919167484, + "learning_rate": 1.6770780046141614e-06, + "loss": 1.0065, + "step": 482 + }, + { + "epoch": 0.8031594263146955, + "grad_norm": 2.1163449529859815, + "learning_rate": 1.6757875440984765e-06, + "loss": 0.9654, + "step": 483 + }, + { + "epoch": 0.8048222822697984, + "grad_norm": 2.169660817074707, + "learning_rate": 1.6744950089117845e-06, + "loss": 1.109, + "step": 484 + }, + { + "epoch": 0.8064851382249013, + "grad_norm": 2.5007409181660596, + "learning_rate": 1.6732004030221743e-06, + "loss": 1.0073, + "step": 485 + }, + { + "epoch": 0.8081479941800042, + "grad_norm": 2.382682682731147, + "learning_rate": 1.6719037304040921e-06, + "loss": 1.117, + "step": 486 + }, + { + "epoch": 0.809810850135107, + "grad_norm": 2.1404776200804827, + "learning_rate": 1.6706049950383299e-06, + "loss": 1.1485, + "step": 487 + }, + { + "epoch": 0.81147370609021, + "grad_norm": 1.9674458939644788, + "learning_rate": 1.6693042009120104e-06, + "loss": 0.9832, + "step": 488 + }, + { + "epoch": 0.8131365620453128, + "grad_norm": 2.428436601155877, + "learning_rate": 1.6680013520185786e-06, + "loss": 1.1419, + "step": 489 + }, + { + "epoch": 0.8147994180004157, + "grad_norm": 2.252054044332263, + "learning_rate": 1.6666964523577866e-06, + "loss": 1.1008, + "step": 490 + }, + { + "epoch": 0.8164622739555186, + "grad_norm": 2.3185844238230167, + "learning_rate": 1.6653895059356827e-06, + "loss": 1.0961, + "step": 491 + }, + { + "epoch": 0.8181251299106215, + "grad_norm": 2.245128772330143, + "learning_rate": 1.6640805167645984e-06, + "loss": 1.0219, + "step": 492 + }, + { + "epoch": 0.8197879858657244, + "grad_norm": 2.2252839355087404, + "learning_rate": 1.6627694888631374e-06, + "loss": 1.1808, + "step": 493 + }, + { + "epoch": 
0.8214508418208273, + "grad_norm": 2.4802130292227487, + "learning_rate": 1.6614564262561608e-06, + "loss": 1.0952, + "step": 494 + }, + { + "epoch": 0.8231136977759301, + "grad_norm": 2.0600513070759594, + "learning_rate": 1.6601413329747778e-06, + "loss": 1.0461, + "step": 495 + }, + { + "epoch": 0.8247765537310331, + "grad_norm": 2.129782322403382, + "learning_rate": 1.6588242130563308e-06, + "loss": 1.0565, + "step": 496 + }, + { + "epoch": 0.8264394096861359, + "grad_norm": 2.24596861189698, + "learning_rate": 1.657505070544384e-06, + "loss": 1.2183, + "step": 497 + }, + { + "epoch": 0.8281022656412388, + "grad_norm": 2.2144169319672686, + "learning_rate": 1.6561839094887123e-06, + "loss": 1.056, + "step": 498 + }, + { + "epoch": 0.8297651215963417, + "grad_norm": 2.1326715488699346, + "learning_rate": 1.6548607339452852e-06, + "loss": 1.081, + "step": 499 + }, + { + "epoch": 0.8314279775514446, + "grad_norm": 2.400366586745486, + "learning_rate": 1.6535355479762584e-06, + "loss": 1.0388, + "step": 500 + }, + { + "epoch": 0.8330908335065474, + "grad_norm": 2.101276111938611, + "learning_rate": 1.6522083556499595e-06, + "loss": 1.13, + "step": 501 + }, + { + "epoch": 0.8347536894616504, + "grad_norm": 2.1474700452589017, + "learning_rate": 1.6508791610408751e-06, + "loss": 1.0743, + "step": 502 + }, + { + "epoch": 0.8364165454167533, + "grad_norm": 2.449496114081372, + "learning_rate": 1.6495479682296393e-06, + "loss": 1.1763, + "step": 503 + }, + { + "epoch": 0.8380794013718562, + "grad_norm": 2.2397418249566705, + "learning_rate": 1.6482147813030202e-06, + "loss": 1.0881, + "step": 504 + }, + { + "epoch": 0.8397422573269591, + "grad_norm": 2.38564204606019, + "learning_rate": 1.646879604353908e-06, + "loss": 0.9649, + "step": 505 + }, + { + "epoch": 0.8414051132820619, + "grad_norm": 2.51940590317821, + "learning_rate": 1.6455424414813024e-06, + "loss": 1.2114, + "step": 506 + }, + { + "epoch": 0.8430679692371649, + "grad_norm": 2.3181783821555175, + "learning_rate": 1.6442032967903e-06, + "loss": 1.1246, + "step": 507 + }, + { + "epoch": 0.8447308251922677, + "grad_norm": 2.159422324003234, + "learning_rate": 1.6428621743920812e-06, + "loss": 1.0351, + "step": 508 + }, + { + "epoch": 0.8463936811473706, + "grad_norm": 2.411802254120254, + "learning_rate": 1.6415190784038982e-06, + "loss": 1.0776, + "step": 509 + }, + { + "epoch": 0.8480565371024735, + "grad_norm": 2.0734882608633085, + "learning_rate": 1.6401740129490622e-06, + "loss": 1.1148, + "step": 510 + }, + { + "epoch": 0.8497193930575764, + "grad_norm": 2.318932880803621, + "learning_rate": 1.638826982156931e-06, + "loss": 1.0853, + "step": 511 + }, + { + "epoch": 0.8513822490126792, + "grad_norm": 2.1981900335907247, + "learning_rate": 1.637477990162895e-06, + "loss": 1.0151, + "step": 512 + }, + { + "epoch": 0.8530451049677822, + "grad_norm": 2.121049624802221, + "learning_rate": 1.6361270411083665e-06, + "loss": 1.0518, + "step": 513 + }, + { + "epoch": 0.854707960922885, + "grad_norm": 2.191890004201356, + "learning_rate": 1.6347741391407653e-06, + "loss": 1.0426, + "step": 514 + }, + { + "epoch": 0.856370816877988, + "grad_norm": 2.194218873534906, + "learning_rate": 1.6334192884135073e-06, + "loss": 1.1111, + "step": 515 + }, + { + "epoch": 0.8580336728330908, + "grad_norm": 2.0367103549252983, + "learning_rate": 1.6320624930859904e-06, + "loss": 1.0521, + "step": 516 + }, + { + "epoch": 0.8596965287881937, + "grad_norm": 2.5679919755305973, + "learning_rate": 1.630703757323583e-06, + "loss": 1.0547, + "step": 517 + }, 
+ { + "epoch": 0.8613593847432967, + "grad_norm": 2.2355473603338676, + "learning_rate": 1.62934308529761e-06, + "loss": 1.0582, + "step": 518 + }, + { + "epoch": 0.8630222406983995, + "grad_norm": 2.04568580636059, + "learning_rate": 1.6279804811853418e-06, + "loss": 0.9508, + "step": 519 + }, + { + "epoch": 0.8646850966535023, + "grad_norm": 2.137101454934, + "learning_rate": 1.6266159491699787e-06, + "loss": 1.05, + "step": 520 + }, + { + "epoch": 0.8663479526086053, + "grad_norm": 2.3456035912178126, + "learning_rate": 1.6252494934406413e-06, + "loss": 1.0513, + "step": 521 + }, + { + "epoch": 0.8680108085637082, + "grad_norm": 2.3170057893277645, + "learning_rate": 1.6238811181923548e-06, + "loss": 1.0833, + "step": 522 + }, + { + "epoch": 0.869673664518811, + "grad_norm": 2.03138518708311, + "learning_rate": 1.6225108276260384e-06, + "loss": 1.0733, + "step": 523 + }, + { + "epoch": 0.871336520473914, + "grad_norm": 2.1936879157391225, + "learning_rate": 1.6211386259484898e-06, + "loss": 1.1578, + "step": 524 + }, + { + "epoch": 0.8729993764290168, + "grad_norm": 2.162845347014475, + "learning_rate": 1.6197645173723755e-06, + "loss": 1.0716, + "step": 525 + }, + { + "epoch": 0.8746622323841198, + "grad_norm": 2.2797022460412335, + "learning_rate": 1.6183885061162149e-06, + "loss": 1.1435, + "step": 526 + }, + { + "epoch": 0.8763250883392226, + "grad_norm": 2.8402768433436965, + "learning_rate": 1.6170105964043693e-06, + "loss": 1.0986, + "step": 527 + }, + { + "epoch": 0.8779879442943255, + "grad_norm": 2.2040976685633726, + "learning_rate": 1.6156307924670287e-06, + "loss": 1.1769, + "step": 528 + }, + { + "epoch": 0.8796508002494284, + "grad_norm": 2.2121586340352724, + "learning_rate": 1.614249098540197e-06, + "loss": 1.0384, + "step": 529 + }, + { + "epoch": 0.8813136562045313, + "grad_norm": 2.1737348089144772, + "learning_rate": 1.6128655188656818e-06, + "loss": 1.1178, + "step": 530 + }, + { + "epoch": 0.8829765121596341, + "grad_norm": 2.045692860407394, + "learning_rate": 1.6114800576910789e-06, + "loss": 1.0526, + "step": 531 + }, + { + "epoch": 0.8846393681147371, + "grad_norm": 2.4254673272096094, + "learning_rate": 1.610092719269761e-06, + "loss": 1.1918, + "step": 532 + }, + { + "epoch": 0.8863022240698399, + "grad_norm": 2.2186502378953463, + "learning_rate": 1.6087035078608636e-06, + "loss": 1.0816, + "step": 533 + }, + { + "epoch": 0.8879650800249428, + "grad_norm": 2.1899563735102316, + "learning_rate": 1.6073124277292726e-06, + "loss": 1.077, + "step": 534 + }, + { + "epoch": 0.8896279359800457, + "grad_norm": 2.3420002228619183, + "learning_rate": 1.6059194831456105e-06, + "loss": 1.046, + "step": 535 + }, + { + "epoch": 0.8912907919351486, + "grad_norm": 2.217136239350302, + "learning_rate": 1.6045246783862237e-06, + "loss": 1.0914, + "step": 536 + }, + { + "epoch": 0.8929536478902516, + "grad_norm": 2.6270358774502, + "learning_rate": 1.6031280177331704e-06, + "loss": 1.2156, + "step": 537 + }, + { + "epoch": 0.8946165038453544, + "grad_norm": 2.4382518238543596, + "learning_rate": 1.6017295054742044e-06, + "loss": 1.0995, + "step": 538 + }, + { + "epoch": 0.8962793598004573, + "grad_norm": 2.3688360741829917, + "learning_rate": 1.6003291459027653e-06, + "loss": 1.0843, + "step": 539 + }, + { + "epoch": 0.8979422157555602, + "grad_norm": 2.208126828921951, + "learning_rate": 1.5989269433179644e-06, + "loss": 1.1147, + "step": 540 + }, + { + "epoch": 0.8996050717106631, + "grad_norm": 2.2413007358436188, + "learning_rate": 1.5975229020245697e-06, + "loss": 
1.1003, + "step": 541 + }, + { + "epoch": 0.9012679276657659, + "grad_norm": 2.226637939011591, + "learning_rate": 1.596117026332995e-06, + "loss": 0.9256, + "step": 542 + }, + { + "epoch": 0.9029307836208689, + "grad_norm": 2.117810788415696, + "learning_rate": 1.5947093205592851e-06, + "loss": 1.159, + "step": 543 + }, + { + "epoch": 0.9045936395759717, + "grad_norm": 2.0662799008829476, + "learning_rate": 1.593299789025104e-06, + "loss": 1.0951, + "step": 544 + }, + { + "epoch": 0.9062564955310746, + "grad_norm": 2.22383521867496, + "learning_rate": 1.5918884360577201e-06, + "loss": 1.0437, + "step": 545 + }, + { + "epoch": 0.9079193514861775, + "grad_norm": 2.1293539340887735, + "learning_rate": 1.5904752659899933e-06, + "loss": 1.1039, + "step": 546 + }, + { + "epoch": 0.9095822074412804, + "grad_norm": 2.1965800318638724, + "learning_rate": 1.5890602831603632e-06, + "loss": 1.0456, + "step": 547 + }, + { + "epoch": 0.9112450633963833, + "grad_norm": 2.084489657850369, + "learning_rate": 1.5876434919128334e-06, + "loss": 1.063, + "step": 548 + }, + { + "epoch": 0.9129079193514862, + "grad_norm": 2.167327544689684, + "learning_rate": 1.5862248965969603e-06, + "loss": 1.0414, + "step": 549 + }, + { + "epoch": 0.914570775306589, + "grad_norm": 2.0664162881784587, + "learning_rate": 1.584804501567838e-06, + "loss": 1.0985, + "step": 550 + }, + { + "epoch": 0.916233631261692, + "grad_norm": 2.319234065867708, + "learning_rate": 1.583382311186086e-06, + "loss": 0.9869, + "step": 551 + }, + { + "epoch": 0.9178964872167948, + "grad_norm": 2.2524367417375113, + "learning_rate": 1.581958329817836e-06, + "loss": 0.9564, + "step": 552 + }, + { + "epoch": 0.9195593431718977, + "grad_norm": 2.2424034311871184, + "learning_rate": 1.5805325618347169e-06, + "loss": 1.065, + "step": 553 + }, + { + "epoch": 0.9212221991270007, + "grad_norm": 2.337350211180452, + "learning_rate": 1.5791050116138438e-06, + "loss": 1.0277, + "step": 554 + }, + { + "epoch": 0.9228850550821035, + "grad_norm": 2.1419920030575206, + "learning_rate": 1.577675683537803e-06, + "loss": 0.9841, + "step": 555 + }, + { + "epoch": 0.9245479110372063, + "grad_norm": 2.2368638321185865, + "learning_rate": 1.5762445819946383e-06, + "loss": 1.1817, + "step": 556 + }, + { + "epoch": 0.9262107669923093, + "grad_norm": 2.1713305118749955, + "learning_rate": 1.5748117113778379e-06, + "loss": 1.0755, + "step": 557 + }, + { + "epoch": 0.9278736229474122, + "grad_norm": 2.288987960510324, + "learning_rate": 1.5733770760863219e-06, + "loss": 1.1393, + "step": 558 + }, + { + "epoch": 0.9295364789025151, + "grad_norm": 2.2628198071243526, + "learning_rate": 1.5719406805244274e-06, + "loss": 1.0763, + "step": 559 + }, + { + "epoch": 0.931199334857618, + "grad_norm": 2.164816303310072, + "learning_rate": 1.570502529101896e-06, + "loss": 1.1296, + "step": 560 + }, + { + "epoch": 0.9328621908127208, + "grad_norm": 2.198211513279556, + "learning_rate": 1.569062626233859e-06, + "loss": 1.1306, + "step": 561 + }, + { + "epoch": 0.9345250467678238, + "grad_norm": 2.0700768610981766, + "learning_rate": 1.5676209763408252e-06, + "loss": 1.0939, + "step": 562 + }, + { + "epoch": 0.9361879027229266, + "grad_norm": 2.2514400348833927, + "learning_rate": 1.5661775838486673e-06, + "loss": 1.1552, + "step": 563 + }, + { + "epoch": 0.9378507586780295, + "grad_norm": 2.090942303284215, + "learning_rate": 1.5647324531886064e-06, + "loss": 0.9839, + "step": 564 + }, + { + "epoch": 0.9395136146331324, + "grad_norm": 2.140109882341939, + "learning_rate": 
1.5632855887972007e-06, + "loss": 1.2005, + "step": 565 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 1.9572836049893243, + "learning_rate": 1.5618369951163316e-06, + "loss": 1.0442, + "step": 566 + }, + { + "epoch": 0.9428393265433381, + "grad_norm": 2.247679565843514, + "learning_rate": 1.5603866765931874e-06, + "loss": 1.0675, + "step": 567 + }, + { + "epoch": 0.9445021824984411, + "grad_norm": 2.16539672713286, + "learning_rate": 1.558934637680254e-06, + "loss": 1.0969, + "step": 568 + }, + { + "epoch": 0.9461650384535439, + "grad_norm": 2.136321120880899, + "learning_rate": 1.5574808828352977e-06, + "loss": 1.0832, + "step": 569 + }, + { + "epoch": 0.9478278944086469, + "grad_norm": 2.35022238962833, + "learning_rate": 1.556025416521352e-06, + "loss": 1.0707, + "step": 570 + }, + { + "epoch": 0.9494907503637497, + "grad_norm": 2.2709844918364186, + "learning_rate": 1.5545682432067063e-06, + "loss": 1.1081, + "step": 571 + }, + { + "epoch": 0.9511536063188526, + "grad_norm": 2.2603031964486706, + "learning_rate": 1.5531093673648897e-06, + "loss": 1.1301, + "step": 572 + }, + { + "epoch": 0.9528164622739556, + "grad_norm": 2.168121102440009, + "learning_rate": 1.5516487934746575e-06, + "loss": 1.054, + "step": 573 + }, + { + "epoch": 0.9544793182290584, + "grad_norm": 2.2875944656650526, + "learning_rate": 1.5501865260199794e-06, + "loss": 1.0521, + "step": 574 + }, + { + "epoch": 0.9561421741841613, + "grad_norm": 2.034798392016362, + "learning_rate": 1.5487225694900222e-06, + "loss": 0.9311, + "step": 575 + }, + { + "epoch": 0.9578050301392642, + "grad_norm": 2.3324178914152967, + "learning_rate": 1.547256928379141e-06, + "loss": 1.2792, + "step": 576 + }, + { + "epoch": 0.9594678860943671, + "grad_norm": 2.12207756608058, + "learning_rate": 1.54578960718686e-06, + "loss": 1.0741, + "step": 577 + }, + { + "epoch": 0.9611307420494699, + "grad_norm": 2.3421431613806085, + "learning_rate": 1.5443206104178627e-06, + "loss": 1.126, + "step": 578 + }, + { + "epoch": 0.9627935980045729, + "grad_norm": 2.2122877182310714, + "learning_rate": 1.5428499425819764e-06, + "loss": 1.135, + "step": 579 + }, + { + "epoch": 0.9644564539596757, + "grad_norm": 2.2047878214189485, + "learning_rate": 1.5413776081941578e-06, + "loss": 1.1399, + "step": 580 + }, + { + "epoch": 0.9661193099147787, + "grad_norm": 2.236152079611479, + "learning_rate": 1.5399036117744811e-06, + "loss": 1.1442, + "step": 581 + }, + { + "epoch": 0.9677821658698815, + "grad_norm": 2.1601189484933476, + "learning_rate": 1.538427957848122e-06, + "loss": 1.0039, + "step": 582 + }, + { + "epoch": 0.9694450218249844, + "grad_norm": 2.060582773193502, + "learning_rate": 1.5369506509453455e-06, + "loss": 1.0841, + "step": 583 + }, + { + "epoch": 0.9711078777800873, + "grad_norm": 2.456082310651417, + "learning_rate": 1.5354716956014909e-06, + "loss": 1.1203, + "step": 584 + }, + { + "epoch": 0.9727707337351902, + "grad_norm": 2.047282112632501, + "learning_rate": 1.5339910963569583e-06, + "loss": 0.9737, + "step": 585 + }, + { + "epoch": 0.974433589690293, + "grad_norm": 2.0171083884034444, + "learning_rate": 1.5325088577571937e-06, + "loss": 1.0301, + "step": 586 + }, + { + "epoch": 0.976096445645396, + "grad_norm": 2.3643508794229513, + "learning_rate": 1.5310249843526774e-06, + "loss": 1.1398, + "step": 587 + }, + { + "epoch": 0.9777593016004988, + "grad_norm": 2.167267117059674, + "learning_rate": 1.5295394806989076e-06, + "loss": 1.1058, + "step": 588 + }, + { + "epoch": 0.9794221575556017, + "grad_norm": 2.1579109019121674, 
+ "learning_rate": 1.5280523513563884e-06, + "loss": 0.9808, + "step": 589 + }, + { + "epoch": 0.9810850135107047, + "grad_norm": 2.2214933213880776, + "learning_rate": 1.526563600890613e-06, + "loss": 1.2226, + "step": 590 + }, + { + "epoch": 0.9827478694658075, + "grad_norm": 2.2694359795676085, + "learning_rate": 1.525073233872053e-06, + "loss": 1.0298, + "step": 591 + }, + { + "epoch": 0.9844107254209105, + "grad_norm": 2.115482333445717, + "learning_rate": 1.5235812548761424e-06, + "loss": 1.0619, + "step": 592 + }, + { + "epoch": 0.9860735813760133, + "grad_norm": 2.2939214488584883, + "learning_rate": 1.5220876684832638e-06, + "loss": 1.2193, + "step": 593 + }, + { + "epoch": 0.9877364373311162, + "grad_norm": 2.145954515190259, + "learning_rate": 1.5205924792787344e-06, + "loss": 1.1042, + "step": 594 + }, + { + "epoch": 0.9893992932862191, + "grad_norm": 2.1711020269771675, + "learning_rate": 1.5190956918527925e-06, + "loss": 1.0768, + "step": 595 + }, + { + "epoch": 0.991062149241322, + "grad_norm": 2.349827442265106, + "learning_rate": 1.517597310800582e-06, + "loss": 1.1737, + "step": 596 + }, + { + "epoch": 0.9927250051964248, + "grad_norm": 2.1677338271015074, + "learning_rate": 1.516097340722141e-06, + "loss": 1.1894, + "step": 597 + }, + { + "epoch": 0.9943878611515278, + "grad_norm": 2.37315650850326, + "learning_rate": 1.5145957862223843e-06, + "loss": 0.9953, + "step": 598 + }, + { + "epoch": 0.9960507171066306, + "grad_norm": 2.115914446964703, + "learning_rate": 1.5130926519110914e-06, + "loss": 1.1486, + "step": 599 + }, + { + "epoch": 0.9977135730617335, + "grad_norm": 2.2944478230358034, + "learning_rate": 1.5115879424028918e-06, + "loss": 1.01, + "step": 600 + }, + { + "epoch": 0.9993764290168364, + "grad_norm": 2.137749774673581, + "learning_rate": 1.5100816623172512e-06, + "loss": 1.1942, + "step": 601 + }, + { + "epoch": 1.0010392849719394, + "grad_norm": 2.1518759789580555, + "learning_rate": 1.5085738162784565e-06, + "loss": 1.0448, + "step": 602 + }, + { + "epoch": 1.0027021409270422, + "grad_norm": 2.0692500287896634, + "learning_rate": 1.5070644089156024e-06, + "loss": 1.0378, + "step": 603 + }, + { + "epoch": 1.0016614745586707, + "grad_norm": 2.230842725197309, + "learning_rate": 1.5055534448625764e-06, + "loss": 0.9973, + "step": 604 + }, + { + "epoch": 1.0016614745586707, + "eval_loss": 1.389662504196167, + "eval_runtime": 24.637, + "eval_samples_per_second": 0.446, + "eval_steps_per_second": 0.122, + "step": 604 + }, + { + "epoch": 1.0033229491173417, + "grad_norm": 2.009928207068121, + "learning_rate": 1.5040409287580457e-06, + "loss": 1.0293, + "step": 605 + }, + { + "epoch": 1.0049844236760124, + "grad_norm": 2.120439791656414, + "learning_rate": 1.5025268652454421e-06, + "loss": 1.048, + "step": 606 + }, + { + "epoch": 1.0066458982346833, + "grad_norm": 2.04371411144112, + "learning_rate": 1.501011258972948e-06, + "loss": 1.0411, + "step": 607 + }, + { + "epoch": 1.008307372793354, + "grad_norm": 1.9287047292797497, + "learning_rate": 1.4994941145934815e-06, + "loss": 0.8436, + "step": 608 + }, + { + "epoch": 1.009968847352025, + "grad_norm": 2.0850070547651067, + "learning_rate": 1.4979754367646833e-06, + "loss": 0.9103, + "step": 609 + }, + { + "epoch": 1.0116303219106957, + "grad_norm": 2.0786145289365203, + "learning_rate": 1.4964552301489018e-06, + "loss": 0.9403, + "step": 610 + }, + { + "epoch": 1.0132917964693666, + "grad_norm": 2.2212349830956652, + "learning_rate": 1.494933499413178e-06, + "loss": 1.1803, + "step": 611 + }, + { + "epoch": 
1.0149532710280373, + "grad_norm": 2.2692136037661843, + "learning_rate": 1.4934102492292336e-06, + "loss": 0.9701, + "step": 612 + }, + { + "epoch": 1.0166147455867083, + "grad_norm": 2.0584997416327253, + "learning_rate": 1.491885484273453e-06, + "loss": 1.0582, + "step": 613 + }, + { + "epoch": 1.018276220145379, + "grad_norm": 2.259916957406716, + "learning_rate": 1.4903592092268726e-06, + "loss": 0.9255, + "step": 614 + }, + { + "epoch": 1.01993769470405, + "grad_norm": 2.848368582871536, + "learning_rate": 1.4888314287751638e-06, + "loss": 0.9602, + "step": 615 + }, + { + "epoch": 1.0215991692627207, + "grad_norm": 2.760714314244588, + "learning_rate": 1.48730214760862e-06, + "loss": 0.9578, + "step": 616 + }, + { + "epoch": 1.0232606438213916, + "grad_norm": 2.23754202122278, + "learning_rate": 1.4857713704221419e-06, + "loss": 1.0313, + "step": 617 + }, + { + "epoch": 1.0249221183800623, + "grad_norm": 2.348557579974889, + "learning_rate": 1.4842391019152225e-06, + "loss": 1.0304, + "step": 618 + }, + { + "epoch": 1.026583592938733, + "grad_norm": 2.3247250951347485, + "learning_rate": 1.482705346791934e-06, + "loss": 0.9455, + "step": 619 + }, + { + "epoch": 1.028245067497404, + "grad_norm": 2.0361498175354997, + "learning_rate": 1.481170109760911e-06, + "loss": 0.9218, + "step": 620 + }, + { + "epoch": 1.0299065420560747, + "grad_norm": 2.214215564027277, + "learning_rate": 1.4796333955353395e-06, + "loss": 1.0163, + "step": 621 + }, + { + "epoch": 1.0315680166147456, + "grad_norm": 2.1719690959578037, + "learning_rate": 1.4780952088329394e-06, + "loss": 1.0134, + "step": 622 + }, + { + "epoch": 1.0332294911734163, + "grad_norm": 2.2204516984962357, + "learning_rate": 1.476555554375951e-06, + "loss": 0.9959, + "step": 623 + }, + { + "epoch": 1.0348909657320873, + "grad_norm": 2.184066671975778, + "learning_rate": 1.4750144368911207e-06, + "loss": 0.9662, + "step": 624 + }, + { + "epoch": 1.036552440290758, + "grad_norm": 2.265473071872864, + "learning_rate": 1.4734718611096874e-06, + "loss": 0.893, + "step": 625 + }, + { + "epoch": 1.038213914849429, + "grad_norm": 2.2673393121172083, + "learning_rate": 1.4719278317673654e-06, + "loss": 1.0177, + "step": 626 + }, + { + "epoch": 1.0398753894080996, + "grad_norm": 2.3668732645254806, + "learning_rate": 1.4703823536043324e-06, + "loss": 0.983, + "step": 627 + }, + { + "epoch": 1.0415368639667706, + "grad_norm": 2.131498764332704, + "learning_rate": 1.468835431365214e-06, + "loss": 1.0071, + "step": 628 + }, + { + "epoch": 1.0431983385254413, + "grad_norm": 2.149621804291588, + "learning_rate": 1.4672870697990686e-06, + "loss": 0.9003, + "step": 629 + }, + { + "epoch": 1.0448598130841122, + "grad_norm": 2.159649238111743, + "learning_rate": 1.4657372736593736e-06, + "loss": 1.0028, + "step": 630 + }, + { + "epoch": 1.046521287642783, + "grad_norm": 2.1986785581325, + "learning_rate": 1.464186047704011e-06, + "loss": 1.1123, + "step": 631 + }, + { + "epoch": 1.0481827622014537, + "grad_norm": 2.2105863636515726, + "learning_rate": 1.4626333966952518e-06, + "loss": 0.9999, + "step": 632 + }, + { + "epoch": 1.0498442367601246, + "grad_norm": 2.233749646735216, + "learning_rate": 1.4610793253997419e-06, + "loss": 0.9993, + "step": 633 + }, + { + "epoch": 1.0515057113187953, + "grad_norm": 2.143359223529294, + "learning_rate": 1.4595238385884878e-06, + "loss": 0.9324, + "step": 634 + }, + { + "epoch": 1.0531671858774663, + "grad_norm": 2.1270783200542325, + "learning_rate": 1.4579669410368412e-06, + "loss": 0.913, + "step": 635 + }, + { + 
"epoch": 1.054828660436137, + "grad_norm": 2.311976174877509, + "learning_rate": 1.4564086375244854e-06, + "loss": 1.0857, + "step": 636 + }, + { + "epoch": 1.056490134994808, + "grad_norm": 2.1242450037351333, + "learning_rate": 1.4548489328354194e-06, + "loss": 1.0415, + "step": 637 + }, + { + "epoch": 1.0581516095534786, + "grad_norm": 2.1489145088861554, + "learning_rate": 1.4532878317579443e-06, + "loss": 1.0571, + "step": 638 + }, + { + "epoch": 1.0598130841121496, + "grad_norm": 2.1748830811019055, + "learning_rate": 1.451725339084648e-06, + "loss": 1.1093, + "step": 639 + }, + { + "epoch": 1.0614745586708203, + "grad_norm": 2.038403380979606, + "learning_rate": 1.4501614596123897e-06, + "loss": 0.8422, + "step": 640 + }, + { + "epoch": 1.0631360332294912, + "grad_norm": 2.2259593245678806, + "learning_rate": 1.4485961981422882e-06, + "loss": 1.125, + "step": 641 + }, + { + "epoch": 1.064797507788162, + "grad_norm": 2.14202683520297, + "learning_rate": 1.4470295594797028e-06, + "loss": 0.9507, + "step": 642 + }, + { + "epoch": 1.066458982346833, + "grad_norm": 2.304225430064475, + "learning_rate": 1.4454615484342222e-06, + "loss": 0.947, + "step": 643 + }, + { + "epoch": 1.0681204569055036, + "grad_norm": 2.469583985925475, + "learning_rate": 1.4438921698196477e-06, + "loss": 0.9516, + "step": 644 + }, + { + "epoch": 1.0697819314641746, + "grad_norm": 2.079662445475448, + "learning_rate": 1.4423214284539787e-06, + "loss": 0.9201, + "step": 645 + }, + { + "epoch": 1.0714434060228453, + "grad_norm": 2.2706631611703108, + "learning_rate": 1.4407493291593992e-06, + "loss": 0.9743, + "step": 646 + }, + { + "epoch": 1.073104880581516, + "grad_norm": 2.3017848193416053, + "learning_rate": 1.439175876762262e-06, + "loss": 1.0577, + "step": 647 + }, + { + "epoch": 1.074766355140187, + "grad_norm": 2.118285473386722, + "learning_rate": 1.4376010760930727e-06, + "loss": 0.8594, + "step": 648 + }, + { + "epoch": 1.0764278296988576, + "grad_norm": 2.071039252515359, + "learning_rate": 1.4360249319864775e-06, + "loss": 0.8875, + "step": 649 + }, + { + "epoch": 1.0780893042575286, + "grad_norm": 2.0620465316960934, + "learning_rate": 1.434447449281246e-06, + "loss": 1.0298, + "step": 650 + }, + { + "epoch": 1.0797507788161993, + "grad_norm": 2.3185372556623025, + "learning_rate": 1.432868632820258e-06, + "loss": 0.9762, + "step": 651 + }, + { + "epoch": 1.0814122533748702, + "grad_norm": 2.6906645034252317, + "learning_rate": 1.4312884874504873e-06, + "loss": 1.0237, + "step": 652 + }, + { + "epoch": 1.083073727933541, + "grad_norm": 2.1616057770804815, + "learning_rate": 1.4297070180229881e-06, + "loss": 0.9281, + "step": 653 + }, + { + "epoch": 1.084735202492212, + "grad_norm": 2.117360636691568, + "learning_rate": 1.4281242293928787e-06, + "loss": 0.8539, + "step": 654 + }, + { + "epoch": 1.0863966770508826, + "grad_norm": 2.3065809797011667, + "learning_rate": 1.4265401264193284e-06, + "loss": 1.0164, + "step": 655 + }, + { + "epoch": 1.0880581516095535, + "grad_norm": 2.2835922952642407, + "learning_rate": 1.4249547139655408e-06, + "loss": 0.9527, + "step": 656 + }, + { + "epoch": 1.0897196261682243, + "grad_norm": 2.2551385893064353, + "learning_rate": 1.4233679968987392e-06, + "loss": 1.189, + "step": 657 + }, + { + "epoch": 1.0913811007268952, + "grad_norm": 2.087220306176795, + "learning_rate": 1.421779980090153e-06, + "loss": 0.9813, + "step": 658 + }, + { + "epoch": 1.093042575285566, + "grad_norm": 2.291653419238335, + "learning_rate": 1.4201906684150019e-06, + "loss": 1.0983, + 
"step": 659 + }, + { + "epoch": 1.0947040498442369, + "grad_norm": 2.174357913738222, + "learning_rate": 1.4186000667524794e-06, + "loss": 1.0493, + "step": 660 + }, + { + "epoch": 1.0963655244029076, + "grad_norm": 2.3975146777189793, + "learning_rate": 1.417008179985741e-06, + "loss": 1.0219, + "step": 661 + }, + { + "epoch": 1.0980269989615783, + "grad_norm": 2.0940561543065135, + "learning_rate": 1.4154150130018865e-06, + "loss": 1.09, + "step": 662 + }, + { + "epoch": 1.0996884735202492, + "grad_norm": 2.376622784313192, + "learning_rate": 1.4138205706919459e-06, + "loss": 0.9597, + "step": 663 + }, + { + "epoch": 1.10134994807892, + "grad_norm": 2.504249815738343, + "learning_rate": 1.4122248579508655e-06, + "loss": 1.0957, + "step": 664 + }, + { + "epoch": 1.1030114226375909, + "grad_norm": 2.126205812177635, + "learning_rate": 1.41062787967749e-06, + "loss": 0.9934, + "step": 665 + }, + { + "epoch": 1.1046728971962616, + "grad_norm": 2.169506367867318, + "learning_rate": 1.4090296407745514e-06, + "loss": 0.9573, + "step": 666 + }, + { + "epoch": 1.1063343717549325, + "grad_norm": 2.2665210150054893, + "learning_rate": 1.4074301461486504e-06, + "loss": 1.0109, + "step": 667 + }, + { + "epoch": 1.1079958463136033, + "grad_norm": 2.154696910395812, + "learning_rate": 1.4058294007102431e-06, + "loss": 0.9593, + "step": 668 + }, + { + "epoch": 1.1096573208722742, + "grad_norm": 2.16252295450875, + "learning_rate": 1.4042274093736256e-06, + "loss": 1.0709, + "step": 669 + }, + { + "epoch": 1.111318795430945, + "grad_norm": 2.0289971880838116, + "learning_rate": 1.4026241770569196e-06, + "loss": 0.9739, + "step": 670 + }, + { + "epoch": 1.1129802699896159, + "grad_norm": 2.060249988117431, + "learning_rate": 1.4010197086820552e-06, + "loss": 0.9788, + "step": 671 + }, + { + "epoch": 1.1146417445482866, + "grad_norm": 2.169402054591631, + "learning_rate": 1.3994140091747586e-06, + "loss": 1.0504, + "step": 672 + }, + { + "epoch": 1.1163032191069575, + "grad_norm": 2.5824180613309844, + "learning_rate": 1.3978070834645348e-06, + "loss": 1.0102, + "step": 673 + }, + { + "epoch": 1.1179646936656282, + "grad_norm": 2.2309401801547173, + "learning_rate": 1.3961989364846532e-06, + "loss": 0.8809, + "step": 674 + }, + { + "epoch": 1.1196261682242992, + "grad_norm": 2.1738166442416134, + "learning_rate": 1.3945895731721331e-06, + "loss": 0.9891, + "step": 675 + }, + { + "epoch": 1.1212876427829699, + "grad_norm": 2.264956154758612, + "learning_rate": 1.3929789984677277e-06, + "loss": 1.03, + "step": 676 + }, + { + "epoch": 1.1229491173416406, + "grad_norm": 2.2123647976197796, + "learning_rate": 1.3913672173159085e-06, + "loss": 1.0692, + "step": 677 + }, + { + "epoch": 1.1246105919003115, + "grad_norm": 2.364489969539715, + "learning_rate": 1.3897542346648523e-06, + "loss": 1.0008, + "step": 678 + }, + { + "epoch": 1.1262720664589823, + "grad_norm": 2.1920721564053407, + "learning_rate": 1.3881400554664227e-06, + "loss": 0.9403, + "step": 679 + }, + { + "epoch": 1.1279335410176532, + "grad_norm": 2.0979599935589643, + "learning_rate": 1.386524684676158e-06, + "loss": 1.0091, + "step": 680 + }, + { + "epoch": 1.129595015576324, + "grad_norm": 2.278663731270055, + "learning_rate": 1.3849081272532544e-06, + "loss": 1.0526, + "step": 681 + }, + { + "epoch": 1.1312564901349949, + "grad_norm": 2.1605440095876753, + "learning_rate": 1.3832903881605507e-06, + "loss": 1.0356, + "step": 682 + }, + { + "epoch": 1.1329179646936656, + "grad_norm": 2.0718604939036136, + "learning_rate": 1.381671472364514e-06, 
+ "loss": 0.9792, + "step": 683 + }, + { + "epoch": 1.1345794392523365, + "grad_norm": 2.1463194438640265, + "learning_rate": 1.380051384835223e-06, + "loss": 0.9814, + "step": 684 + }, + { + "epoch": 1.1362409138110072, + "grad_norm": 2.091677003972167, + "learning_rate": 1.3784301305463545e-06, + "loss": 1.0427, + "step": 685 + }, + { + "epoch": 1.1379023883696782, + "grad_norm": 2.2700678556630494, + "learning_rate": 1.3768077144751674e-06, + "loss": 0.9369, + "step": 686 + }, + { + "epoch": 1.1395638629283489, + "grad_norm": 2.0625458347908943, + "learning_rate": 1.3751841416024862e-06, + "loss": 0.8808, + "step": 687 + }, + { + "epoch": 1.1412253374870198, + "grad_norm": 2.3876992754405912, + "learning_rate": 1.3735594169126878e-06, + "loss": 0.9346, + "step": 688 + }, + { + "epoch": 1.1428868120456905, + "grad_norm": 8.47258353126556, + "learning_rate": 1.3719335453936844e-06, + "loss": 0.9763, + "step": 689 + }, + { + "epoch": 1.1445482866043615, + "grad_norm": 2.1087038863817478, + "learning_rate": 1.37030653203691e-06, + "loss": 1.159, + "step": 690 + }, + { + "epoch": 1.1462097611630322, + "grad_norm": 2.127868180262951, + "learning_rate": 1.3686783818373026e-06, + "loss": 1.0057, + "step": 691 + }, + { + "epoch": 1.147871235721703, + "grad_norm": 2.196072659767744, + "learning_rate": 1.367049099793292e-06, + "loss": 0.9821, + "step": 692 + }, + { + "epoch": 1.1495327102803738, + "grad_norm": 2.199287353415817, + "learning_rate": 1.3654186909067815e-06, + "loss": 1.0796, + "step": 693 + }, + { + "epoch": 1.1511941848390446, + "grad_norm": 2.265025353172786, + "learning_rate": 1.3637871601831338e-06, + "loss": 0.8848, + "step": 694 + }, + { + "epoch": 1.1528556593977155, + "grad_norm": 2.199148208887257, + "learning_rate": 1.3621545126311569e-06, + "loss": 0.8418, + "step": 695 + }, + { + "epoch": 1.1545171339563862, + "grad_norm": 2.1180720226692418, + "learning_rate": 1.3605207532630863e-06, + "loss": 0.8909, + "step": 696 + }, + { + "epoch": 1.1561786085150572, + "grad_norm": 2.2219661239549513, + "learning_rate": 1.358885887094571e-06, + "loss": 1.1277, + "step": 697 + }, + { + "epoch": 1.1578400830737279, + "grad_norm": 2.3215590968893474, + "learning_rate": 1.3572499191446578e-06, + "loss": 0.9389, + "step": 698 + }, + { + "epoch": 1.1595015576323988, + "grad_norm": 2.0426465674972407, + "learning_rate": 1.355612854435776e-06, + "loss": 0.9844, + "step": 699 + }, + { + "epoch": 1.1611630321910695, + "grad_norm": 2.3030330331075555, + "learning_rate": 1.3539746979937233e-06, + "loss": 0.997, + "step": 700 + }, + { + "epoch": 1.1628245067497405, + "grad_norm": 2.2497492700696475, + "learning_rate": 1.3523354548476466e-06, + "loss": 0.8244, + "step": 701 + }, + { + "epoch": 1.1644859813084112, + "grad_norm": 2.585474730795241, + "learning_rate": 1.3506951300300309e-06, + "loss": 1.0063, + "step": 702 + }, + { + "epoch": 1.1661474558670821, + "grad_norm": 2.041245335602706, + "learning_rate": 1.3490537285766808e-06, + "loss": 1.0547, + "step": 703 + }, + { + "epoch": 1.1678089304257528, + "grad_norm": 2.477977632976126, + "learning_rate": 1.347411255526707e-06, + "loss": 1.0055, + "step": 704 + }, + { + "epoch": 1.1694704049844238, + "grad_norm": 2.2319941565239425, + "learning_rate": 1.3457677159225095e-06, + "loss": 0.9844, + "step": 705 + }, + { + "epoch": 1.1711318795430945, + "grad_norm": 2.1658358037506784, + "learning_rate": 1.3441231148097628e-06, + "loss": 1.0095, + "step": 706 + }, + { + "epoch": 1.1727933541017652, + "grad_norm": 2.046058505379159, + "learning_rate": 
1.3424774572374004e-06, + "loss": 1.0553, + "step": 707 + }, + { + "epoch": 1.1744548286604362, + "grad_norm": 2.1891816920862364, + "learning_rate": 1.340830748257599e-06, + "loss": 0.9626, + "step": 708 + }, + { + "epoch": 1.1761163032191069, + "grad_norm": 2.866582842225965, + "learning_rate": 1.3391829929257623e-06, + "loss": 1.0702, + "step": 709 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.076967565191604, + "learning_rate": 1.337534196300508e-06, + "loss": 0.9365, + "step": 710 + }, + { + "epoch": 1.1794392523364485, + "grad_norm": 2.1835016754122063, + "learning_rate": 1.3358843634436495e-06, + "loss": 1.0592, + "step": 711 + }, + { + "epoch": 1.1811007268951195, + "grad_norm": 2.219299561319535, + "learning_rate": 1.3342334994201814e-06, + "loss": 1.0805, + "step": 712 + }, + { + "epoch": 1.1827622014537902, + "grad_norm": 2.147024039114467, + "learning_rate": 1.332581609298264e-06, + "loss": 0.9914, + "step": 713 + }, + { + "epoch": 1.1844236760124611, + "grad_norm": 2.0325479434123457, + "learning_rate": 1.3309286981492082e-06, + "loss": 0.8889, + "step": 714 + }, + { + "epoch": 1.1860851505711318, + "grad_norm": 2.5059245553533858, + "learning_rate": 1.3292747710474592e-06, + "loss": 1.1349, + "step": 715 + }, + { + "epoch": 1.1877466251298028, + "grad_norm": 2.2414243259263524, + "learning_rate": 1.327619833070581e-06, + "loss": 0.8834, + "step": 716 + }, + { + "epoch": 1.1894080996884735, + "grad_norm": 2.2835729231057336, + "learning_rate": 1.3259638892992411e-06, + "loss": 1.0753, + "step": 717 + }, + { + "epoch": 1.1910695742471444, + "grad_norm": 2.1983108569959553, + "learning_rate": 1.3243069448171951e-06, + "loss": 0.9993, + "step": 718 + }, + { + "epoch": 1.1927310488058152, + "grad_norm": 2.1493768529520776, + "learning_rate": 1.3226490047112702e-06, + "loss": 0.9992, + "step": 719 + }, + { + "epoch": 1.194392523364486, + "grad_norm": 2.1982177635859474, + "learning_rate": 1.3209900740713506e-06, + "loss": 1.0959, + "step": 720 + }, + { + "epoch": 1.1960539979231568, + "grad_norm": 2.8542968450301105, + "learning_rate": 1.3193301579903615e-06, + "loss": 1.0538, + "step": 721 + }, + { + "epoch": 1.1977154724818275, + "grad_norm": 2.1619403628202876, + "learning_rate": 1.317669261564253e-06, + "loss": 1.0251, + "step": 722 + }, + { + "epoch": 1.1993769470404985, + "grad_norm": 2.418169263907194, + "learning_rate": 1.3160073898919852e-06, + "loss": 1.0045, + "step": 723 + }, + { + "epoch": 1.2010384215991692, + "grad_norm": 2.255661060936539, + "learning_rate": 1.3143445480755122e-06, + "loss": 0.8968, + "step": 724 + }, + { + "epoch": 1.2026998961578401, + "grad_norm": 2.272919278531151, + "learning_rate": 1.3126807412197664e-06, + "loss": 0.9584, + "step": 725 + }, + { + "epoch": 1.2043613707165108, + "grad_norm": 2.1840708120633594, + "learning_rate": 1.3110159744326426e-06, + "loss": 1.0289, + "step": 726 + }, + { + "epoch": 1.2060228452751818, + "grad_norm": 2.19563599663801, + "learning_rate": 1.3093502528249828e-06, + "loss": 0.9625, + "step": 727 + }, + { + "epoch": 1.2076843198338525, + "grad_norm": 3.020937698837689, + "learning_rate": 1.307683581510561e-06, + "loss": 0.8759, + "step": 728 + }, + { + "epoch": 1.2093457943925234, + "grad_norm": 2.246654200444507, + "learning_rate": 1.3060159656060653e-06, + "loss": 1.2153, + "step": 729 + }, + { + "epoch": 1.2110072689511941, + "grad_norm": 2.0769588680295774, + "learning_rate": 1.304347410231085e-06, + "loss": 1.0093, + "step": 730 + }, + { + "epoch": 1.212668743509865, + "grad_norm": 
2.288118936663974, + "learning_rate": 1.3026779205080931e-06, + "loss": 1.0355, + "step": 731 + }, + { + "epoch": 1.2143302180685358, + "grad_norm": 2.184081985214361, + "learning_rate": 1.3010075015624308e-06, + "loss": 0.9306, + "step": 732 + }, + { + "epoch": 1.2159916926272065, + "grad_norm": 2.1299774896655106, + "learning_rate": 1.2993361585222927e-06, + "loss": 0.9958, + "step": 733 + }, + { + "epoch": 1.2176531671858775, + "grad_norm": 2.175753135707468, + "learning_rate": 1.2976638965187094e-06, + "loss": 0.9293, + "step": 734 + }, + { + "epoch": 1.2193146417445484, + "grad_norm": 2.2923713334477536, + "learning_rate": 1.295990720685534e-06, + "loss": 1.0258, + "step": 735 + }, + { + "epoch": 1.2209761163032191, + "grad_norm": 2.2957849147388525, + "learning_rate": 1.294316636159424e-06, + "loss": 0.9638, + "step": 736 + }, + { + "epoch": 1.2226375908618898, + "grad_norm": 2.250938246343808, + "learning_rate": 1.2926416480798267e-06, + "loss": 0.8653, + "step": 737 + }, + { + "epoch": 1.2242990654205608, + "grad_norm": 2.142325631732433, + "learning_rate": 1.2909657615889638e-06, + "loss": 0.9697, + "step": 738 + }, + { + "epoch": 1.2259605399792315, + "grad_norm": 2.039460360724837, + "learning_rate": 1.289288981831815e-06, + "loss": 0.9789, + "step": 739 + }, + { + "epoch": 1.2276220145379024, + "grad_norm": 2.2252231947797054, + "learning_rate": 1.2876113139561018e-06, + "loss": 1.1306, + "step": 740 + }, + { + "epoch": 1.2292834890965731, + "grad_norm": 2.101017091432986, + "learning_rate": 1.285932763112273e-06, + "loss": 0.9276, + "step": 741 + }, + { + "epoch": 1.230944963655244, + "grad_norm": 2.1947947768274334, + "learning_rate": 1.2842533344534875e-06, + "loss": 0.9702, + "step": 742 + }, + { + "epoch": 1.2326064382139148, + "grad_norm": 2.0326132196001545, + "learning_rate": 1.2825730331355995e-06, + "loss": 0.868, + "step": 743 + }, + { + "epoch": 1.2342679127725857, + "grad_norm": 2.148348415668642, + "learning_rate": 1.2808918643171423e-06, + "loss": 0.9215, + "step": 744 + }, + { + "epoch": 1.2359293873312565, + "grad_norm": 2.2147832136292327, + "learning_rate": 1.279209833159312e-06, + "loss": 0.9085, + "step": 745 + }, + { + "epoch": 1.2375908618899274, + "grad_norm": 2.2716999685750796, + "learning_rate": 1.2775269448259524e-06, + "loss": 0.9814, + "step": 746 + }, + { + "epoch": 1.2392523364485981, + "grad_norm": 2.294321169355599, + "learning_rate": 1.275843204483539e-06, + "loss": 1.0459, + "step": 747 + }, + { + "epoch": 1.2409138110072688, + "grad_norm": 2.0155276917734435, + "learning_rate": 1.2741586173011623e-06, + "loss": 0.9093, + "step": 748 + }, + { + "epoch": 1.2425752855659398, + "grad_norm": 2.176244110291096, + "learning_rate": 1.2724731884505134e-06, + "loss": 1.0437, + "step": 749 + }, + { + "epoch": 1.2442367601246107, + "grad_norm": 2.060993253717314, + "learning_rate": 1.2707869231058665e-06, + "loss": 0.9021, + "step": 750 + }, + { + "epoch": 1.2458982346832814, + "grad_norm": 2.2567394830685688, + "learning_rate": 1.2690998264440651e-06, + "loss": 1.0458, + "step": 751 + }, + { + "epoch": 1.2475597092419521, + "grad_norm": 2.1356542645695917, + "learning_rate": 1.2674119036445034e-06, + "loss": 0.8728, + "step": 752 + }, + { + "epoch": 1.249221183800623, + "grad_norm": 2.146513267279053, + "learning_rate": 1.2657231598891125e-06, + "loss": 0.9412, + "step": 753 + }, + { + "epoch": 1.2508826583592938, + "grad_norm": 2.237050919851434, + "learning_rate": 1.2640336003623442e-06, + "loss": 0.9418, + "step": 754 + }, + { + "epoch": 
1.2525441329179647, + "grad_norm": 2.309743390797678, + "learning_rate": 1.2623432302511542e-06, + "loss": 1.1053, + "step": 755 + }, + { + "epoch": 1.2525441329179647, + "eval_loss": 1.3914175033569336, + "eval_runtime": 24.4622, + "eval_samples_per_second": 0.45, + "eval_steps_per_second": 0.123, + "step": 755 + }, + { + "epoch": 1.257528556593977, + "grad_norm": 2.194516385421637, + "learning_rate": 1.260652054744987e-06, + "loss": 1.0752, + "step": 756 + }, + { + "epoch": 1.259190031152648, + "grad_norm": 2.0798826656496514, + "learning_rate": 1.258960079035759e-06, + "loss": 1.0474, + "step": 757 + }, + { + "epoch": 1.2608515057113188, + "grad_norm": 2.4630029006380663, + "learning_rate": 1.2572673083178447e-06, + "loss": 1.0878, + "step": 758 + }, + { + "epoch": 1.2625129802699897, + "grad_norm": 2.059087102554878, + "learning_rate": 1.2555737477880575e-06, + "loss": 0.9406, + "step": 759 + }, + { + "epoch": 1.2641744548286604, + "grad_norm": 2.0994176763702717, + "learning_rate": 1.2538794026456365e-06, + "loss": 0.7893, + "step": 760 + }, + { + "epoch": 1.2658359293873311, + "grad_norm": 2.2656607881570534, + "learning_rate": 1.2521842780922298e-06, + "loss": 0.9353, + "step": 761 + }, + { + "epoch": 1.267497403946002, + "grad_norm": 2.2731202307003735, + "learning_rate": 1.2504883793318777e-06, + "loss": 0.9712, + "step": 762 + }, + { + "epoch": 1.269158878504673, + "grad_norm": 2.199059936980419, + "learning_rate": 1.2487917115709973e-06, + "loss": 0.8983, + "step": 763 + }, + { + "epoch": 1.2708203530633437, + "grad_norm": 2.3268110070781787, + "learning_rate": 1.2470942800183674e-06, + "loss": 0.881, + "step": 764 + }, + { + "epoch": 1.2724818276220144, + "grad_norm": 2.4916547720738245, + "learning_rate": 1.2453960898851105e-06, + "loss": 1.0081, + "step": 765 + }, + { + "epoch": 1.2741433021806854, + "grad_norm": 2.2962920389351407, + "learning_rate": 1.2436971463846788e-06, + "loss": 0.9745, + "step": 766 + }, + { + "epoch": 1.275804776739356, + "grad_norm": 2.2101054126420863, + "learning_rate": 1.2419974547328364e-06, + "loss": 0.9563, + "step": 767 + }, + { + "epoch": 1.277466251298027, + "grad_norm": 2.2891187380085913, + "learning_rate": 1.2402970201476457e-06, + "loss": 0.9303, + "step": 768 + }, + { + "epoch": 1.2791277258566978, + "grad_norm": 2.1360488073197024, + "learning_rate": 1.2385958478494484e-06, + "loss": 0.8754, + "step": 769 + }, + { + "epoch": 1.2807892004153687, + "grad_norm": 2.19618235102571, + "learning_rate": 1.236893943060852e-06, + "loss": 0.9035, + "step": 770 + }, + { + "epoch": 1.2824506749740394, + "grad_norm": 2.134112006128728, + "learning_rate": 1.235191311006712e-06, + "loss": 0.9439, + "step": 771 + }, + { + "epoch": 1.2841121495327104, + "grad_norm": 3.1612461680428816, + "learning_rate": 1.2334879569141172e-06, + "loss": 0.8783, + "step": 772 + }, + { + "epoch": 1.285773624091381, + "grad_norm": 1.9962401932268041, + "learning_rate": 1.231783886012373e-06, + "loss": 0.9263, + "step": 773 + }, + { + "epoch": 1.287435098650052, + "grad_norm": 2.1827109534797695, + "learning_rate": 1.230079103532985e-06, + "loss": 0.9417, + "step": 774 + }, + { + "epoch": 1.2890965732087227, + "grad_norm": 2.7468270854753336, + "learning_rate": 1.228373614709644e-06, + "loss": 1.004, + "step": 775 + }, + { + "epoch": 1.2907580477673934, + "grad_norm": 2.1747453190202646, + "learning_rate": 1.2266674247782086e-06, + "loss": 0.9531, + "step": 776 + }, + { + "epoch": 1.2924195223260644, + "grad_norm": 2.085633930937514, + "learning_rate": 
1.2249605389766895e-06, + "loss": 1.0289, + "step": 777 + }, + { + "epoch": 1.2940809968847353, + "grad_norm": 2.2021065356232317, + "learning_rate": 1.223252962545235e-06, + "loss": 0.8765, + "step": 778 + }, + { + "epoch": 1.295742471443406, + "grad_norm": 2.1829312571834896, + "learning_rate": 1.2215447007261133e-06, + "loss": 1.0022, + "step": 779 + }, + { + "epoch": 1.2974039460020768, + "grad_norm": 2.1283332483638286, + "learning_rate": 1.2198357587636956e-06, + "loss": 0.9697, + "step": 780 + }, + { + "epoch": 1.2990654205607477, + "grad_norm": 2.525909866186781, + "learning_rate": 1.2181261419044426e-06, + "loss": 0.9726, + "step": 781 + }, + { + "epoch": 1.3007268951194184, + "grad_norm": 2.1648814032733186, + "learning_rate": 1.2164158553968855e-06, + "loss": 1.0231, + "step": 782 + }, + { + "epoch": 1.3023883696780894, + "grad_norm": 2.2334029411110556, + "learning_rate": 1.2147049044916128e-06, + "loss": 1.1608, + "step": 783 + }, + { + "epoch": 1.30404984423676, + "grad_norm": 2.3547357866951093, + "learning_rate": 1.2129932944412518e-06, + "loss": 0.9692, + "step": 784 + }, + { + "epoch": 1.305711318795431, + "grad_norm": 2.2065719120739726, + "learning_rate": 1.2112810305004535e-06, + "loss": 0.8479, + "step": 785 + }, + { + "epoch": 1.3073727933541017, + "grad_norm": 2.125915368197623, + "learning_rate": 1.2095681179258764e-06, + "loss": 0.9621, + "step": 786 + }, + { + "epoch": 1.3090342679127727, + "grad_norm": 2.645809087395215, + "learning_rate": 1.2078545619761702e-06, + "loss": 1.0085, + "step": 787 + }, + { + "epoch": 1.3106957424714434, + "grad_norm": 2.250819574066223, + "learning_rate": 1.2061403679119601e-06, + "loss": 0.9068, + "step": 788 + }, + { + "epoch": 1.3123572170301143, + "grad_norm": 2.21749937163358, + "learning_rate": 1.2044255409958303e-06, + "loss": 0.9283, + "step": 789 + }, + { + "epoch": 1.314018691588785, + "grad_norm": 2.2743487946390992, + "learning_rate": 1.2027100864923075e-06, + "loss": 0.9947, + "step": 790 + }, + { + "epoch": 1.3156801661474558, + "grad_norm": 2.6923629420708646, + "learning_rate": 1.200994009667845e-06, + "loss": 0.9471, + "step": 791 + }, + { + "epoch": 1.3173416407061267, + "grad_norm": 2.3195724880810102, + "learning_rate": 1.1992773157908072e-06, + "loss": 0.9601, + "step": 792 + }, + { + "epoch": 1.3190031152647976, + "grad_norm": 2.0807605681167405, + "learning_rate": 1.1975600101314525e-06, + "loss": 0.8885, + "step": 793 + }, + { + "epoch": 1.3206645898234683, + "grad_norm": 2.0900202951082147, + "learning_rate": 1.1958420979619175e-06, + "loss": 1.0104, + "step": 794 + }, + { + "epoch": 1.322326064382139, + "grad_norm": 2.183167054296833, + "learning_rate": 1.1941235845562005e-06, + "loss": 0.9351, + "step": 795 + }, + { + "epoch": 1.32398753894081, + "grad_norm": 2.094498837050807, + "learning_rate": 1.1924044751901464e-06, + "loss": 0.995, + "step": 796 + }, + { + "epoch": 1.3256490134994807, + "grad_norm": 2.1534518031095957, + "learning_rate": 1.1906847751414291e-06, + "loss": 1.0239, + "step": 797 + }, + { + "epoch": 1.3273104880581517, + "grad_norm": 2.4401111022025557, + "learning_rate": 1.188964489689536e-06, + "loss": 1.0308, + "step": 798 + }, + { + "epoch": 1.3289719626168224, + "grad_norm": 2.6081491691522873, + "learning_rate": 1.1872436241157518e-06, + "loss": 1.1202, + "step": 799 + }, + { + "epoch": 1.3306334371754933, + "grad_norm": 2.3979641911118383, + "learning_rate": 1.1855221837031418e-06, + "loss": 0.9712, + "step": 800 + }, + { + "epoch": 1.332294911734164, + "grad_norm": 
2.3288896242545563, + "learning_rate": 1.1838001737365363e-06, + "loss": 0.9857, + "step": 801 + }, + { + "epoch": 1.333956386292835, + "grad_norm": 2.2743604591828714, + "learning_rate": 1.1820775995025146e-06, + "loss": 1.0457, + "step": 802 + }, + { + "epoch": 1.3356178608515057, + "grad_norm": 2.2419016690025106, + "learning_rate": 1.1803544662893875e-06, + "loss": 1.0037, + "step": 803 + }, + { + "epoch": 1.3372793354101766, + "grad_norm": 2.097257183297578, + "learning_rate": 1.1786307793871823e-06, + "loss": 1.0137, + "step": 804 + }, + { + "epoch": 1.3389408099688473, + "grad_norm": 2.0525680314921115, + "learning_rate": 1.1769065440876263e-06, + "loss": 0.941, + "step": 805 + }, + { + "epoch": 1.340602284527518, + "grad_norm": 2.1484754300130797, + "learning_rate": 1.1751817656841297e-06, + "loss": 0.8931, + "step": 806 + }, + { + "epoch": 1.342263759086189, + "grad_norm": 2.049973872619327, + "learning_rate": 1.1734564494717708e-06, + "loss": 0.9391, + "step": 807 + }, + { + "epoch": 1.34392523364486, + "grad_norm": 2.1411969844449117, + "learning_rate": 1.171730600747279e-06, + "loss": 0.8776, + "step": 808 + }, + { + "epoch": 1.3455867082035307, + "grad_norm": 2.333385888491211, + "learning_rate": 1.1700042248090174e-06, + "loss": 0.8865, + "step": 809 + }, + { + "epoch": 1.3472481827622014, + "grad_norm": 2.172814651117749, + "learning_rate": 1.1682773269569692e-06, + "loss": 1.1343, + "step": 810 + }, + { + "epoch": 1.3489096573208723, + "grad_norm": 2.106719631129538, + "learning_rate": 1.1665499124927182e-06, + "loss": 0.9706, + "step": 811 + }, + { + "epoch": 1.350571131879543, + "grad_norm": 2.2297462633284812, + "learning_rate": 1.164821986719436e-06, + "loss": 0.9913, + "step": 812 + }, + { + "epoch": 1.352232606438214, + "grad_norm": 2.617400755249897, + "learning_rate": 1.1630935549418626e-06, + "loss": 0.9445, + "step": 813 + }, + { + "epoch": 1.3538940809968847, + "grad_norm": 2.1827702013845895, + "learning_rate": 1.161364622466292e-06, + "loss": 0.9375, + "step": 814 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 2.2016299775994876, + "learning_rate": 1.159635194600555e-06, + "loss": 0.9031, + "step": 815 + }, + { + "epoch": 1.3572170301142263, + "grad_norm": 2.070277398005119, + "learning_rate": 1.157905276654004e-06, + "loss": 0.9009, + "step": 816 + }, + { + "epoch": 1.358878504672897, + "grad_norm": 2.141179928339341, + "learning_rate": 1.1561748739374944e-06, + "loss": 0.9099, + "step": 817 + }, + { + "epoch": 1.360539979231568, + "grad_norm": 2.007914206117487, + "learning_rate": 1.1544439917633716e-06, + "loss": 0.892, + "step": 818 + }, + { + "epoch": 1.362201453790239, + "grad_norm": 2.0648321350070526, + "learning_rate": 1.1527126354454525e-06, + "loss": 0.958, + "step": 819 + }, + { + "epoch": 1.3638629283489097, + "grad_norm": 2.0018244208752565, + "learning_rate": 1.1509808102990085e-06, + "loss": 0.9431, + "step": 820 + }, + { + "epoch": 1.3655244029075804, + "grad_norm": 2.361320004069871, + "learning_rate": 1.1492485216407513e-06, + "loss": 1.0479, + "step": 821 + }, + { + "epoch": 1.3671858774662513, + "grad_norm": 2.6806488863596862, + "learning_rate": 1.1475157747888158e-06, + "loss": 0.9181, + "step": 822 + }, + { + "epoch": 1.3688473520249222, + "grad_norm": 2.1942518382547713, + "learning_rate": 1.145782575062743e-06, + "loss": 0.9427, + "step": 823 + }, + { + "epoch": 1.370508826583593, + "grad_norm": 2.122601010531213, + "learning_rate": 1.1440489277834645e-06, + "loss": 1.0671, + "step": 824 + }, + { + "epoch": 1.3721703011422637, 
+ "grad_norm": 1.9965619962296717, + "learning_rate": 1.1423148382732853e-06, + "loss": 0.9456, + "step": 825 + }, + { + "epoch": 1.3738317757009346, + "grad_norm": 2.318013669509405, + "learning_rate": 1.1405803118558687e-06, + "loss": 0.9946, + "step": 826 + }, + { + "epoch": 1.3754932502596053, + "grad_norm": 2.1436011385350056, + "learning_rate": 1.1388453538562195e-06, + "loss": 0.9548, + "step": 827 + }, + { + "epoch": 1.3771547248182763, + "grad_norm": 2.1480524834330588, + "learning_rate": 1.137109969600667e-06, + "loss": 1.0541, + "step": 828 + }, + { + "epoch": 1.378816199376947, + "grad_norm": 2.417891031584686, + "learning_rate": 1.1353741644168487e-06, + "loss": 0.9857, + "step": 829 + }, + { + "epoch": 1.380477673935618, + "grad_norm": 2.1577692143799516, + "learning_rate": 1.1336379436336953e-06, + "loss": 1.0296, + "step": 830 + }, + { + "epoch": 1.3821391484942886, + "grad_norm": 2.3755322611919807, + "learning_rate": 1.131901312581413e-06, + "loss": 1.0684, + "step": 831 + }, + { + "epoch": 1.3838006230529594, + "grad_norm": 2.193724004809641, + "learning_rate": 1.1301642765914672e-06, + "loss": 0.995, + "step": 832 + }, + { + "epoch": 1.3854620976116303, + "grad_norm": 2.084892517254851, + "learning_rate": 1.1284268409965671e-06, + "loss": 1.0584, + "step": 833 + }, + { + "epoch": 1.3871235721703012, + "grad_norm": 2.107802237296406, + "learning_rate": 1.1266890111306483e-06, + "loss": 0.9933, + "step": 834 + }, + { + "epoch": 1.388785046728972, + "grad_norm": 2.2944459376176116, + "learning_rate": 1.1249507923288561e-06, + "loss": 0.8813, + "step": 835 + }, + { + "epoch": 1.3904465212876427, + "grad_norm": 2.4275620341058697, + "learning_rate": 1.1232121899275313e-06, + "loss": 1.0109, + "step": 836 + }, + { + "epoch": 1.3921079958463136, + "grad_norm": 2.213400646410023, + "learning_rate": 1.1214732092641914e-06, + "loss": 1.0679, + "step": 837 + }, + { + "epoch": 1.3937694704049846, + "grad_norm": 2.1158008518205724, + "learning_rate": 1.1197338556775155e-06, + "loss": 0.945, + "step": 838 + }, + { + "epoch": 1.3954309449636553, + "grad_norm": 2.2576033887254674, + "learning_rate": 1.1179941345073277e-06, + "loss": 1.1281, + "step": 839 + }, + { + "epoch": 1.397092419522326, + "grad_norm": 2.174956086888558, + "learning_rate": 1.1162540510945798e-06, + "loss": 0.9392, + "step": 840 + }, + { + "epoch": 1.398753894080997, + "grad_norm": 2.2102299571149473, + "learning_rate": 1.1145136107813361e-06, + "loss": 0.8779, + "step": 841 + }, + { + "epoch": 1.4004153686396676, + "grad_norm": 2.293662364545717, + "learning_rate": 1.1127728189107574e-06, + "loss": 0.8608, + "step": 842 + }, + { + "epoch": 1.4020768431983386, + "grad_norm": 2.2613684494376756, + "learning_rate": 1.111031680827083e-06, + "loss": 0.8828, + "step": 843 + }, + { + "epoch": 1.4037383177570093, + "grad_norm": 2.1319924720800696, + "learning_rate": 1.1092902018756148e-06, + "loss": 1.0049, + "step": 844 + }, + { + "epoch": 1.4053997923156802, + "grad_norm": 2.28021887844631, + "learning_rate": 1.1075483874027018e-06, + "loss": 0.9074, + "step": 845 + }, + { + "epoch": 1.407061266874351, + "grad_norm": 2.1484668932462445, + "learning_rate": 1.1058062427557228e-06, + "loss": 1.0847, + "step": 846 + }, + { + "epoch": 1.4087227414330217, + "grad_norm": 2.2205863989831416, + "learning_rate": 1.10406377328307e-06, + "loss": 0.9719, + "step": 847 + }, + { + "epoch": 1.4103842159916926, + "grad_norm": 2.1508101547003564, + "learning_rate": 1.1023209843341332e-06, + "loss": 1.0049, + "step": 848 + }, + { + 
"epoch": 1.4120456905503636, + "grad_norm": 2.3302734390720397, + "learning_rate": 1.1005778812592832e-06, + "loss": 1.1868, + "step": 849 + }, + { + "epoch": 1.4137071651090343, + "grad_norm": 2.1781701265290327, + "learning_rate": 1.0988344694098544e-06, + "loss": 0.8906, + "step": 850 + }, + { + "epoch": 1.415368639667705, + "grad_norm": 2.226671789203206, + "learning_rate": 1.0970907541381294e-06, + "loss": 0.929, + "step": 851 + }, + { + "epoch": 1.417030114226376, + "grad_norm": 2.5515388989279866, + "learning_rate": 1.095346740797323e-06, + "loss": 0.8837, + "step": 852 + }, + { + "epoch": 1.4186915887850469, + "grad_norm": 2.236679949130665, + "learning_rate": 1.0936024347415642e-06, + "loss": 0.9804, + "step": 853 + }, + { + "epoch": 1.4203530633437176, + "grad_norm": 2.9612492685412852, + "learning_rate": 1.091857841325881e-06, + "loss": 0.9525, + "step": 854 + }, + { + "epoch": 1.4220145379023883, + "grad_norm": 2.5666504315216137, + "learning_rate": 1.0901129659061837e-06, + "loss": 0.8627, + "step": 855 + }, + { + "epoch": 1.4236760124610592, + "grad_norm": 2.4015395133256106, + "learning_rate": 1.0883678138392475e-06, + "loss": 0.9089, + "step": 856 + }, + { + "epoch": 1.42533748701973, + "grad_norm": 2.3067956889994097, + "learning_rate": 1.0866223904826989e-06, + "loss": 0.9487, + "step": 857 + }, + { + "epoch": 1.426998961578401, + "grad_norm": 2.0391869701955647, + "learning_rate": 1.084876701194995e-06, + "loss": 1.0349, + "step": 858 + }, + { + "epoch": 1.4286604361370716, + "grad_norm": 2.1857273166825095, + "learning_rate": 1.0831307513354112e-06, + "loss": 0.9983, + "step": 859 + }, + { + "epoch": 1.4303219106957425, + "grad_norm": 2.2108513988117107, + "learning_rate": 1.0813845462640206e-06, + "loss": 0.9867, + "step": 860 + }, + { + "epoch": 1.4319833852544133, + "grad_norm": 2.4469400036513838, + "learning_rate": 1.0796380913416823e-06, + "loss": 0.9024, + "step": 861 + }, + { + "epoch": 1.433644859813084, + "grad_norm": 2.1429468615368905, + "learning_rate": 1.0778913919300209e-06, + "loss": 0.8963, + "step": 862 + }, + { + "epoch": 1.435306334371755, + "grad_norm": 2.176033522508349, + "learning_rate": 1.0761444533914124e-06, + "loss": 0.9561, + "step": 863 + }, + { + "epoch": 1.4369678089304259, + "grad_norm": 2.0784290252253137, + "learning_rate": 1.0743972810889654e-06, + "loss": 0.906, + "step": 864 + }, + { + "epoch": 1.4386292834890966, + "grad_norm": 2.2109329665454087, + "learning_rate": 1.0726498803865088e-06, + "loss": 0.9316, + "step": 865 + }, + { + "epoch": 1.4402907580477673, + "grad_norm": 2.160142006014489, + "learning_rate": 1.0709022566485697e-06, + "loss": 1.0291, + "step": 866 + }, + { + "epoch": 1.4419522326064382, + "grad_norm": 2.236888922467273, + "learning_rate": 1.069154415240362e-06, + "loss": 0.9812, + "step": 867 + }, + { + "epoch": 1.4436137071651092, + "grad_norm": 2.4886237263035276, + "learning_rate": 1.067406361527768e-06, + "loss": 1.0427, + "step": 868 + }, + { + "epoch": 1.44527518172378, + "grad_norm": 2.2522311274987823, + "learning_rate": 1.0656581008773197e-06, + "loss": 0.8864, + "step": 869 + }, + { + "epoch": 1.4469366562824506, + "grad_norm": 2.126057360159879, + "learning_rate": 1.0639096386561864e-06, + "loss": 1.0294, + "step": 870 + }, + { + "epoch": 1.4485981308411215, + "grad_norm": 2.257463390997248, + "learning_rate": 1.0621609802321553e-06, + "loss": 0.9633, + "step": 871 + }, + { + "epoch": 1.4502596053997923, + "grad_norm": 2.26321915407214, + "learning_rate": 1.0604121309736163e-06, + "loss": 0.8698, + 
"step": 872 + }, + { + "epoch": 1.4519210799584632, + "grad_norm": 2.061806930610862, + "learning_rate": 1.058663096249545e-06, + "loss": 1.0377, + "step": 873 + }, + { + "epoch": 1.453582554517134, + "grad_norm": 2.1295799083708147, + "learning_rate": 1.0569138814294862e-06, + "loss": 0.9534, + "step": 874 + }, + { + "epoch": 1.4552440290758049, + "grad_norm": 2.3106775542374294, + "learning_rate": 1.055164491883538e-06, + "loss": 0.9403, + "step": 875 + }, + { + "epoch": 1.4569055036344756, + "grad_norm": 2.2302178209419132, + "learning_rate": 1.0534149329823347e-06, + "loss": 0.9301, + "step": 876 + }, + { + "epoch": 1.4585669781931463, + "grad_norm": 2.2949573702055117, + "learning_rate": 1.0516652100970306e-06, + "loss": 0.9528, + "step": 877 + }, + { + "epoch": 1.4602284527518172, + "grad_norm": 2.10604631236836, + "learning_rate": 1.0499153285992832e-06, + "loss": 0.9069, + "step": 878 + }, + { + "epoch": 1.4618899273104882, + "grad_norm": 2.1660045153149934, + "learning_rate": 1.0481652938612372e-06, + "loss": 0.9408, + "step": 879 + }, + { + "epoch": 1.4635514018691589, + "grad_norm": 2.2748273606386995, + "learning_rate": 1.0464151112555076e-06, + "loss": 1.0554, + "step": 880 + }, + { + "epoch": 1.4652128764278296, + "grad_norm": 2.2711524480515406, + "learning_rate": 1.0446647861551632e-06, + "loss": 1.0468, + "step": 881 + }, + { + "epoch": 1.4668743509865005, + "grad_norm": 2.2946241383411987, + "learning_rate": 1.042914323933711e-06, + "loss": 0.8936, + "step": 882 + }, + { + "epoch": 1.4685358255451713, + "grad_norm": 2.390089218019007, + "learning_rate": 1.0411637299650781e-06, + "loss": 1.0086, + "step": 883 + }, + { + "epoch": 1.4701973001038422, + "grad_norm": 2.340221690538989, + "learning_rate": 1.0394130096235965e-06, + "loss": 1.0351, + "step": 884 + }, + { + "epoch": 1.471858774662513, + "grad_norm": 2.297583872743856, + "learning_rate": 1.0376621682839856e-06, + "loss": 0.9497, + "step": 885 + }, + { + "epoch": 1.4735202492211839, + "grad_norm": 2.305498668474036, + "learning_rate": 1.0359112113213374e-06, + "loss": 0.9291, + "step": 886 + }, + { + "epoch": 1.4751817237798546, + "grad_norm": 2.182291435160706, + "learning_rate": 1.0341601441110981e-06, + "loss": 0.9932, + "step": 887 + }, + { + "epoch": 1.4768431983385255, + "grad_norm": 2.0985924867543924, + "learning_rate": 1.0324089720290522e-06, + "loss": 0.9702, + "step": 888 + }, + { + "epoch": 1.4785046728971962, + "grad_norm": 2.2916738661032863, + "learning_rate": 1.0306577004513064e-06, + "loss": 1.0412, + "step": 889 + }, + { + "epoch": 1.4801661474558672, + "grad_norm": 2.3135550693056115, + "learning_rate": 1.0289063347542726e-06, + "loss": 0.8467, + "step": 890 + }, + { + "epoch": 1.4818276220145379, + "grad_norm": 2.3188283888371393, + "learning_rate": 1.0271548803146525e-06, + "loss": 0.9271, + "step": 891 + }, + { + "epoch": 1.4834890965732086, + "grad_norm": 2.2139777839358845, + "learning_rate": 1.0254033425094196e-06, + "loss": 0.9119, + "step": 892 + }, + { + "epoch": 1.4851505711318795, + "grad_norm": 2.3212272419519713, + "learning_rate": 1.0236517267158026e-06, + "loss": 1.0185, + "step": 893 + }, + { + "epoch": 1.4868120456905505, + "grad_norm": 2.0589993974559757, + "learning_rate": 1.0219000383112713e-06, + "loss": 1.022, + "step": 894 + }, + { + "epoch": 1.4884735202492212, + "grad_norm": 2.5364209766355015, + "learning_rate": 1.020148282673517e-06, + "loss": 0.9629, + "step": 895 + }, + { + "epoch": 1.490134994807892, + "grad_norm": 2.1254602155332982, + "learning_rate": 
1.0183964651804382e-06, + "loss": 0.9765, + "step": 896 + }, + { + "epoch": 1.4917964693665628, + "grad_norm": 2.2144306649179, + "learning_rate": 1.0166445912101228e-06, + "loss": 0.9865, + "step": 897 + }, + { + "epoch": 1.4934579439252336, + "grad_norm": 2.6976218997101173, + "learning_rate": 1.0148926661408327e-06, + "loss": 1.1091, + "step": 898 + }, + { + "epoch": 1.4951194184839045, + "grad_norm": 2.1447116841130347, + "learning_rate": 1.0131406953509855e-06, + "loss": 0.9854, + "step": 899 + }, + { + "epoch": 1.4967808930425752, + "grad_norm": 2.313826573818675, + "learning_rate": 1.0113886842191408e-06, + "loss": 1.0696, + "step": 900 + }, + { + "epoch": 1.4984423676012462, + "grad_norm": 2.398689146513669, + "learning_rate": 1.0096366381239806e-06, + "loss": 0.9913, + "step": 901 + }, + { + "epoch": 1.5001038421599169, + "grad_norm": 2.01368084597207, + "learning_rate": 1.0078845624442953e-06, + "loss": 0.8734, + "step": 902 + }, + { + "epoch": 1.5017653167185876, + "grad_norm": 2.162541083177183, + "learning_rate": 1.0061324625589655e-06, + "loss": 0.8432, + "step": 903 + }, + { + "epoch": 1.5034267912772585, + "grad_norm": 2.066277760482693, + "learning_rate": 1.004380343846946e-06, + "loss": 0.9469, + "step": 904 + }, + { + "epoch": 1.5050882658359295, + "grad_norm": 2.291540588940334, + "learning_rate": 1.0026282116872498e-06, + "loss": 0.8913, + "step": 905 + }, + { + "epoch": 1.5067497403946002, + "grad_norm": 2.3475144052615833, + "learning_rate": 1.000876071458931e-06, + "loss": 0.8889, + "step": 906 + }, + { + "epoch": 1.5067497403946002, + "eval_loss": 1.3885624408721924, + "eval_runtime": 24.6265, + "eval_samples_per_second": 0.447, + "eval_steps_per_second": 0.122, + "step": 906 + }, + { + "epoch": 1.508411214953271, + "grad_norm": 3.065303881065278, + "learning_rate": 9.99123928541069e-07, + "loss": 0.9061, + "step": 907 + }, + { + "epoch": 1.5100726895119418, + "grad_norm": 2.191485923954429, + "learning_rate": 9.973717883127503e-07, + "loss": 0.9699, + "step": 908 + }, + { + "epoch": 1.5117341640706128, + "grad_norm": 2.3278723421202505, + "learning_rate": 9.95619656153054e-07, + "loss": 1.0169, + "step": 909 + }, + { + "epoch": 1.5133956386292835, + "grad_norm": 2.3678043287491586, + "learning_rate": 9.938675374410346e-07, + "loss": 0.9757, + "step": 910 + }, + { + "epoch": 1.5150571131879542, + "grad_norm": 2.1901568875128836, + "learning_rate": 9.921154375557046e-07, + "loss": 1.0352, + "step": 911 + }, + { + "epoch": 1.5167185877466252, + "grad_norm": 2.30893338243549, + "learning_rate": 9.903633618760193e-07, + "loss": 0.8631, + "step": 912 + }, + { + "epoch": 1.518380062305296, + "grad_norm": 2.2873579587086366, + "learning_rate": 9.886113157808594e-07, + "loss": 0.8675, + "step": 913 + }, + { + "epoch": 1.5200415368639668, + "grad_norm": 2.085982086942701, + "learning_rate": 9.868593046490144e-07, + "loss": 0.947, + "step": 914 + }, + { + "epoch": 1.5217030114226375, + "grad_norm": 2.060117879856327, + "learning_rate": 9.851073338591675e-07, + "loss": 1.0063, + "step": 915 + }, + { + "epoch": 1.5233644859813085, + "grad_norm": 2.43994470111981, + "learning_rate": 9.833554087898773e-07, + "loss": 0.9692, + "step": 916 + }, + { + "epoch": 1.5250259605399792, + "grad_norm": 2.2294605522286948, + "learning_rate": 9.81603534819562e-07, + "loss": 0.9935, + "step": 917 + }, + { + "epoch": 1.52668743509865, + "grad_norm": 2.1907032716227883, + "learning_rate": 9.798517173264831e-07, + "loss": 0.9507, + "step": 918 + }, + { + "epoch": 1.5283489096573208, + 
"grad_norm": 2.1045414815637504, + "learning_rate": 9.780999616887288e-07, + "loss": 0.9059, + "step": 919 + }, + { + "epoch": 1.5300103842159918, + "grad_norm": 2.3012223647608323, + "learning_rate": 9.763482732841975e-07, + "loss": 1.0022, + "step": 920 + }, + { + "epoch": 1.5316718587746625, + "grad_norm": 2.272854650284857, + "learning_rate": 9.74596657490581e-07, + "loss": 1.1475, + "step": 921 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 2.5844520190272853, + "learning_rate": 9.728451196853476e-07, + "loss": 0.9708, + "step": 922 + }, + { + "epoch": 1.5349948078920042, + "grad_norm": 2.1870847904085626, + "learning_rate": 9.710936652457275e-07, + "loss": 1.0149, + "step": 923 + }, + { + "epoch": 1.536656282450675, + "grad_norm": 2.167131744822391, + "learning_rate": 9.693422995486938e-07, + "loss": 0.9241, + "step": 924 + }, + { + "epoch": 1.5383177570093458, + "grad_norm": 2.103729458011538, + "learning_rate": 9.675910279709475e-07, + "loss": 0.8776, + "step": 925 + }, + { + "epoch": 1.5399792315680165, + "grad_norm": 2.2605948836777827, + "learning_rate": 9.658398558889018e-07, + "loss": 0.9223, + "step": 926 + }, + { + "epoch": 1.5416407061266875, + "grad_norm": 2.1973564828092282, + "learning_rate": 9.640887886786623e-07, + "loss": 0.9611, + "step": 927 + }, + { + "epoch": 1.5433021806853584, + "grad_norm": 2.1500165200756998, + "learning_rate": 9.62337831716014e-07, + "loss": 0.9845, + "step": 928 + }, + { + "epoch": 1.5449636552440291, + "grad_norm": 2.1276385579139716, + "learning_rate": 9.605869903764036e-07, + "loss": 0.8422, + "step": 929 + }, + { + "epoch": 1.5466251298026998, + "grad_norm": 2.16875527803873, + "learning_rate": 9.588362700349218e-07, + "loss": 0.8773, + "step": 930 + }, + { + "epoch": 1.5482866043613708, + "grad_norm": 2.175442832799032, + "learning_rate": 9.570856760662888e-07, + "loss": 0.9159, + "step": 931 + }, + { + "epoch": 1.5499480789200415, + "grad_norm": 2.0911533410140652, + "learning_rate": 9.553352138448365e-07, + "loss": 0.9862, + "step": 932 + }, + { + "epoch": 1.5516095534787122, + "grad_norm": 2.4217133148552907, + "learning_rate": 9.535848887444925e-07, + "loss": 1.0448, + "step": 933 + }, + { + "epoch": 1.5532710280373832, + "grad_norm": 2.156898906325239, + "learning_rate": 9.518347061387627e-07, + "loss": 1.0456, + "step": 934 + }, + { + "epoch": 1.554932502596054, + "grad_norm": 2.1174530253263852, + "learning_rate": 9.500846714007168e-07, + "loss": 0.9817, + "step": 935 + }, + { + "epoch": 1.5565939771547248, + "grad_norm": 2.2295767742183337, + "learning_rate": 9.483347899029695e-07, + "loss": 1.1012, + "step": 936 + }, + { + "epoch": 1.5582554517133955, + "grad_norm": 2.045078620592537, + "learning_rate": 9.465850670176653e-07, + "loss": 0.8389, + "step": 937 + }, + { + "epoch": 1.5599169262720665, + "grad_norm": 2.2882936479206313, + "learning_rate": 9.44835508116462e-07, + "loss": 0.9324, + "step": 938 + }, + { + "epoch": 1.5615784008307374, + "grad_norm": 2.274529967263343, + "learning_rate": 9.430861185705137e-07, + "loss": 1.0327, + "step": 939 + }, + { + "epoch": 1.5632398753894081, + "grad_norm": 2.1598121426869565, + "learning_rate": 9.41336903750455e-07, + "loss": 0.9787, + "step": 940 + }, + { + "epoch": 1.5649013499480788, + "grad_norm": 2.2018912955453733, + "learning_rate": 9.395878690263836e-07, + "loss": 0.9555, + "step": 941 + }, + { + "epoch": 1.5665628245067498, + "grad_norm": 2.089375883352287, + "learning_rate": 9.378390197678447e-07, + "loss": 0.9679, + "step": 942 + }, + { + "epoch": 
1.5682242990654207, + "grad_norm": 2.203509678408647, + "learning_rate": 9.360903613438137e-07, + "loss": 0.9715, + "step": 943 + }, + { + "epoch": 1.5698857736240914, + "grad_norm": 2.1347102317454367, + "learning_rate": 9.343418991226803e-07, + "loss": 0.7945, + "step": 944 + }, + { + "epoch": 1.5715472481827621, + "grad_norm": 2.378865551913575, + "learning_rate": 9.325936384722321e-07, + "loss": 0.9477, + "step": 945 + }, + { + "epoch": 1.573208722741433, + "grad_norm": 2.4467581458666428, + "learning_rate": 9.308455847596377e-07, + "loss": 1.0578, + "step": 946 + }, + { + "epoch": 1.5748701973001038, + "grad_norm": 2.203361322184244, + "learning_rate": 9.290977433514305e-07, + "loss": 0.884, + "step": 947 + }, + { + "epoch": 1.5765316718587745, + "grad_norm": 2.1727119561544597, + "learning_rate": 9.273501196134914e-07, + "loss": 0.9208, + "step": 948 + }, + { + "epoch": 1.5781931464174455, + "grad_norm": 2.3340178077448837, + "learning_rate": 9.256027189110344e-07, + "loss": 0.9412, + "step": 949 + }, + { + "epoch": 1.5798546209761164, + "grad_norm": 2.2245206192765807, + "learning_rate": 9.23855546608588e-07, + "loss": 0.9677, + "step": 950 + }, + { + "epoch": 1.5815160955347871, + "grad_norm": 2.3713993791607786, + "learning_rate": 9.221086080699792e-07, + "loss": 0.8594, + "step": 951 + }, + { + "epoch": 1.5831775700934578, + "grad_norm": 2.30874801141752, + "learning_rate": 9.203619086583178e-07, + "loss": 1.1003, + "step": 952 + }, + { + "epoch": 1.5848390446521288, + "grad_norm": 2.3037415512544492, + "learning_rate": 9.186154537359794e-07, + "loss": 0.991, + "step": 953 + }, + { + "epoch": 1.5865005192107997, + "grad_norm": 2.360666046738533, + "learning_rate": 9.168692486645893e-07, + "loss": 0.9452, + "step": 954 + }, + { + "epoch": 1.5881619937694704, + "grad_norm": 2.1674212494443736, + "learning_rate": 9.15123298805005e-07, + "loss": 1.0289, + "step": 955 + }, + { + "epoch": 1.5898234683281411, + "grad_norm": 2.2039402749504693, + "learning_rate": 9.133776095173013e-07, + "loss": 0.8989, + "step": 956 + }, + { + "epoch": 1.591484942886812, + "grad_norm": 2.3319779317051994, + "learning_rate": 9.116321861607523e-07, + "loss": 1.0259, + "step": 957 + }, + { + "epoch": 1.593146417445483, + "grad_norm": 2.397971695824514, + "learning_rate": 9.098870340938168e-07, + "loss": 0.9254, + "step": 958 + }, + { + "epoch": 1.5948078920041537, + "grad_norm": 2.1189888494560956, + "learning_rate": 9.081421586741188e-07, + "loss": 0.9747, + "step": 959 + }, + { + "epoch": 1.5964693665628245, + "grad_norm": 2.4628203972860994, + "learning_rate": 9.063975652584357e-07, + "loss": 1.1281, + "step": 960 + }, + { + "epoch": 1.5981308411214954, + "grad_norm": 2.2290027206829452, + "learning_rate": 9.046532592026768e-07, + "loss": 1.0188, + "step": 961 + }, + { + "epoch": 1.599792315680166, + "grad_norm": 2.101837850760124, + "learning_rate": 9.029092458618705e-07, + "loss": 1.0343, + "step": 962 + }, + { + "epoch": 1.6014537902388368, + "grad_norm": 2.1955065161912586, + "learning_rate": 9.011655305901457e-07, + "loss": 0.9229, + "step": 963 + }, + { + "epoch": 1.6031152647975078, + "grad_norm": 2.208465293699187, + "learning_rate": 8.994221187407167e-07, + "loss": 0.9811, + "step": 964 + }, + { + "epoch": 1.6047767393561787, + "grad_norm": 2.189756306702255, + "learning_rate": 8.976790156658665e-07, + "loss": 0.9948, + "step": 965 + }, + { + "epoch": 1.6064382139148494, + "grad_norm": 2.21101125223709, + "learning_rate": 8.959362267169299e-07, + "loss": 0.9206, + "step": 966 + }, + { + 
"epoch": 1.6080996884735201, + "grad_norm": 2.1552977090716476, + "learning_rate": 8.941937572442773e-07, + "loss": 0.9684, + "step": 967 + }, + { + "epoch": 1.609761163032191, + "grad_norm": 2.0424533004826593, + "learning_rate": 8.924516125972983e-07, + "loss": 0.9515, + "step": 968 + }, + { + "epoch": 1.611422637590862, + "grad_norm": 2.2773490170655437, + "learning_rate": 8.907097981243851e-07, + "loss": 1.0617, + "step": 969 + }, + { + "epoch": 1.6130841121495327, + "grad_norm": 2.219673265679085, + "learning_rate": 8.88968319172917e-07, + "loss": 1.0573, + "step": 970 + }, + { + "epoch": 1.6147455867082035, + "grad_norm": 2.23001657969037, + "learning_rate": 8.872271810892424e-07, + "loss": 1.0252, + "step": 971 + }, + { + "epoch": 1.6164070612668744, + "grad_norm": 2.2328021310208968, + "learning_rate": 8.854863892186639e-07, + "loss": 0.9826, + "step": 972 + }, + { + "epoch": 1.6180685358255453, + "grad_norm": 2.5771699055938475, + "learning_rate": 8.837459489054203e-07, + "loss": 0.9947, + "step": 973 + }, + { + "epoch": 1.619730010384216, + "grad_norm": 2.798440468259025, + "learning_rate": 8.820058654926725e-07, + "loss": 0.9452, + "step": 974 + }, + { + "epoch": 1.6213914849428868, + "grad_norm": 2.1504820505748325, + "learning_rate": 8.802661443224844e-07, + "loss": 1.0738, + "step": 975 + }, + { + "epoch": 1.6230529595015577, + "grad_norm": 2.133788375605966, + "learning_rate": 8.785267907358084e-07, + "loss": 1.0597, + "step": 976 + }, + { + "epoch": 1.6247144340602284, + "grad_norm": 2.1928865638126434, + "learning_rate": 8.767878100724688e-07, + "loss": 0.9914, + "step": 977 + }, + { + "epoch": 1.6263759086188991, + "grad_norm": 2.67304969048529, + "learning_rate": 8.750492076711439e-07, + "loss": 0.9834, + "step": 978 + }, + { + "epoch": 1.62803738317757, + "grad_norm": 2.242441368839289, + "learning_rate": 8.73310988869352e-07, + "loss": 0.9394, + "step": 979 + }, + { + "epoch": 1.629698857736241, + "grad_norm": 2.155656747647904, + "learning_rate": 8.715731590034329e-07, + "loss": 1.083, + "step": 980 + }, + { + "epoch": 1.6313603322949117, + "grad_norm": 2.1738442975670798, + "learning_rate": 8.698357234085327e-07, + "loss": 0.937, + "step": 981 + }, + { + "epoch": 1.6330218068535824, + "grad_norm": 2.1949307066634187, + "learning_rate": 8.680986874185872e-07, + "loss": 0.9225, + "step": 982 + }, + { + "epoch": 1.6346832814122534, + "grad_norm": 2.209794807763499, + "learning_rate": 8.663620563663046e-07, + "loss": 0.9265, + "step": 983 + }, + { + "epoch": 1.6363447559709243, + "grad_norm": 2.27946429282308, + "learning_rate": 8.646258355831513e-07, + "loss": 0.9482, + "step": 984 + }, + { + "epoch": 1.638006230529595, + "grad_norm": 2.4532997949631086, + "learning_rate": 8.628900303993334e-07, + "loss": 1.0124, + "step": 985 + }, + { + "epoch": 1.6396677050882658, + "grad_norm": 2.0841134326683264, + "learning_rate": 8.611546461437808e-07, + "loss": 1.0067, + "step": 986 + }, + { + "epoch": 1.6413291796469367, + "grad_norm": 2.0640910431827395, + "learning_rate": 8.594196881441314e-07, + "loss": 0.7401, + "step": 987 + }, + { + "epoch": 1.6429906542056076, + "grad_norm": 2.442342265002308, + "learning_rate": 8.576851617267149e-07, + "loss": 0.9464, + "step": 988 + }, + { + "epoch": 1.6446521287642781, + "grad_norm": 2.2202450152134725, + "learning_rate": 8.559510722165359e-07, + "loss": 0.9864, + "step": 989 + }, + { + "epoch": 1.646313603322949, + "grad_norm": 2.270663471909469, + "learning_rate": 8.542174249372572e-07, + "loss": 1.1029, + "step": 990 + }, + { + 
"epoch": 1.64797507788162, + "grad_norm": 2.2174023151217566, + "learning_rate": 8.524842252111843e-07, + "loss": 0.9365, + "step": 991 + }, + { + "epoch": 1.6496365524402907, + "grad_norm": 2.344777176470885, + "learning_rate": 8.507514783592486e-07, + "loss": 1.014, + "step": 992 + }, + { + "epoch": 1.6512980269989614, + "grad_norm": 2.422227525332542, + "learning_rate": 8.490191897009915e-07, + "loss": 0.979, + "step": 993 + }, + { + "epoch": 1.6529595015576324, + "grad_norm": 2.258033290502269, + "learning_rate": 8.472873645545474e-07, + "loss": 1.0524, + "step": 994 + }, + { + "epoch": 1.6546209761163033, + "grad_norm": 2.303002200199465, + "learning_rate": 8.45556008236628e-07, + "loss": 0.933, + "step": 995 + }, + { + "epoch": 1.656282450674974, + "grad_norm": 2.2109316644818495, + "learning_rate": 8.438251260625055e-07, + "loss": 0.9229, + "step": 996 + }, + { + "epoch": 1.6579439252336448, + "grad_norm": 2.2713621854438357, + "learning_rate": 8.420947233459962e-07, + "loss": 0.997, + "step": 997 + }, + { + "epoch": 1.6596053997923157, + "grad_norm": 2.1886965102085965, + "learning_rate": 8.403648053994447e-07, + "loss": 0.9646, + "step": 998 + }, + { + "epoch": 1.6612668743509866, + "grad_norm": 2.3286768659520765, + "learning_rate": 8.386353775337078e-07, + "loss": 1.0782, + "step": 999 + }, + { + "epoch": 1.6629283489096574, + "grad_norm": 2.2131507866973443, + "learning_rate": 8.369064450581372e-07, + "loss": 0.9542, + "step": 1000 + }, + { + "epoch": 1.664589823468328, + "grad_norm": 2.2038565852379777, + "learning_rate": 8.351780132805639e-07, + "loss": 1.009, + "step": 1001 + }, + { + "epoch": 1.666251298026999, + "grad_norm": 2.2824651606443798, + "learning_rate": 8.334500875072817e-07, + "loss": 0.9784, + "step": 1002 + }, + { + "epoch": 1.66791277258567, + "grad_norm": 2.34183081355438, + "learning_rate": 8.317226730430309e-07, + "loss": 1.0175, + "step": 1003 + }, + { + "epoch": 1.6695742471443404, + "grad_norm": 2.1288437772823134, + "learning_rate": 8.299957751909826e-07, + "loss": 0.8847, + "step": 1004 + }, + { + "epoch": 1.6712357217030114, + "grad_norm": 2.4568293153056215, + "learning_rate": 8.282693992527212e-07, + "loss": 0.9493, + "step": 1005 + }, + { + "epoch": 1.6728971962616823, + "grad_norm": 2.320949328340056, + "learning_rate": 8.265435505282292e-07, + "loss": 0.9644, + "step": 1006 + }, + { + "epoch": 1.674558670820353, + "grad_norm": 2.607704387361572, + "learning_rate": 8.248182343158705e-07, + "loss": 0.9348, + "step": 1007 + }, + { + "epoch": 1.6762201453790238, + "grad_norm": 2.5655569259629742, + "learning_rate": 8.230934559123739e-07, + "loss": 1.1464, + "step": 1008 + }, + { + "epoch": 1.6778816199376947, + "grad_norm": 2.2568110375917105, + "learning_rate": 8.213692206128178e-07, + "loss": 1.0082, + "step": 1009 + }, + { + "epoch": 1.6795430944963656, + "grad_norm": 2.329387559326929, + "learning_rate": 8.196455337106126e-07, + "loss": 1.0236, + "step": 1010 + }, + { + "epoch": 1.6812045690550363, + "grad_norm": 2.1931737355029477, + "learning_rate": 8.179224004974856e-07, + "loss": 0.9806, + "step": 1011 + }, + { + "epoch": 1.682866043613707, + "grad_norm": 2.7512987063931384, + "learning_rate": 8.161998262634636e-07, + "loss": 1.0763, + "step": 1012 + }, + { + "epoch": 1.684527518172378, + "grad_norm": 2.3447221138047123, + "learning_rate": 8.144778162968583e-07, + "loss": 0.9142, + "step": 1013 + }, + { + "epoch": 1.686188992731049, + "grad_norm": 2.1872618111159152, + "learning_rate": 8.127563758842483e-07, + "loss": 0.9387, + "step": 1014 
+ }, + { + "epoch": 1.6878504672897197, + "grad_norm": 2.2470446091718, + "learning_rate": 8.11035510310464e-07, + "loss": 0.9224, + "step": 1015 + }, + { + "epoch": 1.6895119418483904, + "grad_norm": 2.5029397601882417, + "learning_rate": 8.093152248585709e-07, + "loss": 0.9544, + "step": 1016 + }, + { + "epoch": 1.6911734164070613, + "grad_norm": 2.1902389788861547, + "learning_rate": 8.075955248098535e-07, + "loss": 0.9536, + "step": 1017 + }, + { + "epoch": 1.6928348909657323, + "grad_norm": 2.0810637251240225, + "learning_rate": 8.058764154437996e-07, + "loss": 1.0527, + "step": 1018 + }, + { + "epoch": 1.6944963655244027, + "grad_norm": 2.3552600183690227, + "learning_rate": 8.041579020380828e-07, + "loss": 0.9047, + "step": 1019 + }, + { + "epoch": 1.6961578400830737, + "grad_norm": 2.434839618311, + "learning_rate": 8.024399898685478e-07, + "loss": 0.9539, + "step": 1020 + }, + { + "epoch": 1.6978193146417446, + "grad_norm": 2.2927568687484237, + "learning_rate": 8.007226842091929e-07, + "loss": 0.9926, + "step": 1021 + }, + { + "epoch": 1.6994807892004153, + "grad_norm": 2.3228074078887775, + "learning_rate": 7.990059903321552e-07, + "loss": 1.0054, + "step": 1022 + }, + { + "epoch": 1.701142263759086, + "grad_norm": 2.110477821804407, + "learning_rate": 7.972899135076928e-07, + "loss": 0.919, + "step": 1023 + }, + { + "epoch": 1.702803738317757, + "grad_norm": 2.153197204937706, + "learning_rate": 7.9557445900417e-07, + "loss": 0.9917, + "step": 1024 + }, + { + "epoch": 1.704465212876428, + "grad_norm": 2.215029615261696, + "learning_rate": 7.938596320880401e-07, + "loss": 0.7933, + "step": 1025 + }, + { + "epoch": 1.7061266874350987, + "grad_norm": 2.3403292049764666, + "learning_rate": 7.9214543802383e-07, + "loss": 0.9624, + "step": 1026 + }, + { + "epoch": 1.7077881619937694, + "grad_norm": 2.1684163037426183, + "learning_rate": 7.904318820741238e-07, + "loss": 0.9629, + "step": 1027 + }, + { + "epoch": 1.7094496365524403, + "grad_norm": 2.263783250114826, + "learning_rate": 7.887189694995464e-07, + "loss": 0.8771, + "step": 1028 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.3185082936393657, + "learning_rate": 7.87006705558748e-07, + "loss": 0.9736, + "step": 1029 + }, + { + "epoch": 1.712772585669782, + "grad_norm": 2.1436040295655907, + "learning_rate": 7.85295095508387e-07, + "loss": 0.9073, + "step": 1030 + }, + { + "epoch": 1.7144340602284527, + "grad_norm": 2.31861818593136, + "learning_rate": 7.835841446031143e-07, + "loss": 0.8523, + "step": 1031 + }, + { + "epoch": 1.7160955347871236, + "grad_norm": 2.2422376300228946, + "learning_rate": 7.818738580955575e-07, + "loss": 0.9703, + "step": 1032 + }, + { + "epoch": 1.7177570093457943, + "grad_norm": 2.2643194712270307, + "learning_rate": 7.801642412363041e-07, + "loss": 1.0062, + "step": 1033 + }, + { + "epoch": 1.719418483904465, + "grad_norm": 2.237623999857119, + "learning_rate": 7.784552992738866e-07, + "loss": 0.8801, + "step": 1034 + }, + { + "epoch": 1.721079958463136, + "grad_norm": 2.282669507043474, + "learning_rate": 7.767470374547646e-07, + "loss": 1.0902, + "step": 1035 + }, + { + "epoch": 1.722741433021807, + "grad_norm": 2.2928400993422704, + "learning_rate": 7.750394610233105e-07, + "loss": 0.8845, + "step": 1036 + }, + { + "epoch": 1.7244029075804777, + "grad_norm": 2.198326988103989, + "learning_rate": 7.733325752217916e-07, + "loss": 0.9141, + "step": 1037 + }, + { + "epoch": 1.7260643821391484, + "grad_norm": 2.2329846089880716, + "learning_rate": 7.716263852903561e-07, + "loss": 0.8736, 
+ "step": 1038 + }, + { + "epoch": 1.7277258566978193, + "grad_norm": 2.1112717332498416, + "learning_rate": 7.699208964670148e-07, + "loss": 0.9301, + "step": 1039 + }, + { + "epoch": 1.7293873312564902, + "grad_norm": 2.3471201925073943, + "learning_rate": 7.68216113987627e-07, + "loss": 1.1426, + "step": 1040 + }, + { + "epoch": 1.731048805815161, + "grad_norm": 2.2076292469918837, + "learning_rate": 7.665120430858828e-07, + "loss": 1.055, + "step": 1041 + }, + { + "epoch": 1.7327102803738317, + "grad_norm": 2.2641331824435107, + "learning_rate": 7.648086889932878e-07, + "loss": 0.9148, + "step": 1042 + }, + { + "epoch": 1.7343717549325026, + "grad_norm": 2.011124865835, + "learning_rate": 7.631060569391481e-07, + "loss": 0.9435, + "step": 1043 + }, + { + "epoch": 1.7360332294911736, + "grad_norm": 2.5541303942545523, + "learning_rate": 7.614041521505517e-07, + "loss": 1.051, + "step": 1044 + }, + { + "epoch": 1.7376947040498443, + "grad_norm": 2.135958705290272, + "learning_rate": 7.597029798523544e-07, + "loss": 0.8675, + "step": 1045 + }, + { + "epoch": 1.739356178608515, + "grad_norm": 2.1710241061601208, + "learning_rate": 7.580025452671635e-07, + "loss": 0.9826, + "step": 1046 + }, + { + "epoch": 1.741017653167186, + "grad_norm": 2.108624585141848, + "learning_rate": 7.563028536153212e-07, + "loss": 0.9617, + "step": 1047 + }, + { + "epoch": 1.7426791277258566, + "grad_norm": 2.106050127640713, + "learning_rate": 7.546039101148895e-07, + "loss": 1.0632, + "step": 1048 + }, + { + "epoch": 1.7443406022845274, + "grad_norm": 2.187342834226081, + "learning_rate": 7.529057199816325e-07, + "loss": 1.0858, + "step": 1049 + }, + { + "epoch": 1.7460020768431983, + "grad_norm": 2.342073364970068, + "learning_rate": 7.512082884290025e-07, + "loss": 0.9126, + "step": 1050 + }, + { + "epoch": 1.7476635514018692, + "grad_norm": 2.3328737377816613, + "learning_rate": 7.495116206681222e-07, + "loss": 0.9681, + "step": 1051 + }, + { + "epoch": 1.74932502596054, + "grad_norm": 2.149584367750816, + "learning_rate": 7.478157219077702e-07, + "loss": 1.0266, + "step": 1052 + }, + { + "epoch": 1.7509865005192107, + "grad_norm": 2.4000332654502867, + "learning_rate": 7.461205973543635e-07, + "loss": 0.9125, + "step": 1053 + }, + { + "epoch": 1.7526479750778816, + "grad_norm": 2.2222345997916446, + "learning_rate": 7.444262522119427e-07, + "loss": 0.9607, + "step": 1054 + }, + { + "epoch": 1.7543094496365526, + "grad_norm": 2.2094198456747254, + "learning_rate": 7.427326916821557e-07, + "loss": 1.0043, + "step": 1055 + }, + { + "epoch": 1.7559709241952233, + "grad_norm": 2.1757968861860615, + "learning_rate": 7.410399209642409e-07, + "loss": 0.8917, + "step": 1056 + }, + { + "epoch": 1.757632398753894, + "grad_norm": 2.493378250255824, + "learning_rate": 7.393479452550132e-07, + "loss": 1.0598, + "step": 1057 + }, + { + "epoch": 1.757632398753894, + "eval_loss": 1.3888019323349, + "eval_runtime": 24.567, + "eval_samples_per_second": 0.448, + "eval_steps_per_second": 0.122, + "step": 1057 + } + ], + "logging_steps": 1, + "max_steps": 1803, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 151, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.775024226258125e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}