{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 408768, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.4115859270095825, "learning_rate": 4.993884061374668e-05, "loss": 7.5674, "step": 500 }, { "epoch": 0.01, "grad_norm": 1.173668384552002, "learning_rate": 4.987768122749335e-05, "loss": 7.1474, "step": 1000 }, { "epoch": 0.01, "grad_norm": 1.1809221506118774, "learning_rate": 4.981652184124002e-05, "loss": 7.0218, "step": 1500 }, { "epoch": 0.01, "grad_norm": 1.898521065711975, "learning_rate": 4.975536245498669e-05, "loss": 6.9342, "step": 2000 }, { "epoch": 0.02, "grad_norm": 2.6537320613861084, "learning_rate": 4.969420306873337e-05, "loss": 6.8595, "step": 2500 }, { "epoch": 0.02, "grad_norm": 1.9779026508331299, "learning_rate": 4.963304368248004e-05, "loss": 6.8074, "step": 3000 }, { "epoch": 0.03, "grad_norm": 3.4129042625427246, "learning_rate": 4.957188429622671e-05, "loss": 6.7628, "step": 3500 }, { "epoch": 0.03, "grad_norm": 2.0669405460357666, "learning_rate": 4.951072490997339e-05, "loss": 6.7217, "step": 4000 }, { "epoch": 0.03, "grad_norm": 2.6271133422851562, "learning_rate": 4.944956552372006e-05, "loss": 6.6808, "step": 4500 }, { "epoch": 0.04, "grad_norm": 2.944228410720825, "learning_rate": 4.938840613746673e-05, "loss": 6.6526, "step": 5000 }, { "epoch": 0.04, "grad_norm": 1.9261572360992432, "learning_rate": 4.9327246751213406e-05, "loss": 6.6157, "step": 5500 }, { "epoch": 0.04, "grad_norm": 2.7971737384796143, "learning_rate": 4.926608736496008e-05, "loss": 6.585, "step": 6000 }, { "epoch": 0.05, "grad_norm": 2.061311960220337, "learning_rate": 4.920492797870675e-05, "loss": 6.5625, "step": 6500 }, { "epoch": 0.05, "grad_norm": 1.9171266555786133, "learning_rate": 4.914376859245342e-05, "loss": 6.5476, "step": 7000 }, { "epoch": 0.06, "grad_norm": 2.557237386703491, "learning_rate": 4.9082609206200096e-05, "loss": 6.5063, "step": 7500 }, { "epoch": 0.06, "grad_norm": 1.9273940324783325, "learning_rate": 4.902144981994677e-05, "loss": 6.4848, "step": 8000 }, { "epoch": 0.06, "grad_norm": 2.46166729927063, "learning_rate": 4.896029043369344e-05, "loss": 6.4549, "step": 8500 }, { "epoch": 0.07, "grad_norm": 2.1991207599639893, "learning_rate": 4.889913104744012e-05, "loss": 6.4151, "step": 9000 }, { "epoch": 0.07, "grad_norm": 3.0304417610168457, "learning_rate": 4.8837971661186786e-05, "loss": 6.3421, "step": 9500 }, { "epoch": 0.07, "grad_norm": 2.5440313816070557, "learning_rate": 4.877681227493346e-05, "loss": 6.2588, "step": 10000 }, { "epoch": 0.08, "grad_norm": 2.8865408897399902, "learning_rate": 4.8715652888680135e-05, "loss": 6.1081, "step": 10500 }, { "epoch": 0.08, "grad_norm": 3.689366579055786, "learning_rate": 4.865449350242681e-05, "loss": 5.965, "step": 11000 }, { "epoch": 0.08, "grad_norm": 3.3553738594055176, "learning_rate": 4.8593334116173477e-05, "loss": 5.8321, "step": 11500 }, { "epoch": 0.09, "grad_norm": 3.879040479660034, "learning_rate": 4.853217472992016e-05, "loss": 5.7097, "step": 12000 }, { "epoch": 0.09, "grad_norm": 3.504615306854248, "learning_rate": 4.8471015343666825e-05, "loss": 5.5818, "step": 12500 }, { "epoch": 0.1, "grad_norm": 3.884676218032837, "learning_rate": 4.84098559574135e-05, "loss": 5.4549, "step": 13000 }, { "epoch": 0.1, "grad_norm": 3.560129165649414, "learning_rate": 4.8348696571160174e-05, "loss": 5.3163, "step": 13500 }, { "epoch": 0.1, "grad_norm": 3.542060613632202, "learning_rate": 4.828753718490685e-05, "loss": 5.1749, "step": 14000 }, { "epoch": 0.11, "grad_norm": 3.485215425491333, "learning_rate": 4.8226377798653515e-05, "loss": 5.0056, "step": 14500 }, { "epoch": 0.11, "grad_norm": 3.323887586593628, "learning_rate": 4.816521841240019e-05, "loss": 4.8195, "step": 15000 }, { "epoch": 0.11, "grad_norm": 3.07159161567688, "learning_rate": 4.8104059026146864e-05, "loss": 4.6421, "step": 15500 }, { "epoch": 0.12, "grad_norm": 3.2840936183929443, "learning_rate": 4.804289963989354e-05, "loss": 4.475, "step": 16000 }, { "epoch": 0.12, "grad_norm": 2.858837366104126, "learning_rate": 4.798174025364021e-05, "loss": 4.3387, "step": 16500 }, { "epoch": 0.12, "grad_norm": 2.913006544113159, "learning_rate": 4.792058086738688e-05, "loss": 4.2142, "step": 17000 }, { "epoch": 0.13, "grad_norm": 2.8114237785339355, "learning_rate": 4.7859421481133554e-05, "loss": 4.116, "step": 17500 }, { "epoch": 0.13, "grad_norm": 3.1581108570098877, "learning_rate": 4.779826209488023e-05, "loss": 4.0327, "step": 18000 }, { "epoch": 0.14, "grad_norm": 2.9168930053710938, "learning_rate": 4.77371027086269e-05, "loss": 3.9404, "step": 18500 }, { "epoch": 0.14, "grad_norm": 2.8912034034729004, "learning_rate": 4.767594332237357e-05, "loss": 3.874, "step": 19000 }, { "epoch": 0.14, "grad_norm": 2.909111976623535, "learning_rate": 4.7614783936120244e-05, "loss": 3.8014, "step": 19500 }, { "epoch": 0.15, "grad_norm": 3.0821359157562256, "learning_rate": 4.755362454986692e-05, "loss": 3.7285, "step": 20000 }, { "epoch": 0.15, "grad_norm": 2.603447675704956, "learning_rate": 4.749246516361359e-05, "loss": 3.6757, "step": 20500 }, { "epoch": 0.15, "grad_norm": 2.466923952102661, "learning_rate": 4.743130577736026e-05, "loss": 3.6054, "step": 21000 }, { "epoch": 0.16, "grad_norm": 2.949532985687256, "learning_rate": 4.737014639110694e-05, "loss": 3.5478, "step": 21500 }, { "epoch": 0.16, "grad_norm": 2.7499911785125732, "learning_rate": 4.730898700485361e-05, "loss": 3.5018, "step": 22000 }, { "epoch": 0.17, "grad_norm": 2.8840396404266357, "learning_rate": 4.724782761860028e-05, "loss": 3.4503, "step": 22500 }, { "epoch": 0.17, "grad_norm": 2.7901082038879395, "learning_rate": 4.718666823234696e-05, "loss": 3.385, "step": 23000 }, { "epoch": 0.17, "grad_norm": 2.499488353729248, "learning_rate": 4.712550884609363e-05, "loss": 3.3385, "step": 23500 }, { "epoch": 0.18, "grad_norm": 2.77323055267334, "learning_rate": 4.70643494598403e-05, "loss": 3.2953, "step": 24000 }, { "epoch": 0.18, "grad_norm": 3.0041913986206055, "learning_rate": 4.700319007358698e-05, "loss": 3.2472, "step": 24500 }, { "epoch": 0.18, "grad_norm": 2.72615909576416, "learning_rate": 4.694203068733365e-05, "loss": 3.2107, "step": 25000 }, { "epoch": 0.19, "grad_norm": 2.766868829727173, "learning_rate": 4.688087130108032e-05, "loss": 3.1768, "step": 25500 }, { "epoch": 0.19, "grad_norm": 2.5954720973968506, "learning_rate": 4.6819711914826996e-05, "loss": 3.1358, "step": 26000 }, { "epoch": 0.19, "grad_norm": 2.7101314067840576, "learning_rate": 4.675855252857367e-05, "loss": 3.0949, "step": 26500 }, { "epoch": 0.2, "grad_norm": 2.7481093406677246, "learning_rate": 4.669739314232034e-05, "loss": 3.0655, "step": 27000 }, { "epoch": 0.2, "grad_norm": 2.869677782058716, "learning_rate": 4.663623375606702e-05, "loss": 3.0188, "step": 27500 }, { "epoch": 0.21, "grad_norm": 3.081693649291992, "learning_rate": 4.6575074369813686e-05, "loss": 2.9893, "step": 28000 }, { "epoch": 0.21, "grad_norm": 3.1503679752349854, "learning_rate": 4.651391498356036e-05, "loss": 2.975, "step": 28500 }, { "epoch": 0.21, "grad_norm": 2.9327192306518555, "learning_rate": 4.645275559730703e-05, "loss": 2.9276, "step": 29000 }, { "epoch": 0.22, "grad_norm": 2.580777406692505, "learning_rate": 4.639159621105371e-05, "loss": 2.9062, "step": 29500 }, { "epoch": 0.22, "grad_norm": 2.6058924198150635, "learning_rate": 4.633043682480038e-05, "loss": 2.8895, "step": 30000 }, { "epoch": 0.22, "grad_norm": 2.9445390701293945, "learning_rate": 4.626927743854705e-05, "loss": 2.8634, "step": 30500 }, { "epoch": 0.23, "grad_norm": 3.4329161643981934, "learning_rate": 4.6208118052293725e-05, "loss": 2.8399, "step": 31000 }, { "epoch": 0.23, "grad_norm": 2.910855770111084, "learning_rate": 4.61469586660404e-05, "loss": 2.8167, "step": 31500 }, { "epoch": 0.23, "grad_norm": 2.9852588176727295, "learning_rate": 4.608579927978707e-05, "loss": 2.8001, "step": 32000 }, { "epoch": 0.24, "grad_norm": 2.6188220977783203, "learning_rate": 4.602463989353374e-05, "loss": 2.7929, "step": 32500 }, { "epoch": 0.24, "grad_norm": 2.753516912460327, "learning_rate": 4.5963480507280416e-05, "loss": 2.7677, "step": 33000 }, { "epoch": 0.25, "grad_norm": 2.5995850563049316, "learning_rate": 4.590232112102709e-05, "loss": 2.7399, "step": 33500 }, { "epoch": 0.25, "grad_norm": 2.697634696960449, "learning_rate": 4.5841161734773764e-05, "loss": 2.7241, "step": 34000 }, { "epoch": 0.25, "grad_norm": 3.0841758251190186, "learning_rate": 4.578000234852043e-05, "loss": 2.7143, "step": 34500 }, { "epoch": 0.26, "grad_norm": 3.0621519088745117, "learning_rate": 4.5718842962267106e-05, "loss": 2.6897, "step": 35000 }, { "epoch": 0.26, "grad_norm": 2.912416934967041, "learning_rate": 4.565768357601378e-05, "loss": 2.6762, "step": 35500 }, { "epoch": 0.26, "grad_norm": 2.95345401763916, "learning_rate": 4.5596524189760454e-05, "loss": 2.6708, "step": 36000 }, { "epoch": 0.27, "grad_norm": 2.7842564582824707, "learning_rate": 4.553536480350712e-05, "loss": 2.6507, "step": 36500 }, { "epoch": 0.27, "grad_norm": 2.9051456451416016, "learning_rate": 4.54742054172538e-05, "loss": 2.6361, "step": 37000 }, { "epoch": 0.28, "grad_norm": 2.8248302936553955, "learning_rate": 4.541304603100047e-05, "loss": 2.619, "step": 37500 }, { "epoch": 0.28, "grad_norm": 3.076840400695801, "learning_rate": 4.5351886644747145e-05, "loss": 2.6011, "step": 38000 }, { "epoch": 0.28, "grad_norm": 2.7716526985168457, "learning_rate": 4.529072725849382e-05, "loss": 2.5928, "step": 38500 }, { "epoch": 0.29, "grad_norm": 2.8269171714782715, "learning_rate": 4.522956787224049e-05, "loss": 2.5813, "step": 39000 }, { "epoch": 0.29, "grad_norm": 3.041726589202881, "learning_rate": 4.516840848598716e-05, "loss": 2.5583, "step": 39500 }, { "epoch": 0.29, "grad_norm": 2.676236391067505, "learning_rate": 4.510724909973384e-05, "loss": 2.5538, "step": 40000 }, { "epoch": 0.3, "grad_norm": 2.8164145946502686, "learning_rate": 4.504608971348051e-05, "loss": 2.55, "step": 40500 }, { "epoch": 0.3, "grad_norm": 2.637202024459839, "learning_rate": 4.498493032722718e-05, "loss": 2.5273, "step": 41000 }, { "epoch": 0.3, "grad_norm": 2.9004831314086914, "learning_rate": 4.492377094097385e-05, "loss": 2.5142, "step": 41500 }, { "epoch": 0.31, "grad_norm": 2.7393102645874023, "learning_rate": 4.486261155472053e-05, "loss": 2.5093, "step": 42000 }, { "epoch": 0.31, "grad_norm": 2.814011335372925, "learning_rate": 4.48014521684672e-05, "loss": 2.4936, "step": 42500 }, { "epoch": 0.32, "grad_norm": 2.625415802001953, "learning_rate": 4.4740292782213874e-05, "loss": 2.4787, "step": 43000 }, { "epoch": 0.32, "grad_norm": 2.955406427383423, "learning_rate": 4.467913339596055e-05, "loss": 2.471, "step": 43500 }, { "epoch": 0.32, "grad_norm": 2.6002650260925293, "learning_rate": 4.461797400970722e-05, "loss": 2.4626, "step": 44000 }, { "epoch": 0.33, "grad_norm": 2.75281023979187, "learning_rate": 4.455681462345389e-05, "loss": 2.459, "step": 44500 }, { "epoch": 0.33, "grad_norm": 2.5558836460113525, "learning_rate": 4.449565523720057e-05, "loss": 2.4441, "step": 45000 }, { "epoch": 0.33, "grad_norm": 2.634889602661133, "learning_rate": 4.443449585094724e-05, "loss": 2.4392, "step": 45500 }, { "epoch": 0.34, "grad_norm": 3.015256643295288, "learning_rate": 4.437333646469391e-05, "loss": 2.4218, "step": 46000 }, { "epoch": 0.34, "grad_norm": 2.656592607498169, "learning_rate": 4.431217707844059e-05, "loss": 2.4148, "step": 46500 }, { "epoch": 0.34, "grad_norm": 2.9560022354125977, "learning_rate": 4.425101769218726e-05, "loss": 2.4074, "step": 47000 }, { "epoch": 0.35, "grad_norm": 2.8009862899780273, "learning_rate": 4.418985830593393e-05, "loss": 2.4028, "step": 47500 }, { "epoch": 0.35, "grad_norm": 2.7291080951690674, "learning_rate": 4.41286989196806e-05, "loss": 2.3874, "step": 48000 }, { "epoch": 0.36, "grad_norm": 2.959327220916748, "learning_rate": 4.406753953342728e-05, "loss": 2.3803, "step": 48500 }, { "epoch": 0.36, "grad_norm": 2.4076898097991943, "learning_rate": 4.400638014717395e-05, "loss": 2.3641, "step": 49000 }, { "epoch": 0.36, "grad_norm": 2.703214168548584, "learning_rate": 4.3945220760920625e-05, "loss": 2.3707, "step": 49500 }, { "epoch": 0.37, "grad_norm": 2.7057530879974365, "learning_rate": 4.388406137466729e-05, "loss": 2.3551, "step": 50000 }, { "epoch": 0.37, "grad_norm": 2.656576156616211, "learning_rate": 4.382290198841397e-05, "loss": 2.3501, "step": 50500 }, { "epoch": 0.37, "grad_norm": 2.7500181198120117, "learning_rate": 4.376174260216064e-05, "loss": 2.342, "step": 51000 }, { "epoch": 0.38, "grad_norm": 2.560018301010132, "learning_rate": 4.3700583215907316e-05, "loss": 2.3378, "step": 51500 }, { "epoch": 0.38, "grad_norm": 2.5425586700439453, "learning_rate": 4.363942382965398e-05, "loss": 2.3263, "step": 52000 }, { "epoch": 0.39, "grad_norm": 2.7227046489715576, "learning_rate": 4.357826444340066e-05, "loss": 2.3216, "step": 52500 }, { "epoch": 0.39, "grad_norm": 2.8094546794891357, "learning_rate": 4.351710505714733e-05, "loss": 2.3168, "step": 53000 }, { "epoch": 0.39, "grad_norm": 2.559802293777466, "learning_rate": 4.3455945670894006e-05, "loss": 2.3129, "step": 53500 }, { "epoch": 0.4, "grad_norm": 2.9485251903533936, "learning_rate": 4.3394786284640673e-05, "loss": 2.3055, "step": 54000 }, { "epoch": 0.4, "grad_norm": 2.554258346557617, "learning_rate": 4.3333626898387355e-05, "loss": 2.2897, "step": 54500 }, { "epoch": 0.4, "grad_norm": 2.9118120670318604, "learning_rate": 4.327246751213402e-05, "loss": 2.2804, "step": 55000 }, { "epoch": 0.41, "grad_norm": 2.6395227909088135, "learning_rate": 4.3211308125880696e-05, "loss": 2.285, "step": 55500 }, { "epoch": 0.41, "grad_norm": 2.8277575969696045, "learning_rate": 4.315014873962737e-05, "loss": 2.2725, "step": 56000 }, { "epoch": 0.41, "grad_norm": 2.6950581073760986, "learning_rate": 4.3088989353374045e-05, "loss": 2.2726, "step": 56500 }, { "epoch": 0.42, "grad_norm": 2.5823190212249756, "learning_rate": 4.302782996712071e-05, "loss": 2.2676, "step": 57000 }, { "epoch": 0.42, "grad_norm": 2.9325387477874756, "learning_rate": 4.296667058086739e-05, "loss": 2.25, "step": 57500 }, { "epoch": 0.43, "grad_norm": 2.915308952331543, "learning_rate": 4.290551119461406e-05, "loss": 2.237, "step": 58000 }, { "epoch": 0.43, "grad_norm": 2.782322883605957, "learning_rate": 4.2844351808360735e-05, "loss": 2.2435, "step": 58500 }, { "epoch": 0.43, "grad_norm": 2.8868165016174316, "learning_rate": 4.278319242210741e-05, "loss": 2.2282, "step": 59000 }, { "epoch": 0.44, "grad_norm": 2.906238079071045, "learning_rate": 4.2722033035854084e-05, "loss": 2.2278, "step": 59500 }, { "epoch": 0.44, "grad_norm": 2.8209023475646973, "learning_rate": 4.266087364960075e-05, "loss": 2.2212, "step": 60000 }, { "epoch": 0.44, "grad_norm": 2.944169044494629, "learning_rate": 4.259971426334743e-05, "loss": 2.2107, "step": 60500 }, { "epoch": 0.45, "grad_norm": 3.010463237762451, "learning_rate": 4.25385548770941e-05, "loss": 2.2112, "step": 61000 }, { "epoch": 0.45, "grad_norm": 2.5480105876922607, "learning_rate": 4.2477395490840774e-05, "loss": 2.2232, "step": 61500 }, { "epoch": 0.46, "grad_norm": 2.619253635406494, "learning_rate": 4.241623610458745e-05, "loss": 2.1997, "step": 62000 }, { "epoch": 0.46, "grad_norm": 2.716602325439453, "learning_rate": 4.235507671833412e-05, "loss": 2.1967, "step": 62500 }, { "epoch": 0.46, "grad_norm": 2.7304656505584717, "learning_rate": 4.229391733208079e-05, "loss": 2.1865, "step": 63000 }, { "epoch": 0.47, "grad_norm": 2.3193280696868896, "learning_rate": 4.2232757945827464e-05, "loss": 2.1783, "step": 63500 }, { "epoch": 0.47, "grad_norm": 2.659099817276001, "learning_rate": 4.217159855957414e-05, "loss": 2.1669, "step": 64000 }, { "epoch": 0.47, "grad_norm": 2.8396430015563965, "learning_rate": 4.211043917332081e-05, "loss": 2.1794, "step": 64500 }, { "epoch": 0.48, "grad_norm": 2.5535542964935303, "learning_rate": 4.204927978706748e-05, "loss": 2.1688, "step": 65000 }, { "epoch": 0.48, "grad_norm": 2.776742696762085, "learning_rate": 4.1988120400814154e-05, "loss": 2.1661, "step": 65500 }, { "epoch": 0.48, "grad_norm": 2.4852330684661865, "learning_rate": 4.192696101456083e-05, "loss": 2.1591, "step": 66000 }, { "epoch": 0.49, "grad_norm": 2.8885903358459473, "learning_rate": 4.1865801628307496e-05, "loss": 2.1581, "step": 66500 }, { "epoch": 0.49, "grad_norm": 2.6640567779541016, "learning_rate": 4.180464224205418e-05, "loss": 2.1549, "step": 67000 }, { "epoch": 0.5, "grad_norm": 2.27127742767334, "learning_rate": 4.1743482855800845e-05, "loss": 2.1492, "step": 67500 }, { "epoch": 0.5, "grad_norm": 2.591395139694214, "learning_rate": 4.168232346954752e-05, "loss": 2.1382, "step": 68000 }, { "epoch": 0.5, "grad_norm": 2.8147051334381104, "learning_rate": 4.162116408329419e-05, "loss": 2.1407, "step": 68500 }, { "epoch": 0.51, "grad_norm": 2.650275707244873, "learning_rate": 4.156000469704087e-05, "loss": 2.1345, "step": 69000 }, { "epoch": 0.51, "grad_norm": 2.7816410064697266, "learning_rate": 4.1498845310787535e-05, "loss": 2.1371, "step": 69500 }, { "epoch": 0.51, "grad_norm": 2.6034348011016846, "learning_rate": 4.1437685924534216e-05, "loss": 2.1264, "step": 70000 }, { "epoch": 0.52, "grad_norm": 2.6060938835144043, "learning_rate": 4.1376526538280883e-05, "loss": 2.1186, "step": 70500 }, { "epoch": 0.52, "grad_norm": 2.754519462585449, "learning_rate": 4.131536715202756e-05, "loss": 2.1167, "step": 71000 }, { "epoch": 0.52, "grad_norm": 2.665511131286621, "learning_rate": 4.125420776577423e-05, "loss": 2.1025, "step": 71500 }, { "epoch": 0.53, "grad_norm": 2.8608968257904053, "learning_rate": 4.1193048379520906e-05, "loss": 2.1087, "step": 72000 }, { "epoch": 0.53, "grad_norm": 2.521726369857788, "learning_rate": 4.1131888993267574e-05, "loss": 2.1035, "step": 72500 }, { "epoch": 0.54, "grad_norm": 2.713449001312256, "learning_rate": 4.1070729607014255e-05, "loss": 2.0853, "step": 73000 }, { "epoch": 0.54, "grad_norm": 2.6165554523468018, "learning_rate": 4.100957022076092e-05, "loss": 2.1032, "step": 73500 }, { "epoch": 0.54, "grad_norm": 2.5650808811187744, "learning_rate": 4.0948410834507596e-05, "loss": 2.0823, "step": 74000 }, { "epoch": 0.55, "grad_norm": 2.7067785263061523, "learning_rate": 4.0887251448254264e-05, "loss": 2.0843, "step": 74500 }, { "epoch": 0.55, "grad_norm": 2.6049702167510986, "learning_rate": 4.0826092062000945e-05, "loss": 2.0851, "step": 75000 }, { "epoch": 0.55, "grad_norm": 2.487355947494507, "learning_rate": 4.076493267574761e-05, "loss": 2.0745, "step": 75500 }, { "epoch": 0.56, "grad_norm": 2.4251976013183594, "learning_rate": 4.070377328949429e-05, "loss": 2.0728, "step": 76000 }, { "epoch": 0.56, "grad_norm": 2.6600770950317383, "learning_rate": 4.064261390324096e-05, "loss": 2.0725, "step": 76500 }, { "epoch": 0.57, "grad_norm": 2.488598346710205, "learning_rate": 4.0581454516987635e-05, "loss": 2.0638, "step": 77000 }, { "epoch": 0.57, "grad_norm": 2.8305716514587402, "learning_rate": 4.05202951307343e-05, "loss": 2.0673, "step": 77500 }, { "epoch": 0.57, "grad_norm": 2.607948064804077, "learning_rate": 4.0459135744480984e-05, "loss": 2.0698, "step": 78000 }, { "epoch": 0.58, "grad_norm": 2.558473825454712, "learning_rate": 4.039797635822765e-05, "loss": 2.0531, "step": 78500 }, { "epoch": 0.58, "grad_norm": 2.675361394882202, "learning_rate": 4.0336816971974326e-05, "loss": 2.0568, "step": 79000 }, { "epoch": 0.58, "grad_norm": 2.430924654006958, "learning_rate": 4.0275657585721e-05, "loss": 2.0472, "step": 79500 }, { "epoch": 0.59, "grad_norm": 2.637683153152466, "learning_rate": 4.021449819946767e-05, "loss": 2.0422, "step": 80000 }, { "epoch": 0.59, "grad_norm": 2.8251748085021973, "learning_rate": 4.015333881321434e-05, "loss": 2.0438, "step": 80500 }, { "epoch": 0.59, "grad_norm": 2.9130163192749023, "learning_rate": 4.0092179426961016e-05, "loss": 2.0334, "step": 81000 }, { "epoch": 0.6, "grad_norm": 2.6857762336730957, "learning_rate": 4.003102004070769e-05, "loss": 2.0298, "step": 81500 }, { "epoch": 0.6, "grad_norm": 2.7029411792755127, "learning_rate": 3.996986065445436e-05, "loss": 2.0266, "step": 82000 }, { "epoch": 0.61, "grad_norm": 2.43390154838562, "learning_rate": 3.990870126820104e-05, "loss": 2.0323, "step": 82500 }, { "epoch": 0.61, "grad_norm": 2.8077657222747803, "learning_rate": 3.9847541881947706e-05, "loss": 2.0204, "step": 83000 }, { "epoch": 0.61, "grad_norm": 2.6263062953948975, "learning_rate": 3.978638249569438e-05, "loss": 2.0238, "step": 83500 }, { "epoch": 0.62, "grad_norm": 2.582228660583496, "learning_rate": 3.9725223109441055e-05, "loss": 2.0157, "step": 84000 }, { "epoch": 0.62, "grad_norm": 2.989870548248291, "learning_rate": 3.966406372318773e-05, "loss": 2.0123, "step": 84500 }, { "epoch": 0.62, "grad_norm": 2.50876522064209, "learning_rate": 3.9602904336934396e-05, "loss": 2.0049, "step": 85000 }, { "epoch": 0.63, "grad_norm": 2.754103183746338, "learning_rate": 3.954174495068108e-05, "loss": 2.0108, "step": 85500 }, { "epoch": 0.63, "grad_norm": 2.742558240890503, "learning_rate": 3.9480585564427745e-05, "loss": 2.0013, "step": 86000 }, { "epoch": 0.63, "grad_norm": 2.7211074829101562, "learning_rate": 3.941942617817442e-05, "loss": 2.0056, "step": 86500 }, { "epoch": 0.64, "grad_norm": 2.5889461040496826, "learning_rate": 3.9358266791921087e-05, "loss": 1.9933, "step": 87000 }, { "epoch": 0.64, "grad_norm": 2.581122398376465, "learning_rate": 3.929710740566777e-05, "loss": 1.9896, "step": 87500 }, { "epoch": 0.65, "grad_norm": 2.7021663188934326, "learning_rate": 3.9235948019414435e-05, "loss": 1.9878, "step": 88000 }, { "epoch": 0.65, "grad_norm": 2.6844136714935303, "learning_rate": 3.917478863316111e-05, "loss": 1.9838, "step": 88500 }, { "epoch": 0.65, "grad_norm": 2.6349422931671143, "learning_rate": 3.9113629246907784e-05, "loss": 1.9796, "step": 89000 }, { "epoch": 0.66, "grad_norm": 2.6799721717834473, "learning_rate": 3.905246986065446e-05, "loss": 1.9793, "step": 89500 }, { "epoch": 0.66, "grad_norm": 2.502464771270752, "learning_rate": 3.8991310474401125e-05, "loss": 1.9789, "step": 90000 }, { "epoch": 0.66, "grad_norm": 2.897421360015869, "learning_rate": 3.8930151088147806e-05, "loss": 1.9766, "step": 90500 }, { "epoch": 0.67, "grad_norm": 2.6226820945739746, "learning_rate": 3.8868991701894474e-05, "loss": 1.9785, "step": 91000 }, { "epoch": 0.67, "grad_norm": 2.7630228996276855, "learning_rate": 3.880783231564115e-05, "loss": 1.9805, "step": 91500 }, { "epoch": 0.68, "grad_norm": 2.7849583625793457, "learning_rate": 3.874667292938782e-05, "loss": 1.9672, "step": 92000 }, { "epoch": 0.68, "grad_norm": 2.643397569656372, "learning_rate": 3.86855135431345e-05, "loss": 1.9618, "step": 92500 }, { "epoch": 0.68, "grad_norm": 2.6938283443450928, "learning_rate": 3.8624354156881164e-05, "loss": 1.9674, "step": 93000 }, { "epoch": 0.69, "grad_norm": 2.7914974689483643, "learning_rate": 3.856319477062784e-05, "loss": 1.9616, "step": 93500 }, { "epoch": 0.69, "grad_norm": 2.6223716735839844, "learning_rate": 3.850203538437451e-05, "loss": 1.9566, "step": 94000 }, { "epoch": 0.69, "grad_norm": 2.6575443744659424, "learning_rate": 3.844087599812119e-05, "loss": 1.9611, "step": 94500 }, { "epoch": 0.7, "grad_norm": 2.684488534927368, "learning_rate": 3.837971661186786e-05, "loss": 1.9467, "step": 95000 }, { "epoch": 0.7, "grad_norm": 2.668365001678467, "learning_rate": 3.831855722561453e-05, "loss": 1.9548, "step": 95500 }, { "epoch": 0.7, "grad_norm": 2.967519521713257, "learning_rate": 3.82573978393612e-05, "loss": 1.9535, "step": 96000 }, { "epoch": 0.71, "grad_norm": 2.5876193046569824, "learning_rate": 3.819623845310788e-05, "loss": 1.9446, "step": 96500 }, { "epoch": 0.71, "grad_norm": 2.7293176651000977, "learning_rate": 3.813507906685455e-05, "loss": 1.9481, "step": 97000 }, { "epoch": 0.72, "grad_norm": 2.6928365230560303, "learning_rate": 3.807391968060122e-05, "loss": 1.9296, "step": 97500 }, { "epoch": 0.72, "grad_norm": 2.568150043487549, "learning_rate": 3.801276029434789e-05, "loss": 1.9422, "step": 98000 }, { "epoch": 0.72, "grad_norm": 2.7275748252868652, "learning_rate": 3.795160090809457e-05, "loss": 1.9314, "step": 98500 }, { "epoch": 0.73, "grad_norm": 2.613135576248169, "learning_rate": 3.789044152184124e-05, "loss": 1.931, "step": 99000 }, { "epoch": 0.73, "grad_norm": 2.408534049987793, "learning_rate": 3.782928213558791e-05, "loss": 1.9314, "step": 99500 }, { "epoch": 0.73, "grad_norm": 2.896430015563965, "learning_rate": 3.776812274933459e-05, "loss": 1.9221, "step": 100000 }, { "epoch": 0.74, "grad_norm": 2.6729750633239746, "learning_rate": 3.770696336308126e-05, "loss": 1.9247, "step": 100500 }, { "epoch": 0.74, "grad_norm": 2.5819644927978516, "learning_rate": 3.764580397682793e-05, "loss": 1.9243, "step": 101000 }, { "epoch": 0.74, "grad_norm": 2.561739206314087, "learning_rate": 3.7584644590574606e-05, "loss": 1.9291, "step": 101500 }, { "epoch": 0.75, "grad_norm": 2.663254976272583, "learning_rate": 3.752348520432128e-05, "loss": 1.9138, "step": 102000 }, { "epoch": 0.75, "grad_norm": 2.662156581878662, "learning_rate": 3.746232581806795e-05, "loss": 1.9128, "step": 102500 }, { "epoch": 0.76, "grad_norm": 2.6324357986450195, "learning_rate": 3.740116643181463e-05, "loss": 1.9215, "step": 103000 }, { "epoch": 0.76, "grad_norm": 2.619344711303711, "learning_rate": 3.7340007045561297e-05, "loss": 1.9043, "step": 103500 }, { "epoch": 0.76, "grad_norm": 2.681112051010132, "learning_rate": 3.727884765930797e-05, "loss": 1.9077, "step": 104000 }, { "epoch": 0.77, "grad_norm": 2.846804141998291, "learning_rate": 3.7217688273054645e-05, "loss": 1.9034, "step": 104500 }, { "epoch": 0.77, "grad_norm": 3.0270516872406006, "learning_rate": 3.715652888680132e-05, "loss": 1.901, "step": 105000 }, { "epoch": 0.77, "grad_norm": 2.5290517807006836, "learning_rate": 3.709536950054799e-05, "loss": 1.9031, "step": 105500 }, { "epoch": 0.78, "grad_norm": 2.661867380142212, "learning_rate": 3.703421011429467e-05, "loss": 1.902, "step": 106000 }, { "epoch": 0.78, "grad_norm": 2.816241979598999, "learning_rate": 3.6973050728041335e-05, "loss": 1.8885, "step": 106500 }, { "epoch": 0.79, "grad_norm": 2.8065085411071777, "learning_rate": 3.691189134178801e-05, "loss": 1.8931, "step": 107000 }, { "epoch": 0.79, "grad_norm": 2.4863102436065674, "learning_rate": 3.6850731955534684e-05, "loss": 1.8907, "step": 107500 }, { "epoch": 0.79, "grad_norm": 2.4044525623321533, "learning_rate": 3.678957256928136e-05, "loss": 1.8911, "step": 108000 }, { "epoch": 0.8, "grad_norm": 2.6208319664001465, "learning_rate": 3.6728413183028026e-05, "loss": 1.8889, "step": 108500 }, { "epoch": 0.8, "grad_norm": 2.4432547092437744, "learning_rate": 3.66672537967747e-05, "loss": 1.8794, "step": 109000 }, { "epoch": 0.8, "grad_norm": 2.9175052642822266, "learning_rate": 3.6606094410521374e-05, "loss": 1.8769, "step": 109500 }, { "epoch": 0.81, "grad_norm": 2.7171223163604736, "learning_rate": 3.654493502426805e-05, "loss": 1.8834, "step": 110000 }, { "epoch": 0.81, "grad_norm": 2.5070419311523438, "learning_rate": 3.6483775638014716e-05, "loss": 1.8706, "step": 110500 }, { "epoch": 0.81, "grad_norm": 2.59771990776062, "learning_rate": 3.642261625176139e-05, "loss": 1.8687, "step": 111000 }, { "epoch": 0.82, "grad_norm": 3.022465944290161, "learning_rate": 3.6361456865508064e-05, "loss": 1.8721, "step": 111500 }, { "epoch": 0.82, "grad_norm": 2.8927907943725586, "learning_rate": 3.630029747925474e-05, "loss": 1.8761, "step": 112000 }, { "epoch": 0.83, "grad_norm": 2.612518787384033, "learning_rate": 3.623913809300141e-05, "loss": 1.8744, "step": 112500 }, { "epoch": 0.83, "grad_norm": 2.7625935077667236, "learning_rate": 3.617797870674808e-05, "loss": 1.8636, "step": 113000 }, { "epoch": 0.83, "grad_norm": 2.535382032394409, "learning_rate": 3.6116819320494755e-05, "loss": 1.8683, "step": 113500 }, { "epoch": 0.84, "grad_norm": 2.575298547744751, "learning_rate": 3.605565993424143e-05, "loss": 1.8657, "step": 114000 }, { "epoch": 0.84, "grad_norm": 2.6413776874542236, "learning_rate": 3.59945005479881e-05, "loss": 1.8571, "step": 114500 }, { "epoch": 0.84, "grad_norm": 2.675283908843994, "learning_rate": 3.593334116173477e-05, "loss": 1.8575, "step": 115000 }, { "epoch": 0.85, "grad_norm": 2.7104618549346924, "learning_rate": 3.587218177548145e-05, "loss": 1.8555, "step": 115500 }, { "epoch": 0.85, "grad_norm": 2.7391483783721924, "learning_rate": 3.581102238922812e-05, "loss": 1.8555, "step": 116000 }, { "epoch": 0.86, "grad_norm": 2.561523675918579, "learning_rate": 3.5749863002974793e-05, "loss": 1.8475, "step": 116500 }, { "epoch": 0.86, "grad_norm": 2.590790033340454, "learning_rate": 3.568870361672147e-05, "loss": 1.8425, "step": 117000 }, { "epoch": 0.86, "grad_norm": 2.889119863510132, "learning_rate": 3.562754423046814e-05, "loss": 1.8491, "step": 117500 }, { "epoch": 0.87, "grad_norm": 2.77632212638855, "learning_rate": 3.556638484421481e-05, "loss": 1.8469, "step": 118000 }, { "epoch": 0.87, "grad_norm": 2.720357894897461, "learning_rate": 3.550522545796149e-05, "loss": 1.8462, "step": 118500 }, { "epoch": 0.87, "grad_norm": 2.8213210105895996, "learning_rate": 3.544406607170816e-05, "loss": 1.8399, "step": 119000 }, { "epoch": 0.88, "grad_norm": 2.834599733352661, "learning_rate": 3.538290668545483e-05, "loss": 1.8429, "step": 119500 }, { "epoch": 0.88, "grad_norm": 2.578364133834839, "learning_rate": 3.5321747299201506e-05, "loss": 1.8386, "step": 120000 }, { "epoch": 0.88, "grad_norm": 2.6725339889526367, "learning_rate": 3.526058791294818e-05, "loss": 1.85, "step": 120500 }, { "epoch": 0.89, "grad_norm": 2.5288286209106445, "learning_rate": 3.519942852669485e-05, "loss": 1.831, "step": 121000 }, { "epoch": 0.89, "grad_norm": 2.4805219173431396, "learning_rate": 3.513826914044152e-05, "loss": 1.836, "step": 121500 }, { "epoch": 0.9, "grad_norm": 2.6729605197906494, "learning_rate": 3.50771097541882e-05, "loss": 1.8297, "step": 122000 }, { "epoch": 0.9, "grad_norm": 2.5666863918304443, "learning_rate": 3.501595036793487e-05, "loss": 1.8231, "step": 122500 }, { "epoch": 0.9, "grad_norm": 2.5059523582458496, "learning_rate": 3.495479098168154e-05, "loss": 1.8296, "step": 123000 }, { "epoch": 0.91, "grad_norm": 2.758755922317505, "learning_rate": 3.489363159542822e-05, "loss": 1.8218, "step": 123500 }, { "epoch": 0.91, "grad_norm": 2.5374386310577393, "learning_rate": 3.483247220917489e-05, "loss": 1.8234, "step": 124000 }, { "epoch": 0.91, "grad_norm": 2.5575644969940186, "learning_rate": 3.477131282292156e-05, "loss": 1.8158, "step": 124500 }, { "epoch": 0.92, "grad_norm": 2.567166328430176, "learning_rate": 3.4710153436668236e-05, "loss": 1.8179, "step": 125000 }, { "epoch": 0.92, "grad_norm": 2.4243805408477783, "learning_rate": 3.464899405041491e-05, "loss": 1.82, "step": 125500 }, { "epoch": 0.92, "grad_norm": 2.665632486343384, "learning_rate": 3.458783466416158e-05, "loss": 1.8151, "step": 126000 }, { "epoch": 0.93, "grad_norm": 2.90759539604187, "learning_rate": 3.452667527790825e-05, "loss": 1.8109, "step": 126500 }, { "epoch": 0.93, "grad_norm": 2.6775169372558594, "learning_rate": 3.4465515891654926e-05, "loss": 1.8199, "step": 127000 }, { "epoch": 0.94, "grad_norm": 2.430788516998291, "learning_rate": 3.44043565054016e-05, "loss": 1.8128, "step": 127500 }, { "epoch": 0.94, "grad_norm": 2.9997665882110596, "learning_rate": 3.4343197119148274e-05, "loss": 1.8106, "step": 128000 }, { "epoch": 0.94, "grad_norm": 2.672255039215088, "learning_rate": 3.428203773289494e-05, "loss": 1.8141, "step": 128500 }, { "epoch": 0.95, "grad_norm": 2.4398484230041504, "learning_rate": 3.4220878346641616e-05, "loss": 1.8036, "step": 129000 }, { "epoch": 0.95, "grad_norm": 2.897477149963379, "learning_rate": 3.415971896038829e-05, "loss": 1.8006, "step": 129500 }, { "epoch": 0.95, "grad_norm": 2.7050111293792725, "learning_rate": 3.4098559574134965e-05, "loss": 1.803, "step": 130000 }, { "epoch": 0.96, "grad_norm": 2.503981828689575, "learning_rate": 3.403740018788163e-05, "loss": 1.7992, "step": 130500 }, { "epoch": 0.96, "grad_norm": 2.9682281017303467, "learning_rate": 3.397624080162831e-05, "loss": 1.796, "step": 131000 }, { "epoch": 0.97, "grad_norm": 3.1613168716430664, "learning_rate": 3.391508141537498e-05, "loss": 1.8083, "step": 131500 }, { "epoch": 0.97, "grad_norm": 2.6427714824676514, "learning_rate": 3.3853922029121655e-05, "loss": 1.7968, "step": 132000 }, { "epoch": 0.97, "grad_norm": 2.6238105297088623, "learning_rate": 3.379276264286832e-05, "loss": 1.7887, "step": 132500 }, { "epoch": 0.98, "grad_norm": 2.566740036010742, "learning_rate": 3.3731603256615e-05, "loss": 1.7979, "step": 133000 }, { "epoch": 0.98, "grad_norm": 2.6818795204162598, "learning_rate": 3.367044387036167e-05, "loss": 1.7856, "step": 133500 }, { "epoch": 0.98, "grad_norm": 2.7290897369384766, "learning_rate": 3.3609284484108345e-05, "loss": 1.785, "step": 134000 }, { "epoch": 0.99, "grad_norm": 2.8657619953155518, "learning_rate": 3.354812509785502e-05, "loss": 1.7925, "step": 134500 }, { "epoch": 0.99, "grad_norm": 2.4724438190460205, "learning_rate": 3.3486965711601694e-05, "loss": 1.7854, "step": 135000 }, { "epoch": 0.99, "grad_norm": 2.658123016357422, "learning_rate": 3.342580632534836e-05, "loss": 1.7904, "step": 135500 }, { "epoch": 1.0, "grad_norm": 2.828024387359619, "learning_rate": 3.336464693909504e-05, "loss": 1.7768, "step": 136000 }, { "epoch": 1.0, "grad_norm": 2.7401976585388184, "learning_rate": 3.330348755284171e-05, "loss": 1.7808, "step": 136500 }, { "epoch": 1.01, "grad_norm": 2.651334524154663, "learning_rate": 3.3242328166588384e-05, "loss": 1.7769, "step": 137000 }, { "epoch": 1.01, "grad_norm": 2.9663844108581543, "learning_rate": 3.318116878033506e-05, "loss": 1.7685, "step": 137500 }, { "epoch": 1.01, "grad_norm": 2.6409363746643066, "learning_rate": 3.312000939408173e-05, "loss": 1.7819, "step": 138000 }, { "epoch": 1.02, "grad_norm": 2.4980876445770264, "learning_rate": 3.30588500078284e-05, "loss": 1.7781, "step": 138500 }, { "epoch": 1.02, "grad_norm": 2.583472967147827, "learning_rate": 3.299769062157508e-05, "loss": 1.7642, "step": 139000 }, { "epoch": 1.02, "grad_norm": 2.7035281658172607, "learning_rate": 3.293653123532175e-05, "loss": 1.7707, "step": 139500 }, { "epoch": 1.03, "grad_norm": 2.647327184677124, "learning_rate": 3.287537184906842e-05, "loss": 1.77, "step": 140000 }, { "epoch": 1.03, "grad_norm": 2.6016039848327637, "learning_rate": 3.28142124628151e-05, "loss": 1.7689, "step": 140500 }, { "epoch": 1.03, "grad_norm": 2.871412515640259, "learning_rate": 3.275305307656177e-05, "loss": 1.7614, "step": 141000 }, { "epoch": 1.04, "grad_norm": 2.5391454696655273, "learning_rate": 3.269189369030844e-05, "loss": 1.7666, "step": 141500 }, { "epoch": 1.04, "grad_norm": 2.7399189472198486, "learning_rate": 3.263073430405511e-05, "loss": 1.7698, "step": 142000 }, { "epoch": 1.05, "grad_norm": 2.550523281097412, "learning_rate": 3.256957491780179e-05, "loss": 1.7661, "step": 142500 }, { "epoch": 1.05, "grad_norm": 2.7033884525299072, "learning_rate": 3.250841553154846e-05, "loss": 1.7576, "step": 143000 }, { "epoch": 1.05, "grad_norm": 2.712890386581421, "learning_rate": 3.244725614529513e-05, "loss": 1.759, "step": 143500 }, { "epoch": 1.06, "grad_norm": 2.7690348625183105, "learning_rate": 3.23860967590418e-05, "loss": 1.757, "step": 144000 }, { "epoch": 1.06, "grad_norm": 2.5629446506500244, "learning_rate": 3.232493737278848e-05, "loss": 1.7593, "step": 144500 }, { "epoch": 1.06, "grad_norm": 2.5981359481811523, "learning_rate": 3.226377798653515e-05, "loss": 1.7506, "step": 145000 }, { "epoch": 1.07, "grad_norm": 2.465391159057617, "learning_rate": 3.2202618600281826e-05, "loss": 1.7558, "step": 145500 }, { "epoch": 1.07, "grad_norm": 3.0468790531158447, "learning_rate": 3.2141459214028493e-05, "loss": 1.7559, "step": 146000 }, { "epoch": 1.08, "grad_norm": 2.5434939861297607, "learning_rate": 3.208029982777517e-05, "loss": 1.7507, "step": 146500 }, { "epoch": 1.08, "grad_norm": 2.479449987411499, "learning_rate": 3.201914044152184e-05, "loss": 1.7457, "step": 147000 }, { "epoch": 1.08, "grad_norm": 2.621965169906616, "learning_rate": 3.1957981055268516e-05, "loss": 1.7513, "step": 147500 }, { "epoch": 1.09, "grad_norm": 2.6055707931518555, "learning_rate": 3.1896821669015184e-05, "loss": 1.7427, "step": 148000 }, { "epoch": 1.09, "grad_norm": 2.2937374114990234, "learning_rate": 3.1835662282761865e-05, "loss": 1.7524, "step": 148500 }, { "epoch": 1.09, "grad_norm": 2.7363667488098145, "learning_rate": 3.177450289650853e-05, "loss": 1.7492, "step": 149000 }, { "epoch": 1.1, "grad_norm": 2.660330057144165, "learning_rate": 3.1713343510255207e-05, "loss": 1.7405, "step": 149500 }, { "epoch": 1.1, "grad_norm": 2.589137077331543, "learning_rate": 3.165218412400188e-05, "loss": 1.7389, "step": 150000 }, { "epoch": 1.1, "grad_norm": 2.7419252395629883, "learning_rate": 3.1591024737748555e-05, "loss": 1.7478, "step": 150500 }, { "epoch": 1.11, "grad_norm": 2.6772820949554443, "learning_rate": 3.152986535149522e-05, "loss": 1.7365, "step": 151000 }, { "epoch": 1.11, "grad_norm": 2.765460968017578, "learning_rate": 3.1468705965241904e-05, "loss": 1.7399, "step": 151500 }, { "epoch": 1.12, "grad_norm": 2.5273730754852295, "learning_rate": 3.140754657898857e-05, "loss": 1.7336, "step": 152000 }, { "epoch": 1.12, "grad_norm": 2.6962389945983887, "learning_rate": 3.1346387192735245e-05, "loss": 1.7347, "step": 152500 }, { "epoch": 1.12, "grad_norm": 2.6664233207702637, "learning_rate": 3.128522780648192e-05, "loss": 1.7212, "step": 153000 }, { "epoch": 1.13, "grad_norm": 2.923704147338867, "learning_rate": 3.1224068420228594e-05, "loss": 1.7327, "step": 153500 }, { "epoch": 1.13, "grad_norm": 2.7295753955841064, "learning_rate": 3.116290903397526e-05, "loss": 1.7304, "step": 154000 }, { "epoch": 1.13, "grad_norm": 2.6523427963256836, "learning_rate": 3.110174964772194e-05, "loss": 1.7295, "step": 154500 }, { "epoch": 1.14, "grad_norm": 2.544809341430664, "learning_rate": 3.104059026146861e-05, "loss": 1.7303, "step": 155000 }, { "epoch": 1.14, "grad_norm": 2.608091354370117, "learning_rate": 3.0979430875215284e-05, "loss": 1.7288, "step": 155500 }, { "epoch": 1.14, "grad_norm": 2.6781680583953857, "learning_rate": 3.091827148896195e-05, "loss": 1.7333, "step": 156000 }, { "epoch": 1.15, "grad_norm": 2.5772953033447266, "learning_rate": 3.085711210270863e-05, "loss": 1.7313, "step": 156500 }, { "epoch": 1.15, "grad_norm": 2.6381750106811523, "learning_rate": 3.07959527164553e-05, "loss": 1.723, "step": 157000 }, { "epoch": 1.16, "grad_norm": 2.609584093093872, "learning_rate": 3.0734793330201974e-05, "loss": 1.7317, "step": 157500 }, { "epoch": 1.16, "grad_norm": 2.556508779525757, "learning_rate": 3.067363394394865e-05, "loss": 1.7168, "step": 158000 }, { "epoch": 1.16, "grad_norm": 2.4912405014038086, "learning_rate": 3.061247455769532e-05, "loss": 1.7221, "step": 158500 }, { "epoch": 1.17, "grad_norm": 3.072758436203003, "learning_rate": 3.055131517144199e-05, "loss": 1.7193, "step": 159000 }, { "epoch": 1.17, "grad_norm": 2.5012080669403076, "learning_rate": 3.0490155785188668e-05, "loss": 1.7187, "step": 159500 }, { "epoch": 1.17, "grad_norm": 2.6786646842956543, "learning_rate": 3.042899639893534e-05, "loss": 1.7099, "step": 160000 }, { "epoch": 1.18, "grad_norm": 2.6225085258483887, "learning_rate": 3.036783701268201e-05, "loss": 1.7135, "step": 160500 }, { "epoch": 1.18, "grad_norm": 2.6484742164611816, "learning_rate": 3.0306677626428687e-05, "loss": 1.7086, "step": 161000 }, { "epoch": 1.19, "grad_norm": 2.948645830154419, "learning_rate": 3.0245518240175358e-05, "loss": 1.7098, "step": 161500 }, { "epoch": 1.19, "grad_norm": 2.766124725341797, "learning_rate": 3.018435885392203e-05, "loss": 1.7072, "step": 162000 }, { "epoch": 1.19, "grad_norm": 2.8391172885894775, "learning_rate": 3.0123199467668707e-05, "loss": 1.7116, "step": 162500 }, { "epoch": 1.2, "grad_norm": 2.5152761936187744, "learning_rate": 3.0062040081415378e-05, "loss": 1.7073, "step": 163000 }, { "epoch": 1.2, "grad_norm": 2.719190835952759, "learning_rate": 3.000088069516205e-05, "loss": 1.7017, "step": 163500 }, { "epoch": 1.2, "grad_norm": 2.5250284671783447, "learning_rate": 2.9939721308908726e-05, "loss": 1.7055, "step": 164000 }, { "epoch": 1.21, "grad_norm": 3.0202980041503906, "learning_rate": 2.9878561922655397e-05, "loss": 1.7041, "step": 164500 }, { "epoch": 1.21, "grad_norm": 2.7549800872802734, "learning_rate": 2.9817402536402068e-05, "loss": 1.6984, "step": 165000 }, { "epoch": 1.21, "grad_norm": 2.891324281692505, "learning_rate": 2.9756243150148742e-05, "loss": 1.7033, "step": 165500 }, { "epoch": 1.22, "grad_norm": 3.0068359375, "learning_rate": 2.9695083763895416e-05, "loss": 1.6969, "step": 166000 }, { "epoch": 1.22, "grad_norm": 2.5715763568878174, "learning_rate": 2.9633924377642087e-05, "loss": 1.698, "step": 166500 }, { "epoch": 1.23, "grad_norm": 2.5422136783599854, "learning_rate": 2.9572764991388758e-05, "loss": 1.6987, "step": 167000 }, { "epoch": 1.23, "grad_norm": 2.5007290840148926, "learning_rate": 2.9511605605135432e-05, "loss": 1.6993, "step": 167500 }, { "epoch": 1.23, "grad_norm": 2.5966525077819824, "learning_rate": 2.9450446218882107e-05, "loss": 1.6923, "step": 168000 }, { "epoch": 1.24, "grad_norm": 2.5626885890960693, "learning_rate": 2.9389286832628778e-05, "loss": 1.6964, "step": 168500 }, { "epoch": 1.24, "grad_norm": 2.76600980758667, "learning_rate": 2.9328127446375452e-05, "loss": 1.6875, "step": 169000 }, { "epoch": 1.24, "grad_norm": 2.922257661819458, "learning_rate": 2.9266968060122123e-05, "loss": 1.6985, "step": 169500 }, { "epoch": 1.25, "grad_norm": 2.641627311706543, "learning_rate": 2.9205808673868794e-05, "loss": 1.6888, "step": 170000 }, { "epoch": 1.25, "grad_norm": 2.6927127838134766, "learning_rate": 2.914464928761547e-05, "loss": 1.6924, "step": 170500 }, { "epoch": 1.25, "grad_norm": 2.4802746772766113, "learning_rate": 2.9083489901362142e-05, "loss": 1.6946, "step": 171000 }, { "epoch": 1.26, "grad_norm": 2.5318949222564697, "learning_rate": 2.9022330515108813e-05, "loss": 1.6894, "step": 171500 }, { "epoch": 1.26, "grad_norm": 2.7798898220062256, "learning_rate": 2.896117112885549e-05, "loss": 1.6861, "step": 172000 }, { "epoch": 1.27, "grad_norm": 2.7204222679138184, "learning_rate": 2.890001174260216e-05, "loss": 1.6839, "step": 172500 }, { "epoch": 1.27, "grad_norm": 2.8077077865600586, "learning_rate": 2.8838852356348832e-05, "loss": 1.684, "step": 173000 }, { "epoch": 1.27, "grad_norm": 2.565995454788208, "learning_rate": 2.877769297009551e-05, "loss": 1.682, "step": 173500 }, { "epoch": 1.28, "grad_norm": 2.7671403884887695, "learning_rate": 2.871653358384218e-05, "loss": 1.6782, "step": 174000 }, { "epoch": 1.28, "grad_norm": 2.6801698207855225, "learning_rate": 2.8655374197588852e-05, "loss": 1.6853, "step": 174500 }, { "epoch": 1.28, "grad_norm": 2.810450553894043, "learning_rate": 2.859421481133553e-05, "loss": 1.6841, "step": 175000 }, { "epoch": 1.29, "grad_norm": 2.797452211380005, "learning_rate": 2.85330554250822e-05, "loss": 1.6753, "step": 175500 }, { "epoch": 1.29, "grad_norm": 2.5832931995391846, "learning_rate": 2.847189603882887e-05, "loss": 1.6905, "step": 176000 }, { "epoch": 1.3, "grad_norm": 2.8013391494750977, "learning_rate": 2.841073665257555e-05, "loss": 1.673, "step": 176500 }, { "epoch": 1.3, "grad_norm": 2.5176284313201904, "learning_rate": 2.834957726632222e-05, "loss": 1.6742, "step": 177000 }, { "epoch": 1.3, "grad_norm": 2.7248387336730957, "learning_rate": 2.828841788006889e-05, "loss": 1.6663, "step": 177500 }, { "epoch": 1.31, "grad_norm": 3.006441831588745, "learning_rate": 2.822725849381556e-05, "loss": 1.6762, "step": 178000 }, { "epoch": 1.31, "grad_norm": 2.754427671432495, "learning_rate": 2.816609910756224e-05, "loss": 1.6711, "step": 178500 }, { "epoch": 1.31, "grad_norm": 2.6871962547302246, "learning_rate": 2.810493972130891e-05, "loss": 1.6749, "step": 179000 }, { "epoch": 1.32, "grad_norm": 2.7660982608795166, "learning_rate": 2.804378033505558e-05, "loss": 1.6694, "step": 179500 }, { "epoch": 1.32, "grad_norm": 2.5820930004119873, "learning_rate": 2.798262094880226e-05, "loss": 1.68, "step": 180000 }, { "epoch": 1.32, "grad_norm": 2.50264048576355, "learning_rate": 2.792146156254893e-05, "loss": 1.6745, "step": 180500 }, { "epoch": 1.33, "grad_norm": 2.759570837020874, "learning_rate": 2.78603021762956e-05, "loss": 1.6602, "step": 181000 }, { "epoch": 1.33, "grad_norm": 2.6648566722869873, "learning_rate": 2.7799142790042278e-05, "loss": 1.67, "step": 181500 }, { "epoch": 1.34, "grad_norm": 2.7140583992004395, "learning_rate": 2.773798340378895e-05, "loss": 1.6645, "step": 182000 }, { "epoch": 1.34, "grad_norm": 2.4393863677978516, "learning_rate": 2.767682401753562e-05, "loss": 1.665, "step": 182500 }, { "epoch": 1.34, "grad_norm": 2.66521954536438, "learning_rate": 2.7615664631282294e-05, "loss": 1.6694, "step": 183000 }, { "epoch": 1.35, "grad_norm": 2.9926490783691406, "learning_rate": 2.7554505245028965e-05, "loss": 1.669, "step": 183500 }, { "epoch": 1.35, "grad_norm": 2.611051321029663, "learning_rate": 2.749334585877564e-05, "loss": 1.6606, "step": 184000 }, { "epoch": 1.35, "grad_norm": 2.6490185260772705, "learning_rate": 2.7432186472522313e-05, "loss": 1.6639, "step": 184500 }, { "epoch": 1.36, "grad_norm": 2.7830920219421387, "learning_rate": 2.7371027086268984e-05, "loss": 1.6584, "step": 185000 }, { "epoch": 1.36, "grad_norm": 2.776111602783203, "learning_rate": 2.7309867700015655e-05, "loss": 1.6564, "step": 185500 }, { "epoch": 1.37, "grad_norm": 2.5335960388183594, "learning_rate": 2.7248708313762333e-05, "loss": 1.6553, "step": 186000 }, { "epoch": 1.37, "grad_norm": 2.585458755493164, "learning_rate": 2.7187548927509004e-05, "loss": 1.653, "step": 186500 }, { "epoch": 1.37, "grad_norm": 2.851865768432617, "learning_rate": 2.7126389541255674e-05, "loss": 1.6654, "step": 187000 }, { "epoch": 1.38, "grad_norm": 2.649545907974243, "learning_rate": 2.7065230155002352e-05, "loss": 1.657, "step": 187500 }, { "epoch": 1.38, "grad_norm": 2.552381753921509, "learning_rate": 2.7004070768749023e-05, "loss": 1.6569, "step": 188000 }, { "epoch": 1.38, "grad_norm": 2.6055853366851807, "learning_rate": 2.6942911382495694e-05, "loss": 1.6522, "step": 188500 }, { "epoch": 1.39, "grad_norm": 2.848911762237549, "learning_rate": 2.6881751996242365e-05, "loss": 1.6435, "step": 189000 }, { "epoch": 1.39, "grad_norm": 2.8290162086486816, "learning_rate": 2.6820592609989042e-05, "loss": 1.6457, "step": 189500 }, { "epoch": 1.39, "grad_norm": 2.682929277420044, "learning_rate": 2.6759433223735713e-05, "loss": 1.6521, "step": 190000 }, { "epoch": 1.4, "grad_norm": 2.7035279273986816, "learning_rate": 2.6698273837482384e-05, "loss": 1.652, "step": 190500 }, { "epoch": 1.4, "grad_norm": 2.6156182289123535, "learning_rate": 2.6637114451229062e-05, "loss": 1.6421, "step": 191000 }, { "epoch": 1.41, "grad_norm": 2.7647957801818848, "learning_rate": 2.6575955064975733e-05, "loss": 1.6496, "step": 191500 }, { "epoch": 1.41, "grad_norm": 2.5763864517211914, "learning_rate": 2.6514795678722403e-05, "loss": 1.6454, "step": 192000 }, { "epoch": 1.41, "grad_norm": 2.6585116386413574, "learning_rate": 2.645363629246908e-05, "loss": 1.6443, "step": 192500 }, { "epoch": 1.42, "grad_norm": 2.7471868991851807, "learning_rate": 2.6392476906215752e-05, "loss": 1.6494, "step": 193000 }, { "epoch": 1.42, "grad_norm": 2.7787129878997803, "learning_rate": 2.6331317519962423e-05, "loss": 1.6441, "step": 193500 }, { "epoch": 1.42, "grad_norm": 2.3297078609466553, "learning_rate": 2.62701581337091e-05, "loss": 1.6462, "step": 194000 }, { "epoch": 1.43, "grad_norm": 2.8310294151306152, "learning_rate": 2.620899874745577e-05, "loss": 1.6473, "step": 194500 }, { "epoch": 1.43, "grad_norm": 2.6443045139312744, "learning_rate": 2.6147839361202442e-05, "loss": 1.6468, "step": 195000 }, { "epoch": 1.43, "grad_norm": 2.5064589977264404, "learning_rate": 2.608667997494912e-05, "loss": 1.6385, "step": 195500 }, { "epoch": 1.44, "grad_norm": 2.6140296459198, "learning_rate": 2.602552058869579e-05, "loss": 1.6357, "step": 196000 }, { "epoch": 1.44, "grad_norm": 2.461705207824707, "learning_rate": 2.596436120244246e-05, "loss": 1.6401, "step": 196500 }, { "epoch": 1.45, "grad_norm": 2.782813787460327, "learning_rate": 2.590320181618914e-05, "loss": 1.6426, "step": 197000 }, { "epoch": 1.45, "grad_norm": 2.5911970138549805, "learning_rate": 2.584204242993581e-05, "loss": 1.6409, "step": 197500 }, { "epoch": 1.45, "grad_norm": 2.593752384185791, "learning_rate": 2.578088304368248e-05, "loss": 1.6345, "step": 198000 }, { "epoch": 1.46, "grad_norm": 2.9096670150756836, "learning_rate": 2.5719723657429155e-05, "loss": 1.6351, "step": 198500 }, { "epoch": 1.46, "grad_norm": 2.9551987648010254, "learning_rate": 2.5658564271175826e-05, "loss": 1.6322, "step": 199000 }, { "epoch": 1.46, "grad_norm": 2.6173858642578125, "learning_rate": 2.55974048849225e-05, "loss": 1.6307, "step": 199500 }, { "epoch": 1.47, "grad_norm": 2.695869207382202, "learning_rate": 2.5536245498669175e-05, "loss": 1.6264, "step": 200000 }, { "epoch": 1.47, "grad_norm": 2.711869955062866, "learning_rate": 2.5475086112415846e-05, "loss": 1.6283, "step": 200500 }, { "epoch": 1.48, "grad_norm": 2.570518732070923, "learning_rate": 2.5413926726162516e-05, "loss": 1.6323, "step": 201000 }, { "epoch": 1.48, "grad_norm": 2.7032439708709717, "learning_rate": 2.535276733990919e-05, "loss": 1.6339, "step": 201500 }, { "epoch": 1.48, "grad_norm": 2.7625739574432373, "learning_rate": 2.5291607953655865e-05, "loss": 1.6279, "step": 202000 }, { "epoch": 1.49, "grad_norm": 2.829380989074707, "learning_rate": 2.5230448567402536e-05, "loss": 1.6263, "step": 202500 }, { "epoch": 1.49, "grad_norm": 2.499410629272461, "learning_rate": 2.5169289181149207e-05, "loss": 1.6201, "step": 203000 }, { "epoch": 1.49, "grad_norm": 2.6228952407836914, "learning_rate": 2.5108129794895884e-05, "loss": 1.6244, "step": 203500 }, { "epoch": 1.5, "grad_norm": 2.609665870666504, "learning_rate": 2.5046970408642555e-05, "loss": 1.6319, "step": 204000 }, { "epoch": 1.5, "grad_norm": 2.8935351371765137, "learning_rate": 2.498581102238923e-05, "loss": 1.6216, "step": 204500 }, { "epoch": 1.5, "grad_norm": 2.7964882850646973, "learning_rate": 2.49246516361359e-05, "loss": 1.621, "step": 205000 }, { "epoch": 1.51, "grad_norm": 2.465930938720703, "learning_rate": 2.4863492249882575e-05, "loss": 1.6332, "step": 205500 }, { "epoch": 1.51, "grad_norm": 2.9245595932006836, "learning_rate": 2.480233286362925e-05, "loss": 1.6239, "step": 206000 }, { "epoch": 1.52, "grad_norm": 2.548551321029663, "learning_rate": 2.474117347737592e-05, "loss": 1.6148, "step": 206500 }, { "epoch": 1.52, "grad_norm": 2.6611809730529785, "learning_rate": 2.4680014091122594e-05, "loss": 1.6216, "step": 207000 }, { "epoch": 1.52, "grad_norm": 2.6596455574035645, "learning_rate": 2.4618854704869268e-05, "loss": 1.6057, "step": 207500 }, { "epoch": 1.53, "grad_norm": 2.645918607711792, "learning_rate": 2.455769531861594e-05, "loss": 1.613, "step": 208000 }, { "epoch": 1.53, "grad_norm": 2.6304965019226074, "learning_rate": 2.4496535932362613e-05, "loss": 1.6148, "step": 208500 }, { "epoch": 1.53, "grad_norm": 2.9523110389709473, "learning_rate": 2.4435376546109284e-05, "loss": 1.615, "step": 209000 }, { "epoch": 1.54, "grad_norm": 2.6215062141418457, "learning_rate": 2.437421715985596e-05, "loss": 1.623, "step": 209500 }, { "epoch": 1.54, "grad_norm": 2.7585043907165527, "learning_rate": 2.4313057773602633e-05, "loss": 1.6228, "step": 210000 }, { "epoch": 1.54, "grad_norm": 2.626432418823242, "learning_rate": 2.4251898387349304e-05, "loss": 1.6152, "step": 210500 }, { "epoch": 1.55, "grad_norm": 2.481905221939087, "learning_rate": 2.4190739001095978e-05, "loss": 1.6041, "step": 211000 }, { "epoch": 1.55, "grad_norm": 2.5762555599212646, "learning_rate": 2.4129579614842652e-05, "loss": 1.6117, "step": 211500 }, { "epoch": 1.56, "grad_norm": 2.6616873741149902, "learning_rate": 2.4068420228589323e-05, "loss": 1.6123, "step": 212000 }, { "epoch": 1.56, "grad_norm": 2.6225013732910156, "learning_rate": 2.4007260842335997e-05, "loss": 1.6096, "step": 212500 }, { "epoch": 1.56, "grad_norm": 2.6868574619293213, "learning_rate": 2.394610145608267e-05, "loss": 1.6103, "step": 213000 }, { "epoch": 1.57, "grad_norm": 2.8061540126800537, "learning_rate": 2.3884942069829342e-05, "loss": 1.6051, "step": 213500 }, { "epoch": 1.57, "grad_norm": 2.733086585998535, "learning_rate": 2.3823782683576017e-05, "loss": 1.6069, "step": 214000 }, { "epoch": 1.57, "grad_norm": 2.596497058868408, "learning_rate": 2.3762623297322688e-05, "loss": 1.602, "step": 214500 }, { "epoch": 1.58, "grad_norm": 2.496598243713379, "learning_rate": 2.3701463911069362e-05, "loss": 1.6107, "step": 215000 }, { "epoch": 1.58, "grad_norm": 2.4470176696777344, "learning_rate": 2.3640304524816033e-05, "loss": 1.6021, "step": 215500 }, { "epoch": 1.59, "grad_norm": 2.589895486831665, "learning_rate": 2.3579145138562707e-05, "loss": 1.6029, "step": 216000 }, { "epoch": 1.59, "grad_norm": 2.7477266788482666, "learning_rate": 2.3517985752309378e-05, "loss": 1.6011, "step": 216500 }, { "epoch": 1.59, "grad_norm": 2.7007384300231934, "learning_rate": 2.3456826366056052e-05, "loss": 1.6023, "step": 217000 }, { "epoch": 1.6, "grad_norm": 2.6846890449523926, "learning_rate": 2.3395666979802723e-05, "loss": 1.592, "step": 217500 }, { "epoch": 1.6, "grad_norm": 2.69858455657959, "learning_rate": 2.3334507593549397e-05, "loss": 1.6047, "step": 218000 }, { "epoch": 1.6, "grad_norm": 2.6157824993133545, "learning_rate": 2.327334820729607e-05, "loss": 1.5992, "step": 218500 }, { "epoch": 1.61, "grad_norm": 2.616908073425293, "learning_rate": 2.3212188821042742e-05, "loss": 1.596, "step": 219000 }, { "epoch": 1.61, "grad_norm": 2.7912027835845947, "learning_rate": 2.3151029434789417e-05, "loss": 1.604, "step": 219500 }, { "epoch": 1.61, "grad_norm": 2.6151885986328125, "learning_rate": 2.308987004853609e-05, "loss": 1.5953, "step": 220000 }, { "epoch": 1.62, "grad_norm": 2.8206794261932373, "learning_rate": 2.3028710662282762e-05, "loss": 1.602, "step": 220500 }, { "epoch": 1.62, "grad_norm": 2.6507091522216797, "learning_rate": 2.2967551276029436e-05, "loss": 1.5903, "step": 221000 }, { "epoch": 1.63, "grad_norm": 2.752617359161377, "learning_rate": 2.2906391889776107e-05, "loss": 1.5971, "step": 221500 }, { "epoch": 1.63, "grad_norm": 2.8615899085998535, "learning_rate": 2.284523250352278e-05, "loss": 1.596, "step": 222000 }, { "epoch": 1.63, "grad_norm": 3.0563414096832275, "learning_rate": 2.2784073117269455e-05, "loss": 1.5993, "step": 222500 }, { "epoch": 1.64, "grad_norm": 2.715120553970337, "learning_rate": 2.2722913731016126e-05, "loss": 1.5965, "step": 223000 }, { "epoch": 1.64, "grad_norm": 2.8256382942199707, "learning_rate": 2.26617543447628e-05, "loss": 1.5883, "step": 223500 }, { "epoch": 1.64, "grad_norm": 2.8050873279571533, "learning_rate": 2.2600594958509475e-05, "loss": 1.5914, "step": 224000 }, { "epoch": 1.65, "grad_norm": 2.773902416229248, "learning_rate": 2.2539435572256146e-05, "loss": 1.5884, "step": 224500 }, { "epoch": 1.65, "grad_norm": 2.7655787467956543, "learning_rate": 2.247827618600282e-05, "loss": 1.5965, "step": 225000 }, { "epoch": 1.65, "grad_norm": 2.7787845134735107, "learning_rate": 2.2417116799749494e-05, "loss": 1.5872, "step": 225500 }, { "epoch": 1.66, "grad_norm": 2.73518705368042, "learning_rate": 2.2355957413496165e-05, "loss": 1.588, "step": 226000 }, { "epoch": 1.66, "grad_norm": 2.743821382522583, "learning_rate": 2.229479802724284e-05, "loss": 1.5871, "step": 226500 }, { "epoch": 1.67, "grad_norm": 2.444350242614746, "learning_rate": 2.223363864098951e-05, "loss": 1.5802, "step": 227000 }, { "epoch": 1.67, "grad_norm": 2.597966194152832, "learning_rate": 2.2172479254736185e-05, "loss": 1.5872, "step": 227500 }, { "epoch": 1.67, "grad_norm": 2.7924256324768066, "learning_rate": 2.211131986848286e-05, "loss": 1.5877, "step": 228000 }, { "epoch": 1.68, "grad_norm": 2.5780932903289795, "learning_rate": 2.205016048222953e-05, "loss": 1.583, "step": 228500 }, { "epoch": 1.68, "grad_norm": 2.9303081035614014, "learning_rate": 2.1989001095976204e-05, "loss": 1.5901, "step": 229000 }, { "epoch": 1.68, "grad_norm": 2.601661443710327, "learning_rate": 2.1927841709722878e-05, "loss": 1.5837, "step": 229500 }, { "epoch": 1.69, "grad_norm": 2.6851816177368164, "learning_rate": 2.186668232346955e-05, "loss": 1.5736, "step": 230000 }, { "epoch": 1.69, "grad_norm": 2.592660903930664, "learning_rate": 2.1805522937216223e-05, "loss": 1.5797, "step": 230500 }, { "epoch": 1.7, "grad_norm": 2.876065492630005, "learning_rate": 2.1744363550962894e-05, "loss": 1.5851, "step": 231000 }, { "epoch": 1.7, "grad_norm": 2.507368564605713, "learning_rate": 2.168320416470957e-05, "loss": 1.5868, "step": 231500 }, { "epoch": 1.7, "grad_norm": 2.5661709308624268, "learning_rate": 2.162204477845624e-05, "loss": 1.5826, "step": 232000 }, { "epoch": 1.71, "grad_norm": 2.698857545852661, "learning_rate": 2.1560885392202914e-05, "loss": 1.5739, "step": 232500 }, { "epoch": 1.71, "grad_norm": 2.5845346450805664, "learning_rate": 2.1499726005949584e-05, "loss": 1.5769, "step": 233000 }, { "epoch": 1.71, "grad_norm": 2.7823565006256104, "learning_rate": 2.143856661969626e-05, "loss": 1.5781, "step": 233500 }, { "epoch": 1.72, "grad_norm": 2.675457239151001, "learning_rate": 2.137740723344293e-05, "loss": 1.5773, "step": 234000 }, { "epoch": 1.72, "grad_norm": 2.789083957672119, "learning_rate": 2.1316247847189604e-05, "loss": 1.5662, "step": 234500 }, { "epoch": 1.72, "grad_norm": 2.5719103813171387, "learning_rate": 2.1255088460936278e-05, "loss": 1.58, "step": 235000 }, { "epoch": 1.73, "grad_norm": 2.7980144023895264, "learning_rate": 2.119392907468295e-05, "loss": 1.5769, "step": 235500 }, { "epoch": 1.73, "grad_norm": 2.6691505908966064, "learning_rate": 2.1132769688429623e-05, "loss": 1.5703, "step": 236000 }, { "epoch": 1.74, "grad_norm": 2.839600086212158, "learning_rate": 2.1071610302176297e-05, "loss": 1.5714, "step": 236500 }, { "epoch": 1.74, "grad_norm": 2.8428940773010254, "learning_rate": 2.101045091592297e-05, "loss": 1.5723, "step": 237000 }, { "epoch": 1.74, "grad_norm": 2.5756494998931885, "learning_rate": 2.0949291529669643e-05, "loss": 1.5672, "step": 237500 }, { "epoch": 1.75, "grad_norm": 2.4937775135040283, "learning_rate": 2.0888132143416314e-05, "loss": 1.5715, "step": 238000 }, { "epoch": 1.75, "grad_norm": 2.8386645317077637, "learning_rate": 2.0826972757162988e-05, "loss": 1.566, "step": 238500 }, { "epoch": 1.75, "grad_norm": 2.9764533042907715, "learning_rate": 2.0765813370909662e-05, "loss": 1.5678, "step": 239000 }, { "epoch": 1.76, "grad_norm": 2.5615928173065186, "learning_rate": 2.0704653984656333e-05, "loss": 1.5711, "step": 239500 }, { "epoch": 1.76, "grad_norm": 2.431802988052368, "learning_rate": 2.0643494598403007e-05, "loss": 1.5632, "step": 240000 }, { "epoch": 1.77, "grad_norm": 2.691328287124634, "learning_rate": 2.058233521214968e-05, "loss": 1.5761, "step": 240500 }, { "epoch": 1.77, "grad_norm": 2.833160161972046, "learning_rate": 2.0521175825896352e-05, "loss": 1.568, "step": 241000 }, { "epoch": 1.77, "grad_norm": 2.9443514347076416, "learning_rate": 2.0460016439643027e-05, "loss": 1.5644, "step": 241500 }, { "epoch": 1.78, "grad_norm": 2.6418864727020264, "learning_rate": 2.03988570533897e-05, "loss": 1.5644, "step": 242000 }, { "epoch": 1.78, "grad_norm": 2.559652090072632, "learning_rate": 2.033769766713637e-05, "loss": 1.5588, "step": 242500 }, { "epoch": 1.78, "grad_norm": 2.376955509185791, "learning_rate": 2.0276538280883046e-05, "loss": 1.5659, "step": 243000 }, { "epoch": 1.79, "grad_norm": 3.0132250785827637, "learning_rate": 2.0215378894629717e-05, "loss": 1.56, "step": 243500 }, { "epoch": 1.79, "grad_norm": 2.493617534637451, "learning_rate": 2.015421950837639e-05, "loss": 1.5602, "step": 244000 }, { "epoch": 1.79, "grad_norm": 2.6484365463256836, "learning_rate": 2.0093060122123065e-05, "loss": 1.5646, "step": 244500 }, { "epoch": 1.8, "grad_norm": 2.5682971477508545, "learning_rate": 2.0031900735869736e-05, "loss": 1.5622, "step": 245000 }, { "epoch": 1.8, "grad_norm": 2.783363103866577, "learning_rate": 1.997074134961641e-05, "loss": 1.5568, "step": 245500 }, { "epoch": 1.81, "grad_norm": 2.5576345920562744, "learning_rate": 1.9909581963363085e-05, "loss": 1.558, "step": 246000 }, { "epoch": 1.81, "grad_norm": 2.3469157218933105, "learning_rate": 1.9848422577109756e-05, "loss": 1.562, "step": 246500 }, { "epoch": 1.81, "grad_norm": 2.7063257694244385, "learning_rate": 1.978726319085643e-05, "loss": 1.5582, "step": 247000 }, { "epoch": 1.82, "grad_norm": 3.00256085395813, "learning_rate": 1.97261038046031e-05, "loss": 1.5574, "step": 247500 }, { "epoch": 1.82, "grad_norm": 2.35555100440979, "learning_rate": 1.966494441834977e-05, "loss": 1.5567, "step": 248000 }, { "epoch": 1.82, "grad_norm": 2.5847179889678955, "learning_rate": 1.9603785032096446e-05, "loss": 1.5647, "step": 248500 }, { "epoch": 1.83, "grad_norm": 2.629279613494873, "learning_rate": 1.9542625645843117e-05, "loss": 1.552, "step": 249000 }, { "epoch": 1.83, "grad_norm": 2.6433770656585693, "learning_rate": 1.948146625958979e-05, "loss": 1.5547, "step": 249500 }, { "epoch": 1.83, "grad_norm": 2.6378979682922363, "learning_rate": 1.9420306873336465e-05, "loss": 1.5549, "step": 250000 }, { "epoch": 1.84, "grad_norm": 2.7272751331329346, "learning_rate": 1.9359147487083136e-05, "loss": 1.5496, "step": 250500 }, { "epoch": 1.84, "grad_norm": 2.661400556564331, "learning_rate": 1.929798810082981e-05, "loss": 1.5597, "step": 251000 }, { "epoch": 1.85, "grad_norm": 2.76647686958313, "learning_rate": 1.9236828714576485e-05, "loss": 1.5559, "step": 251500 }, { "epoch": 1.85, "grad_norm": 2.4355571269989014, "learning_rate": 1.9175669328323156e-05, "loss": 1.5512, "step": 252000 }, { "epoch": 1.85, "grad_norm": 2.503006935119629, "learning_rate": 1.911450994206983e-05, "loss": 1.5459, "step": 252500 }, { "epoch": 1.86, "grad_norm": 2.4940273761749268, "learning_rate": 1.9053350555816504e-05, "loss": 1.5563, "step": 253000 }, { "epoch": 1.86, "grad_norm": 3.0512688159942627, "learning_rate": 1.8992191169563175e-05, "loss": 1.5542, "step": 253500 }, { "epoch": 1.86, "grad_norm": 2.811276912689209, "learning_rate": 1.893103178330985e-05, "loss": 1.5447, "step": 254000 }, { "epoch": 1.87, "grad_norm": 2.565730571746826, "learning_rate": 1.8869872397056523e-05, "loss": 1.5446, "step": 254500 }, { "epoch": 1.87, "grad_norm": 2.6504178047180176, "learning_rate": 1.8808713010803194e-05, "loss": 1.5506, "step": 255000 }, { "epoch": 1.88, "grad_norm": 3.0442628860473633, "learning_rate": 1.874755362454987e-05, "loss": 1.55, "step": 255500 }, { "epoch": 1.88, "grad_norm": 2.6336920261383057, "learning_rate": 1.868639423829654e-05, "loss": 1.5424, "step": 256000 }, { "epoch": 1.88, "grad_norm": 2.7758066654205322, "learning_rate": 1.8625234852043214e-05, "loss": 1.5479, "step": 256500 }, { "epoch": 1.89, "grad_norm": 2.818814992904663, "learning_rate": 1.8564075465789888e-05, "loss": 1.5486, "step": 257000 }, { "epoch": 1.89, "grad_norm": 2.6956701278686523, "learning_rate": 1.850291607953656e-05, "loss": 1.5497, "step": 257500 }, { "epoch": 1.89, "grad_norm": 2.7896413803100586, "learning_rate": 1.8441756693283233e-05, "loss": 1.5437, "step": 258000 }, { "epoch": 1.9, "grad_norm": 2.917079448699951, "learning_rate": 1.8380597307029907e-05, "loss": 1.5432, "step": 258500 }, { "epoch": 1.9, "grad_norm": 2.761766195297241, "learning_rate": 1.8319437920776578e-05, "loss": 1.5413, "step": 259000 }, { "epoch": 1.9, "grad_norm": 2.7666103839874268, "learning_rate": 1.8258278534523253e-05, "loss": 1.5396, "step": 259500 }, { "epoch": 1.91, "grad_norm": 2.691253423690796, "learning_rate": 1.8197119148269927e-05, "loss": 1.5372, "step": 260000 }, { "epoch": 1.91, "grad_norm": 2.911930799484253, "learning_rate": 1.8135959762016598e-05, "loss": 1.5485, "step": 260500 }, { "epoch": 1.92, "grad_norm": 2.5208046436309814, "learning_rate": 1.8074800375763272e-05, "loss": 1.5438, "step": 261000 }, { "epoch": 1.92, "grad_norm": 2.41379976272583, "learning_rate": 1.8013640989509943e-05, "loss": 1.5384, "step": 261500 }, { "epoch": 1.92, "grad_norm": 2.636869430541992, "learning_rate": 1.7952481603256617e-05, "loss": 1.5477, "step": 262000 }, { "epoch": 1.93, "grad_norm": 2.6929407119750977, "learning_rate": 1.7891322217003288e-05, "loss": 1.5384, "step": 262500 }, { "epoch": 1.93, "grad_norm": 2.849163055419922, "learning_rate": 1.7830162830749962e-05, "loss": 1.5394, "step": 263000 }, { "epoch": 1.93, "grad_norm": 2.5682120323181152, "learning_rate": 1.7769003444496633e-05, "loss": 1.5353, "step": 263500 }, { "epoch": 1.94, "grad_norm": 2.5825769901275635, "learning_rate": 1.7707844058243307e-05, "loss": 1.535, "step": 264000 }, { "epoch": 1.94, "grad_norm": 2.426283597946167, "learning_rate": 1.7646684671989978e-05, "loss": 1.5373, "step": 264500 }, { "epoch": 1.94, "grad_norm": 2.706394910812378, "learning_rate": 1.7585525285736652e-05, "loss": 1.5358, "step": 265000 }, { "epoch": 1.95, "grad_norm": 2.6370396614074707, "learning_rate": 1.7524365899483327e-05, "loss": 1.5391, "step": 265500 }, { "epoch": 1.95, "grad_norm": 2.553217649459839, "learning_rate": 1.7463206513229998e-05, "loss": 1.5299, "step": 266000 }, { "epoch": 1.96, "grad_norm": 2.884148120880127, "learning_rate": 1.7402047126976672e-05, "loss": 1.5432, "step": 266500 }, { "epoch": 1.96, "grad_norm": 2.7331855297088623, "learning_rate": 1.7340887740723343e-05, "loss": 1.5329, "step": 267000 }, { "epoch": 1.96, "grad_norm": 2.841865062713623, "learning_rate": 1.7279728354470017e-05, "loss": 1.5319, "step": 267500 }, { "epoch": 1.97, "grad_norm": 2.463677406311035, "learning_rate": 1.721856896821669e-05, "loss": 1.5274, "step": 268000 }, { "epoch": 1.97, "grad_norm": 2.7880847454071045, "learning_rate": 1.7157409581963362e-05, "loss": 1.5244, "step": 268500 }, { "epoch": 1.97, "grad_norm": 2.5323753356933594, "learning_rate": 1.7096250195710036e-05, "loss": 1.5343, "step": 269000 }, { "epoch": 1.98, "grad_norm": 2.93086838722229, "learning_rate": 1.703509080945671e-05, "loss": 1.5253, "step": 269500 }, { "epoch": 1.98, "grad_norm": 2.8919107913970947, "learning_rate": 1.697393142320338e-05, "loss": 1.5334, "step": 270000 }, { "epoch": 1.99, "grad_norm": 2.8613593578338623, "learning_rate": 1.6912772036950056e-05, "loss": 1.5342, "step": 270500 }, { "epoch": 1.99, "grad_norm": 2.5317909717559814, "learning_rate": 1.685161265069673e-05, "loss": 1.5278, "step": 271000 }, { "epoch": 1.99, "grad_norm": 2.832613706588745, "learning_rate": 1.67904532644434e-05, "loss": 1.5318, "step": 271500 }, { "epoch": 2.0, "grad_norm": 2.5811901092529297, "learning_rate": 1.6729293878190075e-05, "loss": 1.5338, "step": 272000 }, { "epoch": 2.0, "grad_norm": 2.640382766723633, "learning_rate": 1.6668134491936746e-05, "loss": 1.5278, "step": 272500 }, { "epoch": 2.0, "grad_norm": 2.777024745941162, "learning_rate": 1.660697510568342e-05, "loss": 1.5192, "step": 273000 }, { "epoch": 2.01, "grad_norm": 2.546867609024048, "learning_rate": 1.6545815719430095e-05, "loss": 1.5203, "step": 273500 }, { "epoch": 2.01, "grad_norm": 2.459458589553833, "learning_rate": 1.6484656333176765e-05, "loss": 1.5232, "step": 274000 }, { "epoch": 2.01, "grad_norm": 2.6832683086395264, "learning_rate": 1.642349694692344e-05, "loss": 1.5133, "step": 274500 }, { "epoch": 2.02, "grad_norm": 2.6847174167633057, "learning_rate": 1.6362337560670114e-05, "loss": 1.5202, "step": 275000 }, { "epoch": 2.02, "grad_norm": 2.819836139678955, "learning_rate": 1.6301178174416785e-05, "loss": 1.5225, "step": 275500 }, { "epoch": 2.03, "grad_norm": 2.4789822101593018, "learning_rate": 1.624001878816346e-05, "loss": 1.5185, "step": 276000 }, { "epoch": 2.03, "grad_norm": 2.8469271659851074, "learning_rate": 1.6178859401910133e-05, "loss": 1.5281, "step": 276500 }, { "epoch": 2.03, "grad_norm": 2.4741554260253906, "learning_rate": 1.6117700015656804e-05, "loss": 1.5187, "step": 277000 }, { "epoch": 2.04, "grad_norm": 2.7348639965057373, "learning_rate": 1.605654062940348e-05, "loss": 1.519, "step": 277500 }, { "epoch": 2.04, "grad_norm": 2.590632677078247, "learning_rate": 1.599538124315015e-05, "loss": 1.5242, "step": 278000 }, { "epoch": 2.04, "grad_norm": 2.926156997680664, "learning_rate": 1.5934221856896824e-05, "loss": 1.5182, "step": 278500 }, { "epoch": 2.05, "grad_norm": 2.3463704586029053, "learning_rate": 1.5873062470643494e-05, "loss": 1.5186, "step": 279000 }, { "epoch": 2.05, "grad_norm": 2.8778836727142334, "learning_rate": 1.581190308439017e-05, "loss": 1.5133, "step": 279500 }, { "epoch": 2.05, "grad_norm": 2.7937684059143066, "learning_rate": 1.575074369813684e-05, "loss": 1.5204, "step": 280000 }, { "epoch": 2.06, "grad_norm": 2.6967952251434326, "learning_rate": 1.5689584311883514e-05, "loss": 1.5238, "step": 280500 }, { "epoch": 2.06, "grad_norm": 2.7939419746398926, "learning_rate": 1.5628424925630185e-05, "loss": 1.5162, "step": 281000 }, { "epoch": 2.07, "grad_norm": 2.4184165000915527, "learning_rate": 1.556726553937686e-05, "loss": 1.5083, "step": 281500 }, { "epoch": 2.07, "grad_norm": 2.5736517906188965, "learning_rate": 1.5506106153123533e-05, "loss": 1.5225, "step": 282000 }, { "epoch": 2.07, "grad_norm": 2.775562286376953, "learning_rate": 1.5444946766870204e-05, "loss": 1.5107, "step": 282500 }, { "epoch": 2.08, "grad_norm": 2.65218186378479, "learning_rate": 1.538378738061688e-05, "loss": 1.5101, "step": 283000 }, { "epoch": 2.08, "grad_norm": 2.9510700702667236, "learning_rate": 1.532262799436355e-05, "loss": 1.5108, "step": 283500 }, { "epoch": 2.08, "grad_norm": 2.663459300994873, "learning_rate": 1.5261468608110224e-05, "loss": 1.5039, "step": 284000 }, { "epoch": 2.09, "grad_norm": 2.621185541152954, "learning_rate": 1.5200309221856898e-05, "loss": 1.5044, "step": 284500 }, { "epoch": 2.09, "grad_norm": 2.7597007751464844, "learning_rate": 1.5139149835603569e-05, "loss": 1.5123, "step": 285000 }, { "epoch": 2.1, "grad_norm": 2.9049315452575684, "learning_rate": 1.5077990449350243e-05, "loss": 1.5129, "step": 285500 }, { "epoch": 2.1, "grad_norm": 2.7064170837402344, "learning_rate": 1.5016831063096917e-05, "loss": 1.5075, "step": 286000 }, { "epoch": 2.1, "grad_norm": 2.8447062969207764, "learning_rate": 1.4955671676843588e-05, "loss": 1.5107, "step": 286500 }, { "epoch": 2.11, "grad_norm": 2.63680100440979, "learning_rate": 1.4894512290590262e-05, "loss": 1.5095, "step": 287000 }, { "epoch": 2.11, "grad_norm": 2.9696691036224365, "learning_rate": 1.4833352904336937e-05, "loss": 1.5069, "step": 287500 }, { "epoch": 2.11, "grad_norm": 2.7010321617126465, "learning_rate": 1.4772193518083607e-05, "loss": 1.5094, "step": 288000 }, { "epoch": 2.12, "grad_norm": 2.5756781101226807, "learning_rate": 1.4711034131830282e-05, "loss": 1.5054, "step": 288500 }, { "epoch": 2.12, "grad_norm": 3.0450093746185303, "learning_rate": 1.4649874745576956e-05, "loss": 1.5115, "step": 289000 }, { "epoch": 2.12, "grad_norm": 2.551755905151367, "learning_rate": 1.4588715359323627e-05, "loss": 1.5129, "step": 289500 }, { "epoch": 2.13, "grad_norm": 2.865170478820801, "learning_rate": 1.4527555973070301e-05, "loss": 1.4972, "step": 290000 }, { "epoch": 2.13, "grad_norm": 2.648294687271118, "learning_rate": 1.4466396586816972e-05, "loss": 1.5093, "step": 290500 }, { "epoch": 2.14, "grad_norm": 2.600937604904175, "learning_rate": 1.4405237200563646e-05, "loss": 1.5043, "step": 291000 }, { "epoch": 2.14, "grad_norm": 2.9919681549072266, "learning_rate": 1.4344077814310319e-05, "loss": 1.4997, "step": 291500 }, { "epoch": 2.14, "grad_norm": 2.8291046619415283, "learning_rate": 1.4282918428056991e-05, "loss": 1.5196, "step": 292000 }, { "epoch": 2.15, "grad_norm": 2.66756272315979, "learning_rate": 1.4221759041803664e-05, "loss": 1.5007, "step": 292500 }, { "epoch": 2.15, "grad_norm": 2.809164524078369, "learning_rate": 1.4160599655550338e-05, "loss": 1.5033, "step": 293000 }, { "epoch": 2.15, "grad_norm": 2.6483566761016846, "learning_rate": 1.4099440269297009e-05, "loss": 1.5065, "step": 293500 }, { "epoch": 2.16, "grad_norm": 2.4449145793914795, "learning_rate": 1.4038280883043683e-05, "loss": 1.5032, "step": 294000 }, { "epoch": 2.16, "grad_norm": 2.6919500827789307, "learning_rate": 1.3977121496790358e-05, "loss": 1.5053, "step": 294500 }, { "epoch": 2.17, "grad_norm": 2.8122289180755615, "learning_rate": 1.3915962110537028e-05, "loss": 1.5079, "step": 295000 }, { "epoch": 2.17, "grad_norm": 2.7903494834899902, "learning_rate": 1.3854802724283703e-05, "loss": 1.4965, "step": 295500 }, { "epoch": 2.17, "grad_norm": 2.525930404663086, "learning_rate": 1.3793643338030374e-05, "loss": 1.5043, "step": 296000 }, { "epoch": 2.18, "grad_norm": 2.493638277053833, "learning_rate": 1.3732483951777048e-05, "loss": 1.4974, "step": 296500 }, { "epoch": 2.18, "grad_norm": 2.4521799087524414, "learning_rate": 1.3671324565523722e-05, "loss": 1.4941, "step": 297000 }, { "epoch": 2.18, "grad_norm": 2.8091464042663574, "learning_rate": 1.3610165179270393e-05, "loss": 1.5018, "step": 297500 }, { "epoch": 2.19, "grad_norm": 2.5954153537750244, "learning_rate": 1.3549005793017067e-05, "loss": 1.4999, "step": 298000 }, { "epoch": 2.19, "grad_norm": 2.7937843799591064, "learning_rate": 1.348784640676374e-05, "loss": 1.4971, "step": 298500 }, { "epoch": 2.19, "grad_norm": 2.731354236602783, "learning_rate": 1.3426687020510412e-05, "loss": 1.5019, "step": 299000 }, { "epoch": 2.2, "grad_norm": 2.893202066421509, "learning_rate": 1.3365527634257085e-05, "loss": 1.5084, "step": 299500 }, { "epoch": 2.2, "grad_norm": 2.5517237186431885, "learning_rate": 1.330436824800376e-05, "loss": 1.4979, "step": 300000 }, { "epoch": 2.21, "grad_norm": 2.5626368522644043, "learning_rate": 1.324320886175043e-05, "loss": 1.5023, "step": 300500 }, { "epoch": 2.21, "grad_norm": 2.9477968215942383, "learning_rate": 1.3182049475497104e-05, "loss": 1.4948, "step": 301000 }, { "epoch": 2.21, "grad_norm": 2.5781774520874023, "learning_rate": 1.3120890089243775e-05, "loss": 1.4968, "step": 301500 }, { "epoch": 2.22, "grad_norm": 2.8032429218292236, "learning_rate": 1.305973070299045e-05, "loss": 1.5017, "step": 302000 }, { "epoch": 2.22, "grad_norm": 2.6801342964172363, "learning_rate": 1.2998571316737124e-05, "loss": 1.4932, "step": 302500 }, { "epoch": 2.22, "grad_norm": 2.6974122524261475, "learning_rate": 1.2937411930483795e-05, "loss": 1.4965, "step": 303000 }, { "epoch": 2.23, "grad_norm": 2.4389328956604004, "learning_rate": 1.2876252544230469e-05, "loss": 1.4933, "step": 303500 }, { "epoch": 2.23, "grad_norm": 2.66622257232666, "learning_rate": 1.2815093157977143e-05, "loss": 1.4941, "step": 304000 }, { "epoch": 2.23, "grad_norm": 2.6904194355010986, "learning_rate": 1.2753933771723814e-05, "loss": 1.4946, "step": 304500 }, { "epoch": 2.24, "grad_norm": 2.673464298248291, "learning_rate": 1.2692774385470488e-05, "loss": 1.5015, "step": 305000 }, { "epoch": 2.24, "grad_norm": 2.6104812622070312, "learning_rate": 1.2631614999217163e-05, "loss": 1.4886, "step": 305500 }, { "epoch": 2.25, "grad_norm": 2.8773016929626465, "learning_rate": 1.2570455612963833e-05, "loss": 1.4916, "step": 306000 }, { "epoch": 2.25, "grad_norm": 2.437274217605591, "learning_rate": 1.2509296226710508e-05, "loss": 1.4909, "step": 306500 }, { "epoch": 2.25, "grad_norm": 3.1659114360809326, "learning_rate": 1.244813684045718e-05, "loss": 1.488, "step": 307000 }, { "epoch": 2.26, "grad_norm": 2.607539653778076, "learning_rate": 1.2386977454203851e-05, "loss": 1.4916, "step": 307500 }, { "epoch": 2.26, "grad_norm": 2.589136838912964, "learning_rate": 1.2325818067950524e-05, "loss": 1.4893, "step": 308000 }, { "epoch": 2.26, "grad_norm": 3.010464668273926, "learning_rate": 1.2264658681697198e-05, "loss": 1.4867, "step": 308500 }, { "epoch": 2.27, "grad_norm": 2.713313579559326, "learning_rate": 1.220349929544387e-05, "loss": 1.4788, "step": 309000 }, { "epoch": 2.27, "grad_norm": 2.753493070602417, "learning_rate": 1.2142339909190543e-05, "loss": 1.4839, "step": 309500 }, { "epoch": 2.28, "grad_norm": 2.8799803256988525, "learning_rate": 1.2081180522937217e-05, "loss": 1.4914, "step": 310000 }, { "epoch": 2.28, "grad_norm": 2.8280301094055176, "learning_rate": 1.202002113668389e-05, "loss": 1.4809, "step": 310500 }, { "epoch": 2.28, "grad_norm": 2.9053263664245605, "learning_rate": 1.1958861750430562e-05, "loss": 1.4788, "step": 311000 }, { "epoch": 2.29, "grad_norm": 2.879546880722046, "learning_rate": 1.1897702364177235e-05, "loss": 1.4825, "step": 311500 }, { "epoch": 2.29, "grad_norm": 2.473529577255249, "learning_rate": 1.183654297792391e-05, "loss": 1.4883, "step": 312000 }, { "epoch": 2.29, "grad_norm": 2.743178367614746, "learning_rate": 1.1775383591670582e-05, "loss": 1.4804, "step": 312500 }, { "epoch": 2.3, "grad_norm": 2.6918370723724365, "learning_rate": 1.1714224205417254e-05, "loss": 1.4891, "step": 313000 }, { "epoch": 2.3, "grad_norm": 2.9803996086120605, "learning_rate": 1.1653064819163927e-05, "loss": 1.486, "step": 313500 }, { "epoch": 2.3, "grad_norm": 2.544872999191284, "learning_rate": 1.1591905432910601e-05, "loss": 1.4879, "step": 314000 }, { "epoch": 2.31, "grad_norm": 2.8242433071136475, "learning_rate": 1.1530746046657274e-05, "loss": 1.4848, "step": 314500 }, { "epoch": 2.31, "grad_norm": 2.7912473678588867, "learning_rate": 1.1469586660403946e-05, "loss": 1.4847, "step": 315000 }, { "epoch": 2.32, "grad_norm": 3.1455202102661133, "learning_rate": 1.1408427274150619e-05, "loss": 1.4899, "step": 315500 }, { "epoch": 2.32, "grad_norm": 2.8553197383880615, "learning_rate": 1.1347267887897291e-05, "loss": 1.4799, "step": 316000 }, { "epoch": 2.32, "grad_norm": 2.7605557441711426, "learning_rate": 1.1286108501643964e-05, "loss": 1.4808, "step": 316500 }, { "epoch": 2.33, "grad_norm": 2.7065718173980713, "learning_rate": 1.1224949115390637e-05, "loss": 1.4846, "step": 317000 }, { "epoch": 2.33, "grad_norm": 2.719977378845215, "learning_rate": 1.1163789729137311e-05, "loss": 1.4831, "step": 317500 }, { "epoch": 2.33, "grad_norm": 2.569617509841919, "learning_rate": 1.1102630342883983e-05, "loss": 1.4798, "step": 318000 }, { "epoch": 2.34, "grad_norm": 2.4670286178588867, "learning_rate": 1.1041470956630656e-05, "loss": 1.4765, "step": 318500 }, { "epoch": 2.34, "grad_norm": 2.797725200653076, "learning_rate": 1.098031157037733e-05, "loss": 1.4817, "step": 319000 }, { "epoch": 2.34, "grad_norm": 2.8332033157348633, "learning_rate": 1.0919152184124003e-05, "loss": 1.4835, "step": 319500 }, { "epoch": 2.35, "grad_norm": 2.494609832763672, "learning_rate": 1.0857992797870675e-05, "loss": 1.4746, "step": 320000 }, { "epoch": 2.35, "grad_norm": 2.708406925201416, "learning_rate": 1.0796833411617348e-05, "loss": 1.4764, "step": 320500 }, { "epoch": 2.36, "grad_norm": 2.59369158744812, "learning_rate": 1.0735674025364022e-05, "loss": 1.4808, "step": 321000 }, { "epoch": 2.36, "grad_norm": 2.803255558013916, "learning_rate": 1.0674514639110695e-05, "loss": 1.48, "step": 321500 }, { "epoch": 2.36, "grad_norm": 2.5560402870178223, "learning_rate": 1.0613355252857367e-05, "loss": 1.4843, "step": 322000 }, { "epoch": 2.37, "grad_norm": 2.911194324493408, "learning_rate": 1.055219586660404e-05, "loss": 1.4801, "step": 322500 }, { "epoch": 2.37, "grad_norm": 2.8196239471435547, "learning_rate": 1.0491036480350713e-05, "loss": 1.4762, "step": 323000 }, { "epoch": 2.37, "grad_norm": 2.709317445755005, "learning_rate": 1.0429877094097385e-05, "loss": 1.476, "step": 323500 }, { "epoch": 2.38, "grad_norm": 2.627985715866089, "learning_rate": 1.0368717707844058e-05, "loss": 1.4781, "step": 324000 }, { "epoch": 2.38, "grad_norm": 2.8382914066314697, "learning_rate": 1.0307558321590732e-05, "loss": 1.4728, "step": 324500 }, { "epoch": 2.39, "grad_norm": 2.8126072883605957, "learning_rate": 1.0246398935337404e-05, "loss": 1.4778, "step": 325000 }, { "epoch": 2.39, "grad_norm": 2.6362712383270264, "learning_rate": 1.0185239549084077e-05, "loss": 1.476, "step": 325500 }, { "epoch": 2.39, "grad_norm": 2.761763572692871, "learning_rate": 1.012408016283075e-05, "loss": 1.4704, "step": 326000 }, { "epoch": 2.4, "grad_norm": 2.6072146892547607, "learning_rate": 1.0062920776577424e-05, "loss": 1.4703, "step": 326500 }, { "epoch": 2.4, "grad_norm": 3.07877254486084, "learning_rate": 1.0001761390324096e-05, "loss": 1.4863, "step": 327000 }, { "epoch": 2.4, "grad_norm": 2.986053705215454, "learning_rate": 9.940602004070769e-06, "loss": 1.4837, "step": 327500 }, { "epoch": 2.41, "grad_norm": 2.7128584384918213, "learning_rate": 9.879442617817442e-06, "loss": 1.4648, "step": 328000 }, { "epoch": 2.41, "grad_norm": 2.6193928718566895, "learning_rate": 9.818283231564116e-06, "loss": 1.4628, "step": 328500 }, { "epoch": 2.41, "grad_norm": 2.7647864818573, "learning_rate": 9.757123845310788e-06, "loss": 1.4799, "step": 329000 }, { "epoch": 2.42, "grad_norm": 2.70143985748291, "learning_rate": 9.695964459057461e-06, "loss": 1.4629, "step": 329500 }, { "epoch": 2.42, "grad_norm": 2.705559730529785, "learning_rate": 9.634805072804135e-06, "loss": 1.4662, "step": 330000 }, { "epoch": 2.43, "grad_norm": 2.678466796875, "learning_rate": 9.573645686550808e-06, "loss": 1.4693, "step": 330500 }, { "epoch": 2.43, "grad_norm": 2.5051000118255615, "learning_rate": 9.51248630029748e-06, "loss": 1.4705, "step": 331000 }, { "epoch": 2.43, "grad_norm": 2.8841006755828857, "learning_rate": 9.451326914044153e-06, "loss": 1.4651, "step": 331500 }, { "epoch": 2.44, "grad_norm": 2.7045044898986816, "learning_rate": 9.390167527790825e-06, "loss": 1.4665, "step": 332000 }, { "epoch": 2.44, "grad_norm": 3.101134777069092, "learning_rate": 9.329008141537498e-06, "loss": 1.4746, "step": 332500 }, { "epoch": 2.44, "grad_norm": 2.5567667484283447, "learning_rate": 9.26784875528417e-06, "loss": 1.4667, "step": 333000 }, { "epoch": 2.45, "grad_norm": 2.5476863384246826, "learning_rate": 9.206689369030843e-06, "loss": 1.4593, "step": 333500 }, { "epoch": 2.45, "grad_norm": 2.6363370418548584, "learning_rate": 9.145529982777517e-06, "loss": 1.4632, "step": 334000 }, { "epoch": 2.45, "grad_norm": 2.9167027473449707, "learning_rate": 9.08437059652419e-06, "loss": 1.4595, "step": 334500 }, { "epoch": 2.46, "grad_norm": 2.77966046333313, "learning_rate": 9.023211210270863e-06, "loss": 1.4604, "step": 335000 }, { "epoch": 2.46, "grad_norm": 3.0701239109039307, "learning_rate": 8.962051824017537e-06, "loss": 1.4613, "step": 335500 }, { "epoch": 2.47, "grad_norm": 2.6307058334350586, "learning_rate": 8.90089243776421e-06, "loss": 1.4627, "step": 336000 }, { "epoch": 2.47, "grad_norm": 2.46291184425354, "learning_rate": 8.839733051510882e-06, "loss": 1.4667, "step": 336500 }, { "epoch": 2.47, "grad_norm": 2.7968499660491943, "learning_rate": 8.778573665257555e-06, "loss": 1.4644, "step": 337000 }, { "epoch": 2.48, "grad_norm": 2.7745018005371094, "learning_rate": 8.717414279004229e-06, "loss": 1.458, "step": 337500 }, { "epoch": 2.48, "grad_norm": 2.951845645904541, "learning_rate": 8.656254892750901e-06, "loss": 1.4723, "step": 338000 }, { "epoch": 2.48, "grad_norm": 2.6524295806884766, "learning_rate": 8.595095506497574e-06, "loss": 1.4607, "step": 338500 }, { "epoch": 2.49, "grad_norm": 2.800586223602295, "learning_rate": 8.533936120244246e-06, "loss": 1.4606, "step": 339000 }, { "epoch": 2.49, "grad_norm": 2.947486639022827, "learning_rate": 8.472776733990919e-06, "loss": 1.4608, "step": 339500 }, { "epoch": 2.5, "grad_norm": 2.936547040939331, "learning_rate": 8.411617347737592e-06, "loss": 1.4582, "step": 340000 }, { "epoch": 2.5, "grad_norm": 2.820474147796631, "learning_rate": 8.350457961484264e-06, "loss": 1.4648, "step": 340500 }, { "epoch": 2.5, "grad_norm": 2.754638910293579, "learning_rate": 8.289298575230938e-06, "loss": 1.4613, "step": 341000 }, { "epoch": 2.51, "grad_norm": 2.46992564201355, "learning_rate": 8.228139188977611e-06, "loss": 1.4685, "step": 341500 }, { "epoch": 2.51, "grad_norm": 2.8246257305145264, "learning_rate": 8.166979802724284e-06, "loss": 1.4632, "step": 342000 }, { "epoch": 2.51, "grad_norm": 2.966745138168335, "learning_rate": 8.105820416470956e-06, "loss": 1.4615, "step": 342500 }, { "epoch": 2.52, "grad_norm": 2.9904332160949707, "learning_rate": 8.04466103021763e-06, "loss": 1.4646, "step": 343000 }, { "epoch": 2.52, "grad_norm": 2.7209649085998535, "learning_rate": 7.983501643964303e-06, "loss": 1.4602, "step": 343500 }, { "epoch": 2.52, "grad_norm": 2.7970163822174072, "learning_rate": 7.922342257710976e-06, "loss": 1.4557, "step": 344000 }, { "epoch": 2.53, "grad_norm": 2.646637201309204, "learning_rate": 7.86118287145765e-06, "loss": 1.4554, "step": 344500 }, { "epoch": 2.53, "grad_norm": 2.8239455223083496, "learning_rate": 7.800023485204322e-06, "loss": 1.4527, "step": 345000 }, { "epoch": 2.54, "grad_norm": 2.9307682514190674, "learning_rate": 7.738864098950995e-06, "loss": 1.461, "step": 345500 }, { "epoch": 2.54, "grad_norm": 2.840571165084839, "learning_rate": 7.677704712697668e-06, "loss": 1.4592, "step": 346000 }, { "epoch": 2.54, "grad_norm": 2.7356936931610107, "learning_rate": 7.616545326444341e-06, "loss": 1.46, "step": 346500 }, { "epoch": 2.55, "grad_norm": 2.902578353881836, "learning_rate": 7.5553859401910135e-06, "loss": 1.4627, "step": 347000 }, { "epoch": 2.55, "grad_norm": 3.005869150161743, "learning_rate": 7.494226553937686e-06, "loss": 1.4592, "step": 347500 }, { "epoch": 2.55, "grad_norm": 2.7847769260406494, "learning_rate": 7.433067167684359e-06, "loss": 1.4652, "step": 348000 }, { "epoch": 2.56, "grad_norm": 2.821125030517578, "learning_rate": 7.371907781431033e-06, "loss": 1.4585, "step": 348500 }, { "epoch": 2.56, "grad_norm": 3.5766139030456543, "learning_rate": 7.3107483951777054e-06, "loss": 1.4511, "step": 349000 }, { "epoch": 2.57, "grad_norm": 2.9950060844421387, "learning_rate": 7.249589008924377e-06, "loss": 1.45, "step": 349500 }, { "epoch": 2.57, "grad_norm": 2.922325611114502, "learning_rate": 7.1884296226710514e-06, "loss": 1.4549, "step": 350000 }, { "epoch": 2.57, "grad_norm": 3.0797903537750244, "learning_rate": 7.127270236417724e-06, "loss": 1.4643, "step": 350500 }, { "epoch": 2.58, "grad_norm": 2.6984283924102783, "learning_rate": 7.0661108501643966e-06, "loss": 1.4528, "step": 351000 }, { "epoch": 2.58, "grad_norm": 2.804563283920288, "learning_rate": 7.004951463911069e-06, "loss": 1.4565, "step": 351500 }, { "epoch": 2.58, "grad_norm": 2.6416571140289307, "learning_rate": 6.943792077657743e-06, "loss": 1.4523, "step": 352000 }, { "epoch": 2.59, "grad_norm": 2.5998694896698, "learning_rate": 6.882632691404416e-06, "loss": 1.4546, "step": 352500 }, { "epoch": 2.59, "grad_norm": 2.714494228363037, "learning_rate": 6.8214733051510885e-06, "loss": 1.4581, "step": 353000 }, { "epoch": 2.59, "grad_norm": 2.8569843769073486, "learning_rate": 6.760313918897761e-06, "loss": 1.4562, "step": 353500 }, { "epoch": 2.6, "grad_norm": 2.5230369567871094, "learning_rate": 6.6991545326444345e-06, "loss": 1.4469, "step": 354000 }, { "epoch": 2.6, "grad_norm": 2.654069423675537, "learning_rate": 6.637995146391107e-06, "loss": 1.449, "step": 354500 }, { "epoch": 2.61, "grad_norm": 3.2150166034698486, "learning_rate": 6.57683576013778e-06, "loss": 1.4506, "step": 355000 }, { "epoch": 2.61, "grad_norm": 2.6119723320007324, "learning_rate": 6.515676373884454e-06, "loss": 1.4507, "step": 355500 }, { "epoch": 2.61, "grad_norm": 2.6928510665893555, "learning_rate": 6.4545169876311265e-06, "loss": 1.4565, "step": 356000 }, { "epoch": 2.62, "grad_norm": 2.555009603500366, "learning_rate": 6.393357601377799e-06, "loss": 1.4514, "step": 356500 }, { "epoch": 2.62, "grad_norm": 2.785787343978882, "learning_rate": 6.332198215124472e-06, "loss": 1.4525, "step": 357000 }, { "epoch": 2.62, "grad_norm": 2.8000409603118896, "learning_rate": 6.271038828871145e-06, "loss": 1.4498, "step": 357500 }, { "epoch": 2.63, "grad_norm": 3.049229860305786, "learning_rate": 6.209879442617818e-06, "loss": 1.45, "step": 358000 }, { "epoch": 2.63, "grad_norm": 2.8990321159362793, "learning_rate": 6.14872005636449e-06, "loss": 1.454, "step": 358500 }, { "epoch": 2.63, "grad_norm": 2.6227800846099854, "learning_rate": 6.0875606701111636e-06, "loss": 1.4535, "step": 359000 }, { "epoch": 2.64, "grad_norm": 2.856273651123047, "learning_rate": 6.026401283857837e-06, "loss": 1.453, "step": 359500 }, { "epoch": 2.64, "grad_norm": 2.6688528060913086, "learning_rate": 5.9652418976045095e-06, "loss": 1.4396, "step": 360000 }, { "epoch": 2.65, "grad_norm": 2.765559196472168, "learning_rate": 5.904082511351183e-06, "loss": 1.447, "step": 360500 }, { "epoch": 2.65, "grad_norm": 2.546724319458008, "learning_rate": 5.8429231250978555e-06, "loss": 1.4445, "step": 361000 }, { "epoch": 2.65, "grad_norm": 2.7436957359313965, "learning_rate": 5.781763738844528e-06, "loss": 1.4512, "step": 361500 }, { "epoch": 2.66, "grad_norm": 2.612231731414795, "learning_rate": 5.720604352591201e-06, "loss": 1.4496, "step": 362000 }, { "epoch": 2.66, "grad_norm": 2.537179946899414, "learning_rate": 5.659444966337874e-06, "loss": 1.4417, "step": 362500 }, { "epoch": 2.66, "grad_norm": 2.82316255569458, "learning_rate": 5.598285580084547e-06, "loss": 1.447, "step": 363000 }, { "epoch": 2.67, "grad_norm": 2.607912302017212, "learning_rate": 5.53712619383122e-06, "loss": 1.4465, "step": 363500 }, { "epoch": 2.67, "grad_norm": 2.6389966011047363, "learning_rate": 5.4759668075778935e-06, "loss": 1.4511, "step": 364000 }, { "epoch": 2.68, "grad_norm": 2.7712039947509766, "learning_rate": 5.414807421324566e-06, "loss": 1.4429, "step": 364500 }, { "epoch": 2.68, "grad_norm": 2.6119940280914307, "learning_rate": 5.353648035071239e-06, "loss": 1.4473, "step": 365000 }, { "epoch": 2.68, "grad_norm": 2.855820894241333, "learning_rate": 5.292488648817911e-06, "loss": 1.4409, "step": 365500 }, { "epoch": 2.69, "grad_norm": 3.3360650539398193, "learning_rate": 5.2313292625645846e-06, "loss": 1.4448, "step": 366000 }, { "epoch": 2.69, "grad_norm": 2.7165558338165283, "learning_rate": 5.170169876311257e-06, "loss": 1.4459, "step": 366500 }, { "epoch": 2.69, "grad_norm": 2.638815402984619, "learning_rate": 5.1090104900579306e-06, "loss": 1.4384, "step": 367000 }, { "epoch": 2.7, "grad_norm": 2.605776071548462, "learning_rate": 5.047851103804603e-06, "loss": 1.4515, "step": 367500 }, { "epoch": 2.7, "grad_norm": 2.7470364570617676, "learning_rate": 4.9866917175512765e-06, "loss": 1.4433, "step": 368000 }, { "epoch": 2.7, "grad_norm": 2.7127552032470703, "learning_rate": 4.925532331297949e-06, "loss": 1.4475, "step": 368500 }, { "epoch": 2.71, "grad_norm": 2.991445541381836, "learning_rate": 4.8643729450446225e-06, "loss": 1.4392, "step": 369000 }, { "epoch": 2.71, "grad_norm": 2.5964415073394775, "learning_rate": 4.803213558791295e-06, "loss": 1.4435, "step": 369500 }, { "epoch": 2.72, "grad_norm": 2.73907732963562, "learning_rate": 4.742054172537968e-06, "loss": 1.4487, "step": 370000 }, { "epoch": 2.72, "grad_norm": 2.7901346683502197, "learning_rate": 4.680894786284641e-06, "loss": 1.439, "step": 370500 }, { "epoch": 2.72, "grad_norm": 2.4311490058898926, "learning_rate": 4.619735400031314e-06, "loss": 1.4449, "step": 371000 }, { "epoch": 2.73, "grad_norm": 2.748305320739746, "learning_rate": 4.558576013777987e-06, "loss": 1.4558, "step": 371500 }, { "epoch": 2.73, "grad_norm": 2.8858590126037598, "learning_rate": 4.49741662752466e-06, "loss": 1.4422, "step": 372000 }, { "epoch": 2.73, "grad_norm": 2.893162727355957, "learning_rate": 4.436257241271333e-06, "loss": 1.4465, "step": 372500 }, { "epoch": 2.74, "grad_norm": 2.519300937652588, "learning_rate": 4.375097855018006e-06, "loss": 1.4354, "step": 373000 }, { "epoch": 2.74, "grad_norm": 2.798462152481079, "learning_rate": 4.313938468764678e-06, "loss": 1.4456, "step": 373500 }, { "epoch": 2.74, "grad_norm": 2.894874095916748, "learning_rate": 4.2527790825113516e-06, "loss": 1.4358, "step": 374000 }, { "epoch": 2.75, "grad_norm": 2.979597806930542, "learning_rate": 4.191619696258024e-06, "loss": 1.444, "step": 374500 }, { "epoch": 2.75, "grad_norm": 2.878843069076538, "learning_rate": 4.1304603100046975e-06, "loss": 1.4423, "step": 375000 }, { "epoch": 2.76, "grad_norm": 2.841049909591675, "learning_rate": 4.06930092375137e-06, "loss": 1.4403, "step": 375500 }, { "epoch": 2.76, "grad_norm": 2.413423538208008, "learning_rate": 4.0081415374980435e-06, "loss": 1.4491, "step": 376000 }, { "epoch": 2.76, "grad_norm": 2.7661304473876953, "learning_rate": 3.946982151244716e-06, "loss": 1.4399, "step": 376500 }, { "epoch": 2.77, "grad_norm": 2.5709524154663086, "learning_rate": 3.885822764991389e-06, "loss": 1.4391, "step": 377000 }, { "epoch": 2.77, "grad_norm": 2.8157260417938232, "learning_rate": 3.824663378738061e-06, "loss": 1.4368, "step": 377500 }, { "epoch": 2.77, "grad_norm": 2.7713100910186768, "learning_rate": 3.7635039924847346e-06, "loss": 1.4314, "step": 378000 }, { "epoch": 2.78, "grad_norm": 2.9643430709838867, "learning_rate": 3.702344606231407e-06, "loss": 1.4329, "step": 378500 }, { "epoch": 2.78, "grad_norm": 2.798428535461426, "learning_rate": 3.6411852199780806e-06, "loss": 1.4358, "step": 379000 }, { "epoch": 2.79, "grad_norm": 2.7989561557769775, "learning_rate": 3.580025833724754e-06, "loss": 1.4383, "step": 379500 }, { "epoch": 2.79, "grad_norm": 2.718421459197998, "learning_rate": 3.5188664474714266e-06, "loss": 1.4335, "step": 380000 }, { "epoch": 2.79, "grad_norm": 2.8248419761657715, "learning_rate": 3.4577070612180996e-06, "loss": 1.439, "step": 380500 }, { "epoch": 2.8, "grad_norm": 2.4477925300598145, "learning_rate": 3.396547674964772e-06, "loss": 1.4356, "step": 381000 }, { "epoch": 2.8, "grad_norm": 2.6281678676605225, "learning_rate": 3.3353882887114456e-06, "loss": 1.4441, "step": 381500 }, { "epoch": 2.8, "grad_norm": 2.6818716526031494, "learning_rate": 3.274228902458118e-06, "loss": 1.434, "step": 382000 }, { "epoch": 2.81, "grad_norm": 3.0857114791870117, "learning_rate": 3.213069516204791e-06, "loss": 1.4401, "step": 382500 }, { "epoch": 2.81, "grad_norm": 2.920851230621338, "learning_rate": 3.1519101299514637e-06, "loss": 1.4394, "step": 383000 }, { "epoch": 2.81, "grad_norm": 3.0690174102783203, "learning_rate": 3.090750743698137e-06, "loss": 1.4371, "step": 383500 }, { "epoch": 2.82, "grad_norm": 2.846827745437622, "learning_rate": 3.02959135744481e-06, "loss": 1.4301, "step": 384000 }, { "epoch": 2.82, "grad_norm": 2.927429676055908, "learning_rate": 2.9684319711914827e-06, "loss": 1.426, "step": 384500 }, { "epoch": 2.83, "grad_norm": 3.124462842941284, "learning_rate": 2.9072725849381557e-06, "loss": 1.4312, "step": 385000 }, { "epoch": 2.83, "grad_norm": 2.4795315265655518, "learning_rate": 2.8461131986848286e-06, "loss": 1.4406, "step": 385500 }, { "epoch": 2.83, "grad_norm": 2.942664384841919, "learning_rate": 2.7849538124315016e-06, "loss": 1.4338, "step": 386000 }, { "epoch": 2.84, "grad_norm": 2.6400539875030518, "learning_rate": 2.7237944261781746e-06, "loss": 1.4335, "step": 386500 }, { "epoch": 2.84, "grad_norm": 2.5888681411743164, "learning_rate": 2.6626350399248476e-06, "loss": 1.4293, "step": 387000 }, { "epoch": 2.84, "grad_norm": 2.6692755222320557, "learning_rate": 2.6014756536715206e-06, "loss": 1.4294, "step": 387500 }, { "epoch": 2.85, "grad_norm": 2.7141776084899902, "learning_rate": 2.540316267418193e-06, "loss": 1.4418, "step": 388000 }, { "epoch": 2.85, "grad_norm": 2.638432264328003, "learning_rate": 2.479156881164866e-06, "loss": 1.43, "step": 388500 }, { "epoch": 2.85, "grad_norm": 2.7874903678894043, "learning_rate": 2.417997494911539e-06, "loss": 1.4337, "step": 389000 }, { "epoch": 2.86, "grad_norm": 2.8501386642456055, "learning_rate": 2.356838108658212e-06, "loss": 1.4363, "step": 389500 }, { "epoch": 2.86, "grad_norm": 3.1942977905273438, "learning_rate": 2.295678722404885e-06, "loss": 1.4362, "step": 390000 }, { "epoch": 2.87, "grad_norm": 2.591784715652466, "learning_rate": 2.2345193361515577e-06, "loss": 1.44, "step": 390500 }, { "epoch": 2.87, "grad_norm": 3.0462796688079834, "learning_rate": 2.1733599498982307e-06, "loss": 1.4318, "step": 391000 }, { "epoch": 2.87, "grad_norm": 2.728050708770752, "learning_rate": 2.112200563644904e-06, "loss": 1.4404, "step": 391500 }, { "epoch": 2.88, "grad_norm": 2.7059884071350098, "learning_rate": 2.0510411773915767e-06, "loss": 1.4343, "step": 392000 }, { "epoch": 2.88, "grad_norm": 2.7897191047668457, "learning_rate": 1.9898817911382497e-06, "loss": 1.4264, "step": 392500 }, { "epoch": 2.88, "grad_norm": 2.517503261566162, "learning_rate": 1.9287224048849227e-06, "loss": 1.4393, "step": 393000 }, { "epoch": 2.89, "grad_norm": 2.8523480892181396, "learning_rate": 1.8675630186315954e-06, "loss": 1.4253, "step": 393500 }, { "epoch": 2.89, "grad_norm": 2.5820095539093018, "learning_rate": 1.8064036323782684e-06, "loss": 1.4311, "step": 394000 }, { "epoch": 2.9, "grad_norm": 2.8148810863494873, "learning_rate": 1.7452442461249414e-06, "loss": 1.4347, "step": 394500 }, { "epoch": 2.9, "grad_norm": 2.8168435096740723, "learning_rate": 1.6840848598716142e-06, "loss": 1.4316, "step": 395000 }, { "epoch": 2.9, "grad_norm": 2.9340474605560303, "learning_rate": 1.6229254736182872e-06, "loss": 1.4312, "step": 395500 }, { "epoch": 2.91, "grad_norm": 2.8138039112091064, "learning_rate": 1.5617660873649602e-06, "loss": 1.4246, "step": 396000 }, { "epoch": 2.91, "grad_norm": 2.826143980026245, "learning_rate": 1.500606701111633e-06, "loss": 1.4416, "step": 396500 }, { "epoch": 2.91, "grad_norm": 2.8407280445098877, "learning_rate": 1.439447314858306e-06, "loss": 1.4341, "step": 397000 }, { "epoch": 2.92, "grad_norm": 2.8632097244262695, "learning_rate": 1.378287928604979e-06, "loss": 1.4253, "step": 397500 }, { "epoch": 2.92, "grad_norm": 2.6355011463165283, "learning_rate": 1.317128542351652e-06, "loss": 1.4387, "step": 398000 }, { "epoch": 2.92, "grad_norm": 2.9903597831726074, "learning_rate": 1.2559691560983247e-06, "loss": 1.4269, "step": 398500 }, { "epoch": 2.93, "grad_norm": 2.6634271144866943, "learning_rate": 1.1948097698449977e-06, "loss": 1.4278, "step": 399000 }, { "epoch": 2.93, "grad_norm": 2.844621419906616, "learning_rate": 1.1336503835916707e-06, "loss": 1.4376, "step": 399500 }, { "epoch": 2.94, "grad_norm": 2.8783321380615234, "learning_rate": 1.0724909973383437e-06, "loss": 1.4309, "step": 400000 }, { "epoch": 2.94, "grad_norm": 2.565383195877075, "learning_rate": 1.0113316110850164e-06, "loss": 1.4182, "step": 400500 }, { "epoch": 2.94, "grad_norm": 2.6493799686431885, "learning_rate": 9.501722248316894e-07, "loss": 1.4354, "step": 401000 }, { "epoch": 2.95, "grad_norm": 2.7736098766326904, "learning_rate": 8.890128385783623e-07, "loss": 1.4269, "step": 401500 }, { "epoch": 2.95, "grad_norm": 2.5747058391571045, "learning_rate": 8.278534523250352e-07, "loss": 1.4244, "step": 402000 }, { "epoch": 2.95, "grad_norm": 2.9367337226867676, "learning_rate": 7.666940660717082e-07, "loss": 1.4339, "step": 402500 }, { "epoch": 2.96, "grad_norm": 3.1144633293151855, "learning_rate": 7.055346798183812e-07, "loss": 1.4342, "step": 403000 }, { "epoch": 2.96, "grad_norm": 2.8952739238739014, "learning_rate": 6.443752935650541e-07, "loss": 1.4216, "step": 403500 }, { "epoch": 2.97, "grad_norm": 2.9333155155181885, "learning_rate": 5.83215907311727e-07, "loss": 1.4254, "step": 404000 }, { "epoch": 2.97, "grad_norm": 2.900174140930176, "learning_rate": 5.220565210583999e-07, "loss": 1.433, "step": 404500 }, { "epoch": 2.97, "grad_norm": 2.6606194972991943, "learning_rate": 4.608971348050728e-07, "loss": 1.427, "step": 405000 }, { "epoch": 2.98, "grad_norm": 2.6916987895965576, "learning_rate": 3.997377485517458e-07, "loss": 1.4344, "step": 405500 }, { "epoch": 2.98, "grad_norm": 2.7830684185028076, "learning_rate": 3.385783622984187e-07, "loss": 1.432, "step": 406000 }, { "epoch": 2.98, "grad_norm": 2.9338104724884033, "learning_rate": 2.7741897604509164e-07, "loss": 1.4311, "step": 406500 }, { "epoch": 2.99, "grad_norm": 2.861415147781372, "learning_rate": 2.1625958979176455e-07, "loss": 1.4245, "step": 407000 }, { "epoch": 2.99, "grad_norm": 3.0713891983032227, "learning_rate": 1.5510020353843746e-07, "loss": 1.4246, "step": 407500 }, { "epoch": 2.99, "grad_norm": 2.7229363918304443, "learning_rate": 9.39408172851104e-08, "loss": 1.4289, "step": 408000 }, { "epoch": 3.0, "grad_norm": 2.5204861164093018, "learning_rate": 3.278143103178331e-08, "loss": 1.4267, "step": 408500 }, { "epoch": 3.0, "step": 408768, "total_flos": 3.442569138534612e+18, "train_loss": 1.939745183989198, "train_runtime": 329612.5954, "train_samples_per_second": 39.685, "train_steps_per_second": 1.24 } ], "logging_steps": 500, "max_steps": 408768, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 3.442569138534612e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }