bert-web-bg-cased / trainer_state.json
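This file is the raw trainer_state.json written by the Hugging Face Transformers Trainer during pretraining of bert-web-bg-cased: its log_history array holds one entry per logging interval (every 500 steps here) with the current epoch, gradient norm, learning rate, and training loss, for 408,768 total steps over 3 epochs. A minimal sketch of how such a state file can be inspected follows; the local file path and the plotting code are illustrative assumptions, not part of the upload itself.

# Minimal sketch (not part of the original file): load the trainer state and
# plot the training-loss curve. Assumes the JSON shown below has been saved
# locally as "trainer_state.json"; path and plot styling are illustrative.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

# Each log_history entry carries epoch, grad_norm, learning_rate, loss, step;
# keep only entries that actually contain a training loss.
steps = [entry["step"] for entry in state["log_history"] if "loss" in entry]
losses = [entry["loss"] for entry in state["log_history"] if "loss" in entry]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("bert-web-bg-cased pretraining loss")
plt.show()

The raw contents of the file follow.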
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 408768,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.4115859270095825,
"learning_rate": 4.993884061374668e-05,
"loss": 7.5674,
"step": 500
},
{
"epoch": 0.01,
"grad_norm": 1.173668384552002,
"learning_rate": 4.987768122749335e-05,
"loss": 7.1474,
"step": 1000
},
{
"epoch": 0.01,
"grad_norm": 1.1809221506118774,
"learning_rate": 4.981652184124002e-05,
"loss": 7.0218,
"step": 1500
},
{
"epoch": 0.01,
"grad_norm": 1.898521065711975,
"learning_rate": 4.975536245498669e-05,
"loss": 6.9342,
"step": 2000
},
{
"epoch": 0.02,
"grad_norm": 2.6537320613861084,
"learning_rate": 4.969420306873337e-05,
"loss": 6.8595,
"step": 2500
},
{
"epoch": 0.02,
"grad_norm": 1.9779026508331299,
"learning_rate": 4.963304368248004e-05,
"loss": 6.8074,
"step": 3000
},
{
"epoch": 0.03,
"grad_norm": 3.4129042625427246,
"learning_rate": 4.957188429622671e-05,
"loss": 6.7628,
"step": 3500
},
{
"epoch": 0.03,
"grad_norm": 2.0669405460357666,
"learning_rate": 4.951072490997339e-05,
"loss": 6.7217,
"step": 4000
},
{
"epoch": 0.03,
"grad_norm": 2.6271133422851562,
"learning_rate": 4.944956552372006e-05,
"loss": 6.6808,
"step": 4500
},
{
"epoch": 0.04,
"grad_norm": 2.944228410720825,
"learning_rate": 4.938840613746673e-05,
"loss": 6.6526,
"step": 5000
},
{
"epoch": 0.04,
"grad_norm": 1.9261572360992432,
"learning_rate": 4.9327246751213406e-05,
"loss": 6.6157,
"step": 5500
},
{
"epoch": 0.04,
"grad_norm": 2.7971737384796143,
"learning_rate": 4.926608736496008e-05,
"loss": 6.585,
"step": 6000
},
{
"epoch": 0.05,
"grad_norm": 2.061311960220337,
"learning_rate": 4.920492797870675e-05,
"loss": 6.5625,
"step": 6500
},
{
"epoch": 0.05,
"grad_norm": 1.9171266555786133,
"learning_rate": 4.914376859245342e-05,
"loss": 6.5476,
"step": 7000
},
{
"epoch": 0.06,
"grad_norm": 2.557237386703491,
"learning_rate": 4.9082609206200096e-05,
"loss": 6.5063,
"step": 7500
},
{
"epoch": 0.06,
"grad_norm": 1.9273940324783325,
"learning_rate": 4.902144981994677e-05,
"loss": 6.4848,
"step": 8000
},
{
"epoch": 0.06,
"grad_norm": 2.46166729927063,
"learning_rate": 4.896029043369344e-05,
"loss": 6.4549,
"step": 8500
},
{
"epoch": 0.07,
"grad_norm": 2.1991207599639893,
"learning_rate": 4.889913104744012e-05,
"loss": 6.4151,
"step": 9000
},
{
"epoch": 0.07,
"grad_norm": 3.0304417610168457,
"learning_rate": 4.8837971661186786e-05,
"loss": 6.3421,
"step": 9500
},
{
"epoch": 0.07,
"grad_norm": 2.5440313816070557,
"learning_rate": 4.877681227493346e-05,
"loss": 6.2588,
"step": 10000
},
{
"epoch": 0.08,
"grad_norm": 2.8865408897399902,
"learning_rate": 4.8715652888680135e-05,
"loss": 6.1081,
"step": 10500
},
{
"epoch": 0.08,
"grad_norm": 3.689366579055786,
"learning_rate": 4.865449350242681e-05,
"loss": 5.965,
"step": 11000
},
{
"epoch": 0.08,
"grad_norm": 3.3553738594055176,
"learning_rate": 4.8593334116173477e-05,
"loss": 5.8321,
"step": 11500
},
{
"epoch": 0.09,
"grad_norm": 3.879040479660034,
"learning_rate": 4.853217472992016e-05,
"loss": 5.7097,
"step": 12000
},
{
"epoch": 0.09,
"grad_norm": 3.504615306854248,
"learning_rate": 4.8471015343666825e-05,
"loss": 5.5818,
"step": 12500
},
{
"epoch": 0.1,
"grad_norm": 3.884676218032837,
"learning_rate": 4.84098559574135e-05,
"loss": 5.4549,
"step": 13000
},
{
"epoch": 0.1,
"grad_norm": 3.560129165649414,
"learning_rate": 4.8348696571160174e-05,
"loss": 5.3163,
"step": 13500
},
{
"epoch": 0.1,
"grad_norm": 3.542060613632202,
"learning_rate": 4.828753718490685e-05,
"loss": 5.1749,
"step": 14000
},
{
"epoch": 0.11,
"grad_norm": 3.485215425491333,
"learning_rate": 4.8226377798653515e-05,
"loss": 5.0056,
"step": 14500
},
{
"epoch": 0.11,
"grad_norm": 3.323887586593628,
"learning_rate": 4.816521841240019e-05,
"loss": 4.8195,
"step": 15000
},
{
"epoch": 0.11,
"grad_norm": 3.07159161567688,
"learning_rate": 4.8104059026146864e-05,
"loss": 4.6421,
"step": 15500
},
{
"epoch": 0.12,
"grad_norm": 3.2840936183929443,
"learning_rate": 4.804289963989354e-05,
"loss": 4.475,
"step": 16000
},
{
"epoch": 0.12,
"grad_norm": 2.858837366104126,
"learning_rate": 4.798174025364021e-05,
"loss": 4.3387,
"step": 16500
},
{
"epoch": 0.12,
"grad_norm": 2.913006544113159,
"learning_rate": 4.792058086738688e-05,
"loss": 4.2142,
"step": 17000
},
{
"epoch": 0.13,
"grad_norm": 2.8114237785339355,
"learning_rate": 4.7859421481133554e-05,
"loss": 4.116,
"step": 17500
},
{
"epoch": 0.13,
"grad_norm": 3.1581108570098877,
"learning_rate": 4.779826209488023e-05,
"loss": 4.0327,
"step": 18000
},
{
"epoch": 0.14,
"grad_norm": 2.9168930053710938,
"learning_rate": 4.77371027086269e-05,
"loss": 3.9404,
"step": 18500
},
{
"epoch": 0.14,
"grad_norm": 2.8912034034729004,
"learning_rate": 4.767594332237357e-05,
"loss": 3.874,
"step": 19000
},
{
"epoch": 0.14,
"grad_norm": 2.909111976623535,
"learning_rate": 4.7614783936120244e-05,
"loss": 3.8014,
"step": 19500
},
{
"epoch": 0.15,
"grad_norm": 3.0821359157562256,
"learning_rate": 4.755362454986692e-05,
"loss": 3.7285,
"step": 20000
},
{
"epoch": 0.15,
"grad_norm": 2.603447675704956,
"learning_rate": 4.749246516361359e-05,
"loss": 3.6757,
"step": 20500
},
{
"epoch": 0.15,
"grad_norm": 2.466923952102661,
"learning_rate": 4.743130577736026e-05,
"loss": 3.6054,
"step": 21000
},
{
"epoch": 0.16,
"grad_norm": 2.949532985687256,
"learning_rate": 4.737014639110694e-05,
"loss": 3.5478,
"step": 21500
},
{
"epoch": 0.16,
"grad_norm": 2.7499911785125732,
"learning_rate": 4.730898700485361e-05,
"loss": 3.5018,
"step": 22000
},
{
"epoch": 0.17,
"grad_norm": 2.8840396404266357,
"learning_rate": 4.724782761860028e-05,
"loss": 3.4503,
"step": 22500
},
{
"epoch": 0.17,
"grad_norm": 2.7901082038879395,
"learning_rate": 4.718666823234696e-05,
"loss": 3.385,
"step": 23000
},
{
"epoch": 0.17,
"grad_norm": 2.499488353729248,
"learning_rate": 4.712550884609363e-05,
"loss": 3.3385,
"step": 23500
},
{
"epoch": 0.18,
"grad_norm": 2.77323055267334,
"learning_rate": 4.70643494598403e-05,
"loss": 3.2953,
"step": 24000
},
{
"epoch": 0.18,
"grad_norm": 3.0041913986206055,
"learning_rate": 4.700319007358698e-05,
"loss": 3.2472,
"step": 24500
},
{
"epoch": 0.18,
"grad_norm": 2.72615909576416,
"learning_rate": 4.694203068733365e-05,
"loss": 3.2107,
"step": 25000
},
{
"epoch": 0.19,
"grad_norm": 2.766868829727173,
"learning_rate": 4.688087130108032e-05,
"loss": 3.1768,
"step": 25500
},
{
"epoch": 0.19,
"grad_norm": 2.5954720973968506,
"learning_rate": 4.6819711914826996e-05,
"loss": 3.1358,
"step": 26000
},
{
"epoch": 0.19,
"grad_norm": 2.7101314067840576,
"learning_rate": 4.675855252857367e-05,
"loss": 3.0949,
"step": 26500
},
{
"epoch": 0.2,
"grad_norm": 2.7481093406677246,
"learning_rate": 4.669739314232034e-05,
"loss": 3.0655,
"step": 27000
},
{
"epoch": 0.2,
"grad_norm": 2.869677782058716,
"learning_rate": 4.663623375606702e-05,
"loss": 3.0188,
"step": 27500
},
{
"epoch": 0.21,
"grad_norm": 3.081693649291992,
"learning_rate": 4.6575074369813686e-05,
"loss": 2.9893,
"step": 28000
},
{
"epoch": 0.21,
"grad_norm": 3.1503679752349854,
"learning_rate": 4.651391498356036e-05,
"loss": 2.975,
"step": 28500
},
{
"epoch": 0.21,
"grad_norm": 2.9327192306518555,
"learning_rate": 4.645275559730703e-05,
"loss": 2.9276,
"step": 29000
},
{
"epoch": 0.22,
"grad_norm": 2.580777406692505,
"learning_rate": 4.639159621105371e-05,
"loss": 2.9062,
"step": 29500
},
{
"epoch": 0.22,
"grad_norm": 2.6058924198150635,
"learning_rate": 4.633043682480038e-05,
"loss": 2.8895,
"step": 30000
},
{
"epoch": 0.22,
"grad_norm": 2.9445390701293945,
"learning_rate": 4.626927743854705e-05,
"loss": 2.8634,
"step": 30500
},
{
"epoch": 0.23,
"grad_norm": 3.4329161643981934,
"learning_rate": 4.6208118052293725e-05,
"loss": 2.8399,
"step": 31000
},
{
"epoch": 0.23,
"grad_norm": 2.910855770111084,
"learning_rate": 4.61469586660404e-05,
"loss": 2.8167,
"step": 31500
},
{
"epoch": 0.23,
"grad_norm": 2.9852588176727295,
"learning_rate": 4.608579927978707e-05,
"loss": 2.8001,
"step": 32000
},
{
"epoch": 0.24,
"grad_norm": 2.6188220977783203,
"learning_rate": 4.602463989353374e-05,
"loss": 2.7929,
"step": 32500
},
{
"epoch": 0.24,
"grad_norm": 2.753516912460327,
"learning_rate": 4.5963480507280416e-05,
"loss": 2.7677,
"step": 33000
},
{
"epoch": 0.25,
"grad_norm": 2.5995850563049316,
"learning_rate": 4.590232112102709e-05,
"loss": 2.7399,
"step": 33500
},
{
"epoch": 0.25,
"grad_norm": 2.697634696960449,
"learning_rate": 4.5841161734773764e-05,
"loss": 2.7241,
"step": 34000
},
{
"epoch": 0.25,
"grad_norm": 3.0841758251190186,
"learning_rate": 4.578000234852043e-05,
"loss": 2.7143,
"step": 34500
},
{
"epoch": 0.26,
"grad_norm": 3.0621519088745117,
"learning_rate": 4.5718842962267106e-05,
"loss": 2.6897,
"step": 35000
},
{
"epoch": 0.26,
"grad_norm": 2.912416934967041,
"learning_rate": 4.565768357601378e-05,
"loss": 2.6762,
"step": 35500
},
{
"epoch": 0.26,
"grad_norm": 2.95345401763916,
"learning_rate": 4.5596524189760454e-05,
"loss": 2.6708,
"step": 36000
},
{
"epoch": 0.27,
"grad_norm": 2.7842564582824707,
"learning_rate": 4.553536480350712e-05,
"loss": 2.6507,
"step": 36500
},
{
"epoch": 0.27,
"grad_norm": 2.9051456451416016,
"learning_rate": 4.54742054172538e-05,
"loss": 2.6361,
"step": 37000
},
{
"epoch": 0.28,
"grad_norm": 2.8248302936553955,
"learning_rate": 4.541304603100047e-05,
"loss": 2.619,
"step": 37500
},
{
"epoch": 0.28,
"grad_norm": 3.076840400695801,
"learning_rate": 4.5351886644747145e-05,
"loss": 2.6011,
"step": 38000
},
{
"epoch": 0.28,
"grad_norm": 2.7716526985168457,
"learning_rate": 4.529072725849382e-05,
"loss": 2.5928,
"step": 38500
},
{
"epoch": 0.29,
"grad_norm": 2.8269171714782715,
"learning_rate": 4.522956787224049e-05,
"loss": 2.5813,
"step": 39000
},
{
"epoch": 0.29,
"grad_norm": 3.041726589202881,
"learning_rate": 4.516840848598716e-05,
"loss": 2.5583,
"step": 39500
},
{
"epoch": 0.29,
"grad_norm": 2.676236391067505,
"learning_rate": 4.510724909973384e-05,
"loss": 2.5538,
"step": 40000
},
{
"epoch": 0.3,
"grad_norm": 2.8164145946502686,
"learning_rate": 4.504608971348051e-05,
"loss": 2.55,
"step": 40500
},
{
"epoch": 0.3,
"grad_norm": 2.637202024459839,
"learning_rate": 4.498493032722718e-05,
"loss": 2.5273,
"step": 41000
},
{
"epoch": 0.3,
"grad_norm": 2.9004831314086914,
"learning_rate": 4.492377094097385e-05,
"loss": 2.5142,
"step": 41500
},
{
"epoch": 0.31,
"grad_norm": 2.7393102645874023,
"learning_rate": 4.486261155472053e-05,
"loss": 2.5093,
"step": 42000
},
{
"epoch": 0.31,
"grad_norm": 2.814011335372925,
"learning_rate": 4.48014521684672e-05,
"loss": 2.4936,
"step": 42500
},
{
"epoch": 0.32,
"grad_norm": 2.625415802001953,
"learning_rate": 4.4740292782213874e-05,
"loss": 2.4787,
"step": 43000
},
{
"epoch": 0.32,
"grad_norm": 2.955406427383423,
"learning_rate": 4.467913339596055e-05,
"loss": 2.471,
"step": 43500
},
{
"epoch": 0.32,
"grad_norm": 2.6002650260925293,
"learning_rate": 4.461797400970722e-05,
"loss": 2.4626,
"step": 44000
},
{
"epoch": 0.33,
"grad_norm": 2.75281023979187,
"learning_rate": 4.455681462345389e-05,
"loss": 2.459,
"step": 44500
},
{
"epoch": 0.33,
"grad_norm": 2.5558836460113525,
"learning_rate": 4.449565523720057e-05,
"loss": 2.4441,
"step": 45000
},
{
"epoch": 0.33,
"grad_norm": 2.634889602661133,
"learning_rate": 4.443449585094724e-05,
"loss": 2.4392,
"step": 45500
},
{
"epoch": 0.34,
"grad_norm": 3.015256643295288,
"learning_rate": 4.437333646469391e-05,
"loss": 2.4218,
"step": 46000
},
{
"epoch": 0.34,
"grad_norm": 2.656592607498169,
"learning_rate": 4.431217707844059e-05,
"loss": 2.4148,
"step": 46500
},
{
"epoch": 0.34,
"grad_norm": 2.9560022354125977,
"learning_rate": 4.425101769218726e-05,
"loss": 2.4074,
"step": 47000
},
{
"epoch": 0.35,
"grad_norm": 2.8009862899780273,
"learning_rate": 4.418985830593393e-05,
"loss": 2.4028,
"step": 47500
},
{
"epoch": 0.35,
"grad_norm": 2.7291080951690674,
"learning_rate": 4.41286989196806e-05,
"loss": 2.3874,
"step": 48000
},
{
"epoch": 0.36,
"grad_norm": 2.959327220916748,
"learning_rate": 4.406753953342728e-05,
"loss": 2.3803,
"step": 48500
},
{
"epoch": 0.36,
"grad_norm": 2.4076898097991943,
"learning_rate": 4.400638014717395e-05,
"loss": 2.3641,
"step": 49000
},
{
"epoch": 0.36,
"grad_norm": 2.703214168548584,
"learning_rate": 4.3945220760920625e-05,
"loss": 2.3707,
"step": 49500
},
{
"epoch": 0.37,
"grad_norm": 2.7057530879974365,
"learning_rate": 4.388406137466729e-05,
"loss": 2.3551,
"step": 50000
},
{
"epoch": 0.37,
"grad_norm": 2.656576156616211,
"learning_rate": 4.382290198841397e-05,
"loss": 2.3501,
"step": 50500
},
{
"epoch": 0.37,
"grad_norm": 2.7500181198120117,
"learning_rate": 4.376174260216064e-05,
"loss": 2.342,
"step": 51000
},
{
"epoch": 0.38,
"grad_norm": 2.560018301010132,
"learning_rate": 4.3700583215907316e-05,
"loss": 2.3378,
"step": 51500
},
{
"epoch": 0.38,
"grad_norm": 2.5425586700439453,
"learning_rate": 4.363942382965398e-05,
"loss": 2.3263,
"step": 52000
},
{
"epoch": 0.39,
"grad_norm": 2.7227046489715576,
"learning_rate": 4.357826444340066e-05,
"loss": 2.3216,
"step": 52500
},
{
"epoch": 0.39,
"grad_norm": 2.8094546794891357,
"learning_rate": 4.351710505714733e-05,
"loss": 2.3168,
"step": 53000
},
{
"epoch": 0.39,
"grad_norm": 2.559802293777466,
"learning_rate": 4.3455945670894006e-05,
"loss": 2.3129,
"step": 53500
},
{
"epoch": 0.4,
"grad_norm": 2.9485251903533936,
"learning_rate": 4.3394786284640673e-05,
"loss": 2.3055,
"step": 54000
},
{
"epoch": 0.4,
"grad_norm": 2.554258346557617,
"learning_rate": 4.3333626898387355e-05,
"loss": 2.2897,
"step": 54500
},
{
"epoch": 0.4,
"grad_norm": 2.9118120670318604,
"learning_rate": 4.327246751213402e-05,
"loss": 2.2804,
"step": 55000
},
{
"epoch": 0.41,
"grad_norm": 2.6395227909088135,
"learning_rate": 4.3211308125880696e-05,
"loss": 2.285,
"step": 55500
},
{
"epoch": 0.41,
"grad_norm": 2.8277575969696045,
"learning_rate": 4.315014873962737e-05,
"loss": 2.2725,
"step": 56000
},
{
"epoch": 0.41,
"grad_norm": 2.6950581073760986,
"learning_rate": 4.3088989353374045e-05,
"loss": 2.2726,
"step": 56500
},
{
"epoch": 0.42,
"grad_norm": 2.5823190212249756,
"learning_rate": 4.302782996712071e-05,
"loss": 2.2676,
"step": 57000
},
{
"epoch": 0.42,
"grad_norm": 2.9325387477874756,
"learning_rate": 4.296667058086739e-05,
"loss": 2.25,
"step": 57500
},
{
"epoch": 0.43,
"grad_norm": 2.915308952331543,
"learning_rate": 4.290551119461406e-05,
"loss": 2.237,
"step": 58000
},
{
"epoch": 0.43,
"grad_norm": 2.782322883605957,
"learning_rate": 4.2844351808360735e-05,
"loss": 2.2435,
"step": 58500
},
{
"epoch": 0.43,
"grad_norm": 2.8868165016174316,
"learning_rate": 4.278319242210741e-05,
"loss": 2.2282,
"step": 59000
},
{
"epoch": 0.44,
"grad_norm": 2.906238079071045,
"learning_rate": 4.2722033035854084e-05,
"loss": 2.2278,
"step": 59500
},
{
"epoch": 0.44,
"grad_norm": 2.8209023475646973,
"learning_rate": 4.266087364960075e-05,
"loss": 2.2212,
"step": 60000
},
{
"epoch": 0.44,
"grad_norm": 2.944169044494629,
"learning_rate": 4.259971426334743e-05,
"loss": 2.2107,
"step": 60500
},
{
"epoch": 0.45,
"grad_norm": 3.010463237762451,
"learning_rate": 4.25385548770941e-05,
"loss": 2.2112,
"step": 61000
},
{
"epoch": 0.45,
"grad_norm": 2.5480105876922607,
"learning_rate": 4.2477395490840774e-05,
"loss": 2.2232,
"step": 61500
},
{
"epoch": 0.46,
"grad_norm": 2.619253635406494,
"learning_rate": 4.241623610458745e-05,
"loss": 2.1997,
"step": 62000
},
{
"epoch": 0.46,
"grad_norm": 2.716602325439453,
"learning_rate": 4.235507671833412e-05,
"loss": 2.1967,
"step": 62500
},
{
"epoch": 0.46,
"grad_norm": 2.7304656505584717,
"learning_rate": 4.229391733208079e-05,
"loss": 2.1865,
"step": 63000
},
{
"epoch": 0.47,
"grad_norm": 2.3193280696868896,
"learning_rate": 4.2232757945827464e-05,
"loss": 2.1783,
"step": 63500
},
{
"epoch": 0.47,
"grad_norm": 2.659099817276001,
"learning_rate": 4.217159855957414e-05,
"loss": 2.1669,
"step": 64000
},
{
"epoch": 0.47,
"grad_norm": 2.8396430015563965,
"learning_rate": 4.211043917332081e-05,
"loss": 2.1794,
"step": 64500
},
{
"epoch": 0.48,
"grad_norm": 2.5535542964935303,
"learning_rate": 4.204927978706748e-05,
"loss": 2.1688,
"step": 65000
},
{
"epoch": 0.48,
"grad_norm": 2.776742696762085,
"learning_rate": 4.1988120400814154e-05,
"loss": 2.1661,
"step": 65500
},
{
"epoch": 0.48,
"grad_norm": 2.4852330684661865,
"learning_rate": 4.192696101456083e-05,
"loss": 2.1591,
"step": 66000
},
{
"epoch": 0.49,
"grad_norm": 2.8885903358459473,
"learning_rate": 4.1865801628307496e-05,
"loss": 2.1581,
"step": 66500
},
{
"epoch": 0.49,
"grad_norm": 2.6640567779541016,
"learning_rate": 4.180464224205418e-05,
"loss": 2.1549,
"step": 67000
},
{
"epoch": 0.5,
"grad_norm": 2.27127742767334,
"learning_rate": 4.1743482855800845e-05,
"loss": 2.1492,
"step": 67500
},
{
"epoch": 0.5,
"grad_norm": 2.591395139694214,
"learning_rate": 4.168232346954752e-05,
"loss": 2.1382,
"step": 68000
},
{
"epoch": 0.5,
"grad_norm": 2.8147051334381104,
"learning_rate": 4.162116408329419e-05,
"loss": 2.1407,
"step": 68500
},
{
"epoch": 0.51,
"grad_norm": 2.650275707244873,
"learning_rate": 4.156000469704087e-05,
"loss": 2.1345,
"step": 69000
},
{
"epoch": 0.51,
"grad_norm": 2.7816410064697266,
"learning_rate": 4.1498845310787535e-05,
"loss": 2.1371,
"step": 69500
},
{
"epoch": 0.51,
"grad_norm": 2.6034348011016846,
"learning_rate": 4.1437685924534216e-05,
"loss": 2.1264,
"step": 70000
},
{
"epoch": 0.52,
"grad_norm": 2.6060938835144043,
"learning_rate": 4.1376526538280883e-05,
"loss": 2.1186,
"step": 70500
},
{
"epoch": 0.52,
"grad_norm": 2.754519462585449,
"learning_rate": 4.131536715202756e-05,
"loss": 2.1167,
"step": 71000
},
{
"epoch": 0.52,
"grad_norm": 2.665511131286621,
"learning_rate": 4.125420776577423e-05,
"loss": 2.1025,
"step": 71500
},
{
"epoch": 0.53,
"grad_norm": 2.8608968257904053,
"learning_rate": 4.1193048379520906e-05,
"loss": 2.1087,
"step": 72000
},
{
"epoch": 0.53,
"grad_norm": 2.521726369857788,
"learning_rate": 4.1131888993267574e-05,
"loss": 2.1035,
"step": 72500
},
{
"epoch": 0.54,
"grad_norm": 2.713449001312256,
"learning_rate": 4.1070729607014255e-05,
"loss": 2.0853,
"step": 73000
},
{
"epoch": 0.54,
"grad_norm": 2.6165554523468018,
"learning_rate": 4.100957022076092e-05,
"loss": 2.1032,
"step": 73500
},
{
"epoch": 0.54,
"grad_norm": 2.5650808811187744,
"learning_rate": 4.0948410834507596e-05,
"loss": 2.0823,
"step": 74000
},
{
"epoch": 0.55,
"grad_norm": 2.7067785263061523,
"learning_rate": 4.0887251448254264e-05,
"loss": 2.0843,
"step": 74500
},
{
"epoch": 0.55,
"grad_norm": 2.6049702167510986,
"learning_rate": 4.0826092062000945e-05,
"loss": 2.0851,
"step": 75000
},
{
"epoch": 0.55,
"grad_norm": 2.487355947494507,
"learning_rate": 4.076493267574761e-05,
"loss": 2.0745,
"step": 75500
},
{
"epoch": 0.56,
"grad_norm": 2.4251976013183594,
"learning_rate": 4.070377328949429e-05,
"loss": 2.0728,
"step": 76000
},
{
"epoch": 0.56,
"grad_norm": 2.6600770950317383,
"learning_rate": 4.064261390324096e-05,
"loss": 2.0725,
"step": 76500
},
{
"epoch": 0.57,
"grad_norm": 2.488598346710205,
"learning_rate": 4.0581454516987635e-05,
"loss": 2.0638,
"step": 77000
},
{
"epoch": 0.57,
"grad_norm": 2.8305716514587402,
"learning_rate": 4.05202951307343e-05,
"loss": 2.0673,
"step": 77500
},
{
"epoch": 0.57,
"grad_norm": 2.607948064804077,
"learning_rate": 4.0459135744480984e-05,
"loss": 2.0698,
"step": 78000
},
{
"epoch": 0.58,
"grad_norm": 2.558473825454712,
"learning_rate": 4.039797635822765e-05,
"loss": 2.0531,
"step": 78500
},
{
"epoch": 0.58,
"grad_norm": 2.675361394882202,
"learning_rate": 4.0336816971974326e-05,
"loss": 2.0568,
"step": 79000
},
{
"epoch": 0.58,
"grad_norm": 2.430924654006958,
"learning_rate": 4.0275657585721e-05,
"loss": 2.0472,
"step": 79500
},
{
"epoch": 0.59,
"grad_norm": 2.637683153152466,
"learning_rate": 4.021449819946767e-05,
"loss": 2.0422,
"step": 80000
},
{
"epoch": 0.59,
"grad_norm": 2.8251748085021973,
"learning_rate": 4.015333881321434e-05,
"loss": 2.0438,
"step": 80500
},
{
"epoch": 0.59,
"grad_norm": 2.9130163192749023,
"learning_rate": 4.0092179426961016e-05,
"loss": 2.0334,
"step": 81000
},
{
"epoch": 0.6,
"grad_norm": 2.6857762336730957,
"learning_rate": 4.003102004070769e-05,
"loss": 2.0298,
"step": 81500
},
{
"epoch": 0.6,
"grad_norm": 2.7029411792755127,
"learning_rate": 3.996986065445436e-05,
"loss": 2.0266,
"step": 82000
},
{
"epoch": 0.61,
"grad_norm": 2.43390154838562,
"learning_rate": 3.990870126820104e-05,
"loss": 2.0323,
"step": 82500
},
{
"epoch": 0.61,
"grad_norm": 2.8077657222747803,
"learning_rate": 3.9847541881947706e-05,
"loss": 2.0204,
"step": 83000
},
{
"epoch": 0.61,
"grad_norm": 2.6263062953948975,
"learning_rate": 3.978638249569438e-05,
"loss": 2.0238,
"step": 83500
},
{
"epoch": 0.62,
"grad_norm": 2.582228660583496,
"learning_rate": 3.9725223109441055e-05,
"loss": 2.0157,
"step": 84000
},
{
"epoch": 0.62,
"grad_norm": 2.989870548248291,
"learning_rate": 3.966406372318773e-05,
"loss": 2.0123,
"step": 84500
},
{
"epoch": 0.62,
"grad_norm": 2.50876522064209,
"learning_rate": 3.9602904336934396e-05,
"loss": 2.0049,
"step": 85000
},
{
"epoch": 0.63,
"grad_norm": 2.754103183746338,
"learning_rate": 3.954174495068108e-05,
"loss": 2.0108,
"step": 85500
},
{
"epoch": 0.63,
"grad_norm": 2.742558240890503,
"learning_rate": 3.9480585564427745e-05,
"loss": 2.0013,
"step": 86000
},
{
"epoch": 0.63,
"grad_norm": 2.7211074829101562,
"learning_rate": 3.941942617817442e-05,
"loss": 2.0056,
"step": 86500
},
{
"epoch": 0.64,
"grad_norm": 2.5889461040496826,
"learning_rate": 3.9358266791921087e-05,
"loss": 1.9933,
"step": 87000
},
{
"epoch": 0.64,
"grad_norm": 2.581122398376465,
"learning_rate": 3.929710740566777e-05,
"loss": 1.9896,
"step": 87500
},
{
"epoch": 0.65,
"grad_norm": 2.7021663188934326,
"learning_rate": 3.9235948019414435e-05,
"loss": 1.9878,
"step": 88000
},
{
"epoch": 0.65,
"grad_norm": 2.6844136714935303,
"learning_rate": 3.917478863316111e-05,
"loss": 1.9838,
"step": 88500
},
{
"epoch": 0.65,
"grad_norm": 2.6349422931671143,
"learning_rate": 3.9113629246907784e-05,
"loss": 1.9796,
"step": 89000
},
{
"epoch": 0.66,
"grad_norm": 2.6799721717834473,
"learning_rate": 3.905246986065446e-05,
"loss": 1.9793,
"step": 89500
},
{
"epoch": 0.66,
"grad_norm": 2.502464771270752,
"learning_rate": 3.8991310474401125e-05,
"loss": 1.9789,
"step": 90000
},
{
"epoch": 0.66,
"grad_norm": 2.897421360015869,
"learning_rate": 3.8930151088147806e-05,
"loss": 1.9766,
"step": 90500
},
{
"epoch": 0.67,
"grad_norm": 2.6226820945739746,
"learning_rate": 3.8868991701894474e-05,
"loss": 1.9785,
"step": 91000
},
{
"epoch": 0.67,
"grad_norm": 2.7630228996276855,
"learning_rate": 3.880783231564115e-05,
"loss": 1.9805,
"step": 91500
},
{
"epoch": 0.68,
"grad_norm": 2.7849583625793457,
"learning_rate": 3.874667292938782e-05,
"loss": 1.9672,
"step": 92000
},
{
"epoch": 0.68,
"grad_norm": 2.643397569656372,
"learning_rate": 3.86855135431345e-05,
"loss": 1.9618,
"step": 92500
},
{
"epoch": 0.68,
"grad_norm": 2.6938283443450928,
"learning_rate": 3.8624354156881164e-05,
"loss": 1.9674,
"step": 93000
},
{
"epoch": 0.69,
"grad_norm": 2.7914974689483643,
"learning_rate": 3.856319477062784e-05,
"loss": 1.9616,
"step": 93500
},
{
"epoch": 0.69,
"grad_norm": 2.6223716735839844,
"learning_rate": 3.850203538437451e-05,
"loss": 1.9566,
"step": 94000
},
{
"epoch": 0.69,
"grad_norm": 2.6575443744659424,
"learning_rate": 3.844087599812119e-05,
"loss": 1.9611,
"step": 94500
},
{
"epoch": 0.7,
"grad_norm": 2.684488534927368,
"learning_rate": 3.837971661186786e-05,
"loss": 1.9467,
"step": 95000
},
{
"epoch": 0.7,
"grad_norm": 2.668365001678467,
"learning_rate": 3.831855722561453e-05,
"loss": 1.9548,
"step": 95500
},
{
"epoch": 0.7,
"grad_norm": 2.967519521713257,
"learning_rate": 3.82573978393612e-05,
"loss": 1.9535,
"step": 96000
},
{
"epoch": 0.71,
"grad_norm": 2.5876193046569824,
"learning_rate": 3.819623845310788e-05,
"loss": 1.9446,
"step": 96500
},
{
"epoch": 0.71,
"grad_norm": 2.7293176651000977,
"learning_rate": 3.813507906685455e-05,
"loss": 1.9481,
"step": 97000
},
{
"epoch": 0.72,
"grad_norm": 2.6928365230560303,
"learning_rate": 3.807391968060122e-05,
"loss": 1.9296,
"step": 97500
},
{
"epoch": 0.72,
"grad_norm": 2.568150043487549,
"learning_rate": 3.801276029434789e-05,
"loss": 1.9422,
"step": 98000
},
{
"epoch": 0.72,
"grad_norm": 2.7275748252868652,
"learning_rate": 3.795160090809457e-05,
"loss": 1.9314,
"step": 98500
},
{
"epoch": 0.73,
"grad_norm": 2.613135576248169,
"learning_rate": 3.789044152184124e-05,
"loss": 1.931,
"step": 99000
},
{
"epoch": 0.73,
"grad_norm": 2.408534049987793,
"learning_rate": 3.782928213558791e-05,
"loss": 1.9314,
"step": 99500
},
{
"epoch": 0.73,
"grad_norm": 2.896430015563965,
"learning_rate": 3.776812274933459e-05,
"loss": 1.9221,
"step": 100000
},
{
"epoch": 0.74,
"grad_norm": 2.6729750633239746,
"learning_rate": 3.770696336308126e-05,
"loss": 1.9247,
"step": 100500
},
{
"epoch": 0.74,
"grad_norm": 2.5819644927978516,
"learning_rate": 3.764580397682793e-05,
"loss": 1.9243,
"step": 101000
},
{
"epoch": 0.74,
"grad_norm": 2.561739206314087,
"learning_rate": 3.7584644590574606e-05,
"loss": 1.9291,
"step": 101500
},
{
"epoch": 0.75,
"grad_norm": 2.663254976272583,
"learning_rate": 3.752348520432128e-05,
"loss": 1.9138,
"step": 102000
},
{
"epoch": 0.75,
"grad_norm": 2.662156581878662,
"learning_rate": 3.746232581806795e-05,
"loss": 1.9128,
"step": 102500
},
{
"epoch": 0.76,
"grad_norm": 2.6324357986450195,
"learning_rate": 3.740116643181463e-05,
"loss": 1.9215,
"step": 103000
},
{
"epoch": 0.76,
"grad_norm": 2.619344711303711,
"learning_rate": 3.7340007045561297e-05,
"loss": 1.9043,
"step": 103500
},
{
"epoch": 0.76,
"grad_norm": 2.681112051010132,
"learning_rate": 3.727884765930797e-05,
"loss": 1.9077,
"step": 104000
},
{
"epoch": 0.77,
"grad_norm": 2.846804141998291,
"learning_rate": 3.7217688273054645e-05,
"loss": 1.9034,
"step": 104500
},
{
"epoch": 0.77,
"grad_norm": 3.0270516872406006,
"learning_rate": 3.715652888680132e-05,
"loss": 1.901,
"step": 105000
},
{
"epoch": 0.77,
"grad_norm": 2.5290517807006836,
"learning_rate": 3.709536950054799e-05,
"loss": 1.9031,
"step": 105500
},
{
"epoch": 0.78,
"grad_norm": 2.661867380142212,
"learning_rate": 3.703421011429467e-05,
"loss": 1.902,
"step": 106000
},
{
"epoch": 0.78,
"grad_norm": 2.816241979598999,
"learning_rate": 3.6973050728041335e-05,
"loss": 1.8885,
"step": 106500
},
{
"epoch": 0.79,
"grad_norm": 2.8065085411071777,
"learning_rate": 3.691189134178801e-05,
"loss": 1.8931,
"step": 107000
},
{
"epoch": 0.79,
"grad_norm": 2.4863102436065674,
"learning_rate": 3.6850731955534684e-05,
"loss": 1.8907,
"step": 107500
},
{
"epoch": 0.79,
"grad_norm": 2.4044525623321533,
"learning_rate": 3.678957256928136e-05,
"loss": 1.8911,
"step": 108000
},
{
"epoch": 0.8,
"grad_norm": 2.6208319664001465,
"learning_rate": 3.6728413183028026e-05,
"loss": 1.8889,
"step": 108500
},
{
"epoch": 0.8,
"grad_norm": 2.4432547092437744,
"learning_rate": 3.66672537967747e-05,
"loss": 1.8794,
"step": 109000
},
{
"epoch": 0.8,
"grad_norm": 2.9175052642822266,
"learning_rate": 3.6606094410521374e-05,
"loss": 1.8769,
"step": 109500
},
{
"epoch": 0.81,
"grad_norm": 2.7171223163604736,
"learning_rate": 3.654493502426805e-05,
"loss": 1.8834,
"step": 110000
},
{
"epoch": 0.81,
"grad_norm": 2.5070419311523438,
"learning_rate": 3.6483775638014716e-05,
"loss": 1.8706,
"step": 110500
},
{
"epoch": 0.81,
"grad_norm": 2.59771990776062,
"learning_rate": 3.642261625176139e-05,
"loss": 1.8687,
"step": 111000
},
{
"epoch": 0.82,
"grad_norm": 3.022465944290161,
"learning_rate": 3.6361456865508064e-05,
"loss": 1.8721,
"step": 111500
},
{
"epoch": 0.82,
"grad_norm": 2.8927907943725586,
"learning_rate": 3.630029747925474e-05,
"loss": 1.8761,
"step": 112000
},
{
"epoch": 0.83,
"grad_norm": 2.612518787384033,
"learning_rate": 3.623913809300141e-05,
"loss": 1.8744,
"step": 112500
},
{
"epoch": 0.83,
"grad_norm": 2.7625935077667236,
"learning_rate": 3.617797870674808e-05,
"loss": 1.8636,
"step": 113000
},
{
"epoch": 0.83,
"grad_norm": 2.535382032394409,
"learning_rate": 3.6116819320494755e-05,
"loss": 1.8683,
"step": 113500
},
{
"epoch": 0.84,
"grad_norm": 2.575298547744751,
"learning_rate": 3.605565993424143e-05,
"loss": 1.8657,
"step": 114000
},
{
"epoch": 0.84,
"grad_norm": 2.6413776874542236,
"learning_rate": 3.59945005479881e-05,
"loss": 1.8571,
"step": 114500
},
{
"epoch": 0.84,
"grad_norm": 2.675283908843994,
"learning_rate": 3.593334116173477e-05,
"loss": 1.8575,
"step": 115000
},
{
"epoch": 0.85,
"grad_norm": 2.7104618549346924,
"learning_rate": 3.587218177548145e-05,
"loss": 1.8555,
"step": 115500
},
{
"epoch": 0.85,
"grad_norm": 2.7391483783721924,
"learning_rate": 3.581102238922812e-05,
"loss": 1.8555,
"step": 116000
},
{
"epoch": 0.86,
"grad_norm": 2.561523675918579,
"learning_rate": 3.5749863002974793e-05,
"loss": 1.8475,
"step": 116500
},
{
"epoch": 0.86,
"grad_norm": 2.590790033340454,
"learning_rate": 3.568870361672147e-05,
"loss": 1.8425,
"step": 117000
},
{
"epoch": 0.86,
"grad_norm": 2.889119863510132,
"learning_rate": 3.562754423046814e-05,
"loss": 1.8491,
"step": 117500
},
{
"epoch": 0.87,
"grad_norm": 2.77632212638855,
"learning_rate": 3.556638484421481e-05,
"loss": 1.8469,
"step": 118000
},
{
"epoch": 0.87,
"grad_norm": 2.720357894897461,
"learning_rate": 3.550522545796149e-05,
"loss": 1.8462,
"step": 118500
},
{
"epoch": 0.87,
"grad_norm": 2.8213210105895996,
"learning_rate": 3.544406607170816e-05,
"loss": 1.8399,
"step": 119000
},
{
"epoch": 0.88,
"grad_norm": 2.834599733352661,
"learning_rate": 3.538290668545483e-05,
"loss": 1.8429,
"step": 119500
},
{
"epoch": 0.88,
"grad_norm": 2.578364133834839,
"learning_rate": 3.5321747299201506e-05,
"loss": 1.8386,
"step": 120000
},
{
"epoch": 0.88,
"grad_norm": 2.6725339889526367,
"learning_rate": 3.526058791294818e-05,
"loss": 1.85,
"step": 120500
},
{
"epoch": 0.89,
"grad_norm": 2.5288286209106445,
"learning_rate": 3.519942852669485e-05,
"loss": 1.831,
"step": 121000
},
{
"epoch": 0.89,
"grad_norm": 2.4805219173431396,
"learning_rate": 3.513826914044152e-05,
"loss": 1.836,
"step": 121500
},
{
"epoch": 0.9,
"grad_norm": 2.6729605197906494,
"learning_rate": 3.50771097541882e-05,
"loss": 1.8297,
"step": 122000
},
{
"epoch": 0.9,
"grad_norm": 2.5666863918304443,
"learning_rate": 3.501595036793487e-05,
"loss": 1.8231,
"step": 122500
},
{
"epoch": 0.9,
"grad_norm": 2.5059523582458496,
"learning_rate": 3.495479098168154e-05,
"loss": 1.8296,
"step": 123000
},
{
"epoch": 0.91,
"grad_norm": 2.758755922317505,
"learning_rate": 3.489363159542822e-05,
"loss": 1.8218,
"step": 123500
},
{
"epoch": 0.91,
"grad_norm": 2.5374386310577393,
"learning_rate": 3.483247220917489e-05,
"loss": 1.8234,
"step": 124000
},
{
"epoch": 0.91,
"grad_norm": 2.5575644969940186,
"learning_rate": 3.477131282292156e-05,
"loss": 1.8158,
"step": 124500
},
{
"epoch": 0.92,
"grad_norm": 2.567166328430176,
"learning_rate": 3.4710153436668236e-05,
"loss": 1.8179,
"step": 125000
},
{
"epoch": 0.92,
"grad_norm": 2.4243805408477783,
"learning_rate": 3.464899405041491e-05,
"loss": 1.82,
"step": 125500
},
{
"epoch": 0.92,
"grad_norm": 2.665632486343384,
"learning_rate": 3.458783466416158e-05,
"loss": 1.8151,
"step": 126000
},
{
"epoch": 0.93,
"grad_norm": 2.90759539604187,
"learning_rate": 3.452667527790825e-05,
"loss": 1.8109,
"step": 126500
},
{
"epoch": 0.93,
"grad_norm": 2.6775169372558594,
"learning_rate": 3.4465515891654926e-05,
"loss": 1.8199,
"step": 127000
},
{
"epoch": 0.94,
"grad_norm": 2.430788516998291,
"learning_rate": 3.44043565054016e-05,
"loss": 1.8128,
"step": 127500
},
{
"epoch": 0.94,
"grad_norm": 2.9997665882110596,
"learning_rate": 3.4343197119148274e-05,
"loss": 1.8106,
"step": 128000
},
{
"epoch": 0.94,
"grad_norm": 2.672255039215088,
"learning_rate": 3.428203773289494e-05,
"loss": 1.8141,
"step": 128500
},
{
"epoch": 0.95,
"grad_norm": 2.4398484230041504,
"learning_rate": 3.4220878346641616e-05,
"loss": 1.8036,
"step": 129000
},
{
"epoch": 0.95,
"grad_norm": 2.897477149963379,
"learning_rate": 3.415971896038829e-05,
"loss": 1.8006,
"step": 129500
},
{
"epoch": 0.95,
"grad_norm": 2.7050111293792725,
"learning_rate": 3.4098559574134965e-05,
"loss": 1.803,
"step": 130000
},
{
"epoch": 0.96,
"grad_norm": 2.503981828689575,
"learning_rate": 3.403740018788163e-05,
"loss": 1.7992,
"step": 130500
},
{
"epoch": 0.96,
"grad_norm": 2.9682281017303467,
"learning_rate": 3.397624080162831e-05,
"loss": 1.796,
"step": 131000
},
{
"epoch": 0.97,
"grad_norm": 3.1613168716430664,
"learning_rate": 3.391508141537498e-05,
"loss": 1.8083,
"step": 131500
},
{
"epoch": 0.97,
"grad_norm": 2.6427714824676514,
"learning_rate": 3.3853922029121655e-05,
"loss": 1.7968,
"step": 132000
},
{
"epoch": 0.97,
"grad_norm": 2.6238105297088623,
"learning_rate": 3.379276264286832e-05,
"loss": 1.7887,
"step": 132500
},
{
"epoch": 0.98,
"grad_norm": 2.566740036010742,
"learning_rate": 3.3731603256615e-05,
"loss": 1.7979,
"step": 133000
},
{
"epoch": 0.98,
"grad_norm": 2.6818795204162598,
"learning_rate": 3.367044387036167e-05,
"loss": 1.7856,
"step": 133500
},
{
"epoch": 0.98,
"grad_norm": 2.7290897369384766,
"learning_rate": 3.3609284484108345e-05,
"loss": 1.785,
"step": 134000
},
{
"epoch": 0.99,
"grad_norm": 2.8657619953155518,
"learning_rate": 3.354812509785502e-05,
"loss": 1.7925,
"step": 134500
},
{
"epoch": 0.99,
"grad_norm": 2.4724438190460205,
"learning_rate": 3.3486965711601694e-05,
"loss": 1.7854,
"step": 135000
},
{
"epoch": 0.99,
"grad_norm": 2.658123016357422,
"learning_rate": 3.342580632534836e-05,
"loss": 1.7904,
"step": 135500
},
{
"epoch": 1.0,
"grad_norm": 2.828024387359619,
"learning_rate": 3.336464693909504e-05,
"loss": 1.7768,
"step": 136000
},
{
"epoch": 1.0,
"grad_norm": 2.7401976585388184,
"learning_rate": 3.330348755284171e-05,
"loss": 1.7808,
"step": 136500
},
{
"epoch": 1.01,
"grad_norm": 2.651334524154663,
"learning_rate": 3.3242328166588384e-05,
"loss": 1.7769,
"step": 137000
},
{
"epoch": 1.01,
"grad_norm": 2.9663844108581543,
"learning_rate": 3.318116878033506e-05,
"loss": 1.7685,
"step": 137500
},
{
"epoch": 1.01,
"grad_norm": 2.6409363746643066,
"learning_rate": 3.312000939408173e-05,
"loss": 1.7819,
"step": 138000
},
{
"epoch": 1.02,
"grad_norm": 2.4980876445770264,
"learning_rate": 3.30588500078284e-05,
"loss": 1.7781,
"step": 138500
},
{
"epoch": 1.02,
"grad_norm": 2.583472967147827,
"learning_rate": 3.299769062157508e-05,
"loss": 1.7642,
"step": 139000
},
{
"epoch": 1.02,
"grad_norm": 2.7035281658172607,
"learning_rate": 3.293653123532175e-05,
"loss": 1.7707,
"step": 139500
},
{
"epoch": 1.03,
"grad_norm": 2.647327184677124,
"learning_rate": 3.287537184906842e-05,
"loss": 1.77,
"step": 140000
},
{
"epoch": 1.03,
"grad_norm": 2.6016039848327637,
"learning_rate": 3.28142124628151e-05,
"loss": 1.7689,
"step": 140500
},
{
"epoch": 1.03,
"grad_norm": 2.871412515640259,
"learning_rate": 3.275305307656177e-05,
"loss": 1.7614,
"step": 141000
},
{
"epoch": 1.04,
"grad_norm": 2.5391454696655273,
"learning_rate": 3.269189369030844e-05,
"loss": 1.7666,
"step": 141500
},
{
"epoch": 1.04,
"grad_norm": 2.7399189472198486,
"learning_rate": 3.263073430405511e-05,
"loss": 1.7698,
"step": 142000
},
{
"epoch": 1.05,
"grad_norm": 2.550523281097412,
"learning_rate": 3.256957491780179e-05,
"loss": 1.7661,
"step": 142500
},
{
"epoch": 1.05,
"grad_norm": 2.7033884525299072,
"learning_rate": 3.250841553154846e-05,
"loss": 1.7576,
"step": 143000
},
{
"epoch": 1.05,
"grad_norm": 2.712890386581421,
"learning_rate": 3.244725614529513e-05,
"loss": 1.759,
"step": 143500
},
{
"epoch": 1.06,
"grad_norm": 2.7690348625183105,
"learning_rate": 3.23860967590418e-05,
"loss": 1.757,
"step": 144000
},
{
"epoch": 1.06,
"grad_norm": 2.5629446506500244,
"learning_rate": 3.232493737278848e-05,
"loss": 1.7593,
"step": 144500
},
{
"epoch": 1.06,
"grad_norm": 2.5981359481811523,
"learning_rate": 3.226377798653515e-05,
"loss": 1.7506,
"step": 145000
},
{
"epoch": 1.07,
"grad_norm": 2.465391159057617,
"learning_rate": 3.2202618600281826e-05,
"loss": 1.7558,
"step": 145500
},
{
"epoch": 1.07,
"grad_norm": 3.0468790531158447,
"learning_rate": 3.2141459214028493e-05,
"loss": 1.7559,
"step": 146000
},
{
"epoch": 1.08,
"grad_norm": 2.5434939861297607,
"learning_rate": 3.208029982777517e-05,
"loss": 1.7507,
"step": 146500
},
{
"epoch": 1.08,
"grad_norm": 2.479449987411499,
"learning_rate": 3.201914044152184e-05,
"loss": 1.7457,
"step": 147000
},
{
"epoch": 1.08,
"grad_norm": 2.621965169906616,
"learning_rate": 3.1957981055268516e-05,
"loss": 1.7513,
"step": 147500
},
{
"epoch": 1.09,
"grad_norm": 2.6055707931518555,
"learning_rate": 3.1896821669015184e-05,
"loss": 1.7427,
"step": 148000
},
{
"epoch": 1.09,
"grad_norm": 2.2937374114990234,
"learning_rate": 3.1835662282761865e-05,
"loss": 1.7524,
"step": 148500
},
{
"epoch": 1.09,
"grad_norm": 2.7363667488098145,
"learning_rate": 3.177450289650853e-05,
"loss": 1.7492,
"step": 149000
},
{
"epoch": 1.1,
"grad_norm": 2.660330057144165,
"learning_rate": 3.1713343510255207e-05,
"loss": 1.7405,
"step": 149500
},
{
"epoch": 1.1,
"grad_norm": 2.589137077331543,
"learning_rate": 3.165218412400188e-05,
"loss": 1.7389,
"step": 150000
},
{
"epoch": 1.1,
"grad_norm": 2.7419252395629883,
"learning_rate": 3.1591024737748555e-05,
"loss": 1.7478,
"step": 150500
},
{
"epoch": 1.11,
"grad_norm": 2.6772820949554443,
"learning_rate": 3.152986535149522e-05,
"loss": 1.7365,
"step": 151000
},
{
"epoch": 1.11,
"grad_norm": 2.765460968017578,
"learning_rate": 3.1468705965241904e-05,
"loss": 1.7399,
"step": 151500
},
{
"epoch": 1.12,
"grad_norm": 2.5273730754852295,
"learning_rate": 3.140754657898857e-05,
"loss": 1.7336,
"step": 152000
},
{
"epoch": 1.12,
"grad_norm": 2.6962389945983887,
"learning_rate": 3.1346387192735245e-05,
"loss": 1.7347,
"step": 152500
},
{
"epoch": 1.12,
"grad_norm": 2.6664233207702637,
"learning_rate": 3.128522780648192e-05,
"loss": 1.7212,
"step": 153000
},
{
"epoch": 1.13,
"grad_norm": 2.923704147338867,
"learning_rate": 3.1224068420228594e-05,
"loss": 1.7327,
"step": 153500
},
{
"epoch": 1.13,
"grad_norm": 2.7295753955841064,
"learning_rate": 3.116290903397526e-05,
"loss": 1.7304,
"step": 154000
},
{
"epoch": 1.13,
"grad_norm": 2.6523427963256836,
"learning_rate": 3.110174964772194e-05,
"loss": 1.7295,
"step": 154500
},
{
"epoch": 1.14,
"grad_norm": 2.544809341430664,
"learning_rate": 3.104059026146861e-05,
"loss": 1.7303,
"step": 155000
},
{
"epoch": 1.14,
"grad_norm": 2.608091354370117,
"learning_rate": 3.0979430875215284e-05,
"loss": 1.7288,
"step": 155500
},
{
"epoch": 1.14,
"grad_norm": 2.6781680583953857,
"learning_rate": 3.091827148896195e-05,
"loss": 1.7333,
"step": 156000
},
{
"epoch": 1.15,
"grad_norm": 2.5772953033447266,
"learning_rate": 3.085711210270863e-05,
"loss": 1.7313,
"step": 156500
},
{
"epoch": 1.15,
"grad_norm": 2.6381750106811523,
"learning_rate": 3.07959527164553e-05,
"loss": 1.723,
"step": 157000
},
{
"epoch": 1.16,
"grad_norm": 2.609584093093872,
"learning_rate": 3.0734793330201974e-05,
"loss": 1.7317,
"step": 157500
},
{
"epoch": 1.16,
"grad_norm": 2.556508779525757,
"learning_rate": 3.067363394394865e-05,
"loss": 1.7168,
"step": 158000
},
{
"epoch": 1.16,
"grad_norm": 2.4912405014038086,
"learning_rate": 3.061247455769532e-05,
"loss": 1.7221,
"step": 158500
},
{
"epoch": 1.17,
"grad_norm": 3.072758436203003,
"learning_rate": 3.055131517144199e-05,
"loss": 1.7193,
"step": 159000
},
{
"epoch": 1.17,
"grad_norm": 2.5012080669403076,
"learning_rate": 3.0490155785188668e-05,
"loss": 1.7187,
"step": 159500
},
{
"epoch": 1.17,
"grad_norm": 2.6786646842956543,
"learning_rate": 3.042899639893534e-05,
"loss": 1.7099,
"step": 160000
},
{
"epoch": 1.18,
"grad_norm": 2.6225085258483887,
"learning_rate": 3.036783701268201e-05,
"loss": 1.7135,
"step": 160500
},
{
"epoch": 1.18,
"grad_norm": 2.6484742164611816,
"learning_rate": 3.0306677626428687e-05,
"loss": 1.7086,
"step": 161000
},
{
"epoch": 1.19,
"grad_norm": 2.948645830154419,
"learning_rate": 3.0245518240175358e-05,
"loss": 1.7098,
"step": 161500
},
{
"epoch": 1.19,
"grad_norm": 2.766124725341797,
"learning_rate": 3.018435885392203e-05,
"loss": 1.7072,
"step": 162000
},
{
"epoch": 1.19,
"grad_norm": 2.8391172885894775,
"learning_rate": 3.0123199467668707e-05,
"loss": 1.7116,
"step": 162500
},
{
"epoch": 1.2,
"grad_norm": 2.5152761936187744,
"learning_rate": 3.0062040081415378e-05,
"loss": 1.7073,
"step": 163000
},
{
"epoch": 1.2,
"grad_norm": 2.719190835952759,
"learning_rate": 3.000088069516205e-05,
"loss": 1.7017,
"step": 163500
},
{
"epoch": 1.2,
"grad_norm": 2.5250284671783447,
"learning_rate": 2.9939721308908726e-05,
"loss": 1.7055,
"step": 164000
},
{
"epoch": 1.21,
"grad_norm": 3.0202980041503906,
"learning_rate": 2.9878561922655397e-05,
"loss": 1.7041,
"step": 164500
},
{
"epoch": 1.21,
"grad_norm": 2.7549800872802734,
"learning_rate": 2.9817402536402068e-05,
"loss": 1.6984,
"step": 165000
},
{
"epoch": 1.21,
"grad_norm": 2.891324281692505,
"learning_rate": 2.9756243150148742e-05,
"loss": 1.7033,
"step": 165500
},
{
"epoch": 1.22,
"grad_norm": 3.0068359375,
"learning_rate": 2.9695083763895416e-05,
"loss": 1.6969,
"step": 166000
},
{
"epoch": 1.22,
"grad_norm": 2.5715763568878174,
"learning_rate": 2.9633924377642087e-05,
"loss": 1.698,
"step": 166500
},
{
"epoch": 1.23,
"grad_norm": 2.5422136783599854,
"learning_rate": 2.9572764991388758e-05,
"loss": 1.6987,
"step": 167000
},
{
"epoch": 1.23,
"grad_norm": 2.5007290840148926,
"learning_rate": 2.9511605605135432e-05,
"loss": 1.6993,
"step": 167500
},
{
"epoch": 1.23,
"grad_norm": 2.5966525077819824,
"learning_rate": 2.9450446218882107e-05,
"loss": 1.6923,
"step": 168000
},
{
"epoch": 1.24,
"grad_norm": 2.5626885890960693,
"learning_rate": 2.9389286832628778e-05,
"loss": 1.6964,
"step": 168500
},
{
"epoch": 1.24,
"grad_norm": 2.76600980758667,
"learning_rate": 2.9328127446375452e-05,
"loss": 1.6875,
"step": 169000
},
{
"epoch": 1.24,
"grad_norm": 2.922257661819458,
"learning_rate": 2.9266968060122123e-05,
"loss": 1.6985,
"step": 169500
},
{
"epoch": 1.25,
"grad_norm": 2.641627311706543,
"learning_rate": 2.9205808673868794e-05,
"loss": 1.6888,
"step": 170000
},
{
"epoch": 1.25,
"grad_norm": 2.6927127838134766,
"learning_rate": 2.914464928761547e-05,
"loss": 1.6924,
"step": 170500
},
{
"epoch": 1.25,
"grad_norm": 2.4802746772766113,
"learning_rate": 2.9083489901362142e-05,
"loss": 1.6946,
"step": 171000
},
{
"epoch": 1.26,
"grad_norm": 2.5318949222564697,
"learning_rate": 2.9022330515108813e-05,
"loss": 1.6894,
"step": 171500
},
{
"epoch": 1.26,
"grad_norm": 2.7798898220062256,
"learning_rate": 2.896117112885549e-05,
"loss": 1.6861,
"step": 172000
},
{
"epoch": 1.27,
"grad_norm": 2.7204222679138184,
"learning_rate": 2.890001174260216e-05,
"loss": 1.6839,
"step": 172500
},
{
"epoch": 1.27,
"grad_norm": 2.8077077865600586,
"learning_rate": 2.8838852356348832e-05,
"loss": 1.684,
"step": 173000
},
{
"epoch": 1.27,
"grad_norm": 2.565995454788208,
"learning_rate": 2.877769297009551e-05,
"loss": 1.682,
"step": 173500
},
{
"epoch": 1.28,
"grad_norm": 2.7671403884887695,
"learning_rate": 2.871653358384218e-05,
"loss": 1.6782,
"step": 174000
},
{
"epoch": 1.28,
"grad_norm": 2.6801698207855225,
"learning_rate": 2.8655374197588852e-05,
"loss": 1.6853,
"step": 174500
},
{
"epoch": 1.28,
"grad_norm": 2.810450553894043,
"learning_rate": 2.859421481133553e-05,
"loss": 1.6841,
"step": 175000
},
{
"epoch": 1.29,
"grad_norm": 2.797452211380005,
"learning_rate": 2.85330554250822e-05,
"loss": 1.6753,
"step": 175500
},
{
"epoch": 1.29,
"grad_norm": 2.5832931995391846,
"learning_rate": 2.847189603882887e-05,
"loss": 1.6905,
"step": 176000
},
{
"epoch": 1.3,
"grad_norm": 2.8013391494750977,
"learning_rate": 2.841073665257555e-05,
"loss": 1.673,
"step": 176500
},
{
"epoch": 1.3,
"grad_norm": 2.5176284313201904,
"learning_rate": 2.834957726632222e-05,
"loss": 1.6742,
"step": 177000
},
{
"epoch": 1.3,
"grad_norm": 2.7248387336730957,
"learning_rate": 2.828841788006889e-05,
"loss": 1.6663,
"step": 177500
},
{
"epoch": 1.31,
"grad_norm": 3.006441831588745,
"learning_rate": 2.822725849381556e-05,
"loss": 1.6762,
"step": 178000
},
{
"epoch": 1.31,
"grad_norm": 2.754427671432495,
"learning_rate": 2.816609910756224e-05,
"loss": 1.6711,
"step": 178500
},
{
"epoch": 1.31,
"grad_norm": 2.6871962547302246,
"learning_rate": 2.810493972130891e-05,
"loss": 1.6749,
"step": 179000
},
{
"epoch": 1.32,
"grad_norm": 2.7660982608795166,
"learning_rate": 2.804378033505558e-05,
"loss": 1.6694,
"step": 179500
},
{
"epoch": 1.32,
"grad_norm": 2.5820930004119873,
"learning_rate": 2.798262094880226e-05,
"loss": 1.68,
"step": 180000
},
{
"epoch": 1.32,
"grad_norm": 2.50264048576355,
"learning_rate": 2.792146156254893e-05,
"loss": 1.6745,
"step": 180500
},
{
"epoch": 1.33,
"grad_norm": 2.759570837020874,
"learning_rate": 2.78603021762956e-05,
"loss": 1.6602,
"step": 181000
},
{
"epoch": 1.33,
"grad_norm": 2.6648566722869873,
"learning_rate": 2.7799142790042278e-05,
"loss": 1.67,
"step": 181500
},
{
"epoch": 1.34,
"grad_norm": 2.7140583992004395,
"learning_rate": 2.773798340378895e-05,
"loss": 1.6645,
"step": 182000
},
{
"epoch": 1.34,
"grad_norm": 2.4393863677978516,
"learning_rate": 2.767682401753562e-05,
"loss": 1.665,
"step": 182500
},
{
"epoch": 1.34,
"grad_norm": 2.66521954536438,
"learning_rate": 2.7615664631282294e-05,
"loss": 1.6694,
"step": 183000
},
{
"epoch": 1.35,
"grad_norm": 2.9926490783691406,
"learning_rate": 2.7554505245028965e-05,
"loss": 1.669,
"step": 183500
},
{
"epoch": 1.35,
"grad_norm": 2.611051321029663,
"learning_rate": 2.749334585877564e-05,
"loss": 1.6606,
"step": 184000
},
{
"epoch": 1.35,
"grad_norm": 2.6490185260772705,
"learning_rate": 2.7432186472522313e-05,
"loss": 1.6639,
"step": 184500
},
{
"epoch": 1.36,
"grad_norm": 2.7830920219421387,
"learning_rate": 2.7371027086268984e-05,
"loss": 1.6584,
"step": 185000
},
{
"epoch": 1.36,
"grad_norm": 2.776111602783203,
"learning_rate": 2.7309867700015655e-05,
"loss": 1.6564,
"step": 185500
},
{
"epoch": 1.37,
"grad_norm": 2.5335960388183594,
"learning_rate": 2.7248708313762333e-05,
"loss": 1.6553,
"step": 186000
},
{
"epoch": 1.37,
"grad_norm": 2.585458755493164,
"learning_rate": 2.7187548927509004e-05,
"loss": 1.653,
"step": 186500
},
{
"epoch": 1.37,
"grad_norm": 2.851865768432617,
"learning_rate": 2.7126389541255674e-05,
"loss": 1.6654,
"step": 187000
},
{
"epoch": 1.38,
"grad_norm": 2.649545907974243,
"learning_rate": 2.7065230155002352e-05,
"loss": 1.657,
"step": 187500
},
{
"epoch": 1.38,
"grad_norm": 2.552381753921509,
"learning_rate": 2.7004070768749023e-05,
"loss": 1.6569,
"step": 188000
},
{
"epoch": 1.38,
"grad_norm": 2.6055853366851807,
"learning_rate": 2.6942911382495694e-05,
"loss": 1.6522,
"step": 188500
},
{
"epoch": 1.39,
"grad_norm": 2.848911762237549,
"learning_rate": 2.6881751996242365e-05,
"loss": 1.6435,
"step": 189000
},
{
"epoch": 1.39,
"grad_norm": 2.8290162086486816,
"learning_rate": 2.6820592609989042e-05,
"loss": 1.6457,
"step": 189500
},
{
"epoch": 1.39,
"grad_norm": 2.682929277420044,
"learning_rate": 2.6759433223735713e-05,
"loss": 1.6521,
"step": 190000
},
{
"epoch": 1.4,
"grad_norm": 2.7035279273986816,
"learning_rate": 2.6698273837482384e-05,
"loss": 1.652,
"step": 190500
},
{
"epoch": 1.4,
"grad_norm": 2.6156182289123535,
"learning_rate": 2.6637114451229062e-05,
"loss": 1.6421,
"step": 191000
},
{
"epoch": 1.41,
"grad_norm": 2.7647957801818848,
"learning_rate": 2.6575955064975733e-05,
"loss": 1.6496,
"step": 191500
},
{
"epoch": 1.41,
"grad_norm": 2.5763864517211914,
"learning_rate": 2.6514795678722403e-05,
"loss": 1.6454,
"step": 192000
},
{
"epoch": 1.41,
"grad_norm": 2.6585116386413574,
"learning_rate": 2.645363629246908e-05,
"loss": 1.6443,
"step": 192500
},
{
"epoch": 1.42,
"grad_norm": 2.7471868991851807,
"learning_rate": 2.6392476906215752e-05,
"loss": 1.6494,
"step": 193000
},
{
"epoch": 1.42,
"grad_norm": 2.7787129878997803,
"learning_rate": 2.6331317519962423e-05,
"loss": 1.6441,
"step": 193500
},
{
"epoch": 1.42,
"grad_norm": 2.3297078609466553,
"learning_rate": 2.62701581337091e-05,
"loss": 1.6462,
"step": 194000
},
{
"epoch": 1.43,
"grad_norm": 2.8310294151306152,
"learning_rate": 2.620899874745577e-05,
"loss": 1.6473,
"step": 194500
},
{
"epoch": 1.43,
"grad_norm": 2.6443045139312744,
"learning_rate": 2.6147839361202442e-05,
"loss": 1.6468,
"step": 195000
},
{
"epoch": 1.43,
"grad_norm": 2.5064589977264404,
"learning_rate": 2.608667997494912e-05,
"loss": 1.6385,
"step": 195500
},
{
"epoch": 1.44,
"grad_norm": 2.6140296459198,
"learning_rate": 2.602552058869579e-05,
"loss": 1.6357,
"step": 196000
},
{
"epoch": 1.44,
"grad_norm": 2.461705207824707,
"learning_rate": 2.596436120244246e-05,
"loss": 1.6401,
"step": 196500
},
{
"epoch": 1.45,
"grad_norm": 2.782813787460327,
"learning_rate": 2.590320181618914e-05,
"loss": 1.6426,
"step": 197000
},
{
"epoch": 1.45,
"grad_norm": 2.5911970138549805,
"learning_rate": 2.584204242993581e-05,
"loss": 1.6409,
"step": 197500
},
{
"epoch": 1.45,
"grad_norm": 2.593752384185791,
"learning_rate": 2.578088304368248e-05,
"loss": 1.6345,
"step": 198000
},
{
"epoch": 1.46,
"grad_norm": 2.9096670150756836,
"learning_rate": 2.5719723657429155e-05,
"loss": 1.6351,
"step": 198500
},
{
"epoch": 1.46,
"grad_norm": 2.9551987648010254,
"learning_rate": 2.5658564271175826e-05,
"loss": 1.6322,
"step": 199000
},
{
"epoch": 1.46,
"grad_norm": 2.6173858642578125,
"learning_rate": 2.55974048849225e-05,
"loss": 1.6307,
"step": 199500
},
{
"epoch": 1.47,
"grad_norm": 2.695869207382202,
"learning_rate": 2.5536245498669175e-05,
"loss": 1.6264,
"step": 200000
},
{
"epoch": 1.47,
"grad_norm": 2.711869955062866,
"learning_rate": 2.5475086112415846e-05,
"loss": 1.6283,
"step": 200500
},
{
"epoch": 1.48,
"grad_norm": 2.570518732070923,
"learning_rate": 2.5413926726162516e-05,
"loss": 1.6323,
"step": 201000
},
{
"epoch": 1.48,
"grad_norm": 2.7032439708709717,
"learning_rate": 2.535276733990919e-05,
"loss": 1.6339,
"step": 201500
},
{
"epoch": 1.48,
"grad_norm": 2.7625739574432373,
"learning_rate": 2.5291607953655865e-05,
"loss": 1.6279,
"step": 202000
},
{
"epoch": 1.49,
"grad_norm": 2.829380989074707,
"learning_rate": 2.5230448567402536e-05,
"loss": 1.6263,
"step": 202500
},
{
"epoch": 1.49,
"grad_norm": 2.499410629272461,
"learning_rate": 2.5169289181149207e-05,
"loss": 1.6201,
"step": 203000
},
{
"epoch": 1.49,
"grad_norm": 2.6228952407836914,
"learning_rate": 2.5108129794895884e-05,
"loss": 1.6244,
"step": 203500
},
{
"epoch": 1.5,
"grad_norm": 2.609665870666504,
"learning_rate": 2.5046970408642555e-05,
"loss": 1.6319,
"step": 204000
},
{
"epoch": 1.5,
"grad_norm": 2.8935351371765137,
"learning_rate": 2.498581102238923e-05,
"loss": 1.6216,
"step": 204500
},
{
"epoch": 1.5,
"grad_norm": 2.7964882850646973,
"learning_rate": 2.49246516361359e-05,
"loss": 1.621,
"step": 205000
},
{
"epoch": 1.51,
"grad_norm": 2.465930938720703,
"learning_rate": 2.4863492249882575e-05,
"loss": 1.6332,
"step": 205500
},
{
"epoch": 1.51,
"grad_norm": 2.9245595932006836,
"learning_rate": 2.480233286362925e-05,
"loss": 1.6239,
"step": 206000
},
{
"epoch": 1.52,
"grad_norm": 2.548551321029663,
"learning_rate": 2.474117347737592e-05,
"loss": 1.6148,
"step": 206500
},
{
"epoch": 1.52,
"grad_norm": 2.6611809730529785,
"learning_rate": 2.4680014091122594e-05,
"loss": 1.6216,
"step": 207000
},
{
"epoch": 1.52,
"grad_norm": 2.6596455574035645,
"learning_rate": 2.4618854704869268e-05,
"loss": 1.6057,
"step": 207500
},
{
"epoch": 1.53,
"grad_norm": 2.645918607711792,
"learning_rate": 2.455769531861594e-05,
"loss": 1.613,
"step": 208000
},
{
"epoch": 1.53,
"grad_norm": 2.6304965019226074,
"learning_rate": 2.4496535932362613e-05,
"loss": 1.6148,
"step": 208500
},
{
"epoch": 1.53,
"grad_norm": 2.9523110389709473,
"learning_rate": 2.4435376546109284e-05,
"loss": 1.615,
"step": 209000
},
{
"epoch": 1.54,
"grad_norm": 2.6215062141418457,
"learning_rate": 2.437421715985596e-05,
"loss": 1.623,
"step": 209500
},
{
"epoch": 1.54,
"grad_norm": 2.7585043907165527,
"learning_rate": 2.4313057773602633e-05,
"loss": 1.6228,
"step": 210000
},
{
"epoch": 1.54,
"grad_norm": 2.626432418823242,
"learning_rate": 2.4251898387349304e-05,
"loss": 1.6152,
"step": 210500
},
{
"epoch": 1.55,
"grad_norm": 2.481905221939087,
"learning_rate": 2.4190739001095978e-05,
"loss": 1.6041,
"step": 211000
},
{
"epoch": 1.55,
"grad_norm": 2.5762555599212646,
"learning_rate": 2.4129579614842652e-05,
"loss": 1.6117,
"step": 211500
},
{
"epoch": 1.56,
"grad_norm": 2.6616873741149902,
"learning_rate": 2.4068420228589323e-05,
"loss": 1.6123,
"step": 212000
},
{
"epoch": 1.56,
"grad_norm": 2.6225013732910156,
"learning_rate": 2.4007260842335997e-05,
"loss": 1.6096,
"step": 212500
},
{
"epoch": 1.56,
"grad_norm": 2.6868574619293213,
"learning_rate": 2.394610145608267e-05,
"loss": 1.6103,
"step": 213000
},
{
"epoch": 1.57,
"grad_norm": 2.8061540126800537,
"learning_rate": 2.3884942069829342e-05,
"loss": 1.6051,
"step": 213500
},
{
"epoch": 1.57,
"grad_norm": 2.733086585998535,
"learning_rate": 2.3823782683576017e-05,
"loss": 1.6069,
"step": 214000
},
{
"epoch": 1.57,
"grad_norm": 2.596497058868408,
"learning_rate": 2.3762623297322688e-05,
"loss": 1.602,
"step": 214500
},
{
"epoch": 1.58,
"grad_norm": 2.496598243713379,
"learning_rate": 2.3701463911069362e-05,
"loss": 1.6107,
"step": 215000
},
{
"epoch": 1.58,
"grad_norm": 2.4470176696777344,
"learning_rate": 2.3640304524816033e-05,
"loss": 1.6021,
"step": 215500
},
{
"epoch": 1.59,
"grad_norm": 2.589895486831665,
"learning_rate": 2.3579145138562707e-05,
"loss": 1.6029,
"step": 216000
},
{
"epoch": 1.59,
"grad_norm": 2.7477266788482666,
"learning_rate": 2.3517985752309378e-05,
"loss": 1.6011,
"step": 216500
},
{
"epoch": 1.59,
"grad_norm": 2.7007384300231934,
"learning_rate": 2.3456826366056052e-05,
"loss": 1.6023,
"step": 217000
},
{
"epoch": 1.6,
"grad_norm": 2.6846890449523926,
"learning_rate": 2.3395666979802723e-05,
"loss": 1.592,
"step": 217500
},
{
"epoch": 1.6,
"grad_norm": 2.69858455657959,
"learning_rate": 2.3334507593549397e-05,
"loss": 1.6047,
"step": 218000
},
{
"epoch": 1.6,
"grad_norm": 2.6157824993133545,
"learning_rate": 2.327334820729607e-05,
"loss": 1.5992,
"step": 218500
},
{
"epoch": 1.61,
"grad_norm": 2.616908073425293,
"learning_rate": 2.3212188821042742e-05,
"loss": 1.596,
"step": 219000
},
{
"epoch": 1.61,
"grad_norm": 2.7912027835845947,
"learning_rate": 2.3151029434789417e-05,
"loss": 1.604,
"step": 219500
},
{
"epoch": 1.61,
"grad_norm": 2.6151885986328125,
"learning_rate": 2.308987004853609e-05,
"loss": 1.5953,
"step": 220000
},
{
"epoch": 1.62,
"grad_norm": 2.8206794261932373,
"learning_rate": 2.3028710662282762e-05,
"loss": 1.602,
"step": 220500
},
{
"epoch": 1.62,
"grad_norm": 2.6507091522216797,
"learning_rate": 2.2967551276029436e-05,
"loss": 1.5903,
"step": 221000
},
{
"epoch": 1.63,
"grad_norm": 2.752617359161377,
"learning_rate": 2.2906391889776107e-05,
"loss": 1.5971,
"step": 221500
},
{
"epoch": 1.63,
"grad_norm": 2.8615899085998535,
"learning_rate": 2.284523250352278e-05,
"loss": 1.596,
"step": 222000
},
{
"epoch": 1.63,
"grad_norm": 3.0563414096832275,
"learning_rate": 2.2784073117269455e-05,
"loss": 1.5993,
"step": 222500
},
{
"epoch": 1.64,
"grad_norm": 2.715120553970337,
"learning_rate": 2.2722913731016126e-05,
"loss": 1.5965,
"step": 223000
},
{
"epoch": 1.64,
"grad_norm": 2.8256382942199707,
"learning_rate": 2.26617543447628e-05,
"loss": 1.5883,
"step": 223500
},
{
"epoch": 1.64,
"grad_norm": 2.8050873279571533,
"learning_rate": 2.2600594958509475e-05,
"loss": 1.5914,
"step": 224000
},
{
"epoch": 1.65,
"grad_norm": 2.773902416229248,
"learning_rate": 2.2539435572256146e-05,
"loss": 1.5884,
"step": 224500
},
{
"epoch": 1.65,
"grad_norm": 2.7655787467956543,
"learning_rate": 2.247827618600282e-05,
"loss": 1.5965,
"step": 225000
},
{
"epoch": 1.65,
"grad_norm": 2.7787845134735107,
"learning_rate": 2.2417116799749494e-05,
"loss": 1.5872,
"step": 225500
},
{
"epoch": 1.66,
"grad_norm": 2.73518705368042,
"learning_rate": 2.2355957413496165e-05,
"loss": 1.588,
"step": 226000
},
{
"epoch": 1.66,
"grad_norm": 2.743821382522583,
"learning_rate": 2.229479802724284e-05,
"loss": 1.5871,
"step": 226500
},
{
"epoch": 1.67,
"grad_norm": 2.444350242614746,
"learning_rate": 2.223363864098951e-05,
"loss": 1.5802,
"step": 227000
},
{
"epoch": 1.67,
"grad_norm": 2.597966194152832,
"learning_rate": 2.2172479254736185e-05,
"loss": 1.5872,
"step": 227500
},
{
"epoch": 1.67,
"grad_norm": 2.7924256324768066,
"learning_rate": 2.211131986848286e-05,
"loss": 1.5877,
"step": 228000
},
{
"epoch": 1.68,
"grad_norm": 2.5780932903289795,
"learning_rate": 2.205016048222953e-05,
"loss": 1.583,
"step": 228500
},
{
"epoch": 1.68,
"grad_norm": 2.9303081035614014,
"learning_rate": 2.1989001095976204e-05,
"loss": 1.5901,
"step": 229000
},
{
"epoch": 1.68,
"grad_norm": 2.601661443710327,
"learning_rate": 2.1927841709722878e-05,
"loss": 1.5837,
"step": 229500
},
{
"epoch": 1.69,
"grad_norm": 2.6851816177368164,
"learning_rate": 2.186668232346955e-05,
"loss": 1.5736,
"step": 230000
},
{
"epoch": 1.69,
"grad_norm": 2.592660903930664,
"learning_rate": 2.1805522937216223e-05,
"loss": 1.5797,
"step": 230500
},
{
"epoch": 1.7,
"grad_norm": 2.876065492630005,
"learning_rate": 2.1744363550962894e-05,
"loss": 1.5851,
"step": 231000
},
{
"epoch": 1.7,
"grad_norm": 2.507368564605713,
"learning_rate": 2.168320416470957e-05,
"loss": 1.5868,
"step": 231500
},
{
"epoch": 1.7,
"grad_norm": 2.5661709308624268,
"learning_rate": 2.162204477845624e-05,
"loss": 1.5826,
"step": 232000
},
{
"epoch": 1.71,
"grad_norm": 2.698857545852661,
"learning_rate": 2.1560885392202914e-05,
"loss": 1.5739,
"step": 232500
},
{
"epoch": 1.71,
"grad_norm": 2.5845346450805664,
"learning_rate": 2.1499726005949584e-05,
"loss": 1.5769,
"step": 233000
},
{
"epoch": 1.71,
"grad_norm": 2.7823565006256104,
"learning_rate": 2.143856661969626e-05,
"loss": 1.5781,
"step": 233500
},
{
"epoch": 1.72,
"grad_norm": 2.675457239151001,
"learning_rate": 2.137740723344293e-05,
"loss": 1.5773,
"step": 234000
},
{
"epoch": 1.72,
"grad_norm": 2.789083957672119,
"learning_rate": 2.1316247847189604e-05,
"loss": 1.5662,
"step": 234500
},
{
"epoch": 1.72,
"grad_norm": 2.5719103813171387,
"learning_rate": 2.1255088460936278e-05,
"loss": 1.58,
"step": 235000
},
{
"epoch": 1.73,
"grad_norm": 2.7980144023895264,
"learning_rate": 2.119392907468295e-05,
"loss": 1.5769,
"step": 235500
},
{
"epoch": 1.73,
"grad_norm": 2.6691505908966064,
"learning_rate": 2.1132769688429623e-05,
"loss": 1.5703,
"step": 236000
},
{
"epoch": 1.74,
"grad_norm": 2.839600086212158,
"learning_rate": 2.1071610302176297e-05,
"loss": 1.5714,
"step": 236500
},
{
"epoch": 1.74,
"grad_norm": 2.8428940773010254,
"learning_rate": 2.101045091592297e-05,
"loss": 1.5723,
"step": 237000
},
{
"epoch": 1.74,
"grad_norm": 2.5756494998931885,
"learning_rate": 2.0949291529669643e-05,
"loss": 1.5672,
"step": 237500
},
{
"epoch": 1.75,
"grad_norm": 2.4937775135040283,
"learning_rate": 2.0888132143416314e-05,
"loss": 1.5715,
"step": 238000
},
{
"epoch": 1.75,
"grad_norm": 2.8386645317077637,
"learning_rate": 2.0826972757162988e-05,
"loss": 1.566,
"step": 238500
},
{
"epoch": 1.75,
"grad_norm": 2.9764533042907715,
"learning_rate": 2.0765813370909662e-05,
"loss": 1.5678,
"step": 239000
},
{
"epoch": 1.76,
"grad_norm": 2.5615928173065186,
"learning_rate": 2.0704653984656333e-05,
"loss": 1.5711,
"step": 239500
},
{
"epoch": 1.76,
"grad_norm": 2.431802988052368,
"learning_rate": 2.0643494598403007e-05,
"loss": 1.5632,
"step": 240000
},
{
"epoch": 1.77,
"grad_norm": 2.691328287124634,
"learning_rate": 2.058233521214968e-05,
"loss": 1.5761,
"step": 240500
},
{
"epoch": 1.77,
"grad_norm": 2.833160161972046,
"learning_rate": 2.0521175825896352e-05,
"loss": 1.568,
"step": 241000
},
{
"epoch": 1.77,
"grad_norm": 2.9443514347076416,
"learning_rate": 2.0460016439643027e-05,
"loss": 1.5644,
"step": 241500
},
{
"epoch": 1.78,
"grad_norm": 2.6418864727020264,
"learning_rate": 2.03988570533897e-05,
"loss": 1.5644,
"step": 242000
},
{
"epoch": 1.78,
"grad_norm": 2.559652090072632,
"learning_rate": 2.033769766713637e-05,
"loss": 1.5588,
"step": 242500
},
{
"epoch": 1.78,
"grad_norm": 2.376955509185791,
"learning_rate": 2.0276538280883046e-05,
"loss": 1.5659,
"step": 243000
},
{
"epoch": 1.79,
"grad_norm": 3.0132250785827637,
"learning_rate": 2.0215378894629717e-05,
"loss": 1.56,
"step": 243500
},
{
"epoch": 1.79,
"grad_norm": 2.493617534637451,
"learning_rate": 2.015421950837639e-05,
"loss": 1.5602,
"step": 244000
},
{
"epoch": 1.79,
"grad_norm": 2.6484365463256836,
"learning_rate": 2.0093060122123065e-05,
"loss": 1.5646,
"step": 244500
},
{
"epoch": 1.8,
"grad_norm": 2.5682971477508545,
"learning_rate": 2.0031900735869736e-05,
"loss": 1.5622,
"step": 245000
},
{
"epoch": 1.8,
"grad_norm": 2.783363103866577,
"learning_rate": 1.997074134961641e-05,
"loss": 1.5568,
"step": 245500
},
{
"epoch": 1.81,
"grad_norm": 2.5576345920562744,
"learning_rate": 1.9909581963363085e-05,
"loss": 1.558,
"step": 246000
},
{
"epoch": 1.81,
"grad_norm": 2.3469157218933105,
"learning_rate": 1.9848422577109756e-05,
"loss": 1.562,
"step": 246500
},
{
"epoch": 1.81,
"grad_norm": 2.7063257694244385,
"learning_rate": 1.978726319085643e-05,
"loss": 1.5582,
"step": 247000
},
{
"epoch": 1.82,
"grad_norm": 3.00256085395813,
"learning_rate": 1.97261038046031e-05,
"loss": 1.5574,
"step": 247500
},
{
"epoch": 1.82,
"grad_norm": 2.35555100440979,
"learning_rate": 1.966494441834977e-05,
"loss": 1.5567,
"step": 248000
},
{
"epoch": 1.82,
"grad_norm": 2.5847179889678955,
"learning_rate": 1.9603785032096446e-05,
"loss": 1.5647,
"step": 248500
},
{
"epoch": 1.83,
"grad_norm": 2.629279613494873,
"learning_rate": 1.9542625645843117e-05,
"loss": 1.552,
"step": 249000
},
{
"epoch": 1.83,
"grad_norm": 2.6433770656585693,
"learning_rate": 1.948146625958979e-05,
"loss": 1.5547,
"step": 249500
},
{
"epoch": 1.83,
"grad_norm": 2.6378979682922363,
"learning_rate": 1.9420306873336465e-05,
"loss": 1.5549,
"step": 250000
},
{
"epoch": 1.84,
"grad_norm": 2.7272751331329346,
"learning_rate": 1.9359147487083136e-05,
"loss": 1.5496,
"step": 250500
},
{
"epoch": 1.84,
"grad_norm": 2.661400556564331,
"learning_rate": 1.929798810082981e-05,
"loss": 1.5597,
"step": 251000
},
{
"epoch": 1.85,
"grad_norm": 2.76647686958313,
"learning_rate": 1.9236828714576485e-05,
"loss": 1.5559,
"step": 251500
},
{
"epoch": 1.85,
"grad_norm": 2.4355571269989014,
"learning_rate": 1.9175669328323156e-05,
"loss": 1.5512,
"step": 252000
},
{
"epoch": 1.85,
"grad_norm": 2.503006935119629,
"learning_rate": 1.911450994206983e-05,
"loss": 1.5459,
"step": 252500
},
{
"epoch": 1.86,
"grad_norm": 2.4940273761749268,
"learning_rate": 1.9053350555816504e-05,
"loss": 1.5563,
"step": 253000
},
{
"epoch": 1.86,
"grad_norm": 3.0512688159942627,
"learning_rate": 1.8992191169563175e-05,
"loss": 1.5542,
"step": 253500
},
{
"epoch": 1.86,
"grad_norm": 2.811276912689209,
"learning_rate": 1.893103178330985e-05,
"loss": 1.5447,
"step": 254000
},
{
"epoch": 1.87,
"grad_norm": 2.565730571746826,
"learning_rate": 1.8869872397056523e-05,
"loss": 1.5446,
"step": 254500
},
{
"epoch": 1.87,
"grad_norm": 2.6504178047180176,
"learning_rate": 1.8808713010803194e-05,
"loss": 1.5506,
"step": 255000
},
{
"epoch": 1.88,
"grad_norm": 3.0442628860473633,
"learning_rate": 1.874755362454987e-05,
"loss": 1.55,
"step": 255500
},
{
"epoch": 1.88,
"grad_norm": 2.6336920261383057,
"learning_rate": 1.868639423829654e-05,
"loss": 1.5424,
"step": 256000
},
{
"epoch": 1.88,
"grad_norm": 2.7758066654205322,
"learning_rate": 1.8625234852043214e-05,
"loss": 1.5479,
"step": 256500
},
{
"epoch": 1.89,
"grad_norm": 2.818814992904663,
"learning_rate": 1.8564075465789888e-05,
"loss": 1.5486,
"step": 257000
},
{
"epoch": 1.89,
"grad_norm": 2.6956701278686523,
"learning_rate": 1.850291607953656e-05,
"loss": 1.5497,
"step": 257500
},
{
"epoch": 1.89,
"grad_norm": 2.7896413803100586,
"learning_rate": 1.8441756693283233e-05,
"loss": 1.5437,
"step": 258000
},
{
"epoch": 1.9,
"grad_norm": 2.917079448699951,
"learning_rate": 1.8380597307029907e-05,
"loss": 1.5432,
"step": 258500
},
{
"epoch": 1.9,
"grad_norm": 2.761766195297241,
"learning_rate": 1.8319437920776578e-05,
"loss": 1.5413,
"step": 259000
},
{
"epoch": 1.9,
"grad_norm": 2.7666103839874268,
"learning_rate": 1.8258278534523253e-05,
"loss": 1.5396,
"step": 259500
},
{
"epoch": 1.91,
"grad_norm": 2.691253423690796,
"learning_rate": 1.8197119148269927e-05,
"loss": 1.5372,
"step": 260000
},
{
"epoch": 1.91,
"grad_norm": 2.911930799484253,
"learning_rate": 1.8135959762016598e-05,
"loss": 1.5485,
"step": 260500
},
{
"epoch": 1.92,
"grad_norm": 2.5208046436309814,
"learning_rate": 1.8074800375763272e-05,
"loss": 1.5438,
"step": 261000
},
{
"epoch": 1.92,
"grad_norm": 2.41379976272583,
"learning_rate": 1.8013640989509943e-05,
"loss": 1.5384,
"step": 261500
},
{
"epoch": 1.92,
"grad_norm": 2.636869430541992,
"learning_rate": 1.7952481603256617e-05,
"loss": 1.5477,
"step": 262000
},
{
"epoch": 1.93,
"grad_norm": 2.6929407119750977,
"learning_rate": 1.7891322217003288e-05,
"loss": 1.5384,
"step": 262500
},
{
"epoch": 1.93,
"grad_norm": 2.849163055419922,
"learning_rate": 1.7830162830749962e-05,
"loss": 1.5394,
"step": 263000
},
{
"epoch": 1.93,
"grad_norm": 2.5682120323181152,
"learning_rate": 1.7769003444496633e-05,
"loss": 1.5353,
"step": 263500
},
{
"epoch": 1.94,
"grad_norm": 2.5825769901275635,
"learning_rate": 1.7707844058243307e-05,
"loss": 1.535,
"step": 264000
},
{
"epoch": 1.94,
"grad_norm": 2.426283597946167,
"learning_rate": 1.7646684671989978e-05,
"loss": 1.5373,
"step": 264500
},
{
"epoch": 1.94,
"grad_norm": 2.706394910812378,
"learning_rate": 1.7585525285736652e-05,
"loss": 1.5358,
"step": 265000
},
{
"epoch": 1.95,
"grad_norm": 2.6370396614074707,
"learning_rate": 1.7524365899483327e-05,
"loss": 1.5391,
"step": 265500
},
{
"epoch": 1.95,
"grad_norm": 2.553217649459839,
"learning_rate": 1.7463206513229998e-05,
"loss": 1.5299,
"step": 266000
},
{
"epoch": 1.96,
"grad_norm": 2.884148120880127,
"learning_rate": 1.7402047126976672e-05,
"loss": 1.5432,
"step": 266500
},
{
"epoch": 1.96,
"grad_norm": 2.7331855297088623,
"learning_rate": 1.7340887740723343e-05,
"loss": 1.5329,
"step": 267000
},
{
"epoch": 1.96,
"grad_norm": 2.841865062713623,
"learning_rate": 1.7279728354470017e-05,
"loss": 1.5319,
"step": 267500
},
{
"epoch": 1.97,
"grad_norm": 2.463677406311035,
"learning_rate": 1.721856896821669e-05,
"loss": 1.5274,
"step": 268000
},
{
"epoch": 1.97,
"grad_norm": 2.7880847454071045,
"learning_rate": 1.7157409581963362e-05,
"loss": 1.5244,
"step": 268500
},
{
"epoch": 1.97,
"grad_norm": 2.5323753356933594,
"learning_rate": 1.7096250195710036e-05,
"loss": 1.5343,
"step": 269000
},
{
"epoch": 1.98,
"grad_norm": 2.93086838722229,
"learning_rate": 1.703509080945671e-05,
"loss": 1.5253,
"step": 269500
},
{
"epoch": 1.98,
"grad_norm": 2.8919107913970947,
"learning_rate": 1.697393142320338e-05,
"loss": 1.5334,
"step": 270000
},
{
"epoch": 1.99,
"grad_norm": 2.8613593578338623,
"learning_rate": 1.6912772036950056e-05,
"loss": 1.5342,
"step": 270500
},
{
"epoch": 1.99,
"grad_norm": 2.5317909717559814,
"learning_rate": 1.685161265069673e-05,
"loss": 1.5278,
"step": 271000
},
{
"epoch": 1.99,
"grad_norm": 2.832613706588745,
"learning_rate": 1.67904532644434e-05,
"loss": 1.5318,
"step": 271500
},
{
"epoch": 2.0,
"grad_norm": 2.5811901092529297,
"learning_rate": 1.6729293878190075e-05,
"loss": 1.5338,
"step": 272000
},
{
"epoch": 2.0,
"grad_norm": 2.640382766723633,
"learning_rate": 1.6668134491936746e-05,
"loss": 1.5278,
"step": 272500
},
{
"epoch": 2.0,
"grad_norm": 2.777024745941162,
"learning_rate": 1.660697510568342e-05,
"loss": 1.5192,
"step": 273000
},
{
"epoch": 2.01,
"grad_norm": 2.546867609024048,
"learning_rate": 1.6545815719430095e-05,
"loss": 1.5203,
"step": 273500
},
{
"epoch": 2.01,
"grad_norm": 2.459458589553833,
"learning_rate": 1.6484656333176765e-05,
"loss": 1.5232,
"step": 274000
},
{
"epoch": 2.01,
"grad_norm": 2.6832683086395264,
"learning_rate": 1.642349694692344e-05,
"loss": 1.5133,
"step": 274500
},
{
"epoch": 2.02,
"grad_norm": 2.6847174167633057,
"learning_rate": 1.6362337560670114e-05,
"loss": 1.5202,
"step": 275000
},
{
"epoch": 2.02,
"grad_norm": 2.819836139678955,
"learning_rate": 1.6301178174416785e-05,
"loss": 1.5225,
"step": 275500
},
{
"epoch": 2.03,
"grad_norm": 2.4789822101593018,
"learning_rate": 1.624001878816346e-05,
"loss": 1.5185,
"step": 276000
},
{
"epoch": 2.03,
"grad_norm": 2.8469271659851074,
"learning_rate": 1.6178859401910133e-05,
"loss": 1.5281,
"step": 276500
},
{
"epoch": 2.03,
"grad_norm": 2.4741554260253906,
"learning_rate": 1.6117700015656804e-05,
"loss": 1.5187,
"step": 277000
},
{
"epoch": 2.04,
"grad_norm": 2.7348639965057373,
"learning_rate": 1.605654062940348e-05,
"loss": 1.519,
"step": 277500
},
{
"epoch": 2.04,
"grad_norm": 2.590632677078247,
"learning_rate": 1.599538124315015e-05,
"loss": 1.5242,
"step": 278000
},
{
"epoch": 2.04,
"grad_norm": 2.926156997680664,
"learning_rate": 1.5934221856896824e-05,
"loss": 1.5182,
"step": 278500
},
{
"epoch": 2.05,
"grad_norm": 2.3463704586029053,
"learning_rate": 1.5873062470643494e-05,
"loss": 1.5186,
"step": 279000
},
{
"epoch": 2.05,
"grad_norm": 2.8778836727142334,
"learning_rate": 1.581190308439017e-05,
"loss": 1.5133,
"step": 279500
},
{
"epoch": 2.05,
"grad_norm": 2.7937684059143066,
"learning_rate": 1.575074369813684e-05,
"loss": 1.5204,
"step": 280000
},
{
"epoch": 2.06,
"grad_norm": 2.6967952251434326,
"learning_rate": 1.5689584311883514e-05,
"loss": 1.5238,
"step": 280500
},
{
"epoch": 2.06,
"grad_norm": 2.7939419746398926,
"learning_rate": 1.5628424925630185e-05,
"loss": 1.5162,
"step": 281000
},
{
"epoch": 2.07,
"grad_norm": 2.4184165000915527,
"learning_rate": 1.556726553937686e-05,
"loss": 1.5083,
"step": 281500
},
{
"epoch": 2.07,
"grad_norm": 2.5736517906188965,
"learning_rate": 1.5506106153123533e-05,
"loss": 1.5225,
"step": 282000
},
{
"epoch": 2.07,
"grad_norm": 2.775562286376953,
"learning_rate": 1.5444946766870204e-05,
"loss": 1.5107,
"step": 282500
},
{
"epoch": 2.08,
"grad_norm": 2.65218186378479,
"learning_rate": 1.538378738061688e-05,
"loss": 1.5101,
"step": 283000
},
{
"epoch": 2.08,
"grad_norm": 2.9510700702667236,
"learning_rate": 1.532262799436355e-05,
"loss": 1.5108,
"step": 283500
},
{
"epoch": 2.08,
"grad_norm": 2.663459300994873,
"learning_rate": 1.5261468608110224e-05,
"loss": 1.5039,
"step": 284000
},
{
"epoch": 2.09,
"grad_norm": 2.621185541152954,
"learning_rate": 1.5200309221856898e-05,
"loss": 1.5044,
"step": 284500
},
{
"epoch": 2.09,
"grad_norm": 2.7597007751464844,
"learning_rate": 1.5139149835603569e-05,
"loss": 1.5123,
"step": 285000
},
{
"epoch": 2.1,
"grad_norm": 2.9049315452575684,
"learning_rate": 1.5077990449350243e-05,
"loss": 1.5129,
"step": 285500
},
{
"epoch": 2.1,
"grad_norm": 2.7064170837402344,
"learning_rate": 1.5016831063096917e-05,
"loss": 1.5075,
"step": 286000
},
{
"epoch": 2.1,
"grad_norm": 2.8447062969207764,
"learning_rate": 1.4955671676843588e-05,
"loss": 1.5107,
"step": 286500
},
{
"epoch": 2.11,
"grad_norm": 2.63680100440979,
"learning_rate": 1.4894512290590262e-05,
"loss": 1.5095,
"step": 287000
},
{
"epoch": 2.11,
"grad_norm": 2.9696691036224365,
"learning_rate": 1.4833352904336937e-05,
"loss": 1.5069,
"step": 287500
},
{
"epoch": 2.11,
"grad_norm": 2.7010321617126465,
"learning_rate": 1.4772193518083607e-05,
"loss": 1.5094,
"step": 288000
},
{
"epoch": 2.12,
"grad_norm": 2.5756781101226807,
"learning_rate": 1.4711034131830282e-05,
"loss": 1.5054,
"step": 288500
},
{
"epoch": 2.12,
"grad_norm": 3.0450093746185303,
"learning_rate": 1.4649874745576956e-05,
"loss": 1.5115,
"step": 289000
},
{
"epoch": 2.12,
"grad_norm": 2.551755905151367,
"learning_rate": 1.4588715359323627e-05,
"loss": 1.5129,
"step": 289500
},
{
"epoch": 2.13,
"grad_norm": 2.865170478820801,
"learning_rate": 1.4527555973070301e-05,
"loss": 1.4972,
"step": 290000
},
{
"epoch": 2.13,
"grad_norm": 2.648294687271118,
"learning_rate": 1.4466396586816972e-05,
"loss": 1.5093,
"step": 290500
},
{
"epoch": 2.14,
"grad_norm": 2.600937604904175,
"learning_rate": 1.4405237200563646e-05,
"loss": 1.5043,
"step": 291000
},
{
"epoch": 2.14,
"grad_norm": 2.9919681549072266,
"learning_rate": 1.4344077814310319e-05,
"loss": 1.4997,
"step": 291500
},
{
"epoch": 2.14,
"grad_norm": 2.8291046619415283,
"learning_rate": 1.4282918428056991e-05,
"loss": 1.5196,
"step": 292000
},
{
"epoch": 2.15,
"grad_norm": 2.66756272315979,
"learning_rate": 1.4221759041803664e-05,
"loss": 1.5007,
"step": 292500
},
{
"epoch": 2.15,
"grad_norm": 2.809164524078369,
"learning_rate": 1.4160599655550338e-05,
"loss": 1.5033,
"step": 293000
},
{
"epoch": 2.15,
"grad_norm": 2.6483566761016846,
"learning_rate": 1.4099440269297009e-05,
"loss": 1.5065,
"step": 293500
},
{
"epoch": 2.16,
"grad_norm": 2.4449145793914795,
"learning_rate": 1.4038280883043683e-05,
"loss": 1.5032,
"step": 294000
},
{
"epoch": 2.16,
"grad_norm": 2.6919500827789307,
"learning_rate": 1.3977121496790358e-05,
"loss": 1.5053,
"step": 294500
},
{
"epoch": 2.17,
"grad_norm": 2.8122289180755615,
"learning_rate": 1.3915962110537028e-05,
"loss": 1.5079,
"step": 295000
},
{
"epoch": 2.17,
"grad_norm": 2.7903494834899902,
"learning_rate": 1.3854802724283703e-05,
"loss": 1.4965,
"step": 295500
},
{
"epoch": 2.17,
"grad_norm": 2.525930404663086,
"learning_rate": 1.3793643338030374e-05,
"loss": 1.5043,
"step": 296000
},
{
"epoch": 2.18,
"grad_norm": 2.493638277053833,
"learning_rate": 1.3732483951777048e-05,
"loss": 1.4974,
"step": 296500
},
{
"epoch": 2.18,
"grad_norm": 2.4521799087524414,
"learning_rate": 1.3671324565523722e-05,
"loss": 1.4941,
"step": 297000
},
{
"epoch": 2.18,
"grad_norm": 2.8091464042663574,
"learning_rate": 1.3610165179270393e-05,
"loss": 1.5018,
"step": 297500
},
{
"epoch": 2.19,
"grad_norm": 2.5954153537750244,
"learning_rate": 1.3549005793017067e-05,
"loss": 1.4999,
"step": 298000
},
{
"epoch": 2.19,
"grad_norm": 2.7937843799591064,
"learning_rate": 1.348784640676374e-05,
"loss": 1.4971,
"step": 298500
},
{
"epoch": 2.19,
"grad_norm": 2.731354236602783,
"learning_rate": 1.3426687020510412e-05,
"loss": 1.5019,
"step": 299000
},
{
"epoch": 2.2,
"grad_norm": 2.893202066421509,
"learning_rate": 1.3365527634257085e-05,
"loss": 1.5084,
"step": 299500
},
{
"epoch": 2.2,
"grad_norm": 2.5517237186431885,
"learning_rate": 1.330436824800376e-05,
"loss": 1.4979,
"step": 300000
},
{
"epoch": 2.21,
"grad_norm": 2.5626368522644043,
"learning_rate": 1.324320886175043e-05,
"loss": 1.5023,
"step": 300500
},
{
"epoch": 2.21,
"grad_norm": 2.9477968215942383,
"learning_rate": 1.3182049475497104e-05,
"loss": 1.4948,
"step": 301000
},
{
"epoch": 2.21,
"grad_norm": 2.5781774520874023,
"learning_rate": 1.3120890089243775e-05,
"loss": 1.4968,
"step": 301500
},
{
"epoch": 2.22,
"grad_norm": 2.8032429218292236,
"learning_rate": 1.305973070299045e-05,
"loss": 1.5017,
"step": 302000
},
{
"epoch": 2.22,
"grad_norm": 2.6801342964172363,
"learning_rate": 1.2998571316737124e-05,
"loss": 1.4932,
"step": 302500
},
{
"epoch": 2.22,
"grad_norm": 2.6974122524261475,
"learning_rate": 1.2937411930483795e-05,
"loss": 1.4965,
"step": 303000
},
{
"epoch": 2.23,
"grad_norm": 2.4389328956604004,
"learning_rate": 1.2876252544230469e-05,
"loss": 1.4933,
"step": 303500
},
{
"epoch": 2.23,
"grad_norm": 2.66622257232666,
"learning_rate": 1.2815093157977143e-05,
"loss": 1.4941,
"step": 304000
},
{
"epoch": 2.23,
"grad_norm": 2.6904194355010986,
"learning_rate": 1.2753933771723814e-05,
"loss": 1.4946,
"step": 304500
},
{
"epoch": 2.24,
"grad_norm": 2.673464298248291,
"learning_rate": 1.2692774385470488e-05,
"loss": 1.5015,
"step": 305000
},
{
"epoch": 2.24,
"grad_norm": 2.6104812622070312,
"learning_rate": 1.2631614999217163e-05,
"loss": 1.4886,
"step": 305500
},
{
"epoch": 2.25,
"grad_norm": 2.8773016929626465,
"learning_rate": 1.2570455612963833e-05,
"loss": 1.4916,
"step": 306000
},
{
"epoch": 2.25,
"grad_norm": 2.437274217605591,
"learning_rate": 1.2509296226710508e-05,
"loss": 1.4909,
"step": 306500
},
{
"epoch": 2.25,
"grad_norm": 3.1659114360809326,
"learning_rate": 1.244813684045718e-05,
"loss": 1.488,
"step": 307000
},
{
"epoch": 2.26,
"grad_norm": 2.607539653778076,
"learning_rate": 1.2386977454203851e-05,
"loss": 1.4916,
"step": 307500
},
{
"epoch": 2.26,
"grad_norm": 2.589136838912964,
"learning_rate": 1.2325818067950524e-05,
"loss": 1.4893,
"step": 308000
},
{
"epoch": 2.26,
"grad_norm": 3.010464668273926,
"learning_rate": 1.2264658681697198e-05,
"loss": 1.4867,
"step": 308500
},
{
"epoch": 2.27,
"grad_norm": 2.713313579559326,
"learning_rate": 1.220349929544387e-05,
"loss": 1.4788,
"step": 309000
},
{
"epoch": 2.27,
"grad_norm": 2.753493070602417,
"learning_rate": 1.2142339909190543e-05,
"loss": 1.4839,
"step": 309500
},
{
"epoch": 2.28,
"grad_norm": 2.8799803256988525,
"learning_rate": 1.2081180522937217e-05,
"loss": 1.4914,
"step": 310000
},
{
"epoch": 2.28,
"grad_norm": 2.8280301094055176,
"learning_rate": 1.202002113668389e-05,
"loss": 1.4809,
"step": 310500
},
{
"epoch": 2.28,
"grad_norm": 2.9053263664245605,
"learning_rate": 1.1958861750430562e-05,
"loss": 1.4788,
"step": 311000
},
{
"epoch": 2.29,
"grad_norm": 2.879546880722046,
"learning_rate": 1.1897702364177235e-05,
"loss": 1.4825,
"step": 311500
},
{
"epoch": 2.29,
"grad_norm": 2.473529577255249,
"learning_rate": 1.183654297792391e-05,
"loss": 1.4883,
"step": 312000
},
{
"epoch": 2.29,
"grad_norm": 2.743178367614746,
"learning_rate": 1.1775383591670582e-05,
"loss": 1.4804,
"step": 312500
},
{
"epoch": 2.3,
"grad_norm": 2.6918370723724365,
"learning_rate": 1.1714224205417254e-05,
"loss": 1.4891,
"step": 313000
},
{
"epoch": 2.3,
"grad_norm": 2.9803996086120605,
"learning_rate": 1.1653064819163927e-05,
"loss": 1.486,
"step": 313500
},
{
"epoch": 2.3,
"grad_norm": 2.544872999191284,
"learning_rate": 1.1591905432910601e-05,
"loss": 1.4879,
"step": 314000
},
{
"epoch": 2.31,
"grad_norm": 2.8242433071136475,
"learning_rate": 1.1530746046657274e-05,
"loss": 1.4848,
"step": 314500
},
{
"epoch": 2.31,
"grad_norm": 2.7912473678588867,
"learning_rate": 1.1469586660403946e-05,
"loss": 1.4847,
"step": 315000
},
{
"epoch": 2.32,
"grad_norm": 3.1455202102661133,
"learning_rate": 1.1408427274150619e-05,
"loss": 1.4899,
"step": 315500
},
{
"epoch": 2.32,
"grad_norm": 2.8553197383880615,
"learning_rate": 1.1347267887897291e-05,
"loss": 1.4799,
"step": 316000
},
{
"epoch": 2.32,
"grad_norm": 2.7605557441711426,
"learning_rate": 1.1286108501643964e-05,
"loss": 1.4808,
"step": 316500
},
{
"epoch": 2.33,
"grad_norm": 2.7065718173980713,
"learning_rate": 1.1224949115390637e-05,
"loss": 1.4846,
"step": 317000
},
{
"epoch": 2.33,
"grad_norm": 2.719977378845215,
"learning_rate": 1.1163789729137311e-05,
"loss": 1.4831,
"step": 317500
},
{
"epoch": 2.33,
"grad_norm": 2.569617509841919,
"learning_rate": 1.1102630342883983e-05,
"loss": 1.4798,
"step": 318000
},
{
"epoch": 2.34,
"grad_norm": 2.4670286178588867,
"learning_rate": 1.1041470956630656e-05,
"loss": 1.4765,
"step": 318500
},
{
"epoch": 2.34,
"grad_norm": 2.797725200653076,
"learning_rate": 1.098031157037733e-05,
"loss": 1.4817,
"step": 319000
},
{
"epoch": 2.34,
"grad_norm": 2.8332033157348633,
"learning_rate": 1.0919152184124003e-05,
"loss": 1.4835,
"step": 319500
},
{
"epoch": 2.35,
"grad_norm": 2.494609832763672,
"learning_rate": 1.0857992797870675e-05,
"loss": 1.4746,
"step": 320000
},
{
"epoch": 2.35,
"grad_norm": 2.708406925201416,
"learning_rate": 1.0796833411617348e-05,
"loss": 1.4764,
"step": 320500
},
{
"epoch": 2.36,
"grad_norm": 2.59369158744812,
"learning_rate": 1.0735674025364022e-05,
"loss": 1.4808,
"step": 321000
},
{
"epoch": 2.36,
"grad_norm": 2.803255558013916,
"learning_rate": 1.0674514639110695e-05,
"loss": 1.48,
"step": 321500
},
{
"epoch": 2.36,
"grad_norm": 2.5560402870178223,
"learning_rate": 1.0613355252857367e-05,
"loss": 1.4843,
"step": 322000
},
{
"epoch": 2.37,
"grad_norm": 2.911194324493408,
"learning_rate": 1.055219586660404e-05,
"loss": 1.4801,
"step": 322500
},
{
"epoch": 2.37,
"grad_norm": 2.8196239471435547,
"learning_rate": 1.0491036480350713e-05,
"loss": 1.4762,
"step": 323000
},
{
"epoch": 2.37,
"grad_norm": 2.709317445755005,
"learning_rate": 1.0429877094097385e-05,
"loss": 1.476,
"step": 323500
},
{
"epoch": 2.38,
"grad_norm": 2.627985715866089,
"learning_rate": 1.0368717707844058e-05,
"loss": 1.4781,
"step": 324000
},
{
"epoch": 2.38,
"grad_norm": 2.8382914066314697,
"learning_rate": 1.0307558321590732e-05,
"loss": 1.4728,
"step": 324500
},
{
"epoch": 2.39,
"grad_norm": 2.8126072883605957,
"learning_rate": 1.0246398935337404e-05,
"loss": 1.4778,
"step": 325000
},
{
"epoch": 2.39,
"grad_norm": 2.6362712383270264,
"learning_rate": 1.0185239549084077e-05,
"loss": 1.476,
"step": 325500
},
{
"epoch": 2.39,
"grad_norm": 2.761763572692871,
"learning_rate": 1.012408016283075e-05,
"loss": 1.4704,
"step": 326000
},
{
"epoch": 2.4,
"grad_norm": 2.6072146892547607,
"learning_rate": 1.0062920776577424e-05,
"loss": 1.4703,
"step": 326500
},
{
"epoch": 2.4,
"grad_norm": 3.07877254486084,
"learning_rate": 1.0001761390324096e-05,
"loss": 1.4863,
"step": 327000
},
{
"epoch": 2.4,
"grad_norm": 2.986053705215454,
"learning_rate": 9.940602004070769e-06,
"loss": 1.4837,
"step": 327500
},
{
"epoch": 2.41,
"grad_norm": 2.7128584384918213,
"learning_rate": 9.879442617817442e-06,
"loss": 1.4648,
"step": 328000
},
{
"epoch": 2.41,
"grad_norm": 2.6193928718566895,
"learning_rate": 9.818283231564116e-06,
"loss": 1.4628,
"step": 328500
},
{
"epoch": 2.41,
"grad_norm": 2.7647864818573,
"learning_rate": 9.757123845310788e-06,
"loss": 1.4799,
"step": 329000
},
{
"epoch": 2.42,
"grad_norm": 2.70143985748291,
"learning_rate": 9.695964459057461e-06,
"loss": 1.4629,
"step": 329500
},
{
"epoch": 2.42,
"grad_norm": 2.705559730529785,
"learning_rate": 9.634805072804135e-06,
"loss": 1.4662,
"step": 330000
},
{
"epoch": 2.43,
"grad_norm": 2.678466796875,
"learning_rate": 9.573645686550808e-06,
"loss": 1.4693,
"step": 330500
},
{
"epoch": 2.43,
"grad_norm": 2.5051000118255615,
"learning_rate": 9.51248630029748e-06,
"loss": 1.4705,
"step": 331000
},
{
"epoch": 2.43,
"grad_norm": 2.8841006755828857,
"learning_rate": 9.451326914044153e-06,
"loss": 1.4651,
"step": 331500
},
{
"epoch": 2.44,
"grad_norm": 2.7045044898986816,
"learning_rate": 9.390167527790825e-06,
"loss": 1.4665,
"step": 332000
},
{
"epoch": 2.44,
"grad_norm": 3.101134777069092,
"learning_rate": 9.329008141537498e-06,
"loss": 1.4746,
"step": 332500
},
{
"epoch": 2.44,
"grad_norm": 2.5567667484283447,
"learning_rate": 9.26784875528417e-06,
"loss": 1.4667,
"step": 333000
},
{
"epoch": 2.45,
"grad_norm": 2.5476863384246826,
"learning_rate": 9.206689369030843e-06,
"loss": 1.4593,
"step": 333500
},
{
"epoch": 2.45,
"grad_norm": 2.6363370418548584,
"learning_rate": 9.145529982777517e-06,
"loss": 1.4632,
"step": 334000
},
{
"epoch": 2.45,
"grad_norm": 2.9167027473449707,
"learning_rate": 9.08437059652419e-06,
"loss": 1.4595,
"step": 334500
},
{
"epoch": 2.46,
"grad_norm": 2.77966046333313,
"learning_rate": 9.023211210270863e-06,
"loss": 1.4604,
"step": 335000
},
{
"epoch": 2.46,
"grad_norm": 3.0701239109039307,
"learning_rate": 8.962051824017537e-06,
"loss": 1.4613,
"step": 335500
},
{
"epoch": 2.47,
"grad_norm": 2.6307058334350586,
"learning_rate": 8.90089243776421e-06,
"loss": 1.4627,
"step": 336000
},
{
"epoch": 2.47,
"grad_norm": 2.46291184425354,
"learning_rate": 8.839733051510882e-06,
"loss": 1.4667,
"step": 336500
},
{
"epoch": 2.47,
"grad_norm": 2.7968499660491943,
"learning_rate": 8.778573665257555e-06,
"loss": 1.4644,
"step": 337000
},
{
"epoch": 2.48,
"grad_norm": 2.7745018005371094,
"learning_rate": 8.717414279004229e-06,
"loss": 1.458,
"step": 337500
},
{
"epoch": 2.48,
"grad_norm": 2.951845645904541,
"learning_rate": 8.656254892750901e-06,
"loss": 1.4723,
"step": 338000
},
{
"epoch": 2.48,
"grad_norm": 2.6524295806884766,
"learning_rate": 8.595095506497574e-06,
"loss": 1.4607,
"step": 338500
},
{
"epoch": 2.49,
"grad_norm": 2.800586223602295,
"learning_rate": 8.533936120244246e-06,
"loss": 1.4606,
"step": 339000
},
{
"epoch": 2.49,
"grad_norm": 2.947486639022827,
"learning_rate": 8.472776733990919e-06,
"loss": 1.4608,
"step": 339500
},
{
"epoch": 2.5,
"grad_norm": 2.936547040939331,
"learning_rate": 8.411617347737592e-06,
"loss": 1.4582,
"step": 340000
},
{
"epoch": 2.5,
"grad_norm": 2.820474147796631,
"learning_rate": 8.350457961484264e-06,
"loss": 1.4648,
"step": 340500
},
{
"epoch": 2.5,
"grad_norm": 2.754638910293579,
"learning_rate": 8.289298575230938e-06,
"loss": 1.4613,
"step": 341000
},
{
"epoch": 2.51,
"grad_norm": 2.46992564201355,
"learning_rate": 8.228139188977611e-06,
"loss": 1.4685,
"step": 341500
},
{
"epoch": 2.51,
"grad_norm": 2.8246257305145264,
"learning_rate": 8.166979802724284e-06,
"loss": 1.4632,
"step": 342000
},
{
"epoch": 2.51,
"grad_norm": 2.966745138168335,
"learning_rate": 8.105820416470956e-06,
"loss": 1.4615,
"step": 342500
},
{
"epoch": 2.52,
"grad_norm": 2.9904332160949707,
"learning_rate": 8.04466103021763e-06,
"loss": 1.4646,
"step": 343000
},
{
"epoch": 2.52,
"grad_norm": 2.7209649085998535,
"learning_rate": 7.983501643964303e-06,
"loss": 1.4602,
"step": 343500
},
{
"epoch": 2.52,
"grad_norm": 2.7970163822174072,
"learning_rate": 7.922342257710976e-06,
"loss": 1.4557,
"step": 344000
},
{
"epoch": 2.53,
"grad_norm": 2.646637201309204,
"learning_rate": 7.86118287145765e-06,
"loss": 1.4554,
"step": 344500
},
{
"epoch": 2.53,
"grad_norm": 2.8239455223083496,
"learning_rate": 7.800023485204322e-06,
"loss": 1.4527,
"step": 345000
},
{
"epoch": 2.54,
"grad_norm": 2.9307682514190674,
"learning_rate": 7.738864098950995e-06,
"loss": 1.461,
"step": 345500
},
{
"epoch": 2.54,
"grad_norm": 2.840571165084839,
"learning_rate": 7.677704712697668e-06,
"loss": 1.4592,
"step": 346000
},
{
"epoch": 2.54,
"grad_norm": 2.7356936931610107,
"learning_rate": 7.616545326444341e-06,
"loss": 1.46,
"step": 346500
},
{
"epoch": 2.55,
"grad_norm": 2.902578353881836,
"learning_rate": 7.5553859401910135e-06,
"loss": 1.4627,
"step": 347000
},
{
"epoch": 2.55,
"grad_norm": 3.005869150161743,
"learning_rate": 7.494226553937686e-06,
"loss": 1.4592,
"step": 347500
},
{
"epoch": 2.55,
"grad_norm": 2.7847769260406494,
"learning_rate": 7.433067167684359e-06,
"loss": 1.4652,
"step": 348000
},
{
"epoch": 2.56,
"grad_norm": 2.821125030517578,
"learning_rate": 7.371907781431033e-06,
"loss": 1.4585,
"step": 348500
},
{
"epoch": 2.56,
"grad_norm": 3.5766139030456543,
"learning_rate": 7.3107483951777054e-06,
"loss": 1.4511,
"step": 349000
},
{
"epoch": 2.57,
"grad_norm": 2.9950060844421387,
"learning_rate": 7.249589008924377e-06,
"loss": 1.45,
"step": 349500
},
{
"epoch": 2.57,
"grad_norm": 2.922325611114502,
"learning_rate": 7.1884296226710514e-06,
"loss": 1.4549,
"step": 350000
},
{
"epoch": 2.57,
"grad_norm": 3.0797903537750244,
"learning_rate": 7.127270236417724e-06,
"loss": 1.4643,
"step": 350500
},
{
"epoch": 2.58,
"grad_norm": 2.6984283924102783,
"learning_rate": 7.0661108501643966e-06,
"loss": 1.4528,
"step": 351000
},
{
"epoch": 2.58,
"grad_norm": 2.804563283920288,
"learning_rate": 7.004951463911069e-06,
"loss": 1.4565,
"step": 351500
},
{
"epoch": 2.58,
"grad_norm": 2.6416571140289307,
"learning_rate": 6.943792077657743e-06,
"loss": 1.4523,
"step": 352000
},
{
"epoch": 2.59,
"grad_norm": 2.5998694896698,
"learning_rate": 6.882632691404416e-06,
"loss": 1.4546,
"step": 352500
},
{
"epoch": 2.59,
"grad_norm": 2.714494228363037,
"learning_rate": 6.8214733051510885e-06,
"loss": 1.4581,
"step": 353000
},
{
"epoch": 2.59,
"grad_norm": 2.8569843769073486,
"learning_rate": 6.760313918897761e-06,
"loss": 1.4562,
"step": 353500
},
{
"epoch": 2.6,
"grad_norm": 2.5230369567871094,
"learning_rate": 6.6991545326444345e-06,
"loss": 1.4469,
"step": 354000
},
{
"epoch": 2.6,
"grad_norm": 2.654069423675537,
"learning_rate": 6.637995146391107e-06,
"loss": 1.449,
"step": 354500
},
{
"epoch": 2.61,
"grad_norm": 3.2150166034698486,
"learning_rate": 6.57683576013778e-06,
"loss": 1.4506,
"step": 355000
},
{
"epoch": 2.61,
"grad_norm": 2.6119723320007324,
"learning_rate": 6.515676373884454e-06,
"loss": 1.4507,
"step": 355500
},
{
"epoch": 2.61,
"grad_norm": 2.6928510665893555,
"learning_rate": 6.4545169876311265e-06,
"loss": 1.4565,
"step": 356000
},
{
"epoch": 2.62,
"grad_norm": 2.555009603500366,
"learning_rate": 6.393357601377799e-06,
"loss": 1.4514,
"step": 356500
},
{
"epoch": 2.62,
"grad_norm": 2.785787343978882,
"learning_rate": 6.332198215124472e-06,
"loss": 1.4525,
"step": 357000
},
{
"epoch": 2.62,
"grad_norm": 2.8000409603118896,
"learning_rate": 6.271038828871145e-06,
"loss": 1.4498,
"step": 357500
},
{
"epoch": 2.63,
"grad_norm": 3.049229860305786,
"learning_rate": 6.209879442617818e-06,
"loss": 1.45,
"step": 358000
},
{
"epoch": 2.63,
"grad_norm": 2.8990321159362793,
"learning_rate": 6.14872005636449e-06,
"loss": 1.454,
"step": 358500
},
{
"epoch": 2.63,
"grad_norm": 2.6227800846099854,
"learning_rate": 6.0875606701111636e-06,
"loss": 1.4535,
"step": 359000
},
{
"epoch": 2.64,
"grad_norm": 2.856273651123047,
"learning_rate": 6.026401283857837e-06,
"loss": 1.453,
"step": 359500
},
{
"epoch": 2.64,
"grad_norm": 2.6688528060913086,
"learning_rate": 5.9652418976045095e-06,
"loss": 1.4396,
"step": 360000
},
{
"epoch": 2.65,
"grad_norm": 2.765559196472168,
"learning_rate": 5.904082511351183e-06,
"loss": 1.447,
"step": 360500
},
{
"epoch": 2.65,
"grad_norm": 2.546724319458008,
"learning_rate": 5.8429231250978555e-06,
"loss": 1.4445,
"step": 361000
},
{
"epoch": 2.65,
"grad_norm": 2.7436957359313965,
"learning_rate": 5.781763738844528e-06,
"loss": 1.4512,
"step": 361500
},
{
"epoch": 2.66,
"grad_norm": 2.612231731414795,
"learning_rate": 5.720604352591201e-06,
"loss": 1.4496,
"step": 362000
},
{
"epoch": 2.66,
"grad_norm": 2.537179946899414,
"learning_rate": 5.659444966337874e-06,
"loss": 1.4417,
"step": 362500
},
{
"epoch": 2.66,
"grad_norm": 2.82316255569458,
"learning_rate": 5.598285580084547e-06,
"loss": 1.447,
"step": 363000
},
{
"epoch": 2.67,
"grad_norm": 2.607912302017212,
"learning_rate": 5.53712619383122e-06,
"loss": 1.4465,
"step": 363500
},
{
"epoch": 2.67,
"grad_norm": 2.6389966011047363,
"learning_rate": 5.4759668075778935e-06,
"loss": 1.4511,
"step": 364000
},
{
"epoch": 2.68,
"grad_norm": 2.7712039947509766,
"learning_rate": 5.414807421324566e-06,
"loss": 1.4429,
"step": 364500
},
{
"epoch": 2.68,
"grad_norm": 2.6119940280914307,
"learning_rate": 5.353648035071239e-06,
"loss": 1.4473,
"step": 365000
},
{
"epoch": 2.68,
"grad_norm": 2.855820894241333,
"learning_rate": 5.292488648817911e-06,
"loss": 1.4409,
"step": 365500
},
{
"epoch": 2.69,
"grad_norm": 3.3360650539398193,
"learning_rate": 5.2313292625645846e-06,
"loss": 1.4448,
"step": 366000
},
{
"epoch": 2.69,
"grad_norm": 2.7165558338165283,
"learning_rate": 5.170169876311257e-06,
"loss": 1.4459,
"step": 366500
},
{
"epoch": 2.69,
"grad_norm": 2.638815402984619,
"learning_rate": 5.1090104900579306e-06,
"loss": 1.4384,
"step": 367000
},
{
"epoch": 2.7,
"grad_norm": 2.605776071548462,
"learning_rate": 5.047851103804603e-06,
"loss": 1.4515,
"step": 367500
},
{
"epoch": 2.7,
"grad_norm": 2.7470364570617676,
"learning_rate": 4.9866917175512765e-06,
"loss": 1.4433,
"step": 368000
},
{
"epoch": 2.7,
"grad_norm": 2.7127552032470703,
"learning_rate": 4.925532331297949e-06,
"loss": 1.4475,
"step": 368500
},
{
"epoch": 2.71,
"grad_norm": 2.991445541381836,
"learning_rate": 4.8643729450446225e-06,
"loss": 1.4392,
"step": 369000
},
{
"epoch": 2.71,
"grad_norm": 2.5964415073394775,
"learning_rate": 4.803213558791295e-06,
"loss": 1.4435,
"step": 369500
},
{
"epoch": 2.72,
"grad_norm": 2.73907732963562,
"learning_rate": 4.742054172537968e-06,
"loss": 1.4487,
"step": 370000
},
{
"epoch": 2.72,
"grad_norm": 2.7901346683502197,
"learning_rate": 4.680894786284641e-06,
"loss": 1.439,
"step": 370500
},
{
"epoch": 2.72,
"grad_norm": 2.4311490058898926,
"learning_rate": 4.619735400031314e-06,
"loss": 1.4449,
"step": 371000
},
{
"epoch": 2.73,
"grad_norm": 2.748305320739746,
"learning_rate": 4.558576013777987e-06,
"loss": 1.4558,
"step": 371500
},
{
"epoch": 2.73,
"grad_norm": 2.8858590126037598,
"learning_rate": 4.49741662752466e-06,
"loss": 1.4422,
"step": 372000
},
{
"epoch": 2.73,
"grad_norm": 2.893162727355957,
"learning_rate": 4.436257241271333e-06,
"loss": 1.4465,
"step": 372500
},
{
"epoch": 2.74,
"grad_norm": 2.519300937652588,
"learning_rate": 4.375097855018006e-06,
"loss": 1.4354,
"step": 373000
},
{
"epoch": 2.74,
"grad_norm": 2.798462152481079,
"learning_rate": 4.313938468764678e-06,
"loss": 1.4456,
"step": 373500
},
{
"epoch": 2.74,
"grad_norm": 2.894874095916748,
"learning_rate": 4.2527790825113516e-06,
"loss": 1.4358,
"step": 374000
},
{
"epoch": 2.75,
"grad_norm": 2.979597806930542,
"learning_rate": 4.191619696258024e-06,
"loss": 1.444,
"step": 374500
},
{
"epoch": 2.75,
"grad_norm": 2.878843069076538,
"learning_rate": 4.1304603100046975e-06,
"loss": 1.4423,
"step": 375000
},
{
"epoch": 2.76,
"grad_norm": 2.841049909591675,
"learning_rate": 4.06930092375137e-06,
"loss": 1.4403,
"step": 375500
},
{
"epoch": 2.76,
"grad_norm": 2.413423538208008,
"learning_rate": 4.0081415374980435e-06,
"loss": 1.4491,
"step": 376000
},
{
"epoch": 2.76,
"grad_norm": 2.7661304473876953,
"learning_rate": 3.946982151244716e-06,
"loss": 1.4399,
"step": 376500
},
{
"epoch": 2.77,
"grad_norm": 2.5709524154663086,
"learning_rate": 3.885822764991389e-06,
"loss": 1.4391,
"step": 377000
},
{
"epoch": 2.77,
"grad_norm": 2.8157260417938232,
"learning_rate": 3.824663378738061e-06,
"loss": 1.4368,
"step": 377500
},
{
"epoch": 2.77,
"grad_norm": 2.7713100910186768,
"learning_rate": 3.7635039924847346e-06,
"loss": 1.4314,
"step": 378000
},
{
"epoch": 2.78,
"grad_norm": 2.9643430709838867,
"learning_rate": 3.702344606231407e-06,
"loss": 1.4329,
"step": 378500
},
{
"epoch": 2.78,
"grad_norm": 2.798428535461426,
"learning_rate": 3.6411852199780806e-06,
"loss": 1.4358,
"step": 379000
},
{
"epoch": 2.79,
"grad_norm": 2.7989561557769775,
"learning_rate": 3.580025833724754e-06,
"loss": 1.4383,
"step": 379500
},
{
"epoch": 2.79,
"grad_norm": 2.718421459197998,
"learning_rate": 3.5188664474714266e-06,
"loss": 1.4335,
"step": 380000
},
{
"epoch": 2.79,
"grad_norm": 2.8248419761657715,
"learning_rate": 3.4577070612180996e-06,
"loss": 1.439,
"step": 380500
},
{
"epoch": 2.8,
"grad_norm": 2.4477925300598145,
"learning_rate": 3.396547674964772e-06,
"loss": 1.4356,
"step": 381000
},
{
"epoch": 2.8,
"grad_norm": 2.6281678676605225,
"learning_rate": 3.3353882887114456e-06,
"loss": 1.4441,
"step": 381500
},
{
"epoch": 2.8,
"grad_norm": 2.6818716526031494,
"learning_rate": 3.274228902458118e-06,
"loss": 1.434,
"step": 382000
},
{
"epoch": 2.81,
"grad_norm": 3.0857114791870117,
"learning_rate": 3.213069516204791e-06,
"loss": 1.4401,
"step": 382500
},
{
"epoch": 2.81,
"grad_norm": 2.920851230621338,
"learning_rate": 3.1519101299514637e-06,
"loss": 1.4394,
"step": 383000
},
{
"epoch": 2.81,
"grad_norm": 3.0690174102783203,
"learning_rate": 3.090750743698137e-06,
"loss": 1.4371,
"step": 383500
},
{
"epoch": 2.82,
"grad_norm": 2.846827745437622,
"learning_rate": 3.02959135744481e-06,
"loss": 1.4301,
"step": 384000
},
{
"epoch": 2.82,
"grad_norm": 2.927429676055908,
"learning_rate": 2.9684319711914827e-06,
"loss": 1.426,
"step": 384500
},
{
"epoch": 2.83,
"grad_norm": 3.124462842941284,
"learning_rate": 2.9072725849381557e-06,
"loss": 1.4312,
"step": 385000
},
{
"epoch": 2.83,
"grad_norm": 2.4795315265655518,
"learning_rate": 2.8461131986848286e-06,
"loss": 1.4406,
"step": 385500
},
{
"epoch": 2.83,
"grad_norm": 2.942664384841919,
"learning_rate": 2.7849538124315016e-06,
"loss": 1.4338,
"step": 386000
},
{
"epoch": 2.84,
"grad_norm": 2.6400539875030518,
"learning_rate": 2.7237944261781746e-06,
"loss": 1.4335,
"step": 386500
},
{
"epoch": 2.84,
"grad_norm": 2.5888681411743164,
"learning_rate": 2.6626350399248476e-06,
"loss": 1.4293,
"step": 387000
},
{
"epoch": 2.84,
"grad_norm": 2.6692755222320557,
"learning_rate": 2.6014756536715206e-06,
"loss": 1.4294,
"step": 387500
},
{
"epoch": 2.85,
"grad_norm": 2.7141776084899902,
"learning_rate": 2.540316267418193e-06,
"loss": 1.4418,
"step": 388000
},
{
"epoch": 2.85,
"grad_norm": 2.638432264328003,
"learning_rate": 2.479156881164866e-06,
"loss": 1.43,
"step": 388500
},
{
"epoch": 2.85,
"grad_norm": 2.7874903678894043,
"learning_rate": 2.417997494911539e-06,
"loss": 1.4337,
"step": 389000
},
{
"epoch": 2.86,
"grad_norm": 2.8501386642456055,
"learning_rate": 2.356838108658212e-06,
"loss": 1.4363,
"step": 389500
},
{
"epoch": 2.86,
"grad_norm": 3.1942977905273438,
"learning_rate": 2.295678722404885e-06,
"loss": 1.4362,
"step": 390000
},
{
"epoch": 2.87,
"grad_norm": 2.591784715652466,
"learning_rate": 2.2345193361515577e-06,
"loss": 1.44,
"step": 390500
},
{
"epoch": 2.87,
"grad_norm": 3.0462796688079834,
"learning_rate": 2.1733599498982307e-06,
"loss": 1.4318,
"step": 391000
},
{
"epoch": 2.87,
"grad_norm": 2.728050708770752,
"learning_rate": 2.112200563644904e-06,
"loss": 1.4404,
"step": 391500
},
{
"epoch": 2.88,
"grad_norm": 2.7059884071350098,
"learning_rate": 2.0510411773915767e-06,
"loss": 1.4343,
"step": 392000
},
{
"epoch": 2.88,
"grad_norm": 2.7897191047668457,
"learning_rate": 1.9898817911382497e-06,
"loss": 1.4264,
"step": 392500
},
{
"epoch": 2.88,
"grad_norm": 2.517503261566162,
"learning_rate": 1.9287224048849227e-06,
"loss": 1.4393,
"step": 393000
},
{
"epoch": 2.89,
"grad_norm": 2.8523480892181396,
"learning_rate": 1.8675630186315954e-06,
"loss": 1.4253,
"step": 393500
},
{
"epoch": 2.89,
"grad_norm": 2.5820095539093018,
"learning_rate": 1.8064036323782684e-06,
"loss": 1.4311,
"step": 394000
},
{
"epoch": 2.9,
"grad_norm": 2.8148810863494873,
"learning_rate": 1.7452442461249414e-06,
"loss": 1.4347,
"step": 394500
},
{
"epoch": 2.9,
"grad_norm": 2.8168435096740723,
"learning_rate": 1.6840848598716142e-06,
"loss": 1.4316,
"step": 395000
},
{
"epoch": 2.9,
"grad_norm": 2.9340474605560303,
"learning_rate": 1.6229254736182872e-06,
"loss": 1.4312,
"step": 395500
},
{
"epoch": 2.91,
"grad_norm": 2.8138039112091064,
"learning_rate": 1.5617660873649602e-06,
"loss": 1.4246,
"step": 396000
},
{
"epoch": 2.91,
"grad_norm": 2.826143980026245,
"learning_rate": 1.500606701111633e-06,
"loss": 1.4416,
"step": 396500
},
{
"epoch": 2.91,
"grad_norm": 2.8407280445098877,
"learning_rate": 1.439447314858306e-06,
"loss": 1.4341,
"step": 397000
},
{
"epoch": 2.92,
"grad_norm": 2.8632097244262695,
"learning_rate": 1.378287928604979e-06,
"loss": 1.4253,
"step": 397500
},
{
"epoch": 2.92,
"grad_norm": 2.6355011463165283,
"learning_rate": 1.317128542351652e-06,
"loss": 1.4387,
"step": 398000
},
{
"epoch": 2.92,
"grad_norm": 2.9903597831726074,
"learning_rate": 1.2559691560983247e-06,
"loss": 1.4269,
"step": 398500
},
{
"epoch": 2.93,
"grad_norm": 2.6634271144866943,
"learning_rate": 1.1948097698449977e-06,
"loss": 1.4278,
"step": 399000
},
{
"epoch": 2.93,
"grad_norm": 2.844621419906616,
"learning_rate": 1.1336503835916707e-06,
"loss": 1.4376,
"step": 399500
},
{
"epoch": 2.94,
"grad_norm": 2.8783321380615234,
"learning_rate": 1.0724909973383437e-06,
"loss": 1.4309,
"step": 400000
},
{
"epoch": 2.94,
"grad_norm": 2.565383195877075,
"learning_rate": 1.0113316110850164e-06,
"loss": 1.4182,
"step": 400500
},
{
"epoch": 2.94,
"grad_norm": 2.6493799686431885,
"learning_rate": 9.501722248316894e-07,
"loss": 1.4354,
"step": 401000
},
{
"epoch": 2.95,
"grad_norm": 2.7736098766326904,
"learning_rate": 8.890128385783623e-07,
"loss": 1.4269,
"step": 401500
},
{
"epoch": 2.95,
"grad_norm": 2.5747058391571045,
"learning_rate": 8.278534523250352e-07,
"loss": 1.4244,
"step": 402000
},
{
"epoch": 2.95,
"grad_norm": 2.9367337226867676,
"learning_rate": 7.666940660717082e-07,
"loss": 1.4339,
"step": 402500
},
{
"epoch": 2.96,
"grad_norm": 3.1144633293151855,
"learning_rate": 7.055346798183812e-07,
"loss": 1.4342,
"step": 403000
},
{
"epoch": 2.96,
"grad_norm": 2.8952739238739014,
"learning_rate": 6.443752935650541e-07,
"loss": 1.4216,
"step": 403500
},
{
"epoch": 2.97,
"grad_norm": 2.9333155155181885,
"learning_rate": 5.83215907311727e-07,
"loss": 1.4254,
"step": 404000
},
{
"epoch": 2.97,
"grad_norm": 2.900174140930176,
"learning_rate": 5.220565210583999e-07,
"loss": 1.433,
"step": 404500
},
{
"epoch": 2.97,
"grad_norm": 2.6606194972991943,
"learning_rate": 4.608971348050728e-07,
"loss": 1.427,
"step": 405000
},
{
"epoch": 2.98,
"grad_norm": 2.6916987895965576,
"learning_rate": 3.997377485517458e-07,
"loss": 1.4344,
"step": 405500
},
{
"epoch": 2.98,
"grad_norm": 2.7830684185028076,
"learning_rate": 3.385783622984187e-07,
"loss": 1.432,
"step": 406000
},
{
"epoch": 2.98,
"grad_norm": 2.9338104724884033,
"learning_rate": 2.7741897604509164e-07,
"loss": 1.4311,
"step": 406500
},
{
"epoch": 2.99,
"grad_norm": 2.861415147781372,
"learning_rate": 2.1625958979176455e-07,
"loss": 1.4245,
"step": 407000
},
{
"epoch": 2.99,
"grad_norm": 3.0713891983032227,
"learning_rate": 1.5510020353843746e-07,
"loss": 1.4246,
"step": 407500
},
{
"epoch": 2.99,
"grad_norm": 2.7229363918304443,
"learning_rate": 9.39408172851104e-08,
"loss": 1.4289,
"step": 408000
},
{
"epoch": 3.0,
"grad_norm": 2.5204861164093018,
"learning_rate": 3.278143103178331e-08,
"loss": 1.4267,
"step": 408500
},
{
"epoch": 3.0,
"step": 408768,
"total_flos": 3.442569138534612e+18,
"train_loss": 1.939745183989198,
"train_runtime": 329612.5954,
"train_samples_per_second": 39.685,
"train_steps_per_second": 1.24
}
],
"logging_steps": 500,
"max_steps": 408768,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 3.442569138534612e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}