{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 408768,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0, "grad_norm": 1.4115859270095825, "learning_rate": 4.993884061374668e-05, "loss": 7.5674, "step": 500},
    {"epoch": 0.01, "grad_norm": 1.173668384552002, "learning_rate": 4.987768122749335e-05, "loss": 7.1474, "step": 1000},
    {"epoch": 0.01, "grad_norm": 1.1809221506118774, "learning_rate": 4.981652184124002e-05, "loss": 7.0218, "step": 1500},
    {"epoch": 0.01, "grad_norm": 1.898521065711975, "learning_rate": 4.975536245498669e-05, "loss": 6.9342, "step": 2000},
    {"epoch": 0.02, "grad_norm": 2.6537320613861084, "learning_rate": 4.969420306873337e-05, "loss": 6.8595, "step": 2500},
    {"epoch": 0.02, "grad_norm": 1.9779026508331299, "learning_rate": 4.963304368248004e-05, "loss": 6.8074, "step": 3000},
    {"epoch": 0.03, "grad_norm": 3.4129042625427246, "learning_rate": 4.957188429622671e-05, "loss": 6.7628, "step": 3500},
    {"epoch": 0.03, "grad_norm": 2.0669405460357666, "learning_rate": 4.951072490997339e-05, "loss": 6.7217, "step": 4000},
    {"epoch": 0.03, "grad_norm": 2.6271133422851562, "learning_rate": 4.944956552372006e-05, "loss": 6.6808, "step": 4500},
    {"epoch": 0.04, "grad_norm": 2.944228410720825, "learning_rate": 4.938840613746673e-05, "loss": 6.6526, "step": 5000},
    {"epoch": 0.04, "grad_norm": 1.9261572360992432, "learning_rate": 4.9327246751213406e-05, "loss": 6.6157, "step": 5500},
    {"epoch": 0.04, "grad_norm": 2.7971737384796143, "learning_rate": 4.926608736496008e-05, "loss": 6.585, "step": 6000},
    {"epoch": 0.05, "grad_norm": 2.061311960220337, "learning_rate": 4.920492797870675e-05, "loss": 6.5625, "step": 6500},
    {"epoch": 0.05, "grad_norm": 1.9171266555786133, "learning_rate": 4.914376859245342e-05, "loss": 6.5476, "step": 7000},
    {"epoch": 0.06, "grad_norm": 2.557237386703491, "learning_rate": 4.9082609206200096e-05, "loss": 6.5063, "step": 7500},
    {"epoch": 0.06, "grad_norm": 1.9273940324783325, "learning_rate": 4.902144981994677e-05, "loss": 6.4848, "step": 8000},
    {"epoch": 0.06, "grad_norm": 2.46166729927063, "learning_rate": 4.896029043369344e-05, "loss": 6.4549, "step": 8500},
    {"epoch": 0.07, "grad_norm": 2.1991207599639893, "learning_rate": 4.889913104744012e-05, "loss": 6.4151, "step": 9000},
    {"epoch": 0.07, "grad_norm": 3.0304417610168457, "learning_rate": 4.8837971661186786e-05, "loss": 6.3421, "step": 9500},
    {"epoch": 0.07, "grad_norm": 2.5440313816070557, "learning_rate": 4.877681227493346e-05, "loss": 6.2588, "step": 10000},
    {"epoch": 0.08, "grad_norm": 2.8865408897399902, "learning_rate": 4.8715652888680135e-05, "loss": 6.1081, "step": 10500},
    {"epoch": 0.08, "grad_norm": 3.689366579055786, "learning_rate": 4.865449350242681e-05, "loss": 5.965, "step": 11000},
    {"epoch": 0.08, "grad_norm": 3.3553738594055176, "learning_rate": 4.8593334116173477e-05, "loss": 5.8321, "step": 11500},
    {"epoch": 0.09, "grad_norm": 3.879040479660034, "learning_rate": 4.853217472992016e-05, "loss": 5.7097, "step": 12000},
    {"epoch": 0.09, "grad_norm": 3.504615306854248, "learning_rate": 4.8471015343666825e-05, "loss": 5.5818, "step": 12500},
    {"epoch": 0.1, "grad_norm": 3.884676218032837, "learning_rate": 4.84098559574135e-05, "loss": 5.4549, "step": 13000},
    {"epoch": 0.1, "grad_norm": 3.560129165649414, "learning_rate": 4.8348696571160174e-05, "loss": 5.3163, "step": 13500},
    {"epoch": 0.1, "grad_norm": 3.542060613632202, "learning_rate": 4.828753718490685e-05, "loss": 5.1749, "step": 14000},
    {"epoch": 0.11, "grad_norm": 3.485215425491333, "learning_rate": 4.8226377798653515e-05, "loss": 5.0056, "step": 14500},
    {"epoch": 0.11, "grad_norm": 3.323887586593628, "learning_rate": 4.816521841240019e-05, "loss": 4.8195, "step": 15000},
    {"epoch": 0.11, "grad_norm": 3.07159161567688, "learning_rate": 4.8104059026146864e-05, "loss": 4.6421, "step": 15500},
    {"epoch": 0.12, "grad_norm": 3.2840936183929443, "learning_rate": 4.804289963989354e-05, "loss": 4.475, "step": 16000},
    {"epoch": 0.12, "grad_norm": 2.858837366104126, "learning_rate": 4.798174025364021e-05, "loss": 4.3387, "step": 16500},
    {"epoch": 0.12, "grad_norm": 2.913006544113159, "learning_rate": 4.792058086738688e-05, "loss": 4.2142, "step": 17000},
    {"epoch": 0.13, "grad_norm": 2.8114237785339355, "learning_rate": 4.7859421481133554e-05, "loss": 4.116, "step": 17500},
    {"epoch": 0.13, "grad_norm": 3.1581108570098877, "learning_rate": 4.779826209488023e-05, "loss": 4.0327, "step": 18000},
    {"epoch": 0.14, "grad_norm": 2.9168930053710938, "learning_rate": 4.77371027086269e-05, "loss": 3.9404, "step": 18500},
    {"epoch": 0.14, "grad_norm": 2.8912034034729004, "learning_rate": 4.767594332237357e-05, "loss": 3.874, "step": 19000},
    {"epoch": 0.14, "grad_norm": 2.909111976623535, "learning_rate": 4.7614783936120244e-05, "loss": 3.8014, "step": 19500},
    {"epoch": 0.15, "grad_norm": 3.0821359157562256, "learning_rate": 4.755362454986692e-05, "loss": 3.7285, "step": 20000},
    {"epoch": 0.15, "grad_norm": 2.603447675704956, "learning_rate": 4.749246516361359e-05, "loss": 3.6757, "step": 20500},
    {"epoch": 0.15, "grad_norm": 2.466923952102661, "learning_rate": 4.743130577736026e-05, "loss": 3.6054, "step": 21000},
    {"epoch": 0.16, "grad_norm": 2.949532985687256, "learning_rate": 4.737014639110694e-05, "loss": 3.5478, "step": 21500},
    {"epoch": 0.16, "grad_norm": 2.7499911785125732, "learning_rate": 4.730898700485361e-05, "loss": 3.5018, "step": 22000},
    {"epoch": 0.17, "grad_norm": 2.8840396404266357, "learning_rate": 4.724782761860028e-05, "loss": 3.4503, "step": 22500},
    {"epoch": 0.17, "grad_norm": 2.7901082038879395, "learning_rate": 4.718666823234696e-05, "loss": 3.385, "step": 23000},
    {"epoch": 0.17, "grad_norm": 2.499488353729248, "learning_rate": 4.712550884609363e-05, "loss": 3.3385, "step": 23500},
    {"epoch": 0.18, "grad_norm": 2.77323055267334, "learning_rate": 4.70643494598403e-05, "loss": 3.2953, "step": 24000},
    {"epoch": 0.18, "grad_norm": 3.0041913986206055, "learning_rate": 4.700319007358698e-05, "loss": 3.2472, "step": 24500},
    {"epoch": 0.18, "grad_norm": 2.72615909576416, "learning_rate": 4.694203068733365e-05, "loss": 3.2107, "step": 25000},
    {"epoch": 0.19, "grad_norm": 2.766868829727173, "learning_rate": 4.688087130108032e-05, "loss": 3.1768, "step": 25500},
    {"epoch": 0.19, "grad_norm": 2.5954720973968506, "learning_rate": 4.6819711914826996e-05, "loss": 3.1358, "step": 26000},
    {"epoch": 0.19, "grad_norm": 2.7101314067840576, "learning_rate": 4.675855252857367e-05, "loss": 3.0949, "step": 26500},
    {"epoch": 0.2, "grad_norm": 2.7481093406677246, "learning_rate": 4.669739314232034e-05, "loss": 3.0655, "step": 27000},
    {"epoch": 0.2, "grad_norm": 2.869677782058716, "learning_rate": 4.663623375606702e-05, "loss": 3.0188, "step": 27500},
    {"epoch": 0.21, "grad_norm": 3.081693649291992, "learning_rate": 4.6575074369813686e-05, "loss": 2.9893, "step": 28000},
    {"epoch": 0.21, "grad_norm": 3.1503679752349854, "learning_rate": 4.651391498356036e-05, "loss": 2.975, "step": 28500},
    {"epoch": 0.21, "grad_norm": 2.9327192306518555, "learning_rate": 4.645275559730703e-05, "loss": 2.9276, "step": 29000},
    {"epoch": 0.22, "grad_norm": 2.580777406692505, "learning_rate": 4.639159621105371e-05, "loss": 2.9062, "step": 29500},
    {"epoch": 0.22, "grad_norm": 2.6058924198150635, "learning_rate": 4.633043682480038e-05, "loss": 2.8895, "step": 30000},
    {"epoch": 0.22, "grad_norm": 2.9445390701293945, "learning_rate": 4.626927743854705e-05, "loss": 2.8634, "step": 30500},
    {"epoch": 0.23, "grad_norm": 3.4329161643981934, "learning_rate": 4.6208118052293725e-05, "loss": 2.8399, "step": 31000},
    {"epoch": 0.23, "grad_norm": 2.910855770111084, "learning_rate": 4.61469586660404e-05, "loss": 2.8167, "step": 31500},
    {"epoch": 0.23, "grad_norm": 2.9852588176727295, "learning_rate": 4.608579927978707e-05, "loss": 2.8001, "step": 32000},
    {"epoch": 0.24, "grad_norm": 2.6188220977783203, "learning_rate": 4.602463989353374e-05, "loss": 2.7929, "step": 32500},
    {"epoch": 0.24, "grad_norm": 2.753516912460327, "learning_rate": 4.5963480507280416e-05, "loss": 2.7677, "step": 33000},
    {"epoch": 0.25, "grad_norm": 2.5995850563049316, "learning_rate": 4.590232112102709e-05, "loss": 2.7399, "step": 33500},
    {"epoch": 0.25, "grad_norm": 2.697634696960449, "learning_rate": 4.5841161734773764e-05, "loss": 2.7241, "step": 34000},
    {"epoch": 0.25, "grad_norm": 3.0841758251190186, "learning_rate": 4.578000234852043e-05, "loss": 2.7143, "step": 34500},
    {"epoch": 0.26, "grad_norm": 3.0621519088745117, "learning_rate": 4.5718842962267106e-05, "loss": 2.6897, "step": 35000},
    {"epoch": 0.26, "grad_norm": 2.912416934967041, "learning_rate": 4.565768357601378e-05, "loss": 2.6762, "step": 35500},
    {"epoch": 0.26, "grad_norm": 2.95345401763916, "learning_rate": 4.5596524189760454e-05, "loss": 2.6708, "step": 36000},
    {"epoch": 0.27, "grad_norm": 2.7842564582824707, "learning_rate": 4.553536480350712e-05, "loss": 2.6507, "step": 36500},
    {"epoch": 0.27, "grad_norm": 2.9051456451416016, "learning_rate": 4.54742054172538e-05, "loss": 2.6361, "step": 37000},
    {"epoch": 0.28, "grad_norm": 2.8248302936553955, "learning_rate": 4.541304603100047e-05, "loss": 2.619, "step": 37500},
    {"epoch": 0.28, "grad_norm": 3.076840400695801, "learning_rate": 4.5351886644747145e-05, "loss": 2.6011, "step": 38000},
    {"epoch": 0.28, "grad_norm": 2.7716526985168457, "learning_rate": 4.529072725849382e-05, "loss": 2.5928, "step": 38500},
    {"epoch": 0.29, "grad_norm": 2.8269171714782715, "learning_rate": 4.522956787224049e-05, "loss": 2.5813, "step": 39000},
    {"epoch": 0.29, "grad_norm": 3.041726589202881, "learning_rate": 4.516840848598716e-05, "loss": 2.5583, "step": 39500},
    {"epoch": 0.29, "grad_norm": 2.676236391067505, "learning_rate": 4.510724909973384e-05, "loss": 2.5538, "step": 40000},
    {"epoch": 0.3, "grad_norm": 2.8164145946502686, "learning_rate": 4.504608971348051e-05, "loss": 2.55, "step": 40500},
    {"epoch": 0.3, "grad_norm": 2.637202024459839, "learning_rate": 4.498493032722718e-05, "loss": 2.5273, "step": 41000},
    {"epoch": 0.3, "grad_norm": 2.9004831314086914, "learning_rate": 4.492377094097385e-05, "loss": 2.5142, "step": 41500},
    {"epoch": 0.31, "grad_norm": 2.7393102645874023, "learning_rate": 4.486261155472053e-05, "loss": 2.5093, "step": 42000},
    {"epoch": 0.31, "grad_norm": 2.814011335372925, "learning_rate": 4.48014521684672e-05, "loss": 2.4936, "step": 42500},
    {"epoch": 0.32, "grad_norm": 2.625415802001953, "learning_rate": 4.4740292782213874e-05, "loss": 2.4787, "step": 43000},
    {"epoch": 0.32, "grad_norm": 2.955406427383423, "learning_rate": 4.467913339596055e-05, "loss": 2.471, "step": 43500},
    {"epoch": 0.32, "grad_norm": 2.6002650260925293, "learning_rate": 4.461797400970722e-05, "loss": 2.4626, "step": 44000},
    {"epoch": 0.33, "grad_norm": 2.75281023979187, "learning_rate": 4.455681462345389e-05, "loss": 2.459, "step": 44500},
    {"epoch": 0.33, "grad_norm": 2.5558836460113525, "learning_rate": 4.449565523720057e-05, "loss": 2.4441, "step": 45000},
    {"epoch": 0.33, "grad_norm": 2.634889602661133, "learning_rate": 4.443449585094724e-05, "loss": 2.4392, "step": 45500},
    {"epoch": 0.34, "grad_norm": 3.015256643295288, "learning_rate": 4.437333646469391e-05, "loss": 2.4218, "step": 46000},
    {"epoch": 0.34, "grad_norm": 2.656592607498169, "learning_rate": 4.431217707844059e-05, "loss": 2.4148, "step": 46500},
    {"epoch": 0.34, "grad_norm": 2.9560022354125977, "learning_rate": 4.425101769218726e-05, "loss": 2.4074, "step": 47000},
    {"epoch": 0.35, "grad_norm": 2.8009862899780273, "learning_rate": 4.418985830593393e-05, "loss": 2.4028, "step": 47500},
    {"epoch": 0.35, "grad_norm": 2.7291080951690674, "learning_rate": 4.41286989196806e-05, "loss": 2.3874, "step": 48000},
    {"epoch": 0.36, "grad_norm": 2.959327220916748, "learning_rate": 4.406753953342728e-05, "loss": 2.3803, "step": 48500},
    {"epoch": 0.36, "grad_norm": 2.4076898097991943, "learning_rate": 4.400638014717395e-05, "loss": 2.3641, "step": 49000},
    {"epoch": 0.36, "grad_norm": 2.703214168548584, "learning_rate": 4.3945220760920625e-05, "loss": 2.3707, "step": 49500},
    {"epoch": 0.37, "grad_norm": 2.7057530879974365, "learning_rate": 4.388406137466729e-05, "loss": 2.3551, "step": 50000},
    {"epoch": 0.37, "grad_norm": 2.656576156616211, "learning_rate": 4.382290198841397e-05, "loss": 2.3501, "step": 50500},
    {"epoch": 0.37, "grad_norm": 2.7500181198120117, "learning_rate": 4.376174260216064e-05, "loss": 2.342, "step": 51000},
    {"epoch": 0.38, "grad_norm": 2.560018301010132, "learning_rate": 4.3700583215907316e-05, "loss": 2.3378, "step": 51500},
    {"epoch": 0.38, "grad_norm": 2.5425586700439453, "learning_rate": 4.363942382965398e-05, "loss": 2.3263, "step": 52000},
    {"epoch": 0.39, "grad_norm": 2.7227046489715576, "learning_rate": 4.357826444340066e-05, "loss": 2.3216, "step": 52500},
    {"epoch": 0.39, "grad_norm": 2.8094546794891357, "learning_rate": 4.351710505714733e-05, "loss": 2.3168, "step": 53000},
    {"epoch": 0.39, "grad_norm": 2.559802293777466, "learning_rate": 4.3455945670894006e-05, "loss": 2.3129, "step": 53500},
    {"epoch": 0.4, "grad_norm": 2.9485251903533936, "learning_rate": 4.3394786284640673e-05, "loss": 2.3055, "step": 54000},
    {"epoch": 0.4, "grad_norm": 2.554258346557617, "learning_rate": 4.3333626898387355e-05, "loss": 2.2897, "step": 54500},
    {"epoch": 0.4, "grad_norm": 2.9118120670318604, "learning_rate": 4.327246751213402e-05, "loss": 2.2804, "step": 55000},
    {"epoch": 0.41, "grad_norm": 2.6395227909088135, "learning_rate": 4.3211308125880696e-05, "loss": 2.285, "step": 55500},
    {"epoch": 0.41, "grad_norm": 2.8277575969696045, "learning_rate": 4.315014873962737e-05, "loss": 2.2725, "step": 56000},
    {"epoch": 0.41, "grad_norm": 2.6950581073760986, "learning_rate": 4.3088989353374045e-05, "loss": 2.2726, "step": 56500},
    {"epoch": 0.42, "grad_norm": 2.5823190212249756, "learning_rate": 4.302782996712071e-05, "loss": 2.2676, "step": 57000},
    {"epoch": 0.42, "grad_norm": 2.9325387477874756, "learning_rate": 4.296667058086739e-05, "loss": 2.25, "step": 57500},
    {"epoch": 0.43, "grad_norm": 2.915308952331543, "learning_rate": 4.290551119461406e-05, "loss": 2.237, "step": 58000},
    {"epoch": 0.43, "grad_norm": 2.782322883605957, "learning_rate": 4.2844351808360735e-05, "loss": 2.2435, "step": 58500},
    {"epoch": 0.43, "grad_norm": 2.8868165016174316, "learning_rate": 4.278319242210741e-05, "loss": 2.2282, "step": 59000},
    {"epoch": 0.44, "grad_norm": 2.906238079071045, "learning_rate": 4.2722033035854084e-05, "loss": 2.2278, "step": 59500},
    {"epoch": 0.44, "grad_norm": 2.8209023475646973, "learning_rate": 4.266087364960075e-05, "loss": 2.2212, "step": 60000},
    {"epoch": 0.44, "grad_norm": 2.944169044494629, "learning_rate": 4.259971426334743e-05, "loss": 2.2107, "step": 60500},
    {"epoch": 0.45, "grad_norm": 3.010463237762451, "learning_rate": 4.25385548770941e-05, "loss": 2.2112, "step": 61000},
    {"epoch": 0.45, "grad_norm": 2.5480105876922607, "learning_rate": 4.2477395490840774e-05, "loss": 2.2232, "step": 61500},
    {"epoch": 0.46, "grad_norm": 2.619253635406494, "learning_rate": 4.241623610458745e-05, "loss": 2.1997, "step": 62000},
    {"epoch": 0.46, "grad_norm": 2.716602325439453, "learning_rate": 4.235507671833412e-05, "loss": 2.1967, "step": 62500},
    {"epoch": 0.46, "grad_norm": 2.7304656505584717, "learning_rate": 4.229391733208079e-05, "loss": 2.1865, "step": 63000},
    {"epoch": 0.47, "grad_norm": 2.3193280696868896, "learning_rate": 4.2232757945827464e-05, "loss": 2.1783, "step": 63500},
    {"epoch": 0.47, "grad_norm": 2.659099817276001, "learning_rate": 4.217159855957414e-05, "loss": 2.1669, "step": 64000},
    {"epoch": 0.47, "grad_norm": 2.8396430015563965, "learning_rate": 4.211043917332081e-05, "loss": 2.1794, "step": 64500},
    {"epoch": 0.48, "grad_norm": 2.5535542964935303, "learning_rate": 4.204927978706748e-05, "loss": 2.1688, "step": 65000},
    {"epoch": 0.48, "grad_norm": 2.776742696762085, "learning_rate": 4.1988120400814154e-05, "loss": 2.1661, "step": 65500},
    {"epoch": 0.48, "grad_norm": 2.4852330684661865, "learning_rate": 4.192696101456083e-05, "loss": 2.1591, "step": 66000},
    {"epoch": 0.49, "grad_norm": 2.8885903358459473, "learning_rate": 4.1865801628307496e-05, "loss": 2.1581, "step": 66500},
    {"epoch": 0.49, "grad_norm": 2.6640567779541016, "learning_rate": 4.180464224205418e-05, "loss": 2.1549, "step": 67000},
    {"epoch": 0.5, "grad_norm": 2.27127742767334, "learning_rate": 4.1743482855800845e-05, "loss": 2.1492, "step": 67500},
    {"epoch": 0.5, "grad_norm": 2.591395139694214, "learning_rate": 4.168232346954752e-05, "loss": 2.1382, "step": 68000},
    {"epoch": 0.5, "grad_norm": 2.8147051334381104, "learning_rate": 4.162116408329419e-05, "loss": 2.1407, "step": 68500},
    {"epoch": 0.51, "grad_norm": 2.650275707244873, "learning_rate": 4.156000469704087e-05, "loss": 2.1345, "step": 69000},
    {"epoch": 0.51, "grad_norm": 2.7816410064697266, "learning_rate": 4.1498845310787535e-05, "loss": 2.1371, "step": 69500},
    {"epoch": 0.51, "grad_norm": 2.6034348011016846, "learning_rate": 4.1437685924534216e-05, "loss": 2.1264, "step": 70000},
    {"epoch": 0.52, "grad_norm": 2.6060938835144043, "learning_rate": 4.1376526538280883e-05, "loss": 2.1186, "step": 70500},
    {"epoch": 0.52, "grad_norm": 2.754519462585449, "learning_rate": 4.131536715202756e-05, "loss": 2.1167, "step": 71000},
    {"epoch": 0.52, "grad_norm": 2.665511131286621, "learning_rate": 4.125420776577423e-05, "loss": 2.1025, "step": 71500},
    {"epoch": 0.53, "grad_norm": 2.8608968257904053, "learning_rate": 4.1193048379520906e-05, "loss": 2.1087, "step": 72000},
    {"epoch": 0.53, "grad_norm": 2.521726369857788, "learning_rate": 4.1131888993267574e-05, "loss": 2.1035, "step": 72500},
    {"epoch": 0.54, "grad_norm": 2.713449001312256, "learning_rate": 4.1070729607014255e-05, "loss": 2.0853, "step": 73000},
    {"epoch": 0.54, "grad_norm": 2.6165554523468018, "learning_rate": 4.100957022076092e-05, "loss": 2.1032, "step": 73500},
    {"epoch": 0.54, "grad_norm": 2.5650808811187744, "learning_rate": 4.0948410834507596e-05, "loss": 2.0823, "step": 74000},
    {"epoch": 0.55, "grad_norm": 2.7067785263061523, "learning_rate": 4.0887251448254264e-05, "loss": 2.0843, "step": 74500},
    {"epoch": 0.55, "grad_norm": 2.6049702167510986, "learning_rate": 4.0826092062000945e-05, "loss": 2.0851, "step": 75000},
    {"epoch": 0.55, "grad_norm": 2.487355947494507, "learning_rate": 4.076493267574761e-05, "loss": 2.0745, "step": 75500},
    {"epoch": 0.56, "grad_norm": 2.4251976013183594, "learning_rate": 4.070377328949429e-05, "loss": 2.0728, "step": 76000},
    {"epoch": 0.56, "grad_norm": 2.6600770950317383, "learning_rate": 4.064261390324096e-05, "loss": 2.0725, "step": 76500},
    {"epoch": 0.57, "grad_norm": 2.488598346710205, "learning_rate": 4.0581454516987635e-05, "loss": 2.0638, "step": 77000},
    {"epoch": 0.57, "grad_norm": 2.8305716514587402, "learning_rate": 4.05202951307343e-05, "loss": 2.0673, "step": 77500},
    {"epoch": 0.57, "grad_norm": 2.607948064804077, "learning_rate": 4.0459135744480984e-05, "loss": 2.0698, "step": 78000},
    {"epoch": 0.58, "grad_norm": 2.558473825454712, "learning_rate": 4.039797635822765e-05, "loss": 2.0531, "step": 78500},
    {"epoch": 0.58, "grad_norm": 2.675361394882202, "learning_rate": 4.0336816971974326e-05, "loss": 2.0568, "step": 79000},
    {"epoch": 0.58, "grad_norm": 2.430924654006958, "learning_rate": 4.0275657585721e-05, "loss": 2.0472, "step": 79500},
    {"epoch": 0.59, "grad_norm": 2.637683153152466, "learning_rate": 4.021449819946767e-05, "loss": 2.0422, "step": 80000},
    {"epoch": 0.59, "grad_norm": 2.8251748085021973, "learning_rate": 4.015333881321434e-05, "loss": 2.0438, "step": 80500},
    {"epoch": 0.59, "grad_norm": 2.9130163192749023, "learning_rate": 4.0092179426961016e-05, "loss": 2.0334, "step": 81000},
    {"epoch": 0.6, "grad_norm": 2.6857762336730957, "learning_rate": 4.003102004070769e-05, "loss": 2.0298, "step": 81500},
    {"epoch": 0.6, "grad_norm": 2.7029411792755127, "learning_rate": 3.996986065445436e-05, "loss": 2.0266, "step": 82000},
    {"epoch": 0.61, "grad_norm": 2.43390154838562, "learning_rate": 3.990870126820104e-05, "loss": 2.0323, "step": 82500},
    {"epoch": 0.61, "grad_norm": 2.8077657222747803, "learning_rate": 3.9847541881947706e-05, "loss": 2.0204, "step": 83000},
    {"epoch": 0.61, "grad_norm": 2.6263062953948975, "learning_rate": 3.978638249569438e-05, "loss": 2.0238, "step": 83500},
    {"epoch": 0.62, "grad_norm": 2.582228660583496, "learning_rate": 3.9725223109441055e-05, "loss": 2.0157, "step": 84000},
    {"epoch": 0.62, "grad_norm": 2.989870548248291, "learning_rate": 3.966406372318773e-05, "loss": 2.0123, "step": 84500},
    {"epoch": 0.62, "grad_norm": 2.50876522064209, "learning_rate": 3.9602904336934396e-05, "loss": 2.0049, "step": 85000},
    {"epoch": 0.63, "grad_norm": 2.754103183746338, "learning_rate": 3.954174495068108e-05, "loss": 2.0108, "step": 85500},
    {"epoch": 0.63, "grad_norm": 2.742558240890503, "learning_rate": 3.9480585564427745e-05, "loss": 2.0013, "step": 86000},
    {"epoch": 0.63, "grad_norm": 2.7211074829101562, "learning_rate": 3.941942617817442e-05, "loss": 2.0056, "step": 86500},
    {"epoch": 0.64, "grad_norm": 2.5889461040496826, "learning_rate": 3.9358266791921087e-05, "loss": 1.9933, "step": 87000},
    {"epoch": 0.64, "grad_norm": 2.581122398376465, "learning_rate": 3.929710740566777e-05, "loss": 1.9896, "step": 87500},
    {"epoch": 0.65, "grad_norm": 2.7021663188934326, "learning_rate": 3.9235948019414435e-05, "loss": 1.9878, "step": 88000},
    {"epoch": 0.65, "grad_norm": 2.6844136714935303, "learning_rate": 3.917478863316111e-05, "loss": 1.9838, "step": 88500},
    {"epoch": 0.65, "grad_norm": 2.6349422931671143, "learning_rate": 3.9113629246907784e-05, "loss": 1.9796, "step": 89000},
    {"epoch": 0.66, "grad_norm": 2.6799721717834473, "learning_rate": 3.905246986065446e-05, "loss": 1.9793, "step": 89500},
    {"epoch": 0.66, "grad_norm": 2.502464771270752, "learning_rate": 3.8991310474401125e-05, "loss": 1.9789, "step": 90000},
    {"epoch": 0.66, "grad_norm": 2.897421360015869, "learning_rate": 3.8930151088147806e-05, "loss": 1.9766, "step": 90500},
    {"epoch": 0.67, "grad_norm": 2.6226820945739746, "learning_rate": 3.8868991701894474e-05, "loss": 1.9785, "step": 91000},
    {"epoch": 0.67, "grad_norm": 2.7630228996276855, "learning_rate": 3.880783231564115e-05, "loss": 1.9805, "step": 91500},
    {"epoch": 0.68, "grad_norm": 2.7849583625793457, "learning_rate": 3.874667292938782e-05, "loss": 1.9672, "step": 92000},
    {"epoch": 0.68, "grad_norm": 2.643397569656372, "learning_rate": 3.86855135431345e-05, "loss": 1.9618, "step": 92500},
    {"epoch": 0.68, "grad_norm": 2.6938283443450928, "learning_rate": 3.8624354156881164e-05, "loss": 1.9674, "step": 93000},
    {"epoch": 0.69, "grad_norm": 2.7914974689483643, "learning_rate": 3.856319477062784e-05, "loss": 1.9616, "step": 93500},
    {"epoch": 0.69, "grad_norm": 2.6223716735839844, "learning_rate": 3.850203538437451e-05, "loss": 1.9566, "step": 94000},
    {"epoch": 0.69, "grad_norm": 2.6575443744659424, "learning_rate": 3.844087599812119e-05, "loss": 1.9611, "step": 94500},
    {"epoch": 0.7, "grad_norm": 2.684488534927368, "learning_rate": 3.837971661186786e-05, "loss": 1.9467, "step": 95000},
    {"epoch": 0.7, "grad_norm": 2.668365001678467, "learning_rate": 3.831855722561453e-05, "loss": 1.9548, "step": 95500},
    {"epoch": 0.7, "grad_norm": 2.967519521713257, "learning_rate": 3.82573978393612e-05, "loss": 1.9535, "step": 96000},
    {"epoch": 0.71, "grad_norm": 2.5876193046569824, "learning_rate": 3.819623845310788e-05, "loss": 1.9446, "step": 96500},
    {"epoch": 0.71, "grad_norm": 2.7293176651000977, "learning_rate": 3.813507906685455e-05, "loss": 1.9481, "step": 97000},
    {"epoch": 0.72, "grad_norm": 2.6928365230560303, "learning_rate": 3.807391968060122e-05, "loss": 1.9296, "step": 97500},
    {"epoch": 0.72, "grad_norm": 2.568150043487549, "learning_rate": 3.801276029434789e-05, "loss": 1.9422, "step": 98000},
    {"epoch": 0.72, "grad_norm": 2.7275748252868652, "learning_rate": 3.795160090809457e-05, "loss": 1.9314, "step": 98500},
    {"epoch": 0.73, "grad_norm": 2.613135576248169, "learning_rate": 3.789044152184124e-05, "loss": 1.931, "step": 99000},
    {"epoch": 0.73, "grad_norm": 2.408534049987793, "learning_rate": 3.782928213558791e-05, "loss": 1.9314, "step": 99500},
    {"epoch": 0.73, "grad_norm": 2.896430015563965, "learning_rate": 3.776812274933459e-05, "loss": 1.9221, "step": 100000},
    {"epoch": 0.74, "grad_norm": 2.6729750633239746, "learning_rate": 3.770696336308126e-05, "loss": 1.9247, "step": 100500},
    {"epoch": 0.74, "grad_norm": 2.5819644927978516, "learning_rate": 3.764580397682793e-05, "loss": 1.9243, "step": 101000},
    {"epoch": 0.74, "grad_norm": 2.561739206314087, "learning_rate": 3.7584644590574606e-05, "loss": 1.9291, "step": 101500},
    {"epoch": 0.75, "grad_norm": 2.663254976272583, "learning_rate": 3.752348520432128e-05, "loss": 1.9138, "step": 102000},
    {"epoch": 0.75, "grad_norm": 2.662156581878662, "learning_rate": 3.746232581806795e-05, "loss": 1.9128, "step": 102500},
    {"epoch": 0.76, "grad_norm": 2.6324357986450195, "learning_rate": 3.740116643181463e-05, "loss": 1.9215, "step": 103000},
    {"epoch": 0.76, "grad_norm": 2.619344711303711, "learning_rate": 3.7340007045561297e-05, "loss": 1.9043, "step": 103500},
    {"epoch": 0.76, "grad_norm": 2.681112051010132, "learning_rate": 3.727884765930797e-05, "loss": 1.9077, "step": 104000},
    {"epoch": 0.77, "grad_norm": 2.846804141998291, "learning_rate": 3.7217688273054645e-05, "loss": 1.9034, "step": 104500},
    {"epoch": 0.77, "grad_norm": 3.0270516872406006, "learning_rate": 3.715652888680132e-05, "loss": 1.901, "step": 105000},
    {"epoch": 0.77, "grad_norm": 2.5290517807006836, "learning_rate": 3.709536950054799e-05, "loss": 1.9031, "step": 105500},
    {"epoch": 0.78, "grad_norm": 2.661867380142212, "learning_rate": 3.703421011429467e-05, "loss": 1.902, "step": 106000},
    {"epoch": 0.78, "grad_norm": 2.816241979598999, "learning_rate": 3.6973050728041335e-05, "loss": 1.8885, "step": 106500},
    {"epoch": 0.79, "grad_norm": 2.8065085411071777, "learning_rate": 3.691189134178801e-05, "loss": 1.8931, "step": 107000},
    {"epoch": 0.79, "grad_norm": 2.4863102436065674, "learning_rate": 3.6850731955534684e-05, "loss": 1.8907, "step": 107500},
    {"epoch": 0.79, "grad_norm": 2.4044525623321533, "learning_rate": 3.678957256928136e-05, "loss": 1.8911, "step": 108000},
    {"epoch": 0.8, "grad_norm": 2.6208319664001465, "learning_rate": 3.6728413183028026e-05, "loss": 1.8889, "step": 108500},
    {"epoch": 0.8, "grad_norm": 2.4432547092437744, "learning_rate": 3.66672537967747e-05, "loss": 1.8794, "step": 109000},
    {"epoch": 0.8, "grad_norm": 2.9175052642822266, "learning_rate": 3.6606094410521374e-05, "loss": 1.8769, "step": 109500},
    {"epoch": 0.81, "grad_norm": 2.7171223163604736, "learning_rate": 3.654493502426805e-05, "loss": 1.8834, "step": 110000},
    {"epoch": 0.81, "grad_norm": 2.5070419311523438, "learning_rate": 3.6483775638014716e-05, "loss": 1.8706, "step": 110500},
    {"epoch": 0.81, "grad_norm": 2.59771990776062, "learning_rate": 3.642261625176139e-05, "loss": 1.8687, "step": 111000},
    {"epoch": 0.82, "grad_norm": 3.022465944290161, "learning_rate": 3.6361456865508064e-05, "loss": 1.8721, "step": 111500},
    {"epoch": 0.82, "grad_norm": 2.8927907943725586, "learning_rate": 3.630029747925474e-05, "loss": 1.8761, "step": 112000},
    {"epoch": 0.83, "grad_norm": 2.612518787384033, "learning_rate": 3.623913809300141e-05, "loss": 1.8744, "step": 112500},
    {"epoch": 0.83, "grad_norm": 2.7625935077667236, "learning_rate": 3.617797870674808e-05, "loss": 1.8636, "step": 113000},
    {"epoch": 0.83, "grad_norm": 2.535382032394409, "learning_rate": 3.6116819320494755e-05, "loss": 1.8683, "step": 113500},
    {"epoch": 0.84, "grad_norm": 2.575298547744751, "learning_rate": 3.605565993424143e-05, "loss": 1.8657, "step": 114000},
    {"epoch": 0.84, "grad_norm": 2.6413776874542236, "learning_rate": 3.59945005479881e-05, "loss": 1.8571, "step": 114500},
    {"epoch": 0.84, "grad_norm": 2.675283908843994, "learning_rate": 3.593334116173477e-05, "loss": 1.8575, "step": 115000},
    {"epoch": 0.85, "grad_norm": 2.7104618549346924, "learning_rate": 3.587218177548145e-05, "loss": 1.8555, "step": 115500},
    {"epoch": 0.85, "grad_norm": 2.7391483783721924, "learning_rate": 3.581102238922812e-05, "loss": 1.8555, "step": 116000},
    {"epoch": 0.86, "grad_norm": 2.561523675918579, "learning_rate": 3.5749863002974793e-05, "loss": 1.8475, "step": 116500},
    {"epoch": 0.86, "grad_norm": 2.590790033340454, "learning_rate": 3.568870361672147e-05, "loss": 1.8425, "step": 117000},
    {"epoch": 0.86, "grad_norm": 2.889119863510132, "learning_rate": 3.562754423046814e-05, "loss": 1.8491, "step": 117500},
    {"epoch": 0.87, "grad_norm": 2.77632212638855, "learning_rate": 3.556638484421481e-05, "loss": 1.8469, "step": 118000},
    {"epoch": 0.87, "grad_norm": 2.720357894897461, "learning_rate": 3.550522545796149e-05, "loss": 1.8462, "step": 118500},
    {"epoch": 0.87, "grad_norm": 2.8213210105895996, "learning_rate": 3.544406607170816e-05, "loss": 1.8399, "step": 119000},
    {"epoch": 0.88, "grad_norm": 2.834599733352661, "learning_rate": 3.538290668545483e-05, "loss": 1.8429, "step": 119500},
    {"epoch": 0.88, "grad_norm": 2.578364133834839, "learning_rate": 3.5321747299201506e-05, "loss": 1.8386, "step": 120000},
    {"epoch": 0.88, "grad_norm": 2.6725339889526367, "learning_rate": 3.526058791294818e-05, "loss": 1.85, "step": 120500},
    {"epoch": 0.89, "grad_norm": 2.5288286209106445, "learning_rate": 3.519942852669485e-05, "loss": 1.831, "step": 121000},
    {"epoch": 0.89, "grad_norm": 2.4805219173431396, "learning_rate": 3.513826914044152e-05, "loss": 1.836, "step": 121500},
    {"epoch": 0.9, "grad_norm": 2.6729605197906494, "learning_rate": 3.50771097541882e-05, "loss": 1.8297, "step": 122000},
    {"epoch": 0.9, "grad_norm": 2.5666863918304443, "learning_rate": 3.501595036793487e-05, "loss": 1.8231, "step": 122500},
    {"epoch": 0.9, "grad_norm": 2.5059523582458496, "learning_rate": 3.495479098168154e-05, "loss": 1.8296, "step": 123000},
    {"epoch": 0.91, "grad_norm": 2.758755922317505, "learning_rate": 3.489363159542822e-05, "loss": 1.8218, "step": 123500},
    {"epoch": 0.91, "grad_norm": 2.5374386310577393, "learning_rate": 3.483247220917489e-05, "loss": 1.8234, "step": 124000},
    {"epoch": 0.91, "grad_norm": 2.5575644969940186, "learning_rate": 3.477131282292156e-05, "loss": 1.8158, "step": 124500},
    {"epoch": 0.92, "grad_norm": 2.567166328430176, "learning_rate": 3.4710153436668236e-05, "loss": 1.8179, "step": 125000},
    {"epoch": 0.92, "grad_norm": 2.4243805408477783, "learning_rate": 3.464899405041491e-05, "loss": 1.82, "step": 125500},
    {"epoch": 0.92, "grad_norm": 2.665632486343384, "learning_rate": 3.458783466416158e-05, "loss": 1.8151, "step": 126000},
    {"epoch": 0.93, "grad_norm": 2.90759539604187, "learning_rate": 3.452667527790825e-05, "loss": 1.8109, "step": 126500},
    {"epoch": 0.93, "grad_norm": 2.6775169372558594, "learning_rate": 3.4465515891654926e-05, "loss": 1.8199, "step": 127000},
    {"epoch": 0.94, "grad_norm": 2.430788516998291, "learning_rate": 3.44043565054016e-05, "loss": 1.8128, "step": 127500},
    {"epoch": 0.94, "grad_norm": 2.9997665882110596, "learning_rate": 3.4343197119148274e-05, "loss": 1.8106, "step": 128000},
    {"epoch": 0.94, "grad_norm": 2.672255039215088, "learning_rate": 3.428203773289494e-05, "loss": 1.8141, "step": 128500},
    {"epoch": 0.95, "grad_norm": 2.4398484230041504, "learning_rate": 3.4220878346641616e-05, "loss": 1.8036, "step": 129000},
    {"epoch": 0.95, "grad_norm": 2.897477149963379, "learning_rate": 3.415971896038829e-05, "loss": 1.8006, "step": 129500},
    {"epoch": 0.95, "grad_norm": 2.7050111293792725, "learning_rate": 3.4098559574134965e-05, "loss": 1.803, "step": 130000},
    {"epoch": 0.96, "grad_norm": 2.503981828689575, "learning_rate": 3.403740018788163e-05, "loss": 1.7992, "step": 130500},
    {"epoch": 0.96, "grad_norm": 2.9682281017303467, "learning_rate": 3.397624080162831e-05, "loss": 1.796, "step": 131000},
    {"epoch": 0.97, "grad_norm": 3.1613168716430664, "learning_rate": 3.391508141537498e-05, "loss": 1.8083, "step": 131500},
    {"epoch": 0.97, "grad_norm": 2.6427714824676514, "learning_rate": 3.3853922029121655e-05, "loss": 1.7968, "step": 132000},
    {"epoch": 0.97, "grad_norm": 2.6238105297088623, "learning_rate": 3.379276264286832e-05, "loss": 1.7887, "step": 132500},
    {"epoch": 0.98, "grad_norm": 2.566740036010742, "learning_rate": 3.3731603256615e-05, "loss": 1.7979, "step": 133000},
    {"epoch": 0.98, "grad_norm": 2.6818795204162598, "learning_rate": 3.367044387036167e-05, "loss": 1.7856, "step": 133500},
    {"epoch": 0.98, "grad_norm": 2.7290897369384766, "learning_rate": 3.3609284484108345e-05, "loss": 1.785, "step": 134000},
    {"epoch": 0.99, "grad_norm": 2.8657619953155518, "learning_rate": 3.354812509785502e-05, "loss": 1.7925, "step": 134500},
    {"epoch": 0.99, "grad_norm": 2.4724438190460205, "learning_rate": 3.3486965711601694e-05, "loss": 1.7854, "step": 135000},
    {"epoch": 0.99, "grad_norm": 2.658123016357422, "learning_rate": 3.342580632534836e-05, "loss": 1.7904, "step": 135500},
    {"epoch": 1.0, "grad_norm": 2.828024387359619, "learning_rate": 3.336464693909504e-05, "loss": 1.7768, "step": 136000},
    {"epoch": 1.0, "grad_norm": 2.7401976585388184, "learning_rate": 3.330348755284171e-05, "loss": 1.7808, "step": 136500},
    {"epoch": 1.01, "grad_norm": 2.651334524154663, "learning_rate": 3.3242328166588384e-05, "loss": 1.7769, "step": 137000},
    {"epoch": 1.01, "grad_norm": 2.9663844108581543, "learning_rate": 3.318116878033506e-05, "loss": 1.7685, "step": 137500},
    {"epoch": 1.01, "grad_norm": 2.6409363746643066, "learning_rate": 3.312000939408173e-05, "loss": 1.7819, "step": 138000},
    {"epoch": 1.02, "grad_norm": 2.4980876445770264, "learning_rate": 3.30588500078284e-05, "loss": 1.7781, "step": 138500},
    {"epoch": 1.02, "grad_norm": 2.583472967147827, "learning_rate": 3.299769062157508e-05, "loss": 1.7642, "step": 139000},
    {"epoch": 1.02, "grad_norm": 2.7035281658172607, "learning_rate": 3.293653123532175e-05, "loss": 1.7707, "step": 139500},
    {"epoch": 1.03, "grad_norm": 2.647327184677124, "learning_rate": 3.287537184906842e-05, "loss": 1.77, "step": 140000},
    {"epoch": 1.03, "grad_norm": 2.6016039848327637, "learning_rate": 3.28142124628151e-05, "loss": 1.7689, "step": 140500},
    {"epoch": 1.03, "grad_norm": 2.871412515640259, "learning_rate": 3.275305307656177e-05, "loss": 1.7614, "step": 141000},
    {"epoch": 1.04, "grad_norm": 2.5391454696655273, "learning_rate": 3.269189369030844e-05, "loss": 1.7666, "step": 141500},
    {"epoch": 1.04, "grad_norm": 2.7399189472198486, "learning_rate": 3.263073430405511e-05, "loss": 1.7698, "step": 142000},
    {"epoch": 1.05, "grad_norm": 2.550523281097412, "learning_rate": 3.256957491780179e-05, "loss": 1.7661, "step": 142500},
    {"epoch": 1.05, "grad_norm": 2.7033884525299072, "learning_rate": 3.250841553154846e-05, "loss": 1.7576, "step": 143000},
    {"epoch": 1.05, "grad_norm": 2.712890386581421, "learning_rate": 3.244725614529513e-05, "loss": 1.759, "step": 143500},
    {"epoch": 1.06, "grad_norm": 2.7690348625183105, "learning_rate": 3.23860967590418e-05, "loss": 1.757, "step": 144000},
    {"epoch": 1.06, "grad_norm": 2.5629446506500244, "learning_rate": 3.232493737278848e-05, "loss": 1.7593, "step": 144500},
    {"epoch": 1.06, "grad_norm": 2.5981359481811523, "learning_rate": 3.226377798653515e-05, "loss": 1.7506, "step": 145000},
    {"epoch": 1.07, "grad_norm": 2.465391159057617, "learning_rate": 3.2202618600281826e-05, "loss": 1.7558, "step": 145500},
    {"epoch": 1.07, "grad_norm": 3.0468790531158447, "learning_rate": 3.2141459214028493e-05, "loss": 1.7559, "step": 146000},
    {"epoch": 1.08, "grad_norm": 2.5434939861297607, "learning_rate": 3.208029982777517e-05, "loss": 1.7507, "step": 146500},
    {"epoch": 1.08, "grad_norm": 2.479449987411499, "learning_rate": 3.201914044152184e-05, "loss": 1.7457, "step": 147000},
    {"epoch": 1.08, "grad_norm": 2.621965169906616, "learning_rate": 3.1957981055268516e-05, "loss": 1.7513, "step": 147500},
    {"epoch": 1.09, "grad_norm": 2.6055707931518555, "learning_rate": 3.1896821669015184e-05, "loss": 1.7427, "step": 148000},
    {"epoch": 1.09, "grad_norm": 2.2937374114990234, "learning_rate": 3.1835662282761865e-05, "loss": 1.7524, "step": 148500},
    {"epoch": 1.09, "grad_norm": 2.7363667488098145, "learning_rate": 3.177450289650853e-05, "loss": 1.7492, "step": 149000},
    {"epoch": 1.1, "grad_norm": 2.660330057144165, "learning_rate": 3.1713343510255207e-05, "loss": 1.7405, "step": 149500},
    {"epoch": 1.1, "grad_norm": 2.589137077331543, "learning_rate": 3.165218412400188e-05, "loss": 1.7389, "step": 150000},
    {"epoch": 1.1, "grad_norm": 2.7419252395629883, "learning_rate": 3.1591024737748555e-05, "loss": 1.7478, "step": 150500},
    {"epoch": 1.11, "grad_norm": 2.6772820949554443, "learning_rate": 3.152986535149522e-05, "loss": 1.7365, "step": 151000},
    {"epoch": 1.11, "grad_norm": 2.765460968017578, "learning_rate": 3.1468705965241904e-05, "loss": 1.7399, "step": 151500},
    {"epoch": 1.12, "grad_norm": 2.5273730754852295, "learning_rate": 3.140754657898857e-05, "loss": 1.7336, "step": 152000},
    {"epoch": 1.12, "grad_norm": 2.6962389945983887, "learning_rate": 3.1346387192735245e-05, "loss": 1.7347, "step": 152500},
    {"epoch": 1.12, "grad_norm": 2.6664233207702637, "learning_rate": 3.128522780648192e-05, "loss": 1.7212, "step": 153000},
    {"epoch": 1.13, "grad_norm": 2.923704147338867, "learning_rate": 3.1224068420228594e-05, "loss": 1.7327, "step": 153500},
    {"epoch": 1.13, "grad_norm": 2.7295753955841064, "learning_rate": 3.116290903397526e-05, "loss": 1.7304, "step": 154000},
    {"epoch": 1.13, "grad_norm": 2.6523427963256836, "learning_rate": 3.110174964772194e-05, "loss": 1.7295, "step": 154500},
    {"epoch": 1.14, "grad_norm": 2.544809341430664, "learning_rate": 3.104059026146861e-05, "loss": 1.7303, "step": 155000},
    {"epoch": 1.14, "grad_norm": 2.608091354370117, "learning_rate": 3.0979430875215284e-05, "loss": 1.7288, "step": 155500},
    {"epoch": 1.14, "grad_norm": 2.6781680583953857, "learning_rate": 3.091827148896195e-05, "loss": 1.7333, "step": 156000},
    {"epoch": 1.15, "grad_norm": 2.5772953033447266, "learning_rate": 3.085711210270863e-05, "loss": 1.7313, "step": 156500},
    {"epoch": 1.15, "grad_norm": 2.6381750106811523, "learning_rate": 3.07959527164553e-05, "loss": 1.723, "step": 157000},
    {"epoch": 1.16, "grad_norm": 2.609584093093872, "learning_rate": 3.0734793330201974e-05, "loss": 1.7317, "step": 157500},
    {"epoch": 1.16, "grad_norm": 2.556508779525757, "learning_rate": 3.067363394394865e-05, "loss": 1.7168, "step": 158000},
    {"epoch": 1.16, "grad_norm": 2.4912405014038086, "learning_rate": 3.061247455769532e-05, "loss": 1.7221, "step": 158500},
    {"epoch": 1.17, "grad_norm": 3.072758436203003, "learning_rate": 3.055131517144199e-05, "loss": 1.7193, "step": 159000},
    {"epoch": 1.17, "grad_norm": 2.5012080669403076, "learning_rate": 3.0490155785188668e-05, "loss": 1.7187, "step": 159500},
    {"epoch": 1.17, "grad_norm": 2.6786646842956543, "learning_rate": 3.042899639893534e-05, "loss": 1.7099, "step": 160000},
    {"epoch": 1.18, "grad_norm": 2.6225085258483887, "learning_rate": 3.036783701268201e-05, "loss": 1.7135, "step": 160500},
    {"epoch": 1.18, "grad_norm": 2.6484742164611816, "learning_rate": 3.0306677626428687e-05, "loss": 1.7086, "step": 161000},
    {"epoch": 1.19, "grad_norm": 2.948645830154419, "learning_rate": 3.0245518240175358e-05, "loss": 1.7098, "step": 161500},
    {"epoch": 1.19, "grad_norm": 2.766124725341797, "learning_rate": 3.018435885392203e-05, "loss": 1.7072, "step": 162000},
    {"epoch": 1.19, "grad_norm": 2.8391172885894775, "learning_rate": 3.0123199467668707e-05, "loss": 1.7116, "step": 162500},
    {"epoch": 1.2, "grad_norm": 2.5152761936187744, "learning_rate": 3.0062040081415378e-05, "loss": 1.7073, "step": 163000},
    {"epoch": 1.2, "grad_norm": 2.719190835952759, "learning_rate": 3.000088069516205e-05, "loss": 1.7017, "step": 163500},
    {"epoch": 1.2, "grad_norm": 2.5250284671783447, "learning_rate": 2.9939721308908726e-05, "loss": 1.7055, "step": 164000},
    {"epoch": 1.21, "grad_norm": 3.0202980041503906, "learning_rate": 2.9878561922655397e-05, "loss": 1.7041, "step": 164500},
    {"epoch": 1.21, "grad_norm": 2.7549800872802734, "learning_rate": 2.9817402536402068e-05, "loss": 1.6984, "step": 165000},
    {"epoch": 1.21, "grad_norm": 2.891324281692505, "learning_rate": 2.9756243150148742e-05, "loss": 1.7033, "step": 165500},
    {"epoch": 1.22, "grad_norm": 3.0068359375, "learning_rate": 2.9695083763895416e-05, "loss": 1.6969, "step": 166000},
    {"epoch": 1.22, "grad_norm": 2.5715763568878174, "learning_rate": 2.9633924377642087e-05, "loss": 1.698, "step": 166500},
    {"epoch": 1.23, "grad_norm": 2.5422136783599854, "learning_rate": 2.9572764991388758e-05, "loss": 1.6987, "step": 167000},
    {"epoch": 1.23, "grad_norm": 2.5007290840148926, "learning_rate": 2.9511605605135432e-05, "loss": 1.6993, "step": 167500},
    {"epoch": 1.23, "grad_norm": 2.5966525077819824, "learning_rate": 2.9450446218882107e-05, "loss": 1.6923, "step": 168000},
    {"epoch": 1.24, "grad_norm": 2.5626885890960693, "learning_rate": 2.9389286832628778e-05, "loss": 1.6964, "step": 168500},
    {"epoch": 1.24, "grad_norm": 2.76600980758667, "learning_rate": 2.9328127446375452e-05, "loss": 1.6875, "step": 169000},
    {"epoch": 1.24, "grad_norm": 2.922257661819458, "learning_rate": 2.9266968060122123e-05, "loss": 1.6985, "step": 169500},
    {"epoch": 1.25, "grad_norm": 2.641627311706543, "learning_rate": 2.9205808673868794e-05, "loss": 1.6888, "step": 170000},
    {"epoch": 1.25, "grad_norm": 2.6927127838134766, "learning_rate": 2.914464928761547e-05, "loss": 1.6924, "step": 170500},
    {"epoch": 1.25, "grad_norm": 2.4802746772766113, "learning_rate": 2.9083489901362142e-05, "loss": 1.6946, "step": 171000},
    {"epoch": 1.26, "grad_norm": 2.5318949222564697, "learning_rate": 2.9022330515108813e-05, "loss": 1.6894, "step": 171500},
    {"epoch": 1.26, "grad_norm": 2.7798898220062256, "learning_rate": 2.896117112885549e-05, "loss": 1.6861, "step": 172000},
    {"epoch": 1.27, "grad_norm": 2.7204222679138184, "learning_rate": 2.890001174260216e-05, "loss": 1.6839, "step": 172500},
    {"epoch": 1.27, "grad_norm": 2.8077077865600586, "learning_rate": 2.8838852356348832e-05, "loss": 1.684, "step": 173000},
    {"epoch": 1.27, "grad_norm": 2.565995454788208, "learning_rate": 2.877769297009551e-05, "loss": 1.682, "step": 173500},
    {"epoch": 1.28, "grad_norm": 2.7671403884887695, "learning_rate": 2.871653358384218e-05, "loss": 1.6782, "step": 174000},
    {"epoch": 1.28, "grad_norm": 2.6801698207855225, "learning_rate": 2.8655374197588852e-05, "loss": 1.6853, "step": 174500},
    {"epoch": 1.28, "grad_norm": 2.810450553894043, "learning_rate": 2.859421481133553e-05, "loss": 1.6841, "step": 175000},
    {"epoch": 1.29, "grad_norm": 2.797452211380005, "learning_rate": 2.85330554250822e-05, "loss": 1.6753, "step": 175500},
    {"epoch": 1.29, "grad_norm": 2.5832931995391846, "learning_rate": 2.847189603882887e-05, "loss": 1.6905, "step": 176000},
    {"epoch": 1.3, "grad_norm": 2.8013391494750977, "learning_rate": 2.841073665257555e-05, "loss": 1.673, "step": 176500},
    {"epoch": 1.3, "grad_norm": 2.5176284313201904, "learning_rate": 2.834957726632222e-05, "loss": 1.6742, "step": 177000},
    {"epoch": 1.3, "grad_norm": 2.7248387336730957, "learning_rate": 2.828841788006889e-05, "loss": 1.6663, "step": 177500},
    {"epoch": 1.31, "grad_norm": 3.006441831588745, "learning_rate": 2.822725849381556e-05, "loss": 1.6762, "step": 178000},
    {"epoch": 1.31, "grad_norm": 2.754427671432495, "learning_rate": 2.816609910756224e-05, "loss": 1.6711, "step": 178500},
    {"epoch": 1.31, "grad_norm": 2.6871962547302246, "learning_rate": 2.810493972130891e-05, "loss": 1.6749, "step": 179000},
    {"epoch": 1.32, "grad_norm": 2.7660982608795166, "learning_rate": 2.804378033505558e-05, "loss": 1.6694, "step": 179500},
    {"epoch": 1.32, "grad_norm": 2.5820930004119873, "learning_rate": 2.798262094880226e-05, "loss": 1.68, "step": 180000},
    {"epoch": 1.32, "grad_norm": 2.50264048576355, "learning_rate": 2.792146156254893e-05, "loss": 1.6745, "step": 180500},
    {"epoch": 1.33, "grad_norm": 2.759570837020874, "learning_rate": 2.78603021762956e-05, "loss": 1.6602, "step": 181000},
    {"epoch": 1.33, "grad_norm": 2.6648566722869873, "learning_rate": 2.7799142790042278e-05, "loss": 1.67, "step": 181500},
    {"epoch": 1.34, "grad_norm": 2.7140583992004395, "learning_rate": 2.773798340378895e-05, "loss": 1.6645, "step": 182000},
    {"epoch": 1.34, "grad_norm": 2.4393863677978516, "learning_rate": 2.767682401753562e-05, "loss": 1.665, "step": 182500},
    {"epoch": 1.34, "grad_norm": 2.66521954536438, "learning_rate": 2.7615664631282294e-05, "loss": 1.6694, "step": 183000},
    {"epoch": 1.35, "grad_norm": 2.9926490783691406, "learning_rate": 2.7554505245028965e-05, "loss": 1.669, "step": 183500},
    {"epoch": 1.35, "grad_norm": 2.611051321029663, "learning_rate": 2.749334585877564e-05, "loss": 1.6606, "step": 184000},
    {"epoch": 1.35, "grad_norm": 2.6490185260772705, "learning_rate": 2.7432186472522313e-05, "loss": 1.6639, "step": 184500},
    {"epoch": 1.36, "grad_norm": 2.7830920219421387, "learning_rate": 2.7371027086268984e-05, "loss": 1.6584, "step": 185000},
    {"epoch": 1.36, "grad_norm": 2.776111602783203, "learning_rate": 2.7309867700015655e-05, "loss": 1.6564, "step": 185500},
    {"epoch": 1.37, "grad_norm": 2.5335960388183594, "learning_rate": 2.7248708313762333e-05, "loss": 1.6553, "step": 186000},
    {"epoch": 1.37, "grad_norm": 2.585458755493164, "learning_rate": 2.7187548927509004e-05, "loss": 1.653, "step": 186500},
    {"epoch": 1.37, "grad_norm": 2.851865768432617, "learning_rate": 2.7126389541255674e-05, "loss": 1.6654, "step": 187000},
    {"epoch": 1.38, "grad_norm": 2.649545907974243, "learning_rate": 2.7065230155002352e-05, "loss": 1.657, "step": 187500},
    {"epoch": 1.38, "grad_norm": 2.552381753921509, "learning_rate": 2.7004070768749023e-05, "loss": 1.6569, "step": 188000},
    {"epoch": 1.38, "grad_norm": 2.6055853366851807, "learning_rate": 2.6942911382495694e-05, "loss": 1.6522, "step": 188500},
    {"epoch": 1.39, "grad_norm": 2.848911762237549, "learning_rate": 2.6881751996242365e-05, "loss": 1.6435, "step": 189000},
    {"epoch": 1.39, "grad_norm": 2.8290162086486816, "learning_rate": 2.6820592609989042e-05, "loss": 1.6457, "step": 189500},
    {"epoch": 1.39, "grad_norm": 2.682929277420044, "learning_rate": 2.6759433223735713e-05, "loss": 1.6521, "step": 190000},
    {"epoch": 1.4, "grad_norm": 2.7035279273986816, "learning_rate": 2.6698273837482384e-05, "loss": 1.652, "step": 190500},
    {"epoch": 1.4, "grad_norm": 2.6156182289123535, "learning_rate": 2.6637114451229062e-05, "loss": 1.6421, "step": 191000},
    {"epoch": 1.41, "grad_norm": 2.7647957801818848, "learning_rate": 2.6575955064975733e-05, "loss": 1.6496, "step": 191500},
    {"epoch": 1.41, "grad_norm": 2.5763864517211914, "learning_rate": 2.6514795678722403e-05, "loss": 1.6454, "step": 192000},
    {"epoch": 1.41, "grad_norm": 2.6585116386413574, "learning_rate": 2.645363629246908e-05, "loss": 1.6443, "step": 192500},
    {"epoch": 1.42, "grad_norm": 2.7471868991851807, "learning_rate": 2.6392476906215752e-05, "loss": 1.6494, "step": 193000},
    {"epoch": 1.42, "grad_norm": 2.7787129878997803, "learning_rate": 2.6331317519962423e-05, "loss": 1.6441, "step": 193500},
    {"epoch": 1.42, "grad_norm": 2.3297078609466553, "learning_rate": 2.62701581337091e-05, "loss": 1.6462, "step": 194000},
    {"epoch": 1.43, "grad_norm": 2.8310294151306152, "learning_rate": 2.620899874745577e-05, "loss": 1.6473, "step": 194500},
    {"epoch": 1.43, "grad_norm": 2.6443045139312744, "learning_rate": 2.6147839361202442e-05, "loss": 1.6468, "step": 195000},
    {"epoch": 1.43, "grad_norm": 2.5064589977264404, "learning_rate": 2.608667997494912e-05, "loss": 1.6385, "step": 195500},
    {"epoch": 1.44, "grad_norm": 2.6140296459198, "learning_rate": 2.602552058869579e-05, "loss": 1.6357, "step": 196000},
    {"epoch": 1.44, "grad_norm": 2.461705207824707, "learning_rate": 2.596436120244246e-05, "loss": 1.6401, "step": 196500},
    {"epoch": 1.45, "grad_norm": 2.782813787460327, "learning_rate": 2.590320181618914e-05, "loss": 1.6426, "step": 197000},
    {"epoch": 1.45, "grad_norm": 2.5911970138549805, "learning_rate": 2.584204242993581e-05, "loss": 1.6409, "step": 197500},
    {"epoch": 1.45, "grad_norm": 2.593752384185791, "learning_rate": 2.578088304368248e-05, "loss": 1.6345, "step": 198000},
    {"epoch": 1.46, "grad_norm": 2.9096670150756836, "learning_rate": 2.5719723657429155e-05, "loss": 1.6351, "step": 198500},
    {"epoch": 1.46, "grad_norm": 2.9551987648010254, "learning_rate": 2.5658564271175826e-05, "loss": 1.6322, "step": 199000},
    {"epoch": 1.46, "grad_norm": 2.6173858642578125, "learning_rate": 2.55974048849225e-05,
|
"loss": 1.6307, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.695869207382202, |
|
"learning_rate": 2.5536245498669175e-05, |
|
"loss": 1.6264, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 2.711869955062866, |
|
"learning_rate": 2.5475086112415846e-05, |
|
"loss": 1.6283, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.570518732070923, |
|
"learning_rate": 2.5413926726162516e-05, |
|
"loss": 1.6323, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.7032439708709717, |
|
"learning_rate": 2.535276733990919e-05, |
|
"loss": 1.6339, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.7625739574432373, |
|
"learning_rate": 2.5291607953655865e-05, |
|
"loss": 1.6279, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.829380989074707, |
|
"learning_rate": 2.5230448567402536e-05, |
|
"loss": 1.6263, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.499410629272461, |
|
"learning_rate": 2.5169289181149207e-05, |
|
"loss": 1.6201, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 2.6228952407836914, |
|
"learning_rate": 2.5108129794895884e-05, |
|
"loss": 1.6244, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.609665870666504, |
|
"learning_rate": 2.5046970408642555e-05, |
|
"loss": 1.6319, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.8935351371765137, |
|
"learning_rate": 2.498581102238923e-05, |
|
"loss": 1.6216, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.7964882850646973, |
|
"learning_rate": 2.49246516361359e-05, |
|
"loss": 1.621, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.465930938720703, |
|
"learning_rate": 2.4863492249882575e-05, |
|
"loss": 1.6332, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 2.9245595932006836, |
|
"learning_rate": 2.480233286362925e-05, |
|
"loss": 1.6239, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.548551321029663, |
|
"learning_rate": 2.474117347737592e-05, |
|
"loss": 1.6148, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.6611809730529785, |
|
"learning_rate": 2.4680014091122594e-05, |
|
"loss": 1.6216, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 2.6596455574035645, |
|
"learning_rate": 2.4618854704869268e-05, |
|
"loss": 1.6057, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.645918607711792, |
|
"learning_rate": 2.455769531861594e-05, |
|
"loss": 1.613, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.6304965019226074, |
|
"learning_rate": 2.4496535932362613e-05, |
|
"loss": 1.6148, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 2.9523110389709473, |
|
"learning_rate": 2.4435376546109284e-05, |
|
"loss": 1.615, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.6215062141418457, |
|
"learning_rate": 2.437421715985596e-05, |
|
"loss": 1.623, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.7585043907165527, |
|
"learning_rate": 2.4313057773602633e-05, |
|
"loss": 1.6228, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 2.626432418823242, |
|
"learning_rate": 2.4251898387349304e-05, |
|
"loss": 1.6152, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.481905221939087, |
|
"learning_rate": 2.4190739001095978e-05, |
|
"loss": 1.6041, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 2.5762555599212646, |
|
"learning_rate": 2.4129579614842652e-05, |
|
"loss": 1.6117, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.6616873741149902, |
|
"learning_rate": 2.4068420228589323e-05, |
|
"loss": 1.6123, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.6225013732910156, |
|
"learning_rate": 2.4007260842335997e-05, |
|
"loss": 1.6096, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.6868574619293213, |
|
"learning_rate": 2.394610145608267e-05, |
|
"loss": 1.6103, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.8061540126800537, |
|
"learning_rate": 2.3884942069829342e-05, |
|
"loss": 1.6051, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.733086585998535, |
|
"learning_rate": 2.3823782683576017e-05, |
|
"loss": 1.6069, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 2.596497058868408, |
|
"learning_rate": 2.3762623297322688e-05, |
|
"loss": 1.602, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.496598243713379, |
|
"learning_rate": 2.3701463911069362e-05, |
|
"loss": 1.6107, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 2.4470176696777344, |
|
"learning_rate": 2.3640304524816033e-05, |
|
"loss": 1.6021, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.589895486831665, |
|
"learning_rate": 2.3579145138562707e-05, |
|
"loss": 1.6029, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.7477266788482666, |
|
"learning_rate": 2.3517985752309378e-05, |
|
"loss": 1.6011, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 2.7007384300231934, |
|
"learning_rate": 2.3456826366056052e-05, |
|
"loss": 1.6023, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.6846890449523926, |
|
"learning_rate": 2.3395666979802723e-05, |
|
"loss": 1.592, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.69858455657959, |
|
"learning_rate": 2.3334507593549397e-05, |
|
"loss": 1.6047, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.6157824993133545, |
|
"learning_rate": 2.327334820729607e-05, |
|
"loss": 1.5992, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.616908073425293, |
|
"learning_rate": 2.3212188821042742e-05, |
|
"loss": 1.596, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.7912027835845947, |
|
"learning_rate": 2.3151029434789417e-05, |
|
"loss": 1.604, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 2.6151885986328125, |
|
"learning_rate": 2.308987004853609e-05, |
|
"loss": 1.5953, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.8206794261932373, |
|
"learning_rate": 2.3028710662282762e-05, |
|
"loss": 1.602, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.6507091522216797, |
|
"learning_rate": 2.2967551276029436e-05, |
|
"loss": 1.5903, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.752617359161377, |
|
"learning_rate": 2.2906391889776107e-05, |
|
"loss": 1.5971, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 2.8615899085998535, |
|
"learning_rate": 2.284523250352278e-05, |
|
"loss": 1.596, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 3.0563414096832275, |
|
"learning_rate": 2.2784073117269455e-05, |
|
"loss": 1.5993, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.715120553970337, |
|
"learning_rate": 2.2722913731016126e-05, |
|
"loss": 1.5965, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.8256382942199707, |
|
"learning_rate": 2.26617543447628e-05, |
|
"loss": 1.5883, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 2.8050873279571533, |
|
"learning_rate": 2.2600594958509475e-05, |
|
"loss": 1.5914, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.773902416229248, |
|
"learning_rate": 2.2539435572256146e-05, |
|
"loss": 1.5884, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.7655787467956543, |
|
"learning_rate": 2.247827618600282e-05, |
|
"loss": 1.5965, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 2.7787845134735107, |
|
"learning_rate": 2.2417116799749494e-05, |
|
"loss": 1.5872, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.73518705368042, |
|
"learning_rate": 2.2355957413496165e-05, |
|
"loss": 1.588, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 2.743821382522583, |
|
"learning_rate": 2.229479802724284e-05, |
|
"loss": 1.5871, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.444350242614746, |
|
"learning_rate": 2.223363864098951e-05, |
|
"loss": 1.5802, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.597966194152832, |
|
"learning_rate": 2.2172479254736185e-05, |
|
"loss": 1.5872, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 2.7924256324768066, |
|
"learning_rate": 2.211131986848286e-05, |
|
"loss": 1.5877, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.5780932903289795, |
|
"learning_rate": 2.205016048222953e-05, |
|
"loss": 1.583, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.9303081035614014, |
|
"learning_rate": 2.1989001095976204e-05, |
|
"loss": 1.5901, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 2.601661443710327, |
|
"learning_rate": 2.1927841709722878e-05, |
|
"loss": 1.5837, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.6851816177368164, |
|
"learning_rate": 2.186668232346955e-05, |
|
"loss": 1.5736, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 2.592660903930664, |
|
"learning_rate": 2.1805522937216223e-05, |
|
"loss": 1.5797, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.876065492630005, |
|
"learning_rate": 2.1744363550962894e-05, |
|
"loss": 1.5851, |
|
"step": 231000 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.507368564605713, |
|
"learning_rate": 2.168320416470957e-05, |
|
"loss": 1.5868, |
|
"step": 231500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 2.5661709308624268, |
|
"learning_rate": 2.162204477845624e-05, |
|
"loss": 1.5826, |
|
"step": 232000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.698857545852661, |
|
"learning_rate": 2.1560885392202914e-05, |
|
"loss": 1.5739, |
|
"step": 232500 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.5845346450805664, |
|
"learning_rate": 2.1499726005949584e-05, |
|
"loss": 1.5769, |
|
"step": 233000 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 2.7823565006256104, |
|
"learning_rate": 2.143856661969626e-05, |
|
"loss": 1.5781, |
|
"step": 233500 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.675457239151001, |
|
"learning_rate": 2.137740723344293e-05, |
|
"loss": 1.5773, |
|
"step": 234000 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.789083957672119, |
|
"learning_rate": 2.1316247847189604e-05, |
|
"loss": 1.5662, |
|
"step": 234500 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.5719103813171387, |
|
"learning_rate": 2.1255088460936278e-05, |
|
"loss": 1.58, |
|
"step": 235000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.7980144023895264, |
|
"learning_rate": 2.119392907468295e-05, |
|
"loss": 1.5769, |
|
"step": 235500 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.6691505908966064, |
|
"learning_rate": 2.1132769688429623e-05, |
|
"loss": 1.5703, |
|
"step": 236000 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.839600086212158, |
|
"learning_rate": 2.1071610302176297e-05, |
|
"loss": 1.5714, |
|
"step": 236500 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.8428940773010254, |
|
"learning_rate": 2.101045091592297e-05, |
|
"loss": 1.5723, |
|
"step": 237000 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 2.5756494998931885, |
|
"learning_rate": 2.0949291529669643e-05, |
|
"loss": 1.5672, |
|
"step": 237500 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.4937775135040283, |
|
"learning_rate": 2.0888132143416314e-05, |
|
"loss": 1.5715, |
|
"step": 238000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.8386645317077637, |
|
"learning_rate": 2.0826972757162988e-05, |
|
"loss": 1.566, |
|
"step": 238500 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 2.9764533042907715, |
|
"learning_rate": 2.0765813370909662e-05, |
|
"loss": 1.5678, |
|
"step": 239000 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.5615928173065186, |
|
"learning_rate": 2.0704653984656333e-05, |
|
"loss": 1.5711, |
|
"step": 239500 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 2.431802988052368, |
|
"learning_rate": 2.0643494598403007e-05, |
|
"loss": 1.5632, |
|
"step": 240000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.691328287124634, |
|
"learning_rate": 2.058233521214968e-05, |
|
"loss": 1.5761, |
|
"step": 240500 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.833160161972046, |
|
"learning_rate": 2.0521175825896352e-05, |
|
"loss": 1.568, |
|
"step": 241000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 2.9443514347076416, |
|
"learning_rate": 2.0460016439643027e-05, |
|
"loss": 1.5644, |
|
"step": 241500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.6418864727020264, |
|
"learning_rate": 2.03988570533897e-05, |
|
"loss": 1.5644, |
|
"step": 242000 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.559652090072632, |
|
"learning_rate": 2.033769766713637e-05, |
|
"loss": 1.5588, |
|
"step": 242500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 2.376955509185791, |
|
"learning_rate": 2.0276538280883046e-05, |
|
"loss": 1.5659, |
|
"step": 243000 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 3.0132250785827637, |
|
"learning_rate": 2.0215378894629717e-05, |
|
"loss": 1.56, |
|
"step": 243500 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.493617534637451, |
|
"learning_rate": 2.015421950837639e-05, |
|
"loss": 1.5602, |
|
"step": 244000 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 2.6484365463256836, |
|
"learning_rate": 2.0093060122123065e-05, |
|
"loss": 1.5646, |
|
"step": 244500 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.5682971477508545, |
|
"learning_rate": 2.0031900735869736e-05, |
|
"loss": 1.5622, |
|
"step": 245000 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.783363103866577, |
|
"learning_rate": 1.997074134961641e-05, |
|
"loss": 1.5568, |
|
"step": 245500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.5576345920562744, |
|
"learning_rate": 1.9909581963363085e-05, |
|
"loss": 1.558, |
|
"step": 246000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.3469157218933105, |
|
"learning_rate": 1.9848422577109756e-05, |
|
"loss": 1.562, |
|
"step": 246500 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 2.7063257694244385, |
|
"learning_rate": 1.978726319085643e-05, |
|
"loss": 1.5582, |
|
"step": 247000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 3.00256085395813, |
|
"learning_rate": 1.97261038046031e-05, |
|
"loss": 1.5574, |
|
"step": 247500 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.35555100440979, |
|
"learning_rate": 1.966494441834977e-05, |
|
"loss": 1.5567, |
|
"step": 248000 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 2.5847179889678955, |
|
"learning_rate": 1.9603785032096446e-05, |
|
"loss": 1.5647, |
|
"step": 248500 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.629279613494873, |
|
"learning_rate": 1.9542625645843117e-05, |
|
"loss": 1.552, |
|
"step": 249000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.6433770656585693, |
|
"learning_rate": 1.948146625958979e-05, |
|
"loss": 1.5547, |
|
"step": 249500 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 2.6378979682922363, |
|
"learning_rate": 1.9420306873336465e-05, |
|
"loss": 1.5549, |
|
"step": 250000 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.7272751331329346, |
|
"learning_rate": 1.9359147487083136e-05, |
|
"loss": 1.5496, |
|
"step": 250500 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 2.661400556564331, |
|
"learning_rate": 1.929798810082981e-05, |
|
"loss": 1.5597, |
|
"step": 251000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.76647686958313, |
|
"learning_rate": 1.9236828714576485e-05, |
|
"loss": 1.5559, |
|
"step": 251500 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.4355571269989014, |
|
"learning_rate": 1.9175669328323156e-05, |
|
"loss": 1.5512, |
|
"step": 252000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 2.503006935119629, |
|
"learning_rate": 1.911450994206983e-05, |
|
"loss": 1.5459, |
|
"step": 252500 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.4940273761749268, |
|
"learning_rate": 1.9053350555816504e-05, |
|
"loss": 1.5563, |
|
"step": 253000 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 3.0512688159942627, |
|
"learning_rate": 1.8992191169563175e-05, |
|
"loss": 1.5542, |
|
"step": 253500 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 2.811276912689209, |
|
"learning_rate": 1.893103178330985e-05, |
|
"loss": 1.5447, |
|
"step": 254000 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.565730571746826, |
|
"learning_rate": 1.8869872397056523e-05, |
|
"loss": 1.5446, |
|
"step": 254500 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 2.6504178047180176, |
|
"learning_rate": 1.8808713010803194e-05, |
|
"loss": 1.5506, |
|
"step": 255000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 3.0442628860473633, |
|
"learning_rate": 1.874755362454987e-05, |
|
"loss": 1.55, |
|
"step": 255500 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.6336920261383057, |
|
"learning_rate": 1.868639423829654e-05, |
|
"loss": 1.5424, |
|
"step": 256000 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 2.7758066654205322, |
|
"learning_rate": 1.8625234852043214e-05, |
|
"loss": 1.5479, |
|
"step": 256500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.818814992904663, |
|
"learning_rate": 1.8564075465789888e-05, |
|
"loss": 1.5486, |
|
"step": 257000 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.6956701278686523, |
|
"learning_rate": 1.850291607953656e-05, |
|
"loss": 1.5497, |
|
"step": 257500 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 2.7896413803100586, |
|
"learning_rate": 1.8441756693283233e-05, |
|
"loss": 1.5437, |
|
"step": 258000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.917079448699951, |
|
"learning_rate": 1.8380597307029907e-05, |
|
"loss": 1.5432, |
|
"step": 258500 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.761766195297241, |
|
"learning_rate": 1.8319437920776578e-05, |
|
"loss": 1.5413, |
|
"step": 259000 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.7666103839874268, |
|
"learning_rate": 1.8258278534523253e-05, |
|
"loss": 1.5396, |
|
"step": 259500 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.691253423690796, |
|
"learning_rate": 1.8197119148269927e-05, |
|
"loss": 1.5372, |
|
"step": 260000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 2.911930799484253, |
|
"learning_rate": 1.8135959762016598e-05, |
|
"loss": 1.5485, |
|
"step": 260500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.5208046436309814, |
|
"learning_rate": 1.8074800375763272e-05, |
|
"loss": 1.5438, |
|
"step": 261000 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.41379976272583, |
|
"learning_rate": 1.8013640989509943e-05, |
|
"loss": 1.5384, |
|
"step": 261500 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 2.636869430541992, |
|
"learning_rate": 1.7952481603256617e-05, |
|
"loss": 1.5477, |
|
"step": 262000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.6929407119750977, |
|
"learning_rate": 1.7891322217003288e-05, |
|
"loss": 1.5384, |
|
"step": 262500 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.849163055419922, |
|
"learning_rate": 1.7830162830749962e-05, |
|
"loss": 1.5394, |
|
"step": 263000 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 2.5682120323181152, |
|
"learning_rate": 1.7769003444496633e-05, |
|
"loss": 1.5353, |
|
"step": 263500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.5825769901275635, |
|
"learning_rate": 1.7707844058243307e-05, |
|
"loss": 1.535, |
|
"step": 264000 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.426283597946167, |
|
"learning_rate": 1.7646684671989978e-05, |
|
"loss": 1.5373, |
|
"step": 264500 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 2.706394910812378, |
|
"learning_rate": 1.7585525285736652e-05, |
|
"loss": 1.5358, |
|
"step": 265000 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.6370396614074707, |
|
"learning_rate": 1.7524365899483327e-05, |
|
"loss": 1.5391, |
|
"step": 265500 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 2.553217649459839, |
|
"learning_rate": 1.7463206513229998e-05, |
|
"loss": 1.5299, |
|
"step": 266000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.884148120880127, |
|
"learning_rate": 1.7402047126976672e-05, |
|
"loss": 1.5432, |
|
"step": 266500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.7331855297088623, |
|
"learning_rate": 1.7340887740723343e-05, |
|
"loss": 1.5329, |
|
"step": 267000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 2.841865062713623, |
|
"learning_rate": 1.7279728354470017e-05, |
|
"loss": 1.5319, |
|
"step": 267500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.463677406311035, |
|
"learning_rate": 1.721856896821669e-05, |
|
"loss": 1.5274, |
|
"step": 268000 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.7880847454071045, |
|
"learning_rate": 1.7157409581963362e-05, |
|
"loss": 1.5244, |
|
"step": 268500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 2.5323753356933594, |
|
"learning_rate": 1.7096250195710036e-05, |
|
"loss": 1.5343, |
|
"step": 269000 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.93086838722229, |
|
"learning_rate": 1.703509080945671e-05, |
|
"loss": 1.5253, |
|
"step": 269500 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 2.8919107913970947, |
|
"learning_rate": 1.697393142320338e-05, |
|
"loss": 1.5334, |
|
"step": 270000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.8613593578338623, |
|
"learning_rate": 1.6912772036950056e-05, |
|
"loss": 1.5342, |
|
"step": 270500 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.5317909717559814, |
|
"learning_rate": 1.685161265069673e-05, |
|
"loss": 1.5278, |
|
"step": 271000 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 2.832613706588745, |
|
"learning_rate": 1.67904532644434e-05, |
|
"loss": 1.5318, |
|
"step": 271500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.5811901092529297, |
|
"learning_rate": 1.6729293878190075e-05, |
|
"loss": 1.5338, |
|
"step": 272000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.640382766723633, |
|
"learning_rate": 1.6668134491936746e-05, |
|
"loss": 1.5278, |
|
"step": 272500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.777024745941162, |
|
"learning_rate": 1.660697510568342e-05, |
|
"loss": 1.5192, |
|
"step": 273000 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.546867609024048, |
|
"learning_rate": 1.6545815719430095e-05, |
|
"loss": 1.5203, |
|
"step": 273500 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.459458589553833, |
|
"learning_rate": 1.6484656333176765e-05, |
|
"loss": 1.5232, |
|
"step": 274000 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 2.6832683086395264, |
|
"learning_rate": 1.642349694692344e-05, |
|
"loss": 1.5133, |
|
"step": 274500 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 2.6847174167633057, |
|
"learning_rate": 1.6362337560670114e-05, |
|
"loss": 1.5202, |
|
"step": 275000 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 2.819836139678955, |
|
"learning_rate": 1.6301178174416785e-05, |
|
"loss": 1.5225, |
|
"step": 275500 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.4789822101593018, |
|
"learning_rate": 1.624001878816346e-05, |
|
"loss": 1.5185, |
|
"step": 276000 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.8469271659851074, |
|
"learning_rate": 1.6178859401910133e-05, |
|
"loss": 1.5281, |
|
"step": 276500 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 2.4741554260253906, |
|
"learning_rate": 1.6117700015656804e-05, |
|
"loss": 1.5187, |
|
"step": 277000 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.7348639965057373, |
|
"learning_rate": 1.605654062940348e-05, |
|
"loss": 1.519, |
|
"step": 277500 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.590632677078247, |
|
"learning_rate": 1.599538124315015e-05, |
|
"loss": 1.5242, |
|
"step": 278000 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 2.926156997680664, |
|
"learning_rate": 1.5934221856896824e-05, |
|
"loss": 1.5182, |
|
"step": 278500 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.3463704586029053, |
|
"learning_rate": 1.5873062470643494e-05, |
|
"loss": 1.5186, |
|
"step": 279000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.8778836727142334, |
|
"learning_rate": 1.581190308439017e-05, |
|
"loss": 1.5133, |
|
"step": 279500 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 2.7937684059143066, |
|
"learning_rate": 1.575074369813684e-05, |
|
"loss": 1.5204, |
|
"step": 280000 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.6967952251434326, |
|
"learning_rate": 1.5689584311883514e-05, |
|
"loss": 1.5238, |
|
"step": 280500 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 2.7939419746398926, |
|
"learning_rate": 1.5628424925630185e-05, |
|
"loss": 1.5162, |
|
"step": 281000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.4184165000915527, |
|
"learning_rate": 1.556726553937686e-05, |
|
"loss": 1.5083, |
|
"step": 281500 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.5736517906188965, |
|
"learning_rate": 1.5506106153123533e-05, |
|
"loss": 1.5225, |
|
"step": 282000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 2.775562286376953, |
|
"learning_rate": 1.5444946766870204e-05, |
|
"loss": 1.5107, |
|
"step": 282500 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.65218186378479, |
|
"learning_rate": 1.538378738061688e-05, |
|
"loss": 1.5101, |
|
"step": 283000 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.9510700702667236, |
|
"learning_rate": 1.532262799436355e-05, |
|
"loss": 1.5108, |
|
"step": 283500 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 2.663459300994873, |
|
"learning_rate": 1.5261468608110224e-05, |
|
"loss": 1.5039, |
|
"step": 284000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 2.621185541152954, |
|
"learning_rate": 1.5200309221856898e-05, |
|
"loss": 1.5044, |
|
"step": 284500 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 2.7597007751464844, |
|
"learning_rate": 1.5139149835603569e-05, |
|
"loss": 1.5123, |
|
"step": 285000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.9049315452575684, |
|
"learning_rate": 1.5077990449350243e-05, |
|
"loss": 1.5129, |
|
"step": 285500 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.7064170837402344, |
|
"learning_rate": 1.5016831063096917e-05, |
|
"loss": 1.5075, |
|
"step": 286000 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 2.8447062969207764, |
|
"learning_rate": 1.4955671676843588e-05, |
|
"loss": 1.5107, |
|
"step": 286500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 2.63680100440979, |
|
"learning_rate": 1.4894512290590262e-05, |
|
"loss": 1.5095, |
|
"step": 287000 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 2.9696691036224365, |
|
"learning_rate": 1.4833352904336937e-05, |
|
"loss": 1.5069, |
|
"step": 287500 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 2.7010321617126465, |
|
"learning_rate": 1.4772193518083607e-05, |
|
"loss": 1.5094, |
|
"step": 288000 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.5756781101226807, |
|
"learning_rate": 1.4711034131830282e-05, |
|
"loss": 1.5054, |
|
"step": 288500 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 3.0450093746185303, |
|
"learning_rate": 1.4649874745576956e-05, |
|
"loss": 1.5115, |
|
"step": 289000 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.551755905151367, |
|
"learning_rate": 1.4588715359323627e-05, |
|
"loss": 1.5129, |
|
"step": 289500 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 2.865170478820801, |
|
"learning_rate": 1.4527555973070301e-05, |
|
"loss": 1.4972, |
|
"step": 290000 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 2.648294687271118, |
|
"learning_rate": 1.4466396586816972e-05, |
|
"loss": 1.5093, |
|
"step": 290500 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 2.600937604904175, |
|
"learning_rate": 1.4405237200563646e-05, |
|
"loss": 1.5043, |
|
"step": 291000 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 2.9919681549072266, |
|
"learning_rate": 1.4344077814310319e-05, |
|
"loss": 1.4997, |
|
"step": 291500 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 2.8291046619415283, |
|
"learning_rate": 1.4282918428056991e-05, |
|
"loss": 1.5196, |
|
"step": 292000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.66756272315979, |
|
"learning_rate": 1.4221759041803664e-05, |
|
"loss": 1.5007, |
|
"step": 292500 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.809164524078369, |
|
"learning_rate": 1.4160599655550338e-05, |
|
"loss": 1.5033, |
|
"step": 293000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 2.6483566761016846, |
|
"learning_rate": 1.4099440269297009e-05, |
|
"loss": 1.5065, |
|
"step": 293500 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.4449145793914795, |
|
"learning_rate": 1.4038280883043683e-05, |
|
"loss": 1.5032, |
|
"step": 294000 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 2.6919500827789307, |
|
"learning_rate": 1.3977121496790358e-05, |
|
"loss": 1.5053, |
|
"step": 294500 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.8122289180755615, |
|
"learning_rate": 1.3915962110537028e-05, |
|
"loss": 1.5079, |
|
"step": 295000 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.7903494834899902, |
|
"learning_rate": 1.3854802724283703e-05, |
|
"loss": 1.4965, |
|
"step": 295500 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 2.525930404663086, |
|
"learning_rate": 1.3793643338030374e-05, |
|
"loss": 1.5043, |
|
"step": 296000 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 2.493638277053833, |
|
"learning_rate": 1.3732483951777048e-05, |
|
"loss": 1.4974, |
|
"step": 296500 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 2.4521799087524414, |
|
"learning_rate": 1.3671324565523722e-05, |
|
"loss": 1.4941, |
|
"step": 297000 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 2.8091464042663574, |
|
"learning_rate": 1.3610165179270393e-05, |
|
"loss": 1.5018, |
|
"step": 297500 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.5954153537750244, |
|
"learning_rate": 1.3549005793017067e-05, |
|
"loss": 1.4999, |
|
"step": 298000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.7937843799591064, |
|
"learning_rate": 1.348784640676374e-05, |
|
"loss": 1.4971, |
|
"step": 298500 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 2.731354236602783, |
|
"learning_rate": 1.3426687020510412e-05, |
|
"loss": 1.5019, |
|
"step": 299000 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.893202066421509, |
|
"learning_rate": 1.3365527634257085e-05, |
|
"loss": 1.5084, |
|
"step": 299500 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 2.5517237186431885, |
|
"learning_rate": 1.330436824800376e-05, |
|
"loss": 1.4979, |
|
"step": 300000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.5626368522644043, |
|
"learning_rate": 1.324320886175043e-05, |
|
"loss": 1.5023, |
|
"step": 300500 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.9477968215942383, |
|
"learning_rate": 1.3182049475497104e-05, |
|
"loss": 1.4948, |
|
"step": 301000 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 2.5781774520874023, |
|
"learning_rate": 1.3120890089243775e-05, |
|
"loss": 1.4968, |
|
"step": 301500 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.8032429218292236, |
|
"learning_rate": 1.305973070299045e-05, |
|
"loss": 1.5017, |
|
"step": 302000 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.6801342964172363, |
|
"learning_rate": 1.2998571316737124e-05, |
|
"loss": 1.4932, |
|
"step": 302500 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 2.6974122524261475, |
|
"learning_rate": 1.2937411930483795e-05, |
|
"loss": 1.4965, |
|
"step": 303000 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 2.4389328956604004, |
|
"learning_rate": 1.2876252544230469e-05, |
|
"loss": 1.4933, |
|
"step": 303500 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 2.66622257232666, |
|
"learning_rate": 1.2815093157977143e-05, |
|
"loss": 1.4941, |
|
"step": 304000 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 2.6904194355010986, |
|
"learning_rate": 1.2753933771723814e-05, |
|
"loss": 1.4946, |
|
"step": 304500 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.673464298248291, |
|
"learning_rate": 1.2692774385470488e-05, |
|
"loss": 1.5015, |
|
"step": 305000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.6104812622070312, |
|
"learning_rate": 1.2631614999217163e-05, |
|
"loss": 1.4886, |
|
"step": 305500 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.8773016929626465, |
|
"learning_rate": 1.2570455612963833e-05, |
|
"loss": 1.4916, |
|
"step": 306000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.437274217605591, |
|
"learning_rate": 1.2509296226710508e-05, |
|
"loss": 1.4909, |
|
"step": 306500 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 3.1659114360809326, |
|
"learning_rate": 1.244813684045718e-05, |
|
"loss": 1.488, |
|
"step": 307000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 2.607539653778076, |
|
"learning_rate": 1.2386977454203851e-05, |
|
"loss": 1.4916, |
|
"step": 307500 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 2.589136838912964, |
|
"learning_rate": 1.2325818067950524e-05, |
|
"loss": 1.4893, |
|
"step": 308000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 3.010464668273926, |
|
"learning_rate": 1.2264658681697198e-05, |
|
"loss": 1.4867, |
|
"step": 308500 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 2.713313579559326, |
|
"learning_rate": 1.220349929544387e-05, |
|
"loss": 1.4788, |
|
"step": 309000 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 2.753493070602417, |
|
"learning_rate": 1.2142339909190543e-05, |
|
"loss": 1.4839, |
|
"step": 309500 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 2.8799803256988525, |
|
"learning_rate": 1.2081180522937217e-05, |
|
"loss": 1.4914, |
|
"step": 310000 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 2.8280301094055176, |
|
"learning_rate": 1.202002113668389e-05, |
|
"loss": 1.4809, |
|
"step": 310500 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 2.9053263664245605, |
|
"learning_rate": 1.1958861750430562e-05, |
|
"loss": 1.4788, |
|
"step": 311000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.879546880722046, |
|
"learning_rate": 1.1897702364177235e-05, |
|
"loss": 1.4825, |
|
"step": 311500 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.473529577255249, |
|
"learning_rate": 1.183654297792391e-05, |
|
"loss": 1.4883, |
|
"step": 312000 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 2.743178367614746, |
|
"learning_rate": 1.1775383591670582e-05, |
|
"loss": 1.4804, |
|
"step": 312500 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.6918370723724365, |
|
"learning_rate": 1.1714224205417254e-05, |
|
"loss": 1.4891, |
|
"step": 313000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.9803996086120605, |
|
"learning_rate": 1.1653064819163927e-05, |
|
"loss": 1.486, |
|
"step": 313500 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 2.544872999191284, |
|
"learning_rate": 1.1591905432910601e-05, |
|
"loss": 1.4879, |
|
"step": 314000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 2.8242433071136475, |
|
"learning_rate": 1.1530746046657274e-05, |
|
"loss": 1.4848, |
|
"step": 314500 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 2.7912473678588867, |
|
"learning_rate": 1.1469586660403946e-05, |
|
"loss": 1.4847, |
|
"step": 315000 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 3.1455202102661133, |
|
"learning_rate": 1.1408427274150619e-05, |
|
"loss": 1.4899, |
|
"step": 315500 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.8553197383880615, |
|
"learning_rate": 1.1347267887897291e-05, |
|
"loss": 1.4799, |
|
"step": 316000 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.7605557441711426, |
|
"learning_rate": 1.1286108501643964e-05, |
|
"loss": 1.4808, |
|
"step": 316500 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.7065718173980713, |
|
"learning_rate": 1.1224949115390637e-05, |
|
"loss": 1.4846, |
|
"step": 317000 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.719977378845215, |
|
"learning_rate": 1.1163789729137311e-05, |
|
"loss": 1.4831, |
|
"step": 317500 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 2.569617509841919, |
|
"learning_rate": 1.1102630342883983e-05, |
|
"loss": 1.4798, |
|
"step": 318000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 2.4670286178588867, |
|
"learning_rate": 1.1041470956630656e-05, |
|
"loss": 1.4765, |
|
"step": 318500 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 2.797725200653076, |
|
"learning_rate": 1.098031157037733e-05, |
|
"loss": 1.4817, |
|
"step": 319000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 2.8332033157348633, |
|
"learning_rate": 1.0919152184124003e-05, |
|
"loss": 1.4835, |
|
"step": 319500 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.494609832763672, |
|
"learning_rate": 1.0857992797870675e-05, |
|
"loss": 1.4746, |
|
"step": 320000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 2.708406925201416, |
|
"learning_rate": 1.0796833411617348e-05, |
|
"loss": 1.4764, |
|
"step": 320500 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.59369158744812, |
|
"learning_rate": 1.0735674025364022e-05, |
|
"loss": 1.4808, |
|
"step": 321000 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.803255558013916, |
|
"learning_rate": 1.0674514639110695e-05, |
|
"loss": 1.48, |
|
"step": 321500 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 2.5560402870178223, |
|
"learning_rate": 1.0613355252857367e-05, |
|
"loss": 1.4843, |
|
"step": 322000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.911194324493408, |
|
"learning_rate": 1.055219586660404e-05, |
|
"loss": 1.4801, |
|
"step": 322500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.8196239471435547, |
|
"learning_rate": 1.0491036480350713e-05, |
|
"loss": 1.4762, |
|
"step": 323000 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 2.709317445755005, |
|
"learning_rate": 1.0429877094097385e-05, |
|
"loss": 1.476, |
|
"step": 323500 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.627985715866089, |
|
"learning_rate": 1.0368717707844058e-05, |
|
"loss": 1.4781, |
|
"step": 324000 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.8382914066314697, |
|
"learning_rate": 1.0307558321590732e-05, |
|
"loss": 1.4728, |
|
"step": 324500 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.8126072883605957, |
|
"learning_rate": 1.0246398935337404e-05, |
|
"loss": 1.4778, |
|
"step": 325000 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.6362712383270264, |
|
"learning_rate": 1.0185239549084077e-05, |
|
"loss": 1.476, |
|
"step": 325500 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 2.761763572692871, |
|
"learning_rate": 1.012408016283075e-05, |
|
"loss": 1.4704, |
|
"step": 326000 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.6072146892547607, |
|
"learning_rate": 1.0062920776577424e-05, |
|
"loss": 1.4703, |
|
"step": 326500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 3.07877254486084, |
|
"learning_rate": 1.0001761390324096e-05, |
|
"loss": 1.4863, |
|
"step": 327000 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.986053705215454, |
|
"learning_rate": 9.940602004070769e-06, |
|
"loss": 1.4837, |
|
"step": 327500 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.7128584384918213, |
|
"learning_rate": 9.879442617817442e-06, |
|
"loss": 1.4648, |
|
"step": 328000 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.6193928718566895, |
|
"learning_rate": 9.818283231564116e-06, |
|
"loss": 1.4628, |
|
"step": 328500 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 2.7647864818573, |
|
"learning_rate": 9.757123845310788e-06, |
|
"loss": 1.4799, |
|
"step": 329000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.70143985748291, |
|
"learning_rate": 9.695964459057461e-06, |
|
"loss": 1.4629, |
|
"step": 329500 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 2.705559730529785, |
|
"learning_rate": 9.634805072804135e-06, |
|
"loss": 1.4662, |
|
"step": 330000 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 2.678466796875, |
|
"learning_rate": 9.573645686550808e-06, |
|
"loss": 1.4693, |
|
"step": 330500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 2.5051000118255615, |
|
"learning_rate": 9.51248630029748e-06, |
|
"loss": 1.4705, |
|
"step": 331000 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 2.8841006755828857, |
|
"learning_rate": 9.451326914044153e-06, |
|
"loss": 1.4651, |
|
"step": 331500 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.7045044898986816, |
|
"learning_rate": 9.390167527790825e-06, |
|
"loss": 1.4665, |
|
"step": 332000 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 3.101134777069092, |
|
"learning_rate": 9.329008141537498e-06, |
|
"loss": 1.4746, |
|
"step": 332500 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 2.5567667484283447, |
|
"learning_rate": 9.26784875528417e-06, |
|
"loss": 1.4667, |
|
"step": 333000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.5476863384246826, |
|
"learning_rate": 9.206689369030843e-06, |
|
"loss": 1.4593, |
|
"step": 333500 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.6363370418548584, |
|
"learning_rate": 9.145529982777517e-06, |
|
"loss": 1.4632, |
|
"step": 334000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 2.9167027473449707, |
|
"learning_rate": 9.08437059652419e-06, |
|
"loss": 1.4595, |
|
"step": 334500 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 2.77966046333313, |
|
"learning_rate": 9.023211210270863e-06, |
|
"loss": 1.4604, |
|
"step": 335000 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 3.0701239109039307, |
|
"learning_rate": 8.962051824017537e-06, |
|
"loss": 1.4613, |
|
"step": 335500 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 2.6307058334350586, |
|
"learning_rate": 8.90089243776421e-06, |
|
"loss": 1.4627, |
|
"step": 336000 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 2.46291184425354, |
|
"learning_rate": 8.839733051510882e-06, |
|
"loss": 1.4667, |
|
"step": 336500 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 2.7968499660491943, |
|
"learning_rate": 8.778573665257555e-06, |
|
"loss": 1.4644, |
|
"step": 337000 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.7745018005371094, |
|
"learning_rate": 8.717414279004229e-06, |
|
"loss": 1.458, |
|
"step": 337500 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.951845645904541, |
|
"learning_rate": 8.656254892750901e-06, |
|
"loss": 1.4723, |
|
"step": 338000 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 2.6524295806884766, |
|
"learning_rate": 8.595095506497574e-06, |
|
"loss": 1.4607, |
|
"step": 338500 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.800586223602295, |
|
"learning_rate": 8.533936120244246e-06, |
|
"loss": 1.4606, |
|
"step": 339000 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 2.947486639022827, |
|
"learning_rate": 8.472776733990919e-06, |
|
"loss": 1.4608, |
|
"step": 339500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.936547040939331, |
|
"learning_rate": 8.411617347737592e-06, |
|
"loss": 1.4582, |
|
"step": 340000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.820474147796631, |
|
"learning_rate": 8.350457961484264e-06, |
|
"loss": 1.4648, |
|
"step": 340500 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.754638910293579, |
|
"learning_rate": 8.289298575230938e-06, |
|
"loss": 1.4613, |
|
"step": 341000 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 2.46992564201355, |
|
"learning_rate": 8.228139188977611e-06, |
|
"loss": 1.4685, |
|
"step": 341500 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 2.8246257305145264, |
|
"learning_rate": 8.166979802724284e-06, |
|
"loss": 1.4632, |
|
"step": 342000 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 2.966745138168335, |
|
"learning_rate": 8.105820416470956e-06, |
|
"loss": 1.4615, |
|
"step": 342500 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.9904332160949707, |
|
"learning_rate": 8.04466103021763e-06, |
|
"loss": 1.4646, |
|
"step": 343000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.7209649085998535, |
|
"learning_rate": 7.983501643964303e-06, |
|
"loss": 1.4602, |
|
"step": 343500 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 2.7970163822174072, |
|
"learning_rate": 7.922342257710976e-06, |
|
"loss": 1.4557, |
|
"step": 344000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 2.646637201309204, |
|
"learning_rate": 7.86118287145765e-06, |
|
"loss": 1.4554, |
|
"step": 344500 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 2.8239455223083496, |
|
"learning_rate": 7.800023485204322e-06, |
|
"loss": 1.4527, |
|
"step": 345000 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.9307682514190674, |
|
"learning_rate": 7.738864098950995e-06, |
|
"loss": 1.461, |
|
"step": 345500 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.840571165084839, |
|
"learning_rate": 7.677704712697668e-06, |
|
"loss": 1.4592, |
|
"step": 346000 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 2.7356936931610107, |
|
"learning_rate": 7.616545326444341e-06, |
|
"loss": 1.46, |
|
"step": 346500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.902578353881836, |
|
"learning_rate": 7.5553859401910135e-06, |
|
"loss": 1.4627, |
|
"step": 347000 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 3.005869150161743, |
|
"learning_rate": 7.494226553937686e-06, |
|
"loss": 1.4592, |
|
"step": 347500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.7847769260406494, |
|
"learning_rate": 7.433067167684359e-06, |
|
"loss": 1.4652, |
|
"step": 348000 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 2.821125030517578, |
|
"learning_rate": 7.371907781431033e-06, |
|
"loss": 1.4585, |
|
"step": 348500 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 3.5766139030456543, |
|
"learning_rate": 7.3107483951777054e-06, |
|
"loss": 1.4511, |
|
"step": 349000 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 2.9950060844421387, |
|
"learning_rate": 7.249589008924377e-06, |
|
"loss": 1.45, |
|
"step": 349500 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 2.922325611114502, |
|
"learning_rate": 7.1884296226710514e-06, |
|
"loss": 1.4549, |
|
"step": 350000 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 3.0797903537750244, |
|
"learning_rate": 7.127270236417724e-06, |
|
"loss": 1.4643, |
|
"step": 350500 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.6984283924102783, |
|
"learning_rate": 7.0661108501643966e-06, |
|
"loss": 1.4528, |
|
"step": 351000 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.804563283920288, |
|
"learning_rate": 7.004951463911069e-06, |
|
"loss": 1.4565, |
|
"step": 351500 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 2.6416571140289307, |
|
"learning_rate": 6.943792077657743e-06, |
|
"loss": 1.4523, |
|
"step": 352000 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.5998694896698, |
|
"learning_rate": 6.882632691404416e-06, |
|
"loss": 1.4546, |
|
"step": 352500 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.714494228363037, |
|
"learning_rate": 6.8214733051510885e-06, |
|
"loss": 1.4581, |
|
"step": 353000 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 2.8569843769073486, |
|
"learning_rate": 6.760313918897761e-06, |
|
"loss": 1.4562, |
|
"step": 353500 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.5230369567871094, |
|
"learning_rate": 6.6991545326444345e-06, |
|
"loss": 1.4469, |
|
"step": 354000 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.654069423675537, |
|
"learning_rate": 6.637995146391107e-06, |
|
"loss": 1.449, |
|
"step": 354500 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 3.2150166034698486, |
|
"learning_rate": 6.57683576013778e-06, |
|
"loss": 1.4506, |
|
"step": 355000 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 2.6119723320007324, |
|
"learning_rate": 6.515676373884454e-06, |
|
"loss": 1.4507, |
|
"step": 355500 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 2.6928510665893555, |
|
"learning_rate": 6.4545169876311265e-06, |
|
"loss": 1.4565, |
|
"step": 356000 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.555009603500366, |
|
"learning_rate": 6.393357601377799e-06, |
|
"loss": 1.4514, |
|
"step": 356500 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.785787343978882, |
|
"learning_rate": 6.332198215124472e-06, |
|
"loss": 1.4525, |
|
"step": 357000 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.8000409603118896, |
|
"learning_rate": 6.271038828871145e-06, |
|
"loss": 1.4498, |
|
"step": 357500 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 3.049229860305786, |
|
"learning_rate": 6.209879442617818e-06, |
|
"loss": 1.45, |
|
"step": 358000 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 2.8990321159362793, |
|
"learning_rate": 6.14872005636449e-06, |
|
"loss": 1.454, |
|
"step": 358500 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 2.6227800846099854, |
|
"learning_rate": 6.0875606701111636e-06, |
|
"loss": 1.4535, |
|
"step": 359000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 2.856273651123047, |
|
"learning_rate": 6.026401283857837e-06, |
|
"loss": 1.453, |
|
"step": 359500 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 2.6688528060913086, |
|
"learning_rate": 5.9652418976045095e-06, |
|
"loss": 1.4396, |
|
"step": 360000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.765559196472168, |
|
"learning_rate": 5.904082511351183e-06, |
|
"loss": 1.447, |
|
"step": 360500 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.546724319458008, |
|
"learning_rate": 5.8429231250978555e-06, |
|
"loss": 1.4445, |
|
"step": 361000 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 2.7436957359313965, |
|
"learning_rate": 5.781763738844528e-06, |
|
"loss": 1.4512, |
|
"step": 361500 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 2.612231731414795, |
|
"learning_rate": 5.720604352591201e-06, |
|
"loss": 1.4496, |
|
"step": 362000 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 2.537179946899414, |
|
"learning_rate": 5.659444966337874e-06, |
|
"loss": 1.4417, |
|
"step": 362500 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 2.82316255569458, |
|
"learning_rate": 5.598285580084547e-06, |
|
"loss": 1.447, |
|
"step": 363000 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.607912302017212, |
|
"learning_rate": 5.53712619383122e-06, |
|
"loss": 1.4465, |
|
"step": 363500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 2.6389966011047363, |
|
"learning_rate": 5.4759668075778935e-06, |
|
"loss": 1.4511, |
|
"step": 364000 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.7712039947509766, |
|
"learning_rate": 5.414807421324566e-06, |
|
"loss": 1.4429, |
|
"step": 364500 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.6119940280914307, |
|
"learning_rate": 5.353648035071239e-06, |
|
"loss": 1.4473, |
|
"step": 365000 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.855820894241333, |
|
"learning_rate": 5.292488648817911e-06, |
|
"loss": 1.4409, |
|
"step": 365500 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 3.3360650539398193, |
|
"learning_rate": 5.2313292625645846e-06, |
|
"loss": 1.4448, |
|
"step": 366000 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.7165558338165283, |
|
"learning_rate": 5.170169876311257e-06, |
|
"loss": 1.4459, |
|
"step": 366500 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 2.638815402984619, |
|
"learning_rate": 5.1090104900579306e-06, |
|
"loss": 1.4384, |
|
"step": 367000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.605776071548462, |
|
"learning_rate": 5.047851103804603e-06, |
|
"loss": 1.4515, |
|
"step": 367500 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.7470364570617676, |
|
"learning_rate": 4.9866917175512765e-06, |
|
"loss": 1.4433, |
|
"step": 368000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 2.7127552032470703, |
|
"learning_rate": 4.925532331297949e-06, |
|
"loss": 1.4475, |
|
"step": 368500 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.991445541381836, |
|
"learning_rate": 4.8643729450446225e-06, |
|
"loss": 1.4392, |
|
"step": 369000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 2.5964415073394775, |
|
"learning_rate": 4.803213558791295e-06, |
|
"loss": 1.4435, |
|
"step": 369500 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.73907732963562, |
|
"learning_rate": 4.742054172537968e-06, |
|
"loss": 1.4487, |
|
"step": 370000 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.7901346683502197, |
|
"learning_rate": 4.680894786284641e-06, |
|
"loss": 1.439, |
|
"step": 370500 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 2.4311490058898926, |
|
"learning_rate": 4.619735400031314e-06, |
|
"loss": 1.4449, |
|
"step": 371000 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.748305320739746, |
|
"learning_rate": 4.558576013777987e-06, |
|
"loss": 1.4558, |
|
"step": 371500 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.8858590126037598, |
|
"learning_rate": 4.49741662752466e-06, |
|
"loss": 1.4422, |
|
"step": 372000 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 2.893162727355957, |
|
"learning_rate": 4.436257241271333e-06, |
|
"loss": 1.4465, |
|
"step": 372500 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.519300937652588, |
|
"learning_rate": 4.375097855018006e-06, |
|
"loss": 1.4354, |
|
"step": 373000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.798462152481079, |
|
"learning_rate": 4.313938468764678e-06, |
|
"loss": 1.4456, |
|
"step": 373500 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 2.894874095916748, |
|
"learning_rate": 4.2527790825113516e-06, |
|
"loss": 1.4358, |
|
"step": 374000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.979597806930542, |
|
"learning_rate": 4.191619696258024e-06, |
|
"loss": 1.444, |
|
"step": 374500 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 2.878843069076538, |
|
"learning_rate": 4.1304603100046975e-06, |
|
"loss": 1.4423, |
|
"step": 375000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.841049909591675, |
|
"learning_rate": 4.06930092375137e-06, |
|
"loss": 1.4403, |
|
"step": 375500 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.413423538208008, |
|
"learning_rate": 4.0081415374980435e-06, |
|
"loss": 1.4491, |
|
"step": 376000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 2.7661304473876953, |
|
"learning_rate": 3.946982151244716e-06, |
|
"loss": 1.4399, |
|
"step": 376500 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.5709524154663086, |
|
"learning_rate": 3.885822764991389e-06, |
|
"loss": 1.4391, |
|
"step": 377000 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.8157260417938232, |
|
"learning_rate": 3.824663378738061e-06, |
|
"loss": 1.4368, |
|
"step": 377500 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 2.7713100910186768, |
|
"learning_rate": 3.7635039924847346e-06, |
|
"loss": 1.4314, |
|
"step": 378000 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 2.9643430709838867, |
|
"learning_rate": 3.702344606231407e-06, |
|
"loss": 1.4329, |
|
"step": 378500 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 2.798428535461426, |
|
"learning_rate": 3.6411852199780806e-06, |
|
"loss": 1.4358, |
|
"step": 379000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.7989561557769775, |
|
"learning_rate": 3.580025833724754e-06, |
|
"loss": 1.4383, |
|
"step": 379500 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.718421459197998, |
|
"learning_rate": 3.5188664474714266e-06, |
|
"loss": 1.4335, |
|
"step": 380000 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 2.8248419761657715, |
|
"learning_rate": 3.4577070612180996e-06, |
|
"loss": 1.439, |
|
"step": 380500 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.4477925300598145, |
|
"learning_rate": 3.396547674964772e-06, |
|
"loss": 1.4356, |
|
"step": 381000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.6281678676605225, |
|
"learning_rate": 3.3353882887114456e-06, |
|
"loss": 1.4441, |
|
"step": 381500 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 2.6818716526031494, |
|
"learning_rate": 3.274228902458118e-06, |
|
"loss": 1.434, |
|
"step": 382000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 3.0857114791870117, |
|
"learning_rate": 3.213069516204791e-06, |
|
"loss": 1.4401, |
|
"step": 382500 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 2.920851230621338, |
|
"learning_rate": 3.1519101299514637e-06, |
|
"loss": 1.4394, |
|
"step": 383000 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 3.0690174102783203, |
|
"learning_rate": 3.090750743698137e-06, |
|
"loss": 1.4371, |
|
"step": 383500 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.846827745437622, |
|
"learning_rate": 3.02959135744481e-06, |
|
"loss": 1.4301, |
|
"step": 384000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 2.927429676055908, |
|
"learning_rate": 2.9684319711914827e-06, |
|
"loss": 1.426, |
|
"step": 384500 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 3.124462842941284, |
|
"learning_rate": 2.9072725849381557e-06, |
|
"loss": 1.4312, |
|
"step": 385000 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 2.4795315265655518, |
|
"learning_rate": 2.8461131986848286e-06, |
|
"loss": 1.4406, |
|
"step": 385500 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 2.942664384841919, |
|
"learning_rate": 2.7849538124315016e-06, |
|
"loss": 1.4338, |
|
"step": 386000 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.6400539875030518, |
|
"learning_rate": 2.7237944261781746e-06, |
|
"loss": 1.4335, |
|
"step": 386500 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.5888681411743164, |
|
"learning_rate": 2.6626350399248476e-06, |
|
"loss": 1.4293, |
|
"step": 387000 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 2.6692755222320557, |
|
"learning_rate": 2.6014756536715206e-06, |
|
"loss": 1.4294, |
|
"step": 387500 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 2.7141776084899902, |
|
"learning_rate": 2.540316267418193e-06, |
|
"loss": 1.4418, |
|
"step": 388000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 2.638432264328003, |
|
"learning_rate": 2.479156881164866e-06, |
|
"loss": 1.43, |
|
"step": 388500 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 2.7874903678894043, |
|
"learning_rate": 2.417997494911539e-06, |
|
"loss": 1.4337, |
|
"step": 389000 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 2.8501386642456055, |
|
"learning_rate": 2.356838108658212e-06, |
|
"loss": 1.4363, |
|
"step": 389500 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 3.1942977905273438, |
|
"learning_rate": 2.295678722404885e-06, |
|
"loss": 1.4362, |
|
"step": 390000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.591784715652466, |
|
"learning_rate": 2.2345193361515577e-06, |
|
"loss": 1.44, |
|
"step": 390500 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 3.0462796688079834, |
|
"learning_rate": 2.1733599498982307e-06, |
|
"loss": 1.4318, |
|
"step": 391000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 2.728050708770752, |
|
"learning_rate": 2.112200563644904e-06, |
|
"loss": 1.4404, |
|
"step": 391500 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.7059884071350098, |
|
"learning_rate": 2.0510411773915767e-06, |
|
"loss": 1.4343, |
|
"step": 392000 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.7897191047668457, |
|
"learning_rate": 1.9898817911382497e-06, |
|
"loss": 1.4264, |
|
"step": 392500 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.517503261566162, |
|
"learning_rate": 1.9287224048849227e-06, |
|
"loss": 1.4393, |
|
"step": 393000 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.8523480892181396, |
|
"learning_rate": 1.8675630186315954e-06, |
|
"loss": 1.4253, |
|
"step": 393500 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 2.5820095539093018, |
|
"learning_rate": 1.8064036323782684e-06, |
|
"loss": 1.4311, |
|
"step": 394000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 2.8148810863494873, |
|
"learning_rate": 1.7452442461249414e-06, |
|
"loss": 1.4347, |
|
"step": 394500 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 2.8168435096740723, |
|
"learning_rate": 1.6840848598716142e-06, |
|
"loss": 1.4316, |
|
"step": 395000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 2.9340474605560303, |
|
"learning_rate": 1.6229254736182872e-06, |
|
"loss": 1.4312, |
|
"step": 395500 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 2.8138039112091064, |
|
"learning_rate": 1.5617660873649602e-06, |
|
"loss": 1.4246, |
|
"step": 396000 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 2.826143980026245, |
|
"learning_rate": 1.500606701111633e-06, |
|
"loss": 1.4416, |
|
"step": 396500 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 2.8407280445098877, |
|
"learning_rate": 1.439447314858306e-06, |
|
"loss": 1.4341, |
|
"step": 397000 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.8632097244262695, |
|
"learning_rate": 1.378287928604979e-06, |
|
"loss": 1.4253, |
|
"step": 397500 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.6355011463165283, |
|
"learning_rate": 1.317128542351652e-06, |
|
"loss": 1.4387, |
|
"step": 398000 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 2.9903597831726074, |
|
"learning_rate": 1.2559691560983247e-06, |
|
"loss": 1.4269, |
|
"step": 398500 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.6634271144866943, |
|
"learning_rate": 1.1948097698449977e-06, |
|
"loss": 1.4278, |
|
"step": 399000 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.844621419906616, |
|
"learning_rate": 1.1336503835916707e-06, |
|
"loss": 1.4376, |
|
"step": 399500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.8783321380615234, |
|
"learning_rate": 1.0724909973383437e-06, |
|
"loss": 1.4309, |
|
"step": 400000 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.565383195877075, |
|
"learning_rate": 1.0113316110850164e-06, |
|
"loss": 1.4182, |
|
"step": 400500 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 2.6493799686431885, |
|
"learning_rate": 9.501722248316894e-07, |
|
"loss": 1.4354, |
|
"step": 401000 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 2.7736098766326904, |
|
"learning_rate": 8.890128385783623e-07, |
|
"loss": 1.4269, |
|
"step": 401500 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 2.5747058391571045, |
|
"learning_rate": 8.278534523250352e-07, |
|
"loss": 1.4244, |
|
"step": 402000 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 2.9367337226867676, |
|
"learning_rate": 7.666940660717082e-07, |
|
"loss": 1.4339, |
|
"step": 402500 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 3.1144633293151855, |
|
"learning_rate": 7.055346798183812e-07, |
|
"loss": 1.4342, |
|
"step": 403000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 2.8952739238739014, |
|
"learning_rate": 6.443752935650541e-07, |
|
"loss": 1.4216, |
|
"step": 403500 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.9333155155181885, |
|
"learning_rate": 5.83215907311727e-07, |
|
"loss": 1.4254, |
|
"step": 404000 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.900174140930176, |
|
"learning_rate": 5.220565210583999e-07, |
|
"loss": 1.433, |
|
"step": 404500 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 2.6606194972991943, |
|
"learning_rate": 4.608971348050728e-07, |
|
"loss": 1.427, |
|
"step": 405000 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.6916987895965576, |
|
"learning_rate": 3.997377485517458e-07, |
|
"loss": 1.4344, |
|
"step": 405500 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.7830684185028076, |
|
"learning_rate": 3.385783622984187e-07, |
|
"loss": 1.432, |
|
"step": 406000 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 2.9338104724884033, |
|
"learning_rate": 2.7741897604509164e-07, |
|
"loss": 1.4311, |
|
"step": 406500 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.861415147781372, |
|
"learning_rate": 2.1625958979176455e-07, |
|
"loss": 1.4245, |
|
"step": 407000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 3.0713891983032227, |
|
"learning_rate": 1.5510020353843746e-07, |
|
"loss": 1.4246, |
|
"step": 407500 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 2.7229363918304443, |
|
"learning_rate": 9.39408172851104e-08, |
|
"loss": 1.4289, |
|
"step": 408000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 2.5204861164093018, |
|
"learning_rate": 3.278143103178331e-08, |
|
"loss": 1.4267, |
|
"step": 408500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 408768, |
|
"total_flos": 3.442569138534612e+18, |
|
"train_loss": 1.939745183989198, |
|
"train_runtime": 329612.5954, |
|
"train_samples_per_second": 39.685, |
|
"train_steps_per_second": 1.24 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 408768, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"total_flos": 3.442569138534612e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|